diff --git a/.gitattributes b/.gitattributes
index 283fbfa5f5b59c93d9e4e77879c65a409bbe2afc..317ee21a9d4d4eb745891efdc9f7a956f274dcde 100644
--- a/.gitattributes
+++ b/.gitattributes
@@ -35,3 +35,8 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
*tfevents* filter=lfs diff=lfs merge=lfs -text
*.mp4 filter=lfs diff=lfs merge=lfs -text
*.png filter=lfs diff=lfs merge=lfs -text
+VideoLLaMA2/assets/cat_and_chicken.mp4 filter=lfs diff=lfs merge=lfs -text
+VideoLLaMA2/assets/logo.png filter=lfs diff=lfs merge=lfs -text
+VideoLLaMA2/assets/pipeline.png filter=lfs diff=lfs merge=lfs -text
+VideoLLaMA2/assets/sora.mp4 filter=lfs diff=lfs merge=lfs -text
+VideoLLaMA2/assets/sora.png filter=lfs diff=lfs merge=lfs -text
diff --git a/VideoLLaMA2/.gitignore b/VideoLLaMA2/.gitignore
new file mode 100644
index 0000000000000000000000000000000000000000..5d2b4c1ab07337c2106b40a88610bf991e0707aa
--- /dev/null
+++ b/VideoLLaMA2/.gitignore
@@ -0,0 +1,58 @@
+# Python
+__pycache__
+*.pyc
+*.egg-info
+dist
+
+# Log
+*.log
+*.log.*
+*.json
+*.jsonl
+log_dir*/
+temp*/
+
+# Data
+!**/alpaca-data-conversation.json
+
+# Editor
+.idea
+*.swp
+
+# Other
+.DS_Store
+3rd_parties
+
+# jupyter
+.ipynb_checkpoints
+*.ipynb
+
+# DevContainer
+!.devcontainer/*
+
+# Demo
+serve_images/
+temp/
+
+# data folder
+data/
+dataset/
+datasets/
+
+# training folder
+wandb
+ckpts*
+output
+output/
+checkpoints
+checkpoints/
+work_dirs*/
+
+# evaluation folder
+/eval
+/eval*
+
+# pretrained weights
+pretrained/
+publish_models/
+public_models/
\ No newline at end of file
diff --git a/VideoLLaMA2/LICENSE b/VideoLLaMA2/LICENSE
new file mode 100644
index 0000000000000000000000000000000000000000..261eeb9e9f8b2b4b0d119366dda99c6fd7d35c64
--- /dev/null
+++ b/VideoLLaMA2/LICENSE
@@ -0,0 +1,201 @@
+ Apache License
+ Version 2.0, January 2004
+ http://www.apache.org/licenses/
+
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+ 1. Definitions.
+
+ "License" shall mean the terms and conditions for use, reproduction,
+ and distribution as defined by Sections 1 through 9 of this document.
+
+ "Licensor" shall mean the copyright owner or entity authorized by
+ the copyright owner that is granting the License.
+
+ "Legal Entity" shall mean the union of the acting entity and all
+ other entities that control, are controlled by, or are under common
+ control with that entity. For the purposes of this definition,
+ "control" means (i) the power, direct or indirect, to cause the
+ direction or management of such entity, whether by contract or
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
+ outstanding shares, or (iii) beneficial ownership of such entity.
+
+ "You" (or "Your") shall mean an individual or Legal Entity
+ exercising permissions granted by this License.
+
+ "Source" form shall mean the preferred form for making modifications,
+ including but not limited to software source code, documentation
+ source, and configuration files.
+
+ "Object" form shall mean any form resulting from mechanical
+ transformation or translation of a Source form, including but
+ not limited to compiled object code, generated documentation,
+ and conversions to other media types.
+
+ "Work" shall mean the work of authorship, whether in Source or
+ Object form, made available under the License, as indicated by a
+ copyright notice that is included in or attached to the work
+ (an example is provided in the Appendix below).
+
+ "Derivative Works" shall mean any work, whether in Source or Object
+ form, that is based on (or derived from) the Work and for which the
+ editorial revisions, annotations, elaborations, or other modifications
+ represent, as a whole, an original work of authorship. For the purposes
+ of this License, Derivative Works shall not include works that remain
+ separable from, or merely link (or bind by name) to the interfaces of,
+ the Work and Derivative Works thereof.
+
+ "Contribution" shall mean any work of authorship, including
+ the original version of the Work and any modifications or additions
+ to that Work or Derivative Works thereof, that is intentionally
+ submitted to Licensor for inclusion in the Work by the copyright owner
+ or by an individual or Legal Entity authorized to submit on behalf of
+ the copyright owner. For the purposes of this definition, "submitted"
+ means any form of electronic, verbal, or written communication sent
+ to the Licensor or its representatives, including but not limited to
+ communication on electronic mailing lists, source code control systems,
+ and issue tracking systems that are managed by, or on behalf of, the
+ Licensor for the purpose of discussing and improving the Work, but
+ excluding communication that is conspicuously marked or otherwise
+ designated in writing by the copyright owner as "Not a Contribution."
+
+ "Contributor" shall mean Licensor and any individual or Legal Entity
+ on behalf of whom a Contribution has been received by Licensor and
+ subsequently incorporated within the Work.
+
+ 2. Grant of Copyright License. Subject to the terms and conditions of
+ this License, each Contributor hereby grants to You a perpetual,
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+ copyright license to reproduce, prepare Derivative Works of,
+ publicly display, publicly perform, sublicense, and distribute the
+ Work and such Derivative Works in Source or Object form.
+
+ 3. Grant of Patent License. Subject to the terms and conditions of
+ this License, each Contributor hereby grants to You a perpetual,
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+ (except as stated in this section) patent license to make, have made,
+ use, offer to sell, sell, import, and otherwise transfer the Work,
+ where such license applies only to those patent claims licensable
+ by such Contributor that are necessarily infringed by their
+ Contribution(s) alone or by combination of their Contribution(s)
+ with the Work to which such Contribution(s) was submitted. If You
+ institute patent litigation against any entity (including a
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
+ or a Contribution incorporated within the Work constitutes direct
+ or contributory patent infringement, then any patent licenses
+ granted to You under this License for that Work shall terminate
+ as of the date such litigation is filed.
+
+ 4. Redistribution. You may reproduce and distribute copies of the
+ Work or Derivative Works thereof in any medium, with or without
+ modifications, and in Source or Object form, provided that You
+ meet the following conditions:
+
+ (a) You must give any other recipients of the Work or
+ Derivative Works a copy of this License; and
+
+ (b) You must cause any modified files to carry prominent notices
+ stating that You changed the files; and
+
+ (c) You must retain, in the Source form of any Derivative Works
+ that You distribute, all copyright, patent, trademark, and
+ attribution notices from the Source form of the Work,
+ excluding those notices that do not pertain to any part of
+ the Derivative Works; and
+
+ (d) If the Work includes a "NOTICE" text file as part of its
+ distribution, then any Derivative Works that You distribute must
+ include a readable copy of the attribution notices contained
+ within such NOTICE file, excluding those notices that do not
+ pertain to any part of the Derivative Works, in at least one
+ of the following places: within a NOTICE text file distributed
+ as part of the Derivative Works; within the Source form or
+ documentation, if provided along with the Derivative Works; or,
+ within a display generated by the Derivative Works, if and
+ wherever such third-party notices normally appear. The contents
+ of the NOTICE file are for informational purposes only and
+ do not modify the License. You may add Your own attribution
+ notices within Derivative Works that You distribute, alongside
+ or as an addendum to the NOTICE text from the Work, provided
+ that such additional attribution notices cannot be construed
+ as modifying the License.
+
+ You may add Your own copyright statement to Your modifications and
+ may provide additional or different license terms and conditions
+ for use, reproduction, or distribution of Your modifications, or
+ for any such Derivative Works as a whole, provided Your use,
+ reproduction, and distribution of the Work otherwise complies with
+ the conditions stated in this License.
+
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
+ any Contribution intentionally submitted for inclusion in the Work
+ by You to the Licensor shall be under the terms and conditions of
+ this License, without any additional terms or conditions.
+ Notwithstanding the above, nothing herein shall supersede or modify
+ the terms of any separate license agreement you may have executed
+ with Licensor regarding such Contributions.
+
+ 6. Trademarks. This License does not grant permission to use the trade
+ names, trademarks, service marks, or product names of the Licensor,
+ except as required for reasonable and customary use in describing the
+ origin of the Work and reproducing the content of the NOTICE file.
+
+ 7. Disclaimer of Warranty. Unless required by applicable law or
+ agreed to in writing, Licensor provides the Work (and each
+ Contributor provides its Contributions) on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+ implied, including, without limitation, any warranties or conditions
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+ PARTICULAR PURPOSE. You are solely responsible for determining the
+ appropriateness of using or redistributing the Work and assume any
+ risks associated with Your exercise of permissions under this License.
+
+ 8. Limitation of Liability. In no event and under no legal theory,
+ whether in tort (including negligence), contract, or otherwise,
+ unless required by applicable law (such as deliberate and grossly
+ negligent acts) or agreed to in writing, shall any Contributor be
+ liable to You for damages, including any direct, indirect, special,
+ incidental, or consequential damages of any character arising as a
+ result of this License or out of the use or inability to use the
+ Work (including but not limited to damages for loss of goodwill,
+ work stoppage, computer failure or malfunction, or any and all
+ other commercial damages or losses), even if such Contributor
+ has been advised of the possibility of such damages.
+
+ 9. Accepting Warranty or Additional Liability. While redistributing
+ the Work or Derivative Works thereof, You may choose to offer,
+ and charge a fee for, acceptance of support, warranty, indemnity,
+ or other liability obligations and/or rights consistent with this
+ License. However, in accepting such obligations, You may act only
+ on Your own behalf and on Your sole responsibility, not on behalf
+ of any other Contributor, and only if You agree to indemnify,
+ defend, and hold each Contributor harmless for any liability
+ incurred by, or claims asserted against, such Contributor by reason
+ of your accepting any such warranty or additional liability.
+
+ END OF TERMS AND CONDITIONS
+
+ APPENDIX: How to apply the Apache License to your work.
+
+ To apply the Apache License to your work, attach the following
+ boilerplate notice, with the fields enclosed by brackets "[]"
+ replaced with your own identifying information. (Don't include
+ the brackets!) The text should be enclosed in the appropriate
+ comment syntax for the file format. We also recommend that a
+ file or class name and description of purpose be included on the
+ same "printed page" as the copyright notice for easier
+ identification within third-party archives.
+
+ Copyright [yyyy] [name of copyright owner]
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
diff --git a/VideoLLaMA2/README.md b/VideoLLaMA2/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..f87ed7489f82aea62e9921a4f278644cd716e8ae
--- /dev/null
+++ b/VideoLLaMA2/README.md
@@ -0,0 +1,329 @@
+# VideoLLaMA 2: Advancing Spatial-Temporal Modeling and Audio Understanding in Video-LLMs
+
+If our project helps you, please give us a star ⭐ on GitHub to support us. 🙏🙏
+
+
+
+[![hf_space](https://img.shields.io/badge/🤗-Demo-9C276A.svg)](https://huggingface.co/spaces/lixin4ever/VideoLLaMA2)
+[![hf_checkpoint](https://img.shields.io/badge/🤗-Checkpoints-9C276A.svg)](https://huggingface.co/collections/DAMO-NLP-SG/videollama-2-6669b6b6f0493188305c87ed)
+[![hf_data](https://img.shields.io/badge/🤗-MSVC-9C276A.svg)](https://huggingface.co/datasets/DAMO-NLP-SG/Multi-Source-Video-Captioning)
+[![arXiv](https://img.shields.io/badge/Arxiv-2406.07476-AD1C18.svg?logo=arXiv)](https://arxiv.org/abs/2406.07476)
+[![License](https://img.shields.io/badge/License-Apache%202.0-yellow)](https://github.com/DAMO-NLP-SG/VideoLLaMA2/blob/main/LICENSE)
+[![Hits](https://hits.seeyoufarm.com/api/count/incr/badge.svg?url=https%3A%2F%2Fgithub.com%2FDAMO-NLP-SG%2FVideoLLaMA2&count_bg=%2379C83D&title_bg=%23555555&icon=&icon_color=%23E7E7E7&title=Visitor&edge_flat=false)](https://hits.seeyoufarm.com)
+[![GitHub issues](https://img.shields.io/github/issues/DAMO-NLP-SG/VideoLLaMA2?color=critical&label=Issues)](https://github.com/DAMO-NLP-SG/VideoLLaMA2/issues?q=is%3Aopen+is%3Aissue)
+[![GitHub closed issues](https://img.shields.io/github/issues-closed/DAMO-NLP-SG/VideoLLaMA2?color=success&label=Issues)](https://github.com/DAMO-NLP-SG/VideoLLaMA2/issues?q=is%3Aissue+is%3Aclosed)
+
+
+
+💡 Some other multimodal-LLM projects from our team may interest you ✨.
+
+
+> [**Video-LLaMA: An Instruction-tuned Audio-Visual Language Model for Video Understanding**](https://github.com/DAMO-NLP-SG/Video-LLaMA)
+> Hang Zhang, Xin Li, Lidong Bing
+[![github](https://img.shields.io/badge/-Github-black?logo=github)](https://github.com/DAMO-NLP-SG/Video-LLaMA) [![github](https://img.shields.io/github/stars/DAMO-NLP-SG/Video-LLaMA.svg?style=social)](https://github.com/DAMO-NLP-SG/Video-LLaMA) [![arXiv](https://img.shields.io/badge/Arxiv-2306.02858-b31b1b.svg?logo=arXiv)](https://arxiv.org/abs/2306.02858)
+
+> [**VCD: Mitigating Object Hallucinations in Large Vision-Language Models through Visual Contrastive Decoding**](https://arxiv.org/abs/2311.16922)
+> Sicong Leng, Hang Zhang, Guanzheng Chen, Xin Li, Shijian Lu, Chunyan Miao, Lidong Bing
+[![github](https://img.shields.io/badge/-Github-black?logo=github)](https://github.com/DAMO-NLP-SG/VCD) [![github](https://img.shields.io/github/stars/DAMO-NLP-SG/VCD.svg?style=social)](https://github.com/DAMO-NLP-SG/VCD) [![arXiv](https://img.shields.io/badge/Arxiv-2311.16922-b31b1b.svg?logo=arXiv)](https://arxiv.org/abs/2311.16922)
+
+
+
+
+
+
+## 📰 News
+* **[2024.07.30]** Release checkpoints of [VideoLLaMA2-8x7B-Base](https://huggingface.co/DAMO-NLP-SG/VideoLLaMA2-8x7B-Base) and [VideoLLaMA2-8x7B](https://huggingface.co/DAMO-NLP-SG/VideoLLaMA2-8x7B).
+* **[2024.06.25]** 🔥🔥 As of Jun 25, our [VideoLLaMA2-7B-16F](https://huggingface.co/DAMO-NLP-SG/VideoLLaMA2-7B-16F) is the **Top-1** ~7B-sized VideoLLM on the [MLVU Leaderboard](https://github.com/JUNJIE99/MLVU?tab=readme-ov-file#trophy-mini-leaderboard).
+* **[2024.06.18]** 🔥🔥 As of Jun 18, our [VideoLLaMA2-7B-16F](https://huggingface.co/DAMO-NLP-SG/VideoLLaMA2-7B-16F) is the **Top-1** ~7B-sized VideoLLM on the [VideoMME Leaderboard](https://video-mme.github.io/home_page.html#leaderboard).
+* **[2024.06.17]** 👋👋 Update technical report with the latest results and the missing references. If you have works closely related to VideoLLaMA 2 but not mentioned in the paper, feel free to let us know.
+* **[2024.06.14]** 🔥🔥 [Online Demo](https://huggingface.co/spaces/lixin4ever/VideoLLaMA2) is available.
+* **[2024.06.03]** Release training, evaluation, and serving codes of VideoLLaMA 2.
+
+
+
+
+## 🛠️ Requirements and Installation
+Basic Dependencies:
+* Python >= 3.8
+* PyTorch >= 2.2.0
+* CUDA Version >= 11.8
+* transformers >= 4.41.2 (for mistral tokenizer)
+* tokenizers >= 0.19.1 (for mistral tokenizer)
+
+**[Online Mode]** Install required packages (better for development):
+```bash
+git clone https://github.com/DAMO-NLP-SG/VideoLLaMA2
+cd VideoLLaMA2
+pip install -r requirements.txt
+pip install flash-attn==2.5.8 --no-build-isolation
+```
+
+**[Offline Mode]** Install VideoLLaMA2 as a Python package (better for direct use):
+```bash
+git clone https://github.com/DAMO-NLP-SG/VideoLLaMA2
+cd VideoLLaMA2
+pip install --upgrade pip # enable PEP 660 support
+pip install -e .
+pip install flash-attn==2.5.8 --no-build-isolation
+```
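+
+As a quick sanity check (a minimal sketch based on the version floors listed above, not part of the official setup), you can verify that the core dependencies are importable and recent enough:
+```python
+import torch
+import transformers
+import tokenizers
+
+# Expected minima from the list above: torch >= 2.2.0, transformers >= 4.41.2, tokenizers >= 0.19.1
+print("torch:", torch.__version__, "| CUDA available:", torch.cuda.is_available())
+print("transformers:", transformers.__version__)
+print("tokenizers:", tokenizers.__version__)
+
+try:
+    import flash_attn
+    print("flash-attn:", flash_attn.__version__)
+except ImportError:
+    print("flash-attn not installed; see the install command above.")
+```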
+
+## 🚀 Main Results
+
+### Multi-Choice Video QA & Video Captioning
+
+
+
+### Open-Ended Video QA
+
+
+## :earth_americas: Model Zoo
+| Model Name | Model Type | Visual Encoder | Language Decoder | # Training Frames |
+|:----------------|:------------:|:----------------|:------------------|:----------------:|
+| [VideoLLaMA2-7B-Base](https://huggingface.co/DAMO-NLP-SG/VideoLLaMA2-7B-Base) | Base | [clip-vit-large-patch14-336](https://huggingface.co/openai/clip-vit-large-patch14-336) | [Mistral-7B-Instruct-v0.2](https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.2) | 8 |
+| [VideoLLaMA2-7B](https://huggingface.co/DAMO-NLP-SG/VideoLLaMA2-7B) | Chat | [clip-vit-large-patch14-336](https://huggingface.co/openai/clip-vit-large-patch14-336) | [Mistral-7B-Instruct-v0.2](https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.2) | 8 |
+| [VideoLLaMA2-7B-16F-Base](https://huggingface.co/DAMO-NLP-SG/VideoLLaMA2-7B-16F-Base) | Base | [clip-vit-large-patch14-336](https://huggingface.co/openai/clip-vit-large-patch14-336) | [Mistral-7B-Instruct-v0.2](https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.2) | 16 |
+| [VideoLLaMA2-7B-16F](https://huggingface.co/DAMO-NLP-SG/VideoLLaMA2-7B-16F) | Chat | [clip-vit-large-patch14-336](https://huggingface.co/openai/clip-vit-large-patch14-336) | [Mistral-7B-Instruct-v0.2](https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.2) | 16 |
+| [VideoLLaMA2-8x7B-Base](https://huggingface.co/DAMO-NLP-SG/VideoLLaMA2-8x7B-Base) | Base | [clip-vit-large-patch14-336](https://huggingface.co/openai/clip-vit-large-patch14-336) | [Mixtral-8x7B-Instruct-v0.1](https://huggingface.co/mistralai/Mixtral-8x7B-Instruct-v0.1) | 8 |
+| [VideoLLaMA2-8x7B](https://huggingface.co/DAMO-NLP-SG/VideoLLaMA2-8x7B) | Chat | [clip-vit-large-patch14-336](https://huggingface.co/openai/clip-vit-large-patch14-336) | [Mixtral-8x7B-Instruct-v0.1](https://huggingface.co/mistralai/Mixtral-8x7B-Instruct-v0.1) | 8 |
+
+
+## [🤗 Demo](https://huggingface.co/spaces/lixin4ever/VideoLLaMA2)
+
+It is highly recommended to try our [online demo](https://huggingface.co/spaces/lixin4ever/VideoLLaMA2) first.
+
+To run the video-based LLM web demo on your own device, first make sure the necessary model checkpoints are prepared, then follow the steps below to launch the demo.
+
+### Single-model Version
+
+* Launch a gradio app directly ([VideoLLaMA2-7B](https://huggingface.co/DAMO-NLP-SG/VideoLLaMA2-7B) is adopted by default):
+```bash
+python videollama2/serve/gradio_web_server_adhoc.py
+```
+
+### Multi-model Version
+
+1. Launch a global controller
+```bash
+cd /path/to/VideoLLaMA2
+python -m videollama2.serve.controller --host 0.0.0.0 --port 10000
+```
+
+2. Launch a gradio webserver
+```bash
+python -m videollama2.serve.gradio_web_server --controller http://localhost:10000 --model-list-mode reload
+```
+
+3. Launch one or multiple model workers
+```bash
+# export HF_ENDPOINT=https://hf-mirror.com # If you are unable to access Hugging Face, try to uncomment this line.
+python -m videollama2.serve.model_worker --host 0.0.0.0 --controller http://localhost:10000 --port 40000 --worker http://localhost:40000 --model-path /PATH/TO/MODEL1
+python -m videollama2.serve.model_worker --host 0.0.0.0 --controller http://localhost:10000 --port 40001 --worker http://localhost:40001 --model-path /PATH/TO/MODEL2
+python -m videollama2.serve.model_worker --host 0.0.0.0 --controller http://localhost:10000 --port 40002 --worker http://localhost:40002 --model-path /PATH/TO/MODEL3
+...
+```
+
+
+## 🗝️ Training & Evaluation
+
+### Quick Start
+
+To facilitate further development on top of our codebase, we provide a quick-start guide on how to train a customized [VideoLLaMA2](https://github.com/DAMO-NLP-SG/VideoLLaMA2) on the [Video-LLaVA](https://github.com/PKU-YuanGroup/Video-LLaVA) dataset and evaluate the trained model on mainstream video-LLM benchmarks.
+
+1. Training Data Structure:
+```bash
+VideoLLaMA2
+├── datasets
+│ ├── videollava_pt
+| | ├── llava_image/ # Available at: https://pan.baidu.com/s/17GYcE69FcJjjUM0e4Gad2w?pwd=9ga3 or https://drive.google.com/drive/folders/1QmFj2FcMAoWNCUyiUtdcW0-IOhLbOBcf?usp=drive_link
+| | ├── valley/ # Available at: https://pan.baidu.com/s/1jluOimE7mmihEBfnpwwCew?pwd=jyjz or https://drive.google.com/drive/folders/1QmFj2FcMAoWNCUyiUtdcW0-IOhLbOBcf?usp=drive_link
+| | └── valley_llavaimage.json # Available at: https://drive.google.com/file/d/1zGRyVSUMoczGq6cjQFmT0prH67bu2wXD/view, including 703K video-text and 558K image-text pairs
+│ ├── videollava_sft
+| | ├── llava_image_tune/ # Available at: https://pan.baidu.com/s/1l-jT6t_DlN5DTklwArsqGw?pwd=o6ko
+| | ├── videochatgpt_tune/ # Available at: https://pan.baidu.com/s/10hJ_U7wVmYTUo75YHc_n8g?pwd=g1hf
+| | └── videochatgpt_llavaimage_tune.json # Available at: https://drive.google.com/file/d/1zGRyVSUMoczGq6cjQFmT0prH67bu2wXD/view, including 100K video-centric, 625K image-centric and 40K text-only conversations
+```
+2. Command:
+```bash
+# VideoLLaMA2-vllava pretraining
+bash scripts/vllava/pretrain.sh
+# VideoLLaMA2-vllava finetuning
+bash scripts/vllava/finetune.sh
+```
+3. Evaluation Data Structure:
+```bash
+VideoLLaMA2
+├── eval
+│ ├── egoschema # Official website: https://github.com/egoschema/EgoSchema
+| | ├── good_clips_git/ # Available at: https://drive.google.com/drive/folders/1SS0VVz8rML1e5gWq7D7VtP1oxE2UtmhQ
+| | └── questions.json # Available at: https://github.com/egoschema/EgoSchema/blob/main/questions.json
+│ ├── mvbench # Official website: https://huggingface.co/datasets/OpenGVLab/MVBench
+| | ├── video/
+| | | ├── clever/
+| | | └── ...
+| | └── json/
+| | | ├── action_antonym.json
+| | | └── ...
+│   ├── perception_test_mcqa   # Official website: https://github.com/google-deepmind/perception_test
+| | ├── videos/ # Available at: https://storage.googleapis.com/dm-perception-test/zip_data/test_videos.zip
+| | └── mc_question_test.json # Download from https://storage.googleapis.com/dm-perception-test/zip_data/mc_question_test_annotations.zip
+│ ├── videomme # Official website: https://video-mme.github.io/home_page.html#leaderboard
+| | ├── test-00000-of-00001.parquet
+| | ├── videos/
+| | └── subtitles/
+│ ├── Activitynet_Zero_Shot_QA # Official website: https://github.com/MILVLG/activitynet-qa
+| | ├── all_test/ # Available at: https://mbzuaiac-my.sharepoint.com/:u:/g/personal/hanoona_bangalath_mbzuai_ac_ae/EatOpE7j68tLm2XAd0u6b8ABGGdVAwLMN6rqlDGM_DwhVA?e=90WIuW
+| | ├── test_q.json # Available at: https://github.com/MILVLG/activitynet-qa/tree/master/dataset
+| | └── test_a.json # Available at: https://github.com/MILVLG/activitynet-qa/tree/master/dataset
+│ ├── MSVD_Zero_Shot_QA # Official website: https://github.com/xudejing/video-question-answering
+| | ├── videos/
+| | ├── test_q.json
+| | └── test_a.json
+│ ├── videochatgpt_gen # Official website: https://github.com/mbzuai-oryx/Video-ChatGPT/tree/main/quantitative_evaluation
+| | ├── Test_Videos/ # Available at: https://mbzuaiac-my.sharepoint.com/:u:/g/personal/hanoona_bangalath_mbzuai_ac_ae/EatOpE7j68tLm2XAd0u6b8ABGGdVAwLMN6rqlDGM_DwhVA?e=90WIuW
+| | ├── Test_Human_Annotated_Captions/ # Available at: https://mbzuaiac-my.sharepoint.com/personal/hanoona_bangalath_mbzuai_ac_ae/_layouts/15/onedrive.aspx?id=%2Fpersonal%2Fhanoona%5Fbangalath%5Fmbzuai%5Fac%5Fae%2FDocuments%2FVideo%2DChatGPT%2FData%5FCode%5FModel%5FRelease%2FQuantitative%5FEvaluation%2Fbenchamarking%2FTest%5FHuman%5FAnnotated%5FCaptions%2Ezip&parent=%2Fpersonal%2Fhanoona%5Fbangalath%5Fmbzuai%5Fac%5Fae%2FDocuments%2FVideo%2DChatGPT%2FData%5FCode%5FModel%5FRelease%2FQuantitative%5FEvaluation%2Fbenchamarking&ga=1
+|   |   ├── generic_qa.json        # These three JSON files are available at: https://mbzuaiac-my.sharepoint.com/personal/hanoona_bangalath_mbzuai_ac_ae/_layouts/15/onedrive.aspx?id=%2Fpersonal%2Fhanoona%5Fbangalath%5Fmbzuai%5Fac%5Fae%2FDocuments%2FVideo%2DChatGPT%2FData%5FCode%5FModel%5FRelease%2FQuantitative%5FEvaluation%2Fbenchamarking%2FBenchmarking%5FQA&ga=1
+| | ├── temporal_qa.json
+| | └── consistency_qa.json
+```
+4. Command:
+```bash
+# mvbench evaluation
+CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 bash scripts/eval/eval_video_mcqa_mvbench.sh
+# activitynet-qa evaluation (need to set azure openai key/endpoint/deployname)
+CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 bash scripts/eval/eval_video_oqa_activitynet.sh
+```
+
+### Data Format
+
+If you want to train a video-LLM on your own data, follow the procedure below to prepare the video/image SFT data:
+
+1. Suppose your data structure is like:
+```bash
+VideoLLaMA2
+├── datasets
+│ ├── custom_sft
+│ | ├── images
+│ | ├── videos
+| | └── custom.json
+```
+2. Then re-organize the annotated video/image SFT data into the following format (a minimal conversion sketch is given after step 3):
+```json
+[
+ {
+ "id": 0,
+ "video": "images/xxx.jpg",
+ "conversations": [
+ {
+ "from": "human",
+                "value": "<image>\nWhat are the colors of the bus in the image?"
+ },
+ {
+ "from": "gpt",
+ "value": "The bus in the image is white and red."
+ },
+ ...
+ ],
+    },
+ {
+ "id": 1,
+ "video": "videos/xxx.mp4",
+ "conversations": [
+ {
+ "from": "human",
+                "value": "<video>\nWhat are the main activities that take place in the video?"
+ },
+ {
+ "from": "gpt",
+ "value": "The main activities that take place in the video are the preparation of camera equipment by a man, a group of men riding a helicopter, and a man sailing a boat through the water."
+ },
+ ...
+ ],
+ },
+ ...
+]
+```
+3. Modify the `scripts/custom/finetune.sh`:
+```bash
+...
+--data_path datasets/custom_sft/custom.json
+--data_folder datasets/custom_sft/
+--pretrain_mm_mlp_adapter CONNECTOR_DOWNLOAD_PATH (e.g., DAMO-NLP-SG/VideoLLaMA2-7B-Base)
+...
+```
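+
+For reference, here is a minimal sketch (with made-up file names and answers, not a required tool) of assembling `custom.json` in the format described in step 2:
+```python
+import json
+
+# Hypothetical records following the annotation format above; note that both
+# image and video samples use the "video" key to store the media path.
+records = [
+    {
+        "id": 0,
+        "video": "images/example.jpg",
+        "conversations": [
+            {"from": "human", "value": "<image>\nWhat are the colors of the bus in the image?"},
+            {"from": "gpt", "value": "The bus in the image is white and red."},
+        ],
+    },
+    {
+        "id": 1,
+        "video": "videos/example.mp4",
+        "conversations": [
+            {"from": "human", "value": "<video>\nWhat are the main activities that take place in the video?"},
+            {"from": "gpt", "value": "A man prepares his camera equipment and then sails a boat through the water."},
+        ],
+    },
+]
+
+with open("datasets/custom_sft/custom.json", "w") as f:
+    json.dump(records, f, indent=4, ensure_ascii=False)
+```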
+
+## 🤖 Inference
+
+Video/Image Inference:
+```python
+import sys
+sys.path.append('./')
+from videollama2 import model_init, mm_infer
+from videollama2.utils import disable_torch_init
+
+
+def inference():
+ disable_torch_init()
+
+ # Video Inference
+    modal = 'video'
+ modal_path = 'assets/cat_and_chicken.mp4'
+ instruct = 'What animals are in the video, what are they doing, and how does the video feel?'
+ # Reply:
+ # The video features a kitten and a baby chick playing together. The kitten is seen laying on the floor while the baby chick hops around. The two animals interact playfully with each other, and the video has a cute and heartwarming feel to it.
+
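+    # NOTE: the image settings below overwrite `modal`, `modal_path`, and `instruct` above,
+    # so this script runs the image example; comment them out to run the video example instead.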
+ # Image Inference
+ modal = 'image'
+ modal_path = 'assets/sora.png'
+ instruct = 'What is the woman wearing, what is she doing, and how does the image feel?'
+ # Reply:
+ # The woman in the image is wearing a black coat and sunglasses, and she is walking down a rain-soaked city street. The image feels vibrant and lively, with the bright city lights reflecting off the wet pavement, creating a visually appealing atmosphere. The woman's presence adds a sense of style and confidence to the scene, as she navigates the bustling urban environment.
+
+ model_path = 'DAMO-NLP-SG/VideoLLaMA2-7B'
+ # Base model inference (only need to replace model_path)
+ # model_path = 'DAMO-NLP-SG/VideoLLaMA2-7B-Base'
+ model, processor, tokenizer = model_init(model_path)
+ output = mm_infer(processor[modal](modal_path), instruct, model=model, tokenizer=tokenizer, do_sample=False, modal=modal)
+
+ print(output)
+
+if __name__ == "__main__":
+ inference()
+```
+
+## 📑 Citation
+
+If you find VideoLLaMA useful for your research and applications, please cite using this BibTeX:
+```bibtex
+@article{damonlpsg2024videollama2,
+ title={VideoLLaMA 2: Advancing Spatial-Temporal Modeling and Audio Understanding in Video-LLMs},
+ author={Cheng, Zesen and Leng, Sicong and Zhang, Hang and Xin, Yifei and Li, Xin and Chen, Guanzheng and Zhu, Yongxin and Zhang, Wenqi and Luo, Ziyang and Zhao, Deli and Bing, Lidong},
+ journal={arXiv preprint arXiv:2406.07476},
+ year={2024},
+ url = {https://arxiv.org/abs/2406.07476}
+}
+
+@article{damonlpsg2023videollama,
+ title = {Video-LLaMA: An Instruction-tuned Audio-Visual Language Model for Video Understanding},
+ author = {Zhang, Hang and Li, Xin and Bing, Lidong},
+ journal = {arXiv preprint arXiv:2306.02858},
+ year = {2023},
+ url = {https://arxiv.org/abs/2306.02858}
+}
+```
+
+## 👍 Acknowledgement
+The codebase of VideoLLaMA 2 is adapted from [**LLaVA 1.5**](https://github.com/haotian-liu/LLaVA) and [**FastChat**](https://github.com/lm-sys/FastChat). We are also grateful to the following projects that VideoLLaMA 2 builds on:
+* [**LLaMA 2**](https://github.com/meta-llama/llama), [**Mistral-7B**](https://mistral.ai/news/announcing-mistral-7b/), [**OpenAI CLIP**](https://openai.com/index/clip/), [**Honeybee**](https://github.com/kakaobrain/honeybee).
+* [**Video-ChatGPT**](https://github.com/mbzuai-oryx/Video-ChatGPT), [**Video-LLaVA**](https://github.com/PKU-YuanGroup/Video-LLaVA).
+* [**WebVid**](https://github.com/m-bain/webvid), [**Panda-70M**](https://github.com/snap-research/Panda-70M), [**LanguageBind**](https://github.com/PKU-YuanGroup/LanguageBind), [**InternVid**](https://github.com/OpenGVLab/InternVideo/tree/main/Data/InternVid).
+* [**VideoChat2**](https://github.com/OpenGVLab/Ask-Anything/tree/main/video_chat2), [**Valley**](https://github.com/RupertLuo/Valley), [**VTimeLLM**](https://github.com/huangb23/VTimeLLM), [**ShareGPT4V**](https://sharegpt4v.github.io/).
+
+
+## 🔒 License
+
+This project is released under the Apache 2.0 license as found in the LICENSE file.
+The service is a research preview intended for **non-commercial use ONLY**, subject to the model Licenses of LLaMA and Mistral, Terms of Use of the data generated by OpenAI, and Privacy Practices of ShareGPT. Please get in touch with us if you find any potential violations.
diff --git a/VideoLLaMA2/assets/cat_and_chicken.mp4 b/VideoLLaMA2/assets/cat_and_chicken.mp4
new file mode 100644
index 0000000000000000000000000000000000000000..e5898c862833c17ffbce625736fc6614fb14fe7c
--- /dev/null
+++ b/VideoLLaMA2/assets/cat_and_chicken.mp4
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1f24723064ee27ea8fc7a30b4542601ed03a42952c0d20fe918213cf876bfec4
+size 18956323
diff --git a/VideoLLaMA2/assets/logo.png b/VideoLLaMA2/assets/logo.png
new file mode 100644
index 0000000000000000000000000000000000000000..1b2c52bfb2a042fb0df0d62d68425ce61d4506b2
--- /dev/null
+++ b/VideoLLaMA2/assets/logo.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:cd9a3a969f931fb23ed371de960ddc589136a937df901b2b08e2750fabf6dd8e
+size 503863
diff --git a/VideoLLaMA2/assets/pipeline.png b/VideoLLaMA2/assets/pipeline.png
new file mode 100644
index 0000000000000000000000000000000000000000..29f32809e0e7744a07de4b00602a67549673f333
--- /dev/null
+++ b/VideoLLaMA2/assets/pipeline.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:eeab6d9f13787337b40399427e419506f49b55a4fb4fe40ba91e25618d03eeb0
+size 4375052
diff --git a/VideoLLaMA2/assets/sora.mp4 b/VideoLLaMA2/assets/sora.mp4
new file mode 100644
index 0000000000000000000000000000000000000000..61d726eac64b06fe969c9d4b105303b8c0eeb88e
--- /dev/null
+++ b/VideoLLaMA2/assets/sora.mp4
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:24e5f0ea3353f23225d00efcdf136fa6dc346301fc34082790e2152c80fa0490
+size 14978533
diff --git a/VideoLLaMA2/assets/sora.png b/VideoLLaMA2/assets/sora.png
new file mode 100644
index 0000000000000000000000000000000000000000..b1619da272d6caab2c128c62c4c05f3b143c43bb
--- /dev/null
+++ b/VideoLLaMA2/assets/sora.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6b69de5c87b429c7b1a87de6f9cb3f5ec6aec5f58ab6ab7c0f727a5d0ec259a5
+size 1051277
diff --git a/VideoLLaMA2/pyproject.toml b/VideoLLaMA2/pyproject.toml
new file mode 100644
index 0000000000000000000000000000000000000000..a7c585b756fa0f68742d5ef64c79628a3376a858
--- /dev/null
+++ b/VideoLLaMA2/pyproject.toml
@@ -0,0 +1,41 @@
+[build-system]
+requires = ["setuptools>=61.0"]
+build-backend = "setuptools.build_meta"
+
+[project]
+name = "videollama2"
+version = "1.0"
+description = "Release of VideoLLaMA2"
+readme = "README.md"
+requires-python = ">=3.8"
+classifiers = [
+ "Programming Language :: Python :: 3",
+ "License :: OSI Approved :: Apache Software License",
+]
+dependencies = [
+ "torch==2.2.0", "torchvision==0.17.0",
+ "transformers==4.42.3", "tokenizers==0.19.1",
+ "deepspeed==0.13.1", "accelerate==0.26.1",
+ "peft==0.4.0", "timm==1.0.3", "numpy==1.24.4",
+ "decord==0.6.0", "imageio==2.34.0", "imageio-ffmpeg==0.4.9",
+ "moviepy==1.0.3", "scenedetect==0.6.3",
+ "opencv-python==4.6.0.66", "pysubs2",
+ "scikit-learn==1.2.2", "huggingface_hub==0.23.4", "sentencepiece==0.1.99",
+ "shortuuid", "einops==0.6.1", "einops-exts==0.0.4",
+ "bitsandbytes==0.43.0", "pydantic>=2.0", "markdown2[all]",
+ "gradio==3.50.0", "gradio_client==0.6.1", "httpx==0.24.1",
+ "requests", "openai", "uvicorn", "fastapi", "tensorboard", "wandb", "tabulate"
+]
+
+[project.optional-dependencies]
+train = ["ninja"]
+
+[project.urls]
+"Homepage" = "https://github.com/DAMO-NLP-SG/VideoLLaMA2"
+"Bug Tracker" = "https://github.com/DAMO-NLP-SG/VideoLLaMA2/issues"
+
+[tool.setuptools.packages.find]
+exclude = ["assets*", "benchmark*", "docs", "dist*", "playground*", "scripts*", "tests*"]
+
+[tool.wheel]
+exclude = ["assets*", "benchmark*", "docs", "dist*", "playground*", "scripts*", "tests*"]
diff --git a/VideoLLaMA2/requirements.txt b/VideoLLaMA2/requirements.txt
new file mode 100644
index 0000000000000000000000000000000000000000..4267869104a8bc72ffed9cbe03a4d51812ae3e01
--- /dev/null
+++ b/VideoLLaMA2/requirements.txt
@@ -0,0 +1,39 @@
+--extra-index-url https://download.pytorch.org/whl/cu118
+# basic dependencies
+torch==2.2.0
+torchvision==0.17.0
+transformers==4.42.3
+tokenizers==0.19.1
+deepspeed==0.13.1
+accelerate==0.26.1
+peft==0.4.0
+timm==1.0.3
+numpy==1.24.4
+# data processing
+decord==0.6.0
+imageio==2.34.0
+imageio-ffmpeg==0.4.9
+moviepy==1.0.3
+scenedetect==0.6.3
+opencv-python==4.6.0.66
+pysubs2
+# misc
+scikit-learn==1.2.2
+huggingface_hub==0.23.4
+sentencepiece==0.1.99
+shortuuid
+einops==0.6.1
+einops-exts==0.0.4
+bitsandbytes==0.43.0
+pydantic>=2.0
+markdown2[all]
+gradio==3.50.0
+gradio_client==0.6.1
+httpx==0.24.1
+requests
+openai
+uvicorn
+fastapi
+tensorboard
+wandb
+tabulate
\ No newline at end of file
diff --git a/VideoLLaMA2/scripts/custom/finetune.sh b/VideoLLaMA2/scripts/custom/finetune.sh
new file mode 100644
index 0000000000000000000000000000000000000000..550fdcf2029647b4a1b88ca1f5a0837659e0b7d8
--- /dev/null
+++ b/VideoLLaMA2/scripts/custom/finetune.sh
@@ -0,0 +1,74 @@
+#!/bin/bash
+
+# Environment Variables
+ARG_WORLD_SIZE=${1:-1}
+ARG_NPROC_PER_NODE=${2:-8}
+ARG_MASTER_ADDR="127.0.0.1"
+ARG_MASTER_PORT=16666
+ARG_RANK=0
+
+# Multiple conditions
+if [ ! -n "$WORLD_SIZE" ] || [ ! -n "$NPROC_PER_NODE" ]; then
+ WORLD_SIZE=$ARG_WORLD_SIZE
+ NPROC_PER_NODE=$ARG_NPROC_PER_NODE
+fi
+if [ ! -n "$MASTER_ADDR" ] || [ ! -n "$MASTER_PORT" ] || [ ! -n "$RANK" ]; then
+ MASTER_ADDR=$ARG_MASTER_ADDR
+ MASTER_PORT=$ARG_MASTER_PORT
+ RANK=$ARG_RANK
+fi
+
+echo "WORLD_SIZE: $WORLD_SIZE"
+echo "NPROC_PER_NODE: $NPROC_PER_NODE"
+
+# Training Arguments
+GLOBAL_BATCH_SIZE=128
+LOCAL_BATCH_SIZE=4
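+# Accumulate gradients so that WORLD_SIZE * NPROC_PER_NODE * LOCAL_BATCH_SIZE * GRADIENT_ACCUMULATION_STEPS == GLOBAL_BATCH_SIZE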
+GRADIENT_ACCUMULATION_STEPS=$[$GLOBAL_BATCH_SIZE/($WORLD_SIZE*$NPROC_PER_NODE*$LOCAL_BATCH_SIZE)]
+
+# Log Arguments
+export TRANSFORMERS_OFFLINE=1
+export WANDB_PROJECT=videollama2
+RUN_NAME=downstream_sft_settings
+DATA_DIR=datasets
+OUTP_DIR=work_dirs
+
+torchrun --nnodes $WORLD_SIZE \
+ --nproc_per_node $NPROC_PER_NODE \
+ --master_addr=$MASTER_ADDR \
+ --master_port=$MASTER_PORT \
+ --node_rank $RANK \
+ videollama2/train_flash_attn.py \
+ --deepspeed scripts/zero3.json \
+ --model_type videollama2 \
+ --model_path mistralai/Mistral-7B-Instruct-v0.2 \
+ --vision_tower openai/clip-vit-large-patch14-336 \
+ --mm_projector_type stc_connector \
+ --pretrain_mm_mlp_adapter DAMO-NLP-SG/VideoLLaMA2-7B-Base/mm_projector.bin \
+ --data_path ${DATA_DIR}/videollava_sft/videochatgpt_llavaimage_tune.json \
+ --data_folder ${DATA_DIR}/videollava_sft/ \
+ --mm_vision_select_layer -2 \
+ --image_aspect_ratio pad \
+ --num_frames 8 \
+ --bf16 True \
+ --tf32 True \
+ --fp16 False \
+ --output_dir ${OUTP_DIR}/${WANDB_PROJECT}/finetune_${RUN_NAME} \
+ --num_train_epochs 1 \
+ --per_device_train_batch_size $LOCAL_BATCH_SIZE \
+ --per_device_eval_batch_size 4 \
+ --gradient_accumulation_steps $GRADIENT_ACCUMULATION_STEPS \
+ --evaluation_strategy "no" \
+ --save_strategy "steps" \
+ --save_steps 500 \
+ --save_total_limit 99 \
+ --learning_rate 2e-5 \
+ --weight_decay 0. \
+ --warmup_ratio 0.03 \
+ --lr_scheduler_type "cosine" \
+ --logging_steps 1 \
+ --model_max_length 2048 \
+ --gradient_checkpointing True \
+ --dataloader_num_workers 4 \
+ --report_to tensorboard \
+ --run_name $RUN_NAME \
diff --git a/VideoLLaMA2/scripts/custom/finetune_lora.sh b/VideoLLaMA2/scripts/custom/finetune_lora.sh
new file mode 100644
index 0000000000000000000000000000000000000000..522f2cc5858f07d64ac3ff5957f64c5ca615de1a
--- /dev/null
+++ b/VideoLLaMA2/scripts/custom/finetune_lora.sh
@@ -0,0 +1,75 @@
+#!/bin/bash
+
+# Environment Variables
+ARG_WORLD_SIZE=${1:-1}
+ARG_NPROC_PER_NODE=${2:-8}
+ARG_MASTER_ADDR="127.0.0.1"
+ARG_MASTER_PORT=16666
+ARG_RANK=0
+
+# Multiple conditions
+if [ ! -n "$WORLD_SIZE" ] || [ ! -n "$NPROC_PER_NODE" ]; then
+ WORLD_SIZE=$ARG_WORLD_SIZE
+ NPROC_PER_NODE=$ARG_NPROC_PER_NODE
+fi
+if [ ! -n "$MASTER_ADDR" ] || [ ! -n "$MASTER_PORT" ] || [ ! -n "$RANK" ]; then
+ MASTER_ADDR=$ARG_MASTER_ADDR
+ MASTER_PORT=$ARG_MASTER_PORT
+ RANK=$ARG_RANK
+fi
+
+echo "WORLD_SIZE: $WORLD_SIZE"
+echo "NPROC_PER_NODE: $NPROC_PER_NODE"
+
+# Training Arguments
+GLOBAL_BATCH_SIZE=128
+LOCAL_BATCH_SIZE=4
+GRADIENT_ACCUMULATION_STEPS=$[$GLOBAL_BATCH_SIZE/($WORLD_SIZE*$NPROC_PER_NODE*$LOCAL_BATCH_SIZE)]
+
+# Log Arguments
+export TRANSFORMERS_OFFLINE=1
+export WANDB_PROJECT=videollama2
+RUN_NAME=downstream_sft_settings_lora
+DATA_DIR=datasets
+OUTP_DIR=work_dirs
+
+torchrun --nnodes $WORLD_SIZE \
+ --nproc_per_node $NPROC_PER_NODE \
+ --master_addr=$MASTER_ADDR \
+ --master_port=$MASTER_PORT \
+ --node_rank $RANK \
+ videollama2/train_flash_attn.py \
+ --lora_enable True --lora_r 128 --lora_alpha 256 --mm_projector_lr 2e-5 \
+ --deepspeed scripts/zero3.json \
+ --model_type videollama2 \
+ --model_path mistralai/Mistral-7B-Instruct-v0.2 \
+ --vision_tower openai/clip-vit-large-patch14-336 \
+ --mm_projector_type stc_connector \
+ --pretrain_mm_mlp_adapter DAMO-NLP-SG/VideoLLaMA2-7B-Base/mm_projector.bin \
+ --data_path ${DATA_DIR}/videollava_sft/videochatgpt_llavaimage_tune.json \
+ --data_folder ${DATA_DIR}/videollava_sft/ \
+ --mm_vision_select_layer -2 \
+ --image_aspect_ratio pad \
+ --num_frames 8 \
+ --bf16 True \
+ --tf32 True \
+ --fp16 False \
+ --output_dir ${OUTP_DIR}/${WANDB_PROJECT}/finetune_${RUN_NAME} \
+ --num_train_epochs 1 \
+ --per_device_train_batch_size $LOCAL_BATCH_SIZE \
+ --per_device_eval_batch_size 4 \
+ --gradient_accumulation_steps $GRADIENT_ACCUMULATION_STEPS \
+ --evaluation_strategy "no" \
+ --save_strategy "steps" \
+ --save_steps 500 \
+ --save_total_limit 99 \
+ --learning_rate 2e-5 \
+ --weight_decay 0. \
+ --warmup_ratio 0.03 \
+ --lr_scheduler_type "cosine" \
+ --logging_steps 1 \
+ --model_max_length 2048 \
+ --gradient_checkpointing True \
+ --dataloader_num_workers 4 \
+ --report_to tensorboard \
+ --run_name $RUN_NAME \
diff --git a/VideoLLaMA2/scripts/custom/finetune_qlora.sh b/VideoLLaMA2/scripts/custom/finetune_qlora.sh
new file mode 100644
index 0000000000000000000000000000000000000000..2f2e966e2c50e8ef1be5c79a68fe0c3bcb919772
--- /dev/null
+++ b/VideoLLaMA2/scripts/custom/finetune_qlora.sh
@@ -0,0 +1,75 @@
+#!/bin/bash
+
+# Environment Variables
+ARG_WORLD_SIZE=${1:-1}
+ARG_NPROC_PER_NODE=${2:-8}
+ARG_MASTER_ADDR="127.0.0.1"
+ARG_MASTER_PORT=16666
+ARG_RANK=0
+
+# Multiple conditions
+if [ ! -n "$WORLD_SIZE" ] || [ ! -n "$NPROC_PER_NODE" ]; then
+ WORLD_SIZE=$ARG_WORLD_SIZE
+ NPROC_PER_NODE=$ARG_NPROC_PER_NODE
+fi
+if [ ! -n "$MASTER_ADDR" ] || [ ! -n "$MASTER_PORT" ] || [ ! -n "$RANK" ]; then
+ MASTER_ADDR=$ARG_MASTER_ADDR
+ MASTER_PORT=$ARG_MASTER_PORT
+ RANK=$ARG_RANK
+fi
+
+echo "WORLD_SIZE: $WORLD_SIZE"
+echo "NPROC_PER_NODE: $NPROC_PER_NODE"
+
+# Training Arguments
+GLOBAL_BATCH_SIZE=128
+LOCAL_BATCH_SIZE=4
+GRADIENT_ACCUMULATION_STEPS=$[$GLOBAL_BATCH_SIZE/($WORLD_SIZE*$NPROC_PER_NODE*$LOCAL_BATCH_SIZE)]
+
+# Log Arguments
+export TRANSFORMERS_OFFLINE=1
+export WANDB_PROJECT=videollama2
+RUN_NAME=downstream_sft_settings_qlora
+DATA_DIR=datasets
+OUTP_DIR=work_dirs
+
+torchrun --nnodes $WORLD_SIZE \
+ --nproc_per_node $NPROC_PER_NODE \
+ --master_addr=$MASTER_ADDR \
+ --master_port=$MASTER_PORT \
+ --node_rank $RANK \
+ videollama2/train_flash_attn.py \
+ --lora_enable True --lora_r 128 --lora_alpha 256 --mm_projector_lr 2e-5 --bits 4 \
+ --deepspeed scripts/zero2.json \
+ --model_type videollama2 \
+ --model_path mistralai/Mistral-7B-Instruct-v0.2 \
+ --vision_tower openai/clip-vit-large-patch14-336 \
+ --mm_projector_type stc_connector \
+ --pretrain_mm_mlp_adapter DAMO-NLP-SG/VideoLLaMA2-7B-Base/mm_projector.bin \
+ --data_path ${DATA_DIR}/videollava_sft/videochatgpt_llavaimage_tune.json \
+ --data_folder ${DATA_DIR}/videollava_sft/ \
+ --mm_vision_select_layer -2 \
+ --image_aspect_ratio pad \
+ --num_frames 8 \
+ --bf16 True \
+ --tf32 True \
+ --fp16 False \
+ --output_dir ${OUTP_DIR}/${WANDB_PROJECT}/finetune_${RUN_NAME} \
+ --num_train_epochs 1 \
+ --per_device_train_batch_size $LOCAL_BATCH_SIZE \
+ --per_device_eval_batch_size 4 \
+ --gradient_accumulation_steps $GRADIENT_ACCUMULATION_STEPS \
+ --evaluation_strategy "no" \
+ --save_strategy "steps" \
+ --save_steps 500 \
+ --save_total_limit 99 \
+ --learning_rate 2e-5 \
+ --weight_decay 0. \
+ --warmup_ratio 0.03 \
+ --lr_scheduler_type "cosine" \
+ --logging_steps 1 \
+ --model_max_length 2048 \
+ --gradient_checkpointing True \
+ --dataloader_num_workers 4 \
+ --report_to tensorboard \
+ --run_name $RUN_NAME \
diff --git a/VideoLLaMA2/scripts/eval/eval_video_cap_msvc.sh b/VideoLLaMA2/scripts/eval/eval_video_cap_msvc.sh
new file mode 100644
index 0000000000000000000000000000000000000000..07f2f2c15a171f9eb53f04414bc9ccbb5e27d077
--- /dev/null
+++ b/VideoLLaMA2/scripts/eval/eval_video_cap_msvc.sh
@@ -0,0 +1,67 @@
+set -x
+
+EVAL_DATA_DIR=eval
+OUTPUT_DIR=eval_output
+CKPT=DAMO-NLP-SG/VideoLLaMA2-7B
+CKPT_NAME=$(echo $CKPT | rev | cut -d'/' -f1 | rev)
+
+gpu_list="${CUDA_VISIBLE_DEVICES:-0}"
+IFS=',' read -ra GPULIST <<< "$gpu_list"
+
+# divide data via the number of GPUs per task
+GPUS_PER_TASK=1
+CHUNKS=$((${#GPULIST[@]}/$GPUS_PER_TASK))
+
+output_file=${OUTPUT_DIR}/MSVC/answers/${CKPT_NAME}/merge.json
+
+# if the merged output file is missing or empty, remove any stale per-chunk results
+if [ ! -f "$output_file" ] || [ $(cat "$output_file" | wc -l) -eq 0 ]; then
+ rm -f ${OUTPUT_DIR}/MSVC/answers/${CKPT_NAME}/*.json
+fi
+
+if [ ! -f "$output_file" ]; then
+ for IDX in $(seq 0 $((CHUNKS-1))); do
+ # select the GPUs for the task
+ gpu_devices=$(IFS=,; echo "${GPULIST[*]:$(($IDX*$GPUS_PER_TASK)):$GPUS_PER_TASK}")
+ TRANSFORMERS_OFFLINE=1 CUDA_VISIBLE_DEVICES=${gpu_devices} python3 videollama2/eval/inference_video_cap_msvc.py \
+ --model-path ${CKPT} \
+ --video-folder ${EVAL_DATA_DIR}/MSVC \
+ --question-file ${EVAL_DATA_DIR}/MSVC/msvc.json \
+ --output-file ${OUTPUT_DIR}/MSVC/answers/${CKPT_NAME}/${CHUNKS}_${IDX}.json \
+ --num-chunks $CHUNKS \
+ --chunk-idx $IDX &
+ done
+
+ wait
+
+ # Clear out the output file if it exists.
+ > "$output_file"
+
+ #Loop through the indices and concatenate each file.
+ for IDX in $(seq 0 $((CHUNKS-1))); do
+ cat ${OUTPUT_DIR}/MSVC/answers/${CKPT_NAME}/${CHUNKS}_${IDX}.json >> "$output_file"
+ done
+fi
+
+
+AZURE_API_KEY=""
+AZURE_API_ENDPOINT=""
+AZURE_API_DEPLOYNAME=""
+
+python3 videollama2/new_eval/eval_video_cap_msvc_correctness.py \
+ --pred-path $output_file \
+ --output-dir ${OUTPUT_DIR}/MSVC/answers/${CKPT_NAME}/correctness_gpt \
+ --output-json ${OUTPUT_DIR}/MSVC/answers/${CKPT_NAME}/correctness_results.json \
+ --api-key $AZURE_API_KEY \
+ --api-endpoint $AZURE_API_ENDPOINT \
+ --api-deployname $AZURE_API_DEPLOYNAME \
+ --num-tasks 4 \
+
+python3 videollama2/new_eval/eval_video_cap_msvc_detailedness.py \
+ --pred-path $output_file \
+ --output-dir ${OUTPUT_DIR}/MSVC/answers/${CKPT_NAME}/detailedness_gpt \
+ --output-json ${OUTPUT_DIR}/MSVC/answers/${CKPT_NAME}/detailedness_results.json \
+ --api-key $AZURE_API_KEY \
+ --api-endpoint $AZURE_API_ENDPOINT \
+ --api-deployname $AZURE_API_DEPLOYNAME \
+ --num-tasks 4 \
diff --git a/VideoLLaMA2/scripts/eval/eval_video_mcqa_egoschema.sh b/VideoLLaMA2/scripts/eval/eval_video_mcqa_egoschema.sh
new file mode 100644
index 0000000000000000000000000000000000000000..2492251837cb2b801f1a99d9467410af51a8d0cb
--- /dev/null
+++ b/VideoLLaMA2/scripts/eval/eval_video_mcqa_egoschema.sh
@@ -0,0 +1,41 @@
+set -x
+
+EVAL_DATA_DIR=eval
+OUTPUT_DIR=eval_output
+CKPT=DAMO-NLP-SG/VideoLLaMA2-7B
+CKPT_NAME=$(echo $CKPT | rev | cut -d'/' -f1 | rev)
+
+gpu_list="${CUDA_VISIBLE_DEVICES:-0}"
+IFS=',' read -ra GPULIST <<< "$gpu_list"
+
+# divide data via the number of GPUs per task
+GPUS_PER_TASK=1
+CHUNKS=$((${#GPULIST[@]}/$GPUS_PER_TASK))
+
+output_file=${OUTPUT_DIR}/egoschema/answers/${CKPT_NAME}/merge.csv
+
+if [ ! -f "$output_file" ]; then
+ for IDX in $(seq 0 $((CHUNKS-1))); do
+ # select the GPUs for the task
+ gpu_devices=$(IFS=,; echo "${GPULIST[*]:$(($IDX*$GPUS_PER_TASK)):$GPUS_PER_TASK}")
+ TRANSFORMERS_OFFLINE=1 CUDA_VISIBLE_DEVICES=${gpu_devices} python3 videollama2/eval/inference_video_mcqa_egoschema.py \
+ --model-path ${CKPT} \
+ --video-folder ${EVAL_DATA_DIR}/egoschema/good_clips_git \
+ --question-file ${EVAL_DATA_DIR}/egoschema/questions.json \
+ --answer-file ${OUTPUT_DIR}/egoschema/answers/${CKPT_NAME}/${CHUNKS}_${IDX}.csv \
+ --num-chunks $CHUNKS \
+ --chunk-idx $IDX &
+ done
+
+ wait
+
+ # Clear out the output file if it exists.
+ > "$output_file"
+
+ echo 'q_uid, answer' >> "$output_file"
+
+ # Loop through the indices and concatenate each file.
+ for IDX in $(seq 0 $((CHUNKS-1))); do
+ cat ${OUTPUT_DIR}/egoschema/answers/${CKPT_NAME}/${CHUNKS}_${IDX}.csv >> "$output_file"
+ done
+fi
\ No newline at end of file
diff --git a/VideoLLaMA2/scripts/eval/eval_video_mcqa_mvbench.sh b/VideoLLaMA2/scripts/eval/eval_video_mcqa_mvbench.sh
new file mode 100644
index 0000000000000000000000000000000000000000..3e18ea78430f9518a474d3f69eed9573725153ef
--- /dev/null
+++ b/VideoLLaMA2/scripts/eval/eval_video_mcqa_mvbench.sh
@@ -0,0 +1,46 @@
+set -x
+
+EVAL_DATA_DIR=eval
+OUTPUT_DIR=eval_output
+CKPT=DAMO-NLP-SG/VideoLLaMA2-7B
+CKPT_NAME=$(echo $CKPT | rev | cut -d'/' -f1 | rev)
+
+gpu_list="${CUDA_VISIBLE_DEVICES:-0}"
+IFS=',' read -ra GPULIST <<< "$gpu_list"
+
+# divide data via the number of GPUs per task
+GPUS_PER_TASK=1
+CHUNKS=$((${#GPULIST[@]}/$GPUS_PER_TASK))
+
+output_file=${OUTPUT_DIR}/mvbench/answers/${CKPT_NAME}/merge.json
+
+# if the merged output file is missing or empty, remove any stale per-chunk results
+if [ ! -f "$output_file" ] || [ $(cat "$output_file" | wc -l) -eq 0 ]; then
+ rm -f ${OUTPUT_DIR}/mvbench/answers/${CKPT_NAME}/*.json
+fi
+
+if [ ! -f "$output_file" ]; then
+ for IDX in $(seq 0 $((CHUNKS-1))); do
+ gpu_devices=$(IFS=,; echo "${GPULIST[*]:$(($IDX*$GPUS_PER_TASK)):$GPUS_PER_TASK}")
+ TRANSFORMERS_OFFLINE=1 CUDA_VISIBLE_DEVICES=${gpu_devices} python3 videollama2/eval/inference_video_mcqa_mvbench.py \
+ --model-path ${CKPT} \
+ --video-folder ${EVAL_DATA_DIR}/mvbench/video \
+ --question-file ${EVAL_DATA_DIR}/mvbench/json \
+ --answer-file ${OUTPUT_DIR}/mvbench/answers/${CKPT_NAME}/${CHUNKS}_${IDX}.json \
+ --num-chunks $CHUNKS \
+ --chunk-idx $IDX &
+ done
+
+ wait
+
+ # Clear out the output file if it exists.
+ > "$output_file"
+
+ # Loop through the indices and concatenate each file.
+ for IDX in $(seq 0 $((CHUNKS-1))); do
+ cat ${OUTPUT_DIR}/mvbench/answers/${CKPT_NAME}/${CHUNKS}_${IDX}.json >> "$output_file"
+ done
+fi
+
+python3 videollama2/eval/eval_video_mcqa_mvbench.py \
+ --pred_path ${output_file} \
diff --git a/VideoLLaMA2/scripts/eval/eval_video_mcqa_perception_test_mcqa.sh b/VideoLLaMA2/scripts/eval/eval_video_mcqa_perception_test_mcqa.sh
new file mode 100644
index 0000000000000000000000000000000000000000..abc89edc636f9da975e27179ba8fcc8a0b1893ec
--- /dev/null
+++ b/VideoLLaMA2/scripts/eval/eval_video_mcqa_perception_test_mcqa.sh
@@ -0,0 +1,45 @@
+set -x
+
+EVAL_DATA_DIR=eval
+OUTPUT_DIR=eval_output
+CKPT=DAMO-NLP-SG/VideoLLaMA2-7B
+CKPT_NAME=$(echo $CKPT | rev | cut -d'/' -f1 | rev)
+
+gpu_list="${CUDA_VISIBLE_DEVICES:-0}"
+IFS=',' read -ra GPULIST <<< "$gpu_list"
+
+# divide data via the number of GPUs per task
+GPUS_PER_TASK=1
+CHUNKS=$((${#GPULIST[@]}/$GPUS_PER_TASK))
+
+output_file=${OUTPUT_DIR}/perception_test_mcqa/answers/${CKPT_NAME}/merge.json
+
+if [ ! -f "$output_file" ]; then
+ for IDX in $(seq 0 $((CHUNKS-1))); do
+ # select the GPUs for the task
+ gpu_devices=$(IFS=,; echo "${GPULIST[*]:$(($IDX*$GPUS_PER_TASK)):$GPUS_PER_TASK}")
+ TRANSFORMERS_OFFLINE=1 CUDA_VISIBLE_DEVICES=${gpu_devices} python3 videollama2/eval/inference_video_mcqa_perception_test_mcqa.py \
+ --model-path ${CKPT} \
+ --video-folder ${EVAL_DATA_DIR}/perception_test_mcqa/videos \
+ --question-file ${EVAL_DATA_DIR}/perception_test_mcqa/mc_question_test.json \
+ --answer-file ${OUTPUT_DIR}/perception_test_mcqa/answers/${CKPT_NAME}/${CHUNKS}_${IDX}.json \
+ --num-chunks $CHUNKS \
+ --chunk-idx $IDX &
+ done
+
+ wait
+
+ # Clear out the output file if it exists.
+ > "$output_file"
+
+ echo "{" >> "$output_file"
+
+ # Loop through the indices and concatenate each file.
+ for IDX in $(seq 0 $((CHUNKS-1))); do
+ cat ${OUTPUT_DIR}/perception_test_mcqa/answers/${CKPT_NAME}/${CHUNKS}_${IDX}.json >> "$output_file"
+ done
+
+ sed -i '$s/.$//' $output_file
+
+ echo "}" >> "$output_file"
+fi
\ No newline at end of file
diff --git a/VideoLLaMA2/scripts/eval/eval_video_mcqa_videomme.sh b/VideoLLaMA2/scripts/eval/eval_video_mcqa_videomme.sh
new file mode 100644
index 0000000000000000000000000000000000000000..aa86be7b587802a6a242178352d38ba5bc8f0e79
--- /dev/null
+++ b/VideoLLaMA2/scripts/eval/eval_video_mcqa_videomme.sh
@@ -0,0 +1,84 @@
+set -x
+
+EVAL_DATA_DIR=eval
+OUTPUT_DIR=eval_output
+CKPT=DAMO-NLP-SG/VideoLLaMA2-7B-16F
+CKPT_NAME=$(echo $CKPT | rev | cut -d'/' -f1 | rev)
+
+gpu_list="${CUDA_VISIBLE_DEVICES:-0}"
+IFS=',' read -ra GPULIST <<< "$gpu_list"
+
+# divide data via the number of GPUs per task
+GPUS_PER_TASK=1
+CHUNKS=$((${#GPULIST[@]}/$GPUS_PER_TASK))
+
+output_file=${OUTPUT_DIR}/videomme/answers/${CKPT_NAME}/merge.json
+output_sub_file=${OUTPUT_DIR}/videomme/answers/${CKPT_NAME}/merge_sub.json
+
+# if the merged output file is missing or empty, remove any stale per-chunk results
+if [ ! -f "$output_file" ] || [ $(cat "$output_file" | wc -l) -eq 0 ]; then
+ rm -f ${OUTPUT_DIR}/videomme/answers/${CKPT_NAME}/*.json
+fi
+
+
+if [ ! -f "$output_file" ]; then
+ for IDX in $(seq 0 $((CHUNKS-1))); do
+ # select the GPUs for the task
+ gpu_devices=$(IFS=,; echo "${GPULIST[*]:$(($IDX*$GPUS_PER_TASK)):$GPUS_PER_TASK}")
+ TRANSFORMERS_OFFLINE=1 CUDA_VISIBLE_DEVICES=${gpu_devices} python3 videollama2/eval/inference_video_mcqa_videomme.py \
+ --model-path ${CKPT} \
+ --video-folder ${EVAL_DATA_DIR}/videomme/videos \
+ --subtitle-folder ${EVAL_DATA_DIR}/videomme/subtitles \
+ --question-file ${EVAL_DATA_DIR}/videomme/test-00000-of-00001.parquet \
+ --answer-file ${OUTPUT_DIR}/videomme/answers/${CKPT_NAME}/${CHUNKS}_${IDX}.json \
+ --num-chunks $CHUNKS \
+ --chunk-idx $IDX &
+ done
+
+ wait
+
+ # Clear out the output file if it exists.
+ > "$output_file"
+
+ echo "[" >> "$output_file"
+
+ #Loop through the indices and concatenate each file.
+ for IDX in $(seq 0 $((CHUNKS-1))); do
+ cat ${OUTPUT_DIR}/videomme/answers/${CKPT_NAME}/${CHUNKS}_${IDX}.json >> "$output_file"
+ done
+
+ sed -i '$s/.$//' $output_file
+
+ echo "]" >> "$output_file"
+
+ # Clear out the output file if it exists.
+ > "$output_sub_file"
+
+ echo "[" >> "$output_sub_file"
+
+ #Loop through the indices and concatenate each file.
+ for IDX in $(seq 0 $((CHUNKS-1))); do
+ cat ${OUTPUT_DIR}/videomme/answers/${CKPT_NAME}/${CHUNKS}_${IDX}_sub.json >> "$output_sub_file"
+ done
+
+ sed -i '$s/.$//' $output_sub_file
+
+ echo "]" >> "$output_sub_file"
+fi
+
+
+python videollama2/eval/eval_video_mcqa_videomme.py \
+ --results_file $output_file \
+ --video_duration_type "short,medium,long" \
+ --return_categories_accuracy \
+ --return_sub_categories_accuracy \
+ --return_task_types_accuracy \
+ --skip_missing \
+
+python videollama2/eval/eval_video_mcqa_videomme.py \
+ --results_file $output_sub_file \
+ --video_duration_type "short,medium,long" \
+ --return_categories_accuracy \
+ --return_sub_categories_accuracy \
+ --return_task_types_accuracy \
+ --skip_missing \
diff --git a/VideoLLaMA2/scripts/eval/eval_video_oqa_vcgpt_1_correctness.sh b/VideoLLaMA2/scripts/eval/eval_video_oqa_vcgpt_1_correctness.sh
new file mode 100644
index 0000000000000000000000000000000000000000..038404f1bfe51161f02bc3966d720bafaadc83ec
--- /dev/null
+++ b/VideoLLaMA2/scripts/eval/eval_video_oqa_vcgpt_1_correctness.sh
@@ -0,0 +1,58 @@
+set -x
+
+EVAL_DATA_DIR=dataset/videollm_eval
+OUTPUT_DIR=eval
+CKPT=DAMO-NLP-SG/VideoLLaMA2-7B
+CKPT_NAME=$(echo $CKPT | rev | cut -d'/' -f1 | rev)
+
+gpu_list="${CUDA_VISIBLE_DEVICES:-0}"
+IFS=',' read -ra GPULIST <<< "$gpu_list"
+
+# divide data via the number of GPUs per task
+GPUS_PER_TASK=1
+CHUNKS=$((${#GPULIST[@]}/$GPUS_PER_TASK))
+
+output_file=${OUTPUT_DIR}/videochatgpt_gen/answers/correctness/${CKPT_NAME}/merge.json
+
+if [ ! -f "$output_file" ]; then
+ for IDX in $(seq 0 $((CHUNKS-1))); do
+ # select the GPUs for the task
+ gpu_devices=$(IFS=,; echo "${GPULIST[*]:$(($IDX*$GPUS_PER_TASK)):$GPUS_PER_TASK}")
+ TRANSFORMERS_OFFLINE=1 CUDA_VISIBLE_DEVICES=${gpu_devices} python3 videollama2/new_eval/inference_video_oqa_vcgpt_general.py \
+ --model-path ${CKPT} \
+ --video-folder ${EVAL_DATA_DIR}/videochatgpt_gen/Test_Videos \
+ --question-file ${EVAL_DATA_DIR}/videochatgpt_gen/generic_qa.json \
+ --answer-file ${OUTPUT_DIR}/videochatgpt_gen/answers/correctness/${CKPT_NAME}/${CHUNKS}_${IDX}.json \
+ --num-chunks $CHUNKS \
+ --chunk-idx $IDX &
+ done
+
+ wait
+
+ # Clear out the output file if it exists.
+ > "$output_file"
+
+ #Loop through the indices and concatenate each file.
+ for IDX in $(seq 0 $((CHUNKS-1))); do
+ cat ${OUTPUT_DIR}/videochatgpt_gen/answers/correctness/${CKPT_NAME}/${CHUNKS}_${IDX}.json >> "$output_file"
+ done
+
+ mkdir -p ${OUTPUT_DIR}/videochatgpt_gen/answers/detail/${CKPT_NAME}
+ mkdir -p ${OUTPUT_DIR}/videochatgpt_gen/answers/context/${CKPT_NAME}
+ cp ${output_file} ${OUTPUT_DIR}/videochatgpt_gen/answers/detail/${CKPT_NAME}/merge.json
+ cp ${output_file} ${OUTPUT_DIR}/videochatgpt_gen/answers/context/${CKPT_NAME}/merge.json
+fi
+
+
+AZURE_API_KEY=your_key
+AZURE_API_ENDPOINT=your_endpoint
+AZURE_API_DEPLOYNAME=your_deployname
+
+python3 videollama2/new_eval/eval_video_oqa_vcgpt_1_correctness.py \
+ --pred-path ${output_file} \
+ --output-dir ${OUTPUT_DIR}/videochatgpt_gen/answers/correctness/${CKPT_NAME}/gpt \
+ --output-json ${OUTPUT_DIR}/videochatgpt_gen/answers/correctness/${CKPT_NAME}/results.json \
+ --api-key $AZURE_API_KEY \
+ --api-endpoint $AZURE_API_ENDPOINT \
+ --api-deployname $AZURE_API_DEPLOYNAME \
+ --num-tasks 4
diff --git a/VideoLLaMA2/scripts/eval/eval_video_oqa_vcgpt_2_detail.sh b/VideoLLaMA2/scripts/eval/eval_video_oqa_vcgpt_2_detail.sh
new file mode 100644
index 0000000000000000000000000000000000000000..e24921e6ea558d8d98e5499870b53ff54ecd970e
--- /dev/null
+++ b/VideoLLaMA2/scripts/eval/eval_video_oqa_vcgpt_2_detail.sh
@@ -0,0 +1,58 @@
+set -x
+
+EVAL_DATA_DIR=dataset/videollm_eval
+OUTPUT_DIR=eval
+CKPT=DAMO-NLP-SG/VideoLLaMA2-7B
+CKPT_NAME=$(echo $CKPT | rev | cut -d'/' -f1 | rev)
+
+gpu_list="${CUDA_VISIBLE_DEVICES:-0}"
+IFS=',' read -ra GPULIST <<< "$gpu_list"
+
+# divide data via the number of GPUs per task
+GPUS_PER_TASK=1
+CHUNKS=$((${#GPULIST[@]}/$GPUS_PER_TASK))
+
+output_file=${OUTPUT_DIR}/videochatgpt_gen/answers/detail/${CKPT_NAME}/merge.json
+
+if [ ! -f "$output_file" ]; then
+ for IDX in $(seq 0 $((CHUNKS-1))); do
+ # select the GPUs for the task
+ gpu_devices=$(IFS=,; echo "${GPULIST[*]:$(($IDX*$GPUS_PER_TASK)):$GPUS_PER_TASK}")
+ TRANSFORMERS_OFFLINE=1 CUDA_VISIBLE_DEVICES=${gpu_devices} python3 videollama2/eval/inference_video_oqa_vcgpt_general.py \
+ --model-path ${CKPT} \
+ --video-folder ${EVAL_DATA_DIR}/videochatgpt_gen/Test_Videos \
+ --question-file ${EVAL_DATA_DIR}/videochatgpt_gen/generic_qa.json \
+ --answer-file ${OUTPUT_DIR}/videochatgpt_gen/answers/detail/${CKPT_NAME}/${CHUNKS}_${IDX}.json \
+ --num-chunks $CHUNKS \
+ --chunk-idx $IDX &
+ done
+
+ wait
+
+ # Clear out the output file if it exists.
+ > "$output_file"
+
+ #Loop through the indices and concatenate each file.
+ for IDX in $(seq 0 $((CHUNKS-1))); do
+ cat ${OUTPUT_DIR}/videochatgpt_gen/answers/detail/${CKPT_NAME}/${CHUNKS}_${IDX}.json >> "$output_file"
+ done
+
+ mkdir -p ${OUTPUT_DIR}/videochatgpt_gen/answers/correctness/${CKPT_NAME}
+ mkdir -p ${OUTPUT_DIR}/videochatgpt_gen/answers/context/${CKPT_NAME}
+ cp ${output_file} ${OUTPUT_DIR}/videochatgpt_gen/answers/correctness/${CKPT_NAME}/merge.json
+ cp ${output_file} ${OUTPUT_DIR}/videochatgpt_gen/answers/context/${CKPT_NAME}/merge.json
+fi
+
+
+AZURE_API_KEY=your_key
+AZURE_API_ENDPOINT=your_endpoint
+AZURE_API_DEPLOYNAME=your_deployname
+
+python3 videollama2/eval/eval_video_oqa_vcgpt_2_detail.py \
+ --pred-path ${output_file} \
+ --output-dir ${OUTPUT_DIR}/videochatgpt_gen/answers/detail/${CKPT_NAME}/gpt \
+ --output-json ${OUTPUT_DIR}/videochatgpt_gen/answers/detail/${CKPT_NAME}/results.json \
+ --api-key $AZURE_API_KEY \
+ --api-endpoint $AZURE_API_ENDPOINT \
+ --api-deployname $AZURE_API_DEPLOYNAME \
+ --num-tasks 4
diff --git a/VideoLLaMA2/scripts/eval/eval_video_oqa_vcgpt_3_context.sh b/VideoLLaMA2/scripts/eval/eval_video_oqa_vcgpt_3_context.sh
new file mode 100644
index 0000000000000000000000000000000000000000..6ce634a77cff7b42f797aa365079793255d40543
--- /dev/null
+++ b/VideoLLaMA2/scripts/eval/eval_video_oqa_vcgpt_3_context.sh
@@ -0,0 +1,58 @@
+set -x
+
+EVAL_DATA_DIR=dataset/videollm_eval
+OUTPUT_DIR=eval
+CKPT=DAMO-NLP-SG/VideoLLaMA2-7B
+CKPT_NAME=$(echo $CKPT | rev | cut -d'/' -f1 | rev)
+
+gpu_list="${CUDA_VISIBLE_DEVICES:-0}"
+IFS=',' read -ra GPULIST <<< "$gpu_list"
+
+# divide data via the number of GPUs per task
+GPUS_PER_TASK=1
+CHUNKS=$((${#GPULIST[@]}/$GPUS_PER_TASK))
+
+output_file=${OUTPUT_DIR}/videochatgpt_gen/answers/context/${CKPT_NAME}/merge.json
+
+if [ ! -f "$output_file" ]; then
+ for IDX in $(seq 0 $((CHUNKS-1))); do
+ # select the GPUs for the task
+ gpu_devices=$(IFS=,; echo "${GPULIST[*]:$(($IDX*$GPUS_PER_TASK)):$GPUS_PER_TASK}")
+ TRANSFORMERS_OFFLINE=1 CUDA_VISIBLE_DEVICES=${gpu_devices} python3 videollama2/eval/inference_video_oqa_vcgpt_general.py \
+ --model-path ${CKPT} \
+ --video-folder ${EVAL_DATA_DIR}/videochatgpt_gen/Test_Videos \
+ --question-file ${EVAL_DATA_DIR}/videochatgpt_gen/generic_qa.json \
+ --answer-file ${OUTPUT_DIR}/videochatgpt_gen/answers/context/${CKPT_NAME}/${CHUNKS}_${IDX}.json \
+ --num-chunks $CHUNKS \
+ --chunk-idx $IDX &
+ done
+
+ wait
+
+ # Clear out the output file if it exists.
+ > "$output_file"
+
+ #Loop through the indices and concatenate each file.
+ for IDX in $(seq 0 $((CHUNKS-1))); do
+ cat ${OUTPUT_DIR}/videochatgpt_gen/answers/context/${CKPT_NAME}/${CHUNKS}_${IDX}.json >> "$output_file"
+ done
+
+ mkdir -p ${OUTPUT_DIR}/videochatgpt_gen/answers/correctness/${CKPT_NAME}
+ mkdir -p ${OUTPUT_DIR}/videochatgpt_gen/answers/detail/${CKPT_NAME}
+ cp ${output_file} ${OUTPUT_DIR}/videochatgpt_gen/answers/correctness/${CKPT_NAME}/merge.json
+ cp ${output_file} ${OUTPUT_DIR}/videochatgpt_gen/answers/detail/${CKPT_NAME}/merge.json
+fi
+
+
+AZURE_API_KEY=your_key
+AZURE_API_ENDPOINT=your_endpoint
+AZURE_API_DEPLOYNAME=your_deployname
+
+python3 videollama2/eval/eval_video_oqa_vcgpt_3_context.py \
+ --pred-path ${output_file} \
+ --output-dir ${OUTPUT_DIR}/videochatgpt_gen/answers/context/${CKPT_NAME}/gpt \
+ --output-json ${OUTPUT_DIR}/videochatgpt_gen/answers/context/${CKPT_NAME}/results.json \
+ --api-key $AZURE_API_KEY \
+ --api-endpoint $AZURE_API_ENDPOINT \
+ --api-deployname $AZURE_API_DEPLOYNAME \
+ --num-tasks 4
diff --git a/VideoLLaMA2/scripts/eval/eval_video_oqa_vcgpt_4_temporal.sh b/VideoLLaMA2/scripts/eval/eval_video_oqa_vcgpt_4_temporal.sh
new file mode 100644
index 0000000000000000000000000000000000000000..b868c55c2ce0acc1371563deace2306b88d9f117
--- /dev/null
+++ b/VideoLLaMA2/scripts/eval/eval_video_oqa_vcgpt_4_temporal.sh
@@ -0,0 +1,54 @@
+set -x
+
+EVAL_DATA_DIR=eval
+OUTPUT_DIR=eval_output
+CKPT=DAMO-NLP-SG/VideoLLaMA2-7B
+CKPT_NAME=$(echo $CKPT | rev | cut -d'/' -f1 | rev)
+
+gpu_list="${CUDA_VISIBLE_DEVICES:-0}"
+IFS=',' read -ra GPULIST <<< "$gpu_list"
+
+# divide data via the number of GPUs per task
+GPUS_PER_TASK=1
+CHUNKS=$((${#GPULIST[@]}/$GPUS_PER_TASK))
+
+output_file=${OUTPUT_DIR}/videochatgpt_gen/answers/temporal/${CKPT_NAME}/merge.json
+
+# run inference only if the merged output file does not exist yet
+if [ ! -f "$output_file" ]; then
+ for IDX in $(seq 0 $((CHUNKS-1))); do
+ # select the GPUs for the task
+ gpu_devices=$(IFS=,; echo "${GPULIST[*]:$(($IDX*$GPUS_PER_TASK)):$GPUS_PER_TASK}")
+ TRANSFORMERS_OFFLINE=1 CUDA_VISIBLE_DEVICES=${gpu_devices} python3 videollama2/eval/inference_video_oqa_vcgpt_general.py \
+ --model-path ${CKPT} \
+ --video-folder ${EVAL_DATA_DIR}/videochatgpt_gen/Test_Videos \
+ --question-file ${EVAL_DATA_DIR}/videochatgpt_gen/temporal_qa.json \
+ --answer-file ${OUTPUT_DIR}/videochatgpt_gen/answers/temporal/${CKPT_NAME}/${CHUNKS}_${IDX}.json \
+ --num-chunks $CHUNKS \
+ --chunk-idx $IDX &
+ done
+
+ wait
+
+ # Clear out the output file if it exists.
+ > "$output_file"
+
+ #Loop through the indices and concatenate each file.
+ for IDX in $(seq 0 $((CHUNKS-1))); do
+ cat ${OUTPUT_DIR}/videochatgpt_gen/answers/temporal/${CKPT_NAME}/${CHUNKS}_${IDX}.json >> "$output_file"
+ done
+fi
+
+
+AZURE_API_KEY=your_key
+AZURE_API_ENDPOINT=your_endpoint
+AZURE_API_DEPLOYNAME=your_deployname
+
+python3 videollama2/eval/eval_video_oqa_vcgpt_4_temporal.py \
+ --pred-path ${output_file} \
+ --output-dir ${OUTPUT_DIR}/videochatgpt_gen/answers/temporal/${CKPT_NAME}/gpt \
+ --output-json ${OUTPUT_DIR}/videochatgpt_gen/answers/temporal/${CKPT_NAME}/results.json \
+ --api-key $AZURE_API_KEY \
+ --api-endpoint $AZURE_API_ENDPOINT \
+ --api-deployname $AZURE_API_DEPLOYNAME \
+ --num-tasks 4
diff --git a/VideoLLaMA2/scripts/eval/eval_video_oqa_vcgpt_5_consistency.sh b/VideoLLaMA2/scripts/eval/eval_video_oqa_vcgpt_5_consistency.sh
new file mode 100644
index 0000000000000000000000000000000000000000..185521f75b0f548a1540e40b4d3c1bf607d4ede4
--- /dev/null
+++ b/VideoLLaMA2/scripts/eval/eval_video_oqa_vcgpt_5_consistency.sh
@@ -0,0 +1,54 @@
+set -x
+
+EVAL_DATA_DIR=eval
+OUTPUT_DIR=eval_output
+CKPT=DAMO-NLP-SG/VideoLLaMA2-7B
+CKPT_NAME=$(echo $CKPT | rev | cut -d'/' -f1 | rev)
+
+gpu_list="${CUDA_VISIBLE_DEVICES:-0}"
+IFS=',' read -ra GPULIST <<< "$gpu_list"
+
+# divide data via the number of GPUs per task
+GPUS_PER_TASK=1
+CHUNKS=$((${#GPULIST[@]}/$GPUS_PER_TASK))
+
+output_file=${OUTPUT_DIR}/videochatgpt_gen/answers/consistency/${CKPT_NAME}/merge.json
+
+# run inference only if the merged output file does not exist yet
+if [ ! -f "$output_file" ]; then
+ for IDX in $(seq 0 $((CHUNKS-1))); do
+ # select the GPUs for the task
+ gpu_devices=$(IFS=,; echo "${GPULIST[*]:$(($IDX*$GPUS_PER_TASK)):$GPUS_PER_TASK}")
+ TRANSFORMERS_OFFLINE=1 CUDA_VISIBLE_DEVICES=${gpu_devices} python3 videollama2/eval/inference_video_oqa_vcgpt_consistency.py \
+ --model-path ${CKPT} \
+ --video-folder ${EVAL_DATA_DIR}/videochatgpt_gen/Test_Videos \
+ --question-file ${EVAL_DATA_DIR}/videochatgpt_gen/consistency_qa.json \
+ --answer-file ${OUTPUT_DIR}/videochatgpt_gen/answers/consistency/${CKPT_NAME}/${CHUNKS}_${IDX}.json \
+ --num-chunks $CHUNKS \
+ --chunk-idx $IDX &
+ done
+
+ wait
+
+ # Clear out the output file if it exists.
+ > "$output_file"
+
+ #Loop through the indices and concatenate each file.
+ for IDX in $(seq 0 $((CHUNKS-1))); do
+ cat ${OUTPUT_DIR}/videochatgpt_gen/answers/consistency/${CKPT_NAME}/${CHUNKS}_${IDX}.json >> "$output_file"
+ done
+fi
+
+
+AZURE_API_KEY=your_key
+AZURE_API_ENDPOINT=your_endpoint
+AZURE_API_DEPLOYNAME=your_deployname
+
+python3 videollama2/eval/eval_video_oqa_vcgpt_5_consistency.py \
+ --pred-path ${output_file} \
+ --output-dir ${OUTPUT_DIR}/videochatgpt_gen/answers/consistency/${CKPT_NAME}/gpt \
+ --output-json ${OUTPUT_DIR}/videochatgpt_gen/answers/consistency/${CKPT_NAME}/results.json \
+ --api-key $AZURE_API_KEY \
+ --api-endpoint $AZURE_API_ENDPOINT \
+ --api-deployname $AZURE_API_DEPLOYNAME \
+ --num-tasks 4
diff --git a/VideoLLaMA2/scripts/eval/eval_video_oqa_vcgpt_activitynet.sh b/VideoLLaMA2/scripts/eval/eval_video_oqa_vcgpt_activitynet.sh
new file mode 100644
index 0000000000000000000000000000000000000000..253cc5457bfbef9f4e2f070ebe7ff5c88516f980
--- /dev/null
+++ b/VideoLLaMA2/scripts/eval/eval_video_oqa_vcgpt_activitynet.sh
@@ -0,0 +1,54 @@
+set -x
+
+EVAL_DATA_DIR=eval
+OUTPUT_DIR=eval_output
+CKPT=DAMO-NLP-SG/VideoLLaMA2-7B
+CKPT_NAME=$(echo $CKPT | rev | cut -d'/' -f1 | rev)
+
+gpu_list="${CUDA_VISIBLE_DEVICES:-0}"
+IFS=',' read -ra GPULIST <<< "$gpu_list"
+
+# divide data via the number of GPUs per task
+GPUS_PER_TASK=1
+CHUNKS=$((${#GPULIST[@]}/$GPUS_PER_TASK))
+
+output_file=${OUTPUT_DIR}/Activitynet_Zero_Shot_QA/answers/${CKPT_NAME}/merge.json
+
+if [ ! -f "$output_file" ]; then
+ for IDX in $(seq 0 $((CHUNKS-1))); do
+ # select the GPUs for the task
+ gpu_devices=$(IFS=,; echo "${GPULIST[*]:$(($IDX*$GPUS_PER_TASK)):$GPUS_PER_TASK}")
+ TRANSFORMERS_OFFLINE=1 CUDA_VISIBLE_DEVICES=${gpu_devices} python3 videollama2/eval/inference_video_oqa_activitynet.py \
+ --model-path ${CKPT} \
+ --video-folder ${EVAL_DATA_DIR}/Activitynet_Zero_Shot_QA/all_test \
+ --question-file ${EVAL_DATA_DIR}/Activitynet_Zero_Shot_QA/test_q.json \
+ --answer-file ${EVAL_DATA_DIR}/Activitynet_Zero_Shot_QA/test_a.json \
+ --output-file ${OUTPUT_DIR}/Activitynet_Zero_Shot_QA/answers/${CKPT_NAME}/${CHUNKS}_${IDX}.json \
+ --num-chunks $CHUNKS \
+ --chunk-idx $IDX &
+ done
+
+ wait
+
+ # Clear out the output file if it exists.
+ > "$output_file"
+
+ #Loop through the indices and concatenate each file.
+ for IDX in $(seq 0 $((CHUNKS-1))); do
+ cat ${OUTPUT_DIR}/Activitynet_Zero_Shot_QA/answers/${CKPT_NAME}/${CHUNKS}_${IDX}.json >> "$output_file"
+ done
+fi
+
+
+AZURE_API_KEY=your_key
+AZURE_API_ENDPOINT=your_endpoint
+AZURE_API_DEPLOYNAME=your_deployname
+
+python3 videollama2/eval/eval_video_oqa_activitynet.py \
+ --pred-path ${output_file} \
+ --output-dir ${OUTPUT_DIR}/Activitynet_Zero_Shot_QA/answers/${CKPT_NAME}/gpt \
+ --output-json ${OUTPUT_DIR}/Activitynet_Zero_Shot_QA/answers/${CKPT_NAME}/results.json \
+ --api-key $AZURE_API_KEY \
+ --api-endpoint $AZURE_API_ENDPOINT \
+ --api-deployname $AZURE_API_DEPLOYNAME \
+ --num-tasks 4
diff --git a/VideoLLaMA2/scripts/eval/eval_video_oqa_vcgpt_msvd.sh b/VideoLLaMA2/scripts/eval/eval_video_oqa_vcgpt_msvd.sh
new file mode 100644
index 0000000000000000000000000000000000000000..9d2908048e7a157cb73d7f4ee603aef5e533d8da
--- /dev/null
+++ b/VideoLLaMA2/scripts/eval/eval_video_oqa_vcgpt_msvd.sh
@@ -0,0 +1,54 @@
+set -x
+
+EVAL_DATA_DIR=eval
+OUTPUT_DIR=eval_output
+CKPT=DAMO-NLP-SG/VideoLLaMA2-7B
+CKPT_NAME=$(echo $CKPT | rev | cut -d'/' -f1 | rev)
+
+gpu_list="${CUDA_VISIBLE_DEVICES:-0}"
+IFS=',' read -ra GPULIST <<< "$gpu_list"
+
+# divide data via the number of GPUs per task
+GPUS_PER_TASK=1
+CHUNKS=$((${#GPULIST[@]}/$GPUS_PER_TASK))
+
+output_file=${OUTPUT_DIR}/MSVD_Zero_Shot_QA/answers/${CKPT_NAME}/merge.json
+
+if [ ! -f "$output_file" ]; then
+ for IDX in $(seq 0 $((CHUNKS-1))); do
+ # select the GPUs for the task
+ gpu_devices=$(IFS=,; echo "${GPULIST[*]:$(($IDX*$GPUS_PER_TASK)):$GPUS_PER_TASK}")
+ TRANSFORMERS_OFFLINE=1 CUDA_VISIBLE_DEVICES=${gpu_devices} python3 videollama2/eval/inference_video_oqa_activitynet.py \
+ --model-path ${CKPT} \
+ --video-folder ${EVAL_DATA_DIR}/MSVD_Zero_Shot_QA/videos \
+ --question-file ${EVAL_DATA_DIR}/MSVD_Zero_Shot_QA/test_q.json \
+ --answer-file ${EVAL_DATA_DIR}/MSVD_Zero_Shot_QA/test_a.json \
+ --output-file ${OUTPUT_DIR}/MSVD_Zero_Shot_QA/answers/${CKPT_NAME}/${CHUNKS}_${IDX}.json \
+ --num-chunks $CHUNKS \
+ --chunk-idx $IDX &
+ done
+
+ wait
+
+ # Clear out the output file if it exists.
+ > "$output_file"
+
+ #Loop through the indices and concatenate each file.
+ for IDX in $(seq 0 $((CHUNKS-1))); do
+ cat ${OUTPUT_DIR}/MSVD_Zero_Shot_QA/answers/${CKPT_NAME}/${CHUNKS}_${IDX}.json >> "$output_file"
+ done
+fi
+
+
+AZURE_API_KEY=your_key
+AZURE_API_ENDPOINT=your_endpoint
+AZURE_API_DEPLOYNAME=your_deployname
+
+python3 videollama2/eval/eval_video_oqa_activitynet.py \
+ --pred-path ${output_file} \
+ --output-dir ${OUTPUT_DIR}/MSVD_Zero_Shot_QA/answers/${CKPT_NAME}/gpt \
+ --output-json ${OUTPUT_DIR}/MSVD_Zero_Shot_QA/answers/${CKPT_NAME}/results.json \
+ --api-key $AZURE_API_KEY \
+ --api-endpoint $AZURE_API_ENDPOINT \
+ --api-deployname $AZURE_API_DEPLOYNAME \
+ --num-tasks 4
diff --git a/VideoLLaMA2/scripts/siglip/finetune_gemma2.sh b/VideoLLaMA2/scripts/siglip/finetune_gemma2.sh
new file mode 100644
index 0000000000000000000000000000000000000000..3d2034c785e7f54155104fc38be9019b797e7d67
--- /dev/null
+++ b/VideoLLaMA2/scripts/siglip/finetune_gemma2.sh
@@ -0,0 +1,75 @@
+#!/bin/bash
+
+# Environment Variables
+ARG_WORLD_SIZE=${1:-1}
+ARG_NPROC_PER_NODE=${2:-8}
+ARG_MASTER_ADDR="127.0.0.1"
+ARG_MASTER_PORT=16667
+ARG_RANK=0
+
+# Multiple conditions
+if [ ! -n "$WORLD_SIZE" ] || [ ! -n "$NPROC_PER_NODE" ]; then
+ WORLD_SIZE=$ARG_WORLD_SIZE
+ NPROC_PER_NODE=$ARG_NPROC_PER_NODE
+fi
+if [ ! -n "$MASTER_ADDR" ] || [ ! -n "$MASTER_PORT" ] || [ ! -n "$RANK" ]; then
+ MASTER_ADDR=$ARG_MASTER_ADDR
+ MASTER_PORT=$ARG_MASTER_PORT
+ RANK=$ARG_RANK
+fi
+
+echo "WORLD_SIZE: $WORLD_SIZE"
+echo "NPROC_PER_NODE: $NPROC_PER_NODE"
+
+# Training Arguments
+GLOBAL_BATCH_SIZE=128
+LOCAL_BATCH_SIZE=4
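+# Accumulate gradients so that WORLD_SIZE x NPROC_PER_NODE x LOCAL_BATCH_SIZE x steps
+# equals GLOBAL_BATCH_SIZE, e.g. 128 / (1 x 8 x 4) = 4 accumulation steps.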
+GRADIENT_ACCUMULATION_STEPS=$[$GLOBAL_BATCH_SIZE/($WORLD_SIZE*$NPROC_PER_NODE*$LOCAL_BATCH_SIZE)]
+echo $GRADIENT_ACCUMULATION_STEPS
+
+# Log Arguments
+export TRANSFORMERS_OFFLINE=1
+export WANDB_PROJECT=videollama2gemma2_siglip
+RUN_NAME=vllava_settings
+DATA_DIR=datasets
+OUTP_DIR=work_dirs
+
+torchrun --nnodes $WORLD_SIZE \
+ --nproc_per_node $NPROC_PER_NODE \
+ --master_addr=$MASTER_ADDR \
+ --master_port=$MASTER_PORT \
+ --node_rank $RANK \
+ videollama2/train_flash_attn.py \
+ --deepspeed scripts/zero3.json \
+ --model_type videollama2_gemma2 \
+ --model_path google/gemma-2-2b-it \
+ --vision_tower google/siglip-so400m-patch14-384 \
+ --mm_projector_type stc_connector_v35 \
+ --pretrain_mm_mlp_adapter ${OUTP_DIR}/${WANDB_PROJECT}/pretrain_${RUN_NAME}/mm_projector.bin \
+ --data_path ${DATA_DIR}/videollava_sft/videochatgpt_llavaimage_tune.json \
+ --data_folder ${DATA_DIR}/videollava_sft/ \
+ --mm_vision_select_layer -2 \
+ --image_aspect_ratio pad \
+ --num_frames 8 \
+ --bf16 True \
+ --tf32 True \
+ --fp16 False \
+ --output_dir ${OUTP_DIR}/${WANDB_PROJECT}/finetune_${RUN_NAME} \
+ --num_train_epochs 3 \
+ --per_device_train_batch_size $LOCAL_BATCH_SIZE \
+ --per_device_eval_batch_size 4 \
+ --gradient_accumulation_steps $GRADIENT_ACCUMULATION_STEPS \
+ --evaluation_strategy "no" \
+ --save_strategy "steps" \
+ --save_steps 200 \
+ --save_total_limit 99 \
+ --learning_rate 2e-5 \
+ --weight_decay 0. \
+ --warmup_ratio 0.03 \
+ --lr_scheduler_type "cosine" \
+ --logging_steps 1 \
+ --model_max_length 2048 \
+ --gradient_checkpointing True \
+ --dataloader_num_workers 4 \
+ --report_to tensorboard \
+ --run_name finetune_$RUN_NAME \
diff --git a/VideoLLaMA2/scripts/siglip/finetune_mistral.sh b/VideoLLaMA2/scripts/siglip/finetune_mistral.sh
new file mode 100644
index 0000000000000000000000000000000000000000..9a0487cced47fcc6349d3d6e863e2bb1334e86cc
--- /dev/null
+++ b/VideoLLaMA2/scripts/siglip/finetune_mistral.sh
@@ -0,0 +1,75 @@
+#!/bin/bash
+
+# Environment Variables
+ARG_WORLD_SIZE=${1:-1}
+ARG_NPROC_PER_NODE=${2:-8}
+ARG_MASTER_ADDR="127.0.0.1"
+ARG_MASTER_PORT=16667
+ARG_RANK=0
+
+# Multiple conditions
+if [ ! -n "$WORLD_SIZE" ] || [ ! -n "$NPROC_PER_NODE" ]; then
+ WORLD_SIZE=$ARG_WORLD_SIZE
+ NPROC_PER_NODE=$ARG_NPROC_PER_NODE
+fi
+if [ ! -n "$MASTER_ADDR" ] || [ ! -n "$MASTER_PORT" ] || [ ! -n "$RANK" ]; then
+ MASTER_ADDR=$ARG_MASTER_ADDR
+ MASTER_PORT=$ARG_MASTER_PORT
+ RANK=$ARG_RANK
+fi
+
+echo "WORLD_SIZE: $WORLD_SIZE"
+echo "NPROC_PER_NODE: $NPROC_PER_NODE"
+
+# Training Arguments
+GLOBAL_BATCH_SIZE=128
+LOCAL_BATCH_SIZE=4
+GRADIENT_ACCUMULATION_STEPS=$[$GLOBAL_BATCH_SIZE/($WORLD_SIZE*$NPROC_PER_NODE*$LOCAL_BATCH_SIZE)]
+echo $GRADIENT_ACCUMULATION_STEPS
+
+# Log Arguments
+export TRANSFORMERS_OFFLINE=1
+export WANDB_PROJECT=videollama2mistral_siglip
+RUN_NAME=vllava_settings
+DATA_DIR=datasets
+OUTP_DIR=work_dirs
+
+torchrun --nnodes $WORLD_SIZE \
+ --nproc_per_node $NPROC_PER_NODE \
+ --master_addr=$MASTER_ADDR \
+ --master_port=$MASTER_PORT \
+ --node_rank $RANK \
+ videollama2/train_flash_attn.py \
+ --deepspeed scripts/zero3.json \
+ --model_type videollama2 \
+ --model_path mistralai/Mistral-7B-Instruct-v0.2 \
+ --vision_tower google/siglip-so400m-patch14-384 \
+ --mm_projector_type stc_connector_v35 \
+ --pretrain_mm_mlp_adapter ${OUTP_DIR}/${WANDB_PROJECT}/pretrain_${RUN_NAME}/mm_projector.bin \
+ --data_path ${DATA_DIR}/videollava_sft/videochatgpt_llavaimage_tune.json \
+ --data_folder ${DATA_DIR}/videollava_sft/ \
+ --mm_vision_select_layer -2 \
+ --image_aspect_ratio pad \
+ --num_frames 8 \
+ --bf16 True \
+ --tf32 True \
+ --fp16 False \
+ --output_dir ${OUTP_DIR}/${WANDB_PROJECT}/finetune_${RUN_NAME} \
+ --num_train_epochs 3 \
+ --per_device_train_batch_size $LOCAL_BATCH_SIZE \
+ --per_device_eval_batch_size 4 \
+ --gradient_accumulation_steps $GRADIENT_ACCUMULATION_STEPS \
+ --evaluation_strategy "no" \
+ --save_strategy "steps" \
+ --save_steps 200 \
+ --save_total_limit 99 \
+ --learning_rate 2e-5 \
+ --weight_decay 0. \
+ --warmup_ratio 0.03 \
+ --lr_scheduler_type "cosine" \
+ --logging_steps 1 \
+ --model_max_length 2048 \
+ --gradient_checkpointing True \
+ --dataloader_num_workers 4 \
+ --report_to wandb \
+ --run_name finetune_$RUN_NAME \
diff --git a/VideoLLaMA2/scripts/siglip/finetune_phi3.sh b/VideoLLaMA2/scripts/siglip/finetune_phi3.sh
new file mode 100644
index 0000000000000000000000000000000000000000..166eacab5d88fdbc3b532d7edb5ed9f126896750
--- /dev/null
+++ b/VideoLLaMA2/scripts/siglip/finetune_phi3.sh
@@ -0,0 +1,75 @@
+#!/bin/bash
+
+# Environment Variables
+ARG_WORLD_SIZE=${1:-1}
+ARG_NPROC_PER_NODE=${2:-8}
+ARG_MASTER_ADDR="127.0.0.1"
+ARG_MASTER_PORT=16667
+ARG_RANK=0
+
+# Multiple conditions
+if [ ! -n "$WORLD_SIZE" ] || [ ! -n "$NPROC_PER_NODE" ]; then
+ WORLD_SIZE=$ARG_WORLD_SIZE
+ NPROC_PER_NODE=$ARG_NPROC_PER_NODE
+fi
+if [ ! -n "$MASTER_ADDR" ] || [ ! -n "$MASTER_PORT" ] || [ ! -n "$RANK" ]; then
+ MASTER_ADDR=$ARG_MASTER_ADDR
+ MASTER_PORT=$ARG_MASTER_PORT
+ RANK=$ARG_RANK
+fi
+
+echo "WORLD_SIZE: $WORLD_SIZE"
+echo "NPROC_PER_NODE: $NPROC_PER_NODE"
+
+# Training Arguments
+GLOBAL_BATCH_SIZE=128
+LOCAL_BATCH_SIZE=4
+GRADIENT_ACCUMULATION_STEPS=$[$GLOBAL_BATCH_SIZE/($WORLD_SIZE*$NPROC_PER_NODE*$LOCAL_BATCH_SIZE)]
+echo $GRADIENT_ACCUMULATION_STEPS
+
+# Log Arguments
+export TRANSFORMERS_OFFLINE=1
+export WANDB_PROJECT=videollama2phi3_siglip
+RUN_NAME=vllava_settings
+DATA_DIR=datasets
+OUTP_DIR=work_dirs
+
+torchrun --nnodes $WORLD_SIZE \
+ --nproc_per_node $NPROC_PER_NODE \
+ --master_addr=$MASTER_ADDR \
+ --master_port=$MASTER_PORT \
+ --node_rank $RANK \
+ videollama2/train_flash_attn.py \
+ --deepspeed scripts/zero3.json \
+ --model_type videollama2_phi3 \
+ --model_path microsoft/Phi-3-mini-4k-instruct \
+ --vision_tower google/siglip-so400m-patch14-384 \
+ --mm_projector_type stc_connector_v35 \
+ --pretrain_mm_mlp_adapter ${OUTP_DIR}/${WANDB_PROJECT}/pretrain_${RUN_NAME}/mm_projector.bin \
+ --data_path ${DATA_DIR}/videollava_sft/videochatgpt_llavaimage_tune.json \
+ --data_folder ${DATA_DIR}/videollava_sft/ \
+ --mm_vision_select_layer -2 \
+ --image_aspect_ratio pad \
+ --num_frames 8 \
+ --bf16 True \
+ --tf32 True \
+ --fp16 False \
+ --output_dir ${OUTP_DIR}/${WANDB_PROJECT}/finetune_${RUN_NAME} \
+ --num_train_epochs 3 \
+ --per_device_train_batch_size $LOCAL_BATCH_SIZE \
+ --per_device_eval_batch_size 4 \
+ --gradient_accumulation_steps $GRADIENT_ACCUMULATION_STEPS \
+ --evaluation_strategy "no" \
+ --save_strategy "steps" \
+ --save_steps 200 \
+ --save_total_limit 99 \
+ --learning_rate 2e-5 \
+ --weight_decay 0. \
+ --warmup_ratio 0.03 \
+ --lr_scheduler_type "cosine" \
+ --logging_steps 1 \
+ --model_max_length 2048 \
+ --gradient_checkpointing True \
+ --dataloader_num_workers 4 \
+ --report_to tensorboard \
+ --run_name finetune_$RUN_NAME \
diff --git a/VideoLLaMA2/scripts/siglip/finetune_qwen2.sh b/VideoLLaMA2/scripts/siglip/finetune_qwen2.sh
new file mode 100644
index 0000000000000000000000000000000000000000..e232f5ed34655f00588f1aec2c54dd869f04bb39
--- /dev/null
+++ b/VideoLLaMA2/scripts/siglip/finetune_qwen2.sh
@@ -0,0 +1,75 @@
+#!/bin/bash
+
+# Environment Variables
+ARG_WORLD_SIZE=${1:-1}
+ARG_NPROC_PER_NODE=${2:-8}
+ARG_MASTER_ADDR="127.0.0.1"
+ARG_MASTER_PORT=16666
+ARG_RANK=0
+
+# Multiple conditions
+if [ ! -n "$WORLD_SIZE" ] || [ ! -n "$NPROC_PER_NODE" ]; then
+ WORLD_SIZE=$ARG_WORLD_SIZE
+ NPROC_PER_NODE=$ARG_NPROC_PER_NODE
+fi
+if [ ! -n "$MASTER_ADDR" ] || [ ! -n "$MASTER_PORT" ] || [ ! -n "$RANK" ]; then
+ MASTER_ADDR=$ARG_MASTER_ADDR
+ MASTER_PORT=$ARG_MASTER_PORT
+ RANK=$ARG_RANK
+fi
+
+echo "WORLD_SIZE: $WORLD_SIZE"
+echo "NPROC_PER_NODE: $NPROC_PER_NODE"
+
+# Training Arguments
+GLOBAL_BATCH_SIZE=128
+LOCAL_BATCH_SIZE=4
+GRADIENT_ACCUMULATION_STEPS=$[$GLOBAL_BATCH_SIZE/($WORLD_SIZE*$NPROC_PER_NODE*$LOCAL_BATCH_SIZE)]
+echo $GRADIENT_ACCUMULATION_STEPS
+
+# Log Arguments
+export TRANSFORMERS_OFFLINE=1
+export WANDB_PROJECT=videollama2qwen2_siglip
+RUN_NAME=vllava_settings
+DATA_DIR=datasets
+OUTP_DIR=work_dirs
+
+torchrun --nnodes $WORLD_SIZE \
+ --nproc_per_node $NPROC_PER_NODE \
+ --master_addr=$MASTER_ADDR \
+ --master_port=$MASTER_PORT \
+ --node_rank $RANK \
+ videollama2/train_flash_attn.py \
+ --deepspeed scripts/zero3.json \
+ --model_type videollama2_qwen2 \
+ --model_path Qwen/Qwen2-7B-Instruct \
+ --vision_tower google/siglip-so400m-patch14-384 \
+ --mm_projector_type stc_connector_v35 \
+ --pretrain_mm_mlp_adapter ${OUTP_DIR}/${WANDB_PROJECT}/pretrain_${RUN_NAME}/mm_projector.bin \
+ --data_path ${DATA_DIR}/videollava_sft/videochatgpt_llavaimage_tune.json \
+ --data_folder ${DATA_DIR}/videollava_sft/ \
+ --mm_vision_select_layer -2 \
+ --image_aspect_ratio pad \
+ --num_frames 8 \
+ --bf16 True \
+ --tf32 True \
+ --fp16 False \
+ --output_dir ${OUTP_DIR}/${WANDB_PROJECT}/finetune_${RUN_NAME} \
+ --num_train_epochs 1 \
+ --per_device_train_batch_size $LOCAL_BATCH_SIZE \
+ --per_device_eval_batch_size 4 \
+ --gradient_accumulation_steps $GRADIENT_ACCUMULATION_STEPS \
+ --evaluation_strategy "no" \
+ --save_strategy "steps" \
+ --save_steps 500 \
+ --save_total_limit 99 \
+ --learning_rate 2e-5 \
+ --weight_decay 0. \
+ --warmup_ratio 0.03 \
+ --lr_scheduler_type "cosine" \
+ --logging_steps 1 \
+ --model_max_length 2048 \
+ --gradient_checkpointing True \
+ --dataloader_num_workers 4 \
+ --report_to tensorboard \
+ --run_name $RUN_NAME \
diff --git a/VideoLLaMA2/scripts/siglip/pretrain_gemma2.sh b/VideoLLaMA2/scripts/siglip/pretrain_gemma2.sh
new file mode 100644
index 0000000000000000000000000000000000000000..4afc56c4fbc82a1a30f25d22ec5d7da02afc8895
--- /dev/null
+++ b/VideoLLaMA2/scripts/siglip/pretrain_gemma2.sh
@@ -0,0 +1,75 @@
+#!/bin/bash
+
+# Environment Variables
+ARG_WORLD_SIZE=${1:-1}
+ARG_NPROC_PER_NODE=${2:-8}
+ARG_MASTER_ADDR="127.0.0.1"
+ARG_MASTER_PORT=16666
+ARG_RANK=0
+
+# Multiple conditions
+if [ ! -n "$WORLD_SIZE" ] || [ ! -n "$NPROC_PER_NODE" ]; then
+ WORLD_SIZE=$ARG_WORLD_SIZE
+ NPROC_PER_NODE=$ARG_NPROC_PER_NODE
+fi
+if [ ! -n "$MASTER_ADDR" ] || [ ! -n "$MASTER_PORT" ] || [ ! -n "$RANK" ]; then
+ MASTER_ADDR=$ARG_MASTER_ADDR
+ MASTER_PORT=$ARG_MASTER_PORT
+ RANK=$ARG_RANK
+fi
+
+echo "WORLD_SIZE: $WORLD_SIZE"
+echo "NPROC_PER_NODE: $NPROC_PER_NODE"
+
+# Training Arguments
+GLOBAL_BATCH_SIZE=256
+LOCAL_BATCH_SIZE=4
+GRADIENT_ACCUMULATION_STEPS=$[$GLOBAL_BATCH_SIZE/($WORLD_SIZE*$NPROC_PER_NODE*$LOCAL_BATCH_SIZE)]
+echo $GRADIENT_ACCUMULATION_STEPS
+
+# Log Arguments
+export TRANSFORMERS_OFFLINE=1
+export WANDB_PROJECT=videollama2gemma2_siglip
+RUN_NAME=vllava_settings
+DATA_DIR=datasets
+OUTP_DIR=work_dirs
+
+torchrun --nnodes $WORLD_SIZE \
+ --nproc_per_node $NPROC_PER_NODE \
+ --master_addr=$MASTER_ADDR \
+ --master_port=$MASTER_PORT \
+ --node_rank $RANK \
+ videollama2/train_flash_attn.py \
+ --deepspeed scripts/zero3.json \
+ --model_type videollama2_gemma2 \
+ --model_path google/gemma-2-2b-it \
+ --vision_tower google/siglip-so400m-patch14-384 \
+ --mm_projector_type stc_connector_v35 \
+ --tune_mm_mlp_adapter True \
+ --data_path ${DATA_DIR}/videollava_pt/valley_llavaimage.json \
+ --data_folder ${DATA_DIR}/videollava_pt/ \
+ --mm_vision_select_layer -2 \
+ --num_frames 8 \
+ --bf16 True \
+ --tf32 True \
+ --fp16 False \
+ --output_dir ${OUTP_DIR}/${WANDB_PROJECT}/pretrain_${RUN_NAME} \
+ --num_train_epochs 1 \
+ --per_device_train_batch_size $LOCAL_BATCH_SIZE \
+ --per_device_eval_batch_size 4 \
+ --gradient_accumulation_steps $GRADIENT_ACCUMULATION_STEPS \
+ --evaluation_strategy "no" \
+ --save_strategy "steps" \
+ --save_steps 500 \
+ --save_total_limit 99 \
+ --learning_rate 1e-3 \
+ --weight_decay 0. \
+ --warmup_ratio 0.03 \
+ --lr_scheduler_type "cosine" \
+ --logging_steps 1 \
+ --model_max_length 2048 \
+ --gradient_checkpointing True \
+ --dataloader_num_workers 4 \
+ --lazy_preprocess True \
+ --report_to tensorboard \
+ --run_name pretrain_$RUN_NAME \
diff --git a/VideoLLaMA2/scripts/siglip/pretrain_mistral.sh b/VideoLLaMA2/scripts/siglip/pretrain_mistral.sh
new file mode 100644
index 0000000000000000000000000000000000000000..4914e5be8f8242ef13a8623595069d233790b445
--- /dev/null
+++ b/VideoLLaMA2/scripts/siglip/pretrain_mistral.sh
@@ -0,0 +1,75 @@
+#!/bin/bash
+
+# Environment Variables
+ARG_WORLD_SIZE=${1:-1}
+ARG_NPROC_PER_NODE=${2:-8}
+ARG_MASTER_ADDR="127.0.0.1"
+ARG_MASTER_PORT=16666
+ARG_RANK=0
+
+# Multiple conditions
+if [ ! -n "$WORLD_SIZE" ] || [ ! -n "$NPROC_PER_NODE" ]; then
+ WORLD_SIZE=$ARG_WORLD_SIZE
+ NPROC_PER_NODE=$ARG_NPROC_PER_NODE
+fi
+if [ ! -n "$MASTER_ADDR" ] || [ ! -n "$MASTER_PORT" ] || [ ! -n "$RANK" ]; then
+ MASTER_ADDR=$ARG_MASTER_ADDR
+ MASTER_PORT=$ARG_MASTER_PORT
+ RANK=$ARG_RANK
+fi
+
+echo "WORLD_SIZE: $WORLD_SIZE"
+echo "NPROC_PER_NODE: $NPROC_PER_NODE"
+
+# Training Arguments
+GLOBAL_BATCH_SIZE=256
+LOCAL_BATCH_SIZE=8
+GRADIENT_ACCUMULATION_STEPS=$[$GLOBAL_BATCH_SIZE/($WORLD_SIZE*$NPROC_PER_NODE*$LOCAL_BATCH_SIZE)]
+echo $GRADIENT_ACCUMULATION_STEPS
+
+# Log Arguments
+export TRANSFORMERS_OFFLINE=1
+export WANDB_PROJECT=videollama2mistral_siglip
+RUN_NAME=vllava_settings
+DATA_DIR=datasets
+OUTP_DIR=work_dirs
+
+torchrun --nnodes $WORLD_SIZE \
+ --nproc_per_node $NPROC_PER_NODE \
+ --master_addr=$MASTER_ADDR \
+ --master_port=$MASTER_PORT \
+ --node_rank $RANK \
+ videollama2/train_flash_attn.py \
+ --deepspeed scripts/zero3.json \
+ --model_type videollama2 \
+ --model_path mistralai/Mistral-7B-Instruct-v0.2 \
+ --vision_tower google/siglip-so400m-patch14-384 \
+ --mm_projector_type stc_connector_v35 \
+ --tune_mm_mlp_adapter True \
+ --data_path ${DATA_DIR}/videollava_pt/valley_llavaimage.json \
+ --data_folder ${DATA_DIR}/videollava_pt/ \
+ --mm_vision_select_layer -2 \
+ --num_frames 8 \
+ --bf16 True \
+ --tf32 True \
+ --fp16 False \
+ --output_dir ${OUTP_DIR}/${WANDB_PROJECT}/pretrain_${RUN_NAME} \
+ --num_train_epochs 1 \
+ --per_device_train_batch_size $LOCAL_BATCH_SIZE \
+ --per_device_eval_batch_size 4 \
+ --gradient_accumulation_steps $GRADIENT_ACCUMULATION_STEPS \
+ --evaluation_strategy "no" \
+ --save_strategy "steps" \
+ --save_steps 500 \
+ --save_total_limit 99 \
+ --learning_rate 1e-3 \
+ --weight_decay 0. \
+ --warmup_ratio 0.03 \
+ --lr_scheduler_type "cosine" \
+ --logging_steps 1 \
+ --model_max_length 2048 \
+ --gradient_checkpointing True \
+ --dataloader_num_workers 16 \
+ --lazy_preprocess True \
+ --report_to tensorboard \
+ --run_name pretrain_$RUN_NAME \
diff --git a/VideoLLaMA2/scripts/siglip/pretrain_phi3.sh b/VideoLLaMA2/scripts/siglip/pretrain_phi3.sh
new file mode 100644
index 0000000000000000000000000000000000000000..d475fe26fc61b2496a5dc57304e3c58db5e0d02a
--- /dev/null
+++ b/VideoLLaMA2/scripts/siglip/pretrain_phi3.sh
@@ -0,0 +1,75 @@
+#!/bin/bash
+
+# Environment Variables
+ARG_WORLD_SIZE=${1:-1}
+ARG_NPROC_PER_NODE=${2:-8}
+ARG_MASTER_ADDR="127.0.0.1"
+ARG_MASTER_PORT=16666
+ARG_RANK=0
+
+# Multiple conditions
+if [ ! -n "$WORLD_SIZE" ] || [ ! -n "$NPROC_PER_NODE" ]; then
+ WORLD_SIZE=$ARG_WORLD_SIZE
+ NPROC_PER_NODE=$ARG_NPROC_PER_NODE
+fi
+if [ ! -n "$MASTER_ADDR" ] || [ ! -n "$MASTER_PORT" ] || [ ! -n "$RANK" ]; then
+ MASTER_ADDR=$ARG_MASTER_ADDR
+ MASTER_PORT=$ARG_MASTER_PORT
+ RANK=$ARG_RANK
+fi
+
+echo "WORLD_SIZE: $WORLD_SIZE"
+echo "NPROC_PER_NODE: $NPROC_PER_NODE"
+
+# Training Arguments
+GLOBAL_BATCH_SIZE=256
+LOCAL_BATCH_SIZE=8
+GRADIENT_ACCUMULATION_STEPS=$[$GLOBAL_BATCH_SIZE/($WORLD_SIZE*$NPROC_PER_NODE*$LOCAL_BATCH_SIZE)]
+echo $GRADIENT_ACCUMULATION_STEPS
+
+# Log Arguments
+export TRANSFORMERS_OFFLINE=1
+export WANDB_PROJECT=videollama2phi3_siglip
+RUN_NAME=vllava_settings
+DATA_DIR=datasets
+OUTP_DIR=work_dirs
+
+torchrun --nnodes $WORLD_SIZE \
+ --nproc_per_node $NPROC_PER_NODE \
+ --master_addr=$MASTER_ADDR \
+ --master_port=$MASTER_PORT \
+ --node_rank $RANK \
+ videollama2/train_flash_attn.py \
+ --deepspeed scripts/zero3.json \
+ --model_type videollama2_phi3 \
+ --model_path microsoft/Phi-3-mini-4k-instruct \
+ --vision_tower google/siglip-so400m-patch14-384 \
+ --mm_projector_type stc_connector_v35 \
+ --tune_mm_mlp_adapter True \
+ --data_path ${DATA_DIR}/videollava_pt/valley_llavaimage.json \
+ --data_folder ${DATA_DIR}/videollava_pt/ \
+ --mm_vision_select_layer -2 \
+ --num_frames 8 \
+ --bf16 True \
+ --tf32 True \
+ --fp16 False \
+ --output_dir ${OUTP_DIR}/${WANDB_PROJECT}/pretrain_${RUN_NAME} \
+ --num_train_epochs 1 \
+ --per_device_train_batch_size $LOCAL_BATCH_SIZE \
+ --per_device_eval_batch_size 4 \
+ --gradient_accumulation_steps $GRADIENT_ACCUMULATION_STEPS \
+ --evaluation_strategy "no" \
+ --save_strategy "steps" \
+ --save_steps 500 \
+ --save_total_limit 99 \
+ --learning_rate 1e-3 \
+ --weight_decay 0. \
+ --warmup_ratio 0.03 \
+ --lr_scheduler_type "cosine" \
+ --logging_steps 1 \
+ --model_max_length 2048 \
+ --gradient_checkpointing True \
+ --dataloader_num_workers 4 \
+ --lazy_preprocess True \
+ --report_to tensorboard \
+ --run_name pretrain_$RUN_NAME \
diff --git a/VideoLLaMA2/scripts/siglip/pretrain_qwen2.sh b/VideoLLaMA2/scripts/siglip/pretrain_qwen2.sh
new file mode 100644
index 0000000000000000000000000000000000000000..d3fb400e7f3af0364d4040ee7f295bddeee53ed0
--- /dev/null
+++ b/VideoLLaMA2/scripts/siglip/pretrain_qwen2.sh
@@ -0,0 +1,75 @@
+#!/bin/bash
+
+# Environment Variables
+ARG_WORLD_SIZE=${1:-1}
+ARG_NPROC_PER_NODE=${2:-8}
+ARG_MASTER_ADDR="127.0.0.1"
+ARG_MASTER_PORT=16666
+ARG_RANK=0
+
+# Multiple conditions
+if [ ! -n "$WORLD_SIZE" ] || [ ! -n "$NPROC_PER_NODE" ]; then
+ WORLD_SIZE=$ARG_WORLD_SIZE
+ NPROC_PER_NODE=$ARG_NPROC_PER_NODE
+fi
+if [ ! -n "$MASTER_ADDR" ] || [ ! -n "$MASTER_PORT" ] || [ ! -n "$RANK" ]; then
+ MASTER_ADDR=$ARG_MASTER_ADDR
+ MASTER_PORT=$ARG_MASTER_PORT
+ RANK=$ARG_RANK
+fi
+
+echo "WORLD_SIZE: $WORLD_SIZE"
+echo "NPROC_PER_NODE: $NPROC_PER_NODE"
+
+# Training Arguments
+GLOBAL_BATCH_SIZE=256
+LOCAL_BATCH_SIZE=8
+GRADIENT_ACCUMULATION_STEPS=$[$GLOBAL_BATCH_SIZE/($WORLD_SIZE*$NPROC_PER_NODE*$LOCAL_BATCH_SIZE)]
+echo $GRADIENT_ACCUMULATION_STEPS
+
+# Log Arguments
+export TRANSFORMERS_OFFLINE=1
+export WANDB_PROJECT=videollama2qwen2_siglip
+RUN_NAME=vllava_settings
+DATA_DIR=datasets
+OUTP_DIR=work_dirs
+
+torchrun --nnodes $WORLD_SIZE \
+ --nproc_per_node $NPROC_PER_NODE \
+ --master_addr=$MASTER_ADDR \
+ --master_port=$MASTER_PORT \
+ --node_rank $RANK \
+ videollama2/train_flash_attn.py \
+ --deepspeed scripts/zero3.json \
+ --model_type videollama2_qwen2 \
+ --model_path Qwen/Qwen2-7B-Instruct \
+ --vision_tower google/siglip-so400m-patch14-384 \
+ --mm_projector_type stc_connector_v35 \
+ --tune_mm_mlp_adapter True \
+ --data_path ${DATA_DIR}/videollava_pt/valley_llavaimage.json \
+ --data_folder ${DATA_DIR}/videollava_pt/ \
+ --mm_vision_select_layer -2 \
+ --num_frames 8 \
+ --bf16 True \
+ --tf32 True \
+ --fp16 False \
+ --output_dir ${OUTP_DIR}/${WANDB_PROJECT}/pretrain_${RUN_NAME} \
+ --num_train_epochs 1 \
+ --per_device_train_batch_size $LOCAL_BATCH_SIZE \
+ --per_device_eval_batch_size 4 \
+ --gradient_accumulation_steps $GRADIENT_ACCUMULATION_STEPS \
+ --evaluation_strategy "no" \
+ --save_strategy "steps" \
+ --save_steps 500 \
+ --save_total_limit 99 \
+ --learning_rate 1e-3 \
+ --weight_decay 0. \
+ --warmup_ratio 0.03 \
+ --lr_scheduler_type "cosine" \
+ --logging_steps 1 \
+ --model_max_length 2048 \
+ --gradient_checkpointing True \
+ --dataloader_num_workers 4 \
+ --lazy_preprocess True \
+ --report_to tensorboard \
+ --run_name $RUN_NAME \
diff --git a/VideoLLaMA2/scripts/vllava/finetune.sh b/VideoLLaMA2/scripts/vllava/finetune.sh
new file mode 100644
index 0000000000000000000000000000000000000000..d619915c72ced4997dc38f004b1ad9922b85e3eb
--- /dev/null
+++ b/VideoLLaMA2/scripts/vllava/finetune.sh
@@ -0,0 +1,74 @@
+#!/bin/bash
+
+# Environment Variables
+ARG_WORLD_SIZE=${1:-1}
+ARG_NPROC_PER_NODE=${2:-8}
+ARG_MASTER_ADDR="127.0.0.1"
+ARG_MASTER_PORT=16666
+ARG_RANK=0
+
+# Multiple conditions
+if [ ! -n "$WORLD_SIZE" ] || [ ! -n "$NPROC_PER_NODE" ]; then
+ WORLD_SIZE=$ARG_WORLD_SIZE
+ NPROC_PER_NODE=$ARG_NPROC_PER_NODE
+fi
+if [ ! -n "$MASTER_ADDR" ] || [ ! -n "$MASTER_PORT" ] || [ ! -n "$RANK" ]; then
+ MASTER_ADDR=$ARG_MASTER_ADDR
+ MASTER_PORT=$ARG_MASTER_PORT
+ RANK=$ARG_RANK
+fi
+
+echo "WORLD_SIZE: $WORLD_SIZE"
+echo "NPROC_PER_NODE: $NPROC_PER_NODE"
+
+# Training Arguments
+GLOBAL_BATCH_SIZE=128
+LOCAL_BATCH_SIZE=4
+GRADIENT_ACCUMULATION_STEPS=$[$GLOBAL_BATCH_SIZE/($WORLD_SIZE*$NPROC_PER_NODE*$LOCAL_BATCH_SIZE)]
+
+# Log Arguments
+export TRANSFORMERS_OFFLINE=1
+export WANDB_PROJECT=videollama2
+RUN_NAME=vllava_settings
+DATA_DIR=datasets
+OUTP_DIR=work_dirs
+
+torchrun --nnodes $WORLD_SIZE \
+ --nproc_per_node $NPROC_PER_NODE \
+ --master_addr=$MASTER_ADDR \
+ --master_port=$MASTER_PORT \
+ --node_rank $RANK \
+ videollama2/train_flash_attn.py \
+ --deepspeed scripts/zero3.json \
+ --model_type videollama2 \
+ --model_path mistralai/Mistral-7B-Instruct-v0.2 \
+ --vision_tower openai/clip-vit-large-patch14-336 \
+ --mm_projector_type stc_connector \
+ --pretrain_mm_mlp_adapter ${OUTP_DIR}/${WANDB_PROJECT}/pretrain_${RUN_NAME}/mm_projector.bin \
+ --data_path ${DATA_DIR}/videollava_sft/videochatgpt_llavaimage_tune.json \
+ --data_folder ${DATA_DIR}/videollava_sft/ \
+ --mm_vision_select_layer -2 \
+ --image_aspect_ratio pad \
+ --num_frames 8 \
+ --bf16 True \
+ --tf32 True \
+ --fp16 False \
+ --output_dir ${OUTP_DIR}/${WANDB_PROJECT}/finetune_${RUN_NAME} \
+ --num_train_epochs 1 \
+ --per_device_train_batch_size $LOCAL_BATCH_SIZE \
+ --per_device_eval_batch_size 4 \
+ --gradient_accumulation_steps $GRADIENT_ACCUMULATION_STEPS \
+ --evaluation_strategy "no" \
+ --save_strategy "steps" \
+ --save_steps 500 \
+ --save_total_limit 99 \
+ --learning_rate 2e-5 \
+ --weight_decay 0. \
+ --warmup_ratio 0.03 \
+ --lr_scheduler_type "cosine" \
+ --logging_steps 1 \
+ --model_max_length 2048 \
+ --gradient_checkpointing True \
+ --dataloader_num_workers 4 \
+ --report_to tensorboard \
+ --run_name $RUN_NAME \
diff --git a/VideoLLaMA2/scripts/vllava/finetune_qwen2.sh b/VideoLLaMA2/scripts/vllava/finetune_qwen2.sh
new file mode 100644
index 0000000000000000000000000000000000000000..73bd404aaa4f6037b5c115faa7f5b632b111bd2e
--- /dev/null
+++ b/VideoLLaMA2/scripts/vllava/finetune_qwen2.sh
@@ -0,0 +1,74 @@
+#!/bin/bash
+
+# Environment Variables
+ARG_WORLD_SIZE=${1:-1}
+ARG_NPROC_PER_NODE=${2:-8}
+ARG_MASTER_ADDR="127.0.0.1"
+ARG_MASTER_PORT=16666
+ARG_RANK=0
+
+# Multiple conditions
+if [ ! -n "$WORLD_SIZE" ] || [ ! -n "$NPROC_PER_NODE" ]; then
+ WORLD_SIZE=$ARG_WORLD_SIZE
+ NPROC_PER_NODE=$ARG_NPROC_PER_NODE
+fi
+if [ ! -n "$MASTER_ADDR" ] || [ ! -n "$MASTER_PORT" ] || [ ! -n "$RANK" ]; then
+ MASTER_ADDR=$ARG_MASTER_ADDR
+ MASTER_PORT=$ARG_MASTER_PORT
+ RANK=$ARG_RANK
+fi
+
+echo "WORLD_SIZE: $WORLD_SIZE"
+echo "NPROC_PER_NODE: $NPROC_PER_NODE"
+
+# Training Arguments
+GLOBAL_BATCH_SIZE=128
+LOCAL_BATCH_SIZE=4
+GRADIENT_ACCUMULATION_STEPS=$[$GLOBAL_BATCH_SIZE/($WORLD_SIZE*$NPROC_PER_NODE*$LOCAL_BATCH_SIZE)]
+
+# Log Arguments
+export TRANSFORMERS_OFFLINE=1
+export WANDB_PROJECT=videollama2qwen2
+RUN_NAME=vllava_settings
+DATA_DIR=datasets
+OUTP_DIR=work_dirs
+
+torchrun --nnodes $WORLD_SIZE \
+ --nproc_per_node $NPROC_PER_NODE \
+ --master_addr=$MASTER_ADDR \
+ --master_port=$MASTER_PORT \
+ --node_rank $RANK \
+ videollama2/train_flash_attn.py \
+ --deepspeed scripts/zero3.json \
+ --model_type videollama2_qwen2 \
+ --model_path Qwen/Qwen2-7B-Instruct \
+ --vision_tower openai/clip-vit-large-patch14-336 \
+ --mm_projector_type stc_connector \
+ --pretrain_mm_mlp_adapter ${OUTP_DIR}/${WANDB_PROJECT}/pretrain_${RUN_NAME}/mm_projector.bin \
+ --data_path ${DATA_DIR}/videollava_sft/videochatgpt_llavaimage_tune.json \
+ --data_folder ${DATA_DIR}/videollava_sft/ \
+ --mm_vision_select_layer -2 \
+ --image_aspect_ratio pad \
+ --num_frames 8 \
+ --bf16 True \
+ --tf32 True \
+ --fp16 False \
+ --output_dir ${OUTP_DIR}/${WANDB_PROJECT}/finetune_${RUN_NAME} \
+ --num_train_epochs 1 \
+ --per_device_train_batch_size $LOCAL_BATCH_SIZE \
+ --per_device_eval_batch_size 4 \
+ --gradient_accumulation_steps $GRADIENT_ACCUMULATION_STEPS \
+ --evaluation_strategy "no" \
+ --save_strategy "steps" \
+ --save_steps 500 \
+ --save_total_limit 99 \
+ --learning_rate 2e-5 \
+ --weight_decay 0. \
+ --warmup_ratio 0.03 \
+ --lr_scheduler_type "cosine" \
+ --logging_steps 1 \
+ --model_max_length 2048 \
+ --gradient_checkpointing True \
+ --dataloader_num_workers 4 \
+ --report_to tensorboard \
+ --run_name $RUN_NAME \
diff --git a/VideoLLaMA2/scripts/vllava/pretrain.sh b/VideoLLaMA2/scripts/vllava/pretrain.sh
new file mode 100644
index 0000000000000000000000000000000000000000..034703a03815c191c79651e90cb7bdb247339486
--- /dev/null
+++ b/VideoLLaMA2/scripts/vllava/pretrain.sh
@@ -0,0 +1,74 @@
+#!/bin/bash
+
+# Environment Variables
+ARG_WORLD_SIZE=${1:-1}
+ARG_NPROC_PER_NODE=${2:-8}
+ARG_MASTER_ADDR="127.0.0.1"
+ARG_MASTER_PORT=16666
+ARG_RANK=0
+
+# Multiple conditions
+if [ ! -n "$WORLD_SIZE" ] || [ ! -n "$NPROC_PER_NODE" ]; then
+ WORLD_SIZE=$ARG_WORLD_SIZE
+ NPROC_PER_NODE=$ARG_NPROC_PER_NODE
+fi
+if [ ! -n "$MASTER_ADDR" ] || [ ! -n "$MASTER_PORT" ] || [ ! -n "$RANK" ]; then
+ MASTER_ADDR=$ARG_MASTER_ADDR
+ MASTER_PORT=$ARG_MASTER_PORT
+ RANK=$ARG_RANK
+fi
+
+echo "WORLD_SIZE: $WORLD_SIZE"
+echo "NPROC_PER_NODE: $NPROC_PER_NODE"
+
+# Training Arguments
+GLOBAL_BATCH_SIZE=256
+LOCAL_BATCH_SIZE=8
+GRADIENT_ACCUMULATION_STEPS=$[$GLOBAL_BATCH_SIZE/($WORLD_SIZE*$NPROC_PER_NODE*$LOCAL_BATCH_SIZE)]
+
+# Log Arguments
+export TRANSFORMERS_OFFLINE=1
+export WANDB_PROJECT=videollama2
+RUN_NAME=vllava_settings
+DATA_DIR=datasets
+OUTP_DIR=work_dirs
+
+torchrun --nnodes $WORLD_SIZE \
+ --nproc_per_node $NPROC_PER_NODE \
+ --master_addr=$MASTER_ADDR \
+ --master_port=$MASTER_PORT \
+ --node_rank $RANK \
+ videollama2/train_flash_attn.py \
+ --deepspeed scripts/zero3.json \
+ --model_type videollama2 \
+ --model_path mistralai/Mistral-7B-Instruct-v0.2 \
+ --vision_tower openai/clip-vit-large-patch14-336 \
+ --mm_projector_type stc_connector \
+ --tune_mm_mlp_adapter True \
+ --data_path ${DATA_DIR}/videollava_pt/valley_llavaimage.json \
+ --data_folder ${DATA_DIR}/videollava_pt/ \
+ --mm_vision_select_layer -2 \
+ --num_frames 8 \
+ --bf16 True \
+ --tf32 True \
+ --fp16 False \
+ --output_dir ${OUTP_DIR}/${WANDB_PROJECT}/pretrain_${RUN_NAME} \
+ --num_train_epochs 1 \
+ --per_device_train_batch_size $LOCAL_BATCH_SIZE \
+ --per_device_eval_batch_size 4 \
+ --gradient_accumulation_steps $GRADIENT_ACCUMULATION_STEPS \
+ --evaluation_strategy "no" \
+ --save_strategy "steps" \
+ --save_steps 500 \
+ --save_total_limit 99 \
+ --learning_rate 1e-3 \
+ --weight_decay 0. \
+ --warmup_ratio 0.03 \
+ --lr_scheduler_type "cosine" \
+ --logging_steps 1 \
+ --model_max_length 2048 \
+ --gradient_checkpointing True \
+ --dataloader_num_workers 4 \
+ --lazy_preprocess True \
+ --report_to tensorboard \
+ --run_name $RUN_NAME \
diff --git a/VideoLLaMA2/scripts/vllava/pretrain_qwen2.sh b/VideoLLaMA2/scripts/vllava/pretrain_qwen2.sh
new file mode 100644
index 0000000000000000000000000000000000000000..a154f14cba553ae5e512c1d128c79d8878086d72
--- /dev/null
+++ b/VideoLLaMA2/scripts/vllava/pretrain_qwen2.sh
@@ -0,0 +1,74 @@
+#!/bin/bash
+
+# Environment Variables
+ARG_WORLD_SIZE=${1:-1}
+ARG_NPROC_PER_NODE=${2:-8}
+ARG_MASTER_ADDR="127.0.0.1"
+ARG_MASTER_PORT=16666
+ARG_RANK=0
+
+# Multiple conditions
+if [ ! -n "$WORLD_SIZE" ] || [ ! -n "$NPROC_PER_NODE" ]; then
+ WORLD_SIZE=$ARG_WORLD_SIZE
+ NPROC_PER_NODE=$ARG_NPROC_PER_NODE
+fi
+if [ ! -n "$MASTER_ADDR" ] || [ ! -n "$MASTER_PORT" ] || [ ! -n "$RANK" ]; then
+ MASTER_ADDR=$ARG_MASTER_ADDR
+ MASTER_PORT=$ARG_MASTER_PORT
+ RANK=$ARG_RANK
+fi
+
+echo "WORLD_SIZE: $WORLD_SIZE"
+echo "NPROC_PER_NODE: $NPROC_PER_NODE"
+
+# Training Arguments
+GLOBAL_BATCH_SIZE=256
+LOCAL_BATCH_SIZE=8
+GRADIENT_ACCUMULATION_STEPS=$[$GLOBAL_BATCH_SIZE/($WORLD_SIZE*$NPROC_PER_NODE*$LOCAL_BATCH_SIZE)]
+
+# Log Arguments
+export TRANSFORMERS_OFFLINE=1
+export WANDB_PROJECT=videollama2qwen2
+RUN_NAME=vllava_settings
+DATA_DIR=datasets
+OUTP_DIR=work_dirs
+
+torchrun --nnodes $WORLD_SIZE \
+ --nproc_per_node $NPROC_PER_NODE \
+ --master_addr=$MASTER_ADDR \
+ --master_port=$MASTER_PORT \
+ --node_rank $RANK \
+ videollama2/train_flash_attn.py \
+ --deepspeed scripts/zero3.json \
+ --model_type videollama2_qwen2 \
+ --model_path Qwen/Qwen2-7B-Instruct \
+ --vision_tower openai/clip-vit-large-patch14-336 \
+ --mm_projector_type stc_connector \
+ --tune_mm_mlp_adapter True \
+ --data_path ${DATA_DIR}/videollava_pt/valley_llavaimage.json \
+ --data_folder ${DATA_DIR}/videollava_pt/ \
+ --mm_vision_select_layer -2 \
+ --num_frames 8 \
+ --bf16 True \
+ --tf32 True \
+ --fp16 False \
+ --output_dir ${OUTP_DIR}/${WANDB_PROJECT}/pretrain_${RUN_NAME} \
+ --num_train_epochs 1 \
+ --per_device_train_batch_size $LOCAL_BATCH_SIZE \
+ --per_device_eval_batch_size 4 \
+ --gradient_accumulation_steps $GRADIENT_ACCUMULATION_STEPS \
+ --evaluation_strategy "no" \
+ --save_strategy "steps" \
+ --save_steps 500 \
+ --save_total_limit 99 \
+ --learning_rate 1e-3 \
+ --weight_decay 0. \
+ --warmup_ratio 0.03 \
+ --lr_scheduler_type "cosine" \
+ --logging_steps 1 \
+ --model_max_length 2048 \
+ --gradient_checkpointing True \
+ --dataloader_num_workers 4 \
+ --lazy_preprocess True \
+ --report_to tensorboard \
+ --run_name $RUN_NAME \
diff --git a/VideoLLaMA2/videollama2/__init__.py b/VideoLLaMA2/videollama2/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..318d849cf008ae2c6d495830afd04bd6871261d9
--- /dev/null
+++ b/VideoLLaMA2/videollama2/__init__.py
@@ -0,0 +1,109 @@
+import os
+import copy
+import warnings
+import shutil
+from functools import partial
+
+import torch
+
+from .model import load_pretrained_model
+from .mm_utils import process_image, process_video, tokenizer_multimodal_token, get_model_name_from_path, KeywordsStoppingCriteria
+from .constants import NUM_FRAMES, DEFAULT_IMAGE_TOKEN, DEFAULT_VIDEO_TOKEN, MODAL_INDEX_MAP
+
+
+def model_init(model_path=None, **kwargs):
+ model_path = "DAMO-NLP-SG/VideoLLaMA2-7B" if model_path is None else model_path
+ model_name = get_model_name_from_path(model_path)
+ tokenizer, model, processor, context_len = load_pretrained_model(model_path, None, model_name, **kwargs)
+
+ if tokenizer.pad_token is None and tokenizer.unk_token is not None:
+ tokenizer.pad_token = tokenizer.unk_token
+
+ num_frames = model.config.num_frames if hasattr(model.config, "num_frames") else NUM_FRAMES
+
+ processor = {
+ 'image': partial(process_image, processor=processor, aspect_ratio=None),
+ 'video': partial(process_video, processor=processor, aspect_ratio=None, num_frames=num_frames),
+ }
+
+ return model, processor, tokenizer
+
+
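+# Illustrative usage (the video path below is hypothetical):
+#   model, processor, tokenizer = model_init("DAMO-NLP-SG/VideoLLaMA2-7B")
+#   video_tensor = processor['video']("path/to/video.mp4")
+#   response = mm_infer(video_tensor, "Describe the video.", model=model, tokenizer=tokenizer, modal='video')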
+def mm_infer(image_or_video, instruct, model, tokenizer, modal='video', **kwargs):
+ """Inference API of VideoLLaMA2 for image/video understanding.
+
+ Args:
+ image_or_video (torch.Tensor): image tensor (1, C, H, W) / video tensor (T, C, H, W).
+ instruct (str or list): text instruction (or message list) for understanding the visual input.
+ model: VideoLLaMA2 model.
+ tokenizer: tokenizer.
+ modal (str): inference modality, 'image' or 'video'.
+ **kwargs: generation options such as do_sample, temperature, top_p and max_new_tokens.
+ Returns:
+ str: response of the model.
+ """
+
+ # 1. text preprocess (tag process & generate prompt).
+ if modal == 'image':
+ modal_token = DEFAULT_IMAGE_TOKEN
+ elif modal == 'video':
+ modal_token = DEFAULT_VIDEO_TOKEN
+ else:
+ raise ValueError(f"Unsupported modal: {modal}")
+
+ if isinstance(instruct, str):
+ message = [{'role': 'user', 'content': modal_token + '\n' + instruct}]
+ elif isinstance(instruct, list):
+ message = copy.deepcopy(instruct)
+ message[0]['content'] = modal_token + '\n' + message[0]['content']
+ else:
+ raise ValueError(f"Unsupported type of instruct: {type(instruct)}")
+
+ if model.config.model_type in ['videollama2', 'videollama2_mistral', 'videollama2_mixtral']:
+ system_message = [
+ {'role': 'system', 'content': (
+ """<<SYS>>\nYou are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature."""
+ """\n"""
+ """If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information.\n<</SYS>>""")
+ }
+ ]
+ else:
+ system_message = []
+
+ message = system_message + message
+ prompt = tokenizer.apply_chat_template(message, tokenize=False, add_generation_prompt=True)
+
+ input_ids = tokenizer_multimodal_token(prompt, tokenizer, modal_token, return_tensors='pt').unsqueeze(0).long().cuda()
+ attention_masks = input_ids.ne(tokenizer.pad_token_id).long().cuda()
+
+ # 2. vision preprocess (load & transform image or video).
+ tensor = image_or_video.half().cuda()
+
+ tensor = [(tensor, modal_token)]
+
+ # 3. generate response according to visual signals and prompts.
+ keywords = [tokenizer.eos_token]
+ stopping_criteria = KeywordsStoppingCriteria(keywords, tokenizer, input_ids)
+
+ do_sample = kwargs.get('do_sample', False)
+ temperature = kwargs.get('temperature', 0.2 if do_sample else 0.0)
+ top_p = kwargs.get('top_p', 0.9)
+ max_new_tokens = kwargs.get('max_new_tokens', 1024)
+
+ with torch.inference_mode():
+ output_ids = model.generate(
+ input_ids,
+ attention_mask=attention_masks,
+ images=tensor,
+ do_sample=do_sample,
+ temperature=temperature,
+ max_new_tokens=max_new_tokens,
+ top_p=top_p,
+ use_cache=True,
+ stopping_criteria=[stopping_criteria],
+ pad_token_id=tokenizer.eos_token_id,
+ )
+
+ outputs = tokenizer.batch_decode(output_ids, skip_special_tokens=True)[0].strip()
+
+ return outputs
diff --git a/VideoLLaMA2/videollama2/constants.py b/VideoLLaMA2/videollama2/constants.py
new file mode 100644
index 0000000000000000000000000000000000000000..ba87b61becb0819594962652fd3e193a9c8c3a3f
--- /dev/null
+++ b/VideoLLaMA2/videollama2/constants.py
@@ -0,0 +1,32 @@
+CONTROLLER_HEART_BEAT_EXPIRATION = 30
+WORKER_HEART_BEAT_INTERVAL = 15
+
+LOGDIR = "."
+
+# Model Constants
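+# -100 matches the default ignore_index of torch.nn.CrossEntropyLoss, so masked tokens are excluded from the loss.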
+IGNORE_INDEX = -100
+
+# Image arguments
+IMAGE_TOKEN_INDEX = -200
+DEFAULT_IMAGE_TOKEN = "<image>"
+DEFAULT_IMAGE_PATCH_TOKEN = "<im_patch>"
+DEFAULT_IM_START_TOKEN = "<im_start>"
+DEFAULT_IM_END_TOKEN = "<im_end>"
+IMAGE_PLACEHOLDER = "<image-placeholder>"
+
+# Video arguments
+VIDEO_TOKEN_INDEX = -201
+DEFAULT_VIDEO_TOKEN = "<video>"
+NUM_FRAMES = 8
+MAX_FRAMES = 32
+NUM_FRAMES_PER_SECOND = 1
+
+# Audio arguments
+AUDIO_TOKEN_INDEX = -202
+DEFAULT_AUDIO_TOKEN = "<audio>"
+
+MODAL_INDEX_MAP = {
+ "<image>": -200,
+ "<video>": -201,
+ "<audio>": -202,
+}
diff --git a/videollama2/conversation.py b/VideoLLaMA2/videollama2/conversation.py
similarity index 91%
rename from videollama2/conversation.py
rename to VideoLLaMA2/videollama2/conversation.py
index 5186ed9f3615c9abdc28625268478c76bc099ae3..a59b62cd7ba36a54382d8c6db2a701186f9834a4 100644
--- a/videollama2/conversation.py
+++ b/VideoLLaMA2/videollama2/conversation.py
@@ -12,10 +12,9 @@ class SeparatorStyle(Enum):
"""Different separator style."""
SINGLE = auto()
TWO = auto()
- MPT = auto()
PLAIN = auto()
- LLAMA_2 = auto()
-
+ LLAMA2 = auto()
+ QWEN = auto()
@dataclasses.dataclass
class Conversation:
@@ -65,16 +64,7 @@ class Conversation:
ret += role + ": " + message + seps[i % 2]
else:
ret += role + ":"
- elif self.sep_style == SeparatorStyle.MPT:
- ret = self.system + self.sep
- for role, message in messages:
- if message:
- if type(message) is tuple:
- message, _, _ = message
- ret += role + message + self.sep
- else:
- ret += role
- elif self.sep_style == SeparatorStyle.LLAMA_2:
+ elif self.sep_style == SeparatorStyle.LLAMA2:
 wrap_sys = lambda msg: f"<<SYS>>\n{msg}\n<</SYS>>\n\n"
wrap_inst = lambda msg: f"[INST] {msg} [/INST]"
ret = ""
@@ -95,6 +85,23 @@ class Conversation:
else:
ret += ""
ret = ret.lstrip(self.sep)
+ elif self.sep_style == SeparatorStyle.QWEN:
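+ # Renders a ChatML-style prompt, roughly:
+ # <|im_start|>system\nYou are a helpful assistant.<|im_end|>
+ # <|im_start|>user\n{question}<|im_end|>
+ # <|im_start|>assistant\n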
+ ret = ""
+ # 1. Add system prompt
+ ret += self.system + self.sep + "\n"
+ # 2. Iterate message
+ for i, (role, message) in enumerate(messages):
+ if i == 0:
+ assert message, "first message should not be none"
+ assert role == self.roles[0], "first message should come from user"
+ if message:
+ if type(message) is tuple:
+ message, _, _ = message
+ # 2.1 Add role and message
+ ret += role + message + self.sep + "\n"
+ else:
+ # 2.2 Add generation prompt
+ ret += role
elif self.sep_style == SeparatorStyle.PLAIN:
seps = [self.sep, self.sep2]
ret = self.system
@@ -102,9 +109,9 @@ class Conversation:
if message:
if type(message) is tuple:
message, _, _ = message
- ret += message + seps[i % 2]
+ ret += role + message + seps[i % 2]
else:
- ret += ""
+ ret += role
else:
raise ValueError(f"Invalid style: {self.sep_style}")
@@ -113,7 +120,6 @@ class Conversation:
def append_message(self, role, message):
self.messages.append([role, message])
-
def process_image(self, image, image_process_mode, return_pil=False, image_format='PNG', max_len=800, min_len=400):
if image_process_mode == "Pad":
def expand2square(pil_img, background_color=(122, 116, 104)):
@@ -308,17 +314,7 @@ class Conversation:
"sep2": self.sep2,
}
-conv_mistral_instruct = Conversation(
- system="A chat between a curious user and an artificial intelligence assistant. "
- "The assistant gives helpful, detailed, and polite answers to the user's questions.",
- roles=("USER", "ASSISTANT"),
- version="llama_v2",
- messages=(),
- offset=0,
- sep_style=SeparatorStyle.LLAMA_2,
- sep="",
- sep2="</s>",
-)
+
conv_vicuna_v0 = Conversation(
system="A chat between a curious human and an artificial intelligence assistant. "
"The assistant gives helpful, detailed, and polite answers to the human's questions.",
@@ -350,92 +346,43 @@ conv_vicuna_v0 = Conversation(
sep="###",
)
-conv_vicuna_v1 = Conversation(
- system="A chat between a curious user and an artificial intelligence assistant. "
- "The assistant gives helpful, detailed, and polite answers to the user's questions.",
- roles=("USER", "ASSISTANT"),
- version="v1",
- messages=(),
- offset=0,
- sep_style=SeparatorStyle.TWO,
- sep=" ",
- sep2="</s>",
-)
-
-conv_llama_2 = Conversation(
- system="""You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature.
-
-If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information.""",
- roles=("USER", "ASSISTANT"),
- version="llama_v2",
- messages=(),
- offset=0,
- sep_style=SeparatorStyle.LLAMA_2,
- sep="<s>",
- sep2="</s>",
-)
-
-conv_llava_llama_2 = Conversation(
- system="You are a helpful language and vision assistant. "
- "You are able to understand the visual content that the user provides, "
- "and assist the user with a variety of tasks using natural language.",
- roles=("USER", "ASSISTANT"),
- version="llama_v2",
- messages=(),
- offset=0,
- sep_style=SeparatorStyle.LLAMA_2,
- sep="<s>",
- sep2="</s>",
-)
-
-conv_mpt = Conversation(
- system="""<|im_start|>system
-A conversation between a user and an LLM-based AI assistant. The assistant gives helpful and honest answers.""",
- roles=("<|im_start|>user\n", "<|im_start|>assistant\n"),
- version="mpt",
- messages=(),
- offset=0,
- sep_style=SeparatorStyle.MPT,
- sep="<|im_end|>",
-)
-
conv_llava_plain = Conversation(
system="",
roles=("", ""),
- messages=(
- ),
+ messages=(),
offset=0,
sep_style=SeparatorStyle.PLAIN,
- sep="\n",
+ sep="",
+ sep2="\n"
)
-conv_llava_v0 = Conversation(
- system="A chat between a curious human and an artificial intelligence assistant. "
- "The assistant gives helpful, detailed, and polite answers to the human's questions.",
+conv_llava_v0_mmtag = Conversation(
+ system="A chat between a curious user and an artificial intelligence assistant. "
+ "The assistant is able to understand the visual content that the user provides, and assist the user with a variety of tasks using natural language."
+ "The visual content will be provided with the following format: <Image>visual content</Image>.",
roles=("Human", "Assistant"),
messages=(
),
offset=0,
sep_style=SeparatorStyle.SINGLE,
sep="###",
+ version="v0_mmtag",
)
-conv_llava_v0_mmtag = Conversation(
- system="A chat between a curious user and an artificial intelligence assistant. "
- "The assistant is able to understand the visual content that the user provides, and assist the user with a variety of tasks using natural language."
- "The visual content will be provided with the following format: <Image>visual content</Image>.",
+conv_llava_v0 = Conversation(
+ system="A chat between a curious human and an artificial intelligence assistant. "
+ "The assistant gives helpful, detailed, and polite answers to the human's questions.",
roles=("Human", "Assistant"),
messages=(
),
offset=0,
sep_style=SeparatorStyle.SINGLE,
sep="###",
- version="v0_mmtag",
)
-conv_llava_v1 = Conversation(
- system="A chat between a curious human and an artificial intelligence assistant. "
- "The assistant gives helpful, detailed, and polite answers to the human's questions.",
+conv_vicuna_v1 = Conversation(
+ system="A chat between a curious user and an artificial intelligence assistant. "
+ "The assistant gives helpful, detailed, and polite answers to the user's questions.",
roles=("USER", "ASSISTANT"),
version="v1",
messages=(),
@@ -458,25 +405,101 @@ conv_llava_v1_mmtag = Conversation(
version="v1_mmtag",
)
-default_conversation = conv_vicuna_v1
+conv_llava_v1 = Conversation(
+ system="A chat between a curious human and an artificial intelligence assistant. "
+ "The assistant gives helpful, detailed, and polite answers to the human's questions.",
+ roles=("USER", "ASSISTANT"),
+ version="v1",
+ messages=(),
+ offset=0,
+ sep_style=SeparatorStyle.TWO,
+ sep=" ",
+ sep2="</s>",
+)
+
+conv_llava_llama2 = Conversation(
+ system="You are a helpful language and vision assistant. "
+ "You are able to understand the visual content that the user provides, "
+ "and assist the user with a variety of tasks using natural language.",
+ roles=("USER", "ASSISTANT"),
+ version="llama2",
+ messages=(),
+ offset=0,
+ sep_style=SeparatorStyle.LLAMA2,
+ sep="<s>",
+ sep2="</s>",
+)
+
+conv_llama2 = Conversation(
+ system="""You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature.
+
+If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information.""",
+ roles=("USER", "ASSISTANT"),
+ version="llama2",
+ messages=(),
+ offset=0,
+ sep_style=SeparatorStyle.LLAMA2,
+ sep="",
+ sep2=" ",
+)
+
+conv_mistral = Conversation(
+ system="A chat between a curious user and an artificial intelligence assistant. "
+ "The assistant gives helpful, detailed, and polite answers to the user's questions.",
+ roles=("USER", "ASSISTANT"),
+ version="llama2",
+ messages=(),
+ offset=0,
+ sep_style=SeparatorStyle.LLAMA2,
+ sep="",
+ sep2="",
+)
+
+conv_qwen = Conversation(
+ system="<|im_start|>system\nYou are a helpful assistant.",
+ roles=("<|im_start|>user\n", "<|im_start|>assistant\n"),
+ messages=(),
+ offset=0,
+ sep_style=SeparatorStyle.QWEN,
+ sep="<|im_end|>",
+ version="qwen",
+)
+
+conv_qwen_plain = Conversation(
+ system="",
+ roles=("<|im_start|>user\n", "<|im_start|>assistant\n"),
+ messages=(),
+ offset=0,
+ sep_style=SeparatorStyle.PLAIN,
+ sep="<|im_end|>",
+ sep2="<|im_end|>",
+ version="qwen_plain",
+)
+
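+# `conv_templates` below maps conversation-mode names to templates; `default_conversation`
+# (the Mistral-style template) is what callers typically fall back to when no mode is given.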
+default_conversation = conv_mistral
conv_templates = {
"default": conv_vicuna_v0,
- "v0": conv_vicuna_v0,
- "v1": conv_vicuna_v1,
- "vicuna_v1": conv_vicuna_v1,
- "llama_2": conv_llama_2,
-
+ # pretrain template
"plain": conv_llava_plain,
+ # llava v0
+ "v0": conv_vicuna_v0,
"v0_plain": conv_llava_plain,
- "llava_v0": conv_llava_v0,
"v0_mmtag": conv_llava_v0_mmtag,
- "llava_v1": conv_llava_v1,
+ "llava_v0": conv_llava_v0,
+ # llava v1
+ "v1": conv_vicuna_v1,
"v1_mmtag": conv_llava_v1_mmtag,
- "llava_llama_2": conv_llava_llama_2,
-
- "video_llama_beta": conv_llava_llama_2,
- "mistral_instruct": conv_mistral_instruct,
- "mpt": conv_mpt,
+ "llava_v1": conv_llava_v1,
+ "vicuna_v1": conv_vicuna_v1,
+ # llava v1.5
+ "llava_llama2": conv_llava_llama2,
+ # llama2
+ "llama2": conv_llama2,
+ # mistral
+ "mistral": conv_mistral,
+ # qwen
+ "qwen": conv_qwen,
+ "qwen_plain": conv_qwen_plain,
}
diff --git a/VideoLLaMA2/videollama2/eval/eval_video_cap_msvc_correctness.py b/VideoLLaMA2/videollama2/eval/eval_video_cap_msvc_correctness.py
new file mode 100644
index 0000000000000000000000000000000000000000..2cfcc23b9b54d0fd2f8e1764a0212b055e8c6085
--- /dev/null
+++ b/VideoLLaMA2/videollama2/eval/eval_video_cap_msvc_correctness.py
@@ -0,0 +1,259 @@
+import re
+import os
+import ast
+import time
+import json
+import argparse
+from tqdm import tqdm
+from multiprocessing.pool import Pool
+
+import openai
+from openai import AzureOpenAI
+
+
+def init():
+ client = AzureOpenAI(
+ azure_endpoint = os.getenv("AZURE_OPENAI_ENDPOINT"),
+ api_key=os.getenv("AZURE_OPENAI_KEY"),
+ api_version="2024-02-15-preview"
+ )
+
+ return client
+
+
+def interaction(client, message_text):
+ completion = client.chat.completions.create(
+ model=os.getenv("AZURE_OPENAI_DEPLOYNAME"),
+ messages = message_text,
+ temperature=0.7,
+ max_tokens=800,
+ top_p=0.95,
+ frequency_penalty=0,
+ presence_penalty=0,
+ stop=None
+ )
+
+ return completion
+
+
+def annotate(prediction_set, caption_files, output_dir):
+ """
+    Evaluates question and answer pairs using the Azure OpenAI chat deployment.
+ Returns a score for correctness.
+ """
+
+ for file in tqdm(caption_files):
+ key = file[:-5] # Strip file extension
+ qa_set = prediction_set[key]
+ question = qa_set['q']
+ answer = str(qa_set['a'])
+ pred = qa_set['pred']
+ try:
+ message = [
+ {
+ "role": "system",
+ "content":
+ "You are an intelligent chatbot designed for evaluating the factual accuracy of generative outputs for video-based question-answer pairs. "
+ "Your task is to compare the predicted answer with these correct answers and determine if they are factually consistent. Here's how you can accomplish the task:"
+ "------"
+ "##INSTRUCTIONS: "
+ "- Focus on the factual consistency between the predicted answer and the correct answer. The predicted answer should not contain any misinterpretations or misinformation.\n"
+ "- The predicted answer must be factually accurate and align with the video content.\n"
+ "- Consider synonyms or paraphrases as valid matches.\n"
+ "- Evaluate the factual accuracy of the prediction compared to the answer."
+ },
+ {
+ "role": "user",
+ "content":
+ "Please evaluate the following video-based question-answer pair:\n\n"
+ f"Question: {question}\n"
+ f"Correct Answers: {answer}\n"
+ f"Predicted Answer: {pred}\n\n"
+ "Provide your evaluation only as a factual accuracy score where the factual accuracy score is an integer value between 0 and 5, with 5 indicating the highest level of factual consistency. "
+ "Please generate the response in the form of a Python dictionary string with keys 'score', where its value is the factual accuracy score in INTEGER, not STRING."
+ "DO NOT PROVIDE ANY OTHER OUTPUT TEXT OR EXPLANATION. Only provide the Python dictionary string. "
+                        "For example, your response should look like this: {'score': 4}."
+ }
+ ]
+ completion = interaction(client, message)
+ # Convert response to a Python dictionary.
+ response_message = completion.choices[0].message.content
+ response_dict = ast.literal_eval(response_message)
+ result_qa_pair = [response_dict, qa_set]
+ # # Save the question-answer pairs to a json file.
+ with open(f"{output_dir}/{key}.json", "w") as f:
+ json.dump(result_qa_pair, f)
+
+ except Exception as e:
+ print(f"Error processing file '{key}': {e}")
+
+ time.sleep(1)
+
+
+def longest_repeating_substring(s):
+ n = len(s)
+ dp = [[0] * (n+1) for _ in range(n+1)]
+ res = ""
+ res_length = 0
+
+ index = 0
+ for i in range(1, n+1):
+ for j in range(i+1, n+1):
+ if (dp[i-1][j-1] > 0 and dp[i-1][j-1] < (j-i)) or s[i-1] == s[j-1]:
+ dp[i][j] = dp[i-1][j-1] + 1
+ if dp[i][j] > res_length:
+ res_length = dp[i][j]
+ index = max(i, index)
+ else:
+ dp[i][j] = 0
+
+ if res_length > 0:
+ for i in range(index-res_length+1, index+1):
+ res = res + s[i-1]
+
+ return res
+
+
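+# Overall flow of main(): load the (optionally chunked) prediction file, make each video
+# name unique, query the Azure OpenAI deployment once per QA pair (already-written
+# per-sample json files in --output-dir are skipped, so reruns resume), then merge the
+# per-sample scores and report the average factual-accuracy score.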
+def main(args):
+ if args.num_chunks > 1:
+ pred_contents = []
+ for _idx in range(args.num_chunks):
+ file = os.path.join(args.pred_path, f"{args.num_chunks}_{_idx}.json")
+ pred_contents += [json.loads(line) for line in open(file)]
+ else:
+ pred_contents = [json.loads(line) for line in open(args.pred_path)]
+
+ # Dictionary to store the count of occurrences for each video_id
+ video_id_counts = {}
+ new_pred_contents = []
+
+ # Iterate through each sample in pred_contents
+ for sample in pred_contents:
+ video_id = sample["video_name"]
+ if video_id in video_id_counts:
+ video_id_counts[video_id] += 1
+ else:
+ video_id_counts[video_id] = 0
+
+ # Create a new sample with the modified key
+ new_sample = sample
+ new_sample["video_name"] = f"{video_id.split('/')[-1].split('.')[0]}_{video_id_counts[video_id]}"
+ new_pred_contents.append(new_sample)
+
+ # Generating list of id's and corresponding files
+ id_list = [x["video_name"] for x in new_pred_contents]
+ caption_files = [f"{id}.json" for id in id_list]
+
+ output_dir = args.output_dir
+ # Generate output directory if not exists.
+ if not os.path.exists(output_dir):
+ os.makedirs(output_dir)
+
+ # Preparing dictionary of question-answer sets
+ prediction_set = {}
+ for sample in new_pred_contents:
+ id = sample["video_name"]
+ # print(sample)
+ question = sample["question"]
+ answer = sample["answer"]
+ pred = sample["pred"]
+ qa_set = {"q": question, "a": answer, "pred": pred}
+ prediction_set[id] = qa_set
+
+ # # Set the OpenAI API key.
+ # openai.api_key = args.api_key # Your API key here
+ # if args.api_base:
+ # openai.api_base = args.api_base # Your API base here
+ num_tasks = args.num_tasks
+
+ # While loop to ensure that all captions are processed.
+ while True:
+ try:
+            # Files that have already been processed.
+ completed_files = os.listdir(output_dir)
+ print(f"completed_files: {len(completed_files)}")
+
+ # Files that have not been processed yet.
+ incomplete_files = [f for f in caption_files if f not in completed_files]
+ print(f"incomplete_files: {len(incomplete_files)}")
+
+ # Break the loop when there are no incomplete files
+ if len(incomplete_files) == 0:
+ break
+ if len(incomplete_files) <= num_tasks:
+ num_tasks = 1
+
+ # Split tasks into parts.
+ part_len = len(incomplete_files) // num_tasks
+ all_parts = [incomplete_files[i : i + part_len] for i in range(0, len(incomplete_files), part_len)]
+ task_args = [(prediction_set, part, args.output_dir) for part in all_parts]
+            print("Generating", len(all_parts), "subprocesses.")
+
+ # Use a pool of workers to process the files in parallel.
+ # with Pool() as pool:
+ # pool.starmap(annotate, task_args)
+ # import pdb;pdb.set_trace()
+ annotate(*task_args[0])
+
+ except Exception as e:
+ print(f"Error: {e}")
+
+ # Combine all the processed files into one
+ combined_contents = {}
+ json_path = args.output_json
+
+ # Iterate through json files
+ for file_name in os.listdir(output_dir):
+ if file_name.endswith(".json"):
+ file_path = os.path.join(output_dir, file_name)
+ with open(file_path, "r") as json_file:
+ try:
+ content = json.load(json_file)
+ combined_contents[file_name[:-5]] = content
+ except Exception as e:
+ print(f"Error: {e}")
+ pass
+
+ # Calculate average score
+ score_sum = 0
+ count = 0
+ for key, result in combined_contents.items():
+ count += 1
+ try:
+ # key = result[0].keys()[0]
+ # import pdb; pdb.set_trace()
+ for _ in result[0].keys():
+ score_match = result[0][_]
+ score = int(score_match)
+ score_sum += score
+ break
+ except Exception as e:
+ print(f"Error processing file '{key}': {e}")
+            # import pdb; pdb.set_trace()
+ average_score = score_sum / count
+ combined_contents["average_score"] = average_score
+ with open(json_path, "w") as json_file:
+ json.dump(combined_contents, json_file, indent=4)
+ print("Average score for correctness:", average_score)
+
+
+if __name__ == "__main__":
+ parser = argparse.ArgumentParser(description="question-answer-generation-using-gpt-3")
+ parser.add_argument("--pred-path", required=True, help="The path to file containing prediction.")
+ parser.add_argument("--output-dir", required=True, help="The path to save annotation json files.")
+ parser.add_argument("--output-json", required=True, help="The path to save annotation final combined json file.")
+ parser.add_argument("--num-tasks", required=True, type=int, help="Number of splits.")
+ parser.add_argument("--num_chunks", default=1, type=int, help="Result splits")
+ parser.add_argument("--api-key", required=True, type=str, help="Azure Openai API key.")
+ parser.add_argument("--api-endpoint", required=True, type=str, help="Azure Openai API endpoint.")
+ parser.add_argument("--api-deployname", required=True, type=str, help="Azure Openai API deployname.")
+ args = parser.parse_args()
+
+ # Set the OpenAI API key.
+ os.environ["AZURE_OPENAI_KEY"] = args.api_key
+ os.environ["AZURE_OPENAI_ENDPOINT"] = args.api_endpoint
+ os.environ["AZURE_OPENAI_DEPLOYNAME"] = args.api_deployname
+
+ client = init()
+
+ main(args)
diff --git a/VideoLLaMA2/videollama2/eval/eval_video_cap_msvc_detailedness.py b/VideoLLaMA2/videollama2/eval/eval_video_cap_msvc_detailedness.py
new file mode 100644
index 0000000000000000000000000000000000000000..dae772ab8418804a587054a4e4d7b139a9d8e67d
--- /dev/null
+++ b/VideoLLaMA2/videollama2/eval/eval_video_cap_msvc_detailedness.py
@@ -0,0 +1,257 @@
+import re
+import os
+import ast
+import time
+import json
+import argparse
+from tqdm import tqdm
+from multiprocessing.pool import Pool
+
+import openai
+from openai import AzureOpenAI
+
+
+def init():
+ client = AzureOpenAI(
+ azure_endpoint = os.getenv("AZURE_OPENAI_ENDPOINT"),
+ api_key=os.getenv("AZURE_OPENAI_KEY"),
+ api_version="2024-02-15-preview"
+ )
+
+ return client
+
+
+def interaction(client, message_text):
+ completion = client.chat.completions.create(
+ model=os.getenv("AZURE_OPENAI_DEPLOYNAME"),
+ messages = message_text,
+ temperature=0.7,
+ max_tokens=800,
+ top_p=0.95,
+ frequency_penalty=0,
+ presence_penalty=0,
+ stop=None
+ )
+
+ return completion
+
+
+def annotate(prediction_set, caption_files, output_dir):
+ """
+    Evaluates question and answer pairs using the Azure OpenAI chat deployment.
+    Returns a score for detail orientation.
+ """
+
+ for file in tqdm(caption_files):
+ key = file[:-5] # Strip file extension
+ qa_set = prediction_set[key]
+ question = qa_set['q']
+ answer = str(qa_set['a'])
+ pred = qa_set['pred']
+ try:
+ message = [
+ {
+ "role": "system",
+ "content": "You are an intelligent chatbot designed for evaluating the detail orientation of generative outputs for video-based question-answer pairs. "
+ "Your task is to compare the predicted answer with these correct answers and determine its level of detail, considering both completeness and specificity. Here's how you can accomplish the task:"
+ "------"
+ "##INSTRUCTIONS: "
+ "- Check if the predicted answer covers all major points from the video. The response should not leave out any key aspects.\n"
+ "- Evaluate whether the predicted answer includes specific details rather than just generic points. It should provide comprehensive information that is tied to specific elements of the video.\n"
+ "- Consider synonyms or paraphrases as valid matches.\n"
+ "- Provide a single evaluation score that reflects the level of detail orientation of the prediction, considering both completeness and specificity.",
+ },
+ {
+ "role": "user",
+ "content": "Please evaluate the following video-based question-answer pair:\n\n"
+ f"Question: {question}\n"
+ f"Correct Answers: {answer}\n"
+ f"Predicted Answer: {pred}\n\n"
+ "Provide your evaluation only as a detail orientation score where the detail orientation score is an integer value between 0 and 5, with 5 indicating the highest level of detail orientation. "
+ "Please generate the response in the form of a Python dictionary string with keys 'score', where its value is the detail orientation score in INTEGER, not STRING."
+ "DO NOT PROVIDE ANY OTHER OUTPUT TEXT OR EXPLANATION. Only provide the Python dictionary string. "
+                    "For example, your response should look like this: {'score': 4}.",
+ },
+ ]
+ completion = interaction(client, message)
+ # Convert response to a Python dictionary.
+ response_message = completion.choices[0].message.content
+ response_dict = ast.literal_eval(response_message)
+ result_qa_pair = [response_dict, qa_set]
+ # # Save the question-answer pairs to a json file.
+ with open(f"{output_dir}/{key}.json", "w") as f:
+ json.dump(result_qa_pair, f)
+
+ except Exception as e:
+ print(f"Error processing file '{key}': {e}")
+
+ time.sleep(1)
+
+
+def longest_repeating_substring(s):
+ n = len(s)
+ dp = [[0] * (n+1) for _ in range(n+1)]
+ res = ""
+ res_length = 0
+
+ index = 0
+ for i in range(1, n+1):
+ for j in range(i+1, n+1):
+ if (dp[i-1][j-1] > 0 and dp[i-1][j-1] < (j-i)) or s[i-1] == s[j-1]:
+ dp[i][j] = dp[i-1][j-1] + 1
+ if dp[i][j] > res_length:
+ res_length = dp[i][j]
+ index = max(i, index)
+ else:
+ dp[i][j] = 0
+
+ if res_length > 0:
+ for i in range(index-res_length+1, index+1):
+ res = res + s[i-1]
+
+ return res
+
+
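+# Overall flow of main(): load the (optionally chunked) prediction file, make each video
+# name unique, query the Azure OpenAI deployment once per QA pair (already-written
+# per-sample json files in --output-dir are skipped, so reruns resume), then merge the
+# per-sample scores and report the average detail-orientation score.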
+def main(args):
+ if args.num_chunks > 1:
+ pred_contents = []
+ for _idx in range(args.num_chunks):
+ file = os.path.join(args.pred_path, f"{args.num_chunks}_{_idx}.json")
+ pred_contents += [json.loads(line) for line in open(file)]
+ else:
+ pred_contents = [json.loads(line) for line in open(args.pred_path)]
+
+ # Dictionary to store the count of occurrences for each video_id
+ video_id_counts = {}
+ new_pred_contents = []
+
+ # Iterate through each sample in pred_contents
+ for sample in pred_contents:
+ video_id = sample["video_name"]
+ if video_id in video_id_counts:
+ video_id_counts[video_id] += 1
+ else:
+ video_id_counts[video_id] = 0
+
+ # Create a new sample with the modified key
+ new_sample = sample
+ new_sample["video_name"] = f"{video_id.split('/')[-1].split('.')[0]}_{video_id_counts[video_id]}"
+ new_pred_contents.append(new_sample)
+
+ # Generating list of id's and corresponding files
+ id_list = [x["video_name"] for x in new_pred_contents]
+ caption_files = [f"{id}.json" for id in id_list]
+
+ output_dir = args.output_dir
+ # Generate output directory if not exists.
+ if not os.path.exists(output_dir):
+ os.makedirs(output_dir)
+
+ # Preparing dictionary of question-answer sets
+ prediction_set = {}
+ for sample in new_pred_contents:
+ id = sample["video_name"]
+ # print(sample)
+ question = sample["question"]
+ answer = sample["answer"]
+ pred = sample["pred"]
+ qa_set = {"q": question, "a": answer, "pred": pred}
+ prediction_set[id] = qa_set
+
+ # # Set the OpenAI API key.
+ # openai.api_key = args.api_key # Your API key here
+ # if args.api_base:
+ # openai.api_base = args.api_base # Your API base here
+ num_tasks = args.num_tasks
+
+ # While loop to ensure that all captions are processed.
+ while True:
+ try:
+            # Files that have already been processed.
+ completed_files = os.listdir(output_dir)
+ print(f"completed_files: {len(completed_files)}")
+
+ # Files that have not been processed yet.
+ incomplete_files = [f for f in caption_files if f not in completed_files]
+ print(f"incomplete_files: {len(incomplete_files)}")
+
+ # Break the loop when there are no incomplete files
+ if len(incomplete_files) == 0:
+ break
+ if len(incomplete_files) <= num_tasks:
+ num_tasks = 1
+
+ # Split tasks into parts.
+ part_len = len(incomplete_files) // num_tasks
+ all_parts = [incomplete_files[i : i + part_len] for i in range(0, len(incomplete_files), part_len)]
+ task_args = [(prediction_set, part, args.output_dir) for part in all_parts]
+            print("Generating", len(all_parts), "subprocesses.")
+
+ # Use a pool of workers to process the files in parallel.
+ # with Pool() as pool:
+ # pool.starmap(annotate, task_args)
+ # import pdb;pdb.set_trace()
+ annotate(*task_args[0])
+
+ except Exception as e:
+ print(f"Error: {e}")
+
+ # Combine all the processed files into one
+ combined_contents = {}
+ json_path = args.output_json
+
+ # Iterate through json files
+ for file_name in os.listdir(output_dir):
+ if file_name.endswith(".json"):
+ file_path = os.path.join(output_dir, file_name)
+ with open(file_path, "r") as json_file:
+ try:
+ content = json.load(json_file)
+ combined_contents[file_name[:-5]] = content
+ except Exception as e:
+ print(f"Error: {e}")
+ pass
+
+ # Calculate average score
+ score_sum = 0
+ count = 0
+ for key, result in combined_contents.items():
+ count += 1
+ try:
+ # key = result[0].keys()[0]
+ # import pdb; pdb.set_trace()
+ for _ in result[0].keys():
+ score_match = result[0][_]
+ score = int(score_match)
+ score_sum += score
+ break
+ except Exception as e:
+ print(f"Error processing file '{key}': {e}")
+            # import pdb; pdb.set_trace()
+ average_score = score_sum / count
+ combined_contents["average_score"] = average_score
+ with open(json_path, "w") as json_file:
+ json.dump(combined_contents, json_file, indent=4)
+ print("Average score for detailedness:", average_score)
+
+
+if __name__ == "__main__":
+ parser = argparse.ArgumentParser(description="question-answer-generation-using-gpt-3")
+ parser.add_argument("--pred-path", required=True, help="The path to file containing prediction.")
+ parser.add_argument("--output-dir", required=True, help="The path to save annotation json files.")
+ parser.add_argument("--output-json", required=True, help="The path to save annotation final combined json file.")
+ parser.add_argument("--num-tasks", required=True, type=int, help="Number of splits.")
+ parser.add_argument("--num_chunks", default=1, type=int, help="Result splits")
+ parser.add_argument("--api-key", required=True, type=str, help="Azure Openai API key.")
+ parser.add_argument("--api-endpoint", required=True, type=str, help="Azure Openai API endpoint.")
+ parser.add_argument("--api-deployname", required=True, type=str, help="Azure Openai API deployname.")
+ args = parser.parse_args()
+
+ # Set the OpenAI API key.
+ os.environ["AZURE_OPENAI_KEY"] = args.api_key
+ os.environ["AZURE_OPENAI_ENDPOINT"] = args.api_endpoint
+ os.environ["AZURE_OPENAI_DEPLOYNAME"] = args.api_deployname
+
+ client = init()
+
+ main(args)
diff --git a/videollama2/eval/eval_video_qa_mvbench.py b/VideoLLaMA2/videollama2/eval/eval_video_mcqa_mvbench.py
similarity index 100%
rename from videollama2/eval/eval_video_qa_mvbench.py
rename to VideoLLaMA2/videollama2/eval/eval_video_mcqa_mvbench.py
diff --git a/VideoLLaMA2/videollama2/eval/eval_video_mcqa_videomme.py b/VideoLLaMA2/videollama2/eval/eval_video_mcqa_videomme.py
new file mode 100644
index 0000000000000000000000000000000000000000..afca31c461132fbfce7a6d2bc7803f18db7c6f01
--- /dev/null
+++ b/VideoLLaMA2/videollama2/eval/eval_video_mcqa_videomme.py
@@ -0,0 +1,277 @@
+import os
+import re
+import json
+import argparse
+from typing import List, Dict, Optional, Union
+
+CATEGORIES = [
+ "Knowledge",
+ "Film & Television",
+ "Sports Competition",
+ "Artistic Performance",
+ "Life Record",
+ "Multilingual"
+]
+
+SUB_CATEGORIES = [
+ "Humanity & History",
+ "Literature & Art",
+ "Biology & Medicine",
+ "Finance & Commerce",
+ "Astronomy",
+ "Geography",
+ "Law",
+ "Life Tip",
+ "Technology",
+ "Animation",
+ "Movie & TV Show",
+ "Documentary",
+ "News Report",
+ "Esports",
+ "Basketball",
+ "Football",
+ "Athletics",
+ "Other Sports",
+ "Stage Play",
+ "Magic Show",
+ "Variety Show",
+ "Acrobatics",
+ "Handicraft",
+ "Food",
+ "Fashion",
+ "Daily Life",
+ "Travel",
+ "Pet & Animal",
+ "Exercise",
+ "Multilingual"
+]
+
+TASK_CATEGORIES = [
+ "Temporal Perception",
+ "Spatial Perception",
+ "Attribute Perception",
+ "Action Recognition",
+ "Object Recognition",
+ "OCR Problems",
+ "Counting Problem",
+ "Temporal Reasoning",
+ "Spatial Reasoning",
+ "Action Reasoning",
+ "Object Reasoning",
+ "Information Synopsis",
+]
+
+
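+# Pulls the first A-D option letter out of a free-form model response after stripping
+# common answer prefixes, e.g. extract_characters_regex("The best answer is (B).") -> "B";
+# returns "" when no option letter can be recovered.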
+def extract_characters_regex(s):
+ s = s.strip()
+ answer_prefixes = [
+ "The best answer is",
+ "The correct answer is",
+ "The answer is",
+ "The answer",
+        "The best option is",
+        "The correct option is",
+        "Best answer:",
+        "Best option:",
+ ]
+ for answer_prefix in answer_prefixes:
+ s = s.replace(answer_prefix, "")
+
+ if len(s.split()) > 10 and not re.search("[ABCD]", s):
+ return ""
+ matches = re.search(r'[ABCD]', s)
+ if matches is None:
+ return ""
+ return matches[0]
+
+
+def eval_your_results(
+ your_results_path: str,
+ video_types: Optional[Union[List[str], str]] = None,
+ skip_missing: Optional[bool] = True,
+ return_categories_accuracy: Optional[bool] = True,
+ return_sub_categories_accuracy: Optional[bool] = False,
+ return_task_types_accuracy: Optional[bool] = False,
+ gt_answer_key: Optional[str] = "answer",
+ your_answer_key: Optional[str] = "response"
+
+ ):
+ """
+ Evaluate your results against the ground truth
+
+ Args:
+ - your_results_path (str): Path to your results file
+ - video_types (Optional[List[str], str]): List of video types to evaluate.
+ - skip_missing (Optional[bool]): If True, missing files will be skipped. If False, an error will be raised if there are missing files.
+ - return_categories_accuracy (Optional[bool]): If True, the accuracy for each video category will be returned.
+ - return_sub_categories_accuracy (Optional[bool]): If True, the accuracy for each video sub category will be returned.
+ - return_task_types_accuracy (Optional[bool]): If True, the accuracy for each task category will be returned.
+ - gt_answer_key (Optional[str]): Key to access the ground truth answer in the results file.
+ - your_answer_key (Optional[str]): Key to access your answer in the results file.
+ """
+
+ # Load your results
+ with open(your_results_path, 'r') as f:
+ your_results = json.load(f)
+
+ if isinstance(video_types, str):
+ video_types = video_types.split(",")
+
+ q_type_dict = {}
+ v_type_dict = {}
+ v_sub_type_dict = {}
+
+
+ for video_type in video_types:
+
+ # Filter your results based on video types
+ your_results_video_type = [item for item in your_results if item["duration"] == video_type]
+
+ # Task Categories
+ q_type_dict[video_type] = {}
+ for q_type in TASK_CATEGORIES:
+ q_type_dict[video_type][q_type] = {"correct": 0, "answered": 0}
+
+ # Video categories
+ v_type_dict[video_type] = {}
+ for v_type in CATEGORIES:
+ v_type_dict[video_type][v_type] = {"correct": 0, "answered": 0}
+
+ v_sub_type_dict[video_type] = {}
+ for v_sub_type in SUB_CATEGORIES:
+ v_sub_type_dict[video_type][v_sub_type] = {"correct": 0, "answered": 0}
+
+ if not skip_missing:
+ # Check if the number of files in your results and ground truth are the same
+ assert len(your_results_video_type) == 300, f"Number of files in {video_type} is not 300. Check if there are missing files."
+
+ for item in your_results_video_type:
+
+ if skip_missing and item["missing"]:
+ continue
+
+ # Get the video category, sub category and question category
+ video_category = item["domain"]
+ video_sub_category = item["sub_category"]
+
+ questions = item["questions"]
+
+ for question in questions:
+ q_type = question["task_type"]
+
+ # Get the ground truth and your response
+ gt_answer = question[gt_answer_key]
+ response = question[your_answer_key]
+
+ # Extract the answer from the response
+                extraction = extract_characters_regex(response)
+
+                if extraction != "":
+                    q_type_dict[video_type][q_type]["answered"] += 1
+                    q_type_dict[video_type][q_type]["correct"] += extraction == gt_answer
+
+                    v_type_dict[video_type][video_category]["answered"] += 1
+                    v_type_dict[video_type][video_category]["correct"] += extraction == gt_answer
+
+                    v_sub_type_dict[video_type][video_sub_category]["answered"] += 1
+                    v_sub_type_dict[video_type][video_sub_category]["correct"] += extraction == gt_answer
+
+
+ # Print the results for each video type
+ for video_type in video_types:
+
+ print("=====================================")
+ print(f"Evaluation on video Type: {video_type}")
+ print("=====================================")
+ if return_categories_accuracy:
+ print("-------------------------------------")
+ print("Video Domains")
+ print("-------------------------------------")
+ for v_type in v_type_dict[video_type]:
+ print(f"{v_type}: {100 * v_type_dict[video_type][v_type]['correct'] / v_type_dict[video_type][v_type]['answered'] if v_type_dict[video_type][v_type]['answered'] > 0 else 0 : .1f}%")
+ if return_sub_categories_accuracy:
+ print("-------------------------------------")
+ print("Video Sub Categories")
+ print("-------------------------------------")
+ for v_sub_type in v_sub_type_dict[video_type]:
+ print(f"{v_sub_type}: {100 * v_sub_type_dict[video_type][v_sub_type]['correct'] / v_sub_type_dict[video_type][v_sub_type]['answered'] if v_sub_type_dict[video_type][v_sub_type]['answered'] > 0 else 0 : .1f}%")
+ if return_task_types_accuracy:
+ print("-------------------------------------")
+ print("Task Categories")
+ print("-------------------------------------")
+ for q_type in q_type_dict[video_type]:
+ print(f"{q_type}: {100 * q_type_dict[video_type][q_type]['correct'] / q_type_dict[video_type][q_type]['answered'] if q_type_dict[video_type][q_type]['answered'] > 0 else 0 : .1f}%")
+
+ print("-------------------------------------")
+ print("Overall Performance")
+ print("-------------------------------------")
+ total_correct = sum([q_type_dict[video_type][q_type]["correct"] for q_type in TASK_CATEGORIES])
+ total_answered = sum([q_type_dict[video_type][q_type]["answered"] for q_type in TASK_CATEGORIES])
+ print(f"Overall: {100 * total_correct / total_answered if total_answered > 0 else 0 : .1f}%")
+
+ print("\n")
+
+ # Print the results for the entire dataset
+ print("=====================================")
+ print("Evaluation on the entire dataset")
+ print("=====================================")
+
+ if return_categories_accuracy:
+ print("-------------------------------------")
+ print("Video Categories")
+ print("-------------------------------------")
+ for v_type in CATEGORIES:
+ total_correct = sum([v_type_dict[video_type][v_type]["correct"] for video_type in video_types])
+ total_answered = sum([v_type_dict[video_type][v_type]["answered"] for video_type in video_types])
+ print(f"{v_type}: {100 * total_correct / total_answered if total_answered > 0 else 0 : .1f}%")
+
+
+ if return_sub_categories_accuracy:
+ print("-------------------------------------")
+ print("Video Sub Categories")
+ print("-------------------------------------")
+
+ for v_sub_type in SUB_CATEGORIES:
+ total_correct = sum([v_sub_type_dict[video_type][v_sub_type]["correct"] for video_type in video_types])
+ total_answered = sum([v_sub_type_dict[video_type][v_sub_type]["answered"] for video_type in video_types])
+ print(f"{v_sub_type}: {100 * total_correct / total_answered if total_answered > 0 else 0 : .1f}%")
+
+
+ if return_task_types_accuracy:
+ print("-------------------------------------")
+ print("Task Categories")
+ print("-------------------------------------")
+ for q_type in TASK_CATEGORIES:
+
+ total_correct = sum([q_type_dict[video_type][q_type]["correct"] for video_type in video_types])
+ total_answered = sum([q_type_dict[video_type][q_type]["answered"] for video_type in video_types])
+ print(f"{q_type}: {100 * total_correct / total_answered if total_answered > 0 else 0 : .1f}%")
+
+ print("-------------------------------------")
+ print("Overall Performance")
+ print("-------------------------------------")
+ total_correct = sum([sum([q_type_dict[video_type][q_type]["correct"] for q_type in TASK_CATEGORIES]) for video_type in video_types])
+ total_answered = sum([sum([q_type_dict[video_type][q_type]["answered"] for q_type in TASK_CATEGORIES]) for video_type in video_types])
+ print(f"Overall: {100 * total_correct / total_answered if total_answered > 0 else 0 : .1f}%")
+
+
+
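+# Example invocation (file names are illustrative):
+#   python eval_video_mcqa_videomme.py --results_file videomme_results.json \
+#       --video_duration_type short,medium,long --return_categories_accuracy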
+if __name__ == "__main__":
+ parser = argparse.ArgumentParser()
+ parser.add_argument("--results_file", type=str, required=True)
+ parser.add_argument("--video_duration_type", type=str, required=True)
+ parser.add_argument("--return_categories_accuracy", action="store_true")
+ parser.add_argument("--return_sub_categories_accuracy", action="store_true")
+ parser.add_argument("--return_task_types_accuracy", action="store_true")
+ parser.add_argument("--skip_missing", action="store_true")
+
+ args = parser.parse_args()
+
+ eval_your_results(
+ args.results_file,
+ video_types=args.video_duration_type,
+ skip_missing=args.skip_missing,
+ return_categories_accuracy=args.return_categories_accuracy,
+ return_sub_categories_accuracy=args.return_sub_categories_accuracy,
+ return_task_types_accuracy=args.return_task_types_accuracy,
+ )
diff --git a/videollama2/eval/eval_video_qa_gpt.py b/VideoLLaMA2/videollama2/eval/eval_video_oqa_activitynet.py
similarity index 100%
rename from videollama2/eval/eval_video_qa_gpt.py
rename to VideoLLaMA2/videollama2/eval/eval_video_oqa_activitynet.py
diff --git a/videollama2/eval/eval_benchmark_1_correctness.py b/VideoLLaMA2/videollama2/eval/eval_video_oqa_vcgpt_1_correctness.py
similarity index 100%
rename from videollama2/eval/eval_benchmark_1_correctness.py
rename to VideoLLaMA2/videollama2/eval/eval_video_oqa_vcgpt_1_correctness.py
diff --git a/videollama2/eval/eval_benchmark_2_detailed_orientation.py b/VideoLLaMA2/videollama2/eval/eval_video_oqa_vcgpt_2_detailed_orientation.py
similarity index 100%
rename from videollama2/eval/eval_benchmark_2_detailed_orientation.py
rename to VideoLLaMA2/videollama2/eval/eval_video_oqa_vcgpt_2_detailed_orientation.py
diff --git a/videollama2/eval/eval_benchmark_3_context.py b/VideoLLaMA2/videollama2/eval/eval_video_oqa_vcgpt_3_context.py
similarity index 100%
rename from videollama2/eval/eval_benchmark_3_context.py
rename to VideoLLaMA2/videollama2/eval/eval_video_oqa_vcgpt_3_context.py
diff --git a/videollama2/eval/eval_benchmark_4_temporal.py b/VideoLLaMA2/videollama2/eval/eval_video_oqa_vcgpt_4_temporal.py
similarity index 100%
rename from videollama2/eval/eval_benchmark_4_temporal.py
rename to VideoLLaMA2/videollama2/eval/eval_video_oqa_vcgpt_4_temporal.py
diff --git a/videollama2/eval/eval_benchmark_5_consistency.py b/VideoLLaMA2/videollama2/eval/eval_video_oqa_vcgpt_5_consistency.py
similarity index 100%
rename from videollama2/eval/eval_benchmark_5_consistency.py
rename to VideoLLaMA2/videollama2/eval/eval_video_oqa_vcgpt_5_consistency.py
diff --git a/VideoLLaMA2/videollama2/eval/inference_video_cap_msvc.py b/VideoLLaMA2/videollama2/eval/inference_video_cap_msvc.py
new file mode 100644
index 0000000000000000000000000000000000000000..6131696947d086f4e4f66bc9af1226cbed1c1dbc
--- /dev/null
+++ b/VideoLLaMA2/videollama2/eval/inference_video_cap_msvc.py
@@ -0,0 +1,78 @@
+import math
+import os
+import argparse
+import json
+import warnings
+from tqdm import tqdm
+
+import sys
+sys.path.append('./')
+from videollama2 import model_init, mm_infer
+from videollama2.utils import disable_torch_init
+
+# NOTE: Ignore TypedStorage warning, which refers to this link~(https://github.com/pytorch/pytorch/issues/97207#issuecomment-1494781560)
+warnings.filterwarnings('ignore', category=UserWarning, message='TypedStorage is deprecated')
+
+
+def split_list(lst, n):
+ """Split a list into n (roughly) equal-sized chunks"""
+    chunk_size = math.ceil(len(lst) / n)  # ceiling division
+ return [lst[i:i+chunk_size] for i in range(0, len(lst), chunk_size)]
+
+
+def get_chunk(lst, n, k):
+ chunks = split_list(lst, n)
+ return chunks[k]
+
+
+def run_inference(args):
+ disable_torch_init()
+
+ model, processor, tokenizer = model_init(args.model_path)
+
+ gt_questions = json.load(open(args.question_file, "r"))
+ gt_questions = get_chunk(gt_questions, args.num_chunks, args.chunk_idx)
+
+    answer_file = args.output_file
+ os.makedirs(os.path.dirname(args.output_file), exist_ok=True)
+ ans_file = open(answer_file, "w")
+
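+    # Each output line is a JSON record with video_name/question/answer/pred keys,
+    # the format expected by the MSVC GPT-based evaluation scripts.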
+ video_formats = ['.mp4', '.avi', '.mov', '.mkv']
+
+ # Iterate over each sample in the ground truth file
+ for idx, sample in enumerate(tqdm(gt_questions)):
+ video_name = sample['video_path']
+ question = sample['question']
+ answer = sample['captions']
+
+ video_path = os.path.join(args.video_folder, video_name)
+ video_tensor = processor['video'](video_path)
+
+ output = mm_infer(
+ video_tensor,
+ question,
+ model=model,
+ tokenizer=tokenizer,
+ modal='video',
+ do_sample=False,
+ )
+
+ sample_set = {'video_name': video_name, 'question': question, 'answer': answer, 'pred': output}
+ ans_file.write(json.dumps(sample_set) + "\n")
+
+ ans_file.close()
+
+
+if __name__ == "__main__":
+ parser = argparse.ArgumentParser()
+
+ parser.add_argument('--model-path', help='', required=True)
+ parser.add_argument('--video-folder', help='Directory containing video files.', required=True)
+    parser.add_argument('--question-file', help='Path to the ground truth file containing questions.', required=True)
+    parser.add_argument('--output-file', help='Path to save the model results JSON file.', required=True)
+ parser.add_argument("--num-chunks", type=int, default=1)
+ parser.add_argument("--chunk-idx", type=int, default=0)
+ parser.add_argument("--device", type=str, required=False, default='cuda:0')
+ args = parser.parse_args()
+
+ run_inference(args)
diff --git a/VideoLLaMA2/videollama2/eval/inference_video_mcqa_egoschema.py b/VideoLLaMA2/videollama2/eval/inference_video_mcqa_egoschema.py
new file mode 100644
index 0000000000000000000000000000000000000000..187412d37e9519b7e0c7d69543a34da9fe9e727c
--- /dev/null
+++ b/VideoLLaMA2/videollama2/eval/inference_video_mcqa_egoschema.py
@@ -0,0 +1,148 @@
+import os
+import re
+import math
+import json
+import argparse
+import warnings
+import traceback
+
+from tqdm import tqdm
+from torch.utils.data import Dataset, DataLoader
+
+import sys
+sys.path.append('./')
+from videollama2 import model_init, mm_infer
+from videollama2.utils import disable_torch_init
+
+# NOTE: Ignore TypedStorage warning, which refers to this link~(https://github.com/pytorch/pytorch/issues/97207#issuecomment-1494781560)
+warnings.filterwarnings('ignore', category=UserWarning, message='TypedStorage is deprecated')
+
+
+def split_list(lst, n):
+ """Split a list into n (roughly) equal-sized chunks"""
+    chunk_size = math.ceil(len(lst) / n)  # ceiling division
+ return [lst[i:i+chunk_size] for i in range(0, len(lst), chunk_size)]
+
+
+def get_chunk(lst, n, k):
+ chunks = split_list(lst, n)
+ return chunks[k]
+
+
+class EgoschemaDataset(Dataset):
+
+ video_formats = ['.mp4', '.avi', '.mov', '.mkv']
+
+ def __init__(self, data_folder, data_list, processor):
+ self.data_folder = data_folder
+ self.data_list = data_list
+ self.processor = processor
+
+ def __len__(self):
+ return len(self.data_list)
+
+ def __getitem__(self, idx):
+ line = self.data_list[idx]
+ q_uid = line['q_uid']
+
+        for fmt in self.video_formats:  # pick the first existing file among the supported formats
+ temp_path = os.path.join(self.data_folder, f"{q_uid}{fmt}")
+ if os.path.exists(temp_path):
+ video_path = temp_path
+ break
+
+ video_tensor = self.processor(video_path)
+
+ question = line['question']
+ a0 = line['option 0']
+ a1 = line['option 1']
+ a2 = line['option 2']
+ a3 = line['option 3']
+ a4 = line['option 4']
+ axs = [a0, a1, a2, a3, a4]
+ ops = ['(A)', '(B)', '(C)', '(D)', '(E)']
+
+ instruct = f'Question: {question}\nOptions:\n(A) {a0}\n(B) {a1}\n(C) {a2}\n(D) {a3}\n(E) {a4}\nAnswer with the option\'s letter from the given choices directly and only give the best option.'
+
+ return {
+ 'q_uid': q_uid,
+ 'video': video_tensor,
+ 'instruct': instruct,
+ }
+
+
+def build_egoschema_eval(args, processor):
+ questions = json.load(open(args.question_file, "r"))
+ questions = get_chunk(questions, args.num_chunks, args.chunk_idx)
+ dataset = EgoschemaDataset(args.video_folder, questions, processor)
+ dataloader = DataLoader(dataset, batch_size=args.batch_size, shuffle=False, num_workers=args.num_workers)
+
+ return dataloader
+
+
+def egoschema_dump(ans_file, line, outputs):
+ for idx, output in enumerate(outputs):
+ q_uid = line['q_uid'][idx]
+ instruct = line['instruct'][idx]
+ letters = ['A', 'B', 'C', 'D', 'E']
+
+ output = output.replace('answer', '')
+ output = output.replace('Answer', '')
+        pred_answer = re.findall(r'[\(\ ]*[A-E][\)\ ]*', output)
+ try:
+ assert len(pred_answer) >= 1, 'The video \"{}\" output \"{}\" is not in the expected format'.format(line['q_uid'], instruct + '\n' + output)
+ pred_answer = pred_answer[0].strip()
+ pred_answer = pred_answer.strip('()')
+ pred_idx = letters.index(pred_answer)
+ except:
+ traceback.print_exc()
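+            # fall back to a fixed option (index 2, i.e. 'C') when no letter can be parsed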
+ pred_idx = 2
+
+ ans_file.write(f'{q_uid}, {pred_idx}\n')
+
+
+def run_inference(args):
+ disable_torch_init()
+
+ model, processor, tokenizer = model_init(args.model_path)
+
+ answer_file = os.path.expanduser(args.answer_file)
+ os.makedirs(os.path.dirname(answer_file), exist_ok=True)
+ ans_file = open(answer_file, "w")
+
+ val_loader = build_egoschema_eval(args, processor['video'])
+
+ # Iterate over each sample in the ground truth file
+ for i, line in enumerate(tqdm(val_loader)):
+ video_tensor = line['video'][0]
+ instruct = line['instruct'][0]
+
+ pred = mm_infer(
+ video_tensor,
+ instruct,
+ model=model,
+ tokenizer=tokenizer,
+ modal='video',
+ do_sample=False,
+ )
+
+ egoschema_dump(ans_file, line, [pred])
+
+ ans_file.close()
+
+
+if __name__ == "__main__":
+ parser = argparse.ArgumentParser(description='Multiple-Choice Video QA Evaluation Script.')
+
+ parser.add_argument('--model-path', help='', required=True)
+ parser.add_argument('--video-folder', help='Directory containing video files.', required=True)
+    parser.add_argument('--question-file', help='Path to the ground truth file containing questions.', required=True)
+    parser.add_argument('--answer-file', help='Path to save the model predictions.', required=True)
+ parser.add_argument("--num-chunks", type=int, default=1)
+ parser.add_argument("--chunk-idx", type=int, default=0)
+ parser.add_argument("--device", type=str, required=False, default='cuda:0')
+ parser.add_argument("--batch-size", type=int, default=1)
+ parser.add_argument("--num-workers", type=int, default=8)
+ args = parser.parse_args()
+
+ run_inference(args)
diff --git a/VideoLLaMA2/videollama2/eval/inference_video_mcqa_mvbench.py b/VideoLLaMA2/videollama2/eval/inference_video_mcqa_mvbench.py
new file mode 100644
index 0000000000000000000000000000000000000000..7230ee672faa637b5bab735e8a739e4f5fe10ab3
--- /dev/null
+++ b/VideoLLaMA2/videollama2/eval/inference_video_mcqa_mvbench.py
@@ -0,0 +1,203 @@
+import os
+import re
+import math
+import json
+import argparse
+import warnings
+import traceback
+
+import torch
+import numpy as np
+from PIL import Image
+from tqdm import tqdm
+from decord import VideoReader, cpu
+from torch.utils.data import Dataset, DataLoader
+
+import sys
+sys.path.append('./')
+from videollama2 import model_init, mm_infer
+from videollama2.utils import disable_torch_init
+
+# NOTE: Ignore TypedStorage warning, which refers to this link~(https://github.com/pytorch/pytorch/issues/97207#issuecomment-1494781560)
+warnings.filterwarnings('ignore', category=UserWarning, message='TypedStorage is deprecated')
+
+
+def split_list(lst, n):
+ """Split a list into n (roughly) equal-sized chunks"""
+    chunk_size = math.ceil(len(lst) / n)  # ceiling division
+ return [lst[i:i+chunk_size] for i in range(0, len(lst), chunk_size)]
+
+
+def get_chunk(lst, n, k):
+ chunks = split_list(lst, n)
+ return chunks[k]
+
+
+class MVBenchDataset(Dataset):
+
+ def __init__(self, data_list, processor):
+ self.data_list = data_list
+ self.processor = processor
+
+ def __len__(self):
+ return len(self.data_list)
+
+ def __getitem__(self, idx):
+ bound = (None, None)
+ if self.data_list[idx]['bound']:
+ bound = (self.data_list[idx]['data']['start'], self.data_list[idx]['data']['end'])
+ video_path = os.path.join(self.data_list[idx]['prefix'], self.data_list[idx]['data']['video'])
+ torch_imgs = self.processor(video_path, s=bound[0], e=bound[1])
+ question = self.data_list[idx]['data']['question']
+ options = self.data_list[idx]['data']['candidates']
+ answer = self.data_list[idx]['data']['answer']
+ task_type = self.data_list[idx]['task_type']
+
+ answer_idx = -1
+ letters = []
+ options_string = ''
+ for option_idx, c in enumerate(options):
+ letters.append(f"{chr(ord('A') + option_idx)}")
+ options_string += f"({chr(ord('A') + option_idx)}) {c}\n"
+ if c == answer:
+ answer_idx = option_idx
+
+ instruct = f'Question: {question}\nOptions:\n{options_string}Answer with the option\'s letter from the given choices directly and only give the best option.'
+
+ return {
+ 'video': torch_imgs,
+ 'video_path': video_path,
+ 'instruct': instruct,
+ 'letters': letters,
+ 'options': options,
+ 'answer_idx': answer_idx,
+ 'task_type': task_type
+ }
+
+
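+# Each MVBench task maps to (annotation json, media folder prefix, data type,
+# whether the sample carries start/end bounds).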
+tasks = {
+ "Action Sequence": ("action_sequence.json", "star/Charades_v1_480/", "video", True), # has start & end
+ "Action Prediction": ("action_prediction.json", "star/Charades_v1_480/", "video", True), # has start & end
+ "Action Antonym": ("action_antonym.json", "ssv2_video/", "video", False),
+ "Fine-grained Action": ("fine_grained_action.json", "Moments_in_Time_Raw/videos/", "video", False),
+ "Unexpected Action": ("unexpected_action.json", "FunQA_test/test/", "video", False),
+ "Object Existence": ("object_existence.json", "clevrer/video_validation/", "video", False),
+ "Object Interaction": ("object_interaction.json", "star/Charades_v1_480/", "video", True), # has start & end
+ "Object Shuffle": ("object_shuffle.json", "perception/videos/", "video", False),
+ "Moving Direction": ("moving_direction.json", "clevrer/video_validation/", "video", False),
+ "Action Localization": ("action_localization.json", "sta/sta_video/", "video", True), # has start & end
+ "Scene Transition": ("scene_transition.json", "scene_qa/video/", "video", False),
+ "Action Count": ("action_count.json", "perception/videos/", "video", False),
+ "Moving Count": ("moving_count.json", "clevrer/video_validation/", "video", False),
+ "Moving Attribute": ("moving_attribute.json", "clevrer/video_validation/", "video", False),
+ "State Change": ("state_change.json", "perception/videos/", "video", False),
+ "Fine-grained Pose": ("fine_grained_pose.json", "nturgbd/", "video", False),
+ "Character Order": ("character_order.json", "perception/videos/", "video", False),
+ "Egocentric Navigation": ("egocentric_navigation.json", "vlnqa/", "video", False),
+ "Episodic Reasoning": ("episodic_reasoning.json", "tvqa/frames_fps3_hq/", "frame", True), # has start & end, read frame
+ "Counterfactual Inference": ("counterfactual_inference.json", "clevrer/video_validation/", "video", False),
+}
+
+
+def build_mvbench_eval(args, processor):
+ data_list = []
+ for task_name, task in tasks.items():
+ json_file = os.path.join(args.question_file, task[0])
+ vis_folder = os.path.join(args.video_folder, task[1])
+ with open(json_file, 'r') as f:
+ json_data = json.load(f)
+ for data in json_data:
+ data_list.append({
+ 'task_type': task_name,
+ 'prefix': vis_folder,
+ 'data_type': task[2],
+ 'bound': task[3],
+ 'data': data
+ })
+ data_list = get_chunk(data_list, args.num_chunks, args.chunk_idx)
+ dataset = MVBenchDataset(data_list, processor)
+ dataloader = DataLoader(dataset, batch_size=args.batch_size, shuffle=False, num_workers=args.num_workers)
+
+ return dataloader
+
+
+def mvbench_dump(vid, instruct, letters, options, output):
+
+ output = output.replace('answer', '')
+ output = output.replace('Answer', '')
+    pred_answer = re.findall(rf'[\(,\ ]*[{letters[0]}-{letters[-1]}][\),\ ]*', output)
+ try:
+ find_flag = False
+ if len(pred_answer) == 0:
+ for idx, opt in enumerate(options):
+                # fall back to matching the option text inside the model output
+ if opt.lower() in output.lower():
+ pred_idx = idx
+ find_flag = True
+ break
+ else:
+ pred_answer = pred_answer[0].strip()
+ pred_answer = pred_answer.strip('()')
+ pred_idx = letters.index(pred_answer)
+ find_flag = True
+
+ assert find_flag, 'The video \"{}\" output: \n\"{}\" is not in the expected format'.format(vid, instruct + '\n' + output)
+ except:
+ traceback.print_exc()
+ pred_idx = 2
+
+ return pred_idx
+
+
+def run_inference(args):
+ disable_torch_init()
+
+ model, processor, tokenizer = model_init(args.model_path)
+
+ answer_file = os.path.expanduser(args.answer_file)
+ os.makedirs(os.path.dirname(answer_file), exist_ok=True)
+ ans_file = open(answer_file, "w")
+
+ val_loader = build_mvbench_eval(args, processor['video'])
+
+ # NOTE: only support batch size 1 for now
+ for i, line in enumerate(tqdm(val_loader)):
+ vid = line['video_path'][0]
+ video_tensor = line['video'][0]
+ task_type = line['task_type'][0]
+ instruct = line['instruct'][0]
+ letters = list(zip(*line['letters']))[0]
+ options = list(zip(*line['options']))[0]
+ answer_idx = line['answer_idx'][0].item()
+
+ output = mm_infer(
+ video_tensor,
+ instruct,
+ model=model,
+ tokenizer=tokenizer,
+ modal='video',
+ do_sample=False,
+ )
+
+ pred_idx = mvbench_dump(vid, instruct, letters, options, output)
+
+ ans_file.write(json.dumps({"vid": vid, "task_type": task_type, "pred": pred_idx, "gt": answer_idx}) + '\n')
+
+ ans_file.close()
+
+
+if __name__ == "__main__":
+ parser = argparse.ArgumentParser()
+
+ parser.add_argument('--model-path', help='', required=True)
+ parser.add_argument('--video-folder', help='Directory containing video files.', required=True)
+    parser.add_argument('--question-file', help='Directory containing the MVBench task annotation json files.', required=True)
+    parser.add_argument('--answer-file', help='Path to save the model predictions.', required=True)
+ parser.add_argument("--num-chunks", type=int, default=1)
+ parser.add_argument("--chunk-idx", type=int, default=0)
+ parser.add_argument("--device", type=str, required=False, default='cuda:0')
+ parser.add_argument("--batch-size", type=int, default=1)
+ parser.add_argument("--num-workers", type=int, default=8)
+ args = parser.parse_args()
+
+ run_inference(args)
diff --git a/videollama2/eval/run_inference_video_qa_perception_test_mcqa.py b/VideoLLaMA2/videollama2/eval/inference_video_mcqa_perception_test_mcqa.py
similarity index 51%
rename from videollama2/eval/run_inference_video_qa_perception_test_mcqa.py
rename to VideoLLaMA2/videollama2/eval/inference_video_mcqa_perception_test_mcqa.py
index 1f0f1abf0613670fa39f540d7171a95b2f02a7d2..a0e59c5519da2315d2f7766a9f81b7736e5fdbf4 100644
--- a/videollama2/eval/run_inference_video_qa_perception_test_mcqa.py
+++ b/VideoLLaMA2/videollama2/eval/inference_video_mcqa_perception_test_mcqa.py
@@ -4,30 +4,16 @@ import math
import json
import argparse
import warnings
+import traceback
from tqdm import tqdm
import torch
-import decord
-import numpy as np
-import transformers
-from decord import VideoReader, cpu
from torch.utils.data import Dataset, DataLoader
import sys
sys.path.append('./')
-from videollama2.conversation import conv_templates, SeparatorStyle
-from videollama2.constants import NUM_FRAMES, DEFAULT_MMODAL_TOKEN, DEFAULT_MMODAL_START_TOKEN, DEFAULT_MMODAL_END_TOKEN, MMODAL_TOKEN_INDEX
-from videollama2.mm_utils import get_model_name_from_path, tokenizer_MMODAL_token, KeywordsStoppingCriteria, process_videos
-from videollama2.model.builder import load_pretrained_model
-
-
-# NOTE: Ignore TypedStorage warning, which refers to this link~(https://github.com/pytorch/pytorch/issues/97207#issuecomment-1494781560)
-warnings.filterwarnings('ignore', category=UserWarning, message='TypedStorage is deprecated')
-
-default_mm_token = DEFAULT_MMODAL_TOKEN["VIDEO"]
-default_mm_start_token = DEFAULT_MMODAL_START_TOKEN["VIDEO"]
-default_mm_end_token = DEFAULT_MMODAL_END_TOKEN["VIDEO"]
-modal_token_index = MMODAL_TOKEN_INDEX["VIDEO"]
+from videollama2 import model_init, mm_infer
+from videollama2.utils import disable_torch_init
def split_list(lst, n):
@@ -45,10 +31,9 @@ class PerceptionTestMCQADataset(Dataset):
video_formats = ['.mp4', '.avi', '.mov', '.mkv']
- def __init__(self, data_list, processor, num_segments=8):
+ def __init__(self, data_list, processor):
self.data_list = data_list
self.processor = processor
- self.num_segments = num_segments
def __len__(self):
return len(self.data_list)
@@ -63,28 +48,26 @@ class PerceptionTestMCQADataset(Dataset):
if os.path.exists(temp_path):
video_path = temp_path
break
+
+ video_tensor = self.processor(video_path)
- decord_vr = VideoReader(uri=video_path, ctx=cpu(0))
- frames = decord_vr.get_batch(np.linspace(0, len(decord_vr) - 1, self.num_segments, dtype=int)).asnumpy()
- video_tensor = self.processor.preprocess(frames, return_tensors='pt')['pixel_values'] # do not pad for video frames
-
- qs = []
+ instructs = []
qids = []
ops = []
for q in mc_questions:
question = q['question']
qid = q['id']
options = q['options']
- option_question = f'Question: {question}\nOptions:\n(A) {options[0]}\n(B) {options[1]}\n(C) {options[2]}\nAnswer with the option\'s letter from the given choices directly and only give the best option.'
+ instruct = f'Question: {question}\nOptions:\n(A) {options[0]}\n(B) {options[1]}\n(C) {options[2]}\nAnswer with the option\'s letter from the given choices directly and only give the best option.'
- qs.append(option_question)
+ instructs.append(instruct)
qids.append(qid)
ops.append(options)
return {
'video': video_tensor,
'video_id': video_name,
- 'questions': qs,
+ 'instructs': instructs,
'question_ids': qids,
'options': ops,
}
@@ -93,98 +76,73 @@ class PerceptionTestMCQADataset(Dataset):
def collate_fn(batch):
vid = [x['video'] for x in batch]
v_id = [x['video_id'] for x in batch]
- qs = [x['questions'] for x in batch]
+ ins = [x['instructs'] for x in batch]
q_ids = [x['question_ids'] for x in batch]
ops = [x['options'] for x in batch]
vid = torch.stack(vid, dim=0)
- return vid, v_id, qs, q_ids, ops
-
-
-def get_model_output(model, tokenizer, qs, video_tensor, args):
- if model.config.mm_use_im_start_end:
- qs = default_mm_start_token + default_mm_token + default_mm_end_token + "\n" + qs
- else:
- qs = default_mm_token + "\n" + qs
-
- conv = conv_templates[args.conv_mode].copy()
- conv.append_message(conv.roles[0], qs)
- conv.append_message(conv.roles[1], None)
- prompt = conv.get_prompt()
-
- # input_ids = tokenizer_image_token(prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt').unsqueeze(0).to(args.device)
- input_ids = tokenizer_MMODAL_token(prompt, tokenizer, modal_token_index, return_tensors='pt').to(args.device)
-
- attention_mask=input_ids.ne(tokenizer.pad_token_id).to(args.device)
-
- modal_list = ["video"]
- video_tensor = video_tensor.to(dtype=torch.float16, device=args.device, non_blocking=True)
-
- with torch.inference_mode():
- output_ids = model.generate(
- input_ids.unsqueeze(0),
- attention_mask=attention_mask.unsqueeze(0),
- images_or_videos=[video_tensor],
- modal_list=modal_list,
- do_sample=False,
- max_new_tokens=1024,
- use_cache=True,
- pad_token_id=tokenizer.eos_token_id)
-
- outputs = tokenizer.batch_decode(output_ids, skip_special_tokens=True)[0].strip()
- return outputs
+ return vid, v_id, ins, q_ids, ops
def run_inference(args):
- # Initialize the model
- model_name = get_model_name_from_path(args.model_path)
- tokenizer, model, processor, context_len = load_pretrained_model(args.model_path, args.model_base, model_name)
+ disable_torch_init()
+
+ model, processor, tokenizer = model_init(args.model_path)
questions = json.load(open(args.question_file, "r"))
questions = list(questions.values())
questions = get_chunk(questions, args.num_chunks, args.chunk_idx)
- num_frames = model.config.num_frames if hasattr(model.config, "num_frames") else NUM_FRAMES
-
assert args.batch_size == 1, "Batch size must be 1 for inference"
- dataset = PerceptionTestMCQADataset(questions, processor, num_frames)
+ dataset = PerceptionTestMCQADataset(questions, processor['video'])
dataloader = DataLoader(dataset, shuffle=False, batch_size=args.batch_size, num_workers=args.num_workers, collate_fn=collate_fn)
answer_file = os.path.expanduser(args.answer_file)
os.makedirs(os.path.dirname(answer_file), exist_ok=True)
ans_file = open(answer_file, "w")
- output_list = [] # List to store the output results
-
# Iterate over each sample in the ground truth file
- for i, (video_tensor, video_id, questions, question_ids, options) in enumerate(tqdm(dataloader)):
+ for i, (video_tensor, video_id, instructs, question_ids, options) in enumerate(tqdm(dataloader)):
# reduce batch dimension
video_tensor = video_tensor[0]
video_id = video_id[0]
- questions = questions[0]
+ instructs = instructs[0]
question_ids = question_ids[0]
options = options[0]
qas = []
- for idx, question in enumerate(questions):
+ for idx, instruct in enumerate(instructs):
letters = ['(A)', '(B)', '(C)']
question_id = question_ids[idx]
_options = options[idx]
- output = get_model_output(model, tokenizer, question, video_tensor, args)
+ output = mm_infer(
+ video_tensor,
+ instruct,
+ model=model,
+ tokenizer=tokenizer,
+ modal='video',
+ do_sample=False,
+ )
+
+ output = output.replace('answer', '')
+ output = output.replace('Answer', '')
pred_answer = re.findall('\(*[A-C]\)*', output)
- if len(pred_answer) == 0:
+ try:
+ assert len(pred_answer) >= 1, 'The video \"{}\" output \"{}\" is not in the expected format'.format(video_id, instruct + '\n' + output)
+ pred_answer = pred_answer[0].strip()
+ # if not pred_answer.startswith('('):
+ pred_answer = pred_answer.strip('()')
+ pred_answer = f'({pred_answer})'
+ pred_idx = letters.index(pred_answer)
+ except:
+ traceback.print_exc()
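+                # fall back to matching the raw output against the option text; default to (C) otherwise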
tmp_options = [x.lower() for x in _options]
if output.lower() in tmp_options:
tmp_options = [x.lower() for x in _options]
pred_idx = tmp_options.index(output.lower())
else:
pred_idx = 2
- else:
- pred_answer = pred_answer[0].strip()
- if not pred_answer.startswith('('):
- pred_answer = f'({pred_answer})'
- pred_idx = letters.index(pred_answer)
qas.append({'id': question_id, 'answer_id': pred_idx, 'answer': _options[pred_idx]})
@@ -196,13 +154,10 @@ def run_inference(args):
if __name__ == "__main__":
parser = argparse.ArgumentParser()
- # Define the command-line arguments
parser.add_argument('--model-path', help='', required=True)
- parser.add_argument('--model_base', help='', default=None, type=str, required=False)
parser.add_argument('--video-folder', help='Directory containing video files.', required=True)
parser.add_argument('--question-file', help='Path to the ground truth file containing question.', required=True)
parser.add_argument('--answer-file', help='Path to the ground truth file containing answers.', required=True)
- parser.add_argument("--conv-mode", type=str, default="llava_v1")
parser.add_argument("--num-chunks", type=int, default=1)
parser.add_argument("--chunk-idx", type=int, default=0)
parser.add_argument("--device", type=str, required=False, default='cuda:0')
diff --git a/VideoLLaMA2/videollama2/eval/inference_video_mcqa_videomme.py b/VideoLLaMA2/videollama2/eval/inference_video_mcqa_videomme.py
new file mode 100644
index 0000000000000000000000000000000000000000..0e5a69810e474758122228c9777f9271c80c95f9
--- /dev/null
+++ b/VideoLLaMA2/videollama2/eval/inference_video_mcqa_videomme.py
@@ -0,0 +1,304 @@
+import os
+import re
+import math
+import json
+import copy
+import argparse
+import warnings
+import traceback
+
+import cv2
+import torch
+import pysubs2
+import numpy as np
+import pyarrow.parquet as pq
+from tqdm import tqdm
+from torch.utils.data import Dataset, DataLoader
+
+import sys
+sys.path.append('./')
+from videollama2 import model_init, mm_infer
+from videollama2.utils import disable_torch_init
+
+# NOTE: Ignore the TypedStorage deprecation warning; see https://github.com/pytorch/pytorch/issues/97207#issuecomment-1494781560
+warnings.filterwarnings('ignore', category=UserWarning, message='TypedStorage is deprecated')
+
+
+def split_list(lst, n):
+ """Split a list into n (roughly) equal-sized chunks"""
+    chunk_size = math.ceil(len(lst) / n)  # ceiling division
+ return [lst[i:i+chunk_size] for i in range(0, len(lst), chunk_size)]
+
+
+def get_chunk(lst, n, k):
+ chunks = split_list(lst, n)
+ return chunks[k]
+
+
+def get_seq_frames(total_num_frames, desired_num_frames):
+ """
+ Calculate the indices of frames to extract from a video.
+
+ Parameters:
+ total_num_frames (int): Total number of frames in the video.
+ desired_num_frames (int): Desired number of frames to extract.
+
+ Returns:
+ list: List of indices of frames to extract.
+ """
+
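+    # e.g., get_seq_frames(100, 4) -> [12, 37, 62, 86], roughly the centers of four equal segments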
+ # Calculate the size of each segment from which a frame will be extracted
+ seg_size = float(total_num_frames - 1) / desired_num_frames
+
+ seq = []
+ for i in range(desired_num_frames):
+ # Calculate the start and end indices of each segment
+ start = int(np.round(seg_size * i))
+ end = int(np.round(seg_size * (i + 1)))
+
+ # Append the middle index of the segment to the list
+ seq.append((start + end) // 2)
+
+ return seq
+
+
+class VideoMMEDataset(Dataset):
+
+ video_formats = ['.mp4', '.avi', '.mov', '.mkv']
+
+ def __init__(self, video_folder, subtitle_folder, data_list, processor):
+ self.video_folder = video_folder
+ self.subtitle_folder = subtitle_folder
+ self.data_list = data_list
+ self.processor = processor
+
+ def __len__(self):
+ return len(self.data_list)
+
+ def __getitem__(self, idx):
+ line = self.data_list[idx]
+
+ video_ytid = line['url'].split('watch?v=')[-1]
+
+        for fmt in self.video_formats:  # pick the first existing file among the supported formats
+ temp_path = os.path.join(self.video_folder, f'{video_ytid}{fmt}')
+ if os.path.exists(temp_path):
+ video_path = temp_path
+ break
+
+ subtitle_path = os.path.join(self.subtitle_folder, f'{video_ytid}.srt')
+
+ try:
+ video_tensor = self.processor(video_path)
+ num_frames = video_tensor.shape[0]
+        except Exception:
+ traceback.print_exc()
+            print(f'Error occurred when reading {video_ytid}')
+ video_tensor = None
+ num_frames = 0
+
+ if video_tensor is not None and os.path.exists(subtitle_path):
+ cv2_vr = cv2.VideoCapture(video_path)
+ duration = int(cv2_vr.get(cv2.CAP_PROP_FRAME_COUNT))
+ fps = cv2_vr.get(cv2.CAP_PROP_FPS)
+ selected_frame_ids = get_seq_frames(duration, num_frames)
+
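+            # For each sampled frame, keep the subtitle cue (if any) that is active at that frame's timestamp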
+ subs = pysubs2.load(subtitle_path, encoding="utf-8")
+ subtitles = []
+            for selected_frame_id in selected_frame_ids:
+                sub_text = ""
+                cur_time = pysubs2.make_time(fps=fps, frames=selected_frame_id)
+ for sub in subs:
+ if sub.start < cur_time and sub.end > cur_time:
+ sub_text = sub.text.replace("\\N", " ")
+ break
+ if sub_text.strip():
+ subtitles.append(sub_text)
+ subtitles = "\n".join(subtitles)
+ else:
+ subtitles = ""
+
+ return {
+ 'video': video_tensor,
+ 'subtitle': subtitles,
+ 'record': line,
+ }
+
+
+def collate_fn(batch):
+ vid = [x['video'] for x in batch]
+ sub = [x['subtitle'] for x in batch]
+ rcs = [x['record'] for x in batch]
+ return vid, sub, rcs
+
+
+def load_parquet(parquet_file):
+ table = pq.read_table(parquet_file)
+
+ # Convert PyArrow Table to pandas DataFrame
+ df = table.to_pandas()
+
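+    # Group the flat per-question parquet rows into one record per video with a nested
+    # 'questions' list, so each video clip only needs to be decoded once for all of its questions.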
+ jsons = []
+ for record in df.itertuples():
+
+ if len(jsons) < int(record.video_id):
+ jsons.append({
+ "video_id": record.video_id,
+ "youtube_id": record.videoID,
+ "url": record.url,
+ "duration": record.duration,
+ "domain": record.domain,
+ "sub_category": record.sub_category,
+ "questions": [
+ {
+ "question_id": record.question_id,
+ "task_type": record.task_type,
+ "question": record.question,
+ "choices": list(record.options),
+ "answer": record.answer,
+ }
+ ]
+ })
+ else:
+ jsons[-1]['questions'].append({
+ "question_id": record.question_id,
+ "task_type": record.task_type,
+ "question": record.question,
+ "choices": list(record.options),
+ "answer": record.answer,
+ })
+
+ return jsons
+
+
+def build_videomme_eval(args, processor):
+ # convert parquet to json
+ questions = load_parquet(args.question_file)
+ # questions = json.load(open(args.question_file, "r"))
+ questions = get_chunk(questions, args.num_chunks, args.chunk_idx)
+ dataset = VideoMMEDataset(args.video_folder, args.subtitle_folder, questions, processor)
+ dataloader = DataLoader(dataset, batch_size=args.batch_size, shuffle=False, num_workers=args.num_workers, collate_fn=collate_fn)
+
+ return dataloader
+
+
+def videomme_dump(record, instruct, options, output):
+ letters = ['A', 'B', 'C', 'D']
+
+ digit2word = {
+ '1': 'one',
+ '2': 'two',
+ '3': 'three',
+ '4': 'four',
+ '5': 'five',
+ '6': 'six',
+ '7': 'seven',
+ '8': 'eight',
+ '9': 'nine',
+ '0': 'zero',
+ }
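+    # Mapping digits to words lets free-form outputs such as 'two people' still match a numeric option like '2'.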
+
+ output = output.replace('answer', '')
+ output = output.replace('Answer', '')
+    pred_answer = re.findall(r'[\(\ \[]*([A-D])[\)\.\ \]]*', output)
+ try:
+ find_flag = False
+ if len(pred_answer) == 0:
+ for idx, opt in enumerate(options):
+ # Arabic numerals -> English words
+ opt2 = opt
+ if opt in digit2word:
+ opt2 = digit2word[opt]
+ if opt.lower() in output.lower() or opt2.lower() in output.lower():
+ pred_idx = idx
+ find_flag = True
+ break
+ else:
+ pred_answer = pred_answer[0].strip()
+ pred_answer = pred_answer.strip('()')
+ pred_idx = letters.index(pred_answer)
+ find_flag = True
+
+ assert find_flag, 'The video \"{}\" output: \n\"{}\" is not in the expected format'.format(record['youtube_id'], instruct + '\n' + output)
+    except Exception:
+ traceback.print_exc()
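+        # Fall back to option 'C' (index 2) when no answer letter can be parsed from the output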
+ pred_idx = 2
+
+ return letters[pred_idx]
+
+
+def run_inference(args):
+ disable_torch_init()
+
+ # Initialize the model
+ model, processor, tokenizer = model_init(args.model_path)
+
+ answer_file = os.path.expanduser(args.answer_file)
+ answer_sub_file = answer_file.replace('.json', '_sub.json')
+ os.makedirs(os.path.dirname(answer_file), exist_ok=True)
+ ans_file = open(answer_file, "w")
+ ans_sub_file = open(answer_sub_file, "w")
+
+ val_loader = build_videomme_eval(args, processor['video'])
+
+ # Iterate over each sample in the ground truth file
+ for i, (videos, subtitles, records) in enumerate(tqdm(val_loader)):
+ video_tensor = videos[0]
+ subtitle = subtitles[0]
+ record = records[0]
+
+ new_record = copy.deepcopy(record)
+ new_record_sub = copy.deepcopy(record)
+
+ if video_tensor is None:
+ new_record['missing'] = True
+ ans_file.write(json.dumps(new_record) + ",\n")
+ new_record_sub['missing'] = True
+ ans_sub_file.write(json.dumps(new_record_sub) + ",\n")
+ continue
+ else:
+ new_record['missing'] = False
+ new_record_sub['missing'] = False
+
+ questions = record['questions']
+ for idx, question in enumerate(questions):
+ q = question['question']
+ choices = question['choices']
+            options = [re.findall(r'[A-D]\. (.*).', c)[0] for c in choices]
+
+ instruct = "Select the best answer to the following multiple-choice question based on the video. Respond with only the letter (A, B, C, or D) of the correct option.\n"
+ instruct += f"{q}\n"
+ for cho_idx, cho in enumerate(choices):
+ instruct += f"{cho}\n"
+ # instruct += "The best option is: "
+ instruct += "Answer with the option\'s letter from the given choices directly and only give the best option. The best answer is: "
+ output = mm_infer(video_tensor, instruct, model=model, tokenizer=tokenizer, modal='video', do_sample=False)
+ new_record['questions'][idx]['response'] = videomme_dump(record, instruct, options, output)
+
+ instruct = f"This video's subtitles are listed below:\n{subtitle}\n" + instruct
+ output = mm_infer(video_tensor, instruct, model=model, tokenizer=tokenizer, modal='video', do_sample=False)
+ new_record_sub['questions'][idx]['response'] = videomme_dump(record, instruct, options, output)
+
+ ans_file.write(json.dumps(new_record) + ",\n")
+ ans_sub_file.write(json.dumps(new_record_sub) + ",\n")
+
+ ans_file.close()
+ ans_sub_file.close()
+
+
+if __name__ == "__main__":
+ parser = argparse.ArgumentParser()
+
+ parser.add_argument('--model-path', help='', required=True)
+ parser.add_argument('--video-folder', help='Directory containing video files.', required=True)
+ parser.add_argument('--subtitle-folder', help='Directory containing subtitle files.', required=True)
+ parser.add_argument('--question-file', help='Path to the ground truth file containing question.', required=True)
+ parser.add_argument('--answer-file', help='Path to the ground truth file containing answers.', required=True)
+ parser.add_argument("--num-chunks", type=int, default=1)
+ parser.add_argument("--chunk-idx", type=int, default=0)
+ parser.add_argument("--device", type=str, required=False, default='cuda:0')
+ parser.add_argument("--batch-size", type=int, default=1)
+ parser.add_argument("--num-workers", type=int, default=8)
+ args = parser.parse_args()
+
+ run_inference(args)
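+
+# Example invocation (hypothetical local paths, for illustration only):
+#   python3 videollama2/eval/inference_video_mcqa_videomme.py \
+#       --model-path DAMO-NLP-SG/VideoLLaMA2-7B \
+#       --video-folder videomme/videos --subtitle-folder videomme/subtitles \
+#       --question-file videomme/test.parquet \
+#       --answer-file results/videomme/answers.json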
diff --git a/VideoLLaMA2/videollama2/eval/inference_video_oqa_activitynet.py b/VideoLLaMA2/videollama2/eval/inference_video_oqa_activitynet.py
new file mode 100644
index 0000000000000000000000000000000000000000..dc7a77945f4a32718d05cff958b0a747b9af5d05
--- /dev/null
+++ b/VideoLLaMA2/videollama2/eval/inference_video_oqa_activitynet.py
@@ -0,0 +1,142 @@
+import os
+import json
+import math
+import argparse
+import warnings
+import traceback
+from tqdm import tqdm
+
+from torch.utils.data import Dataset, DataLoader
+
+import sys
+sys.path.append('./')
+from videollama2 import model_init, mm_infer
+from videollama2.utils import disable_torch_init
+
+# NOTE: Ignore TypedStorage warning, which refers to this link~(https://github.com/pytorch/pytorch/issues/97207#issuecomment-1494781560)
+warnings.filterwarnings('ignore', category=UserWarning, message='TypedStorage is deprecated')
+
+
+def split_list(lst, n):
+ """Split a list into n (roughly) equal-sized chunks"""
+    chunk_size = math.ceil(len(lst) / n)  # ceiling division
+ return [lst[i:i+chunk_size] for i in range(0, len(lst), chunk_size)]
+
+
+def get_chunk(lst, n, k):
+ chunks = split_list(lst, n)
+ return chunks[k]
+
+
+class ActivitynetDataset(Dataset):
+
+ video_formats = ['.mp4', '.webm', '.avi', '.mov', '.mkv']
+
+ def __init__(self, questions, answers, processor):
+ self.questions = questions
+ self.answers = answers
+ self.processor = processor
+
+ def __len__(self):
+ return len(self.questions)
+
+ def __getitem__(self, idx):
+ sample = self.questions[idx]
+ answer = self.answers[idx]
+
+ video_name = sample['video_name']
+ question = sample['question']
+ question_id = sample['question_id']
+ answer = answer['answer']
+
+        for fmt in self.video_formats:  # pick the first existing file among the supported formats
+ temp_path = os.path.join(args.video_folder, f"v_{video_name}{fmt}")
+ if os.path.exists(temp_path):
+ video_path = temp_path
+ break
+            # NOTE: fallback filename pattern (without the 'v_' prefix) for MSVD, MSRVTT, and TGIF compatibility
+ temp_path = os.path.join(args.video_folder, f"{video_name}{fmt}")
+ if os.path.exists(temp_path):
+ video_path = temp_path
+ break
+
+ video_tensor = self.processor(video_path)
+
+ return {
+ 'video': video_tensor,
+ 'video_name': video_name,
+ 'question': question,
+ 'question_id': question_id,
+ 'answer': answer,
+ }
+
+
+def collate_fn(batch):
+ vid = [x['video'] for x in batch]
+ v_id = [x['video_name'] for x in batch]
+ qus = [x['question'] for x in batch]
+ qid = [x['question_id'] for x in batch]
+ ans = [x['answer'] for x in batch]
+ return vid, v_id, qus, qid, ans
+
+
+def run_inference(args):
+ disable_torch_init()
+
+ # Initialize the model
+ model, processor, tokenizer = model_init(args.model_path)
+
+ gt_questions = json.load(open(args.question_file, "r"))
+ gt_questions = get_chunk(gt_questions, args.num_chunks, args.chunk_idx)
+ gt_answers = json.load(open(args.answer_file, "r"))
+ gt_answers = get_chunk(gt_answers, args.num_chunks, args.chunk_idx)
+
+ assert args.batch_size == 1, "Batch size must be 1 for inference"
+ dataset = ActivitynetDataset(gt_questions, gt_answers, processor['video'])
+ dataloader = DataLoader(dataset, shuffle=False, batch_size=args.batch_size, num_workers=args.num_workers, collate_fn=collate_fn)
+
+    answer_file = os.path.expanduser(args.output_file)
+ os.makedirs(os.path.dirname(args.output_file), exist_ok=True)
+ ans_file = open(answer_file, "w")
+
+ # Iterate over each sample in the ground truth file
+ for i, (video_tensors, video_names, questions, question_ids, answers) in enumerate(tqdm(dataloader)):
+ video_tensor = video_tensors[0]
+ video_name = video_names[0]
+ question = questions[0]
+ question_id = question_ids[0]
+ answer = answers[0]
+
+ # question = question + '\n' + 'Answer the question using a single word or a short phrase with multiple words.'
+
+ output = mm_infer(
+ video_tensor,
+ question,
+ model=model,
+ tokenizer=tokenizer,
+ modal='video',
+ do_sample=False,
+ )
+
+ sample_set = {'id': question_id, 'question': question, 'answer': answer, 'pred': output}
+ ans_file.write(json.dumps(sample_set) + "\n")
+
+ ans_file.close()
+
+
+if __name__ == "__main__":
+ parser = argparse.ArgumentParser()
+
+ parser.add_argument('--model-path', help='', required=True)
+ parser.add_argument('--video-folder', help='Directory containing video files.', required=True)
+ parser.add_argument('--question-file', help='Path to the ground truth file containing question.', required=True)
+ parser.add_argument('--answer-file', help='Path to the ground truth file containing answers.', required=True)
+ parser.add_argument('--output-file', help='Directory to save the model results JSON.', required=True)
+ parser.add_argument("--num-chunks", type=int, default=1)
+ parser.add_argument("--chunk-idx", type=int, default=0)
+ parser.add_argument("--device", type=str, required=False, default='cuda:0')
+ parser.add_argument("--batch-size", type=int, required=False, default=1)
+ parser.add_argument("--num-workers", type=int, required=False, default=8)
+ args = parser.parse_args()
+
+ run_inference(args)
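+
+# Example invocation (hypothetical local paths, for illustration only):
+#   python3 videollama2/eval/inference_video_oqa_activitynet.py \
+#       --model-path DAMO-NLP-SG/VideoLLaMA2-7B \
+#       --video-folder activitynet/videos \
+#       --question-file activitynet/test_q.json --answer-file activitynet/test_a.json \
+#       --output-file results/activitynet/predictions.jsonl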
diff --git a/videollama2/eval/run_inference_video_qa_gpt_consistency.py b/VideoLLaMA2/videollama2/eval/inference_video_oqa_vcgpt_consistency.py
similarity index 59%
rename from videollama2/eval/run_inference_video_qa_gpt_consistency.py
rename to VideoLLaMA2/videollama2/eval/inference_video_oqa_vcgpt_consistency.py
index aa5dcb427a59fea1af626e1aad35137ac4cb5e54..59b58bba61d4b0d736baab008e024f87f3dbb2b3 100644
--- a/videollama2/eval/run_inference_video_qa_gpt_consistency.py
+++ b/VideoLLaMA2/videollama2/eval/inference_video_oqa_vcgpt_consistency.py
@@ -7,28 +7,16 @@ import warnings
from tqdm import tqdm
import torch
-import decord
-import numpy as np
-import transformers
-from decord import VideoReader, cpu
from torch.utils.data import Dataset, DataLoader
import sys
sys.path.append('./')
-from videollama2.conversation import conv_templates, SeparatorStyle
-from videollama2.constants import NUM_FRAMES, DEFAULT_MMODAL_TOKEN, DEFAULT_MMODAL_START_TOKEN, DEFAULT_MMODAL_END_TOKEN, MMODAL_TOKEN_INDEX
-from videollama2.mm_utils import get_model_name_from_path, tokenizer_MMODAL_token, KeywordsStoppingCriteria, process_video
-from videollama2.model.builder import load_pretrained_model
-
+from videollama2 import model_init, mm_infer
+from videollama2.utils import disable_torch_init
# NOTE: Ignore TypedStorage warning, which refers to this link~(https://github.com/pytorch/pytorch/issues/97207#issuecomment-1494781560)
warnings.filterwarnings('ignore', category=UserWarning, message='TypedStorage is deprecated')
-default_mm_token = DEFAULT_MMODAL_TOKEN["VIDEO"]
-default_mm_start_token = DEFAULT_MMODAL_START_TOKEN["VIDEO"]
-default_mm_end_token = DEFAULT_MMODAL_END_TOKEN["VIDEO"]
-modal_token_index = MMODAL_TOKEN_INDEX["VIDEO"]
-
def split_list(lst, n):
"""Split a list into n (roughly) equal-sized chunks"""
@@ -43,12 +31,11 @@ def get_chunk(lst, n, k):
class VCGPTDataset(Dataset):
- video_formats = ['.mp4', '.avi', '.mov', '.mkv']
+ video_formats = ['.mp4', '.webm', '.avi', '.mov', '.mkv']
- def __init__(self, data_list, processor, num_frames):
+ def __init__(self, data_list, processor):
self.data_list = data_list
self.processor = processor
- self.num_frames = num_frames
def __len__(self):
return len(self.data_list)
@@ -66,7 +53,7 @@ class VCGPTDataset(Dataset):
video_path = temp_path
break
- video_tensor = process_video(video_path, self.processor, aspect_ratio=None, sample_scheme='uniform', num_frames=self.num_frames)
+ video_tensor = self.processor(video_path)
return {
'video': video_tensor,
@@ -87,51 +74,17 @@ def collate_fn(batch):
return vid, v_id, qus1, qus2, ans
-def get_model_output(model, tokenizer, qs, video_tensor, args):
- if model.config.mm_use_im_start_end:
- qs = default_mm_start_token + default_mm_token + default_mm_end_token + "\n" + qs
- else:
- qs = default_mm_token + "\n" + qs
-
- conv = conv_templates[args.conv_mode].copy()
- conv.append_message(conv.roles[0], qs)
- conv.append_message(conv.roles[1], None)
- prompt = conv.get_prompt()
-
- # input_ids = tokenizer_image_token(prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt').unsqueeze(0).to(args.device)
- input_ids = tokenizer_MMODAL_token(prompt, tokenizer, modal_token_index, return_tensors='pt').to(args.device)
-
- attention_mask=input_ids.ne(tokenizer.pad_token_id).to(args.device)
-
- modal_list = ["video"]
- video_tensor = video_tensor.to(dtype=torch.float16, device=args.device, non_blocking=True)
-
- with torch.inference_mode():
- output_ids = model.generate(
- input_ids.unsqueeze(0),
- attention_mask=attention_mask.unsqueeze(0),
- images_or_videos=[video_tensor],
- modal_list=modal_list,
- do_sample=False,
- max_new_tokens=1024,
- use_cache=True,
- pad_token_id=tokenizer.eos_token_id)
-
- outputs = tokenizer.batch_decode(output_ids, skip_special_tokens=True)[0].strip()
- return outputs
-
-
def run_inference(args):
- model_name = get_model_name_from_path(args.model_path)
- tokenizer, model, processor, context_len = load_pretrained_model(args.model_path, args.model_base, model_name)
+ disable_torch_init()
- num_frames = model.config.num_frames if hasattr(model.config, "num_frames") else NUM_FRAMES
+ # Initialize the model
+ model, processor, tokenizer = model_init(args.model_path)
questions = json.load(open(args.question_file, "r"))
questions = get_chunk(questions, args.num_chunks, args.chunk_idx)
assert args.batch_size == 1, "Batch size must be 1 for inference"
- dataset = VCGPTDataset(questions, processor, num_frames)
+ dataset = VCGPTDataset(questions, processor['video'])
dataloader = DataLoader(dataset, shuffle=False, batch_size=args.batch_size, num_workers=args.num_workers, collate_fn=collate_fn)
answer_file = os.path.expanduser(args.answer_file)
@@ -150,8 +103,23 @@ def run_inference(args):
question2 = questions2[0]
answer = answers[0]
- output1 = get_model_output(model, tokenizer, question1, video_tensor, args)
- output2 = get_model_output(model, tokenizer, question2, video_tensor, args)
+ output1 = mm_infer(
+ video_tensor,
+ question1,
+ model=model,
+ tokenizer=tokenizer,
+ modal='video',
+ do_sample=False,
+ )
+
+ output2 = mm_infer(
+ video_tensor,
+ question2,
+ model=model,
+ tokenizer=tokenizer,
+        modal='video',
+        do_sample=False,
+ )
qa = {'video_name': video_name, 'Q1': question1, 'Q2': question2, 'A': answer, 'P1': output1, 'P2': output2}
diff --git a/videollama2/eval/run_inference_video_qa_gpt_general.py b/VideoLLaMA2/videollama2/eval/inference_video_oqa_vcgpt_general.py
similarity index 54%
rename from videollama2/eval/run_inference_video_qa_gpt_general.py
rename to VideoLLaMA2/videollama2/eval/inference_video_oqa_vcgpt_general.py
index ecd93bfb5e855e2219d170ea84df23825c944b0b..e30a77fa5776b2940fcfca4a90357cf3acf52d50 100644
--- a/videollama2/eval/run_inference_video_qa_gpt_general.py
+++ b/VideoLLaMA2/videollama2/eval/inference_video_oqa_vcgpt_general.py
@@ -7,28 +7,16 @@ import warnings
from tqdm import tqdm
import torch
-import decord
-import numpy as np
-import transformers
-from decord import VideoReader, cpu
from torch.utils.data import Dataset, DataLoader
import sys
sys.path.append('./')
-from videollama2.conversation import conv_templates, SeparatorStyle
-from videollama2.constants import NUM_FRAMES, DEFAULT_MMODAL_TOKEN, DEFAULT_MMODAL_START_TOKEN, DEFAULT_MMODAL_END_TOKEN, MMODAL_TOKEN_INDEX
-from videollama2.mm_utils import get_model_name_from_path, tokenizer_MMODAL_token, KeywordsStoppingCriteria, process_video
-from videollama2.model.builder import load_pretrained_model
-
+from videollama2 import model_init, mm_infer
+from videollama2.utils import disable_torch_init
# NOTE: Ignore TypedStorage warning, which refers to this link~(https://github.com/pytorch/pytorch/issues/97207#issuecomment-1494781560)
warnings.filterwarnings('ignore', category=UserWarning, message='TypedStorage is deprecated')
-default_mm_token = DEFAULT_MMODAL_TOKEN["VIDEO"]
-default_mm_start_token = DEFAULT_MMODAL_START_TOKEN["VIDEO"]
-default_mm_end_token = DEFAULT_MMODAL_END_TOKEN["VIDEO"]
-modal_token_index = MMODAL_TOKEN_INDEX["VIDEO"]
-
def split_list(lst, n):
"""Split a list into n (roughly) equal-sized chunks"""
@@ -43,12 +31,11 @@ def get_chunk(lst, n, k):
class VCGPTDataset(Dataset):
- video_formats = ['.mp4', '.avi', '.mov', '.mkv']
+ video_formats = ['.mp4', '.webm', '.avi', '.mov', '.mkv']
- def __init__(self, data_list, processor, num_frames):
+ def __init__(self, data_list, processor):
self.data_list = data_list
self.processor = processor
- self.num_frames = num_frames
def __len__(self):
return len(self.data_list)
@@ -65,7 +52,7 @@ class VCGPTDataset(Dataset):
video_path = temp_path
break
- video_tensor = process_video(video_path, self.processor, aspect_ratio=None, sample_scheme='uniform', num_frames=self.num_frames)
+ video_tensor = self.processor(video_path)
return {
'video': video_tensor,
@@ -84,59 +71,23 @@ def collate_fn(batch):
return vid, v_id, qus, ans
-def get_model_output(model, tokenizer, qs, video_tensor, args):
- if model.config.mm_use_im_start_end:
- qs = default_mm_start_token + default_mm_token + default_mm_end_token + "\n" + qs
- else:
- qs = default_mm_token + "\n" + qs
-
- conv = conv_templates[args.conv_mode].copy()
- conv.append_message(conv.roles[0], qs)
- conv.append_message(conv.roles[1], None)
- prompt = conv.get_prompt()
-
- # input_ids = tokenizer_image_token(prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt').unsqueeze(0).to(args.device)
- input_ids = tokenizer_MMODAL_token(prompt, tokenizer, modal_token_index, return_tensors='pt').to(args.device)
-
- attention_mask=input_ids.ne(tokenizer.pad_token_id).to(args.device)
-
- modal_list = ["video"]
- video_tensor = video_tensor.to(dtype=torch.float16, device=args.device, non_blocking=True)
-
- with torch.inference_mode():
- output_ids = model.generate(
- input_ids.unsqueeze(0),
- attention_mask=attention_mask.unsqueeze(0),
- images_or_videos=[video_tensor],
- modal_list=modal_list,
- do_sample=False,
- max_new_tokens=1024,
- use_cache=True,
- pad_token_id=tokenizer.eos_token_id)
-
- outputs = tokenizer.batch_decode(output_ids, skip_special_tokens=True)[0].strip()
- return outputs
-
-
def run_inference(args):
- model_name = get_model_name_from_path(args.model_path)
- tokenizer, model, processor, context_len = load_pretrained_model(args.model_path, args.model_base, model_name)
+ disable_torch_init()
- num_frames = model.config.num_frames if hasattr(model.config, "num_frames") else NUM_FRAMES
+ # Initialize the model
+ model, processor, tokenizer = model_init(args.model_path)
questions = json.load(open(args.question_file, "r"))
questions = get_chunk(questions, args.num_chunks, args.chunk_idx)
assert args.batch_size == 1, "Batch size must be 1 for inference"
- dataset = VCGPTDataset(questions, processor, num_frames)
+ dataset = VCGPTDataset(questions, processor['video'])
dataloader = DataLoader(dataset, shuffle=False, batch_size=args.batch_size, num_workers=args.num_workers, collate_fn=collate_fn)
answer_file = os.path.expanduser(args.answer_file)
os.makedirs(os.path.dirname(answer_file), exist_ok=True)
ans_file = open(answer_file, "w")
- output_list = [] # List to store the output results
-
# Iterate over each sample in the ground truth file
for i, (video_tensors, video_names, questions, answers) in enumerate(tqdm(dataloader)):
@@ -146,7 +97,14 @@ def run_inference(args):
question = questions[0]
answer = answers[0]
- output = get_model_output(model, tokenizer, question, video_tensor, args)
+ output = mm_infer(
+ video_tensor,
+ question,
+ model=model,
+ tokenizer=tokenizer,
+ modal='video',
+ do_sample=False,
+ )
qa = {'video_name': video_name, 'Q': question, 'A': answer, 'P': output}
@@ -158,20 +116,15 @@ def run_inference(args):
if __name__ == "__main__":
parser = argparse.ArgumentParser()
- # Define the command-line arguments
parser.add_argument('--model-path', help='', required=True)
- parser.add_argument('--model_base', help='', default=None, type=str, required=False)
parser.add_argument('--video-folder', help='Directory containing video files.', required=True)
parser.add_argument('--question-file', help='Path to the ground truth file containing question.', required=True)
parser.add_argument('--answer-file', help='Path to the ground truth file containing answers.', required=True)
- parser.add_argument("--conv-mode", type=str, default="llava_v1")
parser.add_argument("--num-chunks", type=int, default=1)
parser.add_argument("--chunk-idx", type=int, default=0)
parser.add_argument("--device", type=str, required=False, default='cuda:0')
- parser.add_argument("--model_max_length", type=int, required=False, default=2048)
parser.add_argument("--batch-size", type=int, required=False, default=1)
parser.add_argument("--num-workers", type=int, required=False, default=8)
-
args = parser.parse_args()
run_inference(args)
diff --git a/VideoLLaMA2/videollama2/mm_utils.py b/VideoLLaMA2/videollama2/mm_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..0e0c3529509aa14a683d3b53aaafd3d1367fc85b
--- /dev/null
+++ b/VideoLLaMA2/videollama2/mm_utils.py
@@ -0,0 +1,356 @@
+import ast
+import os
+import math
+import base64
+import traceback
+from io import BytesIO
+
+import torch
+import imageio
+import numpy as np
+from PIL import Image
+from decord import VideoReader, cpu
+from moviepy.editor import VideoFileClip
+from transformers import StoppingCriteria
+
+from .constants import NUM_FRAMES, MAX_FRAMES, NUM_FRAMES_PER_SECOND, MODAL_INDEX_MAP, DEFAULT_IMAGE_TOKEN
+
+
+def chunk_list(input_list, chunk_size):
+ return [input_list[i:i + chunk_size] for i in range(0, len(input_list), chunk_size)]
+
+
+def load_image_from_base64(image):
+ return Image.open(BytesIO(base64.b64decode(image)))
+
+
+def expand2square(pil_img, background_color):
+ width, height = pil_img.size
+ if width == height:
+ return pil_img
+ elif width > height:
+ result = Image.new(pil_img.mode, (width, width), background_color)
+ result.paste(pil_img, (0, (width - height) // 2))
+ return result
+ else:
+ result = Image.new(pil_img.mode, (height, height), background_color)
+ result.paste(pil_img, ((height - width) // 2, 0))
+ return result
+
+
+def create_photo_grid(arr, rows=None, cols=None):
+ """
+    Create a photo grid from a 4D numpy array with shape [t, h, w, c].
+
+    Parameters:
+    arr (numpy.ndarray or list): Input array with shape [t, h, w, c], or a list of PIL Images / [h, w, c] numpy arrays.
+ rows (int): Optional. Number of rows in the grid. If not set, it will be determined based on `cols` or the square root of `t`.
+ cols (int): Optional. Number of columns in the grid. If not set, it will be determined based on `rows` or the square root of `t`.
+
+ Returns:
+ numpy.ndarray: A 3D numpy array representing the photo grid.
+ """
+
+ if isinstance(arr, list):
+ if isinstance(arr[0], Image.Image):
+ arr = np.stack([np.array(img) for img in arr])
+ elif isinstance(arr[0], np.ndarray):
+ arr = np.stack(arr)
+ else:
+ raise ValueError("Invalid input type. Expected list of Images or numpy arrays.")
+
+ t, h, w, c = arr.shape
+
+ # Calculate the number of rows and columns if not provided
+ if rows is None and cols is None:
+ rows = math.ceil(math.sqrt(t))
+ cols = math.ceil(t / rows)
+ elif rows is None:
+ rows = math.ceil(t / cols)
+ elif cols is None:
+ cols = math.ceil(t / rows)
+
+ # Check if the grid can hold all the images
+ if rows * cols < t:
+ raise ValueError(f"Not enough grid cells ({rows}x{cols}) to hold all images ({t}).")
+
+ # Create the grid array with appropriate height and width
+ grid_height = h * rows
+ grid_width = w * cols
+ grid = np.zeros((grid_height, grid_width, c), dtype=arr.dtype)
+
+ # Fill the grid with images
+ for i in range(t):
+ row_idx = i // cols
+ col_idx = i % cols
+ grid[row_idx*h:(row_idx+1)*h, col_idx*w:(col_idx+1)*w, :] = arr[i]
+
+ return grid
+
+
+def process_image(image_path, processor, aspect_ratio='pad'):
+ image = Image.open(image_path).convert('RGB')
+
+ images = [np.array(image)]
+
+ if aspect_ratio == 'pad':
+ images = [Image.fromarray(f) for f in images]
+ images = [expand2square(image, tuple(int(x*255) for x in processor.image_mean)) for image in images]
+ else:
+ images = [Image.fromarray(f) for f in images]
+
+ images = processor.preprocess(images, return_tensors='pt')['pixel_values']
+ return images
+
+
+def frame_sample(duration, mode='uniform', num_frames=None, fps=None):
+ if mode == 'uniform':
+ assert num_frames is not None, "Number of frames must be provided for uniform sampling."
+ # NOTE: v1 version
+ # Calculate the size of each segment from which a frame will be extracted
+ seg_size = float(duration - 1) / num_frames
+
+ frame_ids = []
+ for i in range(num_frames):
+ # Calculate the start and end indices of each segment
+ start = seg_size * i
+ end = seg_size * (i + 1)
+ # Append the middle index of the segment to the list
+ frame_ids.append((start + end) / 2)
+
+ return np.round(np.array(frame_ids) + 1e-6).astype(int)
+ # NOTE: v0 version
+ # return np.linspace(0, duration-1, num_frames, dtype=int)
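+        # e.g., duration=100, num_frames=4 -> frame ids [12, 37, 62, 87]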
+ elif mode == 'fps':
+ assert fps is not None, "FPS must be provided for FPS sampling."
+ segment_len = min(fps // NUM_FRAMES_PER_SECOND, duration)
+ return np.arange(segment_len // 2, duration, segment_len, dtype=int)
+ else:
+        raise ValueError(f'Unsupported frame sampling mode: {mode}')
+
+
+def process_video(video_path, processor, s=None, e=None, aspect_ratio='pad', num_frames=NUM_FRAMES):
+ if isinstance(video_path, str):
+ if s is not None and e is not None:
+ s = s if s >= 0. else 0.
+ e = e if e >= 0. else 0.
+ if s > e:
+ s, e = e, s
+ elif s == e:
+ e = s + 1
+
+ # 1. Loading Video
+ if os.path.isdir(video_path):
+ frame_files = sorted(os.listdir(video_path))
+
+ fps = 3
+ num_frames_of_video = len(frame_files)
+ elif video_path.endswith('.gif'):
+ gif_reader = imageio.get_reader(video_path)
+
+ fps = 25
+ num_frames_of_video = len(gif_reader)
+ else:
+ vreader = VideoReader(video_path, ctx=cpu(0), num_threads=1)
+
+ fps = vreader.get_avg_fps()
+ num_frames_of_video = len(vreader)
+
+ # 2. Determine frame range & Calculate frame indices
+ f_start = 0 if s is None else max(int(s * fps) - 1, 0)
+ f_end = num_frames_of_video - 1 if e is None else min(int(e * fps) - 1, num_frames_of_video - 1)
+ frame_indices = list(range(f_start, f_end + 1))
+
+ duration = len(frame_indices)
+ # 3. Sampling frame indices
+ if num_frames is None:
+ sampled_frame_indices = [frame_indices[i] for i in frame_sample(duration, mode='fps', fps=fps)]
+ else:
+ sampled_frame_indices = [frame_indices[i] for i in frame_sample(duration, mode='uniform', num_frames=num_frames)]
+
+ # 4. Acquire frame data
+ if os.path.isdir(video_path):
+ video_data = [Image.open(os.path.join(video_path, frame_files[f_idx])) for f_idx in sampled_frame_indices]
+ elif video_path.endswith('.gif'):
+ video_data = [Image.fromarray(frame) for idx, frame in enumerate(gif_reader) if idx in sampled_frame_indices]
+ else:
+ video_data = [Image.fromarray(frame) for frame in vreader.get_batch(sampled_frame_indices).asnumpy()]
+
+ elif isinstance(video_path, np.ndarray):
+ video_data = [Image.fromarray(f) for f in video_path]
+ elif isinstance(video_path, list) and isinstance(video_path[0], np.ndarray):
+ video_data = [Image.fromarray(f) for f in video_path]
+ elif isinstance(video_path, list) and isinstance(video_path[0], str):
+ video_data = [Image.open(f) for f in video_path]
+ elif isinstance(video_path, list) and isinstance(video_path[0], Image.Image):
+ video_data = video_path
+ else:
+ raise ValueError(f"Unsupported video path type: {type(video_path)}")
+
+ while num_frames is not None and len(video_data) < num_frames:
+        # PIL's Image.size is (width, height); build the zero padding frame as (height, width, 3)
+        video_data.append(Image.fromarray(np.zeros((video_data[-1].size[1], video_data[-1].size[0], 3), dtype=np.uint8)))
+
+ # MAX_FRAMES filter
+ video_data = video_data[:MAX_FRAMES]
+
+ if aspect_ratio == 'pad':
+ images = [expand2square(f, tuple(int(x*255) for x in processor.image_mean)) for f in video_data]
+ video = processor.preprocess(images, return_tensors='pt')['pixel_values']
+ else:
+ images = [f for f in video_data]
+ video = processor.preprocess(images, return_tensors='pt')['pixel_values']
+ return video
+
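+# Example (sketch; assumes `processor` is the vision tower's CLIP/SigLIP image processor):
+#   video = process_video('demo.mp4', processor, num_frames=8)
+#   returns a float tensor of shape [8, 3, H, W] (e.g., 336x336 for CLIP-L/14-336), ready for the vision tower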
+
+def process_video_old(video_path, processor, aspect_ratio='pad', num_frames=NUM_FRAMES, image_grid=False, sample_scheme='uniform'):
+ def frame_sample(duration, mode='uniform', local_fps=None):
+ if mode == 'uniform':
+ # Calculate the size of each segment from which a frame will be extracted
+ seg_size = float(duration - 1) / num_frames
+
+ frame_ids = []
+ for i in range(num_frames):
+ # Calculate the start and end indices of each segment
+ start = int(np.round(seg_size * i))
+ end = int(np.round(seg_size * (i + 1)))
+ # Append the middle index of the segment to the list
+ frame_ids.append((start + end) // 2)
+
+ return frame_ids
+ # NOTE: old version
+ # return np.linspace(0, duration-1, num_frames, dtype=int)
+ elif mode == 'fps':
+ assert local_fps is not None
+ segment_len = min(local_fps // NUM_FRAMES_PER_SECOND, duration)
+ return np.arange(segment_len // 2, duration, segment_len, dtype=int)
+ else:
+            raise ValueError(f'Unsupported frame sampling mode: {mode}')
+
+ if isinstance(video_path, str):
+ if video_path.endswith('.gif'):
+ video_gif = imageio.get_reader(video_path)
+ duration, local_fps = len(video_gif), 10
+
+ frame_id_list = frame_sample(duration, mode=sample_scheme, local_fps=local_fps)
+ # limit the max input frames
+ if len(frame_id_list) > MAX_FRAMES:
+ frame_id_list = np.linspace(0, duration-1, MAX_FRAMES, dtype=int)
+ video_data = [frame for index, frame in enumerate(video_gif) if index in frame_id_list]
+ # added by lixin4ever, include the support of .webm files from sthsthv2
+ elif video_path.endswith('.webm'):
+ video_webm = VideoFileClip(video_path)
+ video_frames = np.array(list(video_webm.iter_frames()))
+
+ duration, local_fps = len(video_frames), video_webm.fps
+
+ frame_id_list = frame_sample(duration, mode=sample_scheme, local_fps=local_fps)
+ # limit the max input frames
+ if len(frame_id_list) > MAX_FRAMES:
+ frame_id_list = np.linspace(0, duration-1, MAX_FRAMES, dtype=int)
+ video_data = video_frames[frame_id_list]
+ else:
+ # NOTE: num_threads=1 is required to avoid deadlock in multiprocessing
+ decord_vr = VideoReader(uri=video_path, ctx=cpu(0), num_threads=1)
+ duration, local_fps = len(decord_vr), float(decord_vr.get_avg_fps())
+
+ frame_id_list = frame_sample(duration, mode=sample_scheme, local_fps=local_fps)
+ # limit the max input frames
+ if len(frame_id_list) > MAX_FRAMES:
+ frame_id_list = np.linspace(0, duration-1, MAX_FRAMES, dtype=int)
+ try:
+ video_data = decord_vr.get_batch(frame_id_list).numpy()
+ except:
+ video_data = decord_vr.get_batch(frame_id_list).asnumpy()
+
+ elif isinstance(video_path, np.ndarray):
+ assert len(video_path) == num_frames
+ video_data = video_path
+ elif isinstance(video_path, list):
+ assert len(video_path) == num_frames
+ video_data = np.stack([np.array(x) for x in video_path])
+
+ if image_grid:
+ grid_h = grid_w = math.ceil(math.sqrt(num_frames))
+ pg = create_photo_grid(video_data, grid_h, grid_w)
+ video_data = [pg, *video_data]
+
+ if aspect_ratio == 'pad':
+ images = [Image.fromarray(f.numpy() if isinstance(f, torch.Tensor) else f) for f in video_data]
+ images = [expand2square(image, tuple(int(x*255) for x in processor.image_mean)) for image in images]
+ video = processor.preprocess(images, return_tensors='pt')['pixel_values']
+ else:
+ images = [Image.fromarray(f.numpy() if isinstance(f, torch.Tensor) else f) for f in video_data]
+ video = processor.preprocess(images, return_tensors='pt')['pixel_values']
+
+ return video
+
+
+def tokenizer_multimodal_token(prompt, tokenizer, multimodal_token=DEFAULT_IMAGE_TOKEN, return_tensors=None):
+ """Tokenize text and multimodal tag to input_ids.
+
+ Args:
+        prompt (str): Text prompt containing a multimodal tag, e.g., '<video>\nDescribe the video.'
+        tokenizer (transformers.PreTrainedTokenizer): Tokenizer object.
+        multimodal_token (str): Multimodal tag string (e.g., DEFAULT_IMAGE_TOKEN) that will be replaced by its special token index.
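+
+    Example:
+        With multimodal_token='<video>', the prompt '<video>\nDescribe the video.' becomes the
+        special index for '<video>' from MODAL_INDEX_MAP followed by the token ids of '\nDescribe the video.'.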
+ """
+ multimodal_token_index = MODAL_INDEX_MAP.get(multimodal_token, None)
+ if multimodal_token_index is None:
+ input_ids = tokenizer(prompt, add_special_tokens=False).input_ids
+ else:
+ prompt_chunks = [tokenizer(chunk, add_special_tokens=False).input_ids for idx, chunk in enumerate(prompt.split(multimodal_token))]
+
+ input_ids = []
+ for i in range(1, 2 * len(prompt_chunks)):
+ if i % 2 == 1:
+ input_ids.extend(prompt_chunks[i // 2])
+ else:
+ input_ids.append(multimodal_token_index)
+
+ if return_tensors is not None:
+ if return_tensors == 'pt':
+ return torch.tensor(input_ids, dtype=torch.long)
+ raise ValueError(f'Unsupported tensor type: {return_tensors}')
+ return input_ids
+
+
+def get_model_name_from_path(model_path):
+ model_path = model_path.strip("/")
+ model_paths = model_path.split("/")
+ if model_paths[-1].startswith('checkpoint-'):
+ return model_paths[-2] + "_" + model_paths[-1]
+ else:
+ return model_paths[-1]
+
+
+class KeywordsStoppingCriteria(StoppingCriteria):
+ def __init__(self, keywords, tokenizer, input_ids):
+ self.keywords = keywords
+ self.keyword_ids = []
+ self.max_keyword_len = 0
+ for keyword in keywords:
+ cur_keyword_ids = tokenizer(keyword).input_ids
+ if len(cur_keyword_ids) > 1 and cur_keyword_ids[0] == tokenizer.bos_token_id:
+ cur_keyword_ids = cur_keyword_ids[1:]
+ if len(cur_keyword_ids) > self.max_keyword_len:
+ self.max_keyword_len = len(cur_keyword_ids)
+ self.keyword_ids.append(torch.tensor(cur_keyword_ids))
+ self.tokenizer = tokenizer
+ self.start_len = input_ids.shape[1]
+
+ def call_for_batch(self, output_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:
+ offset = min(output_ids.shape[1] - self.start_len, self.max_keyword_len)
+ self.keyword_ids = [keyword_id.to(output_ids.device) for keyword_id in self.keyword_ids]
+ for keyword_id in self.keyword_ids:
+ if (output_ids[0, -keyword_id.shape[0]:] == keyword_id).all():
+ return True
+ outputs = self.tokenizer.batch_decode(output_ids[:, -offset:], skip_special_tokens=True)[0]
+ for keyword in self.keywords:
+ if keyword in outputs:
+ return True
+ return False
+
+ def __call__(self, output_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:
+ outputs = []
+ for i in range(output_ids.shape[0]):
+ outputs.append(self.call_for_batch(output_ids[i].unsqueeze(0), scores))
+ return all(outputs)
diff --git a/videollama2/model/builder.py b/VideoLLaMA2/videollama2/model/__init__.py
similarity index 60%
rename from videollama2/model/builder.py
rename to VideoLLaMA2/videollama2/model/__init__.py
index 6fc12226fd0c0d759338de2088d59746caff1b5c..6fe1e50f4178ce9f81d95f7d4349db199fb55136 100644
--- a/videollama2/model/builder.py
+++ b/VideoLLaMA2/videollama2/model/__init__.py
@@ -21,9 +21,24 @@ import shutil
import torch
from transformers import PretrainedConfig, AutoTokenizer, AutoModelForCausalLM, AutoConfig, BitsAndBytesConfig
-from . import *
-from .multimodal_projector import load_mm_projector
-from ..constants import DEFAULT_IMAGE_PATCH_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN
+from .projector import load_mm_projector
+from .videollama2_llama import Videollama2LlamaForCausalLM, Videollama2Config
+from .videollama2_mistral import Videollama2MistralForCausalLM, Videollama2MistralConfig
+from .videollama2_mixtral import Videollama2MixtralForCausalLM, Videollama2MixtralConfig
+from .videollama2_qwen2 import Videollama2Qwen2ForCausalLM, Videollama2Qwen2Config
+from .videollama2_gemma2 import Videollama2Gemma2ForCausalLM, Videollama2Gemma2Config
+from .videollama2_phi3 import Videollama2Phi3ForCausalLM, Videollama2Phi3Config
+
+
+VLLMs = {
+ "videollama2": Videollama2MistralForCausalLM,
+ "videollama2_llama": Videollama2LlamaForCausalLM,
+ "videollama2_mistral": Videollama2MistralForCausalLM,
+ "videollama2_mixtral": Videollama2MixtralForCausalLM,
+ "videollama2_qwen2": Videollama2Qwen2ForCausalLM,
+ "videollama2_gemma2": Videollama2Gemma2ForCausalLM,
+ "videollama2_phi3": Videollama2Phi3ForCausalLM,
+}
def load_pretrained_model(model_path, model_base, model_name, load_8bit=False, load_4bit=False, device_map="auto", device="cuda", use_flash_attn=False, **kwargs):
@@ -40,7 +55,8 @@ def load_pretrained_model(model_path, model_base, model_name, load_8bit=False, l
if load_8bit:
kwargs['load_in_8bit'] = True
elif load_4bit:
- kwargs['load_in_4bit'] = True
+ # NOTE: High-version Transformers will report: """ValueError: You can't pass `load_in_4bit`or `load_in_8bit` as a kwarg when passing `quantization_config` argument at the same time."""
+ # kwargs['load_in_4bit'] = True
kwargs['quantization_config'] = BitsAndBytesConfig(
load_in_4bit=True,
bnb_4bit_compute_dtype=torch.float16,
@@ -53,15 +69,29 @@ def load_pretrained_model(model_path, model_base, model_name, load_8bit=False, l
if use_flash_attn:
kwargs['attn_implementation'] = 'flash_attention_2'
- if "videollama" in model_name.lower():
- # Load LLaVA model
- if 'lora' in model_name.lower() and model_base is None:
- warnings.warn('There is `lora` in model name but no `model_base` is provided. If you are loading a LoRA model, please provide the `model_base` argument. Detailed instruction: https://github.com/haotian-liu/LLaVA#launch-a-model-worker-lora-weights-unmerged.')
- if 'lora' in model_name.lower() and model_base is not None:
+ if "videollama" in model_name.lower() or 'vlb' in model_name.lower():
+ # NOTE: lora/qlora model loading
+ if 'lora' in model_name.lower() or 'qlora' in model_name.lower():
+ if model_base is None:
+ cfg_pretrained = PretrainedConfig.from_pretrained(model_path, token=token)
+ # NOTE: AutoConfig will modify `_name_or_path` property to `model_path` if `model_path` is not None.
+ # cfg_pretrained = AutoConfig.from_pretrained(model_path, token=token)
+ model_base = model_base if model_base is not None else cfg_pretrained._name_or_path
+
lora_cfg_pretrained = AutoConfig.from_pretrained(model_path)
- tokenizer = AutoTokenizer.from_pretrained(model_base, use_fast=False)
+ # NOTE: remove qlora training quantization config
+ if hasattr(lora_cfg_pretrained, 'quantization_config'):
+ del lora_cfg_pretrained.quantization_config
+ tokenizer = AutoTokenizer.from_pretrained(model_base, use_fast=False, token=token)
print('Loading VideoLLaMA from base model...')
- model = Videollama2LlamaForCausalLM.from_pretrained(model_base, low_cpu_mem_usage=True, config=lora_cfg_pretrained, **kwargs)
+
+ if 'vicuna' in model_base.lower():
+ model = Videollama2LlamaForCausalLM.from_pretrained(model_base, low_cpu_mem_usage=True, config=lora_cfg_pretrained, **kwargs)
+ elif 'mistral' in model_base.lower():
+ model = Videollama2MistralForCausalLM.from_pretrained(model_base, low_cpu_mem_usage=True, config=lora_cfg_pretrained, **kwargs)
+ else:
+ model = Videollama2MistralForCausalLM.from_pretrained(model_base, low_cpu_mem_usage=True, config=lora_cfg_pretrained, **kwargs)
+
token_num, token_dim = model.lm_head.out_features, model.lm_head.in_features
if model.lm_head.weight.shape[0] != token_num:
model.lm_head.weight = torch.nn.Parameter(torch.empty(token_num, token_dim, device=model.device, dtype=model.dtype))
@@ -92,7 +122,7 @@ def load_pretrained_model(model_path, model_base, model_name, load_8bit=False, l
model = model.merge_and_unload()
print('Model is loaded...')
elif model_base is not None or '-base' in model_name.lower():
- # loading vision-language projector
+ # NOTE: Base/Pretrain model loading
print('Loading VideoLLaMA 2 from base model...')
cfg_pretrained = PretrainedConfig.from_pretrained(model_path, token=token)
# NOTE: AutoConfig will modify `_name_or_path` property to `model_path` if `model_path` is not None.
@@ -101,27 +131,50 @@ def load_pretrained_model(model_path, model_base, model_name, load_8bit=False, l
tokenizer = AutoTokenizer.from_pretrained(model_base, use_fast=False, token=token)
- if 'vicuna' in model_name.lower():
+ if 'vicuna' in model_base.lower():
model = Videollama2LlamaForCausalLM.from_pretrained(model_base, low_cpu_mem_usage=True, config=cfg_pretrained, **kwargs)
- elif 'mixtral' in model_name.lower():
+ elif 'mistral' in model_base.lower():
+ model = Videollama2MistralForCausalLM.from_pretrained(model_base, low_cpu_mem_usage=True, config=cfg_pretrained, **kwargs)
+ elif 'mixtral' in model_base.lower():
model = Videollama2MixtralForCausalLM.from_pretrained(model_base, low_cpu_mem_usage=True, config=cfg_pretrained, **kwargs)
+ elif 'qwen2' in model_base.lower():
+ model = Videollama2Qwen2ForCausalLM.from_pretrained(model_base, low_cpu_mem_usage=True, config=cfg_pretrained, **kwargs)
+ elif 'gemma2' in model_base.lower():
+ model = Videollama2Gemma2ForCausalLM.from_pretrained(model_base, low_cpu_mem_usage=True, config=cfg_pretrained, **kwargs)
+ elif 'phi3' in model_base.lower():
+ model = Videollama2Phi3ForCausalLM.from_pretrained(model_base, low_cpu_mem_usage=True, config=cfg_pretrained, **kwargs)
else:
model = Videollama2MistralForCausalLM.from_pretrained(model_base, low_cpu_mem_usage=True, config=cfg_pretrained, **kwargs)
- # NOTE: old codes for loading local mm_projector.bin
+        # NOTE: loading vision-language projector
+        # * old code for loading local mm_projector.bin
# mm_projector_weights = torch.load(os.path.join(model_path, 'mm_projector.bin'), map_location='cpu')
# mm_projector_weights = {k: v.to(torch.float16) for k, v in mm_projector_weights.items()}
# model.load_state_dict(mm_projector_weights, strict=False)
- # NOTE: new codes which supports loading mm_projector.bin both offline and online
+        # * new code that supports loading mm_projector.bin both offline and online
mm_projector_weights = load_mm_projector(model_path, token=token)
model.load_state_dict(mm_projector_weights, strict=False)
else:
- if 'vicuna' in model_name.lower():
+ # NOTE: SFT model loading
+ cfg_pretrained = PretrainedConfig.from_pretrained(model_path, token=token)
+ model_base = cfg_pretrained._name_or_path
+
+ if 'vicuna' in model_base.lower():
tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=False, token=token)
model = Videollama2LlamaForCausalLM.from_pretrained(model_path, low_cpu_mem_usage=True, **kwargs)
- elif 'mixtral' in model_name.lower():
+ elif 'mistral' in model_base.lower():
+ tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=False, token=token)
+ model = Videollama2MistralForCausalLM.from_pretrained(model_path, low_cpu_mem_usage=True, **kwargs)
+ elif 'mixtral' in model_base.lower():
tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=False, token=token)
model = Videollama2MixtralForCausalLM.from_pretrained(model_path, low_cpu_mem_usage=True, **kwargs)
+ elif 'qwen2' in model_base.lower():
+ tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=False, token=token)
+ model = Videollama2Qwen2ForCausalLM.from_pretrained(model_path, low_cpu_mem_usage=True, **kwargs)
+        elif 'gemma2' in model_base.lower():
+            tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=False, token=token)
+            model = Videollama2Gemma2ForCausalLM.from_pretrained(model_path, low_cpu_mem_usage=True, **kwargs)
+        elif 'phi3' in model_base.lower():
+            tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=False, token=token)
+            model = Videollama2Phi3ForCausalLM.from_pretrained(model_path, low_cpu_mem_usage=True, **kwargs)
else:
# NOTE: mistral-based model is our default model.
tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=False, token=token)
@@ -146,15 +199,7 @@ def load_pretrained_model(model_path, model_base, model_name, load_8bit=False, l
processor = None
- if "videollama" in model_name.lower():
- mm_use_im_start_end = getattr(model.config, "mm_use_im_start_end", False)
- mm_use_im_patch_token = getattr(model.config, "mm_use_im_patch_token", True)
- if mm_use_im_patch_token:
- tokenizer.add_tokens([DEFAULT_IMAGE_PATCH_TOKEN], special_tokens=True)
- if mm_use_im_start_end:
- tokenizer.add_tokens([DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN], special_tokens=True)
- model.resize_token_embeddings(len(tokenizer))
-
+ if "videollama" in model_name.lower() or 'vlb' in model_name.lower():
vision_tower = model.get_vision_tower()
if not vision_tower.is_loaded:
vision_tower.load_model()
diff --git a/VideoLLaMA2/videollama2/model/encoder.py b/VideoLLaMA2/videollama2/model/encoder.py
new file mode 100644
index 0000000000000000000000000000000000000000..d7be137cfdaa3a362f2d3728ed29aaffcedd52ca
--- /dev/null
+++ b/VideoLLaMA2/videollama2/model/encoder.py
@@ -0,0 +1,180 @@
+import os
+
+import torch
+import torch.nn as nn
+
+from transformers import (
+ CLIPVisionModel, CLIPImageProcessor, CLIPVisionConfig,
+ SiglipVisionModel, SiglipImageProcessor, SiglipVisionConfig
+)
+
+
+class CLIPVisionTower(nn.Module):
+
+ def __init__(self, vision_tower, args, delay_load=False):
+ super().__init__()
+
+ self.is_loaded = False
+
+ self.vision_tower_name = vision_tower
+ self.select_layer = args.mm_vision_select_layer
+ self.select_feature = getattr(args, 'mm_vision_select_feature', 'patch')
+
+ if not delay_load:
+ self.load_model()
+ else:
+ self.cfg_only = CLIPVisionConfig.from_pretrained(self.vision_tower_name)
+
+ def load_model(self):
+ self.image_processor = CLIPImageProcessor.from_pretrained(self.vision_tower_name)
+
+ self.vision_tower = CLIPVisionModel.from_pretrained(self.vision_tower_name)
+ self.vision_tower.requires_grad_(False)
+
+ self.is_loaded = True
+
+ def feature_select(self, image_forward_outs):
+ image_features = image_forward_outs.hidden_states[self.select_layer]
+ if self.select_feature == 'patch':
+ image_features = image_features[:, 1:]
+ elif self.select_feature == 'cls_patch':
+ image_features = image_features
+ else:
+ raise ValueError(f'Unexpected select feature: {self.select_feature}')
+ return image_features
+
+ @torch.no_grad()
+ def forward(self, images):
+ if type(images) is list:
+ image_features = []
+ for image in images:
+ image_forward_out = self.vision_tower(image.to(device=self.device, dtype=self.dtype).unsqueeze(0), output_hidden_states=True)
+ image_feature = self.feature_select(image_forward_out).to(image.dtype)
+ image_features.append(image_feature)
+ else:
+ image_forward_outs = self.vision_tower(images.to(device=self.device, dtype=self.dtype), output_hidden_states=True)
+ image_features = self.feature_select(image_forward_outs).to(images.dtype)
+
+ return image_features
+
+ @property
+ def dummy_feature(self):
+ return torch.zeros(1, self.hidden_size, device=self.device, dtype=self.dtype)
+
+ @property
+ def dtype(self):
+ return self.vision_tower.dtype
+
+ @property
+ def device(self):
+ return self.vision_tower.device
+
+ @property
+ def config(self):
+ if self.is_loaded:
+ return self.vision_tower.config
+ else:
+ return self.cfg_only
+
+ @property
+ def hidden_size(self):
+ return self.config.hidden_size
+
+ @property
+ def num_patches(self):
+ return (self.config.image_size // self.config.patch_size) ** 2
+
+ @property
+ def num_patches_per_side(self):
+ return self.config.image_size // self.config.patch_size
+
+
+class SiglipVisionTower(nn.Module):
+
+ def __init__(self, vision_tower, args, delay_load=False):
+ super().__init__()
+
+ self.is_loaded = False
+
+ self.vision_tower_name = vision_tower
+ self.select_layer = args.mm_vision_select_layer
+ self.select_feature = getattr(args, 'mm_vision_select_feature', 'patch')
+
+ if not delay_load:
+ self.load_model()
+ else:
+ self.cfg_only = SiglipVisionConfig.from_pretrained(self.vision_tower_name)
+
+ def load_model(self):
+ self.image_processor = SiglipImageProcessor.from_pretrained(self.vision_tower_name)
+
+ self.vision_tower = SiglipVisionModel.from_pretrained(self.vision_tower_name)
+ self.vision_tower.requires_grad_(False)
+
+ self.is_loaded = True
+
+ def feature_select(self, image_forward_outs):
+ image_features = image_forward_outs.hidden_states[self.select_layer]
+ if self.select_feature == 'patch':
+ image_features = image_features
+ else:
+ raise ValueError(f'Unexpected select feature: {self.select_feature}')
+ return image_features
+
+ @torch.no_grad()
+ def forward(self, images):
+ if type(images) is list:
+ image_features = []
+ for image in images:
+ image_forward_out = self.vision_tower(image.to(device=self.device, dtype=self.dtype).unsqueeze(0), output_hidden_states=True)
+ image_feature = self.feature_select(image_forward_out).to(image.dtype)
+ image_features.append(image_feature)
+ else:
+ image_forward_outs = self.vision_tower(images.to(device=self.device, dtype=self.dtype), output_hidden_states=True)
+ image_features = self.feature_select(image_forward_outs).to(images.dtype)
+
+ return image_features
+
+ @property
+ def dummy_feature(self):
+ return torch.zeros(1, self.hidden_size, device=self.device, dtype=self.dtype)
+
+ @property
+ def dtype(self):
+ return self.vision_tower.dtype
+
+ @property
+ def device(self):
+ return self.vision_tower.device
+
+ @property
+ def config(self):
+ if self.is_loaded:
+ return self.vision_tower.config
+ else:
+ return self.cfg_only
+
+ @property
+ def hidden_size(self):
+ return self.config.hidden_size
+
+ @property
+ def num_patches(self):
+ return (self.config.image_size // self.config.patch_size) ** 2
+
+ @property
+ def num_patches_per_side(self):
+ return self.config.image_size // self.config.patch_size
+
+
+def build_vision_tower(vision_tower_cfg, **kwargs):
+ vision_tower = getattr(vision_tower_cfg, 'mm_vision_tower', getattr(vision_tower_cfg, 'vision_tower', None))
+
+ if 'clip' in vision_tower:
+ vision_tower = CLIPVisionTower(vision_tower, args=vision_tower_cfg, **kwargs)
+ elif 'siglip' in vision_tower:
+ vision_tower = SiglipVisionTower(vision_tower, args=vision_tower_cfg, **kwargs)
+ else:
+ raise ValueError(f'Unknown vision tower: {vision_tower}')
+
+ return vision_tower
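+
+# Example (sketch): with mm_vision_tower='openai/clip-vit-large-patch14-336', build_vision_tower returns a
+# frozen CLIPVisionTower whose forward() maps images of shape [B, 3, 336, 336] to patch features of shape
+# [B, 576, hidden_size], since (336 / 14) ** 2 = 576 patches remain after dropping the CLS token.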
diff --git a/videollama2/model/multimodal_projector/builder.py b/VideoLLaMA2/videollama2/model/projector.py
similarity index 99%
rename from videollama2/model/multimodal_projector/builder.py
rename to VideoLLaMA2/videollama2/model/projector.py
index 333a424687f7411a75cb42ae3eaed08c0367686c..9acddd2f4711f776bf7bf34e77b4be3fba7ca2cb 100644
--- a/videollama2/model/multimodal_projector/builder.py
+++ b/VideoLLaMA2/videollama2/model/projector.py
@@ -20,7 +20,7 @@ import torch
import torch.nn as nn
import torch.nn.functional as F
from timm.models.regnet import RegStage
-from timm.models.layers import LayerNorm2d
+from timm.models.layers import LayerNorm, LayerNorm2d
from transformers import TRANSFORMERS_CACHE
diff --git a/videollama2/model/videollama2_arch.py b/VideoLLaMA2/videollama2/model/videollama2_arch.py
similarity index 58%
rename from videollama2/model/videollama2_arch.py
rename to VideoLLaMA2/videollama2/model/videollama2_arch.py
index 6190671b76a26919ca612ac6c22ff6960bd6ab0c..7d0c2aef14140a97eec343a6165ffe7b861beb58 100644
--- a/videollama2/model/videollama2_arch.py
+++ b/VideoLLaMA2/videollama2/model/videollama2_arch.py
@@ -20,10 +20,9 @@ import einops
import torch
import torch.nn as nn
-from .multimodal_encoder.builder import build_vision_tower
-from .multimodal_projector.builder import build_vision_projector
-from ..mm_utils import get_anyres_image_grid_shape
-from ..constants import NUM_FRAMES, IGNORE_INDEX, IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_PATCH_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN,DEFAULT_MMODAL_PATCH_TOKEN, DEFAULT_MMODAL_START_TOKEN, DEFAULT_MMODAL_END_TOKEN, MMODAL_TOKEN_INDEX
+from .projector import load_mm_projector, build_vision_projector
+from .encoder import build_vision_tower
+from ..constants import IGNORE_INDEX, NUM_FRAMES, MODAL_INDEX_MAP
class Videollama2MetaModel:
@@ -79,7 +78,10 @@ class Videollama2MetaModel:
if pretrain_mm_mlp_adapter is not None:
if os.path.exists(pretrain_mm_mlp_adapter):
is_local = True
- mm_projector_weights = torch.load(pretrain_mm_mlp_adapter, map_location='cpu')
+ if os.path.isdir(pretrain_mm_mlp_adapter):
+ mm_projector_weights = load_mm_projector(pretrain_mm_mlp_adapter)
+ else:
+ mm_projector_weights = torch.load(pretrain_mm_mlp_adapter, map_location='cpu')
else:
# Support loading projector weights from remote HuggingFace model hub
is_local = False
@@ -110,16 +112,23 @@ class Videollama2MetaForCausalLM(ABC):
def get_vision_tower(self):
return self.get_model().get_vision_tower()
- def encode_images_or_videos(self, images_or_videos, modalities):
+ def encode_images_or_videos(self, images):
num_frames = self.config.num_frames if hasattr(self.config, 'num_frames') else NUM_FRAMES
- videos = [x.unsqueeze(0).expand(num_frames, -1, -1, -1) if modal == 'image' else x for x, modal in zip(images_or_videos, modalities)]
- videos = torch.stack(videos, dim=0)
+ data_batch = []
+ for i, (data, modal) in enumerate(images):
+ if modal == 'image':
+ data = data.expand(num_frames, -1, -1, -1)
+ else:
+ data = data
+ data_batch.append(data)
+
+ data_batch = torch.stack(data_batch, dim=0)
- assert len(videos.size()) == 5
- batch_size = videos.size(0)
+ assert len(data_batch.size()) == 5
+ batch_size = data_batch.size(0)
- frames = einops.rearrange(videos, 'b t c h w -> (b t) c h w')
+ frames = einops.rearrange(data_batch, 'b t c h w -> (b t) c h w')
frames_features = self.get_model().get_vision_tower()(frames)
frames_features = einops.rearrange(frames_features, '(b t) n h -> b t n h', b = batch_size)
@@ -151,58 +160,57 @@ class Videollama2MetaForCausalLM(ABC):
return video_features
def prepare_inputs_labels_for_multimodal(
- self, input_ids, attention_mask, past_key_values, labels, X_modalities
+ self, input_ids, attention_mask, past_key_values, labels, images
):
vision_tower = self.get_vision_tower()
# NOTE: text-only situation
- if vision_tower is None or X_modalities is None or input_ids.shape[1] == 1:
+ if vision_tower is None or images is None or input_ids.shape[1] == 1:
# if past_key_values is not None and vision_tower is not None and Xs is not None and input_ids.shape[1] == 1:
# attention_mask = torch.ones((attention_mask.shape[0], past_key_values[-1][-1].shape[-2] + 1), dtype=attention_mask.dtype, device=attention_mask.device)
return input_ids, attention_mask, past_key_values, None, labels
- Xs, keys = X_modalities
- X_features = self.encode_images_or_videos(Xs, keys)
+ mm_features = self.encode_images_or_videos(images)
new_input_embeds = []
new_labels = [] if labels is not None else None
- cur_X_idx = 0
+ cur_mm_idx = 0
# replace image/video/audio tokens with pre-computed embeddings
for batch_idx, cur_input_ids in enumerate(input_ids):
- # cur_X_features = X_features[batch_idx]
- if (torch.any(torch.stack([cur_input_ids == MMODAL_TOKEN_INDEX[key.upper()] for key in keys]), dim=0)).sum() == 0:
+ num_multimodals = sum((cur_input_ids == mm_token_idx).sum() for mm_token_idx in MODAL_INDEX_MAP.values())
+ # pure text input
+ if num_multimodals == 0:
half_len = cur_input_ids.shape[0] // 2
- cur_X_features = X_features[cur_X_idx]
+ cur_mm_features = mm_features[cur_mm_idx]
cur_input_embeds_1 = self.get_model().embed_tokens(cur_input_ids[:half_len])
cur_input_embeds_2 = self.get_model().embed_tokens(cur_input_ids[half_len:])
- cur_input_embeds = torch.cat([cur_input_embeds_1, cur_X_features[0:0], cur_input_embeds_2], dim=0)
+ cur_input_embeds = torch.cat([cur_input_embeds_1, cur_mm_features[0:0], cur_input_embeds_2], dim=0)
new_input_embeds.append(cur_input_embeds)
if labels is not None:
new_labels.append(labels[batch_idx])
- cur_X_idx += 1
+ cur_mm_idx += 1
continue
- X_token_indices = torch.where(torch.any(torch.stack([cur_input_ids == MMODAL_TOKEN_INDEX[key.upper()] for key in keys]), dim=0))[0]
cur_new_input_embeds = []
if labels is not None:
cur_labels = labels[batch_idx]
cur_new_labels = []
assert cur_labels.shape == cur_input_ids.shape
-
- # X_index_inonesample = 0
- while X_token_indices.numel() > 0:
- cur_X_features = X_features[cur_X_idx]
- X_token_start = X_token_indices[0]
-
- cur_new_input_embeds.append(self.get_model().embed_tokens(cur_input_ids[:X_token_start]))
- cur_new_input_embeds.append(cur_X_features)
+
+ mm_token_indices = torch.where(sum([cur_input_ids == mm_token_idx for mm_token_idx in MODAL_INDEX_MAP.values()]))[0]
+ while mm_token_indices.numel() > 0:
+ cur_mm_features = mm_features[cur_mm_idx]
+ mm_token_start = mm_token_indices[0]
+
+ cur_new_input_embeds.append(self.get_model().embed_tokens(cur_input_ids[:mm_token_start]))
+ cur_new_input_embeds.append(cur_mm_features)
if labels is not None:
- cur_new_labels.append(cur_labels[:X_token_start])
- cur_new_labels.append(torch.full((cur_X_features.shape[0],), IGNORE_INDEX, device=labels.device, dtype=labels.dtype))
- cur_labels = cur_labels[X_token_start+1:]
+ cur_new_labels.append(cur_labels[:mm_token_start])
+ cur_new_labels.append(torch.full((cur_mm_features.shape[0],), IGNORE_INDEX, device=labels.device, dtype=labels.dtype))
+ cur_labels = cur_labels[mm_token_start+1:]
- cur_X_idx += 1
- cur_input_ids = cur_input_ids[X_token_start+1:]
- X_token_indices = torch.where(torch.any(torch.stack([cur_input_ids == MMODAL_TOKEN_INDEX[key.upper()] for key in keys]), dim=0))[0]
+ cur_mm_idx += 1
+ cur_input_ids = cur_input_ids[mm_token_start+1:]
+ mm_token_indices = torch.where(sum([cur_input_ids == mm_token_idx for mm_token_idx in MODAL_INDEX_MAP.values()]))[0]
if cur_input_ids.numel() > 0:
cur_new_input_embeds.append(self.get_model().embed_tokens(cur_input_ids))
@@ -254,93 +262,3 @@ class Videollama2MetaForCausalLM(ABC):
assert attention_mask.shape == new_input_embeds.shape[:2]
return None, attention_mask, past_key_values, new_input_embeds, new_labels
-
- def initialize_vision_tokenizer(self, model_args, tokenizer):
- if model_args.mm_use_im_patch_token:
- tokenizer.add_tokens([DEFAULT_IMAGE_PATCH_TOKEN], special_tokens=True)
- self.resize_token_embeddings(len(tokenizer))
-
- if model_args.mm_use_im_start_end:
- num_new_tokens = tokenizer.add_tokens([DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN], special_tokens=True)
- self.resize_token_embeddings(len(tokenizer))
-
- if num_new_tokens > 0:
- input_embeddings = self.get_input_embeddings().weight.data
- output_embeddings = self.get_output_embeddings().weight.data
-
- input_embeddings_avg = input_embeddings[:-num_new_tokens].mean(dim=0, keepdim=True)
- output_embeddings_avg = output_embeddings[:-num_new_tokens].mean(dim=0, keepdim=True)
-
- input_embeddings[-num_new_tokens:] = input_embeddings_avg
- output_embeddings[-num_new_tokens:] = output_embeddings_avg
-
- if model_args.tune_mm_mlp_adapter:
- for p in self.get_input_embeddings().parameters():
- p.requires_grad = True
- for p in self.get_output_embeddings().parameters():
- p.requires_grad = False
-
- if model_args.pretrain_mm_mlp_adapter:
- mm_projector_weights = torch.load(model_args.pretrain_mm_mlp_adapter, map_location='cpu')
- embed_tokens_weight = mm_projector_weights['model.embed_tokens.weight']
- assert num_new_tokens == 2
- if input_embeddings.shape == embed_tokens_weight.shape:
- input_embeddings[-num_new_tokens:] = embed_tokens_weight[-num_new_tokens:]
- elif embed_tokens_weight.shape[0] == num_new_tokens:
- input_embeddings[-num_new_tokens:] = embed_tokens_weight
- else:
- raise ValueError(f"Unexpected embed_tokens_weight shape. Pretrained: {embed_tokens_weight.shape}. Current: {input_embeddings.shape}. Numer of new tokens: {num_new_tokens}.")
- elif model_args.mm_use_im_patch_token:
- if model_args.tune_mm_mlp_adapter:
- for p in self.get_input_embeddings().parameters():
- p.requires_grad = False
- for p in self.get_output_embeddings().parameters():
- p.requires_grad = False
-
- def initialize_MM_tokenizer(self, model_args, tokenizer):
- if model_args.mm_use_im_patch_token:
- for modal in ['IMAGE', 'VIDEO', 'AUDIO']:
- tokenizer.add_tokens([DEFAULT_MMODAL_PATCH_TOKEN[modal.upper()]], special_tokens=True)
- # tokenizer.add_tokens([DEFAULT_IMAGE_PATCH_TOKEN], special_tokens=True)
- self.resize_token_embeddings(len(tokenizer))
-
- if model_args.mm_use_im_start_end:
- num_new_tokens = 0
- for modal in ['IMAGE', 'VIDEO', 'AUDIO']:
- num_new_tokens += tokenizer.add_tokens([DEFAULT_MMODAL_START_TOKEN[modal.upper()], DEFAULT_MMODAL_END_TOKEN[modal.upper()]], special_tokens=True)
- self.resize_token_embeddings(len(tokenizer))
-
- if num_new_tokens > 0:
- input_embeddings = self.get_input_embeddings().weight.data
- output_embeddings = self.get_output_embeddings().weight.data
-
- input_embeddings_avg = input_embeddings[:-num_new_tokens].mean(
- dim=0, keepdim=True)
- output_embeddings_avg = output_embeddings[:-num_new_tokens].mean(
- dim=0, keepdim=True)
-
- input_embeddings[-num_new_tokens:] = input_embeddings_avg
- output_embeddings[-num_new_tokens:] = output_embeddings_avg
-
- if model_args.tune_mm_mlp_adapter:
- for p in self.get_input_embeddings().parameters():
- p.requires_grad = True
- for p in self.get_output_embeddings().parameters():
- p.requires_grad = False
-
- if model_args.pretrain_mm_mlp_adapter:
- mm_projector_weights = torch.load(model_args.pretrain_mm_mlp_adapter, map_location='cpu')
- embed_tokens_weight = mm_projector_weights['model.embed_tokens.weight']
- assert num_new_tokens == 6 # start/end tokens for image/video/audio
- if input_embeddings.shape == embed_tokens_weight.shape:
- input_embeddings[-num_new_tokens:] = embed_tokens_weight[-num_new_tokens:]
- elif embed_tokens_weight.shape[0] == num_new_tokens:
- input_embeddings[-num_new_tokens:] = embed_tokens_weight
- else:
- raise ValueError(f"Unexpected embed_tokens_weight shape. Pretrained: {embed_tokens_weight.shape}. Current: {input_embeddings.shape}. Numer of new tokens: {num_new_tokens}.")
- elif model_args.mm_use_im_patch_token:
- if model_args.tune_mm_mlp_adapter:
- for p in self.get_input_embeddings().parameters():
- p.requires_grad = False
- for p in self.get_output_embeddings().parameters():
- p.requires_grad = False
diff --git a/VideoLLaMA2/videollama2/model/videollama2_gemma2.py b/VideoLLaMA2/videollama2/model/videollama2_gemma2.py
new file mode 100644
index 0000000000000000000000000000000000000000..5e84169f94bced91c3ae4f8b8a17f1e97e267c2a
--- /dev/null
+++ b/VideoLLaMA2/videollama2/model/videollama2_gemma2.py
@@ -0,0 +1,157 @@
+# Adopted from: https://github.com/haotian-liu/LLaVA. Below is the original copyright:
+# Copyright 2023 Haotian Liu
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+from typing import List, Optional, Tuple, Union
+
+import torch
+import torch.nn as nn
+from torch.nn import CrossEntropyLoss
+
+from transformers import AutoConfig, AutoModelForCausalLM, \
+ Gemma2Config, Gemma2Model, Gemma2ForCausalLM
+
+from transformers.modeling_outputs import CausalLMOutputWithPast
+from transformers.generation.utils import GenerateOutput
+
+from .videollama2_arch import Videollama2MetaModel, Videollama2MetaForCausalLM
+
+
+class Videollama2Gemma2Config(Gemma2Config):
+ model_type = "videollama2_gemma2"
+
+ def __init__(self, **kwargs):
+ super().__init__(**kwargs)
+ self.model_type = "videollama2_gemma2"
+
+
+class Videollama2Gemma2Model(Videollama2MetaModel, Gemma2Model):
+ config_class = Videollama2Gemma2Config
+
+ def __init__(self, config: Gemma2Config):
+ super(Videollama2Gemma2Model, self).__init__(config)
+
+
+class Videollama2Gemma2ForCausalLM(Gemma2ForCausalLM, Videollama2MetaForCausalLM):
+ config_class = Videollama2Gemma2Config
+
+ def __init__(self, config, **kwargs):
+ super(Gemma2ForCausalLM, self).__init__(config)
+ self.model = Videollama2Gemma2Model(config)
+ # self.pretraining_tp = config.pretraining_tp
+ self.vocab_size = config.vocab_size
+ self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
+
+ # Initialize weights and apply final processing
+ self.post_init()
+
+ def get_model(self):
+ return self.model
+
+ def forward(
+ self,
+ input_ids: torch.LongTensor = None,
+ attention_mask: Optional[torch.Tensor] = None,
+ position_ids: Optional[torch.LongTensor] = None,
+ past_key_values: Optional[List[torch.FloatTensor]] = None,
+ inputs_embeds: Optional[torch.FloatTensor] = None,
+ labels: Optional[torch.LongTensor] = None,
+ use_cache: Optional[bool] = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ images: Optional[torch.FloatTensor] = None,
+ return_dict: Optional[bool] = None,
+ **kwargs
+ ) -> Union[Tuple, CausalLMOutputWithPast]:
+
+ if inputs_embeds is None:
+ (
+ input_ids,
+ attention_mask,
+ past_key_values,
+ inputs_embeds,
+ labels
+ ) = self.prepare_inputs_labels_for_multimodal(
+ input_ids,
+ attention_mask,
+ past_key_values,
+ labels,
+ images
+ )
+
+ outputs = super().forward(
+ input_ids=input_ids,
+ attention_mask=attention_mask,
+ past_key_values=past_key_values,
+ inputs_embeds=inputs_embeds,
+ labels=labels,
+ use_cache=use_cache,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ return_dict=return_dict
+ )
+
+ outputs.labels = labels
+
+ return outputs
+
+ @torch.no_grad()
+ def generate(
+ self,
+ inputs: Optional[torch.Tensor] = None,
+ images: Optional[torch.Tensor] = None,
+ **kwargs,
+ ) -> Union[GenerateOutput, torch.LongTensor]:
+ position_ids = kwargs.pop("position_ids", None)
+ attention_mask = kwargs.pop("attention_mask", None)
+ if "inputs_embeds" in kwargs:
+ raise NotImplementedError("`inputs_embeds` is not supported")
+
+ if images is not None:
+ (
+ input_ids,
+ attention_mask,
+ past_key_values,
+ inputs_embeds,
+ _
+ ) = self.prepare_inputs_labels_for_multimodal(
+ input_ids=inputs,
+ attention_mask=attention_mask,
+ past_key_values=None,
+ labels=None,
+ images=images
+ )
+ else:
+ inputs_embeds = self.get_model().embed_tokens(inputs)
+
+ return super().generate(
+ position_ids=position_ids,
+ attention_mask=attention_mask,
+ inputs_embeds=inputs_embeds,
+ **kwargs
+ )
+
+ def prepare_inputs_for_generation(self, input_ids, past_key_values=None, inputs_embeds=None, **kwargs):
+ images = kwargs.pop("images", None)
+ _inputs = super().prepare_inputs_for_generation(
+ input_ids, past_key_values=past_key_values, inputs_embeds=inputs_embeds, **kwargs
+ )
+ if images is not None:
+ _inputs['images'] = images
+ return _inputs
+
+
+AutoConfig.register("videollama2_gemma2", Videollama2Gemma2Config)
+AutoModelForCausalLM.register(Videollama2Gemma2Config, Videollama2Gemma2ForCausalLM)
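+
+# Illustrative loading sketch: with the two registrations above, a checkpoint whose
+# config.json contains  "model_type": "videollama2_gemma2"  resolves automatically, e.g.
+#   from transformers import AutoModelForCausalLM
+#   model = AutoModelForCausalLM.from_pretrained("path/to/checkpoint")  # hypothetical path
+# returns a Videollama2Gemma2ForCausalLM instance.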
diff --git a/videollama2/model/language_model/videollama2_llama.py b/VideoLLaMA2/videollama2/model/videollama2_llama.py
similarity index 92%
rename from videollama2/model/language_model/videollama2_llama.py
rename to VideoLLaMA2/videollama2/model/videollama2_llama.py
index a4c800627022fc54459d240d7be4d0bd5eec7ca2..52d75a7fda4b94196a8a692ce435b5df66b11733 100644
--- a/videollama2/model/language_model/videollama2_llama.py
+++ b/VideoLLaMA2/videollama2/model/videollama2_llama.py
@@ -24,12 +24,16 @@ from transformers import AutoConfig, AutoModelForCausalLM, \
from transformers.modeling_outputs import CausalLMOutputWithPast
from transformers.generation.utils import GenerateOutput
-from ..videollama2_arch import Videollama2MetaModel, Videollama2MetaForCausalLM
+from .videollama2_arch import Videollama2MetaModel, Videollama2MetaForCausalLM
class Videollama2Config(LlamaConfig):
model_type = "videollama2_llama"
+ def __init__(self, **kwargs):
+ super().__init__(**kwargs)
+ self.model_type = "videollama2_llama"
+
class Videollama2LlamaModel(Videollama2MetaModel, LlamaModel):
config_class = Videollama2Config
@@ -67,6 +71,7 @@ class Videollama2LlamaForCausalLM(LlamaForCausalLM, Videollama2MetaForCausalLM):
output_hidden_states: Optional[bool] = None,
images: Optional[torch.FloatTensor] = None,
return_dict: Optional[bool] = None,
+ **kwargs
) -> Union[Tuple, CausalLMOutputWithPast]:
if inputs_embeds is None:
@@ -84,7 +89,7 @@ class Videollama2LlamaForCausalLM(LlamaForCausalLM, Videollama2MetaForCausalLM):
images
)
- return super().forward(
+ outputs = super().forward(
input_ids=input_ids,
attention_mask=attention_mask,
past_key_values=past_key_values,
@@ -96,12 +101,15 @@ class Videollama2LlamaForCausalLM(LlamaForCausalLM, Videollama2MetaForCausalLM):
return_dict=return_dict
)
+ outputs.labels = labels
+
+ return outputs
+
@torch.no_grad()
def generate(
self,
inputs: Optional[torch.Tensor] = None,
- images_or_videos: Optional[torch.Tensor] = None,
- modal_list: Optional[torch.Tensor] = None,
+ images: Optional[torch.Tensor] = None,
**kwargs,
) -> Union[GenerateOutput, torch.LongTensor]:
position_ids = kwargs.pop("position_ids", None)
@@ -109,7 +117,7 @@ class Videollama2LlamaForCausalLM(LlamaForCausalLM, Videollama2MetaForCausalLM):
if "inputs_embeds" in kwargs:
raise NotImplementedError("`inputs_embeds` is not supported")
- if images_or_videos is not None:
+ if images is not None:
(
input_ids,
attention_mask,
@@ -121,7 +129,7 @@ class Videollama2LlamaForCausalLM(LlamaForCausalLM, Videollama2MetaForCausalLM):
attention_mask=attention_mask,
past_key_values=None,
labels=None,
- X_modalities=[images_or_videos, modal_list]
+ images=images
)
else:
inputs_embeds = self.get_model().embed_tokens(inputs)
diff --git a/videollama2/model/language_model/videollama2_mistral.py b/VideoLLaMA2/videollama2/model/videollama2_mistral.py
similarity index 90%
rename from videollama2/model/language_model/videollama2_mistral.py
rename to VideoLLaMA2/videollama2/model/videollama2_mistral.py
index b81a6de9313f44bde447084e089e237300d77b5b..a4380df390b944e8b5f3d885a17e80ee92dfb56a 100644
--- a/videollama2/model/language_model/videollama2_mistral.py
+++ b/VideoLLaMA2/videollama2/model/videollama2_mistral.py
@@ -20,18 +20,22 @@ import torch
import torch.nn as nn
from torch.nn import CrossEntropyLoss
-from transformers import AutoConfig, AutoModelForCausalLM, \
+from transformers import AutoConfig, AutoModelForCausalLM, PretrainedConfig, \
MistralConfig, MistralModel, MistralForCausalLM
from transformers.modeling_outputs import CausalLMOutputWithPast
from transformers.generation.utils import GenerateOutput
-from ..videollama2_arch import Videollama2MetaModel, Videollama2MetaForCausalLM
+from .videollama2_arch import Videollama2MetaModel, Videollama2MetaForCausalLM
class Videollama2MistralConfig(MistralConfig):
model_type = "videollama2_mistral"
+ def __init__(self, **kwargs):
+ super().__init__(**kwargs)
+ self.model_type = "videollama2_mistral"
+
class Videollama2MistralModel(Videollama2MetaModel, MistralModel):
config_class = Videollama2MistralConfig
@@ -69,6 +73,7 @@ class Videollama2MistralForCausalLM(MistralForCausalLM, Videollama2MetaForCausal
output_hidden_states: Optional[bool] = None,
images: Optional[torch.FloatTensor] = None,
return_dict: Optional[bool] = None,
+ **kwargs
) -> Union[Tuple, CausalLMOutputWithPast]:
if inputs_embeds is None:
@@ -86,7 +91,7 @@ class Videollama2MistralForCausalLM(MistralForCausalLM, Videollama2MetaForCausal
images
)
- return super().forward(
+ outputs = super().forward(
input_ids=input_ids,
attention_mask=attention_mask,
past_key_values=past_key_values,
@@ -98,12 +103,15 @@ class Videollama2MistralForCausalLM(MistralForCausalLM, Videollama2MetaForCausal
return_dict=return_dict
)
+ outputs.labels = labels
+
+ return outputs
+
@torch.no_grad()
def generate(
self,
inputs: Optional[torch.Tensor] = None,
- images_or_videos: Optional[torch.Tensor] = None,
- modal_list: Optional[torch.Tensor] = None,
+ images: Optional[torch.Tensor] = None,
**kwargs,
) -> Union[GenerateOutput, torch.LongTensor]:
position_ids = kwargs.pop("position_ids", None)
@@ -111,7 +119,7 @@ class Videollama2MistralForCausalLM(MistralForCausalLM, Videollama2MetaForCausal
if "inputs_embeds" in kwargs:
raise NotImplementedError("`inputs_embeds` is not supported")
- if images_or_videos is not None:
+ if images is not None:
(
input_ids,
attention_mask,
@@ -123,7 +131,7 @@ class Videollama2MistralForCausalLM(MistralForCausalLM, Videollama2MetaForCausal
attention_mask=attention_mask,
past_key_values=None,
labels=None,
- X_modalities=[images_or_videos, modal_list]
+ images=images
)
else:
inputs_embeds = self.get_model().embed_tokens(inputs)
diff --git a/videollama2/model/language_model/videollama2_mixtral.py b/VideoLLaMA2/videollama2/model/videollama2_mixtral.py
similarity index 91%
rename from videollama2/model/language_model/videollama2_mixtral.py
rename to VideoLLaMA2/videollama2/model/videollama2_mixtral.py
index 7b9fe23c00863368406572773faefedd755aeef9..9bbdebb2995afec3edcfcf6080823692aa86ba81 100644
--- a/videollama2/model/language_model/videollama2_mixtral.py
+++ b/VideoLLaMA2/videollama2/model/videollama2_mixtral.py
@@ -25,12 +25,16 @@ from transformers import AutoConfig, AutoModelForCausalLM, \
from transformers.modeling_outputs import CausalLMOutputWithPast
from transformers.generation.utils import GenerateOutput
-from ..videollama2_arch import Videollama2MetaModel, Videollama2MetaForCausalLM
+from .videollama2_arch import Videollama2MetaModel, Videollama2MetaForCausalLM
class Videollama2MixtralConfig(MixtralConfig):
model_type = "videollama2_mixtral"
+ def __init__(self, **kwargs):
+ super().__init__(**kwargs)
+ self.model_type = "videollama2_mixtral"
+
class Videollama2MixtralModel(Videollama2MetaModel, MixtralModel):
config_class = Videollama2MixtralConfig
@@ -68,6 +72,7 @@ class Videollama2MixtralForCausalLM(MixtralForCausalLM, Videollama2MetaForCausal
output_hidden_states: Optional[bool] = None,
images: Optional[torch.FloatTensor] = None,
return_dict: Optional[bool] = None,
+ **kwargs
) -> Union[Tuple, CausalLMOutputWithPast]:
if inputs_embeds is None:
@@ -101,9 +106,7 @@ class Videollama2MixtralForCausalLM(MixtralForCausalLM, Videollama2MetaForCausal
def generate(
self,
inputs: Optional[torch.Tensor] = None,
- images_or_videos: Optional[torch.Tensor] = None,
- timestamps: Optional[torch.Tensor] = None,
- modal_list: Optional[torch.Tensor] = None,
+ images: Optional[torch.Tensor] = None,
**kwargs,
) -> Union[GenerateOutput, torch.LongTensor]:
position_ids = kwargs.pop("position_ids", None)
@@ -111,8 +114,7 @@ class Videollama2MixtralForCausalLM(MixtralForCausalLM, Videollama2MetaForCausal
if "inputs_embeds" in kwargs:
raise NotImplementedError("`inputs_embeds` is not supported")
- if images_or_videos is not None:
- X_modalities = [images_or_videos, modal_list] if timestamps is None else [images_or_videos, modal_list, timestamps]
+ if images is not None:
(
input_ids,
attention_mask,
@@ -124,7 +126,7 @@ class Videollama2MixtralForCausalLM(MixtralForCausalLM, Videollama2MetaForCausal
attention_mask=attention_mask,
past_key_values=None,
labels=None,
- X_modalities=X_modalities
+ images=images
)
else:
inputs_embeds = self.get_model().embed_tokens(inputs)
@@ -145,5 +147,6 @@ class Videollama2MixtralForCausalLM(MixtralForCausalLM, Videollama2MetaForCausal
_inputs['images'] = images
return _inputs
+
AutoConfig.register("videollama2_mixtral", Videollama2MixtralConfig)
AutoModelForCausalLM.register(Videollama2MixtralConfig, Videollama2MixtralForCausalLM)
diff --git a/VideoLLaMA2/videollama2/model/videollama2_phi3.py b/VideoLLaMA2/videollama2/model/videollama2_phi3.py
new file mode 100644
index 0000000000000000000000000000000000000000..894499ab6999373c5209e5d220ae2158e8147ce7
--- /dev/null
+++ b/VideoLLaMA2/videollama2/model/videollama2_phi3.py
@@ -0,0 +1,157 @@
+# Adopted from: https://github.com/haotian-liu/LLaVA. Below is the original copyright:
+# Copyright 2023 Haotian Liu
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+from typing import List, Optional, Tuple, Union
+
+import torch
+import torch.nn as nn
+from torch.nn import CrossEntropyLoss
+
+from transformers import AutoConfig, AutoModelForCausalLM, PretrainedConfig, \
+ Phi3Config, Phi3Model, Phi3ForCausalLM
+
+from transformers.modeling_outputs import CausalLMOutputWithPast
+from transformers.generation.utils import GenerateOutput
+
+from .videollama2_arch import Videollama2MetaModel, Videollama2MetaForCausalLM
+
+
+class Videollama2Phi3Config(Phi3Config):
+ model_type = "videollama2_phi3"
+
+ def __init__(self, **kwargs):
+ super().__init__(**kwargs)
+ self.model_type = "videollama2_phi3"
+
+
+class Videollama2Phi3Model(Videollama2MetaModel, Phi3Model):
+ config_class = Videollama2Phi3Config
+
+ def __init__(self, config: Phi3Config):
+ super(Videollama2Phi3Model, self).__init__(config)
+
+
+class Videollama2Phi3ForCausalLM(Phi3ForCausalLM, Videollama2MetaForCausalLM):
+ config_class = Videollama2Phi3Config
+
+ def __init__(self, config, **kwargs):
+ super(Phi3ForCausalLM, self).__init__(config)
+ self.model = Videollama2Phi3Model(config)
+ # self.pretraining_tp = config.pretraining_tp
+ self.vocab_size = config.vocab_size
+ self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
+
+ # Initialize weights and apply final processing
+ self.post_init()
+
+ def get_model(self):
+ return self.model
+
+ def forward(
+ self,
+ input_ids: torch.LongTensor = None,
+ attention_mask: Optional[torch.Tensor] = None,
+ position_ids: Optional[torch.LongTensor] = None,
+ past_key_values: Optional[List[torch.FloatTensor]] = None,
+ inputs_embeds: Optional[torch.FloatTensor] = None,
+ labels: Optional[torch.LongTensor] = None,
+ use_cache: Optional[bool] = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ images: Optional[torch.FloatTensor] = None,
+ return_dict: Optional[bool] = None,
+ **kwargs
+ ) -> Union[Tuple, CausalLMOutputWithPast]:
+
+ if inputs_embeds is None:
+ (
+ input_ids,
+ attention_mask,
+ past_key_values,
+ inputs_embeds,
+ labels
+ ) = self.prepare_inputs_labels_for_multimodal(
+ input_ids,
+ attention_mask,
+ past_key_values,
+ labels,
+ images
+ )
+
+ outputs = super().forward(
+ input_ids=input_ids,
+ attention_mask=attention_mask,
+ past_key_values=past_key_values,
+ inputs_embeds=inputs_embeds,
+ labels=labels,
+ use_cache=use_cache,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ return_dict=return_dict
+ )
+
+ outputs.labels = labels
+
+ return outputs
+
+ @torch.no_grad()
+ def generate(
+ self,
+ inputs: Optional[torch.Tensor] = None,
+ images: Optional[torch.Tensor] = None,
+ **kwargs,
+ ) -> Union[GenerateOutput, torch.LongTensor]:
+ position_ids = kwargs.pop("position_ids", None)
+ attention_mask = kwargs.pop("attention_mask", None)
+ if "inputs_embeds" in kwargs:
+ raise NotImplementedError("`inputs_embeds` is not supported")
+
+ if images is not None:
+ (
+ input_ids,
+ attention_mask,
+ past_key_values,
+ inputs_embeds,
+ _
+ ) = self.prepare_inputs_labels_for_multimodal(
+ input_ids=inputs,
+ attention_mask=attention_mask,
+ past_key_values=None,
+ labels=None,
+ images=images
+ )
+ else:
+ inputs_embeds = self.get_model().embed_tokens(inputs)
+
+ return super().generate(
+ position_ids=position_ids,
+ attention_mask=attention_mask,
+ inputs_embeds=inputs_embeds,
+ **kwargs
+ )
+
+ def prepare_inputs_for_generation(self, input_ids, past_key_values=None, inputs_embeds=None, **kwargs):
+ images = kwargs.pop("images", None)
+ _inputs = super().prepare_inputs_for_generation(
+ input_ids, past_key_values=past_key_values, inputs_embeds=inputs_embeds, **kwargs
+ )
+ if images is not None:
+ _inputs['images'] = images
+ return _inputs
+
+
+AutoConfig.register("videollama2_phi3", Videollama2Phi3Config)
+AutoModelForCausalLM.register(Videollama2Phi3Config, Videollama2Phi3ForCausalLM)
diff --git a/VideoLLaMA2/videollama2/model/videollama2_qwen2.py b/VideoLLaMA2/videollama2/model/videollama2_qwen2.py
new file mode 100644
index 0000000000000000000000000000000000000000..90d728565dc690f21417efdf2a37ba9fa8f05c69
--- /dev/null
+++ b/VideoLLaMA2/videollama2/model/videollama2_qwen2.py
@@ -0,0 +1,151 @@
+# Adopted from: https://github.com/haotian-liu/LLaVA. Below is the original copyright:
+# Copyright 2023 Haotian Liu
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+from typing import List, Optional, Tuple, Union
+
+import torch
+import torch.nn as nn
+
+from transformers import AutoConfig, AutoModelForCausalLM, \
+ Qwen2Config, Qwen2Model, Qwen2ForCausalLM
+from transformers.modeling_outputs import CausalLMOutputWithPast
+from transformers.generation.utils import GenerateOutput
+
+from .videollama2_arch import Videollama2MetaModel, Videollama2MetaForCausalLM
+
+
+class Videollama2Qwen2Config(Qwen2Config):
+ model_type = "videollama2_qwen2"
+
+ def __init__(self, **kwargs):
+ super().__init__(**kwargs)
+ self.model_type = "videollama2_qwen2"
+
+
+class Videollama2Qwen2Model(Videollama2MetaModel, Qwen2Model):
+ config_class = Videollama2Qwen2Config
+
+ def __init__(self, config: Videollama2Qwen2Config):
+ super(Videollama2Qwen2Model, self).__init__(config)
+
+
+class Videollama2Qwen2ForCausalLM(Qwen2ForCausalLM, Videollama2MetaForCausalLM):
+ config_class = Videollama2Qwen2Config
+
+ def __init__(self, config, **kwargs):
+ super(Qwen2ForCausalLM, self).__init__(config)
+ self.model = Videollama2Qwen2Model(config)
+ # self.pretraining_tp = config.pretraining_tp
+ self.vocab_size = config.vocab_size
+ self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
+
+ # Initialize weights and apply final processing
+ self.post_init()
+
+ def get_model(self):
+ return self.model
+
+ def forward(
+ self,
+ input_ids: torch.LongTensor = None,
+ attention_mask: Optional[torch.Tensor] = None,
+ position_ids: Optional[torch.LongTensor] = None,
+ past_key_values: Optional[List[torch.FloatTensor]] = None,
+ inputs_embeds: Optional[torch.FloatTensor] = None,
+ labels: Optional[torch.LongTensor] = None,
+ use_cache: Optional[bool] = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ images: Optional[torch.FloatTensor] = None,
+ return_dict: Optional[bool] = None,
+ **kwargs
+ ) -> Union[Tuple, CausalLMOutputWithPast]:
+
+ if inputs_embeds is None:
+ (
+ input_ids,
+ attention_mask,
+ past_key_values,
+ inputs_embeds,
+ labels
+ ) = self.prepare_inputs_labels_for_multimodal(
+ input_ids,
+ attention_mask,
+ past_key_values,
+ labels,
+ images
+ )
+
+ return super().forward(
+ input_ids=input_ids,
+ attention_mask=attention_mask,
+ past_key_values=past_key_values,
+ inputs_embeds=inputs_embeds,
+ labels=labels,
+ use_cache=use_cache,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ return_dict=return_dict
+ )
+
+ @torch.no_grad()
+ def generate(
+ self,
+ inputs: Optional[torch.Tensor] = None,
+ images: Optional[torch.Tensor] = None,
+ **kwargs,
+ ) -> Union[GenerateOutput, torch.LongTensor]:
+ position_ids = kwargs.pop("position_ids", None)
+ attention_mask = kwargs.pop("attention_mask", None)
+ if "inputs_embeds" in kwargs:
+ raise NotImplementedError("`inputs_embeds` is not supported")
+
+ if images is not None:
+ (
+ input_ids,
+ attention_mask,
+ past_key_values,
+ inputs_embeds,
+ _
+ ) = self.prepare_inputs_labels_for_multimodal(
+ input_ids=inputs,
+ attention_mask=attention_mask,
+ past_key_values=None,
+ labels=None,
+ images=images
+ )
+ else:
+ inputs_embeds = self.get_model().embed_tokens(inputs)
+
+ return super().generate(
+ position_ids=position_ids,
+ attention_mask=attention_mask,
+ inputs_embeds=inputs_embeds,
+ **kwargs
+ )
+
+ def prepare_inputs_for_generation(self, input_ids, past_key_values=None, inputs_embeds=None, **kwargs):
+ images = kwargs.pop("images", None)
+ _inputs = super().prepare_inputs_for_generation(
+ input_ids, past_key_values=past_key_values, inputs_embeds=inputs_embeds, **kwargs
+ )
+ if images is not None:
+ _inputs['images'] = images
+ return _inputs
+
+
+AutoConfig.register("videollama2_qwen2", Videollama2Qwen2Config)
+AutoModelForCausalLM.register(Videollama2Qwen2Config, Videollama2Qwen2ForCausalLM)
diff --git a/videollama2/serve/cli.py b/VideoLLaMA2/videollama2/serve/cli.py
similarity index 95%
rename from videollama2/serve/cli.py
rename to VideoLLaMA2/videollama2/serve/cli.py
index f7bd1bac5ef1d22fcd4d60538f875c020cf93082..7a1fc24a38a9b727392ac74457a896f994891d76 100644
--- a/videollama2/serve/cli.py
+++ b/VideoLLaMA2/videollama2/serve/cli.py
@@ -47,15 +47,13 @@ def main(args):
tokenizer, model, image_processor, context_len = load_pretrained_model(args.model_path, args.model_base, model_name, args.load_8bit, args.load_4bit, device=args.device)
# if "llama-2" in model_name.lower():
- # conv_mode = "llava_llama_2"
+ # conv_mode = "llava_llama2"
# elif "mistral" in model_name.lower():
- # conv_mode = "mistral_instruct"
+ # conv_mode = "mistral"
# elif "v1.6-34b" in model_name.lower():
# conv_mode = "chatml_direct"
# elif "v1" in model_name.lower():
# conv_mode = "llava_v1"
- # elif "mpt" in model_name.lower():
- # conv_mode = "mpt"
# else:
# conv_mode = "llava_v0"
conv_mode = "llava_v1" # fix conversation mode for now
@@ -66,10 +64,7 @@ def main(args):
args.conv_mode = conv_mode
conv = conv_templates[args.conv_mode].copy()
- if "mpt" in model_name.lower():
- roles = ('user', 'assistant')
- else:
- roles = conv.roles
+ roles = conv.roles
image = load_image(args.image_file)
image_size = image.size
diff --git a/videollama2/serve/controller.py b/VideoLLaMA2/videollama2/serve/controller.py
similarity index 100%
rename from videollama2/serve/controller.py
rename to VideoLLaMA2/videollama2/serve/controller.py
diff --git a/videollama2/serve/examples/1034346401.mp4 b/VideoLLaMA2/videollama2/serve/examples/1034346401.mp4
similarity index 100%
rename from videollama2/serve/examples/1034346401.mp4
rename to VideoLLaMA2/videollama2/serve/examples/1034346401.mp4
diff --git a/videollama2/serve/examples/desert.jpg b/VideoLLaMA2/videollama2/serve/examples/desert.jpg
similarity index 100%
rename from videollama2/serve/examples/desert.jpg
rename to VideoLLaMA2/videollama2/serve/examples/desert.jpg
diff --git a/videollama2/serve/examples/extreme_ironing.jpg b/VideoLLaMA2/videollama2/serve/examples/extreme_ironing.jpg
similarity index 100%
rename from videollama2/serve/examples/extreme_ironing.jpg
rename to VideoLLaMA2/videollama2/serve/examples/extreme_ironing.jpg
diff --git a/videollama2/serve/examples/sample_demo_1.mp4 b/VideoLLaMA2/videollama2/serve/examples/sample_demo_1.mp4
similarity index 100%
rename from videollama2/serve/examples/sample_demo_1.mp4
rename to VideoLLaMA2/videollama2/serve/examples/sample_demo_1.mp4
diff --git a/videollama2/serve/examples/sample_demo_3.mp4 b/VideoLLaMA2/videollama2/serve/examples/sample_demo_3.mp4
similarity index 100%
rename from videollama2/serve/examples/sample_demo_3.mp4
rename to VideoLLaMA2/videollama2/serve/examples/sample_demo_3.mp4
diff --git a/videollama2/serve/examples/sample_demo_9.mp4 b/VideoLLaMA2/videollama2/serve/examples/sample_demo_9.mp4
similarity index 100%
rename from videollama2/serve/examples/sample_demo_9.mp4
rename to VideoLLaMA2/videollama2/serve/examples/sample_demo_9.mp4
diff --git a/videollama2/serve/examples/waterview.jpg b/VideoLLaMA2/videollama2/serve/examples/waterview.jpg
similarity index 100%
rename from videollama2/serve/examples/waterview.jpg
rename to VideoLLaMA2/videollama2/serve/examples/waterview.jpg
diff --git a/videollama2/serve/gradio_web_server.py b/VideoLLaMA2/videollama2/serve/gradio_web_server.py
similarity index 98%
rename from videollama2/serve/gradio_web_server.py
rename to VideoLLaMA2/videollama2/serve/gradio_web_server.py
index 7595fcd69d4e8d9ca0cbc464a2fb45a4a16ac200..2581ccce5240b6be055d963cd6724587fc86d888 100644
--- a/videollama2/serve/gradio_web_server.py
+++ b/VideoLLaMA2/videollama2/serve/gradio_web_server.py
@@ -230,7 +230,7 @@ def http_bot(state, model_selector, temperature, top_p, max_new_tokens, request:
# First round of conversation
if "llava" in model_name.lower():
if 'llama-2' in model_name.lower():
- template_name = "llava_llama_2"
+ template_name = "llava_llama2"
elif "v1" in model_name.lower():
if 'mmtag' in model_name.lower():
template_name = "v1_mmtag"
@@ -238,8 +238,6 @@ def http_bot(state, model_selector, temperature, top_p, max_new_tokens, request:
template_name = "v1_mmtag"
else:
template_name = "llava_v1"
- elif "mpt" in model_name.lower():
- template_name = "mpt"
else:
if 'mmtag' in model_name.lower():
template_name = "v0_mmtag"
@@ -247,10 +245,8 @@ def http_bot(state, model_selector, temperature, top_p, max_new_tokens, request:
template_name = "v0_mmtag"
else:
template_name = "llava_v0"
- elif "mpt" in model_name:
- template_name = "mpt_text"
elif "llama-2" in model_name:
- template_name = "llama_2"
+ template_name = "llama2"
else:
template_name = "vicuna_v1"
template_name = "llava_v1"
@@ -297,7 +293,7 @@ def http_bot(state, model_selector, temperature, top_p, max_new_tokens, request:
"temperature": float(temperature),
"top_p": float(top_p),
"max_new_tokens": min(int(max_new_tokens), 1536),
- "stop": state.sep if state.sep_style in [SeparatorStyle.SINGLE, SeparatorStyle.MPT] else state.sep2,
+ "stop": state.sep if state.sep_style in [SeparatorStyle.SINGLE] else state.sep2,
#"images": f'List of {len(state.get_images())} images: {all_image_hash}',
"images": f'List of {len(all_image_hash)} images: {all_image_hash}',
}
diff --git a/VideoLLaMA2/videollama2/serve/gradio_web_server_adhoc.py b/VideoLLaMA2/videollama2/serve/gradio_web_server_adhoc.py
new file mode 100644
index 0000000000000000000000000000000000000000..152a0ce0e0751011ee594704fc22e26f7003b01f
--- /dev/null
+++ b/VideoLLaMA2/videollama2/serve/gradio_web_server_adhoc.py
@@ -0,0 +1,290 @@
+# import spaces
+
+import os
+
+import torch
+import gradio as gr
+
+import sys
+sys.path.append('./')
+from videollama2 import model_init, mm_infer
+from videollama2.utils import disable_torch_init
+
+
+title_markdown = ("""
+
+
+
+
+
+
+ VideoLLaMA 2: Advancing Spatial-Temporal Modeling and Audio Understanding in Video-LLMs
+ If this demo pleases you, please give us a star ⭐ on GitHub or a 💖 on this Space.
+
+
+
+
+
+""")
+
+
+block_css = """
+#buttons button {
+ min-width: min(120px,100%);
+ color: #9C276A
+}
+"""
+
+
+tos_markdown = ("""
+### Terms of use
+By using this service, users are required to agree to the following terms:
+The service is a research preview intended for non-commercial use only. It only provides limited safety measures and may generate offensive content. It must not be used for any illegal, harmful, violent, racist, or sexual purposes. The service may collect user dialogue data for future research.
+Please click the "Flag" button if you get any inappropriate answer! We will collect those to keep improving our moderator.
+For an optimal experience, please use desktop computers for this demo, as mobile devices may compromise its quality.
+""")
+
+
+learn_more_markdown = ("""
+### License
+This project is released under the Apache 2.0 license as found in the LICENSE file. The service is a research preview intended for non-commercial use ONLY, subject to the model Licenses of LLaMA and Mistral, Terms of Use of the data generated by OpenAI, and Privacy Practices of ShareGPT. Please get in touch with us if you find any potential violations.
+""")
+
+
+plum_color = gr.themes.colors.Color(
+ name='plum',
+ c50='#F8E4EF',
+ c100='#E9D0DE',
+ c200='#DABCCD',
+ c300='#CBA8BC',
+ c400='#BC94AB',
+ c500='#AD809A',
+ c600='#9E6C89',
+ c700='#8F5878',
+ c800='#804467',
+ c900='#713056',
+ c950='#662647',
+)
+
+
+class Chat:
+
+ def __init__(self, model_path, load_8bit=False, load_4bit=False):
+ disable_torch_init()
+
+ self.model, self.processor, self.tokenizer = model_init(model_path, load_8bit=load_8bit, load_4bit=load_4bit)
+
+ # @spaces.GPU(duration=120)
+ @torch.inference_mode()
+ def generate(self, data: list, message, temperature, top_p, max_output_tokens):
+ # TODO: support multiple turns of conversation.
+ assert len(data) == 1
+
+ tensor, modal = data[0]
+ response = mm_infer(tensor, message, self.model, self.tokenizer, modal=modal.strip('<>'),
+ do_sample=True if temperature > 0.0 else False,
+ temperature=temperature,
+ top_p=top_p,
+ max_new_tokens=max_output_tokens)
+
+ return response
+
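+# Illustrative usage sketch for the Chat wrapper above (file path and prompt are hypothetical):
+#   chat = Chat('DAMO-NLP-SG/VideoLLaMA2-7B-16F', load_4bit=True)
+#   frames = chat.processor['video']('clip.mp4').to(chat.model.device, dtype=torch.float16)
+#   reply = chat.generate([(frames, '<video>')],
+#                         [{'role': 'user', 'content': 'Describe the clip.'}],
+#                         temperature=0.2, top_p=0.7, max_output_tokens=512)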
+
+# @spaces.GPU(duration=120)
+def generate(image, video, message, chatbot, textbox_in, temperature, top_p, max_output_tokens, dtype=torch.float16):
+ data = []
+
+ image = image if image else "none"
+ video = video if video else "none"
+ assert not (os.path.exists(image) and os.path.exists(video))
+
+ processor = handler.processor
+ if os.path.exists(image) and not os.path.exists(video):
+ data.append((processor['image'](image).to(handler.model.device, dtype=dtype), '<image>'))
+ if not os.path.exists(image) and os.path.exists(video):
+ data.append((processor['video'](video).to(handler.model.device, dtype=dtype), '<video>'))
+ if os.path.exists(image) and os.path.exists(video):
+ raise NotImplementedError("Image and video inputs are not supported at the same time")
+
+ assert len(message) % 2 == 0, "The message history should consist of alternating user/assistant turns."
+
+ message.append({'role': 'user', 'content': textbox_in})
+ text_en_out = handler.generate(data, message, temperature=temperature, top_p=top_p, max_output_tokens=max_output_tokens)
+ message.append({'role': 'assistant', 'content': text_en_out})
+
+ show_images = ""
+ if os.path.exists(image):
+ show_images += f' '
+ if os.path.exists(video):
+ show_images += f' '
+
+ chatbot.append([textbox_in + "\n" + show_images, text_en_out])
+
+ return (
+ gr.update(value=image if os.path.exists(image) else None, interactive=True),
+ gr.update(value=video if os.path.exists(video) else None, interactive=True),
+ message,
+ chatbot)
+
+
+def regenerate(message, chatbot):
+ message.pop(-1), message.pop(-1)
+ chatbot.pop(-1)
+ return message, chatbot
+
+
+def clear_history(message, chatbot):
+ message.clear(), chatbot.clear()
+ return (gr.update(value=None, interactive=True),
+ gr.update(value=None, interactive=True),
+ message, chatbot,
+ gr.update(value=None, interactive=True))
+
+
+# Known limitations of the Zero (ZeroGPU) environment:
+# 1. The environment is fixed to torch==2.0.1+cu117 and gradio>=4.x.x.
+# 2. Operations and tensors that require CUDA are only allowed inside functions wrapped with spaces.GPU.
+# 3. Such functions cannot return tensors or other CUDA objects.
+
+model_path = 'DAMO-NLP-SG/VideoLLaMA2-7B-16F'
+
+handler = Chat(model_path, load_8bit=False, load_4bit=True)
+
+textbox = gr.Textbox(show_label=False, placeholder="Enter text and press ENTER", container=False)
+
+theme = gr.themes.Default(primary_hue=plum_color)
+# theme.update_color("primary", plum_color.c500)
+theme.set(slider_color="#9C276A")
+theme.set(block_title_text_color="#9C276A")
+theme.set(block_label_text_color="#9C276A")
+theme.set(button_primary_text_color="#9C276A")
+# theme.set(button_secondary_text_color="*neutral_800")
+
+
+with gr.Blocks(title='VideoLLaMA 2 🔥🚀🔥', theme=theme, css=block_css) as demo:
+ gr.Markdown(title_markdown)
+ message = gr.State([])
+
+ with gr.Row():
+ with gr.Column(scale=3):
+ image = gr.Image(label="Input Image", type="filepath")
+ video = gr.Video(label="Input Video")
+
+ with gr.Accordion("Parameters", open=True) as parameter_row:
+ # num_beams = gr.Slider(
+ # minimum=1,
+ # maximum=10,
+ # value=1,
+ # step=1,
+ # interactive=True,
+ # label="beam search numbers",
+ # )
+
+ temperature = gr.Slider(
+ minimum=0.1,
+ maximum=1.0,
+ value=0.2,
+ step=0.1,
+ interactive=True,
+ label="Temperature",
+ )
+
+ top_p = gr.Slider(
+ minimum=0.0,
+ maximum=1.0,
+ value=0.7,
+ step=0.1,
+ interactive=True,
+ label="Top P",
+ )
+
+ max_output_tokens = gr.Slider(
+ minimum=64,
+ maximum=1024,
+ value=512,
+ step=64,
+ interactive=True,
+ label="Max output tokens",
+ )
+
+ with gr.Column(scale=7):
+ chatbot = gr.Chatbot(label="VideoLLaMA 2", bubble_full_width=True, height=750)
+ with gr.Row():
+ with gr.Column(scale=8):
+ textbox.render()
+ with gr.Column(scale=1, min_width=50):
+ submit_btn = gr.Button(value="Send", variant="primary", interactive=True)
+ with gr.Row(elem_id="buttons") as button_row:
+ upvote_btn = gr.Button(value="👍 Upvote", interactive=True)
+ downvote_btn = gr.Button(value="👎 Downvote", interactive=True)
+ # flag_btn = gr.Button(value="⚠️ Flag", interactive=True)
+ # stop_btn = gr.Button(value="⏹️ Stop Generation", interactive=False)
+ regenerate_btn = gr.Button(value="🔄 Regenerate", interactive=True)
+ clear_btn = gr.Button(value="🗑️ Clear history", interactive=True)
+
+ with gr.Row():
+ with gr.Column():
+ cur_dir = os.path.dirname(os.path.abspath(__file__))
+ gr.Examples(
+ examples=[
+ [
+ f"{cur_dir}/examples/extreme_ironing.jpg",
+ "What happens in this image?",
+ ],
+ [
+ f"{cur_dir}/examples/waterview.jpg",
+ "What are the things I should be cautious about when I visit here?",
+ ],
+ [
+ f"{cur_dir}/examples/desert.jpg",
+ "If there are factual errors in the questions, point it out; if not, proceed answering the question. What’s happening in the desert?",
+ ],
+ ],
+ inputs=[image, textbox],
+ )
+ with gr.Column():
+ gr.Examples(
+ examples=[
+ [
+ f"{cur_dir}/../../assets/cat_and_chicken.mp4",
+ "What happens in this video?",
+ ],
+ [
+ f"{cur_dir}/../../assets/sora.mp4",
+ "Please describe this video.",
+ ],
+ [
+ f"{cur_dir}/examples/sample_demo_1.mp4",
+ "What does the baby do?",
+ ],
+ ],
+ inputs=[video, textbox],
+ )
+
+ gr.Markdown(tos_markdown)
+ gr.Markdown(learn_more_markdown)
+
+ submit_btn.click(
+ generate,
+ [image, video, message, chatbot, textbox, temperature, top_p, max_output_tokens],
+ [image, video, message, chatbot])
+
+ regenerate_btn.click(
+ regenerate,
+ [message, chatbot],
+ [message, chatbot]).then(
+ generate,
+ [image, video, message, chatbot, textbox, temperature, top_p, max_output_tokens],
+ [image, video, message, chatbot])
+
+ clear_btn.click(
+ clear_history,
+ [message, chatbot],
+ [image, video, message, chatbot, textbox])
+
+demo.launch()
diff --git a/videollama2/serve/model_worker.py b/VideoLLaMA2/videollama2/serve/model_worker.py
similarity index 100%
rename from videollama2/serve/model_worker.py
rename to VideoLLaMA2/videollama2/serve/model_worker.py
diff --git a/videollama2/serve/register_worker.py b/VideoLLaMA2/videollama2/serve/register_worker.py
similarity index 100%
rename from videollama2/serve/register_worker.py
rename to VideoLLaMA2/videollama2/serve/register_worker.py
diff --git a/videollama2/serve/sglang_worker.py b/VideoLLaMA2/videollama2/serve/sglang_worker.py
similarity index 100%
rename from videollama2/serve/sglang_worker.py
rename to VideoLLaMA2/videollama2/serve/sglang_worker.py
diff --git a/videollama2/serve/test_message.py b/VideoLLaMA2/videollama2/serve/test_message.py
similarity index 100%
rename from videollama2/serve/test_message.py
rename to VideoLLaMA2/videollama2/serve/test_message.py
diff --git a/VideoLLaMA2/videollama2/train.py b/VideoLLaMA2/videollama2/train.py
new file mode 100644
index 0000000000000000000000000000000000000000..1a2b6c1c074ff0815669f51ec78bcfffda7dd1cd
--- /dev/null
+++ b/VideoLLaMA2/videollama2/train.py
@@ -0,0 +1,584 @@
+# Adopted from https://github.com/haotian-liu/LLaVA. Below is the original copyright:
+# Adopted from https://github.com/lm-sys/FastChat. Below is the original copyright:
+# Adopted from tatsu-lab@stanford_alpaca. Below is the original copyright:
+# Copyright 2023 Rohan Taori, Ishaan Gulrajani, Tianyi Zhang, Yann Dubois, Xuechen Li
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import re
+import os
+import copy
+import json
+import random
+import pathlib
+import traceback
+from dataclasses import dataclass, field
+from typing import Dict, Optional, Sequence, List
+
+# torch-related packages
+# NOTE: torch must be imported before transformers. Otherwise, `Segmentation fault (core dumped)` will occur.
+import torch
+from torch.utils.data import Dataset
+
+import transformers
+from transformers.models.mixtral.modeling_mixtral import MixtralSparseMoeBlock
+
+import sys
+sys.path.append('./')
+from videollama2.model import *
+from videollama2.constants import NUM_FRAMES, IGNORE_INDEX, MODAL_INDEX_MAP
+from videollama2.mm_utils import tokenizer_multimodal_token, process_video, process_image
+from videollama2.videollama2_trainer import (VideoLLaMA2Trainer,
+ get_peft_state_maybe_zero_3, get_peft_state_non_lora_maybe_zero_3,
+ find_all_linear_names, safe_save_model_for_hf_trainer
+)
+
+# NOTE: fast tokenizer warning issue: https://github.com/huggingface/transformers/issues/5486
+os.environ["TOKENIZERS_PARALLELISM"] = "true"
+
+local_rank = None
+
+
+def rank0_print(*args):
+ if local_rank == 0:
+ print(*args)
+
+
+def set_seed(seed=42):
+ """
+ Set the random seed for reproducible results.
+
+ :param seed: An integer value to be used as the random seed.
+ """
+ torch.manual_seed(seed)
+ torch.cuda.manual_seed(seed)
+ torch.cuda.manual_seed_all(seed) # for multi-GPU setups
+ torch.backends.cudnn.deterministic = True
+ torch.backends.cudnn.benchmark = False
+
+
+@dataclass
+class ModelArguments:
+ # LLM Arguments
+ model_type: Optional[str] = field(default="videollama2", metadata={"help": "Model type selected in the list: " + ", ".join(VLLMs.keys())})
+ model_path: Optional[str] = field(default="lmsys/vicuna-7b-v1.5")
+ version: Optional[str] = field(default="v1", metadata={"help": "Version of the conversation template."})
+ freeze_backbone: bool = field(default=False, metadata={"help": "Whether to freeze the LLM backbone."})
+ # Connector Arguments
+ mm_projector_type: Optional[str] = field(default='linear')
+ tune_mm_mlp_adapter: bool = field(default=False)
+ pretrain_mm_mlp_adapter: Optional[str] = field(default=None)
+ # Vision tower Arguments
+ vision_tower: Optional[str] = field(default=None)
+ mm_vision_select_layer: Optional[int] = field(default=-1)
+ mm_vision_select_feature: Optional[str] = field(default="patch")
+
+
+@dataclass
+class DataArguments:
+ # Path Arguments
+ data_path: str = field(default=None, metadata={"help": "Path to the training data."})
+ # image_folder: Optional[str] = field(default=None)
+ # video_folder: Optional[str] = field(default=None)
+ data_folder: Optional[str] = field(default=None)
+ # Loading Arguments
+ is_multimodal: bool = False
+ lazy_preprocess: bool = False
+ num_frames: Optional[int] = field(default=None)
+ # Preprocess Arguments
+ image_aspect_ratio: str = 'square'
+
+
+@dataclass
+class TrainingArguments(transformers.TrainingArguments):
+ optim: str = field(default="adamw_torch")
+ mm_projector_lr: Optional[float] = None
+ freeze_mm_mlp_adapter: bool = field(default=False)
+ remove_unused_columns: bool = field(default=False)
+ cache_dir: Optional[str] = field(default=None)
+ # Training Data Arguments
+ group_by_modality_length: bool = field(default=False)
+ model_max_length: int = field(
+ default=512,
+ metadata={
+ "help":
+ "Maximum sequence length. Sequences will be right padded (and possibly truncated)."
+ },
+ )
+ # Lora or Quant Arguments
+ double_quant: bool = field(
+ default=True,
+ metadata={"help": "Compress the quantization statistics through double quantization."}
+ )
+ quant_type: str = field(
+ default="nf4",
+ metadata={"help": "Quantization data type to use. Should be one of `fp4` or `nf4`."}
+ )
+ bits: int = field(
+ default=16,
+ metadata={"help": "How many bits to use."}
+ )
+ lora_enable: bool = False
+ lora_r: int = 64
+ lora_alpha: int = 16
+ lora_dropout: float = 0.05
+ lora_weight_path: str = ""
+ lora_bias: str = "none"
+
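+# Illustrative launch sketch (assumes the usual HfArgumentParser(ModelArguments, DataArguments,
+# TrainingArguments) entry point; model ids, paths, and values below are placeholders):
+#   torchrun --nproc_per_node=8 videollama2/train.py \
+#     --model_type videollama2_mistral --model_path mistralai/Mistral-7B-Instruct-v0.2 \
+#     --vision_tower openai/clip-vit-large-patch14-336 --mm_projector_type stc_connector \
+#     --data_path datasets/stage2.json --data_folder datasets/ \
+#     --bf16 True --output_dir work_dirs/videollama2_ft --model_max_length 2048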
+
+def preprocess_plain(
+ sources: Sequence[str],
+ tokenizer: transformers.PreTrainedTokenizer,
+ modal_token: str = None,
+) -> Dict:
+ roles = {"human": "user", "gpt": "assistant"}
+ conversations = []
+ input_ids = []
+ targets = []
+ for source in sources:
+ # 1. apply chat template for input conversation
+ assert len(source) == 2
+ assert modal_token in source[0]['value']
+ message = [
+ {'role': 'user', 'content': modal_token},
+ {'role': 'assistant', 'content': source[1]['value']}
+ ]
+ conversation = tokenizer.apply_chat_template(message, tokenize=False, add_generation_prompt=False)
+ # 2. tokenize conversations
+ input_ids.append(tokenizer_multimodal_token(conversation, tokenizer, modal_token, return_tensors='pt'))
+ # 3. make targets
+ targets.append(copy.deepcopy(input_ids[-1]))
+ instruction = tokenizer.apply_chat_template(message[:1], tokenize=False, add_generation_prompt=True)
+ instruction_len = len(tokenizer_multimodal_token(instruction, tokenizer, modal_token, return_tensors='pt'))
+ targets[-1][:instruction_len] = IGNORE_INDEX
+
+ # print("instruction: ----------------")
+ # print(instruction)
+ # print("conversation: ----------------")
+ # print(conversation)
+ # print("training targets: ----------------")
+ # print(tokenizer.decode(targets[-1][instruction_len:]))
+ # print(input_ids[-1])
+ # print(targets[-1])
+
+ return dict(input_ids=input_ids, labels=targets)
+
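+# Masking sketch for preprocess_plain above (schematic; tokens shown symbolically, assuming
+# modal_token = "<video>"): the pair collapses to a single-turn template and only the
+# assistant span contributes to the loss:
+#   input_ids: [ ...chat template..., <video>, ...assistant tokens..., <eos> ]
+#   labels:    [ IGNORE_INDEX x instruction_len, ...assistant tokens..., <eos> ]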
+
+def preprocess(
+ sources: Sequence[str],
+ tokenizer: transformers.PreTrainedTokenizer,
+ modal_token: str = None,
+) -> Dict:
+ roles = {"human": "user", "gpt": "assistant"}
+
+ # Apply prompt templates
+ conversations = []
+ input_ids = []
+ targets = []
+ for i, source in enumerate(sources):
+ if roles[source[0]["from"]] != "user":
+ # Skip the first one if it is not from human
+ source = source[1:]
+
+ message = [{'role': roles[sentence['from']], 'content': sentence['value']} for sentence in source]
+ conversation = tokenizer.apply_chat_template(message, tokenize=False, add_generation_prompt=False)
+ input_ids.append(tokenizer_multimodal_token(conversation, tokenizer, modal_token, return_tensors='pt'))
+ targets.append(copy.deepcopy(input_ids[-1]))
+
+ assert len(source) % 2 == 0, f"Invalid conversation length {len(source)}."
+
+ cur = 0
+ message = []
+ for idx, sentence in enumerate(source):
+ if idx % 2 == 1:
+ tmp_message = [
+ {'role': roles[source[idx-1]['from']], 'content': source[idx-1]['value']},
+ {'role': roles[sentence['from']], 'content': sentence['value']}
+ ]
+
+ instruction = tokenizer.apply_chat_template(message + tmp_message[:1], tokenize=False, add_generation_prompt=True)
+ conversation = tokenizer.apply_chat_template(message + tmp_message, tokenize=False, add_generation_prompt=False)
+
+ instruction_len = len(tokenizer_multimodal_token(instruction, tokenizer, modal_token, return_tensors='pt'))
+ conversation_len = len(tokenizer_multimodal_token(conversation, tokenizer, modal_token, return_tensors='pt'))
+
+ targets[-1][cur:instruction_len] = IGNORE_INDEX
+
+ cur = conversation_len
+ message += tmp_message
+
+ return dict(input_ids=input_ids, labels=targets)
+
+
+def preprocess_multimodal(
+ sources: Sequence[str],
+ data_args: DataArguments,
+ modal_token: str = None,
+) -> Dict:
+ is_multimodal = data_args.is_multimodal
+ if not is_multimodal:
+ return sources
+
+ assert modal_token in MODAL_INDEX_MAP, f"Unsupported modal token {modal_token}."
+
+ for source in sources:
+ for sentence in source:
+ if modal_token in sentence['value']:
+ sentence['value'] = sentence['value'].replace(modal_token, '').strip()
+ sentence['value'] = modal_token + '\n' + sentence['value']
+ sentence['value'] = sentence['value'].strip()
+ replace_token = modal_token
+ # TODO: fix this for multimedia, e.g., <image>, <video>, etc.
+ sentence["value"] = sentence["value"].replace(modal_token, replace_token)
+
+ return sources
+
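+# Illustrative effect of preprocess_multimodal: the modal token is stripped from wherever it
+# appears in the turn and re-attached at the head, e.g.
+#   "What happens in this clip? <video>"  ->  "<video>\nWhat happens in this clip?"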
+
+class LazySupervisedDataset(Dataset):
+ """Dataset for supervised fine-tuning."""
+
+ def __init__(self, data_path: str,
+ tokenizer: transformers.PreTrainedTokenizer,
+ data_args: DataArguments):
+ super(LazySupervisedDataset, self).__init__()
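+ # Each annotation entry is expected to look roughly like (paths are resolved against data_args.data_folder):
+ # {"image" or "video": "xxx", "conversations": [{"from": "human", "value": "<image>\n..."}, {"from": "gpt", "value": "..."}]}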
+ list_data_dict = json.load(open(data_path, "r"))
+
+ rank0_print("Formatting inputs...Skip in lazy mode")
+ self.tokenizer = tokenizer
+ self.list_data_dict = list_data_dict
+ self.data_args = data_args
+
+ def __len__(self):
+ return len(self.list_data_dict)
+
+ @property
+ def lengths(self):
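+ # Approximate sample lengths (word count plus a fixed image-token budget), used by length-grouped samplers.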
+ length_list = []
+ for sample in self.list_data_dict:
+ img_tokens = 576 if 'image' in sample else 0
+ length_list.append(sum(len(conv['value'].split()) for conv in sample['conversations']) + img_tokens)
+ return length_list
+
+ @property
+ def modality_lengths(self):
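+ # Signed lengths: positive for multimodal samples, negative for text-only ones, so the sampler can group by modality.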
+ length_list = []
+ for sample in self.list_data_dict:
+ cur_len = sum(len(conv['value'].split()) for conv in sample['conversations'])
+ cur_len = cur_len if 'image' in sample else -cur_len
+ length_list.append(cur_len)
+ return length_list
+
+ def __getitem__(self, i) -> Dict[str, torch.Tensor]:
+ sources = self.list_data_dict[i]
+ if isinstance(i, int):
+ sources = [sources]
+ assert len(sources) == 1, "Don't know why it is wrapped to a list" # FIXME
+
+ image_processor = self.data_args.image_processor
+ video_processor = self.data_args.video_processor
+
+ num_frames = NUM_FRAMES if self.data_args.num_frames is None else self.data_args.num_frames
+
+ if 'image' in sources[0]:
+ image_file = self.list_data_dict[i]['image']
+ image_folder = self.data_args.data_folder
+ image_file = os.path.join(image_folder, image_file)
+
+ try:
+ image = process_image(image_file, image_processor, aspect_ratio=self.data_args.image_aspect_ratio)
+ except Exception:
+ traceback.print_exc()
+ backup_idx = random.randint(0, len(self.list_data_dict) - 1)
+ print(f"Encountered error when reading image {image_file}, use {backup_idx}-th example instead!!!")
+ return self.__getitem__(backup_idx)
+
+ # place <image> tag at the question head.
+ modal_token = "<image>"
+ sources = preprocess_multimodal(copy.deepcopy([e["conversations"] for e in sources]), self.data_args, modal_token)
+ elif 'video' in sources[0]:
+ video_file = self.list_data_dict[i]['video']
+ video_folder = self.data_args.data_folder
+ video_file = os.path.join(video_folder, video_file)
+
+ try:
+ video = process_video(video_file, video_processor, aspect_ratio=self.data_args.image_aspect_ratio, num_frames=num_frames)
+ except Exception:
+ traceback.print_exc()
+ backup_idx = random.randint(0, len(self.list_data_dict) - 1)
+ print(f"Encountered error when reading video {video_file}, use {backup_idx}-th example instead!!!")
+ return self.__getitem__(backup_idx)
+
+ # place <video> tag at the question head.
+ modal_token = "<video>"
+ sources = preprocess_multimodal(copy.deepcopy([e["conversations"] for e in sources]), self.data_args, modal_token)
+ else:
+ modal_token = None
+ sources = copy.deepcopy([e["conversations"] for e in sources])
+
+ if self.data_args.is_pretraining:
+ data_dict = preprocess_plain(sources, self.tokenizer, modal_token=modal_token)
+ else:
+ data_dict = preprocess(sources, self.tokenizer, modal_token=modal_token)
+
+ if isinstance(i, int):
+ data_dict = dict(input_ids=data_dict["input_ids"][0], labels=data_dict["labels"][0])
+
+ # image exist in the data
+ if 'image' in self.list_data_dict[i]:
+ data_dict['image'] = image
+ elif 'video' in self.list_data_dict[i]:
+ data_dict['video'] = video
+ elif self.data_args.is_multimodal:
+ # image does not exist in the data, but the model is multimodal
+ crop_size = self.data_args.image_processor.crop_size
+ data_dict['image'] = torch.zeros(3, crop_size['height'], crop_size['width'])
+ return data_dict
+
+
+@dataclass
+class DataCollatorForSupervisedDataset(object):
+ """Collate examples for supervised fine-tuning."""
+
+ tokenizer: transformers.PreTrainedTokenizer
+
+ def __call__(self, instances: Sequence[Dict]) -> Dict[str, torch.Tensor]:
+ input_ids, labels = tuple([instance[key] for instance in instances]
+ for key in ("input_ids", "labels"))
+ input_ids = torch.nn.utils.rnn.pad_sequence(
+ input_ids,
+ batch_first=True,
+ padding_value=self.tokenizer.pad_token_id)
+ labels = torch.nn.utils.rnn.pad_sequence(labels,
+ batch_first=True,
+ padding_value=IGNORE_INDEX)
+ input_ids = input_ids[:, :self.tokenizer.model_max_length]
+ labels = labels[:, :self.tokenizer.model_max_length]
+ batch = dict(
+ input_ids=input_ids,
+ labels=labels,
+ attention_mask=input_ids.ne(self.tokenizer.pad_token_id),
+ )
+
+ # work for 'images' argument in `prepare_inputs_labels_for_multimodal` of LlavaMetaForCausalLM in llava_arch.py
+ batch['images'] = []
+ for instance in instances:
+ for modal_token in MODAL_INDEX_MAP.keys():
+ modal_token = modal_token.lower()
+ # MODAL_TOKEN shape like: <image>, <video>, ...
+ modal_name = re.findall(f'[<](.*)[>]', modal_token)
+ assert len(modal_name) == 1
+ modal_name = modal_name[0]
+ if modal_name in instance:
+ batch['images'].append((instance[modal_name], modal_name))
+
+ return batch
+
+
+def make_supervised_data_module(tokenizer: transformers.PreTrainedTokenizer,
+ data_args) -> Dict:
+ """Make dataset and collator for supervised fine-tuning."""
+ train_dataset = LazySupervisedDataset(
+ tokenizer=tokenizer,
+ data_path=data_args.data_path,
+ data_args=data_args
+ )
+ data_collator = DataCollatorForSupervisedDataset(tokenizer=tokenizer)
+ return dict(train_dataset=train_dataset,
+ eval_dataset=None,
+ data_collator=data_collator)
+
+
+def train(attn_implementation=None):
+ global local_rank
+ set_seed(42)
+
+ parser = transformers.HfArgumentParser((ModelArguments, DataArguments, TrainingArguments))
+ model_args, data_args, training_args = parser.parse_args_into_dataclasses()
+
+ local_rank = training_args.local_rank
+ compute_dtype = (torch.float16 if training_args.fp16 else (torch.bfloat16 if training_args.bf16 else torch.float32))
+
+ bnb_model_from_pretrained_args = {}
+ if training_args.bits in [4, 8]:
+ from transformers import BitsAndBytesConfig
+ bnb_model_from_pretrained_args.update(dict(
+ # device_map={"": training_args.device},
+ # BUG: High version transformers report error:
+ # ValueError: You can't pass `load_in_4bit`or `load_in_8bit` as a kwarg when passing `quantization_config` argument at the same time
+ # load_in_4bit=training_args.bits == 4,
+ # load_in_8bit=training_args.bits == 8,
+ quantization_config=BitsAndBytesConfig(
+ load_in_4bit=training_args.bits == 4,
+ load_in_8bit=training_args.bits == 8,
+ llm_int8_skip_modules=["mm_projector"],
+ llm_int8_threshold=6.0,
+ llm_int8_has_fp16_weight=False,
+ bnb_4bit_compute_dtype=compute_dtype,
+ bnb_4bit_use_double_quant=training_args.double_quant,
+ bnb_4bit_quant_type=training_args.quant_type, # {'fp4', 'nf4'}
+ bnb_4bit_quant_storage=compute_dtype,
+ )
+ ))
+
+ config = transformers.AutoConfig.from_pretrained(model_args.model_path, trust_remote_code=True)
+ if 'gemma2' in model_args.model_type:
+ config._attn_implementation = 'eager'
+ else:
+ config._attn_implementation = attn_implementation
+
+ if model_args.vision_tower is not None:
+ model = VLLMs[model_args.model_type].from_pretrained(
+ model_args.model_path,
+ config=config,
+ cache_dir=training_args.cache_dir,
+ torch_dtype=(torch.bfloat16 if training_args.bf16 else None),
+ do_sample=True,
+ **bnb_model_from_pretrained_args
+ )
+ if 'mixtral' in model_args.model_type:
+ import deepspeed
+ deepspeed.utils.set_z3_leaf_modules(model, [MixtralSparseMoeBlock])
+ else:
+ model = transformers.LlamaForCausalLM.from_pretrained(
+ model_args.model_path,
+ config=config,
+ cache_dir=training_args.cache_dir,
+ torch_dtype=(torch.bfloat16 if training_args.bf16 else None),
+ do_sample=True,
+ **bnb_model_from_pretrained_args
+ )
+ model.config.use_cache = False
+
+ if model_args.freeze_backbone:
+ model.model.requires_grad_(False)
+
+ if training_args.bits in [4, 8]:
+ from peft import prepare_model_for_kbit_training
+ model.config.torch_dtype=(torch.float32 if training_args.fp16 else (torch.bfloat16 if training_args.bf16 else torch.float32))
+ model = prepare_model_for_kbit_training(model, use_gradient_checkpointing=training_args.gradient_checkpointing)
+
+ if training_args.gradient_checkpointing:
+ if hasattr(model, "enable_input_require_grads"):
+ model.enable_input_require_grads()
+ else:
+ def make_inputs_require_grad(module, input, output):
+ output.requires_grad_(True)
+ model.get_input_embeddings().register_forward_hook(make_inputs_require_grad)
+
+ if training_args.lora_enable:
+ from peft import LoraConfig, get_peft_model
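+ # Attach LoRA adapters to all linear layers of the language model; multimodal modules are excluded by find_all_linear_names.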
+ lora_config = LoraConfig(
+ r=training_args.lora_r,
+ lora_alpha=training_args.lora_alpha,
+ target_modules=find_all_linear_names(model),
+ lora_dropout=training_args.lora_dropout,
+ bias=training_args.lora_bias,
+ task_type="CAUSAL_LM",
+ )
+ if training_args.bits == 16:
+ if training_args.bf16:
+ model.to(torch.bfloat16)
+ if training_args.fp16:
+ model.to(torch.float16)
+ rank0_print("Adding LoRA adapters...")
+ model = get_peft_model(model, lora_config)
+
+
+ tokenizer = transformers.AutoTokenizer.from_pretrained(
+ model_args.model_path,
+ cache_dir=training_args.cache_dir,
+ model_max_length=training_args.model_max_length,
+ padding_side="right",
+ use_fast=True,
+ )
+
+ if tokenizer.pad_token is None:
+ tokenizer.pad_token = tokenizer.unk_token
+
+ if model_args.vision_tower is not None:
+ # initialize vision encoder + multi-modal projector
+ model.get_model().initialize_vision_modules(model_args=model_args, fsdp=training_args.fsdp)
+
+ vision_tower = model.get_vision_tower()
+ vision_tower.to(dtype=torch.bfloat16 if training_args.bf16 else torch.float16, device=training_args.device)
+
+ data_args.image_processor = vision_tower.image_processor
+ data_args.video_processor = vision_tower.video_processor if hasattr(vision_tower, "video_processor") else vision_tower.image_processor
+
+ data_args.is_multimodal = True
+
+ model.config.image_aspect_ratio = data_args.image_aspect_ratio
+ model.config.tokenizer_padding_side = tokenizer.padding_side
+ model.config.tokenizer_model_max_length = tokenizer.model_max_length
+
+ model.config.tune_mm_mlp_adapter = training_args.tune_mm_mlp_adapter = model_args.tune_mm_mlp_adapter
+ if model_args.tune_mm_mlp_adapter:
+ model.requires_grad_(False)
+ for p in model.get_model().mm_projector.parameters():
+ p.requires_grad = True
+
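+ # Projector-only tuning corresponds to the pretraining (alignment) stage, which uses the plain caption-style preprocessing.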
+ if model_args.tune_mm_mlp_adapter:
+ data_args.is_pretraining = True
+ else:
+ data_args.is_pretraining = False
+
+ model.config.freeze_mm_mlp_adapter = training_args.freeze_mm_mlp_adapter
+ if training_args.freeze_mm_mlp_adapter:
+ for p in model.get_model().mm_projector.parameters():
+ p.requires_grad = False
+
+ if training_args.bits in [4, 8]:
+ model.get_model().mm_projector.to(dtype=compute_dtype, device=training_args.device)
+
+ model.config.mm_projector_lr = training_args.mm_projector_lr
+ model.config.num_frames = NUM_FRAMES if data_args.num_frames is None else data_args.num_frames
+
+ if training_args.bits in [4, 8]:
+ from peft.tuners.lora import LoraLayer
+ for name, module in model.named_modules():
+ if isinstance(module, LoraLayer):
+ if training_args.bf16:
+ module = module.to(torch.bfloat16)
+ if 'norm' in name:
+ module = module.to(torch.float32)
+ if 'lm_head' in name or 'embed_tokens' in name:
+ if hasattr(module, 'weight'):
+ if training_args.bf16 and module.weight.dtype == torch.float32:
+ module = module.to(torch.bfloat16)
+
+ print("Current model:", model)
+ data_module = make_supervised_data_module(tokenizer=tokenizer, data_args=data_args)
+ # select a Trainer
+ trainer = VideoLLaMA2Trainer(model=model, tokenizer=tokenizer, args=training_args, **data_module)
+
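+ # Resume from the latest checkpoint if one already exists in the output directory.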
+ if list(pathlib.Path(training_args.output_dir).glob("checkpoint-*")):
+ trainer.train(resume_from_checkpoint=True)
+ else:
+ trainer.train()
+ trainer.save_state()
+
+ model.config.use_cache = True
+
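+ # With LoRA enabled, save the adapter weights and the remaining trainable (non-LoRA) parameters separately.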
+ if training_args.lora_enable:
+ state_dict = get_peft_state_maybe_zero_3(model.named_parameters(), training_args.lora_bias)
+ non_lora_state_dict = get_peft_state_non_lora_maybe_zero_3(model.named_parameters())
+ if training_args.local_rank == 0 or training_args.local_rank == -1:
+ model.config.save_pretrained(training_args.output_dir)
+ model.save_pretrained(training_args.output_dir, state_dict=state_dict)
+ torch.save(non_lora_state_dict, os.path.join(training_args.output_dir, 'non_lora_trainables.bin'))
+ else:
+ safe_save_model_for_hf_trainer(trainer=trainer, output_dir=training_args.output_dir)
+
+
+if __name__ == "__main__":
+ train()
diff --git a/videollama2/train_flash_attn.py b/VideoLLaMA2/videollama2/train_flash_attn.py
similarity index 100%
rename from videollama2/train_flash_attn.py
rename to VideoLLaMA2/videollama2/train_flash_attn.py
diff --git a/videollama2/utils.py b/VideoLLaMA2/videollama2/utils.py
similarity index 100%
rename from videollama2/utils.py
rename to VideoLLaMA2/videollama2/utils.py
diff --git a/videollama2/videollama2_trainer.py b/VideoLLaMA2/videollama2/videollama2_trainer.py
similarity index 69%
rename from videollama2/videollama2_trainer.py
rename to VideoLLaMA2/videollama2/videollama2_trainer.py
index cda7cf12d51fb40e4d1e13b4b61a2c45e9544d2d..6e129352c4c2b3f7fb938fcc890da8939e3b73eb 100644
--- a/videollama2/videollama2_trainer.py
+++ b/VideoLLaMA2/videollama2/videollama2_trainer.py
@@ -1,5 +1,6 @@
# Adopted from: https://github.com/haotian-liu/LLaVA/blob/main/llava/train/llava_trainer.py
import os
+import logging
from typing import List, Optional
import torch
@@ -23,7 +24,7 @@ def maybe_zero_3(param, ignore_status=False, name=None):
if hasattr(param, "ds_id"):
if param.ds_status == ZeroParamStatus.NOT_AVAILABLE:
if not ignore_status:
- print(name, 'no ignore status')
+ logging.warning(f"{name}: param.ds_status != ZeroParamStatus.NOT_AVAILABLE: {param.ds_status}")
with zero.GatheredParameters([param]):
param = param.data.detach().cpu().clone()
else:
@@ -37,6 +38,93 @@ def get_mm_adapter_state_maybe_zero_3(named_params, keys_to_match):
return to_return
+# Borrowed from peft.utils.get_peft_model_state_dict
+def get_peft_state_maybe_zero_3(named_params, bias):
+ if bias == "none":
+ to_return = {k: t for k, t in named_params if "lora_" in k}
+ elif bias == "all":
+ to_return = {k: t for k, t in named_params if "lora_" in k or "bias" in k}
+ elif bias == "lora_only":
+ to_return = {}
+ maybe_lora_bias = {}
+ lora_bias_names = set()
+ for k, t in named_params:
+ if "lora_" in k:
+ to_return[k] = t
+ bias_name = k.split("lora_")[0] + "bias"
+ lora_bias_names.add(bias_name)
+ elif "bias" in k:
+ maybe_lora_bias[k] = t
+ for k, t in maybe_lora_bias.items():
+ if k in lora_bias_names:
+ to_return[k] = t
+ else:
+ raise NotImplementedError
+ to_return = {k: maybe_zero_3(v, ignore_status=True) for k, v in to_return.items()}
+ return to_return
+
+
+def get_peft_state_non_lora_maybe_zero_3(named_params, require_grad_only=True):
+ to_return = {k: t for k, t in named_params if "lora_" not in k}
+ if require_grad_only:
+ to_return = {k: t for k, t in to_return.items() if t.requires_grad}
+ to_return = {k: maybe_zero_3(v, ignore_status=True).cpu() for k, v in to_return.items()}
+ return to_return
+
+
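+# Collect the names of all nn.Linear modules in the language model (skipping multimodal components) to use as LoRA targets.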
+def find_all_linear_names(model):
+ cls = torch.nn.Linear
+ lora_module_names = set()
+ multimodal_keywords = ['mm_projector', 'vision_tower', 'vision_resampler']
+ for name, module in model.named_modules():
+ if any(mm_keyword in name for mm_keyword in multimodal_keywords):
+ continue
+ if isinstance(module, cls):
+ names = name.split('.')
+ lora_module_names.add(names[0] if len(names) == 1 else names[-1])
+
+ if 'lm_head' in lora_module_names: # needed for 16-bit
+ lora_module_names.remove('lm_head')
+ return list(lora_module_names)
+
+
+def safe_save_model_for_hf_trainer(trainer: Trainer,
+ output_dir: str):
+ """Collects the state dict and dump to disk."""
+
+ if getattr(trainer.args, "tune_mm_mlp_adapter", False):
+ # Only save Adapter
+ keys_to_match = ['mm_projector']
+
+ weight_to_save = get_mm_adapter_state_maybe_zero_3(trainer.model.named_parameters(), keys_to_match)
+ trainer.model.config.save_pretrained(output_dir)
+
+ current_folder = output_dir.split('/')[-1]
+ parent_folder = os.path.dirname(output_dir)
+ if trainer.args.local_rank == 0 or trainer.args.local_rank == -1:
+ if current_folder.startswith('checkpoint-'):
+ mm_projector_folder = os.path.join(parent_folder, "mm_projector")
+ os.makedirs(mm_projector_folder, exist_ok=True)
+ torch.save(weight_to_save, os.path.join(mm_projector_folder, f'{current_folder}.bin'))
+ else:
+ torch.save(weight_to_save, os.path.join(output_dir, 'mm_projector.bin'))
+ return
+
+ if trainer.deepspeed:
+ torch.cuda.synchronize()
+ trainer.save_model(output_dir)
+ return
+
+ state_dict = trainer.model.state_dict()
+ if trainer.args.should_save:
+ cpu_state_dict = {
+ key: value.cpu()
+ for key, value in state_dict.items()
+ }
+ del state_dict
+ trainer._save(output_dir, state_dict=cpu_state_dict) # noqa
+
+
def split_to_even_chunks(indices, lengths, num_chunks):
"""
Split a list of indices into `chunks` chunks of roughly equal lengths.
@@ -239,8 +327,6 @@ class VideoLLaMA2Trainer(Trainer):
# Only save Adapter
keys_to_match = ['mm_projector', 'vision_resampler']
- if getattr(self.args, "use_im_start_end", False):
- keys_to_match.extend(['embed_tokens', 'embed_in'])
weight_to_save = get_mm_adapter_state_maybe_zero_3(self.model.named_parameters(), keys_to_match)
@@ -254,7 +340,27 @@ class VideoLLaMA2Trainer(Trainer):
self.state.save_to_json(os.path.join(output_dir, TRAINER_STATE_NAME))
self.args.distributed_state.wait_for_everyone()
else:
- super(VideoLLaMA2Trainer, self)._save_checkpoint(model, trial, metrics)
+ # NOTE: Supporting save complete lora checkpoint during training.
+ if self.args.lora_enable:
+ from transformers.trainer_utils import PREFIX_CHECKPOINT_DIR
+ checkpoint_folder = f"{PREFIX_CHECKPOINT_DIR}-{self.state.global_step}"
+
+ run_dir = self._get_output_dir(trial=trial)
+ output_dir = os.path.join(run_dir, checkpoint_folder)
+
+ state_dict = get_peft_state_maybe_zero_3(self.model.named_parameters(), self.args.lora_bias)
+ non_lora_state_dict = get_peft_state_non_lora_maybe_zero_3(self.model.named_parameters())
+ if self.args.local_rank == 0 or self.args.local_rank == -1:
+ # save for acquiring `config.json`
+ self.model.config.save_pretrained(output_dir)
+ # save for acquiring `adapter_config.json`, `adapter_model.bin`
+ # self.model.save_pretrained(output_dir, state_dict=state_dict)
+ torch.save(non_lora_state_dict, os.path.join(output_dir, 'non_lora_trainables.bin'))
+
+ # save for acquiring lora adapter parameters & trainer states: `adapter_config.json`, `adapter_model.safetensors`
+ super(VideoLLaMA2Trainer, self)._save_checkpoint(model, trial, metrics)
+ else:
+ super(VideoLLaMA2Trainer, self)._save_checkpoint(model, trial, metrics)
def _save(self, output_dir: Optional[str] = None, state_dict=None):
if getattr(self.args, 'tune_mm_mlp_adapter', False):
diff --git a/app.py b/app.py
index 0e38a06ee5a080d89bdb985a2bfc7a580d6fb212..91081c35cfd80153ca6090ee1493d7a2b72f6fd2 100644
--- a/app.py
+++ b/app.py
@@ -6,11 +6,9 @@ import torch
import gradio as gr
import sys
-sys.path.append('./')
-from videollama2.constants import NUM_FRAMES, MMODAL_TOKEN_INDEX, DEFAULT_MMODAL_TOKEN
-from videollama2.conversation import conv_templates, SeparatorStyle, Conversation
-from videollama2.model.builder import load_pretrained_model
-from videollama2.mm_utils import KeywordsStoppingCriteria, tokenizer_MMODAL_token, get_model_name_from_path, process_image, process_video
+sys.path.append('./VideoLLaMA2')
+from videollama2 import model_init, mm_infer
+from videollama2.utils import disable_torch_init
title_markdown = ("""
@@ -76,113 +74,49 @@ plum_color = gr.themes.colors.Color(
class Chat:
- def __init__(self, model_path, conv_mode, model_base=None, load_8bit=False, load_4bit=False):
- # disable_torch_init()
- model_name = get_model_name_from_path(model_path)
- self.tokenizer, self.model, processor, context_len = load_pretrained_model(
- model_path, model_base, model_name,
- load_8bit, load_4bit,
- offload_folder="save_folder")
- self.processor = processor
- self.conv_mode = conv_mode
- self.conv = conv_templates[conv_mode].copy()
-
- def get_prompt(self, qs, state):
- state.append_message(state.roles[0], qs)
- state.append_message(state.roles[1], None)
- return state
+
+ def __init__(self, model_path, load_8bit=False, load_4bit=False):
+ disable_torch_init()
+
+ self.model, self.processor, self.tokenizer = model_init(model_path, load_8bit=load_8bit, load_4bit=load_4bit)
@spaces.GPU(duration=120)
@torch.inference_mode()
- def generate(self, tensor: list, modals: list, prompt: str, first_run: bool, state, temperature, top_p, max_output_tokens):
+ def generate(self, data: list, message, temperature, top_p, max_output_tokens):
# TODO: support multiple turns of conversation.
- assert len(tensor) == len(modals)
-
- # 1. prepare model, tokenizer, and processor.
- tokenizer, model, processor = self.tokenizer, self.model, self.processor
-
- # 2. text preprocess (tag process & generate prompt).
- state = self.get_prompt(prompt, state)
- prompt = state.get_prompt()
-
- input_ids = tokenizer_MMODAL_token(prompt, tokenizer, MMODAL_TOKEN_INDEX[modals[0]], return_tensors='pt')
- input_ids = input_ids.unsqueeze(0).to(self.model.device)
-
- # 3. generate response according to visual signals and prompts.
- stop_str = self.conv.sep if self.conv.sep_style in [SeparatorStyle.SINGLE] else self.conv.sep2
- # keywords = ["", " "]
- keywords = [stop_str]
- stopping_criteria = KeywordsStoppingCriteria(keywords, tokenizer, input_ids)
-
- with torch.inference_mode():
- output_ids = model.generate(
- input_ids,
- images_or_videos=tensor,
- modal_list=modals,
- do_sample=True,
- temperature=temperature,
- top_p=top_p,
- max_new_tokens=max_output_tokens,
- use_cache=True,
- stopping_criteria=[stopping_criteria],
- )
+ assert len(data) == 1
+
+ tensor, modal = data[0]
+ response = mm_infer(tensor, message, self.model, self.tokenizer, modal=modal.strip('<>'),
+ do_sample=True if temperature > 0.0 else False,
+ temperature=temperature,
+ top_p=top_p,
+ max_new_tokens=max_output_tokens)
- outputs = tokenizer.batch_decode(output_ids, skip_special_tokens=True)[0]
- print(outputs)
- return outputs, state
+ return response
@spaces.GPU(duration=120)
-def generate(image, video, state, state_, textbox_in, temperature, top_p, max_output_tokens, dtype=torch.float16):
- if not textbox_in:
- if len(state_.messages) > 0:
- textbox_in = state_.messages[-1][1]
- state_.messages.pop(-1)
- else:
- assert "Please enter instruction"
+def generate(image, video, message, chatbot, textbox_in, temperature, top_p, max_output_tokens, dtype=torch.float16):
+ data = []
image = image if image else "none"
video = video if video else "none"
assert not (os.path.exists(image) and os.path.exists(video))
- tensor = []
- modals = []
-
- if type(state) is not Conversation:
- state = conv_templates[conv_mode].copy()
- state_ = conv_templates[conv_mode].copy()
-
- first_run = False if len(state.messages) > 0 else True
-
- text_en_in = textbox_in.replace("picture", "image")
-
- num_frames = handler.model.config.num_frames if hasattr(handler.model.config, "num_frames") else NUM_FRAMES
-
processor = handler.processor
if os.path.exists(image) and not os.path.exists(video):
- tensor.append(process_image(image, processor).to(handler.model.device, dtype=dtype))
- modals.append('IMAGE')
+ data.append((processor['image'](image).to(handler.model.device, dtype=dtype), '<image>'))
if not os.path.exists(image) and os.path.exists(video):
- tensor.append(process_video(video, processor, num_frames=num_frames, sample_scheme='fps').to(handler.model.device, dtype=dtype))
- modals.append('VIDEO')
+ data.append((processor['video'](video).to(handler.model.device, dtype=dtype), '<video>'))
if os.path.exists(image) and os.path.exists(video):
raise NotImplementedError("Not support image and video at the same time")
- # BUG: Only support single video and image inference now.
- if os.path.exists(image) and not os.path.exists(video):
- text_en_in = text_en_in.replace(DEFAULT_MMODAL_TOKEN['IMAGE'], '').strip()
- text_en_in = DEFAULT_MMODAL_TOKEN['IMAGE'] + '\n' + text_en_in
- if not os.path.exists(image) and os.path.exists(video):
- text_en_in = text_en_in.replace(DEFAULT_MMODAL_TOKEN['VIDEO'], '').strip()
- text_en_in = DEFAULT_MMODAL_TOKEN['VIDEO'] + '\n' + text_en_in
- if os.path.exists(image) and os.path.exists(video):
- text_en_in = text_en_in.replace(DEFAULT_MMODAL_TOKEN['VIDEO'], '').strip()
- text_en_in = DEFAULT_MMODAL_TOKEN['VIDEO'] + '\n' + text_en_in
- text_en_out, state_ = handler.generate(tensor, modals, text_en_in, first_run=first_run, state=state_, temperature=temperature, top_p=top_p, max_output_tokens=max_output_tokens)
- state_.messages[-1] = (state_.roles[1], text_en_out)
+ assert len(message) % 2 == 0, "The conversation history should consist of paired user and assistant messages."
- text_en_out = text_en_out.split('#')[0]
- textbox_out = text_en_out
+ message.append({'role': 'user', 'content': textbox_in})
+ text_en_out = handler.generate(data, message, temperature=temperature, top_p=top_p, max_output_tokens=max_output_tokens)
+ message.append({'role': 'assistant', 'content': text_en_out})
show_images = ""
if os.path.exists(image):
@@ -190,32 +124,26 @@ def generate(image, video, state, state_, textbox_in, temperature, top_p, max_ou
if os.path.exists(video):
show_images += f' '
- state.append_message(state.roles[0], textbox_in + "\n" + show_images)
- state.append_message(state.roles[1], textbox_out)
-
- # BUG: only support single turn conversation now.
- state_.messages.pop(-1)
- state_.messages.pop(-1)
+ chatbot.append([textbox_in + "\n" + show_images, text_en_out])
- return (gr.update(value=image if os.path.exists(image) else None, interactive=True),
- gr.update(value=video if os.path.exists(video) else None, interactive=True),
- state.to_gradio_chatbot(), state, state_)
+ return (
+ gr.update(value=image if os.path.exists(image) else None, interactive=True),
+ gr.update(value=video if os.path.exists(video) else None, interactive=True),
+ message,
+ chatbot)
-def regenerate(state, state_):
- state.messages.pop(-1)
- state.messages.pop(-1)
- if len(state.messages) > 0:
- return state.to_gradio_chatbot(), state, state_
- return state.to_gradio_chatbot(), state, state_
+def regenerate(message, chatbot):
+ message.pop(-1), message.pop(-1)
+ chatbot.pop(-1)
+ return message, chatbot
-def clear_history(state, state_):
- state = conv_templates[conv_mode].copy()
- state_ = conv_templates[conv_mode].copy()
+def clear_history(message, chatbot):
+ message.clear(), chatbot.clear()
return (gr.update(value=None, interactive=True),
gr.update(value=None, interactive=True),
- state.to_gradio_chatbot(), state, state_,
+ message, chatbot,
gr.update(value=None, interactive=True))
@@ -242,8 +170,7 @@ theme.set(button_primary_text_color="#9C276A")
with gr.Blocks(title='VideoLLaMA 2 🔥🚀🔥', theme=theme, css=block_css) as demo:
gr.Markdown(title_markdown)
- state = gr.State()
- state_ = gr.State()
+ message = gr.State([])
with gr.Row():
with gr.Column(scale=3):
@@ -346,20 +273,20 @@ with gr.Blocks(title='VideoLLaMA 2 🔥🚀🔥', theme=theme, css=block_css) as
submit_btn.click(
generate,
- [image, video, state, state_, textbox, temperature, top_p, max_output_tokens],
- [image, video, chatbot, state, state_])
+ [image, video, message, chatbot, textbox, temperature, top_p, max_output_tokens],
+ [image, video, message, chatbot])
regenerate_btn.click(
regenerate,
- [state, state_],
- [chatbot, state, state_]).then(
+ [message, chatbot],
+ [message, chatbot]).then(
generate,
- [image, video, state, state_, textbox, temperature, top_p, max_output_tokens],
- [image, video, chatbot, state, state_])
+ [image, video, message, chatbot, textbox, temperature, top_p, max_output_tokens],
+ [image, video, message, chatbot])
clear_btn.click(
clear_history,
- [state, state_],
- [image, video, chatbot, state, state_, textbox])
+ [message, chatbot],
+ [image, video, message, chatbot, textbox])
demo.launch()
diff --git a/videollama2/__init__.py b/videollama2/__init__.py
deleted file mode 100644
index 88b571df437fde7076e606ea9a87a45a077a82e9..0000000000000000000000000000000000000000
--- a/videollama2/__init__.py
+++ /dev/null
@@ -1 +0,0 @@
-from .model import Videollama2LlamaForCausalLM, Videollama2MistralForCausalLM
diff --git a/videollama2/constants.py b/videollama2/constants.py
deleted file mode 100644
index 3ebc326292c5664c1ab5304470708c4f807c84f3..0000000000000000000000000000000000000000
--- a/videollama2/constants.py
+++ /dev/null
@@ -1,38 +0,0 @@
-CONTROLLER_HEART_BEAT_EXPIRATION = 30
-WORKER_HEART_BEAT_INTERVAL = 15
-
-LOGDIR = "./log_dir"
-
-NUM_FRAMES = 8
-MAX_FRAMES = 32
-NUM_FRAMES_PER_SECOND = 1
-Grids = [(2, 2), (1, 2), (1, 3), (1, 4), (2, 1), (3, 1), (4, 1)]
-
-# Model Constants
-IGNORE_INDEX = -100
-IMAGE_TOKEN_INDEX = -200
-DEFAULT_IMAGE_TOKEN = ""
-DEFAULT_VIDEO_TOKEN = ""
-DEFAULT_IMAGE_PATCH_TOKEN = ""
-DEFAULT_IM_START_TOKEN = ""
-DEFAULT_IM_END_TOKEN = ""
-IMAGE_PLACEHOLDER = ""
-
-
-DEFAULT_IMAGE_TOKEN = ""
-DEFAULT_IMAGE_PATCH_TOKEN = ""
-DEFAULT_IM_START_TOKEN = ""
-DEFAULT_IM_END_TOKEN = ""
-IMAGE_PLACEHOLDER = ""
-
-
-MMODAL_TOKEN_INDEX = {"IMAGE": -200, "VIDEO": -201, "AUDIO": -202}
-MMODAL_INDEX_TOKEN = {v: k for k, v in MMODAL_TOKEN_INDEX.items()}
-MMODAL_START_TOKEN_INDEX = {"IMAGE": "", "VIDEO": "", "AUDIO": ""}
-MMODAL_END_TOKEN_INDEX = {"IMAGE": "", "VIDEO": "", "AUDIO": ""}
-
-
-DEFAULT_MMODAL_TOKEN = {"IMAGE": "", "VIDEO": "", "AUDIO": ""}
-DEFAULT_MMODAL_PATCH_TOKEN = {"IMAGE": "", "VIDEO": "", "AUDIO": ""}
-DEFAULT_MMODAL_START_TOKEN = {"IMAGE": "", "VIDEO": "", "AUDIO": ""}
-DEFAULT_MMODAL_END_TOKEN = {"IMAGE": "<\Image>", "VIDEO": "<\Video>", "AUDIO": "<\Audio>"}
\ No newline at end of file
diff --git a/videollama2/eval/run_inference_video_qa_batch.py b/videollama2/eval/run_inference_video_qa_batch.py
deleted file mode 100644
index 2d0c61ea75d1b0e13598e6cc59740139ad081507..0000000000000000000000000000000000000000
--- a/videollama2/eval/run_inference_video_qa_batch.py
+++ /dev/null
@@ -1,563 +0,0 @@
-import os
-import re
-import math
-import json
-import argparse
-import warnings
-
-import torch
-import decord
-import numpy as np
-import transformers
-from PIL import Image
-from tqdm import tqdm
-from decord import VideoReader, cpu
-from torch.utils.data import Dataset, DataLoader
-from torchvision import transforms as T
-from torchvision.transforms import functional as F
-
-import sys
-sys.path.append('./')
-from videollama2.conversation import conv_templates, SeparatorStyle
-from videollama2.constants import NUM_FRAMES, DEFAULT_MMODAL_TOKEN, DEFAULT_MMODAL_START_TOKEN, DEFAULT_MMODAL_END_TOKEN, MMODAL_TOKEN_INDEX
-from videollama2.mm_utils import get_model_name_from_path, tokenizer_MMODAL_token, KeywordsStoppingCriteria, process_videos, expand2square
-from videollama2.model.builder import load_pretrained_model
-
-
-# NOTE: Ignore TypedStorage warning, which refers to this link~(https://github.com/pytorch/pytorch/issues/97207#issuecomment-1494781560)
-warnings.filterwarnings('ignore', category=UserWarning, message='TypedStorage is deprecated')
-
-default_mm_token = DEFAULT_MMODAL_TOKEN["VIDEO"]
-default_mm_start_token = DEFAULT_MMODAL_START_TOKEN["VIDEO"]
-default_mm_end_token = DEFAULT_MMODAL_END_TOKEN["VIDEO"]
-modal_token_index = MMODAL_TOKEN_INDEX["VIDEO"]
-
-
-def split_list(lst, n):
- """Split a list into n (roughly) equal-sized chunks"""
- chunk_size = math.ceil(len(lst) / n) # integer division
- return [lst[i:i+chunk_size] for i in range(0, len(lst), chunk_size)]
-
-
-def get_chunk(lst, n, k):
- chunks = split_list(lst, n)
- return chunks[k]
-
-
-class MVBenchDataset(Dataset):
-
- def __init__(self, data_list, processor, num_segments=8):
- self.data_list = data_list
-
- self.decord_method = {
- 'video': self.read_video,
- 'gif': self.read_gif,
- 'frame': self.read_frame,
- }
-
- self.processor = processor
- self.num_segments = num_segments
-
- def __str__(self):
- len_list = {}
- option_list = {}
- for data in self.data_list:
- if data['task_type'] not in len_list:
- len_list[data['task_type']] = 0
- len_list[data['task_type']] += 1
- if data['task_type'] not in option_list:
- option_list[data['task_type']] = 0
- option_list[data['task_type']] += len(data['data']['candidates'])
-
- correct = 0
- total = 0
- res = f"There are {len(self.data_list)} videos as follow:\n"
- for k, v in len_list.items():
- correct += len_list[k]
- total += option_list[k]
- res += f"{v} for {k} ({option_list[k]} options => {len_list[k]/option_list[k]*100:.2f}%)\n"
- correct = correct + 1 / option_list[k]
- res += f"Total random accuracy: {correct/total*100:.2f}%"
- return res.rstrip()
-
- def __len__(self):
- return len(self.data_list)
-
- def get_index(self, bound, fps, max_frame, first_idx=0):
- if bound:
- start, end = bound[0], bound[1]
- else:
- start, end = -100000, 100000
- start_idx = max(first_idx, round(start * fps))
- end_idx = min(round(end * fps), max_frame)
- seg_size = float(end_idx - start_idx) / self.num_segments
- frame_indices = np.array([
- int(start_idx + (seg_size / 2) + np.round(seg_size * idx))
- for idx in range(self.num_segments)
- ])
- return frame_indices
-
- def read_video(self, video_path, bound=None):
- vr = VideoReader(video_path, ctx=cpu(0), num_threads=1)
- max_frame = len(vr) - 1
- fps = float(vr.get_avg_fps())
-
- images_group = list()
- frame_indices = self.get_index(bound, fps, max_frame, first_idx=0)
- for frame_index in frame_indices:
- img = Image.fromarray(vr[frame_index].asnumpy())
- images_group.append(img)
- # images_group = [expand2square(img, tuple(int(x*255) for x in self.processor.image_mean)) for img in images_group]
- torch_imgs = self.processor(images_group, return_tensors='pt')['pixel_values']
- return torch_imgs
-
- def read_gif(self, video_path, bound=None, fps=25):
- gif = imageio.get_reader(video_path)
- max_frame = len(gif) - 1
-
- images_group = list()
- frame_indices = self.get_index(bound, fps, max_frame, first_idx=0)
- for index, frame in enumerate(gif):
- if index in frame_indices:
- img = cv2.cvtColor(frame, cv2.COLOR_RGBA2RGB)
- img = Image.fromarray(img)
- images_group.append(img)
- # images_group = [expand2square(img, tuple(int(x*255) for x in self.processor.image_mean)) for img in images_group]
- torch_imgs = self.processor(images_group, return_tensors='pt')['pixel_values']
- return torch_imgs
-
- def read_frame(self, video_path, bound=None, fps=3):
- max_frame = len(os.listdir(video_path))
- images_group = list()
- frame_indices = self.get_index(bound, fps, max_frame, first_idx=1) # frame_idx starts from 1
- for frame_index in frame_indices:
- img = Image.open(os.path.join(video_path, f"{frame_index:05d}.jpg"))
- images_group.append(img)
- # images_group = [expand2square(img, tuple(int(x*255) for x in self.processor.image_mean)) for img in images_group]
- torch_imgs = self.processor.preprocess(images_group, return_tensors='pt')['pixel_values']
- return torch_imgs
-
- def qa_template(self, data):
- question = f"Question: {data['question']}\n"
- question += "Options:\n"
- answer = data['answer']
- answer_idx = -1
- for idx, c in enumerate(data['candidates']):
- question += f"({chr(ord('A') + idx)}) {c}\n"
- if c == answer:
- answer_idx = idx
- question = question.rstrip()
- answer = f"({chr(ord('A') + answer_idx)}) {answer}"
- return question, answer
-
- def __getitem__(self, idx):
- decord_method = self.decord_method[self.data_list[idx]['data_type']]
- bound = None
- if self.data_list[idx]['bound']:
- bound = (
- self.data_list[idx]['data']['start'],
- self.data_list[idx]['data']['end'],
- )
- video_path = os.path.join(self.data_list[idx]['prefix'], self.data_list[idx]['data']['video'])
- torch_imgs = decord_method(video_path, bound)
- question = self.data_list[idx]['data']['question']
- options = self.data_list[idx]['data']['candidates']
- answer = self.data_list[idx]['data']['answer']
- task_type = self.data_list[idx]['task_type']
-
- # question, answer = self.qa_template(self.data_list[idx]['data'])
-
- answer_idx = -1
- letters = []
- options_string = ''
- for option_idx, c in enumerate(options):
- letters.append(f"{chr(ord('A') + option_idx)}")
- options_string += f"({chr(ord('A') + option_idx)}) {c}\n"
- if c == answer:
- answer_idx = option_idx
-
- option_question = f'Question: {question}\nOptions:\n{options_string}Answer with the option\'s letter from the given choices directly and only give the best option.'
-
- return {
- 'video': torch_imgs,
- 'video_path': video_path,
- 'question': option_question,
- 'letters': ','.join(letters),
- 'answer_idx': answer_idx,
- 'task_type': task_type
- }
-
-
-tasks = {
- "Action Sequence": ("action_sequence.json", "star/Charades_v1_480/", "video", True), # has start & end
- "Action Prediction": ("action_prediction.json", "star/Charades_v1_480/", "video", True), # has start & end
- "Action Antonym": ("action_antonym.json", "ssv2_video/", "video", False),
- "Fine-grained Action": ("fine_grained_action.json", "Moments_in_Time_Raw/videos/", "video", False),
- "Unexpected Action": ("unexpected_action.json", "FunQA_test/test/", "video", False),
- "Object Existence": ("object_existence.json", "clevrer/video_validation/", "video", False),
- "Object Interaction": ("object_interaction.json", "star/Charades_v1_480/", "video", True), # has start & end
- "Object Shuffle": ("object_shuffle.json", "perception/videos/", "video", False),
- "Moving Direction": ("moving_direction.json", "clevrer/video_validation/", "video", False),
- "Action Localization": ("action_localization.json", "sta/sta_video/", "video", True), # has start & end
- "Scene Transition": ("scene_transition.json", "scene_qa/video/", "video", False),
- "Action Count": ("action_count.json", "perception/videos/", "video", False),
- "Moving Count": ("moving_count.json", "clevrer/video_validation/", "video", False),
- "Moving Attribute": ("moving_attribute.json", "clevrer/video_validation/", "video", False),
- "State Change": ("state_change.json", "perception/videos/", "video", False),
- "Fine-grained Pose": ("fine_grained_pose.json", "nturgbd/", "video", False),
- "Character Order": ("character_order.json", "perception/videos/", "video", False),
- "Egocentric Navigation": ("egocentric_navigation.json", "vlnqa/", "video", False),
- "Episodic Reasoning": ("episodic_reasoning.json", "tvqa/frames_fps3_hq/", "frame", True), # has start & end, read frame
- "Counterfactual Inference": ("counterfactual_inference.json", "clevrer/video_validation/", "video", False),
-}
-
-
-def build_mvbench_eval(args, processor, num_frames):
- data_list = []
- for task_name, task in tasks.items():
- json_file = os.path.join(args.question_file, task[0])
- vis_folder = os.path.join(args.video_folder, task[1])
- with open(json_file, 'r') as f:
- json_data = json.load(f)
- for data in json_data:
- data_list.append({
- 'task_type': task_name,
- 'prefix': vis_folder,
- 'data_type': task[2],
- 'bound': task[3],
- 'data': data
- })
- data_list = get_chunk(data_list, args.num_chunks, args.chunk_idx)
- dataset = MVBenchDataset(data_list, processor, num_segments=num_frames)
- dataloader = DataLoader(dataset, batch_size=args.batch_size, shuffle=False, num_workers=args.num_workers)
-
- return dataloader
-
-
-def mvbench_dump(ans_file, line, outputs):
- for idx, output in enumerate(outputs):
- vid = line['video_path'][idx]
- task_type = line['task_type'][idx]
- letters = line['letters'][idx].split(',')
- answer_idx = line['answer_idx'][idx].item()
-
- pred_answer = re.findall(f'[\(,\ ]*[{letters[0]}-{letters[-1]}][\),\ ]*', output)
- if len(pred_answer) == 0:
- pred_idx = (answer_idx + 1) % len(letters)
- else:
- pred_answer = pred_answer[0].strip()
- if pred_answer.startswith('('):
- pred_answer = pred_answer.strip('()')
- pred_idx = letters.index(pred_answer)
-
- ans_file.write(json.dumps({"vid": vid, "task_type": task_type, "pred": pred_idx, "gt": answer_idx}) + '\n')
-
-
-class NextoeDataset(Dataset):
-
- video_formats = ['.mp4', '.avi', '.mov', '.mkv']
-
- def __init__(self, data_list, processor, num_segments=8):
- self.data_list = data_list
- self.processor = processor
- self.num_segments = num_segments
-
- def __len__(self):
- return len(self.data_list)
-
- def __getitem__(self, idx):
- line = self.data_list[idx]
- video_name = line['video']
- question = line['question']
- answer = line['answer']
-
- for fmt in self.video_formats: # Added this line
- temp_path = os.path.join(args.video_folder, f"{video_name}{fmt}")
- if os.path.exists(temp_path):
- video_path = temp_path
- break
-
- decord_vr = VideoReader(uri=video_path, ctx=cpu(0))
- frames = decord_vr.get_batch(np.linspace(0, len(decord_vr) - 1, 8, dtype=int)).asnumpy()
- video_tensor = self.processor.preprocess(frames, return_tensors='pt')['pixel_values'] # do not pad for video frames
-
- wrapped_question = f'Question: {question}\nAnswer the question using a single word or a short phrase with multiple words.'
-
- return {
- 'video': video_tensor,
- 'question': wrapped_question,
- 'answer': answer,
- 'qid': line['qid']
- }
-
-
-def build_nextoe_eval(args, processor, num_frames):
- questions = json.load(open(args.question_file, "r"))
- questions = get_chunk(questions, args.num_chunks, args.chunk_idx)
- dataset = NextoeDataset(questions, processor, num_segments=num_frames)
- dataloader = DataLoader(dataset, batch_size=args.batch_size, shuffle=False, num_workers=args.num_workers)
-
- return dataloader
-
-
-def nextoe_dump(ans_file, line, outputs):
- for idx, output in enumerate(outputs):
- vid, qid = line['qid'][idx].split('_')
- ans_file.write(json.dumps({"vid": vid, "qid": qid, "prediction": output}) + '\n')
-
-
-class NextqaDataset(Dataset):
-
- video_formats = ['.mp4', '.avi', '.mov', '.mkv']
-
- def __init__(self, data_list, processor, num_segments=8):
- self.data_list = data_list
- self.processor = processor
- self.num_segments = num_segments
-
- def __len__(self):
- return len(self.data_list)
-
- def __getitem__(self, idx):
- line = self.data_list[idx]
- video_name = line['video']
- question = line['question']
- answer = line['answer']
-
- for fmt in self.video_formats: # Added this line
- temp_path = os.path.join(args.video_folder, f"{video_name}{fmt}")
- if os.path.exists(temp_path):
- video_path = temp_path
- break
-
- decord_vr = VideoReader(uri=video_path, ctx=cpu(0))
- frames = decord_vr.get_batch(np.linspace(0, len(decord_vr) - 1, 8, dtype=int)).asnumpy()
- video_tensor = self.processor.preprocess(frames, return_tensors='pt')['pixel_values'] # do not pad for video frames
-
- assert line['num_option'] == 5
- a0 = line['a0']
- a1 = line['a1']
- a2 = line['a2']
- a3 = line['a3']
- a4 = line['a4']
-
- option_question = f'Question: {question}\nOptions:\n(A) {a0}\n(B) {a1}\n(C) {a2}\n(D) {a3}\n(E) {a4}\nAnswer with the option\'s letter from the given choices directly and only give the best option.'
-
- return {
- 'video': video_tensor,
- 'question': option_question,
- 'answer': answer,
- 'qid': line['qid']
- }
-
-
-def build_nextqa_eval(args, processor, num_frames):
- questions = json.load(open(args.question_file, "r"))
- questions = get_chunk(questions, args.num_chunks, args.chunk_idx)
- dataset = NextqaDataset(questions, processor, num_segments=num_frames)
- dataloader = DataLoader(dataset, batch_size=args.batch_size, shuffle=False, num_workers=args.num_workers)
-
- return dataloader
-
-
-def nextqa_dump(ans_file, line, outputs):
- for idx, output in enumerate(outputs):
- qid = line['qid'][idx]
- answer = line['answer'][idx].item()
-
- letters = ['A', 'B', 'C', 'D', 'E']
-
- pred_answer = re.findall('[\(,\ ]*[A-E][\),\ ]*', output)
- if len(pred_answer) == 0:
- pred_idx = 2
- else:
- pred_answer = pred_answer[0].strip()
- if pred_answer.startswith('('):
- pred_answer = pred_answer.strip('()')
- pred_idx = letters.index(pred_answer)
-
- ans_file.write(json.dumps({"id": qid, "prediction": pred_idx, "answer": answer}) + '\n')
-
-
-class EgoschemaDataset(Dataset):
-
- video_formats = ['.mp4', '.avi', '.mov', '.mkv']
-
- def __init__(self, data_list, processor, num_segments=8):
- self.data_list = data_list
- self.processor = processor
- self.num_segments = num_segments
-
- def __len__(self):
- return len(self.data_list)
-
- def __getitem__(self, idx):
- line = self.data_list[idx]
- q_uid = line['q_uid']
-
- for fmt in self.video_formats: # Added this line
- temp_path = os.path.join(args.video_folder, f"{q_uid}{fmt}")
- if os.path.exists(temp_path):
- video_path = temp_path
- break
-
- decord_vr = VideoReader(uri=video_path, ctx=cpu(0))
- frames = decord_vr.get_batch(np.linspace(0, len(decord_vr) - 1, self.num_segments, dtype=int)).asnumpy()
- video_tensor = self.processor.preprocess(frames, return_tensors='pt')['pixel_values'] # do not pad for video frames
-
- question = line['question']
- a0 = line['option 0']
- a1 = line['option 1']
- a2 = line['option 2']
- a3 = line['option 3']
- a4 = line['option 4']
- axs = [a0, a1, a2, a3, a4]
- ops = ['(A)', '(B)', '(C)', '(D)', '(E)']
-
- option_question = f'Question: {question}\nOptions:\n(A) {a0}\n(B) {a1}\n(C) {a2}\n(D) {a3}\n(E) {a4}\n.Answer with the option\'s letter from the given choices directly and only give the best option.'
-
- return {
- 'q_uid': q_uid,
- 'video': video_tensor,
- 'question': option_question,
- }
-
-
-def build_egoschema_eval(args, processor, num_frames):
- questions = json.load(open(args.question_file, "r"))
- questions = get_chunk(questions, args.num_chunks, args.chunk_idx)
- dataset = EgoschemaDataset(questions, processor, num_segments=num_frames)
- dataloader = DataLoader(dataset, batch_size=args.batch_size, shuffle=False, num_workers=args.num_workers)
-
- return dataloader
-
-
-def egoschema_dump(ans_file, line, outputs):
- for idx, output in enumerate(outputs):
- q_uid = line['q_uid'][idx]
- letters = ['A', 'B', 'C', 'D', 'E']
-
- pred_answer = re.findall('[\(\ ]*[A-E][\)\ ]*', output)
- if len(pred_answer) == 0:
- pred_idx = 2
- else:
- pred_answer = pred_answer[0].strip()
- # if pred_answer.startswith('('):
- pred_answer = pred_answer.strip('()')
- pred_idx = letters.index(pred_answer)
- ans_file.write(f'{q_uid}, {pred_idx}\n')
-
-
-def get_model_output(model, video_tensor, tokenizer, questions, conv_mode="v1", device='cuda'):
-
- input_ids = []
- modal_list = []
- for qs in questions:
- if model.config.mm_use_im_start_end:
- qs = default_mm_start_token + default_mm_token + default_mm_end_token + "\n" + qs
- else:
- qs = default_mm_token + "\n" + qs
-
- conv = conv_templates[conv_mode].copy()
- conv.append_message(conv.roles[0], qs)
- conv.append_message(conv.roles[1], None)
- prompt = conv.get_prompt()
-
- input_id = tokenizer_MMODAL_token(prompt, tokenizer, modal_token_index, return_tensors='pt')
- input_ids.append(input_id)
- modal_list.append("video")
-
- # left pad sequence
- input_ids = torch.nn.utils.rnn.pad_sequence(
- [x.flip(dims=[0]) for x in input_ids],
- batch_first=True,
- padding_value=tokenizer.pad_token_id).flip(dims=[1]).to(device)
-
- attention_mask=input_ids.ne(tokenizer.pad_token_id).to(device)
-
- video_tensor = video_tensor.half().to(args.device)
-
- with torch.inference_mode():
- output_ids = model.generate(
- input_ids,
- attention_mask=attention_mask,
- images_or_videos=video_tensor,
- modal_list=modal_list,
- do_sample=False,
- max_new_tokens=1024,
- use_cache=True,
- pad_token_id=tokenizer.eos_token_id)
-
- outputs = tokenizer.batch_decode(output_ids, skip_special_tokens=True)
- return outputs
-
-
-def run_inference(args):
- """
- Run inference on ActivityNet QA DataSet using the Video-ChatGPT model.
-
- Args:
- args: Command-line arguments.
- """
- # Initialize the model
- model_name = get_model_name_from_path(args.model_path)
- tokenizer, model, processor, context_len = load_pretrained_model(args.model_path, args.model_base, model_name)
-
- num_frames = model.config.num_frames if hasattr(model.config, "num_frames") else NUM_FRAMES
-
- answer_file = os.path.expanduser(args.answer_file)
- os.makedirs(os.path.dirname(answer_file), exist_ok=True)
- ans_file = open(answer_file, "w")
-
- output_list = [] # List to store the output results
-
- if args.dataset == 'mvbench':
- val_loader = build_mvbench_eval(args, processor, num_frames)
- elif args.dataset == 'nextoe':
- val_loader = build_nextoe_eval(args, processor, num_frames)
- elif args.dataset == 'nextqa':
- val_loader = build_nextqa_eval(args, processor, num_frames)
- elif args.dataset == 'egoschema':
- val_loader = build_egoschema_eval(args, processor, num_frames)
- else:
- raise NotImplementedError(f"Dataset {args.dataset} not implemented.")
-
- # Iterate over each sample in the ground truth file
- for i, line in enumerate(tqdm(val_loader)):
- video_tensor = line['video']
- questions = line['question']
-
- outputs = get_model_output(model, video_tensor, tokenizer, questions, args.conv_mode, args.device)
-
- if args.dataset == 'mvbench':
- mvbench_dump(ans_file, line, outputs)
- elif args.dataset == 'nextoe':
- nextoe_dump(ans_file, line, outputs)
- elif args.dataset == 'nextqa':
- nextqa_dump(ans_file, line, outputs)
- elif args.dataset == 'egoschema':
- egoschema_dump(ans_file, line, outputs)
- else:
- raise NotImplementedError(f"Dataset {args.dataset} not implemented.")
-
- ans_file.close()
-
-
-if __name__ == "__main__":
- parser = argparse.ArgumentParser(description='Multiple-Choice Video QA Evaluation Script.')
-
- parser.add_argument('--dataset', help='Dataset to evaluate on.', required=True)
- parser.add_argument('--model-path', help='', required=True)
- parser.add_argument('--model_base', help='', default=None, type=str, required=False)
- parser.add_argument('--video-folder', help='Directory containing video files.', required=True)
- parser.add_argument('--question-file', help='Path to the ground truth file containing question.', required=True)
- parser.add_argument('--answer-file', help='Path to the ground truth file containing answers.', required=True)
- parser.add_argument("--conv-mode", type=str, default="llava_v1")
- parser.add_argument("--num-chunks", type=int, default=1)
- parser.add_argument("--chunk-idx", type=int, default=0)
- parser.add_argument("--device", type=str, required=False, default='cuda:0')
- parser.add_argument("--model_max_length", type=int, required=False, default=2048)
- parser.add_argument("--batch-size", type=int, default=1)
- parser.add_argument("--num-workers", type=int, default=8)
- args = parser.parse_args()
- run_inference(args)
diff --git a/videollama2/eval/run_inference_video_qa_gpt.py b/videollama2/eval/run_inference_video_qa_gpt.py
deleted file mode 100644
index 3767f3b0e0ecf48cd14a9c40bb663c7b756d3c20..0000000000000000000000000000000000000000
--- a/videollama2/eval/run_inference_video_qa_gpt.py
+++ /dev/null
@@ -1,151 +0,0 @@
-import math
-import os
-import argparse
-import json
-import warnings
-from tqdm import tqdm
-
-import torch
-import numpy as np
-import transformers
-import decord
-from decord import VideoReader, cpu
-
-import sys
-sys.path.append('./')
-from videollama2.conversation import conv_templates, SeparatorStyle
-from videollama2.constants import NUM_FRAMES, DEFAULT_MMODAL_TOKEN, DEFAULT_MMODAL_START_TOKEN, DEFAULT_MMODAL_END_TOKEN, MMODAL_TOKEN_INDEX
-from videollama2.mm_utils import get_model_name_from_path, tokenizer_MMODAL_token, KeywordsStoppingCriteria, process_video
-from videollama2.model.builder import load_pretrained_model
-
-
-# NOTE: Ignore TypedStorage warning, which refers to this link~(https://github.com/pytorch/pytorch/issues/97207#issuecomment-1494781560)
-warnings.filterwarnings('ignore', category=UserWarning, message='TypedStorage is deprecated')
-
-default_mm_token = DEFAULT_MMODAL_TOKEN["VIDEO"]
-default_mm_start_token = DEFAULT_MMODAL_START_TOKEN["VIDEO"]
-default_mm_end_token = DEFAULT_MMODAL_END_TOKEN["VIDEO"]
-modal_token_index = MMODAL_TOKEN_INDEX["VIDEO"]
-
-
-def split_list(lst, n):
- """Split a list into n (roughly) equal-sized chunks"""
- chunk_size = math.ceil(len(lst) / n) # integer division
- return [lst[i:i+chunk_size] for i in range(0, len(lst), chunk_size)]
-
-
-def get_chunk(lst, n, k):
- chunks = split_list(lst, n)
- return chunks[k]
-
-
-def get_model_output(model, tokenizer, video_tensor, questions, conv_mode="v1", device='cuda'):
-
- input_ids = []
- modal_list = []
- for qs in questions:
- if model.config.mm_use_im_start_end:
- qs = default_mm_start_token + default_mm_token + default_mm_end_token + "\n" + qs
- else:
- qs = default_mm_token + "\n" + qs
-
- conv = conv_templates[conv_mode].copy()
- conv.append_message(conv.roles[0], qs)
- conv.append_message(conv.roles[1], None)
- prompt = conv.get_prompt()
-
- input_id = tokenizer_MMODAL_token(prompt, tokenizer, modal_token_index, return_tensors='pt')
- input_ids.append(input_id)
- modal_list.append("video")
-
- # left pad sequence
- input_ids = torch.nn.utils.rnn.pad_sequence(
- [x.flip(dims=[0]) for x in input_ids],
- batch_first=True,
- padding_value=tokenizer.pad_token_id).flip(dims=[1]).to(device)
-
- attention_mask=input_ids.ne(tokenizer.pad_token_id).to(device)
-
- video_tensor = video_tensor.half().to(args.device)
-
- with torch.inference_mode():
- output_ids = model.generate(
- input_ids,
- attention_mask=attention_mask,
- images_or_videos=video_tensor,
- modal_list=modal_list,
- do_sample=False,
- max_new_tokens=1024,
- use_cache=True,
- pad_token_id=tokenizer.eos_token_id)
-
- outputs = tokenizer.batch_decode(output_ids, skip_special_tokens=True)
- return outputs
-
-
-def run_inference(args):
- # Initialize the model
- model_name = get_model_name_from_path(args.model_path)
- tokenizer, model, processor, context_len = load_pretrained_model(args.model_path, args.model_base, model_name)
-
- num_frames = model.config.num_frames if hasattr(model.config, "num_frames") else NUM_FRAMES
-
- gt_questions = json.load(open(args.question_file, "r"))
- gt_questions = get_chunk(gt_questions, args.num_chunks, args.chunk_idx)
- gt_answers = json.load(open(args.answer_file, "r"))
- gt_answers = get_chunk(gt_answers, args.num_chunks, args.chunk_idx)
-
- answer_file = os.path.join(args.output_file)
- os.makedirs(os.path.dirname(args.output_file), exist_ok=True)
- ans_file = open(answer_file, "w")
-
- video_formats = ['.mp4', '.avi', '.mov', '.mkv']
-
- # Iterate over each sample in the ground truth file
- for idx, sample in enumerate(tqdm(gt_questions)):
- video_name = sample['video_name']
- question = sample['question']
- id = sample['question_id']
- answer = gt_answers[idx]['answer']
-
- # Load the video file
- for fmt in video_formats: # Added this line
- temp_path = os.path.join(args.video_folder, f"v_{video_name}{fmt}")
- if os.path.exists(temp_path):
- video_path = temp_path
- break
- # BUG: compatibility for MSVD, MSRVTT, TGIF
- temp_path = os.path.join(args.video_folder, f"{video_name}{fmt}")
- if os.path.exists(temp_path):
- video_path = temp_path
- break
-
- # question = question + '\n' + 'Answer the question using a single word or a short phrase with multiple words.'
-
- video_tensor = process_video(video_path, processor, aspect_ratio=None, sample_scheme='uniform', num_frames=num_frames)
- output = get_model_output(model, tokenizer, video_tensor[None], [question], args.conv_mode, args.device)[0]
-
- sample_set = {'id': id, 'question': question, 'answer': answer, 'pred': output}
- ans_file.write(json.dumps(sample_set) + "\n")
-
- ans_file.close()
-
-
-if __name__ == "__main__":
- parser = argparse.ArgumentParser()
-
- # Define the command-line arguments
- parser.add_argument('--model-path', help='', required=True)
- parser.add_argument('--model_base', help='', default=None, type=str, required=False)
- parser.add_argument('--video-folder', help='Directory containing video files.', required=True)
- parser.add_argument('--question-file', help='Path to the ground truth file containing question.', required=True)
- parser.add_argument('--answer-file', help='Path to the ground truth file containing answers.', required=True)
- parser.add_argument('--output-file', help='Directory to save the model results JSON.', required=True)
- parser.add_argument("--conv-mode", type=str, default="llava_v1")
- parser.add_argument("--num-chunks", type=int, default=1)
- parser.add_argument("--chunk-idx", type=int, default=0)
- parser.add_argument("--device", type=str, required=False, default='cuda:0')
- parser.add_argument("--model_max_length", type=int, required=False, default=2048)
-
- args = parser.parse_args()
- run_inference(args)
diff --git a/videollama2/mm_utils.py b/videollama2/mm_utils.py
deleted file mode 100644
index 812c33d62bbea34afb18be5f851c398b921a56e8..0000000000000000000000000000000000000000
--- a/videollama2/mm_utils.py
+++ /dev/null
@@ -1,538 +0,0 @@
-import ast
-import math
-import base64
-from io import BytesIO
-
-import torch
-import decord
-import imageio
-import numpy as np
-from PIL import Image
-from decord import VideoReader, cpu
-from moviepy.editor import VideoFileClip
-from transformers import StoppingCriteria
-
-from scenedetect import open_video, SceneManager
-from scenedetect.detectors import ContentDetector
-from scenedetect.stats_manager import StatsManager
-
-from .constants import NUM_FRAMES, MAX_FRAMES, NUM_FRAMES_PER_SECOND, MMODAL_INDEX_TOKEN, IMAGE_TOKEN_INDEX
-
-
-def merge_scenes(cut_list, cut_scores, scene_list, num_frames, max_scene_num=4, num_frame_per_scene=8, min_frames_per_scene=30):
- if len(scene_list) == len(cut_list) and len(scene_list) == 0:
- frame_ids = np.linspace(0, num_frames-1, num_frame_per_scene, dtype=int) # only one scene for current video
- return [frame_ids]
-
-    scene_list, cut_results = merge_scenes_not_exeed_max_scene_num(cut_list, cut_scores, scene_list, max_scene_num)
-
- prev_cut_point = 0
- list_of_scene_frames = []
- for (cur_cut_point, _) in cut_results:
- frame_ids = list(np.linspace(prev_cut_point, cur_cut_point-1, num_frame_per_scene, dtype=int))
- list_of_scene_frames.append(frame_ids)
- prev_cut_point = cur_cut_point
- if cur_cut_point < num_frames:
- frame_ids = np.linspace(cur_cut_point, num_frames-1, num_frame_per_scene, dtype=int)
- list_of_scene_frames.append(frame_ids)
-
- return list_of_scene_frames
-
-
-def merge_scenes_not_exeed_max_scene_num(cut_list, cut_scores, scene_list, max_scene_num):
- cut_frames = [ele.get_frames() for ele in cut_list]
- cut_results = list(zip(cut_frames, cut_scores))
- while len(scene_list) > max_scene_num:
- min_idx = np.argmin(cut_scores)
- cut_frames = [ele for idx, ele in enumerate(cut_frames) if idx != min_idx]
- cut_scores = [ele for idx, ele in enumerate(cut_scores) if idx != min_idx]
-
- # merge scene list
- num_scenes = len(scene_list)
- #print("Current min_idx:", min_idx)
- s1 = scene_list[min_idx]
- s2 = scene_list[min_idx+1]
- new_scene = (s1[0], s2[1])
- if min_idx == 0:
- # merge the first two scenes
- new_scene_list = [new_scene] + scene_list[2:]
- elif min_idx == num_scenes - 1:
-            # merge the last two scenes
- new_scene_list = scene_list[:min_idx-1] + [new_scene]
- else:
- new_scene_list = scene_list[:min_idx] + [new_scene] + scene_list[min_idx+2:]
- scene_list = new_scene_list
- cut_results = list(zip(cut_frames, cut_scores))
- return scene_list, cut_results
-
-
-def split_video_into_scenes(video_path, threshold=27.0, max_scene_num=10, num_frame_per_scene=8):
- # Open video, create a scene manager, and add a detector.
- video = open_video(video_path)
- stats_manager = StatsManager()
- scene_manager = SceneManager(stats_manager)
- detector = ContentDetector(threshold=threshold)
- scene_manager.add_detector(detector)
- scene_manager.detect_scenes(video)
- scene_list = scene_manager.get_scene_list()
- cut_list = scene_manager.get_cut_list()
- num_frames = video.duration.get_frames()
- if len(scene_list) == len(cut_list) and len(scene_list) == 0:
- frame_ids = np.linspace(0, num_frames-1, num_frame_per_scene, dtype=int) # only one scene for current video
- return [frame_ids]
- assert len(scene_list) == len(cut_list) + 1, f"inconsistent lengths for scene list ({len(scene_list)}) vs. cut list ({len(cut_list)})"
- cut_frames = [ele.get_frames() for ele in cut_list]
- cut_scores = [stats_manager.get_metrics(f, ["delta_lum"])[0] for f in cut_frames]
- cut_results = list(zip(cut_frames, cut_scores))
- #print(f"Original cut scores: {cut_scores}, original scene list: {scene_list}")
- while len(scene_list) > max_scene_num:
- min_idx = np.argmin(cut_scores)
- cut_frames = [ele for idx, ele in enumerate(cut_frames) if idx != min_idx]
- cut_scores = [ele for idx, ele in enumerate(cut_scores) if idx != min_idx]
-
- # merge scene list
- num_scenes = len(scene_list)
- #print("Current min_idx:", min_idx)
- s1 = scene_list[min_idx]
- s2 = scene_list[min_idx+1]
- new_scene = (s1[0], s2[1])
- if min_idx == 0:
- # merge the first two scenes
- new_scene_list = [new_scene] + scene_list[2:]
- elif min_idx == num_scenes - 1:
-            # merge the last two scenes
- new_scene_list = scene_list[:min_idx-1] + [new_scene]
- else:
- new_scene_list = scene_list[:min_idx] + [new_scene] + scene_list[min_idx+2:]
- scene_list = new_scene_list
- cut_results = list(zip(cut_frames, cut_scores))
- #print(f"Cut scores after merging: {cut_scores}, scene list: {scene_list}")
- prev_cut_point = 0
- list_of_scene_frames = []
- for (cur_cut_point, _) in cut_results:
- frame_ids = list(np.linspace(prev_cut_point, cur_cut_point-1, num_frame_per_scene, dtype=int))
- list_of_scene_frames.append(frame_ids)
- prev_cut_point = cur_cut_point
- if cur_cut_point < num_frames:
- frame_ids = np.linspace(cur_cut_point, num_frames-1, num_frame_per_scene, dtype=int)
- list_of_scene_frames.append(frame_ids)
- # print(f"Finally got {len(list_of_scene_frames)} scenes where we evenly sampled {num_frame_per_scene} frames for each scene")
- return list_of_scene_frames
-
-
-def select_best_resolution(original_size, possible_resolutions):
- """
- Selects the best resolution from a list of possible resolutions based on the original size.
- Args:
- original_size (tuple): The original size of the image in the format (width, height).
- possible_resolutions (list): A list of possible resolutions in the format [(width1, height1), (width2, height2), ...].
- Returns:
- tuple: The best fit resolution in the format (width, height).
- """
- original_width, original_height = original_size
- best_fit = None
- max_effective_resolution = 0
- min_wasted_resolution = float('inf')
- for width, height in possible_resolutions:
- scale = min(width / original_width, height / original_height)
- downscaled_width, downscaled_height = int(original_width * scale), int(original_height * scale)
- effective_resolution = min(downscaled_width * downscaled_height, original_width * original_height)
- wasted_resolution = (width * height) - effective_resolution
- if effective_resolution > max_effective_resolution or (effective_resolution == max_effective_resolution and wasted_resolution < min_wasted_resolution):
- max_effective_resolution = effective_resolution
- min_wasted_resolution = wasted_resolution
- best_fit = (width, height)
- return best_fit
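-# Editor's note: the doctest-style sketch below is an illustration added for this
-# review (not part of the original file); the sizes are hypothetical.
-#   >>> select_best_resolution((1000, 600), [(336, 336), (672, 336), (672, 672)])
-#   (672, 672)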
-
-
-def resize_and_pad_image(image, target_resolution):
- """
- Resize and pad an image to a target resolution while maintaining aspect ratio.
- Args:
- image (PIL.Image.Image): The input image.
- target_resolution (tuple): The target resolution (width, height) of the image.
- Returns:
- PIL.Image.Image: The resized and padded image.
- """
- original_width, original_height = image.size
- target_width, target_height = target_resolution
- scale_w = target_width / original_width
- scale_h = target_height / original_height
- if scale_w < scale_h:
- new_width = target_width
- new_height = min(math.ceil(original_height * scale_w), target_height)
- else:
- new_height = target_height
- new_width = min(math.ceil(original_width * scale_h), target_width)
- # Resize the image
- resized_image = image.resize((new_width, new_height))
- new_image = Image.new('RGB', (target_width, target_height), (0, 0, 0))
- paste_x = (target_width - new_width) // 2
- paste_y = (target_height - new_height) // 2
- new_image.paste(resized_image, (paste_x, paste_y))
- return new_image
-
-
-def divide_to_patches(image, patch_size):
- """
- Divides an image into patches of a specified size.
- Args:
- image (PIL.Image.Image): The input image.
- patch_size (int): The size of each patch.
- Returns:
- list: A list of PIL.Image.Image objects representing the patches.
- """
- patches = []
- width, height = image.size
- for i in range(0, height, patch_size):
- for j in range(0, width, patch_size):
- box = (j, i, j + patch_size, i + patch_size)
- patch = image.crop(box)
- patches.append(patch)
- return patches
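-# Editor's note: illustrative sketch (not in the original file); a hypothetical
-# 672x672 image split with patch_size=336 yields a 2x2 grid of patches.
-#   >>> len(divide_to_patches(Image.new('RGB', (672, 672)), 336))
-#   4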
-
-
-def get_anyres_image_grid_shape(image_size, grids, patch_size):
- """
- Calculate the shape of the image patch grid after the preprocessing for images of any resolution.
- Args:
- image_size (tuple): The size of the input image in the format (width, height).
- grids (str, List[tuple[int]]): Patch segmentation grid.
- patch_size (int): The size of each image patch.
- Returns:
- tuple: The shape of the image patch grid in the format (width, height).
- """
- if type(grids) is list:
- possible_resolutions = [(x * patch_size, y * patch_size) for x, y in grids]
- else:
- possible_resolutions = [(x * patch_size, y * patch_size) for x, y in ast.literal_eval(grids)]
- width, height = select_best_resolution(image_size, possible_resolutions)
- return width // patch_size, height // patch_size
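-# Editor's note: illustrative sketch (not in the original file); when the grid
-# spec is given as a string it is parsed via ast.literal_eval before scaling.
-#   >>> get_anyres_image_grid_shape((1000, 600), "[(1, 1), (2, 1), (2, 2)]", 336)
-#   (2, 2)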
-
-
-def process_anyres_image(image, grids, patch_size):
- """
- Process an image with variable resolutions.
- Args:
- image (PIL.Image.Image): The input image to be processed.
- grids (str, List[tuple[int]]): Patch segmentation grid.
- patch_size (int): The size of the patches to be extracted.
- Returns:
- torch.Tensor: A tensor containing the processed image patches.
- """
- if type(grids) is list:
- possible_resolutions = [(x * patch_size, y * patch_size) for x, y in grids]
- else:
- possible_resolutions = [(x * patch_size, y * patch_size) for x, y in ast.literal_eval(grids)]
- best_resolution = select_best_resolution(image.size, possible_resolutions)
- image_padded = resize_and_pad_image(image, best_resolution)
- patches = divide_to_patches(image_padded, patch_size)
- image_original_resize = resize_and_pad_image(image, (patch_size, patch_size))
- image_patches = [image_original_resize] + patches
- return image_patches
-
-
-def chunk_list(input_list, chunk_size):
- return [input_list[i:i + chunk_size] for i in range(0, len(input_list), chunk_size)]
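-# Editor's note: illustrative sketch (not in the original file).
-#   >>> chunk_list([1, 2, 3, 4, 5], 2)
-#   [[1, 2], [3, 4], [5]]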
-
-
-def frame_expansion(frame_list, n):
- assert len(frame_list) == n * n
- width, height = frame_list[0].width, frame_list[0].height
- expanded_width = n * width
- expanded_height = n * height
- expanded_frame = Image.new('RGB', (expanded_width, expanded_height))
- for i in range(n):
- for j in range(n):
- frame = frame_list[i * n + j]
- coordinate = (j*width, i*height)
- expanded_frame.paste(frame, coordinate)
- return expanded_frame
-
-
-def load_image_from_base64(image):
- return Image.open(BytesIO(base64.b64decode(image)))
-
-
-def expand2square(pil_img, background_color):
- width, height = pil_img.size
- if width == height:
- return pil_img
- elif width > height:
- result = Image.new(pil_img.mode, (width, width), background_color)
- result.paste(pil_img, (0, (width - height) // 2))
- return result
- else:
- result = Image.new(pil_img.mode, (height, height), background_color)
- result.paste(pil_img, ((height - width) // 2, 0))
- return result
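-# Editor's note: illustrative sketch (not in the original file); a hypothetical
-# 4x2 image is padded to a 4x4 square, with the original pasted vertically centred.
-#   >>> expand2square(Image.new('RGB', (4, 2)), (0, 0, 0)).size
-#   (4, 4)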
-
-
-def process_images(images, image_processor, model_cfg):
- image_aspect_ratio = getattr(model_cfg, "image_aspect_ratio", None)
- new_images = []
- #print("Current image_aspect_ratio:", image_aspect_ratio)
- if image_aspect_ratio == 'pad':
- for image in images:
- image = expand2square(image, tuple(int(x*255) for x in image_processor.image_mean))
- image = image_processor.preprocess(image, return_tensors='pt')['pixel_values'][0]
- new_images.append(image)
- else:
- return image_processor(images, return_tensors='pt')['pixel_values']
- if all(x.shape == new_images[0].shape for x in new_images):
- new_images = torch.stack(new_images, dim=0)
- return new_images
-
-
-def process_videos(frames, image_processor, model_cfg):
-    # NOTE: this function is only used during inference
- # image_aspect_ratio = getattr(model_cfg, "image_aspect_ratio", None)
- # new_frames = []
- # print("Current image_aspect_ratio:", image_aspect_ratio)
- # if image_aspect_ratio == 'pad':
- # for image in frames:
- # image = Image.fromarray(image)
- # image = expand2square(image, tuple(int(x*255) for x in image_processor.image_mean))
- # image = image_processor.preprocess(image, return_tensors='pt')['pixel_values'][0]
- # new_frames.append(image)
- # else:
- # return image_processor(frames, return_tensors='pt')['pixel_values']
- # if all(x.shape == new_frames[0].shape for x in new_frames):
- # new_frames = torch.stack(new_frames, dim=0)
- new_frames = image_processor.preprocess(frames, return_tensors='pt')['pixel_values'] # do not pad for video frames
- return new_frames
-
-
-def create_photo_grid(arr, rows=None, cols=None):
- """
- Create a photo grid from a 4D numpy array with shape [t, h, w, c].
-
- Parameters:
- arr (numpy.ndarray): Input array with shape [t, h, w, c].
- rows (int): Optional. Number of rows in the grid. If not set, it will be determined based on `cols` or the square root of `t`.
- cols (int): Optional. Number of columns in the grid. If not set, it will be determined based on `rows` or the square root of `t`.
-
- Returns:
- numpy.ndarray: A 3D numpy array representing the photo grid.
- """
-
- if isinstance(arr, list):
- if isinstance(arr[0], Image.Image):
- arr = np.stack([np.array(img) for img in arr])
- elif isinstance(arr[0], np.ndarray):
- arr = np.stack(arr)
- else:
- raise ValueError("Invalid input type. Expected list of Images or numpy arrays.")
-
- t, h, w, c = arr.shape
-
- # Calculate the number of rows and columns if not provided
- if rows is None and cols is None:
- rows = math.ceil(math.sqrt(t))
- cols = math.ceil(t / rows)
- elif rows is None:
- rows = math.ceil(t / cols)
- elif cols is None:
- cols = math.ceil(t / rows)
-
- # Check if the grid can hold all the images
- if rows * cols < t:
- raise ValueError(f"Not enough grid cells ({rows}x{cols}) to hold all images ({t}).")
-
- # Create the grid array with appropriate height and width
- grid_height = h * rows
- grid_width = w * cols
- grid = np.zeros((grid_height, grid_width, c), dtype=arr.dtype)
-
- # Fill the grid with images
- for i in range(t):
- row_idx = i // cols
- col_idx = i % cols
- grid[row_idx*h:(row_idx+1)*h, col_idx*w:(col_idx+1)*w, :] = arr[i]
-
- return grid
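-# Editor's note: illustrative sketch (not in the original file); five hypothetical
-# 2x3 frames are packed into a 3-row by 2-column grid, so the result is 6x6 pixels.
-#   >>> create_photo_grid(np.zeros((5, 2, 3, 3), dtype=np.uint8)).shape
-#   (6, 6, 3)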
-
-
-def process_image(image_path, processor, aspect_ratio='pad', num_frames=NUM_FRAMES, image_grid=False):
- image = Image.open(image_path).convert('RGB')
-
- if image_grid:
- pg = np.stack([np.array(image)] * num_frames)
- grid_h = grid_w = math.ceil(math.sqrt(num_frames))
- pg = create_photo_grid(pg, grid_h, grid_w)
- images = [pg, np.array(image)]
- else:
- images = [np.array(image)]
-
- if aspect_ratio == 'pad':
- images = [Image.fromarray(f) for f in images]
- images = [expand2square(image, tuple(int(x*255) for x in processor.image_mean)) for image in images]
- else:
- images = [Image.fromarray(f) for f in images]
-
- images = processor.preprocess(images, return_tensors='pt')['pixel_values']
- return images
-
-
-def process_video(video_path, processor, aspect_ratio='pad', num_frames=NUM_FRAMES, image_grid=False, sample_scheme='uniform'):
- def frame_sample(duration, mode='uniform', local_fps=None):
- if mode == 'uniform':
- return np.linspace(0, duration-1, num_frames, dtype=int)
- elif mode == 'fps':
- assert local_fps is not None
- segment_len = min(local_fps // NUM_FRAMES_PER_SECOND, duration)
- frame_id_list = np.arange(segment_len // 2, duration, segment_len, dtype=int)
- if len(frame_id_list) < num_frames:
- frame_id_list = np.linspace(0, duration-1, num_frames, dtype=int)
- return frame_id_list
- else:
-            raise ValueError(f'Unsupported frame sampling mode: {mode}')
-
- if isinstance(video_path, str):
- if video_path.endswith('.gif'):
- video_gif = imageio.get_reader(video_path)
- duration, local_fps = len(video_gif), 10
-
- frame_id_list = frame_sample(duration, mode=sample_scheme, local_fps=local_fps)
- # limit the max input frames
- if len(frame_id_list) > MAX_FRAMES:
- frame_id_list = np.linspace(0, duration-1, MAX_FRAMES, dtype=int)
- video_data = [frame for index, frame in enumerate(video_gif) if index in frame_id_list]
-        # added by lixin4ever: adds support for .webm files from sthsthv2
- elif video_path.endswith('.webm'):
- video_webm = VideoFileClip(video_path)
- video_frames = np.array(list(video_webm.iter_frames()))
-
- duration, local_fps = len(video_frames), video_webm.fps
-
- frame_id_list = frame_sample(duration, mode=sample_scheme, local_fps=local_fps)
- # limit the max input frames
- if len(frame_id_list) > MAX_FRAMES:
- frame_id_list = np.linspace(0, duration-1, MAX_FRAMES, dtype=int)
- video_data = video_frames[frame_id_list]
- else:
- decord_vr = VideoReader(uri=video_path, ctx=cpu(0)) if "Valley/finetune/source_videos" not in video_path else VideoReader(uri=video_path, ctx=cpu(0), num_threads=1) # add num_threads=1 for Valley videos
- duration, local_fps = len(decord_vr), float(decord_vr.get_avg_fps())
-
- frame_id_list = frame_sample(duration, mode=sample_scheme, local_fps=local_fps)
- # limit the max input frames
- if len(frame_id_list) > MAX_FRAMES:
- frame_id_list = np.linspace(0, duration-1, MAX_FRAMES, dtype=int)
- try:
- video_data = decord_vr.get_batch(frame_id_list).numpy()
-            except AttributeError:  # decord's NDArray exposes asnumpy() rather than numpy()
- video_data = decord_vr.get_batch(frame_id_list).asnumpy()
-
- # if self.data_args.use_temp_aug:
- # frame_id_list = np.linspace(0, duration-1, num_frames * 2 * 2, dtype=int)
- # video_data = decord_vr.get_batch(frame_id_list)
- # video_frames = [Image.fromarray(f) for f in video_data.numpy()]
- # chunked_video_frames = chunk_list(video_frames, 2*2)
- # video_data = [frame_expansion(frame_list, 2) for frame_list in chunked_video_frames]
- else:
- video = video_path
- frame_id_list = frame_sample(duration, mode='uniform')
- video_data = [video.get_data(frame_id) for frame_id in frame_id_list]
-
- if image_grid:
- grid_h = grid_w = math.ceil(math.sqrt(num_frames))
- pg = create_photo_grid(video_data, grid_h, grid_w)
- video_data = [pg, *video_data]
-
- if aspect_ratio == 'pad':
- images = [Image.fromarray(f.numpy() if isinstance(f, torch.Tensor) else f) for f in video_data]
- images = [expand2square(image, tuple(int(x*255) for x in processor.image_mean)) for image in images]
- video = processor.preprocess(images, return_tensors='pt')['pixel_values']
- else:
- images = [Image.fromarray(f.numpy() if isinstance(f, torch.Tensor) else f) for f in video_data]
- video = processor.preprocess(images, return_tensors='pt')['pixel_values']
-
- return video
-
-
-def tokenizer_image_token(prompt, tokenizer, image_token_index=IMAGE_TOKEN_INDEX, return_tensors=None):
-    prompt_chunks = [tokenizer(chunk).input_ids for chunk in prompt.split('<image>')]
-
- def insert_separator(X, sep):
- return [ele for sublist in zip(X, [sep]*len(X)) for ele in sublist][:-1]
-
- input_ids = []
- offset = 0
- if len(prompt_chunks) > 0 and len(prompt_chunks[0]) > 0 and prompt_chunks[0][0] == tokenizer.bos_token_id:
- offset = 1
- input_ids.append(prompt_chunks[0][0])
-
- for x in insert_separator(prompt_chunks, [image_token_index] * (offset + 1)):
- input_ids.extend(x[offset:])
-
- if return_tensors is not None:
- if return_tensors == 'pt':
- return torch.tensor(input_ids, dtype=torch.long)
- raise ValueError(f'Unsupported tensor type: {return_tensors}')
- return input_ids
-
-
-def tokenizer_MMODAL_token(prompt, tokenizer, MMODAL_token_index=IMAGE_TOKEN_INDEX, return_tensors=None):
- prompt_chunks = [tokenizer(chunk).input_ids for chunk in prompt.split(f'<{MMODAL_INDEX_TOKEN[MMODAL_token_index].lower()}>')]
- num_prompt_chunks = len(prompt.split(f'<{MMODAL_INDEX_TOKEN[MMODAL_token_index].lower()}>'))
-
- def insert_separator(X, sep):
- return [ele for sublist in zip(X, [sep]*len(X)) for ele in sublist][:-1]
-
- input_ids = []
- offset = 0
- if len(prompt_chunks) > 0 and len(prompt_chunks[0]) > 0 and prompt_chunks[0][0] == tokenizer.bos_token_id:
- offset = 1
- input_ids.append(prompt_chunks[0][0])
-
- for x in insert_separator(prompt_chunks, [MMODAL_token_index] * (offset + 1)):
- input_ids.extend(x[offset:])
-
- if return_tensors is not None:
- if return_tensors == 'pt':
- return torch.tensor(input_ids, dtype=torch.long)
- raise ValueError(f'Unsupported tensor type: {return_tensors}')
- return input_ids
-
-
-def get_model_name_from_path(model_path):
- model_path = model_path.strip("/")
- model_paths = model_path.split("/")
- if model_paths[-1].startswith('checkpoint-'):
- return model_paths[-2] + "_" + model_paths[-1]
- else:
- return model_paths[-1]
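-# Editor's note: illustrative sketch (not in the original file); the paths are
-# hypothetical examples of a checkpoint directory and a plain model directory.
-#   >>> get_model_name_from_path("work_dirs/videollama2/checkpoint-1000")
-#   'videollama2_checkpoint-1000'
-#   >>> get_model_name_from_path("DAMO-NLP-SG/VideoLLaMA2-7B")
-#   'VideoLLaMA2-7B'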
-
-
-class KeywordsStoppingCriteria(StoppingCriteria):
- def __init__(self, keywords, tokenizer, input_ids):
- self.keywords = keywords
- self.keyword_ids = []
- self.max_keyword_len = 0
- for keyword in keywords:
- cur_keyword_ids = tokenizer(keyword).input_ids
- if len(cur_keyword_ids) > 1 and cur_keyword_ids[0] == tokenizer.bos_token_id:
- cur_keyword_ids = cur_keyword_ids[1:]
- if len(cur_keyword_ids) > self.max_keyword_len:
- self.max_keyword_len = len(cur_keyword_ids)
- self.keyword_ids.append(torch.tensor(cur_keyword_ids))
- self.tokenizer = tokenizer
- self.start_len = input_ids.shape[1]
-
- def call_for_batch(self, output_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:
- offset = min(output_ids.shape[1] - self.start_len, self.max_keyword_len)
- self.keyword_ids = [keyword_id.to(output_ids.device) for keyword_id in self.keyword_ids]
- for keyword_id in self.keyword_ids:
- if (output_ids[0, -keyword_id.shape[0]:] == keyword_id).all():
- return True
- outputs = self.tokenizer.batch_decode(output_ids[:, -offset:], skip_special_tokens=True)[0]
- for keyword in self.keywords:
- if keyword in outputs:
- return True
- return False
-
- def __call__(self, output_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:
- outputs = []
- for i in range(output_ids.shape[0]):
- outputs.append(self.call_for_batch(output_ids[i].unsqueeze(0), scores))
- return all(outputs)
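-# Editor's note: hedged usage sketch (not in the original file). The criteria object
-# is typically built from the conversation separator and passed to generate(); the
-# names `conv`, `tokenizer`, `model` and `input_ids` below are assumed to exist.
-#   stop_str = conv.sep if conv.sep_style != SeparatorStyle.TWO else conv.sep2
-#   stopping_criteria = KeywordsStoppingCriteria([stop_str], tokenizer, input_ids)
-#   output_ids = model.generate(input_ids, max_new_tokens=1024, stopping_criteria=[stopping_criteria])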
diff --git a/videollama2/model/__init__.py b/videollama2/model/__init__.py
deleted file mode 100644
index 13ae526398029275a80afb8542ea3c1a39083ef0..0000000000000000000000000000000000000000
--- a/videollama2/model/__init__.py
+++ /dev/null
@@ -1,3 +0,0 @@
-from .language_model.videollama2_llama import Videollama2LlamaForCausalLM, Videollama2Config
-from .language_model.videollama2_mistral import Videollama2MistralForCausalLM, Videollama2MistralConfig
-from .language_model.videollama2_mixtral import Videollama2MixtralForCausalLM, Videollama2MixtralConfig
\ No newline at end of file
diff --git a/videollama2/model/multimodal_encoder/builder.py b/videollama2/model/multimodal_encoder/builder.py
deleted file mode 100644
index ad751f66838fbf13deac4fb9499c8ffb640b51b0..0000000000000000000000000000000000000000
--- a/videollama2/model/multimodal_encoder/builder.py
+++ /dev/null
@@ -1,15 +0,0 @@
-import os
-
-from .clip_encoder import CLIPVisionTower
-
-
-def build_vision_tower(vision_tower_cfg, **kwargs):
- vision_tower = getattr(vision_tower_cfg, 'mm_vision_tower', getattr(vision_tower_cfg, 'vision_tower', None))
-
- is_absolute_path_exists = os.path.exists(vision_tower)
- if vision_tower.startswith("openai") or vision_tower.startswith("laion") or 'clip' in vision_tower:
- vision_tower = CLIPVisionTower(vision_tower, args=vision_tower_cfg, **kwargs)
- else:
- raise ValueError(f'Unknown vision tower: {vision_tower}')
-
- return vision_tower
diff --git a/videollama2/model/multimodal_encoder/clip_encoder.py b/videollama2/model/multimodal_encoder/clip_encoder.py
deleted file mode 100644
index 4c99c0149c98f79559473427d41b8a1b51a05a4c..0000000000000000000000000000000000000000
--- a/videollama2/model/multimodal_encoder/clip_encoder.py
+++ /dev/null
@@ -1,84 +0,0 @@
-import torch
-import torch.nn as nn
-
-from transformers import CLIPVisionModel, CLIPImageProcessor, CLIPVisionConfig
-
-
-class CLIPVisionTower(nn.Module):
-
- def __init__(self, vision_tower, args, delay_load=False):
- super().__init__()
-
- self.is_loaded = False
-
- self.vision_tower_name = vision_tower
- self.select_layer = args.mm_vision_select_layer
- self.select_feature = getattr(args, 'mm_vision_select_feature', 'patch')
-
- if not delay_load:
- self.load_model()
- else:
- self.cfg_only = CLIPVisionConfig.from_pretrained(self.vision_tower_name)
-
- def load_model(self):
- self.image_processor = CLIPImageProcessor.from_pretrained(self.vision_tower_name)
-
- self.vision_tower = CLIPVisionModel.from_pretrained(self.vision_tower_name)
- self.vision_tower.requires_grad_(False)
-
- self.is_loaded = True
-
- def feature_select(self, image_forward_outs):
- image_features = image_forward_outs.hidden_states[self.select_layer]
- if self.select_feature == 'patch':
- image_features = image_features[:, 1:]
- elif self.select_feature == 'cls_patch':
- image_features = image_features
- else:
- raise ValueError(f'Unexpected select feature: {self.select_feature}')
- return image_features
-
- @torch.no_grad()
- def forward(self, images):
- if type(images) is list:
- image_features = []
- for image in images:
- image_forward_out = self.vision_tower(image.to(device=self.device, dtype=self.dtype).unsqueeze(0), output_hidden_states=True)
- image_feature = self.feature_select(image_forward_out).to(image.dtype)
- image_features.append(image_feature)
- else:
- image_forward_outs = self.vision_tower(images.to(device=self.device, dtype=self.dtype), output_hidden_states=True)
- image_features = self.feature_select(image_forward_outs).to(images.dtype)
-
- return image_features
-
- @property
- def dummy_feature(self):
- return torch.zeros(1, self.hidden_size, device=self.device, dtype=self.dtype)
-
- @property
- def dtype(self):
- return self.vision_tower.dtype
-
- @property
- def device(self):
- return self.vision_tower.device
-
- @property
- def config(self):
- if self.is_loaded:
- return self.vision_tower.config
- else:
- return self.cfg_only
-
- @property
- def hidden_size(self):
- return self.config.hidden_size
-
- @property
- def num_patches(self):
- return (self.config.image_size // self.config.patch_size) ** 2
-
- @property
- def num_patches_per_side(self):
- return self.config.image_size // self.config.patch_size
diff --git a/videollama2/model/multimodal_projector/__init__.py b/videollama2/model/multimodal_projector/__init__.py
deleted file mode 100644
index 97fa0ef8b8c329ea0ef1d9e7f85ae235489fca07..0000000000000000000000000000000000000000
--- a/videollama2/model/multimodal_projector/__init__.py
+++ /dev/null
@@ -1 +0,0 @@
-from .builder import load_mm_projector
\ No newline at end of file
diff --git a/videollama2/train.py b/videollama2/train.py
deleted file mode 100644
index c7e5bad521cee2e0c4d46cd95e9326416987de23..0000000000000000000000000000000000000000
--- a/videollama2/train.py
+++ /dev/null
@@ -1,963 +0,0 @@
-# Adopted from https://github.com/haotian-liu/LLaVA. Below is the original copyright:
-# Adopted from https://github.com/lm-sys/FastChat. Below is the original copyright:
-# Adopted from tatsu-lab@stanford_alpaca. Below is the original copyright:
-# Copyright 2023 Rohan Taori, Ishaan Gulrajani, Tianyi Zhang, Yann Dubois, Xuechen Li
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import os
-import sys
-import copy
-import json
-import random
-import logging
-import pathlib
-from dataclasses import dataclass, field
-from typing import Dict, Optional, Sequence, List
-
-# torch-related packages
-import torch
-from torch.utils.data import Dataset
-from torchvision.transforms import Compose, Lambda, ToTensor
-from pytorchvideo.data.encoded_video import EncodedVideo
-from pytorchvideo.transforms import ApplyTransformToKey, ShortSideScale, UniformTemporalSubsample
-
-import cv2
-import decord
-import imageio
-import traceback
-import numpy as np
-import transformers
-from PIL import Image
-from decord import VideoReader, cpu
-from moviepy.editor import VideoFileClip
-from transformers.models.mixtral.modeling_mixtral import MixtralSparseMoeBlock
-
-sys.path.append('./')
-from videollama2 import conversation as conversation_lib
-from videollama2.constants import NUM_FRAMES, IGNORE_INDEX, MMODAL_TOKEN_INDEX, DEFAULT_MMODAL_TOKEN, DEFAULT_MMODAL_START_TOKEN, DEFAULT_MMODAL_END_TOKEN
-from videollama2.videollama2_trainer import VideoLLaMA2Trainer
-from videollama2.model import *
-from videollama2.mm_utils import tokenizer_MMODAL_token, tokenizer_image_token, expand2square, process_video, process_image
-
-local_rank = None
-
-
-def rank0_print(*args):
- if local_rank == 0:
- print(*args)
-
-
-def set_seed(seed=42):
- """
- Set the random seed for reproducible results.
-
- :param seed: An integer value to be used as the random seed.
- """
- torch.manual_seed(seed)
- torch.cuda.manual_seed(seed)
- torch.cuda.manual_seed_all(seed) # for multi-GPU setups
- torch.backends.cudnn.deterministic = True
- torch.backends.cudnn.benchmark = False
-
-
-@dataclass
-class ModelArguments:
- # LLM Arguments
- model_name_or_path: Optional[str] = field(default="lmsys/vicuna-7b-v1.5")
- version: Optional[str] = field(default="v1", metadata={"help": "Version of the conversation template."})
- freeze_backbone: bool = field(default=False, metadata={"help": "Whether to freeze the LLM backbone."})
- # Connector Arguments
- mm_projector_type: Optional[str] = field(default='linear')
- tune_mm_mlp_adapter: bool = field(default=False)
- pretrain_mm_mlp_adapter: Optional[str] = field(default=None)
- # Vision tower Arguments
- vision_tower: Optional[str] = field(default=None)
- mm_vision_select_layer: Optional[int] = field(default=-1)
- mm_vision_select_feature: Optional[str] = field(default="patch")
- # Other Arguments
- mm_use_im_start_end: bool = field(default=False)
- mm_use_im_patch_token: bool = field(default=True)
-
-
-@dataclass
-class DataArguments:
- # Path Arguments
- data_path: str = field(default=None, metadata={"help": "Path to the training data."})
- # image_folder: Optional[str] = field(default=None)
- # video_folder: Optional[str] = field(default=None)
- data_folder: Optional[str] = field(default=None)
- # Loading Arguments
- is_multimodal: bool = False
- lazy_preprocess: bool = False
- num_frames: Optional[int] = field(default=None)
- # Preprocess Arguments
- image_aspect_ratio: str = 'square'
-
-
-@dataclass
-class TrainingArguments(transformers.TrainingArguments):
- optim: str = field(default="adamw_torch")
- mm_projector_lr: Optional[float] = None
- freeze_mm_mlp_adapter: bool = field(default=False)
- remove_unused_columns: bool = field(default=False)
- cache_dir: Optional[str] = field(default=None)
- # Training Data Arguments
- group_by_modality_length: bool = field(default=False)
- model_max_length: int = field(
- default=512,
- metadata={
- "help":
- "Maximum sequence length. Sequences will be right padded (and possibly truncated)."
- },
- )
- # Lora or Quant Arguments
- double_quant: bool = field(
- default=True,
- metadata={"help": "Compress the quantization statistics through double quantization."}
- )
- quant_type: str = field(
- default="nf4",
- metadata={"help": "Quantization data type to use. Should be one of `fp4` or `nf4`."}
- )
- bits: int = field(
- default=16,
- metadata={"help": "How many bits to use."}
- )
- lora_enable: bool = False
- lora_r: int = 64
- lora_alpha: int = 16
- lora_dropout: float = 0.05
- lora_weight_path: str = ""
- lora_bias: str = "none"
-
-
-def maybe_zero_3(param, ignore_status=False, name=None):
- from deepspeed import zero
- from deepspeed.runtime.zero.partition_parameters import ZeroParamStatus
- if hasattr(param, "ds_id"):
- if param.ds_status == ZeroParamStatus.NOT_AVAILABLE:
- if not ignore_status:
- logging.warning(f"{name}: param.ds_status != ZeroParamStatus.NOT_AVAILABLE: {param.ds_status}")
- with zero.GatheredParameters([param]):
- param = param.data.detach().cpu().clone()
- else:
- param = param.detach().cpu().clone()
- return param
-
-
-# Borrowed from peft.utils.get_peft_model_state_dict
-def get_peft_state_maybe_zero_3(named_params, bias):
- if bias == "none":
- to_return = {k: t for k, t in named_params if "lora_" in k}
- elif bias == "all":
- to_return = {k: t for k, t in named_params if "lora_" in k or "bias" in k}
- elif bias == "lora_only":
- to_return = {}
- maybe_lora_bias = {}
- lora_bias_names = set()
- for k, t in named_params:
- if "lora_" in k:
- to_return[k] = t
- bias_name = k.split("lora_")[0] + "bias"
- lora_bias_names.add(bias_name)
- elif "bias" in k:
- maybe_lora_bias[k] = t
-        for k, t in maybe_lora_bias.items():
- if bias_name in lora_bias_names:
- to_return[bias_name] = t
- else:
- raise NotImplementedError
- to_return = {k: maybe_zero_3(v, ignore_status=True) for k, v in to_return.items()}
- return to_return
-
-
-def get_peft_state_non_lora_maybe_zero_3(named_params, require_grad_only=True):
- to_return = {k: t for k, t in named_params if "lora_" not in k}
- if require_grad_only:
- to_return = {k: t for k, t in to_return.items() if t.requires_grad}
- to_return = {k: maybe_zero_3(v, ignore_status=True).cpu() for k, v in to_return.items()}
- return to_return
-
-
-def get_mm_adapter_state_maybe_zero_3(named_params, keys_to_match):
- to_return = {k: t for k, t in named_params if any(key_match in k for key_match in keys_to_match)}
- to_return = {k: maybe_zero_3(v, ignore_status=True).cpu() for k, v in to_return.items()}
- return to_return
-
-
-def find_all_linear_names(model):
- cls = torch.nn.Linear
- lora_module_names = set()
- multimodal_keywords = ['mm_projector', 'vision_tower', 'vision_resampler']
- for name, module in model.named_modules():
- if any(mm_keyword in name for mm_keyword in multimodal_keywords):
- continue
- if isinstance(module, cls):
- names = name.split('.')
- lora_module_names.add(names[0] if len(names) == 1 else names[-1])
-
- if 'lm_head' in lora_module_names: # needed for 16-bit
- lora_module_names.remove('lm_head')
- return list(lora_module_names)
-
-
-def safe_save_model_for_hf_trainer(trainer: transformers.Trainer,
- output_dir: str):
- """Collects the state dict and dump to disk."""
-
- if getattr(trainer.args, "tune_mm_mlp_adapter", False):
- # Only save Adapter
- keys_to_match = ['mm_projector']
- if getattr(trainer.args, "use_im_start_end", False):
- keys_to_match.extend(['embed_tokens', 'embed_in'])
-
- weight_to_save = get_mm_adapter_state_maybe_zero_3(trainer.model.named_parameters(), keys_to_match)
- trainer.model.config.save_pretrained(output_dir)
-
- current_folder = output_dir.split('/')[-1]
- parent_folder = os.path.dirname(output_dir)
- if trainer.args.local_rank == 0 or trainer.args.local_rank == -1:
- if current_folder.startswith('checkpoint-'):
- mm_projector_folder = os.path.join(parent_folder, "mm_projector")
- os.makedirs(mm_projector_folder, exist_ok=True)
- torch.save(weight_to_save, os.path.join(mm_projector_folder, f'{current_folder}.bin'))
- else:
- torch.save(weight_to_save, os.path.join(output_dir, f'mm_projector.bin'))
- return
-
- if trainer.deepspeed:
- torch.cuda.synchronize()
- trainer.save_model(output_dir)
- return
-
- state_dict = trainer.model.state_dict()
- if trainer.args.should_save:
- cpu_state_dict = {
- key: value.cpu()
- for key, value in state_dict.items()
- }
- del state_dict
- trainer._save(output_dir, state_dict=cpu_state_dict) # noqa
-
-
-def smart_tokenizer_and_embedding_resize(
- special_tokens_dict: Dict,
- tokenizer: transformers.PreTrainedTokenizer,
- model: transformers.PreTrainedModel,
-):
- """Resize tokenizer and embedding.
-
- Note: This is the unoptimized version that may make your embedding size not be divisible by 64.
- """
- num_new_tokens = tokenizer.add_special_tokens(special_tokens_dict)
- model.resize_token_embeddings(len(tokenizer))
-
- if num_new_tokens > 0:
- input_embeddings = model.get_input_embeddings().weight.data
- output_embeddings = model.get_output_embeddings().weight.data
-
- input_embeddings_avg = input_embeddings[:-num_new_tokens].mean(
- dim=0, keepdim=True)
- output_embeddings_avg = output_embeddings[:-num_new_tokens].mean(
- dim=0, keepdim=True)
-
- input_embeddings[-num_new_tokens:] = input_embeddings_avg
- output_embeddings[-num_new_tokens:] = output_embeddings_avg
-
-
-def _tokenize_fn(strings: Sequence[str],
- tokenizer: transformers.PreTrainedTokenizer) -> Dict:
- """Tokenize a list of strings."""
- tokenized_list = [
- tokenizer(
- text,
- return_tensors="pt",
- padding="longest",
- max_length=tokenizer.model_max_length,
- truncation=True,
- ) for text in strings
- ]
- input_ids = labels = [
- tokenized.input_ids[0] for tokenized in tokenized_list
- ]
- input_ids_lens = labels_lens = [
- tokenized.input_ids.ne(tokenizer.pad_token_id).sum().item()
- for tokenized in tokenized_list
- ]
- return dict(
- input_ids=input_ids,
- labels=labels,
- input_ids_lens=input_ids_lens,
- labels_lens=labels_lens,
- )
-
-
-def _mask_targets(target, tokenized_lens, speakers):
- # cur_idx = 0
- cur_idx = tokenized_lens[0]
- tokenized_lens = tokenized_lens[1:]
- target[:cur_idx] = IGNORE_INDEX
- for tokenized_len, speaker in zip(tokenized_lens, speakers):
- if speaker == "human":
- target[cur_idx+2:cur_idx + tokenized_len] = IGNORE_INDEX
- cur_idx += tokenized_len
-
-
-def _add_speaker_and_signal(header, source, get_conversation=True):
- """Add speaker and start/end signal on each round."""
- BEGIN_SIGNAL = "### "
- END_SIGNAL = "\n"
- conversation = header
- for sentence in source:
- from_str = sentence["from"]
- if from_str.lower() == "human":
- from_str = conversation_lib.default_conversation.roles[0]
- elif from_str.lower() == "gpt":
- from_str = conversation_lib.default_conversation.roles[1]
- else:
- from_str = 'unknown'
- sentence["value"] = (BEGIN_SIGNAL + from_str + ": " +
- sentence["value"] + END_SIGNAL)
- if get_conversation:
- conversation += sentence["value"]
- conversation += BEGIN_SIGNAL
- return conversation
-
-
-def preprocess_multimodal(sources: Sequence[str], data_args: DataArguments) -> Dict:
- is_multimodal = data_args.is_multimodal
- if not is_multimodal:
- return sources
-
- for source in sources:
- for sentence in source:
-            # NOTE: scan the token of each modality and move it to the beginning of the sentence.
- for DEFAULT_TOKEN in DEFAULT_MMODAL_TOKEN.values():
- MODAL_TYPE = None
- if DEFAULT_TOKEN in sentence['value']:
- MODAL_TYPE = DEFAULT_TOKEN[1:-1]
- sentence['value'] = sentence['value'].replace(DEFAULT_TOKEN, '').strip()
- sentence['value'] = DEFAULT_TOKEN + '\n' + sentence['value']
- sentence['value'] = sentence['value'].strip()
- if "mmtag" in conversation_lib.default_conversation.version:
-                        sentence['value'] = sentence['value'].replace(DEFAULT_TOKEN, f'<{MODAL_TYPE.capitalize()}>' + DEFAULT_TOKEN + f'</{MODAL_TYPE.capitalize()}>')
- replace_token = DEFAULT_TOKEN
- if data_args.mm_use_im_start_end and MODAL_TYPE is not None:
-                    replace_token = DEFAULT_MMODAL_START_TOKEN[MODAL_TYPE.upper()] + replace_token + DEFAULT_MMODAL_END_TOKEN[MODAL_TYPE.upper()]
- sentence["value"] = sentence["value"].replace(DEFAULT_TOKEN, replace_token)
-
- return sources
-
-
-def preprocess_llama_2(
- sources,
- tokenizer: transformers.PreTrainedTokenizer,
- MODAL_list = [],
-) -> Dict:
- conv = conversation_lib.default_conversation.copy()
- roles = {"human": conv.roles[0], "gpt": conv.roles[1]}
-
- # Apply prompt templates
- conversations = []
- for i, source in enumerate(sources):
- if roles[source[0]["from"]] != conv.roles[0]:
- # Skip the first one if it is not from human
- source = source[1:]
-
- conv.messages = []
- for j, sentence in enumerate(source):
- role = roles[sentence["from"]]
- assert role == conv.roles[j % 2], f"{i}"
- conv.append_message(role, sentence["value"])
- conversations.append(conv.get_prompt())
-
- # Tokenize conversations
- if len(MODAL_list) > 0:
- # input_ids = torch.stack([tokenizer_image_token(prompt, tokenizer, return_tensors='pt') for prompt in conversations], dim=0)
- input_ids = torch.stack([tokenizer_MMODAL_token(prompt, tokenizer, MMODAL_TOKEN_INDEX[MODAL_list[i]], return_tensors='pt') for i, prompt in enumerate(conversations)], dim=0)
- else:
- input_ids = tokenizer(
- conversations,
- return_tensors="pt",
- padding="longest",
- max_length=tokenizer.model_max_length,
- truncation=True,
- ).input_ids
-
- targets = input_ids.clone()
-
- assert conv.sep_style == conversation_lib.SeparatorStyle.LLAMA_2
-
- # Mask targets
- sep = "[/INST] "
- for idx, (conversation, target) in enumerate(zip(conversations, targets)):
- total_len = int(target.ne(tokenizer.pad_token_id).sum())
-
- rounds = conversation.split(conv.sep2)
- cur_len = 1
- target[:cur_len] = IGNORE_INDEX
- for i, rou in enumerate(rounds):
- if rou == "":
- break
-
- parts = rou.split(sep)
- if len(parts) != 2:
- break
- parts[0] += sep
-
- if len(MODAL_list) > 0:
- # round_len = len(tokenizer_image_token(rou, tokenizer))
- # instruction_len = len(tokenizer_image_token(parts[0], tokenizer)) - 2
- round_len = len(tokenizer_MMODAL_token(rou, tokenizer, MMODAL_TOKEN_INDEX[MODAL_list[idx]]))
- instruction_len = len(tokenizer_MMODAL_token(parts[0], tokenizer, MMODAL_TOKEN_INDEX[MODAL_list[idx]])) - 2
- else:
- round_len = len(tokenizer(rou).input_ids)
- instruction_len = len(tokenizer(parts[0]).input_ids) - 2
-
- target[cur_len : cur_len + instruction_len] = IGNORE_INDEX
-
- cur_len += round_len
- target[cur_len:] = IGNORE_INDEX
-
- if cur_len < tokenizer.model_max_length:
- if cur_len != total_len:
- target[:] = IGNORE_INDEX
- print(
- f"WARNING: tokenization mismatch: {cur_len} vs. {total_len}."
- f" (ignored)"
- )
-
- return dict(
- input_ids=input_ids,
- labels=targets,
- )
-
-
-def preprocess_v1(
- sources,
- tokenizer: transformers.PreTrainedTokenizer,
- MODAL_list = [],
-) -> Dict:
- conv = conversation_lib.default_conversation.copy()
- roles = {"human": conv.roles[0], "gpt": conv.roles[1]}
-
- assert len(sources) == len(MODAL_list)
- # Apply prompt templates
- conversations = []
- for i, source in enumerate(sources):
- if roles[source[0]["from"]] != conv.roles[0]:
- # Skip the first one if it is not from human
- source = source[1:]
-
- conv.messages = []
- # source is the conversations in the input data
- for j, sentence in enumerate(source):
- role = roles[sentence["from"]]
- assert role == conv.roles[j % 2], f"{i}"
- conv.append_message(role, sentence["value"])
- conversations.append(conv.get_prompt())
-
- # Tokenize conversations
- if len(MODAL_list) > 0:
- # input_ids = torch.stack([tokenizer_image_token(prompt, tokenizer, return_tensors='pt') for prompt in conversations], dim=0)
- input_ids = torch.stack([tokenizer_MMODAL_token(prompt, tokenizer, MMODAL_TOKEN_INDEX[MODAL_list[i]], return_tensors='pt') for i, prompt in enumerate(conversations)], dim=0)
- else:
- input_ids = tokenizer(
- conversations,
- return_tensors="pt",
- padding="longest",
- max_length=tokenizer.model_max_length,
- truncation=True,
- ).input_ids
-
- targets = input_ids.clone()
-
- assert conv.sep_style == conversation_lib.SeparatorStyle.TWO
-
- # Mask targets
- sep = conv.sep + conv.roles[1] + ": "
- #for conversation, target in zip(conversations, targets):
- for idx, (conversation, target) in enumerate(zip(conversations, targets)):
- total_len = int(target.ne(tokenizer.pad_token_id).sum())
-
- rounds = conversation.split(conv.sep2)
- cur_len = 1
- target[:cur_len] = IGNORE_INDEX
- for i, rou in enumerate(rounds):
- if rou == "":
- break
-
- parts = rou.split(sep)
- if len(parts) != 2:
- break
- parts[0] += sep
-
- if len(MODAL_list) > 0:
- # round_len = len(tokenizer_image_token(rou, tokenizer))
- # instruction_len = len(tokenizer_image_token(parts[0], tokenizer)) - 2
- # fix the issue of tokenization mismatch
- round_len = len(tokenizer_MMODAL_token(rou, tokenizer, MMODAL_TOKEN_INDEX[MODAL_list[idx]]))
- instruction_len = len(tokenizer_MMODAL_token(parts[0], tokenizer, MMODAL_TOKEN_INDEX[MODAL_list[idx]])) - 2
- else:
- round_len = len(tokenizer(rou).input_ids)
- instruction_len = len(tokenizer(parts[0]).input_ids) - 2
-
- target[cur_len : cur_len + instruction_len] = IGNORE_INDEX
-
- cur_len += round_len
- target[cur_len:] = IGNORE_INDEX
-
- if cur_len < tokenizer.model_max_length:
- if cur_len != total_len:
- target[:] = IGNORE_INDEX
- print(
- f"WARNING: tokenization mismatch: {cur_len} vs. {total_len}."
- f" (ignored)"
- )
-
- return dict(
- input_ids=input_ids,
- labels=targets,
- )
-
-
-def preprocess_plain(
- sources: Sequence[str],
- tokenizer: transformers.PreTrainedTokenizer,
- MODAL_list=[]
-) -> Dict:
- # add end signal and concatenate together
- conversations = []
- DEFAULT_TOKEN = DEFAULT_MMODAL_TOKEN[MODAL_list[0]]
- for source in sources:
- assert len(source) == 2
- source[0]['value'] = DEFAULT_TOKEN
- conversation = source[0]['value'] + source[1]['value'] + conversation_lib.default_conversation.sep
- conversations.append(conversation)
- # tokenize conversations
- input_ids = [tokenizer_MMODAL_token(prompt, tokenizer, MMODAL_TOKEN_INDEX[MODAL_list[0]], return_tensors='pt') for prompt in conversations]
- targets = copy.deepcopy(input_ids)
- for target, source in zip(targets, sources):
- tokenized_len = len(tokenizer_MMODAL_token(source[0]['value'], tokenizer, MMODAL_TOKEN_INDEX[MODAL_list[0]]))
- target[:tokenized_len] = IGNORE_INDEX
-
- return dict(input_ids=input_ids, labels=targets)
-
-
-def preprocess(
- sources: Sequence[str],
- tokenizer: transformers.PreTrainedTokenizer,
- MODAL_list: list = []
-) -> Dict:
- """
- Given a list of sources, each is a conversation list. This transform:
-    1. Add signal '### ' at the beginning of each sentence, with end signal '\n';
- 2. Concatenate conversations together;
- 3. Tokenize the concatenated conversation;
- 4. Make a deepcopy as the target. Mask human words with IGNORE_INDEX.
- """
- if conversation_lib.default_conversation.sep_style == conversation_lib.SeparatorStyle.PLAIN:
- return preprocess_plain(sources, tokenizer, MODAL_list)
- if conversation_lib.default_conversation.sep_style == conversation_lib.SeparatorStyle.LLAMA_2:
- return preprocess_llama_2(sources, tokenizer, MODAL_list)
- if conversation_lib.default_conversation.version.startswith("v1"):
- return preprocess_v1(sources, tokenizer, MODAL_list)
- # add end signal and concatenate together
- conversations = []
- for source in sources:
- header = f"{conversation_lib.default_conversation.system}\n\n"
- conversation = _add_speaker_and_signal(header, source)
- conversations.append(conversation)
- # tokenize conversations
- def get_tokenize_len(prompts, token_index):
- return [len(tokenizer_MMODAL_token(prompt, tokenizer, token_index)) for prompt in prompts]
-
- if len(MODAL_list) > 0:
- input_ids = [tokenizer_MMODAL_token(prompt, tokenizer, MMODAL_TOKEN_INDEX[MODAL_list[i]], return_tensors='pt') for i, prompt in enumerate(conversations)]
- else:
- conversations_tokenized = _tokenize_fn(conversations, tokenizer)
- input_ids = conversations_tokenized["input_ids"]
-
- targets = copy.deepcopy(input_ids)
- for idx, (target, source) in enumerate(zip(targets, sources)):
- if len(MODAL_list) > 0:
- tokenized_lens = get_tokenize_len([header] + [s["value"] for s in source], MODAL_list[idx])
- else:
- tokenized_lens = _tokenize_fn([header] + [s["value"] for s in source], tokenizer)["input_ids_lens"]
- speakers = [sentence["from"] for sentence in source]
- _mask_targets(target, tokenized_lens, speakers)
-
- return dict(input_ids=input_ids, labels=targets)
-
-
-class LazySupervisedDataset(Dataset):
- """Dataset for supervised fine-tuning."""
-
- def __init__(self, data_path: str,
- tokenizer: transformers.PreTrainedTokenizer,
- data_args: DataArguments):
- super(LazySupervisedDataset, self).__init__()
- list_data_dict = json.load(open(data_path, "r"))
-
- rank0_print("Formatting inputs...Skip in lazy mode")
- self.tokenizer = tokenizer
- self.list_data_dict = list_data_dict
- self.data_args = data_args
-
- def __len__(self):
- return len(self.list_data_dict)
-
- @property
- def lengths(self):
- length_list = []
- for sample in self.list_data_dict:
- img_tokens = 513 if 'image' in sample else 0
- length_list.append(sum(len(conv['value'].split()) for conv in sample['conversations']) + img_tokens)
- return length_list
-
- @property
- def modality_lengths(self):
- length_list = []
- for sample in self.list_data_dict:
- cur_len = sum(len(conv['value'].split()) for conv in sample['conversations'])
- cur_len = cur_len if 'image' in sample else -cur_len
- length_list.append(cur_len)
- return length_list
-
- def __getitem__(self, i) -> Dict[str, torch.Tensor]:
- sources = self.list_data_dict[i]
- image_processor = self.data_args.image_processor
- video_processor = self.data_args.video_processor
-
- num_frames = NUM_FRAMES if self.data_args.num_frames is None else self.data_args.num_frames
-
- if isinstance(i, int):
- sources = [sources]
- assert len(sources) == 1, "Don't know why it is wrapped to a list" # FIXME
- MODAL_list = []
- if 'image' in sources[0]:
- image_file = self.list_data_dict[i]['image']
- image_file = os.path.join(self.data_args.data_folder, image_file)
-
- try:
- image = process_image(image_file, image_processor, self.data_args.image_aspect_ratio)[0]
- except Exception as e:
- traceback.print_exc()
- backup_idx = random.randint(0, len(self.list_data_dict)-1)
-                print(f"Encountered an error while reading image {image_file}; using example {backup_idx} instead.")
- return self.__getitem__(backup_idx)
-
- sources = preprocess_multimodal(copy.deepcopy([e["conversations"] for e in sources]), self.data_args)
- MODAL_list.append('IMAGE')
- elif 'video' in sources[0]:
- video_file = self.list_data_dict[i]['video']
- video_file = os.path.join(self.data_args.data_folder, video_file)
-
- try:
- video = process_video(video_file, video_processor, self.data_args.image_aspect_ratio, num_frames)
- except Exception as e:
- traceback.print_exc()
- backup_idx = random.randint(0, len(self.list_data_dict)-1)
-                print(f"Encountered an error while reading video {video_file}; using example {backup_idx} instead.")
- return self.__getitem__(backup_idx)
-
- sources = preprocess_multimodal(copy.deepcopy([e["conversations"] for e in sources]), self.data_args)
- MODAL_list.append('VIDEO')
- else:
- sources = copy.deepcopy([e["conversations"] for e in sources])
- # NOTE: for sharegpt data in the sft stage, we use the default IMAGE as modal token
- MODAL_list.append('IMAGE')
-
- data_dict = preprocess(sources, self.tokenizer, MODAL_list=MODAL_list)
- if isinstance(i, int):
- data_dict = dict(input_ids=data_dict["input_ids"][0], labels=data_dict["labels"][0])
-
- if 'image' in self.list_data_dict[i]:
- data_dict['image'] = image
- elif 'video' in self.list_data_dict[i]:
- data_dict['video'] = video
- elif self.data_args.is_multimodal:
- # image does not exist in the data, but the model is multimodal
- crop_size = self.data_args.image_processor.crop_size
- data_dict['image'] = torch.zeros(3, crop_size['height'], crop_size['width'])
- return data_dict
-
-
-@dataclass
-class DataCollatorForSupervisedDataset(object):
- """Collate examples for supervised fine-tuning."""
-
- tokenizer: transformers.PreTrainedTokenizer
-
- def __call__(self, instances: Sequence[Dict]) -> Dict[str, torch.Tensor]:
- input_ids, labels = tuple([instance[key] for instance in instances]
- for key in ("input_ids", "labels"))
- input_ids = torch.nn.utils.rnn.pad_sequence(
- input_ids,
- batch_first=True,
- padding_value=self.tokenizer.pad_token_id)
- labels = torch.nn.utils.rnn.pad_sequence(
- labels,
- batch_first=True,
- padding_value=IGNORE_INDEX)
- input_ids = input_ids[:, :self.tokenizer.model_max_length]
- labels = labels[:, :self.tokenizer.model_max_length]
- batch = dict(
- input_ids=input_ids,
- labels=labels,
- attention_mask=input_ids.ne(self.tokenizer.pad_token_id),
- )
-
- Xs, keys = [], []
- for instance in instances:
- for x in DEFAULT_MMODAL_TOKEN.keys():
- x = x.lower()
- if x in instance:
- Xs.append(instance[x])
- keys.append(x)
- batch['images'] = [Xs, keys] # we do not change the key's name.
- return batch
-
-
-def make_supervised_data_module(tokenizer: transformers.PreTrainedTokenizer,
- data_args) -> Dict:
- """Make dataset and collator for supervised fine-tuning."""
- train_dataset = LazySupervisedDataset(
- tokenizer=tokenizer,
- data_path=data_args.data_path,
- data_args=data_args
- )
- data_collator = DataCollatorForSupervisedDataset(tokenizer=tokenizer)
- return dict(train_dataset=train_dataset,
- eval_dataset=None,
- data_collator=data_collator)
-
-
-def train(attn_implementation=None):
- global local_rank
- set_seed(42)
-
- parser = transformers.HfArgumentParser((ModelArguments, DataArguments, TrainingArguments))
- model_args, data_args, training_args = parser.parse_args_into_dataclasses()
-
- local_rank = training_args.local_rank
- compute_dtype = (torch.float16 if training_args.fp16 else (torch.bfloat16 if training_args.bf16 else torch.float32))
-
- bnb_model_from_pretrained_args = {}
- if training_args.bits in [4, 8]:
- from transformers import BitsAndBytesConfig
- bnb_model_from_pretrained_args.update(dict(
- device_map={"": training_args.device},
- load_in_4bit=training_args.bits == 4,
- load_in_8bit=training_args.bits == 8,
- quantization_config=BitsAndBytesConfig(
- load_in_4bit=training_args.bits == 4,
- load_in_8bit=training_args.bits == 8,
- llm_int8_skip_modules=["mm_projector"],
- llm_int8_threshold=6.0,
- llm_int8_has_fp16_weight=False,
- bnb_4bit_compute_dtype=compute_dtype,
- bnb_4bit_use_double_quant=training_args.double_quant,
- bnb_4bit_quant_type=training_args.quant_type # {'fp4', 'nf4'}
- )
- ))
-
- if model_args.vision_tower is not None:
- if 'mistral' in model_args.model_name_or_path.lower():
- config = transformers.AutoConfig.from_pretrained(model_args.model_name_or_path, trust_remote_code=True)
- config._attn_implementation = attn_implementation
- model = Videollama2MistralForCausalLM.from_pretrained(
- model_args.model_name_or_path,
- config=config,
- cache_dir=training_args.cache_dir,
- torch_dtype=(torch.bfloat16 if training_args.bf16 else None),
- do_sample=True,
- **bnb_model_from_pretrained_args
- )
- elif 'mixtral' in model_args.model_name_or_path.lower():
- config = transformers.AutoConfig.from_pretrained(model_args.model_name_or_path, trust_remote_code=True)
- config._attn_implementation = attn_implementation
- model = Videollama2MixtralForCausalLM.from_pretrained(
- model_args.model_name_or_path,
- config=config,
- cache_dir=training_args.cache_dir,
- torch_dtype=(torch.bfloat16 if training_args.bf16 else None),
- do_sample=True,
- **bnb_model_from_pretrained_args
- )
- import deepspeed
- deepspeed.utils.set_z3_leaf_modules(model, [MixtralSparseMoeBlock])
- else:
- config = transformers.AutoConfig.from_pretrained(model_args.model_name_or_path, trust_remote_code=True)
- config._attn_implementation = attn_implementation
- model = Videollama2LlamaForCausalLM.from_pretrained(
- model_args.model_name_or_path,
- config=config,
- cache_dir=training_args.cache_dir,
- torch_dtype=(torch.bfloat16 if training_args.bf16 else None),
- do_sample=True,
- **bnb_model_from_pretrained_args
- )
- else:
- config = transformers.AutoConfig.from_pretrained(model_args.model_name_or_path, trust_remote_code=True)
- config._attn_implementation = attn_implementation
- model = transformers.LlamaForCausalLM.from_pretrained(
- model_args.model_name_or_path,
- config=config,
- cache_dir=training_args.cache_dir,
- torch_dtype=(torch.bfloat16 if training_args.bf16 else None),
- do_sample=True,
- **bnb_model_from_pretrained_args
- )
- model.config.use_cache = False
-
- if model_args.freeze_backbone:
- model.model.requires_grad_(False)
-
- if training_args.bits in [4, 8]:
- from peft import prepare_model_for_kbit_training
- model.config.torch_dtype=(torch.float32 if training_args.fp16 else (torch.bfloat16 if training_args.bf16 else torch.float32))
- model = prepare_model_for_kbit_training(model, use_gradient_checkpointing=training_args.gradient_checkpointing)
-
- if training_args.gradient_checkpointing:
- if hasattr(model, "enable_input_require_grads"):
- model.enable_input_require_grads()
- else:
- def make_inputs_require_grad(module, input, output):
- output.requires_grad_(True)
- model.get_input_embeddings().register_forward_hook(make_inputs_require_grad)
-
- if training_args.lora_enable:
- from peft import LoraConfig, get_peft_model
- lora_config = LoraConfig(
- r=training_args.lora_r,
- lora_alpha=training_args.lora_alpha,
- target_modules=find_all_linear_names(model),
- lora_dropout=training_args.lora_dropout,
- bias=training_args.lora_bias,
- task_type="CAUSAL_LM",
- )
- if training_args.bits == 16:
- if training_args.bf16:
- model.to(torch.bfloat16)
- if training_args.fp16:
- model.to(torch.float16)
- rank0_print("Adding LoRA adapters...")
- model = get_peft_model(model, lora_config)
-
-
- tokenizer = transformers.AutoTokenizer.from_pretrained(
- model_args.model_name_or_path,
- cache_dir=training_args.cache_dir,
- model_max_length=training_args.model_max_length,
- padding_side="right",
- use_fast=True,
- )
-
- if model_args.version == "v0":
- if tokenizer.pad_token is None:
- smart_tokenizer_and_embedding_resize(
- special_tokens_dict=dict(pad_token="[PAD]"),
- tokenizer=tokenizer,
- model=model,
- )
- elif model_args.version == "v0.5":
- tokenizer.pad_token = tokenizer.unk_token
- else:
- tokenizer.pad_token = tokenizer.unk_token
- if model_args.version in conversation_lib.conv_templates:
- conversation_lib.default_conversation = conversation_lib.conv_templates[model_args.version]
- else:
- if model_args.version == "v1":
- conversation_lib.default_conversation = conversation_lib.conv_templates["vicuna_v1"]
- elif model_args.version == "v1_mistral":
- conversation_lib.default_conversation = conversation_lib.conv_templates["mistral_instruct"]
-
- if model_args.vision_tower is not None:
- # initialize vision encoder + multi-modal projector
- model.get_model().initialize_vision_modules(model_args=model_args, fsdp=training_args.fsdp)
-
- vision_tower = model.get_vision_tower()
- vision_tower.to(dtype=torch.bfloat16 if training_args.bf16 else torch.float16, device=training_args.device)
-
- data_args.image_processor = vision_tower.image_processor
- data_args.video_processor = vision_tower.video_processor if hasattr(vision_tower, "video_processor") else vision_tower.image_processor
-
- data_args.is_multimodal = True
-
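-        # Record preprocessing and tokenizer settings on the model config so they are saved with checkpoints.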
- model.config.image_aspect_ratio = data_args.image_aspect_ratio
- model.config.tokenizer_padding_side = tokenizer.padding_side
- model.config.tokenizer_model_max_length = tokenizer.model_max_length
-
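-        # When tuning only the multimodal projector, freeze every other parameter.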
- model.config.tune_mm_mlp_adapter = training_args.tune_mm_mlp_adapter = model_args.tune_mm_mlp_adapter
- if model_args.tune_mm_mlp_adapter:
- model.requires_grad_(False)
- for p in model.get_model().mm_projector.parameters():
- p.requires_grad = True
-
- model.config.freeze_mm_mlp_adapter = training_args.freeze_mm_mlp_adapter
- if training_args.freeze_mm_mlp_adapter:
- for p in model.get_model().mm_projector.parameters():
- p.requires_grad = False
-
- if training_args.bits in [4, 8]:
- model.get_model().mm_projector.to(dtype=compute_dtype, device=training_args.device)
-
- model.config.mm_use_im_start_end = data_args.mm_use_im_start_end = model_args.mm_use_im_start_end
- model.config.mm_projector_lr = training_args.mm_projector_lr
- training_args.use_im_start_end = model_args.mm_use_im_start_end
- model.config.mm_use_im_patch_token = model_args.mm_use_im_patch_token
- model.initialize_MM_tokenizer(model_args, tokenizer=tokenizer)
-
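-        # Frames sampled per video; fall back to the NUM_FRAMES default when not set on the data args.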
- model.config.num_frames = NUM_FRAMES if data_args.num_frames is None else data_args.num_frames
-
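-    # Under 4/8-bit training, keep LoRA layers and embeddings in bf16 and norms in fp32 for stability.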
- if training_args.bits in [4, 8]:
- from peft.tuners.lora import LoraLayer
- for name, module in model.named_modules():
- if isinstance(module, LoraLayer):
- if training_args.bf16:
- module = module.to(torch.bfloat16)
- if 'norm' in name:
- module = module.to(torch.float32)
- if 'lm_head' in name or 'embed_tokens' in name:
- if hasattr(module, 'weight'):
- if training_args.bf16 and module.weight.dtype == torch.float32:
- module = module.to(torch.bfloat16)
-
-    rank0_print("Current model:", model)
- data_module = make_supervised_data_module(tokenizer=tokenizer, data_args=data_args)
-    # build the VideoLLaMA2 trainer on top of the supervised data module
- trainer = VideoLLaMA2Trainer(model=model, tokenizer=tokenizer, args=training_args, **data_module)
-
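-    # Resume from the latest checkpoint if one already exists in the output directory.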
- if list(pathlib.Path(training_args.output_dir).glob("checkpoint-*")):
- trainer.train(resume_from_checkpoint=True)
- else:
- trainer.train()
- trainer.save_state()
-
- model.config.use_cache = True
-
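-    # With LoRA enabled, save adapter weights and the remaining trainable parameters separately; otherwise use the standard HF-Trainer save path.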
- if training_args.lora_enable:
- state_dict = get_peft_state_maybe_zero_3(model.named_parameters(), training_args.lora_bias)
- non_lora_state_dict = get_peft_state_non_lora_maybe_zero_3(model.named_parameters())
- if training_args.local_rank == 0 or training_args.local_rank == -1:
- model.config.save_pretrained(training_args.output_dir)
- model.save_pretrained(training_args.output_dir, state_dict=state_dict)
- torch.save(non_lora_state_dict, os.path.join(training_args.output_dir, 'non_lora_trainables.bin'))
- else:
- safe_save_model_for_hf_trainer(trainer=trainer, output_dir=training_args.output_dir)
-
-
-if __name__ == "__main__":
- train()