ClownRat committed
Commit 1d5815e • 1 Parent(s): 4049b6f

Update videollama2 codebase.

Files changed (50)
  1. VideoLLaMA2/README.md +23 -9
  2. VideoLLaMA2/pyproject.toml +1 -1
  3. VideoLLaMA2/requirements.txt +3 -2
  4. VideoLLaMA2/scripts/custom/finetune.sh +10 -11
  5. VideoLLaMA2/scripts/custom/finetune_lora.sh +10 -11
  6. VideoLLaMA2/scripts/custom/finetune_qlora.sh +10 -11
  7. VideoLLaMA2/scripts/eval/eval_video_cap_msvc.sh +16 -16
  8. VideoLLaMA2/scripts/eval/eval_video_mcqa_egoschema.sh +1 -1
  9. VideoLLaMA2/scripts/eval/eval_video_mcqa_mvbench.sh +1 -1
  10. VideoLLaMA2/scripts/eval/eval_video_mcqa_perception_test_mcqa.sh +1 -1
  11. VideoLLaMA2/scripts/eval/eval_video_mcqa_videomme.sh +1 -1
  12. VideoLLaMA2/scripts/eval/{eval_video_oqa_vcgpt_activitynet.sh → eval_video_oqa_activitynet.sh} +1 -1
  13. VideoLLaMA2/scripts/eval/{eval_video_oqa_vcgpt_msvd.sh → eval_video_oqa_msvd.sh} +1 -1
  14. VideoLLaMA2/scripts/eval/eval_video_oqa_vcgpt_1_correctness.sh +5 -5
  15. VideoLLaMA2/scripts/eval/eval_video_oqa_vcgpt_2_detail.sh +7 -7
  16. VideoLLaMA2/scripts/eval/eval_video_oqa_vcgpt_3_context.sh +4 -4
  17. VideoLLaMA2/scripts/eval/eval_video_oqa_vcgpt_4_temporal.sh +4 -4
  18. VideoLLaMA2/scripts/eval/eval_video_oqa_vcgpt_5_consistency.sh +1 -1
  19. VideoLLaMA2/scripts/siglip/finetune_gemma2.sh +0 -75
  20. VideoLLaMA2/scripts/siglip/finetune_mistral.sh +0 -75
  21. VideoLLaMA2/scripts/siglip/finetune_phi3.sh +0 -75
  22. VideoLLaMA2/scripts/siglip/finetune_qwen2.sh +0 -75
  23. VideoLLaMA2/scripts/siglip/pretrain_gemma2.sh +0 -75
  24. VideoLLaMA2/scripts/siglip/pretrain_mistral.sh +0 -75
  25. VideoLLaMA2/scripts/siglip/pretrain_phi3.sh +0 -75
  26. VideoLLaMA2/scripts/siglip/pretrain_qwen2.sh +0 -75
  27. VideoLLaMA2/scripts/vllava/finetune.sh +9 -10
  28. VideoLLaMA2/scripts/vllava/finetune_qwen2.sh +0 -74
  29. VideoLLaMA2/scripts/vllava/pretrain.sh +9 -10
  30. VideoLLaMA2/scripts/vllava/pretrain_qwen2.sh +0 -74
  31. VideoLLaMA2/videollama2/__init__.py +2 -2
  32. VideoLLaMA2/videollama2/eval/inference_video_cap_msvc.py +50 -8
  33. VideoLLaMA2/videollama2/eval/inference_video_mcqa_egoschema.py +15 -10
  34. VideoLLaMA2/videollama2/eval/inference_video_mcqa_mvbench.py +1 -1
  35. VideoLLaMA2/videollama2/eval/inference_video_mcqa_perception_test_mcqa.py +1 -1
  36. VideoLLaMA2/videollama2/eval/inference_video_mcqa_videomme.py +1 -1
  37. VideoLLaMA2/videollama2/eval/inference_video_oqa_activitynet.py +16 -8
  38. VideoLLaMA2/videollama2/mm_utils.py +2 -1
  39. VideoLLaMA2/videollama2/model/__init__.py +110 -131
  40. VideoLLaMA2/videollama2/model/encoder.py +9 -1
  41. VideoLLaMA2/videollama2/model/videollama2_arch.py +3 -0
  42. VideoLLaMA2/videollama2/model/videollama2_gemma2.py +0 -157
  43. VideoLLaMA2/videollama2/model/videollama2_llama.py +6 -6
  44. VideoLLaMA2/videollama2/model/videollama2_mistral.py +1 -1
  45. VideoLLaMA2/videollama2/model/videollama2_mixtral.py +1 -1
  46. VideoLLaMA2/videollama2/model/videollama2_phi3.py +0 -157
  47. VideoLLaMA2/videollama2/model/videollama2_qwen2.py +1 -1
  48. VideoLLaMA2/videollama2/serve/gradio_web_server_adhoc.py +21 -15
  49. VideoLLaMA2/videollama2/train.py +20 -32
  50. VideoLLaMA2/videollama2/train_flash_attn.py +0 -12
VideoLLaMA2/README.md CHANGED
@@ -19,6 +19,12 @@ VideoLLaMA 2: Advancing Spatial-Temporal Modeling and Audio Understanding in Vid
19
 
20
  </h5>
21
 
22
  <details open><summary>💡 Some other multimodal-LLM projects from our team may interest you ✨. </summary><p>
23
  <!-- may -->
24
 
@@ -36,6 +42,8 @@ VideoLLaMA 2: Advancing Spatial-Temporal Modeling and Audio Understanding in Vid
36
 
37
 
38
  ## 📰 News
 
 
39
  * **[2024.07.30]** Release checkpoints of [VideoLLaMA2-8x7B-Base](https://huggingface.co/DAMO-NLP-SG/VideoLLaMA2-8x7B-Base) and [VideoLLaMA2-8x7B](https://huggingface.co/DAMO-NLP-SG/VideoLLaMA2-8x7B).
40
  * **[2024.06.25]** 🔥🔥 As of Jun 25, our [VideoLLaMA2-7B-16F](https://huggingface.co/DAMO-NLP-SG/VideoLLaMA2-7B-16F) is the **Top-1** ~7B-sized VideoLLM on the [MLVU Leaderboard](https://github.com/JUNJIE99/MLVU?tab=readme-ov-file#trophy-mini-leaderboard).
41
  * **[2024.06.18]** 🔥🔥 As of Jun 18, our [VideoLLaMA2-7B-16F](https://huggingface.co/DAMO-NLP-SG/VideoLLaMA2-7B-16F) is the **Top-1** ~7B-sized VideoLLM on the [VideoMME Leaderboard](https://video-mme.github.io/home_page.html#leaderboard).
@@ -51,8 +59,8 @@ Basic Dependencies:
51
  * Python >= 3.8
52
  * Pytorch >= 2.2.0
53
  * CUDA Version >= 11.8
54
- * transformers >= 4.41.2 (for mistral tokenizer)
55
- * tokenizers >= 0.19.1 (for mistral tokenizer)
56
 
57
  **[Online Mode]** Install required packages (better for development):
58
  ```bash
@@ -74,11 +82,12 @@ pip install flash-attn==2.5.8 --no-build-isolation
74
  ## 🚀 Main Results
75
 
76
  ### Multi-Choice Video QA & Video Captioning
77
- <p><img src="https://github.com/DAMO-NLP-SG/VideoLLaMA2/assets/18526640/9cc4a5ae-d850-4eef-bd51-83688b94698e" width="800" "/></p>
78
-
79
 
80
  ### Open-Ended Video QA
81
- <p><img src="https://github.com/DAMO-NLP-SG/VideoLLaMA2/assets/18526640/2ed7aa53-db56-4829-8375-85aefbc5120a" width="800" "/></p>
 
 
82
 
83
  ## :earth_americas: Model Zoo
84
  | Model Name | Model Type | Visual Encoder | Language Decoder | # Training Frames |
@@ -89,6 +98,11 @@ pip install flash-attn==2.5.8 --no-build-isolation
89
  | [VideoLLaMA2-7B-16F](https://huggingface.co/DAMO-NLP-SG/VideoLLaMA2-7B-16F) | Chat | [clip-vit-large-patch14-336](https://huggingface.co/openai/clip-vit-large-patch14-336) | [Mistral-7B-Instruct-v0.2](https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.2) | 16 |
90
  | [VideoLLaMA2-8x7B-Base](https://huggingface.co/DAMO-NLP-SG/VideoLLaMA2-8x7B-Base) | Base | [clip-vit-large-patch14-336](https://huggingface.co/openai/clip-vit-large-patch14-336) | [Mixtral-8x7B-Instruct-v0.1](https://huggingface.co/mistralai/Mixtral-8x7B-Instruct-v0.1) | 8 |
91
  | [VideoLLaMA2-8x7B](https://huggingface.co/DAMO-NLP-SG/VideoLLaMA2-8x7B) | Chat | [clip-vit-large-patch14-336](https://huggingface.co/openai/clip-vit-large-patch14-336) | [Mixtral-8x7B-Instruct-v0.1](https://huggingface.co/mistralai/Mixtral-8x7B-Instruct-v0.1) | 8 |
 
92
 
93
 
94
  ## [🤗 Demo](https://huggingface.co/spaces/lixin4ever/VideoLLaMA2)
@@ -251,7 +265,7 @@ VideoLLaMA2
251
  ...
252
  --data_path datasets/custom_sft/custom.json
253
  --data_folder datasets/custom_sft/
254
- --pretrain_mm_mlp_adapter CONNECTOR_DOWNLOAD_PATH (e.g., DAMO-NLP-SG/VideoLLaMA2-7B-Base)
255
  ...
256
  ```
257
 
@@ -269,7 +283,7 @@ def inference():
269
  disable_torch_init()
270
 
271
  # Video Inference
272
- modal = 'videp'
273
  modal_path = 'assets/cat_and_chicken.mp4'
274
  instruct = 'What animals are in the video, what are they doing, and how does the video feel?'
275
  # Reply:
@@ -282,9 +296,9 @@ def inference():
282
  # Reply:
283
  # The woman in the image is wearing a black coat and sunglasses, and she is walking down a rain-soaked city street. The image feels vibrant and lively, with the bright city lights reflecting off the wet pavement, creating a visually appealing atmosphere. The woman's presence adds a sense of style and confidence to the scene, as she navigates the bustling urban environment.
284
 
285
- model_path = 'DAMO-NLP-SG/VideoLLaMA2-7B'
286
  # Base model inference (only need to replace model_path)
287
- # model_path = 'DAMO-NLP-SG/VideoLLaMA2-7B-Base'
288
  model, processor, tokenizer = model_init(model_path)
289
  output = mm_infer(processor[modal](modal_path), instruct, model=model, tokenizer=tokenizer, do_sample=False, modal=modal)
290
 
 
19
 
20
  </h5>
21
 
22
+ [![PWC](https://img.shields.io/endpoint.svg?url=https://paperswithcode.com/badge/videollama-2-advancing-spatial-temporal/zero-shot-video-question-answer-on-egoschema-1)](https://paperswithcode.com/sota/zero-shot-video-question-answer-on-egoschema-1?p=videollama-2-advancing-spatial-temporal) <br>
23
+ [![PWC](https://img.shields.io/endpoint.svg?url=https://paperswithcode.com/badge/videollama-2-advancing-spatial-temporal/video-question-answering-on-perception-test)](https://paperswithcode.com/sota/video-question-answering-on-perception-test?p=videollama-2-advancing-spatial-temporal) <br>
24
+ [![PWC](https://img.shields.io/endpoint.svg?url=https://paperswithcode.com/badge/videollama-2-advancing-spatial-temporal/video-question-answering-on-mvbench)](https://paperswithcode.com/sota/video-question-answering-on-mvbench?p=videollama-2-advancing-spatial-temporal) <br>
25
+ [![PWC](https://img.shields.io/endpoint.svg?url=https://paperswithcode.com/badge/videollama-2-advancing-spatial-temporal/zero-shot-video-question-answer-on-video-mme-1)](https://paperswithcode.com/sota/zero-shot-video-question-answer-on-video-mme-1?p=videollama-2-advancing-spatial-temporal) <br>
26
+ [![PWC](https://img.shields.io/endpoint.svg?url=https://paperswithcode.com/badge/videollama-2-advancing-spatial-temporal/zero-shot-video-question-answer-on-video-mme)](https://paperswithcode.com/sota/zero-shot-video-question-answer-on-video-mme?p=videollama-2-advancing-spatial-temporal) <br>
27
+
28
  <details open><summary>💡 Some other multimodal-LLM projects from our team may interest you ✨. </summary><p>
29
  <!-- may -->
30
 
 
42
 
43
 
44
  ## 📰 News
45
+ * **[2024.10.15]** Release checkpoints of [VideoLLaMA2.1-7B-16F-Base](https://huggingface.co/DAMO-NLP-SG/VideoLLaMA2.1-7B-16F-Base) and [VideoLLaMA2.1-7B-16F](https://huggingface.co/DAMO-NLP-SG/VideoLLaMA2.1-7B-16F)
46
+ * **[2024.08.14]** Release checkpoints of [VideoLLaMA2-72B-Base](https://huggingface.co/DAMO-NLP-SG/VideoLLaMA2-72B-Base) and [VideoLLaMA2-72B](https://huggingface.co/DAMO-NLP-SG/VideoLLaMA2-72B)
47
  * **[2024.07.30]** Release checkpoints of [VideoLLaMA2-8x7B-Base](https://huggingface.co/DAMO-NLP-SG/VideoLLaMA2-8x7B-Base) and [VideoLLaMA2-8x7B](https://huggingface.co/DAMO-NLP-SG/VideoLLaMA2-8x7B).
48
  * **[2024.06.25]** 🔥🔥 As of Jun 25, our [VideoLLaMA2-7B-16F](https://huggingface.co/DAMO-NLP-SG/VideoLLaMA2-7B-16F) is the **Top-1** ~7B-sized VideoLLM on the [MLVU Leaderboard](https://github.com/JUNJIE99/MLVU?tab=readme-ov-file#trophy-mini-leaderboard).
49
  * **[2024.06.18]** 🔥🔥 As of Jun 18, our [VideoLLaMA2-7B-16F](https://huggingface.co/DAMO-NLP-SG/VideoLLaMA2-7B-16F) is the **Top-1** ~7B-sized VideoLLM on the [VideoMME Leaderboard](https://video-mme.github.io/home_page.html#leaderboard).
 
59
  * Python >= 3.8
60
  * Pytorch >= 2.2.0
61
  * CUDA Version >= 11.8
62
+ * transformers == 4.40.0 (for reproducing paper results)
63
+ * tokenizers == 0.19.1
64
 
65
  **[Online Mode]** Install required packages (better for development):
66
  ```bash
 
82
  ## 🚀 Main Results
83
 
84
  ### Multi-Choice Video QA & Video Captioning
85
+ <p><img src="https://github.com/user-attachments/assets/e87fe4cf-07ea-4fde-998b-a0c63671c3b4" width="800" "/></p>
 
86
 
87
  ### Open-Ended Video QA
88
+ <p><img src="https://github.com/user-attachments/assets/80b16c04-75ac-43b8-bc22-6952fdf994bb" width="800" "/></p>
89
+
90
+
91
 
92
  ## :earth_americas: Model Zoo
93
  | Model Name | Model Type | Visual Encoder | Language Decoder | # Training Frames |
 
98
  | [VideoLLaMA2-7B-16F](https://huggingface.co/DAMO-NLP-SG/VideoLLaMA2-7B-16F) | Chat | [clip-vit-large-patch14-336](https://huggingface.co/openai/clip-vit-large-patch14-336) | [Mistral-7B-Instruct-v0.2](https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.2) | 16 |
99
  | [VideoLLaMA2-8x7B-Base](https://huggingface.co/DAMO-NLP-SG/VideoLLaMA2-8x7B-Base) | Base | [clip-vit-large-patch14-336](https://huggingface.co/openai/clip-vit-large-patch14-336) | [Mixtral-8x7B-Instruct-v0.1](https://huggingface.co/mistralai/Mixtral-8x7B-Instruct-v0.1) | 8 |
100
  | [VideoLLaMA2-8x7B](https://huggingface.co/DAMO-NLP-SG/VideoLLaMA2-8x7B) | Chat | [clip-vit-large-patch14-336](https://huggingface.co/openai/clip-vit-large-patch14-336) | [Mixtral-8x7B-Instruct-v0.1](https://huggingface.co/mistralai/Mixtral-8x7B-Instruct-v0.1) | 8 |
101
+ | [VideoLLaMA2-72B-Base](https://huggingface.co/DAMO-NLP-SG/VideoLLaMA2-72B-Base) | Base | [clip-vit-large-patch14-336](https://huggingface.co/openai/clip-vit-large-patch14-336) | [Qwen2-72B-Instruct](https://huggingface.co/Qwen/Qwen2-72B-Instruct) | 8 |
102
+ | [VideoLLaMA2-72B](https://huggingface.co/DAMO-NLP-SG/VideoLLaMA2-72B) | Chat | [clip-vit-large-patch14-336](https://huggingface.co/openai/clip-vit-large-patch14-336) | [Qwen2-72B-Instruct](https://huggingface.co/Qwen/Qwen2-72B-Instruct) | 8 |
103
+ | [VideoLLaMA2.1-7B-16F-Base](https://huggingface.co/DAMO-NLP-SG/VideoLLaMA2.1-7B-16F-Base) | Base | [siglip-so400m-patch14-384](https://huggingface.co/google/siglip-so400m-patch14-384) | [Qwen2-7B-Instruct](https://huggingface.co/Qwen/Qwen2-7B-Instruct) | 16 |
104
+ | [VideoLLaMA2.1-7B-16F](https://huggingface.co/DAMO-NLP-SG/VideoLLaMA2.1-7B-16F) | Chat | [siglip-so400m-patch14-384](https://huggingface.co/google/siglip-so400m-patch14-384) | [Qwen2-7B-Instruct](https://huggingface.co/Qwen/Qwen2-7B-Instruct) | 16 |
105
+
106
 
107
 
108
  ## [🤗 Demo](https://huggingface.co/spaces/lixin4ever/VideoLLaMA2)
 
265
  ...
266
  --data_path datasets/custom_sft/custom.json
267
  --data_folder datasets/custom_sft/
268
+ --pretrain_mm_mlp_adapter CONNECTOR_DOWNLOAD_PATH (e.g., DAMO-NLP-SG/VideoLLaMA2.1-7B-16F-Base)
269
  ...
270
  ```
271
 
 
283
  disable_torch_init()
284
 
285
  # Video Inference
286
+ modal = 'video'
287
  modal_path = 'assets/cat_and_chicken.mp4'
288
  instruct = 'What animals are in the video, what are they doing, and how does the video feel?'
289
  # Reply:
 
296
  # Reply:
297
  # The woman in the image is wearing a black coat and sunglasses, and she is walking down a rain-soaked city street. The image feels vibrant and lively, with the bright city lights reflecting off the wet pavement, creating a visually appealing atmosphere. The woman's presence adds a sense of style and confidence to the scene, as she navigates the bustling urban environment.
298
 
299
+ model_path = 'DAMO-NLP-SG/VideoLLaMA2.1-7B-16F'
300
  # Base model inference (only need to replace model_path)
301
+ # model_path = 'DAMO-NLP-SG/VideoLLaMA2.1-7B-16F-Base'
302
  model, processor, tokenizer = model_init(model_path)
303
  output = mm_infer(processor[modal](modal_path), instruct, model=model, tokenizer=tokenizer, do_sample=False, modal=modal)
304
 
VideoLLaMA2/pyproject.toml CHANGED
@@ -14,7 +14,7 @@ classifiers = [
14
  ]
15
  dependencies = [
16
  "torch==2.2.0", "torchvision==0.17.0",
17
- "transformers==4.42.3", "tokenizers==0.19.1",
18
  "deepspeed==0.13.1", "accelerate==0.26.1",
19
  "peft==0.4.0", "timm==1.0.3", "numpy==1.24.4",
20
  "decord==0.6.0", "imageio==2.34.0", "imageio-ffmpeg==0.4.9",
 
14
  ]
15
  dependencies = [
16
  "torch==2.2.0", "torchvision==0.17.0",
17
+ "transformers==4.40.0", "tokenizers==0.19.1",
18
  "deepspeed==0.13.1", "accelerate==0.26.1",
19
  "peft==0.4.0", "timm==1.0.3", "numpy==1.24.4",
20
  "decord==0.6.0", "imageio==2.34.0", "imageio-ffmpeg==0.4.9",
VideoLLaMA2/requirements.txt CHANGED
@@ -2,7 +2,7 @@
2
  # basic dependencies
3
  torch==2.2.0
4
  torchvision==0.17.0
5
- transformers==4.42.3
6
  tokenizers==0.19.1
7
  deepspeed==0.13.1
8
  accelerate==0.26.1
@@ -36,4 +36,5 @@ uvicorn
36
  fastapi
37
  tensorboard
38
  wandb
39
- tabulate
 
 
2
  # basic dependencies
3
  torch==2.2.0
4
  torchvision==0.17.0
5
+ transformers==4.40.0
6
  tokenizers==0.19.1
7
  deepspeed==0.13.1
8
  accelerate==0.26.1
 
36
  fastapi
37
  tensorboard
38
  wandb
39
+ tabulate
40
+ spaces==0.29.2
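
The dependency pins above move transformers from 4.42.3 back to 4.40.0 (the version used to reproduce the paper results), keep tokenizers at 0.19.1, and add spaces==0.29.2 (presumably for the Hugging Face Spaces demo). A minimal install sketch using only versions stated in this diff; the final editable-install step is an assumption based on the repo's usual pip workflow, not part of the commit:

```bash
# Sketch of an environment matching the pins in this commit.
# All version numbers are taken from requirements.txt / pyproject.toml above.
pip install torch==2.2.0 torchvision==0.17.0
pip install transformers==4.40.0 tokenizers==0.19.1
pip install deepspeed==0.13.1 accelerate==0.26.1 peft==0.4.0
pip install flash-attn==2.5.8 --no-build-isolation   # as in the README's install section
pip install -e .   # assumed: install the VideoLLaMA2 package itself in editable mode
```
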
VideoLLaMA2/scripts/custom/finetune.sh CHANGED
@@ -5,7 +5,7 @@ ARG_WORLD_SIZE=${1:-1}
5
  ARG_NPROC_PER_NODE=${2:-8}
6
  ARG_MASTER_ADDR="127.0.0.1"
7
  ARG_MASTER_PORT=16666
8
- ARG_RANK=0
9
 
10
  # Multiple conditions
11
  if [ ! -n "$WORLD_SIZE" ] || [ ! -n "$NPROC_PER_NODE" ]; then
@@ -28,8 +28,8 @@ GRADIENT_ACCUMULATION_STEPS=$[$GLOBAL_BATCH_SIZE/($WORLD_SIZE*$NPROC_PER_NODE*$L
28
 
29
  # Log Arguments
30
  export TRANSFORMERS_OFFLINE=1
31
- export WANDB_PROJECT=videollama2
32
- RUN_NAME=downstream_sft_settings
33
  DATA_DIR=datasets
34
  OUTP_DIR=work_dirs
35
 
@@ -38,18 +38,18 @@ torchrun --nnodes $WORLD_SIZE \
38
  --master_addr=$MASTER_ADDR \
39
  --master_port=$MASTER_PORT \
40
  --node_rank $RANK \
41
- videollama2/train_flash_attn.py \
42
  --deepspeed scripts/zero3.json \
43
- --model_type videollama2 \
44
- --model_path mistralai/Mistral-7B-Instruct-v0.2 \
45
- --vision_tower openai/clip-vit-large-patch14-336 \
46
- --mm_projector_type stc_connector \
47
- --pretrain_mm_mlp_adapter DAMO-NLP-SG/VideoLLaMA2-7B-Base/mm_projector.bin \
48
  --data_path ${DATA_DIR}/videollava_sft/videochatgpt_llavaimage_tune.json \
49
  --data_folder ${DATA_DIR}/videollava_sft/ \
50
  --mm_vision_select_layer -2 \
51
  --image_aspect_ratio pad \
52
- --num_frames 8 \
53
  --bf16 True \
54
  --tf32 True \
55
  --fp16 False \
@@ -58,7 +58,6 @@ torchrun --nnodes $WORLD_SIZE \
58
  --per_device_train_batch_size $LOCAL_BATCH_SIZE \
59
  --per_device_eval_batch_size 4 \
60
  --gradient_accumulation_steps $GRADIENT_ACCUMULATION_STEPS \
61
- --evaluation_strategy "no" \
62
  --save_strategy "steps" \
63
  --save_steps 500 \
64
  --save_total_limit 99 \
 
5
  ARG_NPROC_PER_NODE=${2:-8}
6
  ARG_MASTER_ADDR="127.0.0.1"
7
  ARG_MASTER_PORT=16666
8
+ ARG_RANK=${3:-0}
9
 
10
  # Multiple conditions
11
  if [ ! -n "$WORLD_SIZE" ] || [ ! -n "$NPROC_PER_NODE" ]; then
 
28
 
29
  # Log Arguments
30
  export TRANSFORMERS_OFFLINE=1
31
+ export WANDB_PROJECT=videollama2qwen2_downstream_sft
32
+ RUN_NAME=siglip_tcv35_7b_16f
33
  DATA_DIR=datasets
34
  OUTP_DIR=work_dirs
35
 
 
38
  --master_addr=$MASTER_ADDR \
39
  --master_port=$MASTER_PORT \
40
  --node_rank $RANK \
41
+ videollama2/train.py \
42
  --deepspeed scripts/zero3.json \
43
+ --model_type videollama2_qwen2 \
44
+ --model_path Qwen/Qwen2-7B-Instruct \
45
+ --vision_tower google/siglip-so400m-patch14-384 \
46
+ --mm_projector_type stc_connector_v35 \
47
+ --pretrain_mm_mlp_adapter DAMO-NLP-SG/VideoLLaMA2.1-7B-16F-Base/mm_projector.bin \
48
  --data_path ${DATA_DIR}/videollava_sft/videochatgpt_llavaimage_tune.json \
49
  --data_folder ${DATA_DIR}/videollava_sft/ \
50
  --mm_vision_select_layer -2 \
51
  --image_aspect_ratio pad \
52
+ --num_frames 16 \
53
  --bf16 True \
54
  --tf32 True \
55
  --fp16 False \
 
58
  --per_device_train_batch_size $LOCAL_BATCH_SIZE \
59
  --per_device_eval_batch_size 4 \
60
  --gradient_accumulation_steps $GRADIENT_ACCUMULATION_STEPS \
 
61
  --save_strategy "steps" \
62
  --save_steps 500 \
63
  --save_total_limit 99 \
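
The rewritten finetune.sh now takes the node rank as a third positional argument (ARG_RANK=${3:-0}) instead of hard-coding 0, and launches videollama2/train.py with the Qwen2-7B-Instruct / SigLIP / stc_connector_v35 configuration. A hedged usage sketch of that positional interface; the two-node layout below is illustrative, not a setup prescribed by the repo:

```bash
# Single node, 8 GPUs: rely on the defaults WORLD_SIZE=1, NPROC_PER_NODE=8, RANK=0.
bash scripts/custom/finetune.sh

# Hypothetical 2-node run: pass WORLD_SIZE, GPUs per node, and the per-node rank.
# MASTER_ADDR / MASTER_PORT can be exported beforehand to override the
# 127.0.0.1:16666 defaults baked into the script.
bash scripts/custom/finetune.sh 2 8 0   # on node 0 (the master)
bash scripts/custom/finetune.sh 2 8 1   # on node 1
```
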
VideoLLaMA2/scripts/custom/finetune_lora.sh CHANGED
@@ -5,7 +5,7 @@ ARG_WORLD_SIZE=${1:-1}
5
  ARG_NPROC_PER_NODE=${2:-8}
6
  ARG_MASTER_ADDR="127.0.0.1"
7
  ARG_MASTER_PORT=16666
8
- ARG_RANK=0
9
 
10
  # Multiple conditions
11
  if [ ! -n "$WORLD_SIZE" ] || [ ! -n "$NPROC_PER_NODE" ]; then
@@ -28,8 +28,8 @@ GRADIENT_ACCUMULATION_STEPS=$[$GLOBAL_BATCH_SIZE/($WORLD_SIZE*$NPROC_PER_NODE*$L
28
 
29
  # Log Arguments
30
  export TRANSFORMERS_OFFLINE=1
31
- export WANDB_PROJECT=videollama2
32
- RUN_NAME=downstream_sft_settings_lora
33
  DATA_DIR=datasets
34
  OUTP_DIR=work_dirs
35
 
@@ -38,19 +38,19 @@ torchrun --nnodes $WORLD_SIZE \
38
  --master_addr=$MASTER_ADDR \
39
  --master_port=$MASTER_PORT \
40
  --node_rank $RANK \
41
- videollama2/train_flash_attn.py \
42
  --lora_enable True --lora_r 128 --lora_alpha 256 --mm_projector_lr 2e-5 \
43
  --deepspeed scripts/zero3.json \
44
- --model_type videollama2 \
45
- --model_path mistralai/Mistral-7B-Instruct-v0.2 \
46
- --vision_tower openai/clip-vit-large-patch14-336 \
47
- --mm_projector_type stc_connector \
48
- --pretrain_mm_mlp_adapter DAMO-NLP-SG/VideoLLaMA2-7B-Base/mm_projector.bin \
49
  --data_path ${DATA_DIR}/videollava_sft/videochatgpt_llavaimage_tune.json \
50
  --data_folder ${DATA_DIR}/videollava_sft/ \
51
  --mm_vision_select_layer -2 \
52
  --image_aspect_ratio pad \
53
- --num_frames 8 \
54
  --bf16 True \
55
  --tf32 True \
56
  --fp16 False \
@@ -59,7 +59,6 @@ torchrun --nnodes $WORLD_SIZE \
59
  --per_device_train_batch_size $LOCAL_BATCH_SIZE \
60
  --per_device_eval_batch_size 4 \
61
  --gradient_accumulation_steps $GRADIENT_ACCUMULATION_STEPS \
62
- --evaluation_strategy "no" \
63
  --save_strategy "steps" \
64
  --save_steps 500 \
65
  --save_total_limit 99 \
 
5
  ARG_NPROC_PER_NODE=${2:-8}
6
  ARG_MASTER_ADDR="127.0.0.1"
7
  ARG_MASTER_PORT=16666
8
+ ARG_RANK=${3:-0}
9
 
10
  # Multiple conditions
11
  if [ ! -n "$WORLD_SIZE" ] || [ ! -n "$NPROC_PER_NODE" ]; then
 
28
 
29
  # Log Arguments
30
  export TRANSFORMERS_OFFLINE=1
31
+ export WANDB_PROJECT=videollama2qwen2_downstream_sft
32
+ RUN_NAME=siglip_tcv35_7b_16f_lora
33
  DATA_DIR=datasets
34
  OUTP_DIR=work_dirs
35
 
 
38
  --master_addr=$MASTER_ADDR \
39
  --master_port=$MASTER_PORT \
40
  --node_rank $RANK \
41
+ videollama2/train.py \
42
  --lora_enable True --lora_r 128 --lora_alpha 256 --mm_projector_lr 2e-5 \
43
  --deepspeed scripts/zero3.json \
44
+ --model_type videollama2_qwen2 \
45
+ --model_path Qwen/Qwen2-7B-Instruct \
46
+ --vision_tower google/siglip-so400m-patch14-384 \
47
+ --mm_projector_type stc_connector_v35 \
48
+ --pretrain_mm_mlp_adapter DAMO-NLP-SG/VideoLLaMA2.1-7B-16F-Base/mm_projector.bin \
49
  --data_path ${DATA_DIR}/videollava_sft/videochatgpt_llavaimage_tune.json \
50
  --data_folder ${DATA_DIR}/videollava_sft/ \
51
  --mm_vision_select_layer -2 \
52
  --image_aspect_ratio pad \
53
+ --num_frames 16 \
54
  --bf16 True \
55
  --tf32 True \
56
  --fp16 False \
 
59
  --per_device_train_batch_size $LOCAL_BATCH_SIZE \
60
  --per_device_eval_batch_size 4 \
61
  --gradient_accumulation_steps $GRADIENT_ACCUMULATION_STEPS \
 
62
  --save_strategy "steps" \
63
  --save_steps 500 \
64
  --save_total_limit 99 \
VideoLLaMA2/scripts/custom/finetune_qlora.sh CHANGED
@@ -5,7 +5,7 @@ ARG_WORLD_SIZE=${1:-1}
5
  ARG_NPROC_PER_NODE=${2:-8}
6
  ARG_MASTER_ADDR="127.0.0.1"
7
  ARG_MASTER_PORT=16666
8
- ARG_RANK=0
9
 
10
  # Multiple conditions
11
  if [ ! -n "$WORLD_SIZE" ] || [ ! -n "$NPROC_PER_NODE" ]; then
@@ -28,8 +28,8 @@ GRADIENT_ACCUMULATION_STEPS=$[$GLOBAL_BATCH_SIZE/($WORLD_SIZE*$NPROC_PER_NODE*$L
28
 
29
  # Log Arguments
30
  export TRANSFORMERS_OFFLINE=1
31
- export WANDB_PROJECT=videollama2
32
- RUN_NAME=downstream_sft_settings_qlora
33
  DATA_DIR=datasets
34
  OUTP_DIR=work_dirs
35
 
@@ -38,19 +38,19 @@ torchrun --nnodes $WORLD_SIZE \
38
  --master_addr=$MASTER_ADDR \
39
  --master_port=$MASTER_PORT \
40
  --node_rank $RANK \
41
- videollama2/train_flash_attn.py \
42
  --lora_enable True --lora_r 128 --lora_alpha 256 --mm_projector_lr 2e-5 --bits 4 \
43
  --deepspeed scripts/zero2.json \
44
- --model_type videollama2 \
45
- --model_path mistralai/Mistral-7B-Instruct-v0.2 \
46
- --vision_tower openai/clip-vit-large-patch14-336 \
47
- --mm_projector_type stc_connector \
48
- --pretrain_mm_mlp_adapter DAMO-NLP-SG/VideoLLaMA2-7B-Base/mm_projector.bin \
49
  --data_path ${DATA_DIR}/videollava_sft/videochatgpt_llavaimage_tune.json \
50
  --data_folder ${DATA_DIR}/videollava_sft/ \
51
  --mm_vision_select_layer -2 \
52
  --image_aspect_ratio pad \
53
- --num_frames 8 \
54
  --bf16 True \
55
  --tf32 True \
56
  --fp16 False \
@@ -59,7 +59,6 @@ torchrun --nnodes $WORLD_SIZE \
59
  --per_device_train_batch_size $LOCAL_BATCH_SIZE \
60
  --per_device_eval_batch_size 4 \
61
  --gradient_accumulation_steps $GRADIENT_ACCUMULATION_STEPS \
62
- --evaluation_strategy "no" \
63
  --save_strategy "steps" \
64
  --save_steps 500 \
65
  --save_total_limit 99 \
 
5
  ARG_NPROC_PER_NODE=${2:-8}
6
  ARG_MASTER_ADDR="127.0.0.1"
7
  ARG_MASTER_PORT=16666
8
+ ARG_RANK=${3:-0}
9
 
10
  # Multiple conditions
11
  if [ ! -n "$WORLD_SIZE" ] || [ ! -n "$NPROC_PER_NODE" ]; then
 
28
 
29
  # Log Arguments
30
  export TRANSFORMERS_OFFLINE=1
31
+ export WANDB_PROJECT=videollama2qwen2_downstream_sft
32
+ RUN_NAME=siglip_tcv35_7b_16f_qlora
33
  DATA_DIR=datasets
34
  OUTP_DIR=work_dirs
35
 
 
38
  --master_addr=$MASTER_ADDR \
39
  --master_port=$MASTER_PORT \
40
  --node_rank $RANK \
41
+ videollama2/train.py \
42
  --lora_enable True --lora_r 128 --lora_alpha 256 --mm_projector_lr 2e-5 --bits 4 \
43
  --deepspeed scripts/zero2.json \
44
+ --model_type videollama2_qwen2 \
45
+ --model_path Qwen/Qwen2-7B-Instruct \
46
+ --vision_tower google/siglip-so400m-patch14-384 \
47
+ --mm_projector_type stc_connector_v35 \
48
+ --pretrain_mm_mlp_adapter DAMO-NLP-SG/VideoLLaMA2.1-7B-16F-Base/mm_projector.bin \
49
  --data_path ${DATA_DIR}/videollava_sft/videochatgpt_llavaimage_tune.json \
50
  --data_folder ${DATA_DIR}/videollava_sft/ \
51
  --mm_vision_select_layer -2 \
52
  --image_aspect_ratio pad \
53
+ --num_frames 16 \
54
  --bf16 True \
55
  --tf32 True \
56
  --fp16 False \
 
59
  --per_device_train_batch_size $LOCAL_BATCH_SIZE \
60
  --per_device_eval_batch_size 4 \
61
  --gradient_accumulation_steps $GRADIENT_ACCUMULATION_STEPS \
 
62
  --save_strategy "steps" \
63
  --save_steps 500 \
64
  --save_total_limit 99 \
VideoLLaMA2/scripts/eval/eval_video_cap_msvc.sh CHANGED
@@ -2,7 +2,7 @@ set -x
2
 
3
  EVAL_DATA_DIR=eval
4
  OUTPUT_DIR=eval_output
5
- CKPT=DAMO-NLP-SG/VideoLLaMA2-7B
6
  CKPT_NAME=$(echo $CKPT | rev | cut -d'/' -f1 | rev)
7
 
8
  gpu_list="${CUDA_VISIBLE_DEVICES:-0}"
@@ -12,11 +12,11 @@ IFS=',' read -ra GPULIST <<< "$gpu_list"
12
  GPUS_PER_TASK=1
13
  CHUNKS=$((${#GPULIST[@]}/$GPUS_PER_TASK))
14
 
15
- output_file=${OUTPUT_DIR}/MSVC/answers/${CKPT_NAME}/merge.json
16
 
17
  # judge if the number of json lines is 0
18
  if [ ! -f "$output_file" ] || [ $(cat "$output_file" | wc -l) -eq 0 ]; then
19
- rm -f ${OUTPUT_DIR}/MSVC/answers/${CKPT_NAME}/*.json
20
  fi
21
 
22
  if [ ! -f "$output_file" ]; then
@@ -25,9 +25,9 @@ if [ ! -f "$output_file" ]; then
25
  gpu_devices=$(IFS=,; echo "${GPULIST[*]:$(($IDX*$GPUS_PER_TASK)):$GPUS_PER_TASK}")
26
  TRANSFORMERS_OFFLINE=1 CUDA_VISIBLE_DEVICES=${gpu_devices} python3 videollama2/eval/inference_video_cap_msvc.py \
27
  --model-path ${CKPT} \
28
- --video-folder ${EVAL_DATA_DIR}/MSVC \
29
- --question-file ${EVAL_DATA_DIR}/MSVC/msvc.json \
30
- --output-file ${OUTPUT_DIR}/MSVC/answers/${CKPT_NAME}/${CHUNKS}_${IDX}.json \
31
  --num-chunks $CHUNKS \
32
  --chunk-idx $IDX &
33
  done
@@ -39,28 +39,28 @@ if [ ! -f "$output_file" ]; then
39
 
40
  #Loop through the indices and concatenate each file.
41
  for IDX in $(seq 0 $((CHUNKS-1))); do
42
- cat ${OUTPUT_DIR}/MSVC/answers/${CKPT_NAME}/${CHUNKS}_${IDX}.json >> "$output_file"
43
  done
44
  fi
45
 
46
 
47
- AZURE_API_KEY=""
48
- AZURE_API_ENDPOINT=""
49
- AZURE_API_DEPLOYNAME=""
50
 
51
- python3 videollama2/new_eval/eval_video_cap_msvc_correctness.py \
52
  --pred-path $output_file \
53
- --output-dir ${OUTPUT_DIR}/MSVC/answers/${CKPT_NAME}/correctness_gpt \
54
- --output-json ${OUTPUT_DIR}/MSVC/answers/${CKPT_NAME}/correctness_results.json \
55
  --api-key $AZURE_API_KEY \
56
  --api-endpoint $AZURE_API_ENDPOINT \
57
  --api-deployname $AZURE_API_DEPLOYNAME \
58
  --num-tasks 4 \
59
 
60
- python3 videollama2/new_eval/eval_video_cap_msvc_detailedness.py \
61
  --pred-path $output_file \
62
- --output-dir ${OUTPUT_DIR}/MSVC/answers/${CKPT_NAME}/detailedness_gpt \
63
- --output-json ${OUTPUT_DIR}/MSVC/answers/${CKPT_NAME}/detailedness_results.json \
64
  --api-key $AZURE_API_KEY \
65
  --api-endpoint $AZURE_API_ENDPOINT \
66
  --api-deployname $AZURE_API_DEPLOYNAME \
 
2
 
3
  EVAL_DATA_DIR=eval
4
  OUTPUT_DIR=eval_output
5
+ CKPT=DAMO-NLP-SG/VideoLLaMA2.1-7B-16F
6
  CKPT_NAME=$(echo $CKPT | rev | cut -d'/' -f1 | rev)
7
 
8
  gpu_list="${CUDA_VISIBLE_DEVICES:-0}"
 
12
  GPUS_PER_TASK=1
13
  CHUNKS=$((${#GPULIST[@]}/$GPUS_PER_TASK))
14
 
15
+ output_file=${OUTPUT_DIR}/msvc/answers/${CKPT_NAME}/merge.json
16
 
17
  # judge if the number of json lines is 0
18
  if [ ! -f "$output_file" ] || [ $(cat "$output_file" | wc -l) -eq 0 ]; then
19
+ rm -f ${OUTPUT_DIR}/msvc/answers/${CKPT_NAME}/*.json
20
  fi
21
 
22
  if [ ! -f "$output_file" ]; then
 
25
  gpu_devices=$(IFS=,; echo "${GPULIST[*]:$(($IDX*$GPUS_PER_TASK)):$GPUS_PER_TASK}")
26
  TRANSFORMERS_OFFLINE=1 CUDA_VISIBLE_DEVICES=${gpu_devices} python3 videollama2/eval/inference_video_cap_msvc.py \
27
  --model-path ${CKPT} \
28
+ --video-folder ${EVAL_DATA_DIR}/msvc \
29
+ --question-file ${EVAL_DATA_DIR}/msvc/msvc.json \
30
+ --output-file ${OUTPUT_DIR}/msvc/answers/${CKPT_NAME}/${CHUNKS}_${IDX}.json \
31
  --num-chunks $CHUNKS \
32
  --chunk-idx $IDX &
33
  done
 
39
 
40
  #Loop through the indices and concatenate each file.
41
  for IDX in $(seq 0 $((CHUNKS-1))); do
42
+ cat ${OUTPUT_DIR}/msvc/answers/${CKPT_NAME}/${CHUNKS}_${IDX}.json >> "$output_file"
43
  done
44
  fi
45
 
46
 
47
+ AZURE_API_KEY=your_key
48
+ AZURE_API_ENDPOINT=your_endpoint
49
+ AZURE_API_DEPLOYNAME=your_deployname
50
 
51
+ python3 videollama2/eval/eval_video_cap_msvc_correctness.py \
52
  --pred-path $output_file \
53
+ --output-dir ${OUTPUT_DIR}/msvc/answers/${CKPT_NAME}/correctness_gpt \
54
+ --output-json ${OUTPUT_DIR}/msvc/answers/${CKPT_NAME}/correctness_results.json \
55
  --api-key $AZURE_API_KEY \
56
  --api-endpoint $AZURE_API_ENDPOINT \
57
  --api-deployname $AZURE_API_DEPLOYNAME \
58
  --num-tasks 4 \
59
 
60
+ python3 videollama2/eval/eval_video_cap_msvc_detailedness.py \
61
  --pred-path $output_file \
62
+ --output-dir ${OUTPUT_DIR}/msvc/answers/${CKPT_NAME}/detailedness_gpt \
63
+ --output-json ${OUTPUT_DIR}/msvc/answers/${CKPT_NAME}/detailedness_results.json \
64
  --api-key $AZURE_API_KEY \
65
  --api-endpoint $AZURE_API_ENDPOINT \
66
  --api-deployname $AZURE_API_DEPLOYNAME \
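
This script shards MSVC caption inference across the GPUs listed in CUDA_VISIBLE_DEVICES (one chunk per GPU via --num-chunks / --chunk-idx), concatenates the per-chunk JSON files into merge.json, and then runs the two GPT-based judges, which now read Azure credentials from the placeholder variables instead of hard-coded values. A hedged invocation sketch; the GPU list is an example, not a repo default:

```bash
# Fill in AZURE_API_KEY, AZURE_API_ENDPOINT and AZURE_API_DEPLOYNAME inside the
# script first, then shard inference over four GPUs; per-chunk answers and the
# merged file land under eval_output/msvc/answers/<CKPT_NAME>/.
CUDA_VISIBLE_DEVICES=0,1,2,3 bash scripts/eval/eval_video_cap_msvc.sh
```
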
VideoLLaMA2/scripts/eval/eval_video_mcqa_egoschema.sh CHANGED
@@ -2,7 +2,7 @@ set -x
2
 
3
  EVAL_DATA_DIR=eval
4
  OUTPUT_DIR=eval_output
5
- CKPT=DAMO-NLP-SG/VideoLLaMA2-7B
6
  CKPT_NAME=$(echo $CKPT | rev | cut -d'/' -f1 | rev)
7
 
8
  gpu_list="${CUDA_VISIBLE_DEVICES:-0}"
 
2
 
3
  EVAL_DATA_DIR=eval
4
  OUTPUT_DIR=eval_output
5
+ CKPT=DAMO-NLP-SG/VideoLLaMA2.1-7B-16F
6
  CKPT_NAME=$(echo $CKPT | rev | cut -d'/' -f1 | rev)
7
 
8
  gpu_list="${CUDA_VISIBLE_DEVICES:-0}"
VideoLLaMA2/scripts/eval/eval_video_mcqa_mvbench.sh CHANGED
@@ -2,7 +2,7 @@ set -x
2
 
3
  EVAL_DATA_DIR=eval
4
  OUTPUT_DIR=eval_output
5
- CKPT=DAMO-NLP-SG/VideoLLaMA2-7B
6
  CKPT_NAME=$(echo $CKPT | rev | cut -d'/' -f1 | rev)
7
 
8
  gpu_list="${CUDA_VISIBLE_DEVICES:-0}"
 
2
 
3
  EVAL_DATA_DIR=eval
4
  OUTPUT_DIR=eval_output
5
+ CKPT=DAMO-NLP-SG/VideoLLaMA2.1-7B-16F
6
  CKPT_NAME=$(echo $CKPT | rev | cut -d'/' -f1 | rev)
7
 
8
  gpu_list="${CUDA_VISIBLE_DEVICES:-0}"
VideoLLaMA2/scripts/eval/eval_video_mcqa_perception_test_mcqa.sh CHANGED
@@ -2,7 +2,7 @@ set -x
2
 
3
  EVAL_DATA_DIR=eval
4
  OUTPUT_DIR=eval_output
5
- CKPT=DAMO-NLP-SG/VideoLLaMA2-7B
6
  CKPT_NAME=$(echo $CKPT | rev | cut -d'/' -f1 | rev)
7
 
8
  gpu_list="${CUDA_VISIBLE_DEVICES:-0}"
 
2
 
3
  EVAL_DATA_DIR=eval
4
  OUTPUT_DIR=eval_output
5
+ CKPT=DAMO-NLP-SG/VideoLLaMA2.1-7B-16F
6
  CKPT_NAME=$(echo $CKPT | rev | cut -d'/' -f1 | rev)
7
 
8
  gpu_list="${CUDA_VISIBLE_DEVICES:-0}"
VideoLLaMA2/scripts/eval/eval_video_mcqa_videomme.sh CHANGED
@@ -2,7 +2,7 @@ set -x
2
 
3
  EVAL_DATA_DIR=eval
4
  OUTPUT_DIR=eval_output
5
- CKPT=DAMO-NLP-SG/VideoLLaMA2-7B-16F
6
  CKPT_NAME=$(echo $CKPT | rev | cut -d'/' -f1 | rev)
7
 
8
  gpu_list="${CUDA_VISIBLE_DEVICES:-0}"
 
2
 
3
  EVAL_DATA_DIR=eval
4
  OUTPUT_DIR=eval_output
5
+ CKPT=DAMO-NLP-SG/VideoLLaMA2.1-7B-16F
6
  CKPT_NAME=$(echo $CKPT | rev | cut -d'/' -f1 | rev)
7
 
8
  gpu_list="${CUDA_VISIBLE_DEVICES:-0}"
VideoLLaMA2/scripts/eval/{eval_video_oqa_vcgpt_activitynet.sh β†’ eval_video_oqa_activitynet.sh} RENAMED
@@ -2,7 +2,7 @@ set -x
2
 
3
  EVAL_DATA_DIR=eval
4
  OUTPUT_DIR=eval_output
5
- CKPT=DAMO-NLP-SG/VideoLLaMA2-7B
6
  CKPT_NAME=$(echo $CKPT | rev | cut -d'/' -f1 | rev)
7
 
8
  gpu_list="${CUDA_VISIBLE_DEVICES:-0}"
 
2
 
3
  EVAL_DATA_DIR=eval
4
  OUTPUT_DIR=eval_output
5
+ CKPT=DAMO-NLP-SG/VideoLLaMA2.1-7B-16F
6
  CKPT_NAME=$(echo $CKPT | rev | cut -d'/' -f1 | rev)
7
 
8
  gpu_list="${CUDA_VISIBLE_DEVICES:-0}"
VideoLLaMA2/scripts/eval/{eval_video_oqa_vcgpt_msvd.sh β†’ eval_video_oqa_msvd.sh} RENAMED
@@ -2,7 +2,7 @@ set -x
2
 
3
  EVAL_DATA_DIR=eval
4
  OUTPUT_DIR=eval_output
5
- CKPT=DAMO-NLP-SG/VideoLLaMA2-7B
6
  CKPT_NAME=$(echo $CKPT | rev | cut -d'/' -f1 | rev)
7
 
8
  gpu_list="${CUDA_VISIBLE_DEVICES:-0}"
 
2
 
3
  EVAL_DATA_DIR=eval
4
  OUTPUT_DIR=eval_output
5
+ CKPT=DAMO-NLP-SG/VideoLLaMA2.1-7B-16F
6
  CKPT_NAME=$(echo $CKPT | rev | cut -d'/' -f1 | rev)
7
 
8
  gpu_list="${CUDA_VISIBLE_DEVICES:-0}"
VideoLLaMA2/scripts/eval/eval_video_oqa_vcgpt_1_correctness.sh CHANGED
@@ -1,8 +1,8 @@
1
  set -x
2
 
3
- EVAL_DATA_DIR=dataset/videollm_eval
4
- OUTPUT_DIR=eval
5
- CKPT=DAMO-NLP-SG/VideoLLaMA2-7B
6
  CKPT_NAME=$(echo $CKPT | rev | cut -d'/' -f1 | rev)
7
 
8
  gpu_list="${CUDA_VISIBLE_DEVICES:-0}"
@@ -18,7 +18,7 @@ if [ ! -f "$output_file" ]; then
18
  for IDX in $(seq 0 $((CHUNKS-1))); do
19
  # select the GPUs for the task
20
  gpu_devices=$(IFS=,; echo "${GPULIST[*]:$(($IDX*$GPUS_PER_TASK)):$GPUS_PER_TASK}")
21
- TRANSFORMERS_OFFLINE=1 CUDA_VISIBLE_DEVICES=${gpu_devices} python3 videollama2/new_eval/inference_video_oqa_vcgpt_general.py \
22
  --model-path ${CKPT} \
23
  --video-folder ${EVAL_DATA_DIR}/videochatgpt_gen/Test_Videos \
24
  --question-file ${EVAL_DATA_DIR}/videochatgpt_gen/generic_qa.json \
@@ -48,7 +48,7 @@ AZURE_API_KEY=your_key
48
  AZURE_API_ENDPOINT=your_endpoint
49
  AZURE_API_DEPLOYNAME=your_deployname
50
 
51
- python3 videollama2/new_eval/eval_video_oqa_vcgpt_1_correctness.py \
52
  --pred-path ${output_file} \
53
  --output-dir ${OUTPUT_DIR}/videochatgpt_gen/answers/correctness/${CKPT_NAME}/gpt \
54
  --output-json ${OUTPUT_DIR}/videochatgpt_gen/answers/correctness/${CKPT_NAME}/results.json \
 
1
  set -x
2
 
3
+ EVAL_DATA_DIR=eval
4
+ OUTPUT_DIR=eval_output
5
+ CKPT=DAMO-NLP-SG/VideoLLaMA2.1-7B-16F
6
  CKPT_NAME=$(echo $CKPT | rev | cut -d'/' -f1 | rev)
7
 
8
  gpu_list="${CUDA_VISIBLE_DEVICES:-0}"
 
18
  for IDX in $(seq 0 $((CHUNKS-1))); do
19
  # select the GPUs for the task
20
  gpu_devices=$(IFS=,; echo "${GPULIST[*]:$(($IDX*$GPUS_PER_TASK)):$GPUS_PER_TASK}")
21
+ TRANSFORMERS_OFFLINE=1 CUDA_VISIBLE_DEVICES=${gpu_devices} python3 videollama2/eval/inference_video_oqa_vcgpt_general.py \
22
  --model-path ${CKPT} \
23
  --video-folder ${EVAL_DATA_DIR}/videochatgpt_gen/Test_Videos \
24
  --question-file ${EVAL_DATA_DIR}/videochatgpt_gen/generic_qa.json \
 
48
  AZURE_API_ENDPOINT=your_endpoint
49
  AZURE_API_DEPLOYNAME=your_deployname
50
 
51
+ python3 videollama2/eval/eval_video_oqa_vcgpt_1_correctness.py \
52
  --pred-path ${output_file} \
53
  --output-dir ${OUTPUT_DIR}/videochatgpt_gen/answers/correctness/${CKPT_NAME}/gpt \
54
  --output-json ${OUTPUT_DIR}/videochatgpt_gen/answers/correctness/${CKPT_NAME}/results.json \
VideoLLaMA2/scripts/eval/eval_video_oqa_vcgpt_2_detail.sh CHANGED
@@ -1,8 +1,8 @@
1
  set -x
2
 
3
- EVAL_DATA_DIR=dataset/videollm_eval
4
- OUTPUT_DIR=eval
5
- CKPT=DAMO-NLP-SG/VideoLLaMA2-7B
6
  CKPT_NAME=$(echo $CKPT | rev | cut -d'/' -f1 | rev)
7
 
8
  gpu_list="${CUDA_VISIBLE_DEVICES:-0}"
@@ -48,11 +48,11 @@ AZURE_API_KEY=your_key
48
  AZURE_API_ENDPOINT=your_endpoint
49
  AZURE_API_DEPLOYNAME=your_deployname
50
 
51
- python3 videollama2/eval/eval_benchmark_2_detailed_orientation.py \
52
  --pred-path ${output_file} \
53
  --output-dir ${OUTPUT_DIR}/videochatgpt_gen/answers/detail/${CKPT_NAME}/gpt \
54
  --output-json ${OUTPUT_DIR}/videochatgpt_gen/answers/detail/${CKPT_NAME}/results.json \
55
- --api-key "35632dae7dd94d0a93338db373c63893" \
56
- --api-endpoint https://damo-openai-gpt4v-test.openai.azure.com \
57
- --api-deployname gpt-35-turbo \
58
  --num-tasks 4
 
1
  set -x
2
 
3
+ EVAL_DATA_DIR=eval
4
+ OUTPUT_DIR=eval_output
5
+ CKPT=DAMO-NLP-SG/VideoLLaMA2.1-7B-16F
6
  CKPT_NAME=$(echo $CKPT | rev | cut -d'/' -f1 | rev)
7
 
8
  gpu_list="${CUDA_VISIBLE_DEVICES:-0}"
 
48
  AZURE_API_ENDPOINT=your_endpoint
49
  AZURE_API_DEPLOYNAME=your_deployname
50
 
51
+ python3 videollama2/eval/eval_video_oqa_vcgpt_2_detailed_orientation.py \
52
  --pred-path ${output_file} \
53
  --output-dir ${OUTPUT_DIR}/videochatgpt_gen/answers/detail/${CKPT_NAME}/gpt \
54
  --output-json ${OUTPUT_DIR}/videochatgpt_gen/answers/detail/${CKPT_NAME}/results.json \
55
+ --api-key $AZURE_API_KEY \
56
+ --api-endpoint $AZURE_API_ENDPOINT \
57
+ --api-deployname $AZURE_API_DEPLOYNAME \
58
  --num-tasks 4
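
As with the correctness script above, the detailed-orientation judge now passes AZURE_API_KEY / AZURE_API_ENDPOINT / AZURE_API_DEPLOYNAME placeholders instead of hard-coded credentials. A sketch of the values to substitute inside each eval_video_oqa_vcgpt_*.sh script; the endpoint and deployment name below are illustrative examples, not project defaults:

```bash
# Example placeholder values for the GPT-assisted judge scripts (replace with
# the key, endpoint and deployment name of your own Azure OpenAI resource).
AZURE_API_KEY="<your Azure OpenAI API key>"
AZURE_API_ENDPOINT="https://<your-resource-name>.openai.azure.com"
AZURE_API_DEPLOYNAME="<your chat model deployment name>"
```
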
VideoLLaMA2/scripts/eval/eval_video_oqa_vcgpt_3_context.sh CHANGED
@@ -1,8 +1,8 @@
1
  set -x
2
 
3
- EVAL_DATA_DIR=dataset/videollm_eval
4
- OUTPUT_DIR=eval
5
- CKPT=DAMO-NLP-SG/VideoLLaMA2-7B
6
  CKPT_NAME=$(echo $CKPT | rev | cut -d'/' -f1 | rev)
7
 
8
  gpu_list="${CUDA_VISIBLE_DEVICES:-0}"
@@ -48,7 +48,7 @@ AZURE_API_KEY=your_key
48
  AZURE_API_ENDPOINT=your_endpoint
49
  AZURE_API_DEPLOYNAME=your_deployname
50
 
51
- python3 videollama2/eval/eval_benchmark_3_context.py \
52
  --pred-path ${output_file} \
53
  --output-dir ${OUTPUT_DIR}/videochatgpt_gen/answers/context/${CKPT_NAME}/gpt \
54
  --output-json ${OUTPUT_DIR}/videochatgpt_gen/answers/context/${CKPT_NAME}/results.json \
 
1
  set -x
2
 
3
+ EVAL_DATA_DIR=eval
4
+ OUTPUT_DIR=eval_output
5
+ CKPT=DAMO-NLP-SG/VideoLLaMA2.1-7B-16F
6
  CKPT_NAME=$(echo $CKPT | rev | cut -d'/' -f1 | rev)
7
 
8
  gpu_list="${CUDA_VISIBLE_DEVICES:-0}"
 
48
  AZURE_API_ENDPOINT=your_endpoint
49
  AZURE_API_DEPLOYNAME=your_deployname
50
 
51
+ python3 videollama2/eval/eval_video_oqa_vcgpt_3_context.py \
52
  --pred-path ${output_file} \
53
  --output-dir ${OUTPUT_DIR}/videochatgpt_gen/answers/context/${CKPT_NAME}/gpt \
54
  --output-json ${OUTPUT_DIR}/videochatgpt_gen/answers/context/${CKPT_NAME}/results.json \
VideoLLaMA2/scripts/eval/eval_video_oqa_vcgpt_4_temporal.sh CHANGED
@@ -2,7 +2,7 @@ set -x
2
 
3
  EVAL_DATA_DIR=eval
4
  OUTPUT_DIR=eval_output
5
- CKPT=DAMO-NLP-SG/VideoLLaMA2-7B
6
  CKPT_NAME=$(echo $CKPT | rev | cut -d'/' -f1 | rev)
7
 
8
  gpu_list="${CUDA_VISIBLE_DEVICES:-0}"
@@ -40,9 +40,9 @@ if [ ! -f "$output_file" ]; then
40
  fi
41
 
42
 
43
- AZURE_API_KEY=a7f9bc087b7143a69d59a68f01a2b450
44
- AZURE_API_ENDPOINT=https://vl-australiaeast.openai.azure.com
45
- AZURE_API_DEPLOYNAME=gpt35-turbo-0613
46
 
47
  python3 videollama2/eval/eval_video_oqa_vcgpt_4_temporal.py \
48
  --pred-path ${output_file} \
 
2
 
3
  EVAL_DATA_DIR=eval
4
  OUTPUT_DIR=eval_output
5
+ CKPT=DAMO-NLP-SG/VideoLLaMA2.1-7B-16F
6
  CKPT_NAME=$(echo $CKPT | rev | cut -d'/' -f1 | rev)
7
 
8
  gpu_list="${CUDA_VISIBLE_DEVICES:-0}"
 
40
  fi
41
 
42
 
43
+ AZURE_API_KEY=your_key
44
+ AZURE_API_ENDPOINT=your_endpoint
45
+ AZURE_API_DEPLOYNAME=your_deployname
46
 
47
  python3 videollama2/eval/eval_video_oqa_vcgpt_4_temporal.py \
48
  --pred-path ${output_file} \
VideoLLaMA2/scripts/eval/eval_video_oqa_vcgpt_5_consistency.sh CHANGED
@@ -2,7 +2,7 @@ set -x
2
 
3
  EVAL_DATA_DIR=eval
4
  OUTPUT_DIR=eval_output
5
- CKPT=DAMO-NLP-SG/VideoLLaMA2-7B
6
  CKPT_NAME=$(echo $CKPT | rev | cut -d'/' -f1 | rev)
7
 
8
  gpu_list="${CUDA_VISIBLE_DEVICES:-0}"
 
2
 
3
  EVAL_DATA_DIR=eval
4
  OUTPUT_DIR=eval_output
5
+ CKPT=DAMO-NLP-SG/VideoLLaMA2.1-7B-16F
6
  CKPT_NAME=$(echo $CKPT | rev | cut -d'/' -f1 | rev)
7
 
8
  gpu_list="${CUDA_VISIBLE_DEVICES:-0}"
VideoLLaMA2/scripts/siglip/finetune_gemma2.sh DELETED
@@ -1,75 +0,0 @@
1
- #!/bin/bash
2
-
3
- # Environment Variables
4
- ARG_WORLD_SIZE=${1:-1}
5
- ARG_NPROC_PER_NODE=${2:-8}
6
- ARG_MASTER_ADDR="127.0.0.1"
7
- ARG_MASTER_PORT=16667
8
- ARG_RANK=0
9
-
10
- # Multiple conditions
11
- if [ ! -n "$WORLD_SIZE" ] || [ ! -n "$NPROC_PER_NODE" ]; then
12
- WORLD_SIZE=$ARG_WORLD_SIZE
13
- NPROC_PER_NODE=$ARG_NPROC_PER_NODE
14
- fi
15
- if [ ! -n "$MASTER_ADDR" ] || [ ! -n "$MASTER_PORT" ] || [ ! -n "$RANK" ]; then
16
- MASTER_ADDR=$ARG_MASTER_ADDR
17
- MASTER_PORT=$ARG_MASTER_PORT
18
- RANK=$ARG_RANK
19
- fi
20
-
21
- echo "WORLD_SIZE: $WORLD_SIZE"
22
- echo "NPROC_PER_NODE: $NPROC_PER_NODE"
23
-
24
- # Training Arguments
25
- GLOBAL_BATCH_SIZE=128
26
- LOCAL_BATCH_SIZE=4
27
- GRADIENT_ACCUMULATION_STEPS=$[$GLOBAL_BATCH_SIZE/($WORLD_SIZE*$NPROC_PER_NODE*$LOCAL_BATCH_SIZE)]
28
- echo $GRADIENT_ACCUMULATION_STEPS
29
-
30
- # Log Arguments
31
- export TRANSFORMERS_OFFLINE=1
32
- export WANDB_PROJECT=videollama2gemma2_siglip
33
- RUN_NAME=vllava_settings
34
- DATA_DIR=datasets
35
- OUTP_DIR=work_dirs
36
-
37
- torchrun --nnodes $WORLD_SIZE \
38
- --nproc_per_node $NPROC_PER_NODE \
39
- --master_addr=$MASTER_ADDR \
40
- --master_port=$MASTER_PORT \
41
- --node_rank $RANK \
42
- videollama2/train_flash_attn.py \
43
- --deepspeed scripts/zero3.json \
44
- --model_type videollama2_gemma2 \
45
- --model_path google/gemma-2-2b-it \
46
- --vision_tower google/siglip-so400m-patch14-384 \
47
- --mm_projector_type stc_connector_v35 \
48
- --pretrain_mm_mlp_adapter ${OUTP_DIR}/${WANDB_PROJECT}/pretrain_${RUN_NAME}/mm_projector.bin \
49
- --data_path ${DATA_DIR}/videollava_sft/videochatgpt_llavaimage_tune.json \
50
- --data_folder ${DATA_DIR}/videollava_sft/ \
51
- --mm_vision_select_layer -2 \
52
- --image_aspect_ratio pad \
53
- --num_frames 8 \
54
- --bf16 True \
55
- --tf32 True \
56
- --fp16 False \
57
- --output_dir ${OUTP_DIR}/${WANDB_PROJECT}/finetune_${RUN_NAME} \
58
- --num_train_epochs 3 \
59
- --per_device_train_batch_size $LOCAL_BATCH_SIZE \
60
- --per_device_eval_batch_size 4 \
61
- --gradient_accumulation_steps $GRADIENT_ACCUMULATION_STEPS \
62
- --evaluation_strategy "no" \
63
- --save_strategy "steps" \
64
- --save_steps 200 \
65
- --save_total_limit 99 \
66
- --learning_rate 2e-5 \
67
- --weight_decay 0. \
68
- --warmup_ratio 0.03 \
69
- --lr_scheduler_type "cosine" \
70
- --logging_steps 1 \
71
- --model_max_length 2048 \
72
- --gradient_checkpointing True \
73
- --dataloader_num_workers 4 \
74
- --report_to tensorboard \
75
- --run_name finetune_$RUN_NAME \
VideoLLaMA2/scripts/siglip/finetune_mistral.sh DELETED
@@ -1,75 +0,0 @@
1
- #!/bin/bash
2
-
3
- # Environment Variables
4
- ARG_WORLD_SIZE=${1:-1}
5
- ARG_NPROC_PER_NODE=${2:-8}
6
- ARG_MASTER_ADDR="127.0.0.1"
7
- ARG_MASTER_PORT=16667
8
- ARG_RANK=0
9
-
10
- # Multiple conditions
11
- if [ ! -n "$WORLD_SIZE" ] || [ ! -n "$NPROC_PER_NODE" ]; then
12
- WORLD_SIZE=$ARG_WORLD_SIZE
13
- NPROC_PER_NODE=$ARG_NPROC_PER_NODE
14
- fi
15
- if [ ! -n "$MASTER_ADDR" ] || [ ! -n "$MASTER_PORT" ] || [ ! -n "$RANK" ]; then
16
- MASTER_ADDR=$ARG_MASTER_ADDR
17
- MASTER_PORT=$ARG_MASTER_PORT
18
- RANK=$ARG_RANK
19
- fi
20
-
21
- echo "WORLD_SIZE: $WORLD_SIZE"
22
- echo "NPROC_PER_NODE: $NPROC_PER_NODE"
23
-
24
- # Training Arguments
25
- GLOBAL_BATCH_SIZE=128
26
- LOCAL_BATCH_SIZE=4
27
- GRADIENT_ACCUMULATION_STEPS=$[$GLOBAL_BATCH_SIZE/($WORLD_SIZE*$NPROC_PER_NODE*$LOCAL_BATCH_SIZE)]
28
- echo $GRADIENT_ACCUMULATION_STEPS
29
-
30
- # Log Arguments
31
- export TRANSFORMERS_OFFLINE=1
32
- export WANDB_PROJECT=videollama2mistral_siglip
33
- RUN_NAME=vllava_settings
34
- DATA_DIR=datasets
35
- OUTP_DIR=work_dirs
36
-
37
- torchrun --nnodes $WORLD_SIZE \
38
- --nproc_per_node $NPROC_PER_NODE \
39
- --master_addr=$MASTER_ADDR \
40
- --master_port=$MASTER_PORT \
41
- --node_rank $RANK \
42
- videollama2/train_flash_attn.py \
43
- --deepspeed scripts/zero3.json \
44
- --model_type videollama2 \
45
- --model_path mistralai/Mistral-7B-Instruct-v0.2 \
46
- --vision_tower google/siglip-so400m-patch14-384 \
47
- --mm_projector_type stc_connector_v35 \
48
- --pretrain_mm_mlp_adapter ${OUTP_DIR}/${WANDB_PROJECT}/pretrain_${RUN_NAME}/mm_projector.bin \
49
- --data_path ${DATA_DIR}/videollava_sft/videochatgpt_llavaimage_tune.json \
50
- --data_folder ${DATA_DIR}/videollava_sft/ \
51
- --mm_vision_select_layer -2 \
52
- --image_aspect_ratio pad \
53
- --num_frames 8 \
54
- --bf16 True \
55
- --tf32 True \
56
- --fp16 False \
57
- --output_dir ${OUTP_DIR}/${WANDB_PROJECT}/finetune_${RUN_NAME} \
58
- --num_train_epochs 3 \
59
- --per_device_train_batch_size $LOCAL_BATCH_SIZE \
60
- --per_device_eval_batch_size 4 \
61
- --gradient_accumulation_steps $GRADIENT_ACCUMULATION_STEPS \
62
- --evaluation_strategy "no" \
63
- --save_strategy "steps" \
64
- --save_steps 200 \
65
- --save_total_limit 99 \
66
- --learning_rate 2e-5 \
67
- --weight_decay 0. \
68
- --warmup_ratio 0.03 \
69
- --lr_scheduler_type "cosine" \
70
- --logging_steps 1 \
71
- --model_max_length 2048 \
72
- --gradient_checkpointing True \
73
- --dataloader_num_workers 4 \
74
- --report_to wandb \
75
- --run_name finetune_$RUN_NAME \
VideoLLaMA2/scripts/siglip/finetune_phi3.sh DELETED
@@ -1,75 +0,0 @@
1
- #!/bin/bash
2
-
3
- # Environment Variables
4
- ARG_WORLD_SIZE=${1:-1}
5
- ARG_NPROC_PER_NODE=${2:-8}
6
- ARG_MASTER_ADDR="127.0.0.1"
7
- ARG_MASTER_PORT=16667
8
- ARG_RANK=0
9
-
10
- # Multiple conditions
11
- if [ ! -n "$WORLD_SIZE" ] || [ ! -n "$NPROC_PER_NODE" ]; then
12
- WORLD_SIZE=$ARG_WORLD_SIZE
13
- NPROC_PER_NODE=$ARG_NPROC_PER_NODE
14
- fi
15
- if [ ! -n "$MASTER_ADDR" ] || [ ! -n "$MASTER_PORT" ] || [ ! -n "$RANK" ]; then
16
- MASTER_ADDR=$ARG_MASTER_ADDR
17
- MASTER_PORT=$ARG_MASTER_PORT
18
- RANK=$ARG_RANK
19
- fi
20
-
21
- echo "WORLD_SIZE: $WORLD_SIZE"
22
- echo "NPROC_PER_NODE: $NPROC_PER_NODE"
23
-
24
- # Training Arguments
25
- GLOBAL_BATCH_SIZE=128
26
- LOCAL_BATCH_SIZE=4
27
- GRADIENT_ACCUMULATION_STEPS=$[$GLOBAL_BATCH_SIZE/($WORLD_SIZE*$NPROC_PER_NODE*$LOCAL_BATCH_SIZE)]
28
- echo $GRADIENT_ACCUMULATION_STEPS
29
-
30
- # Log Arguments
31
- export TRANSFORMERS_OFFLINE=1
32
- export WANDB_PROJECT=videollama2phi3_siglip
33
- RUN_NAME=vllava_settings
34
- DATA_DIR=datasets
35
- OUTP_DIR=work_dirs
36
-
37
- torchrun --nnodes $WORLD_SIZE \
38
- --nproc_per_node $NPROC_PER_NODE \
39
- --master_addr=$MASTER_ADDR \
40
- --master_port=$MASTER_PORT \
41
- --node_rank $RANK \
42
- videollama2/train_flash_attn.py \
43
- --deepspeed scripts/zero3.json \
44
- --model_type videollama2_phi3 \
45
- --model_path microsoft/Phi-3-mini-4k-instruct \
46
- --vision_tower google/siglip-so400m-patch14-384 \
47
- --mm_projector_type stc_connector_v35 \
48
- --pretrain_mm_mlp_adapter ${OUTP_DIR}/${WANDB_PROJECT}/pretrain_${RUN_NAME}/mm_projector.bin \
49
- --data_path ${DATA_DIR}/videollava_sft/videochatgpt_llavaimage_tune.json \
50
- --data_folder ${DATA_DIR}/videollava_sft/ \
51
- --mm_vision_select_layer -2 \
52
- --image_aspect_ratio pad \
53
- --num_frames 8 \
54
- --bf16 True \
55
- --tf32 True \
56
- --fp16 False \
57
- --output_dir ${OUTP_DIR}/${WANDB_PROJECT}/finetune_${RUN_NAME} \
58
- --num_train_epochs 3 \
59
- --per_device_train_batch_size $LOCAL_BATCH_SIZE \
60
- --per_device_eval_batch_size 4 \
61
- --gradient_accumulation_steps $GRADIENT_ACCUMULATION_STEPS \
62
- --evaluation_strategy "no" \
63
- --save_strategy "steps" \
64
- --save_steps 200 \
65
- --save_total_limit 99 \
66
- --learning_rate 2e-5 \
67
- --weight_decay 0. \
68
- --warmup_ratio 0.03 \
69
- --lr_scheduler_type "cosine" \
70
- --logging_steps 1 \
71
- --model_max_length 2048 \
72
- --gradient_checkpointing True \
73
- --dataloader_num_workers 4 \
74
- --report_to tensorboard \
75
- --run_name finetune_$RUN_NAME \
VideoLLaMA2/scripts/siglip/finetune_qwen2.sh DELETED
@@ -1,75 +0,0 @@
1
- #!/bin/bash
2
-
3
- # Environment Variables
4
- ARG_WORLD_SIZE=${1:-1}
5
- ARG_NPROC_PER_NODE=${2:-8}
6
- ARG_MASTER_ADDR="127.0.0.1"
7
- ARG_MASTER_PORT=16666
8
- ARG_RANK=0
9
-
10
- # Multiple conditions
11
- if [ ! -n "$WORLD_SIZE" ] || [ ! -n "$NPROC_PER_NODE" ]; then
12
- WORLD_SIZE=$ARG_WORLD_SIZE
13
- NPROC_PER_NODE=$ARG_NPROC_PER_NODE
14
- fi
15
- if [ ! -n "$MASTER_ADDR" ] || [ ! -n "$MASTER_PORT" ] || [ ! -n "$RANK" ]; then
16
- MASTER_ADDR=$ARG_MASTER_ADDR
17
- MASTER_PORT=$ARG_MASTER_PORT
18
- RANK=$ARG_RANK
19
- fi
20
-
21
- echo "WORLD_SIZE: $WORLD_SIZE"
22
- echo "NPROC_PER_NODE: $NPROC_PER_NODE"
23
-
24
- # Training Arguments
25
- GLOBAL_BATCH_SIZE=128
26
- LOCAL_BATCH_SIZE=4
27
- GRADIENT_ACCUMULATION_STEPS=$[$GLOBAL_BATCH_SIZE/($WORLD_SIZE*$NPROC_PER_NODE*$LOCAL_BATCH_SIZE)]
28
- echo $GRADIENT_ACCUMULATION_STEPS
29
-
30
- # Log Arguments
31
- export TRANSFORMERS_OFFLINE=1
32
- export WANDB_PROJECT=videollama2qwen2_siglip
33
- RUN_NAME=vllava_settings
34
- DATA_DIR=datasets
35
- OUTP_DIR=work_dirs
36
-
37
- torchrun --nnodes $WORLD_SIZE \
38
- --nproc_per_node $NPROC_PER_NODE \
39
- --master_addr=$MASTER_ADDR \
40
- --master_port=$MASTER_PORT \
41
- --node_rank $RANK \
42
- videollama2/train_flash_attn.py \
43
- --deepspeed scripts/zero3.json \
44
- --model_type videollama2_qwen2 \
45
- --model_path Qwen/Qwen2-7B-Instruct \
46
- --vision_tower google/siglip-so400m-patch14-384 \
47
- --mm_projector_type stc_connector_v35 \
48
- --pretrain_mm_mlp_adapter ${OUTP_DIR}/${WANDB_PROJECT}/pretrain_${RUN_NAME}/mm_projector.bin \
49
- --data_path ${DATA_DIR}/videollava_sft/videochatgpt_llavaimage_tune.json \
50
- --data_folder ${DATA_DIR}/videollava_sft/ \
51
- --mm_vision_select_layer -2 \
52
- --image_aspect_ratio pad \
53
- --num_frames 8 \
54
- --bf16 True \
55
- --tf32 True \
56
- --fp16 False \
57
- --output_dir ${OUTP_DIR}/${WANDB_PROJECT}/finetune_${RUN_NAME} \
58
- --num_train_epochs 1 \
59
- --per_device_train_batch_size $LOCAL_BATCH_SIZE \
60
- --per_device_eval_batch_size 4 \
61
- --gradient_accumulation_steps $GRADIENT_ACCUMULATION_STEPS \
62
- --evaluation_strategy "no" \
63
- --save_strategy "steps" \
64
- --save_steps 500 \
65
- --save_total_limit 99 \
66
- --learning_rate 2e-5 \
67
- --weight_decay 0. \
68
- --warmup_ratio 0.03 \
69
- --lr_scheduler_type "cosine" \
70
- --logging_steps 1 \
71
- --model_max_length 2048 \
72
- --gradient_checkpointing True \
73
- --dataloader_num_workers 4 \
74
- --report_to tensorboard \
75
- --run_name $RUN_NAME \
VideoLLaMA2/scripts/siglip/pretrain_gemma2.sh DELETED
@@ -1,75 +0,0 @@
1
- #!/bin/bash
2
-
3
- # Environment Variables
4
- ARG_WORLD_SIZE=${1:-1}
5
- ARG_NPROC_PER_NODE=${2:-8}
6
- ARG_MASTER_ADDR="127.0.0.1"
7
- ARG_MASTER_PORT=16666
8
- ARG_RANK=0
9
-
10
- # Multiple conditions
11
- if [ ! -n "$WORLD_SIZE" ] || [ ! -n "$NPROC_PER_NODE" ]; then
12
- WORLD_SIZE=$ARG_WORLD_SIZE
13
- NPROC_PER_NODE=$ARG_NPROC_PER_NODE
14
- fi
15
- if [ ! -n "$MASTER_ADDR" ] || [ ! -n "$MASTER_PORT" ] || [ ! -n "$RANK" ]; then
16
- MASTER_ADDR=$ARG_MASTER_ADDR
17
- MASTER_PORT=$ARG_MASTER_PORT
18
- RANK=$ARG_RANK
19
- fi
20
-
21
- echo "WORLD_SIZE: $WORLD_SIZE"
22
- echo "NPROC_PER_NODE: $NPROC_PER_NODE"
23
-
24
- # Training Arguments
25
- GLOBAL_BATCH_SIZE=256
26
- LOCAL_BATCH_SIZE=4
27
- GRADIENT_ACCUMULATION_STEPS=$[$GLOBAL_BATCH_SIZE/($WORLD_SIZE*$NPROC_PER_NODE*$LOCAL_BATCH_SIZE)]
28
- echo $GRADIENT_ACCUMULATION_STEPS
29
-
30
- # Log Arguments
31
- export TRANSFORMERS_OFFLINE=1
32
- export WANDB_PROJECT=videollama2gemma2_siglip
33
- RUN_NAME=vllava_settings
34
- DATA_DIR=datasets
35
- OUTP_DIR=work_dirs
36
-
37
- torchrun --nnodes $WORLD_SIZE \
38
- --nproc_per_node $NPROC_PER_NODE \
39
- --master_addr=$MASTER_ADDR \
40
- --master_port=$MASTER_PORT \
41
- --node_rank $RANK \
42
- videollama2/train_flash_attn.py \
43
- --deepspeed scripts/zero3.json \
44
- --model_type videollama2_gemma2 \
45
- --model_path google/gemma-2-2b-it \
46
- --vision_tower google/siglip-so400m-patch14-384 \
47
- --mm_projector_type stc_connector_v35 \
48
- --tune_mm_mlp_adapter True \
49
- --data_path ${DATA_DIR}/videollava_pt/valley_llavaimage.json \
50
- --data_folder ${DATA_DIR}/videollava_pt/ \
51
- --mm_vision_select_layer -2 \
52
- --num_frames 8 \
53
- --bf16 True \
54
- --tf32 True \
55
- --fp16 False \
56
- --output_dir ${OUTP_DIR}/${WANDB_PROJECT}/pretrain_${RUN_NAME} \
57
- --num_train_epochs 1 \
58
- --per_device_train_batch_size $LOCAL_BATCH_SIZE \
59
- --per_device_eval_batch_size 4 \
60
- --gradient_accumulation_steps $GRADIENT_ACCUMULATION_STEPS \
61
- --evaluation_strategy "no" \
62
- --save_strategy "steps" \
63
- --save_steps 500 \
64
- --save_total_limit 99 \
65
- --learning_rate 1e-3 \
66
- --weight_decay 0. \
67
- --warmup_ratio 0.03 \
68
- --lr_scheduler_type "cosine" \
69
- --logging_steps 1 \
70
- --model_max_length 2048 \
71
- --gradient_checkpointing True \
72
- --dataloader_num_workers 4 \
73
- --lazy_preprocess True \
74
- --report_to tensorboard \
75
- --run_name pretrain_$RUN_NAME \
VideoLLaMA2/scripts/siglip/pretrain_mistral.sh DELETED
@@ -1,75 +0,0 @@
1
- #!/bin/bash
2
-
3
- # Environment Variables
4
- ARG_WORLD_SIZE=${1:-1}
5
- ARG_NPROC_PER_NODE=${2:-8}
6
- ARG_MASTER_ADDR="127.0.0.1"
7
- ARG_MASTER_PORT=16666
8
- ARG_RANK=0
9
-
10
- # Multiple conditions
11
- if [ ! -n "$WORLD_SIZE" ] || [ ! -n "$NPROC_PER_NODE" ]; then
12
- WORLD_SIZE=$ARG_WORLD_SIZE
13
- NPROC_PER_NODE=$ARG_NPROC_PER_NODE
14
- fi
15
- if [ ! -n "$MASTER_ADDR" ] || [ ! -n "$MASTER_PORT" ] || [ ! -n "$RANK" ]; then
16
- MASTER_ADDR=$ARG_MASTER_ADDR
17
- MASTER_PORT=$ARG_MASTER_PORT
18
- RANK=$ARG_RANK
19
- fi
20
-
21
- echo "WORLD_SIZE: $WORLD_SIZE"
22
- echo "NPROC_PER_NODE: $NPROC_PER_NODE"
23
-
24
- # Training Arguments
25
- GLOBAL_BATCH_SIZE=256
26
- LOCAL_BATCH_SIZE=8
27
- GRADIENT_ACCUMULATION_STEPS=$[$GLOBAL_BATCH_SIZE/($WORLD_SIZE*$NPROC_PER_NODE*$LOCAL_BATCH_SIZE)]
28
- echo $GRADIENT_ACCUMULATION_STEPS
29
-
30
- # Log Arguments
31
- export TRANSFORMERS_OFFLINE=1
32
- export WANDB_PROJECT=videollama2mistral_siglip
33
- RUN_NAME=vllava_settings
34
- DATA_DIR=datasets
35
- OUTP_DIR=work_dirs
36
-
37
- torchrun --nnodes $WORLD_SIZE \
38
- --nproc_per_node $NPROC_PER_NODE \
39
- --master_addr=$MASTER_ADDR \
40
- --master_port=$MASTER_PORT \
41
- --node_rank $RANK \
42
- videollama2/train_flash_attn.py \
43
- --deepspeed scripts/zero3.json \
44
- --model_type videollama2 \
45
- --model_path mistralai/Mistral-7B-Instruct-v0.2 \
46
- --vision_tower google/siglip-so400m-patch14-384 \
47
- --mm_projector_type stc_connector_v35 \
48
- --tune_mm_mlp_adapter True \
49
- --data_path ${DATA_DIR}/videollava_pt/valley_llavaimage.json \
50
- --data_folder ${DATA_DIR}/videollava_pt/ \
51
- --mm_vision_select_layer -2 \
52
- --num_frames 8 \
53
- --bf16 True \
54
- --tf32 True \
55
- --fp16 False \
56
- --output_dir ${OUTP_DIR}/${WANDB_PROJECT}/pretrain_${RUN_NAME} \
57
- --num_train_epochs 1 \
58
- --per_device_train_batch_size $LOCAL_BATCH_SIZE \
59
- --per_device_eval_batch_size 4 \
60
- --gradient_accumulation_steps $GRADIENT_ACCUMULATION_STEPS \
61
- --evaluation_strategy "no" \
62
- --save_strategy "steps" \
63
- --save_steps 500 \
64
- --save_total_limit 99 \
65
- --learning_rate 1e-3 \
66
- --weight_decay 0. \
67
- --warmup_ratio 0.03 \
68
- --lr_scheduler_type "cosine" \
69
- --logging_steps 1 \
70
- --model_max_length 2048 \
71
- --gradient_checkpointing True \
72
- --dataloader_num_workers 16 \
73
- --lazy_preprocess True \
74
- --report_to tensorboard \
75
- --run_name pretrain_$RUN_NAME \
VideoLLaMA2/scripts/siglip/pretrain_phi3.sh DELETED
@@ -1,75 +0,0 @@
1
- #!/bin/bash
2
-
3
- # Environment Variables
4
- ARG_WORLD_SIZE=${1:-1}
5
- ARG_NPROC_PER_NODE=${2:-8}
6
- ARG_MASTER_ADDR="127.0.0.1"
7
- ARG_MASTER_PORT=16666
8
- ARG_RANK=0
9
-
10
- # Multiple conditions
11
- if [ ! -n "$WORLD_SIZE" ] || [ ! -n "$NPROC_PER_NODE" ]; then
12
- WORLD_SIZE=$ARG_WORLD_SIZE
13
- NPROC_PER_NODE=$ARG_NPROC_PER_NODE
14
- fi
15
- if [ ! -n "$MASTER_ADDR" ] || [ ! -n "$MASTER_PORT" ] || [ ! -n "$RANK" ]; then
16
- MASTER_ADDR=$ARG_MASTER_ADDR
17
- MASTER_PORT=$ARG_MASTER_PORT
18
- RANK=$ARG_RANK
19
- fi
20
-
21
- echo "WORLD_SIZE: $WORLD_SIZE"
22
- echo "NPROC_PER_NODE: $NPROC_PER_NODE"
23
-
24
- # Training Arguments
25
- GLOBAL_BATCH_SIZE=256
26
- LOCAL_BATCH_SIZE=8
27
- GRADIENT_ACCUMULATION_STEPS=$[$GLOBAL_BATCH_SIZE/($WORLD_SIZE*$NPROC_PER_NODE*$LOCAL_BATCH_SIZE)]
28
- echo $GRADIENT_ACCUMULATION_STEPS
29
-
30
- # Log Arguments
31
- export TRANSFORMERS_OFFLINE=1
32
- export WANDB_PROJECT=videollama2phi3_siglip
33
- RUN_NAME=vllava_settings
34
- DATA_DIR=datasets
35
- OUTP_DIR=work_dirs
36
-
37
- torchrun --nnodes $WORLD_SIZE \
38
- --nproc_per_node $NPROC_PER_NODE \
39
- --master_addr=$MASTER_ADDR \
40
- --master_port=$MASTER_PORT \
41
- --node_rank $RANK \
42
- videollama2/train_flash_attn.py \
43
- --deepspeed scripts/zero3.json \
44
- --model_type videollama2_phi3 \
45
- --model_path microsoft/Phi-3-mini-4k-instruct \
46
- --vision_tower google/siglip-so400m-patch14-384 \
47
- --mm_projector_type stc_connector_v35 \
48
- --tune_mm_mlp_adapter True \
49
- --data_path ${DATA_DIR}/videollava_pt/valley_llavaimage.json \
50
- --data_folder ${DATA_DIR}/videollava_pt/ \
51
- --mm_vision_select_layer -2 \
52
- --num_frames 8 \
53
- --bf16 True \
54
- --tf32 True \
55
- --fp16 False \
56
- --output_dir ${OUTP_DIR}/${WANDB_PROJECT}/pretrain_${RUN_NAME} \
57
- --num_train_epochs 1 \
58
- --per_device_train_batch_size $LOCAL_BATCH_SIZE \
59
- --per_device_eval_batch_size 4 \
60
- --gradient_accumulation_steps $GRADIENT_ACCUMULATION_STEPS \
61
- --evaluation_strategy "no" \
62
- --save_strategy "steps" \
63
- --save_steps 500 \
64
- --save_total_limit 99 \
65
- --learning_rate 1e-3 \
66
- --weight_decay 0. \
67
- --warmup_ratio 0.03 \
68
- --lr_scheduler_type "cosine" \
69
- --logging_steps 1 \
70
- --model_max_length 2048 \
71
- --gradient_checkpointing True \
72
- --dataloader_num_workers 4 \
73
- --lazy_preprocess True \
74
- --report_to tensorboard \
75
- --run_name pretrain_$RUN_NAME \
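The environment-variable block at the top of each script prefers values already exported by the cluster launcher (`WORLD_SIZE`, `MASTER_ADDR`, `RANK`, ...) and only falls back to the script arguments / hard-coded defaults when a whole group is missing; `[ ! -n "$VAR" ]` is equivalent to `[ -z "$VAR" ]`, i.e. "the variable is unset or empty". A rough Python sketch of the same precedence, purely illustrative and not part of the codebase:

```python
import os

# Script-argument defaults (the ARG_* values in the bash scripts).
ARG_WORLD_SIZE, ARG_NPROC_PER_NODE = 1, 8
ARG_MASTER_ADDR, ARG_MASTER_PORT, ARG_RANK = "127.0.0.1", 16666, 0

def blank(name):
    # mirrors bash `[ ! -n "$VAR" ]`: true when the variable is unset or empty
    return os.environ.get(name, "") == ""

# Each group is checked as a whole: if any member is missing,
# the entire group falls back to the ARG_* defaults.
if blank("WORLD_SIZE") or blank("NPROC_PER_NODE"):
    WORLD_SIZE, NPROC_PER_NODE = ARG_WORLD_SIZE, ARG_NPROC_PER_NODE
else:
    WORLD_SIZE, NPROC_PER_NODE = int(os.environ["WORLD_SIZE"]), int(os.environ["NPROC_PER_NODE"])

if blank("MASTER_ADDR") or blank("MASTER_PORT") or blank("RANK"):
    MASTER_ADDR, MASTER_PORT, RANK = ARG_MASTER_ADDR, ARG_MASTER_PORT, ARG_RANK
else:
    MASTER_ADDR, MASTER_PORT, RANK = os.environ["MASTER_ADDR"], int(os.environ["MASTER_PORT"]), int(os.environ["RANK"])

print(WORLD_SIZE, NPROC_PER_NODE, MASTER_ADDR, MASTER_PORT, RANK)
```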
VideoLLaMA2/scripts/siglip/pretrain_qwen2.sh DELETED
@@ -1,75 +0,0 @@
1
- #!/bin/bash
2
-
3
- # Environment Variables
4
- ARG_WORLD_SIZE=${1:-1}
5
- ARG_NPROC_PER_NODE=${2:-8}
6
- ARG_MASTER_ADDR="127.0.0.1"
7
- ARG_MASTER_PORT=16666
8
- ARG_RANK=0
9
-
10
- # Multiple conditions
11
- if [ ! -n "$WORLD_SIZE" ] || [ ! -n "$NPROC_PER_NODE" ]; then
12
- WORLD_SIZE=$ARG_WORLD_SIZE
13
- NPROC_PER_NODE=$ARG_NPROC_PER_NODE
14
- fi
15
- if [ ! -n "$MASTER_ADDR" ] || [ ! -n "$MASTER_PORT" ] || [ ! -n "$RANK" ]; then
16
- MASTER_ADDR=$ARG_MASTER_ADDR
17
- MASTER_PORT=$ARG_MASTER_PORT
18
- RANK=$ARG_RANK
19
- fi
20
-
21
- echo "WORLD_SIZE: $WORLD_SIZE"
22
- echo "NPROC_PER_NODE: $NPROC_PER_NODE"
23
-
24
- # Training Arguments
25
- GLOBAL_BATCH_SIZE=256
26
- LOCAL_BATCH_SIZE=8
27
- GRADIENT_ACCUMULATION_STEPS=$[$GLOBAL_BATCH_SIZE/($WORLD_SIZE*$NPROC_PER_NODE*$LOCAL_BATCH_SIZE)]
28
- echo $GRADIENT_ACCUMULATION_STEPS
29
-
30
- # Log Arguments
31
- export TRANSFORMERS_OFFLINE=1
32
- export WANDB_PROJECT=videollama2qwen2_siglip
33
- RUN_NAME=vllava_settings
34
- DATA_DIR=datasets
35
- OUTP_DIR=work_dirs
36
-
37
- torchrun --nnodes $WORLD_SIZE \
38
- --nproc_per_node $NPROC_PER_NODE \
39
- --master_addr=$MASTER_ADDR \
40
- --master_port=$MASTER_PORT \
41
- --node_rank $RANK \
42
- videollama2/train_flash_attn.py \
43
- --deepspeed scripts/zero3.json \
44
- --model_type videollama2_qwen2 \
45
- --model_path Qwen/Qwen2-7B-Instruct \
46
- --vision_tower google/siglip-so400m-patch14-384 \
47
- --mm_projector_type stc_connector_v35 \
48
- --tune_mm_mlp_adapter True \
49
- --data_path ${DATA_DIR}/videollava_pt/valley_llavaimage.json \
50
- --data_folder ${DATA_DIR}/videollava_pt/ \
51
- --mm_vision_select_layer -2 \
52
- --num_frames 8 \
53
- --bf16 True \
54
- --tf32 True \
55
- --fp16 False \
56
- --output_dir ${OUTP_DIR}/${WANDB_PROJECT}/pretrain_${RUN_NAME} \
57
- --num_train_epochs 1 \
58
- --per_device_train_batch_size $LOCAL_BATCH_SIZE \
59
- --per_device_eval_batch_size 4 \
60
- --gradient_accumulation_steps $GRADIENT_ACCUMULATION_STEPS \
61
- --evaluation_strategy "no" \
62
- --save_strategy "steps" \
63
- --save_steps 500 \
64
- --save_total_limit 99 \
65
- --learning_rate 1e-3 \
66
- --weight_decay 0. \
67
- --warmup_ratio 0.03 \
68
- --lr_scheduler_type "cosine" \
69
- --logging_steps 1 \
70
- --model_max_length 2048 \
71
- --gradient_checkpointing True \
72
- --dataloader_num_workers 4 \
73
- --lazy_preprocess True \
74
- --report_to tensorboard \
75
- --run_name $RUN_NAME \
VideoLLaMA2/scripts/vllava/finetune.sh CHANGED
@@ -5,7 +5,7 @@ ARG_WORLD_SIZE=${1:-1}
5
  ARG_NPROC_PER_NODE=${2:-8}
6
  ARG_MASTER_ADDR="127.0.0.1"
7
  ARG_MASTER_PORT=16666
8
- ARG_RANK=0
9
 
10
  # Multiple conditions
11
  if [ ! -n "$WORLD_SIZE" ] || [ ! -n "$NPROC_PER_NODE" ]; then
@@ -28,8 +28,8 @@ GRADIENT_ACCUMULATION_STEPS=$[$GLOBAL_BATCH_SIZE/($WORLD_SIZE*$NPROC_PER_NODE*$L
28
 
29
  # Log Arguments
30
  export TRANSFORMERS_OFFLINE=1
31
- export WANDB_PROJECT=videollama2
32
- RUN_NAME=vllava_settings
33
  DATA_DIR=datasets
34
  OUTP_DIR=work_dirs
35
 
@@ -38,18 +38,18 @@ torchrun --nnodes $WORLD_SIZE \
38
  --master_addr=$MASTER_ADDR \
39
  --master_port=$MASTER_PORT \
40
  --node_rank $RANK \
41
- videollama2/train_flash_attn.py \
42
  --deepspeed scripts/zero3.json \
43
- --model_type videollama2 \
44
- --model_path mistralai/Mistral-7B-Instruct-v0.2 \
45
- --vision_tower openai/clip-vit-large-patch14-336 \
46
- --mm_projector_type stc_connector \
47
  --pretrain_mm_mlp_adapter ${OUTP_DIR}/${WANDB_PROJECT}/pretrain_${RUN_NAME}/mm_projector.bin \
48
  --data_path ${DATA_DIR}/videollava_sft/videochatgpt_llavaimage_tune.json \
49
  --data_folder ${DATA_DIR}/videollava_sft/ \
50
  --mm_vision_select_layer -2 \
51
  --image_aspect_ratio pad \
52
- --num_frames 8 \
53
  --bf16 True \
54
  --tf32 True \
55
  --fp16 False \
@@ -58,7 +58,6 @@ torchrun --nnodes $WORLD_SIZE \
58
  --per_device_train_batch_size $LOCAL_BATCH_SIZE \
59
  --per_device_eval_batch_size 4 \
60
  --gradient_accumulation_steps $GRADIENT_ACCUMULATION_STEPS \
61
- --evaluation_strategy "no" \
62
  --save_strategy "steps" \
63
  --save_steps 500 \
64
  --save_total_limit 99 \
 
5
  ARG_NPROC_PER_NODE=${2:-8}
6
  ARG_MASTER_ADDR="127.0.0.1"
7
  ARG_MASTER_PORT=16666
8
+ ARG_RANK=${3:-0}
9
 
10
  # Multiple conditions
11
  if [ ! -n "$WORLD_SIZE" ] || [ ! -n "$NPROC_PER_NODE" ]; then
 
28
 
29
  # Log Arguments
30
  export TRANSFORMERS_OFFLINE=1
31
+ export WANDB_PROJECT=videollama2qwen2_vllava
32
+ RUN_NAME=siglip_tcv35_7b_16f
33
  DATA_DIR=datasets
34
  OUTP_DIR=work_dirs
35
 
 
38
  --master_addr=$MASTER_ADDR \
39
  --master_port=$MASTER_PORT \
40
  --node_rank $RANK \
41
+ videollama2/train.py \
42
  --deepspeed scripts/zero3.json \
43
+ --model_type videollama2_qwen2 \
44
+ --model_path Qwen/Qwen2-7B-Instruct \
45
+ --vision_tower google/siglip-so400m-patch14-384 \
46
+ --mm_projector_type stc_connector_v35 \
47
  --pretrain_mm_mlp_adapter ${OUTP_DIR}/${WANDB_PROJECT}/pretrain_${RUN_NAME}/mm_projector.bin \
48
  --data_path ${DATA_DIR}/videollava_sft/videochatgpt_llavaimage_tune.json \
49
  --data_folder ${DATA_DIR}/videollava_sft/ \
50
  --mm_vision_select_layer -2 \
51
  --image_aspect_ratio pad \
52
+ --num_frames 16 \
53
  --bf16 True \
54
  --tf32 True \
55
  --fp16 False \
 
58
  --per_device_train_batch_size $LOCAL_BATCH_SIZE \
59
  --per_device_eval_batch_size 4 \
60
  --gradient_accumulation_steps $GRADIENT_ACCUMULATION_STEPS \
 
61
  --save_strategy "steps" \
62
  --save_steps 500 \
63
  --save_total_limit 99 \
VideoLLaMA2/scripts/vllava/finetune_qwen2.sh DELETED
@@ -1,74 +0,0 @@
1
- #!/bin/bash
2
-
3
- # Environment Variables
4
- ARG_WORLD_SIZE=${1:-1}
5
- ARG_NPROC_PER_NODE=${2:-8}
6
- ARG_MASTER_ADDR="127.0.0.1"
7
- ARG_MASTER_PORT=16666
8
- ARG_RANK=0
9
-
10
- # Multiple conditions
11
- if [ ! -n "$WORLD_SIZE" ] || [ ! -n "$NPROC_PER_NODE" ]; then
12
- WORLD_SIZE=$ARG_WORLD_SIZE
13
- NPROC_PER_NODE=$ARG_NPROC_PER_NODE
14
- fi
15
- if [ ! -n "$MASTER_ADDR" ] || [ ! -n "$MASTER_PORT" ] || [ ! -n "$RANK" ]; then
16
- MASTER_ADDR=$ARG_MASTER_ADDR
17
- MASTER_PORT=$ARG_MASTER_PORT
18
- RANK=$ARG_RANK
19
- fi
20
-
21
- echo "WORLD_SIZE: $WORLD_SIZE"
22
- echo "NPROC_PER_NODE: $NPROC_PER_NODE"
23
-
24
- # Training Arguments
25
- GLOBAL_BATCH_SIZE=128
26
- LOCAL_BATCH_SIZE=4
27
- GRADIENT_ACCUMULATION_STEPS=$[$GLOBAL_BATCH_SIZE/($WORLD_SIZE*$NPROC_PER_NODE*$LOCAL_BATCH_SIZE)]
28
-
29
- # Log Arguments
30
- export TRANSFORMERS_OFFLINE=1
31
- export WANDB_PROJECT=videollama2qwen2
32
- RUN_NAME=vllava_settings
33
- DATA_DIR=datasets
34
- OUTP_DIR=work_dirs
35
-
36
- torchrun --nnodes $WORLD_SIZE \
37
- --nproc_per_node $NPROC_PER_NODE \
38
- --master_addr=$MASTER_ADDR \
39
- --master_port=$MASTER_PORT \
40
- --node_rank $RANK \
41
- videollama2/train_flash_attn.py \
42
- --deepspeed scripts/zero3.json \
43
- --model_type videollama2_qwen2 \
44
- --model_path Qwen/Qwen2-7B-Instruct \
45
- --vision_tower openai/clip-vit-large-patch14-336 \
46
- --mm_projector_type stc_connector \
47
- --pretrain_mm_mlp_adapter ${OUTP_DIR}/${WANDB_PROJECT}/pretrain_${RUN_NAME}/mm_projector.bin \
48
- --data_path ${DATA_DIR}/videollava_sft/videochatgpt_llavaimage_tune.json \
49
- --data_folder ${DATA_DIR}/videollava_sft/ \
50
- --mm_vision_select_layer -2 \
51
- --image_aspect_ratio pad \
52
- --num_frames 8 \
53
- --bf16 True \
54
- --tf32 True \
55
- --fp16 False \
56
- --output_dir ${OUTP_DIR}/${WANDB_PROJECT}/finetune_${RUN_NAME} \
57
- --num_train_epochs 1 \
58
- --per_device_train_batch_size $LOCAL_BATCH_SIZE \
59
- --per_device_eval_batch_size 4 \
60
- --gradient_accumulation_steps $GRADIENT_ACCUMULATION_STEPS \
61
- --evaluation_strategy "no" \
62
- --save_strategy "steps" \
63
- --save_steps 500 \
64
- --save_total_limit 99 \
65
- --learning_rate 2e-5 \
66
- --weight_decay 0. \
67
- --warmup_ratio 0.03 \
68
- --lr_scheduler_type "cosine" \
69
- --logging_steps 1 \
70
- --model_max_length 2048 \
71
- --gradient_checkpointing True \
72
- --dataloader_num_workers 4 \
73
- --report_to tensorboard \
74
- --run_name $RUN_NAME \
VideoLLaMA2/scripts/vllava/pretrain.sh CHANGED
@@ -5,7 +5,7 @@ ARG_WORLD_SIZE=${1:-1}
5
  ARG_NPROC_PER_NODE=${2:-8}
6
  ARG_MASTER_ADDR="127.0.0.1"
7
  ARG_MASTER_PORT=16666
8
- ARG_RANK=0
9
 
10
  # Multiple conditions
11
  if [ ! -n "$WORLD_SIZE" ] || [ ! -n "$NPROC_PER_NODE" ]; then
@@ -28,8 +28,8 @@ GRADIENT_ACCUMULATION_STEPS=$[$GLOBAL_BATCH_SIZE/($WORLD_SIZE*$NPROC_PER_NODE*$L
28
 
29
  # Log Arguments
30
  export TRANSFORMERS_OFFLINE=1
31
- export WANDB_PROJECT=videollama2
32
- RUN_NAME=vllava_settings
33
  DATA_DIR=datasets
34
  OUTP_DIR=work_dirs
35
 
@@ -38,17 +38,17 @@ torchrun --nnodes $WORLD_SIZE \
38
  --master_addr=$MASTER_ADDR \
39
  --master_port=$MASTER_PORT \
40
  --node_rank $RANK \
41
- videollama2/train_flash_attn.py \
42
  --deepspeed scripts/zero3.json \
43
- --model_type videollama2 \
44
- --model_path mistralai/Mistral-7B-Instruct-v0.2 \
45
- --vision_tower openai/clip-vit-large-patch14-336 \
46
- --mm_projector_type stc_connector \
47
  --tune_mm_mlp_adapter True \
48
  --data_path ${DATA_DIR}/videollava_pt/valley_llavaimage.json \
49
  --data_folder ${DATA_DIR}/videollava_pt/ \
50
  --mm_vision_select_layer -2 \
51
- --num_frames 8 \
52
  --bf16 True \
53
  --tf32 True \
54
  --fp16 False \
@@ -69,6 +69,5 @@ torchrun --nnodes $WORLD_SIZE \
69
  --model_max_length 2048 \
70
  --gradient_checkpointing True \
71
  --dataloader_num_workers 4 \
72
- --lazy_preprocess True \
73
  --report_to tensorboard \
74
  --run_name $RUN_NAME \
 
5
  ARG_NPROC_PER_NODE=${2:-8}
6
  ARG_MASTER_ADDR="127.0.0.1"
7
  ARG_MASTER_PORT=16666
8
+ ARG_RANK=${3:-0}
9
 
10
  # Multiple conditions
11
  if [ ! -n "$WORLD_SIZE" ] || [ ! -n "$NPROC_PER_NODE" ]; then
 
28
 
29
  # Log Arguments
30
  export TRANSFORMERS_OFFLINE=1
31
+ export WANDB_PROJECT=videollama2qwen2_vllava
32
+ RUN_NAME=siglip_tcv35_7b_16f
33
  DATA_DIR=datasets
34
  OUTP_DIR=work_dirs
35
 
 
38
  --master_addr=$MASTER_ADDR \
39
  --master_port=$MASTER_PORT \
40
  --node_rank $RANK \
41
+ videollama2/train.py \
42
  --deepspeed scripts/zero3.json \
43
+ --model_type videollama2_qwen2 \
44
+ --model_path Qwen/Qwen2-7B-Instruct \
45
+ --vision_tower google/siglip-so400m-patch14-384 \
46
+ --mm_projector_type stc_connector_v35 \
47
  --tune_mm_mlp_adapter True \
48
  --data_path ${DATA_DIR}/videollava_pt/valley_llavaimage.json \
49
  --data_folder ${DATA_DIR}/videollava_pt/ \
50
  --mm_vision_select_layer -2 \
51
+ --num_frames 16 \
52
  --bf16 True \
53
  --tf32 True \
54
  --fp16 False \
 
69
  --model_max_length 2048 \
70
  --gradient_checkpointing True \
71
  --dataloader_num_workers 4 \
 
72
  --report_to tensorboard \
73
  --run_name $RUN_NAME \
VideoLLaMA2/scripts/vllava/pretrain_qwen2.sh DELETED
@@ -1,74 +0,0 @@
1
- #!/bin/bash
2
-
3
- # Environment Variables
4
- ARG_WORLD_SIZE=${1:-1}
5
- ARG_NPROC_PER_NODE=${2:-8}
6
- ARG_MASTER_ADDR="127.0.0.1"
7
- ARG_MASTER_PORT=16666
8
- ARG_RANK=0
9
-
10
- # Multiple conditions
11
- if [ ! -n "$WORLD_SIZE" ] || [ ! -n "$NPROC_PER_NODE" ]; then
12
- WORLD_SIZE=$ARG_WORLD_SIZE
13
- NPROC_PER_NODE=$ARG_NPROC_PER_NODE
14
- fi
15
- if [ ! -n "$MASTER_ADDR" ] || [ ! -n "$MASTER_PORT" ] || [ ! -n "$RANK" ]; then
16
- MASTER_ADDR=$ARG_MASTER_ADDR
17
- MASTER_PORT=$ARG_MASTER_PORT
18
- RANK=$ARG_RANK
19
- fi
20
-
21
- echo "WORLD_SIZE: $WORLD_SIZE"
22
- echo "NPROC_PER_NODE: $NPROC_PER_NODE"
23
-
24
- # Training Arguments
25
- GLOBAL_BATCH_SIZE=256
26
- LOCAL_BATCH_SIZE=8
27
- GRADIENT_ACCUMULATION_STEPS=$[$GLOBAL_BATCH_SIZE/($WORLD_SIZE*$NPROC_PER_NODE*$LOCAL_BATCH_SIZE)]
28
-
29
- # Log Arguments
30
- export TRANSFORMERS_OFFLINE=1
31
- export WANDB_PROJECT=videollama2qwen2
32
- RUN_NAME=vllava_settings
33
- DATA_DIR=datasets
34
- OUTP_DIR=work_dirs
35
-
36
- torchrun --nnodes $WORLD_SIZE \
37
- --nproc_per_node $NPROC_PER_NODE \
38
- --master_addr=$MASTER_ADDR \
39
- --master_port=$MASTER_PORT \
40
- --node_rank $RANK \
41
- videollama2/train_flash_attn.py \
42
- --deepspeed scripts/zero3.json \
43
- --model_type videollama2_qwen2 \
44
- --model_path Qwen/Qwen2-7B-Instruct \
45
- --vision_tower openai/clip-vit-large-patch14-336 \
46
- --mm_projector_type stc_connector \
47
- --tune_mm_mlp_adapter True \
48
- --data_path ${DATA_DIR}/videollava_pt/valley_llavaimage.json \
49
- --data_folder ${DATA_DIR}/videollava_pt/ \
50
- --mm_vision_select_layer -2 \
51
- --num_frames 8 \
52
- --bf16 True \
53
- --tf32 True \
54
- --fp16 False \
55
- --output_dir ${OUTP_DIR}/${WANDB_PROJECT}/pretrain_${RUN_NAME} \
56
- --num_train_epochs 1 \
57
- --per_device_train_batch_size $LOCAL_BATCH_SIZE \
58
- --per_device_eval_batch_size 4 \
59
- --gradient_accumulation_steps $GRADIENT_ACCUMULATION_STEPS \
60
- --evaluation_strategy "no" \
61
- --save_strategy "steps" \
62
- --save_steps 500 \
63
- --save_total_limit 99 \
64
- --learning_rate 1e-3 \
65
- --weight_decay 0. \
66
- --warmup_ratio 0.03 \
67
- --lr_scheduler_type "cosine" \
68
- --logging_steps 1 \
69
- --model_max_length 2048 \
70
- --gradient_checkpointing True \
71
- --dataloader_num_workers 4 \
72
- --lazy_preprocess True \
73
- --report_to tensorboard \
74
- --run_name $RUN_NAME \
VideoLLaMA2/videollama2/__init__.py CHANGED
@@ -58,7 +58,7 @@ def mm_infer(image_or_video, instruct, model, tokenizer, modal='video', **kwargs
58
  tensor = None
59
  else:
60
  tensor = image_or_video.half().cuda()
61
- tensor = [(tensor, modal_token)]
62
 
63
  # 2. text preprocess (tag process & generate prompt).
64
  if isinstance(instruct, str):
@@ -93,7 +93,7 @@ def mm_infer(image_or_video, instruct, model, tokenizer, modal='video', **kwargs
93
  do_sample = kwargs.get('do_sample', False)
94
  temperature = kwargs.get('temperature', 0.2 if do_sample else 0.0)
95
  top_p = kwargs.get('top_p', 0.9)
96
- max_new_tokens = kwargs.get('max_new_tokens', 1024)
97
 
98
  with torch.inference_mode():
99
  output_ids = model.generate(
 
58
  tensor = None
59
  else:
60
  tensor = image_or_video.half().cuda()
61
+ tensor = [(tensor, modal)]
62
 
63
  # 2. text preprocess (tag process & generate prompt).
64
  if isinstance(instruct, str):
 
93
  do_sample = kwargs.get('do_sample', False)
94
  temperature = kwargs.get('temperature', 0.2 if do_sample else 0.0)
95
  top_p = kwargs.get('top_p', 0.9)
96
+ max_new_tokens = kwargs.get('max_new_tokens', 2048)
97
 
98
  with torch.inference_mode():
99
  output_ids = model.generate(
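With the change above, callers of `mm_infer` pass only the raw modal name (`'image'` / `'video'`); the function now pairs the preprocessed tensor with that modal itself, and the default generation budget grows to 2048 new tokens. A minimal usage sketch in the style of the eval scripts; the checkpoint id and video path are placeholders, and `model_init` is assumed to return `(model, processor, tokenizer)` as it is used elsewhere in this repo:

```python
from videollama2 import model_init, mm_infer

# placeholder checkpoint and clip; substitute your own
model, processor, tokenizer = model_init("DAMO-NLP-SG/VideoLLaMA2-7B")
video_tensor = processor["video"]("assets/sample_demo_1.mp4")

answer = mm_infer(
    video_tensor,
    "Describe what happens in this video.",
    model=model,
    tokenizer=tokenizer,
    modal="video",      # mm_infer internally builds [(tensor, "video")]
    do_sample=False,    # greedy decoding; max_new_tokens now defaults to 2048
)
print(answer)
```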
VideoLLaMA2/videollama2/eval/inference_video_cap_msvc.py CHANGED
@@ -5,6 +5,8 @@ import json
5
  import warnings
6
  from tqdm import tqdm
7
 
 
 
8
  import sys
9
  sys.path.append('./')
10
  from videollama2 import model_init, mm_infer
@@ -25,6 +27,44 @@ def get_chunk(lst, n, k):
25
  return chunks[k]
26
 
27
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
28
  def run_inference(args):
29
  disable_torch_init()
30
 
@@ -37,16 +77,16 @@ def run_inference(args):
37
  os.makedirs(os.path.dirname(args.output_file), exist_ok=True)
38
  ans_file = open(answer_file, "w")
39
 
40
- video_formats = ['.mp4', '.avi', '.mov', '.mkv']
 
 
41
 
42
  # Iterate over each sample in the ground truth file
43
- for idx, sample in enumerate(tqdm(gt_questions)):
44
- video_name = sample['video_path']
45
- question = sample['question']
46
- answer = sample['captions']
47
-
48
- video_path = os.path.join(args.video_folder, video_name)
49
- video_tensor = processor['video'](video_path)
50
 
51
  output = mm_infer(
52
  video_tensor,
@@ -73,6 +113,8 @@ if __name__ == "__main__":
73
  parser.add_argument("--num-chunks", type=int, default=1)
74
  parser.add_argument("--chunk-idx", type=int, default=0)
75
  parser.add_argument("--device", type=str, required=False, default='cuda:0')
 
 
76
  args = parser.parse_args()
77
 
78
  run_inference(args)
 
5
  import warnings
6
  from tqdm import tqdm
7
 
8
+ from torch.utils.data import Dataset, DataLoader
9
+
10
  import sys
11
  sys.path.append('./')
12
  from videollama2 import model_init, mm_infer
 
27
  return chunks[k]
28
 
29
 
30
+ class MSVCDataset(Dataset):
31
+
32
+ video_formats = ['.mp4', '.webm', '.avi', '.mov', '.mkv']
33
+
34
+ def __init__(self, folder, questions, processor):
35
+ self.folder = folder
36
+ self.questions = questions
37
+ self.processor = processor
38
+
39
+ def __len__(self):
40
+ return len(self.questions)
41
+
42
+ def __getitem__(self, idx):
43
+ sample = self.questions[idx]
44
+
45
+ video_name = sample['video_path']
46
+ question = sample['question']
47
+ answer = sample['captions']
48
+
49
+ video_path = os.path.join(self.folder, video_name)
50
+ video_tensor = self.processor(video_path)
51
+
52
+ return {
53
+ 'video': video_tensor,
54
+ 'video_name': video_name,
55
+ 'question': question,
56
+ 'answer': answer,
57
+ }
58
+
59
+
60
+ def collate_fn(batch):
61
+ vid = [x['video'] for x in batch]
62
+ v_id = [x['video_name'] for x in batch]
63
+ qus = [x['question'] for x in batch]
64
+ ans = [x['answer'] for x in batch]
65
+ return vid, v_id, qus, ans
66
+
67
+
68
  def run_inference(args):
69
  disable_torch_init()
70
 
 
77
  os.makedirs(os.path.dirname(args.output_file), exist_ok=True)
78
  ans_file = open(answer_file, "w")
79
 
80
+ assert args.batch_size == 1, "Batch size must be 1 for inference"
81
+ dataset = MSVCDataset(args.video_folder, gt_questions, processor['video'])
82
+ dataloader = DataLoader(dataset, shuffle=False, batch_size=args.batch_size, num_workers=args.num_workers, collate_fn=collate_fn)
83
 
84
  # Iterate over each sample in the ground truth file
85
+ for idx, (video_tensors, video_names, questions, answers) in enumerate(tqdm(dataloader)):
86
+ video_tensor = video_tensors[0]
87
+ video_name = video_names[0]
88
+ question = questions[0]
89
+ answer = answers[0]
 
 
90
 
91
  output = mm_infer(
92
  video_tensor,
 
113
  parser.add_argument("--num-chunks", type=int, default=1)
114
  parser.add_argument("--chunk-idx", type=int, default=0)
115
  parser.add_argument("--device", type=str, required=False, default='cuda:0')
116
+ parser.add_argument("--batch-size", type=int, required=False, default=1)
117
+ parser.add_argument("--num-workers", type=int, required=False, default=8)
118
  args = parser.parse_args()
119
 
120
  run_inference(args)
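The rewritten MSVC script moves video loading into a `Dataset` so that `DataLoader` workers decode and preprocess clips in the background while the GPU runs inference; the identity-style `collate_fn` keeps samples as plain lists because clips of different lengths cannot be stacked into a single tensor, which is also why `batch_size` is asserted to be 1. A self-contained toy sketch of that pattern (synthetic tensors, not MSVC data):

```python
import torch
from torch.utils.data import Dataset, DataLoader

class ToyClips(Dataset):
    """Stand-in for MSVCDataset: items may have different frame counts."""
    def __len__(self):
        return 4
    def __getitem__(self, idx):
        return {"video": torch.randn(8 + idx, 3, 384, 384), "question": f"question {idx}"}

def collate_fn(batch):
    # keep per-sample tensors in lists instead of stacking mismatched shapes
    return [x["video"] for x in batch], [x["question"] for x in batch]

if __name__ == "__main__":
    loader = DataLoader(ToyClips(), batch_size=1, shuffle=False,
                        num_workers=2, collate_fn=collate_fn)
    for videos, questions in loader:
        video, question = videos[0], questions[0]   # batch_size == 1
        print(video.shape, question)
```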
VideoLLaMA2/videollama2/eval/inference_video_mcqa_egoschema.py CHANGED
@@ -62,7 +62,7 @@ class EgoschemaDataset(Dataset):
62
  axs = [a0, a1, a2, a3, a4]
63
  ops = ['(A)', '(B)', '(C)', '(D)', '(E)']
64
 
65
- instruct = f'Question: {question}\nOptions:\n(A) {a0}\n(B) {a1}\n(C) {a2}\n(D) {a3}\n(E) {a4}\nAnswer with the option\'s letter from the given choices directly and only give the best option.'
66
 
67
  return {
68
  'q_uid': q_uid,
@@ -90,7 +90,8 @@ def egoschema_dump(ans_file, line, outputs):
90
  output = output.replace('Answer', '')
91
  pred_answer = re.findall('[\(\ ]*[A-E][\)\ ]*', output)
92
  try:
93
- assert len(pred_answer) >= 1, 'The video \"{}\" output \"{}\" is not in the expected format'.format(line['q_uid'], instruct + '\n' + output)
 
94
  pred_answer = pred_answer[0].strip()
95
  pred_answer = pred_answer.strip('()')
96
  pred_idx = letters.index(pred_answer)
@@ -117,14 +118,18 @@ def run_inference(args):
117
  video_tensor = line['video'][0]
118
  instruct = line['instruct'][0]
119
 
120
- pred = mm_infer(
121
- video_tensor,
122
- instruct,
123
- model=model,
124
- tokenizer=tokenizer,
125
- modal='video',
126
- do_sample=False,
127
- )
 
 
 
 
128
 
129
  egoschema_dump(ans_file, line, [pred])
130
 
 
62
  axs = [a0, a1, a2, a3, a4]
63
  ops = ['(A)', '(B)', '(C)', '(D)', '(E)']
64
 
65
+ instruct = f'Select the best answer to the following multiple-choice question based on the video.\n{question}\nOptions:\n(A) {a0}\n(B) {a1}\n(C) {a2}\n(D) {a3}\n(E) {a4}\nAnswer with the option\'s letter from the given choices directly and only give the best option. The best answer is: '
66
 
67
  return {
68
  'q_uid': q_uid,
 
90
  output = output.replace('Answer', '')
91
  pred_answer = re.findall('[\(\ ]*[A-E][\)\ ]*', output)
92
  try:
93
+
94
+ assert len(pred_answer) >= 1, 'The video \"{}\" instruct: \n\"{}\"\n output: \n\"{}\"\n is not in the expected format'.format(line['q_uid'], instruct, output)
95
  pred_answer = pred_answer[0].strip()
96
  pred_answer = pred_answer.strip('()')
97
  pred_idx = letters.index(pred_answer)
 
118
  video_tensor = line['video'][0]
119
  instruct = line['instruct'][0]
120
 
121
+ try:
122
+ pred = mm_infer(
123
+ video_tensor,
124
+ instruct,
125
+ model=model,
126
+ tokenizer=tokenizer,
127
+ modal='video',
128
+ do_sample=False,
129
+ )
130
+ except:
131
+ traceback.print_exc()
132
+ pred = 'C'
133
 
134
  egoschema_dump(ans_file, line, [pred])
135
 
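The EgoSchema answer parser above pulls the option letter out of free-form generations with a small regex and now falls back to option C whenever inference or parsing fails. A quick illustration of the extraction step on a made-up model output:

```python
import re

letters = ['A', 'B', 'C', 'D', 'E']
output = "(B) The person is assembling a bookshelf."   # hypothetical model output

matches = re.findall(r'[\(\ ]*[A-E][\)\ ]*', output)
pred = matches[0].strip().strip('()') if matches else 'C'  # 'C' mirrors the script's fallback
print(pred, letters.index(pred))                           # -> B 1
```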
VideoLLaMA2/videollama2/eval/inference_video_mcqa_mvbench.py CHANGED
@@ -141,7 +141,7 @@ def mvbench_dump(vid, instruct, letters, options, output):
141
  pred_idx = letters.index(pred_answer)
142
  find_flag = True
143
 
144
- assert find_flag, 'The video \"{}\" output: \n\"{}\" is not in the expected format'.format(vid, instruct + '\n' + output)
145
  except:
146
  traceback.print_exc()
147
  pred_idx = 2
 
141
  pred_idx = letters.index(pred_answer)
142
  find_flag = True
143
 
144
+ assert find_flag, 'The video \"{}\" instruct: \n\"{}\"\n output: \n\"{}\"\n is not in the expected format'.format(vid, instruct, output)
145
  except:
146
  traceback.print_exc()
147
  pred_idx = 2
VideoLLaMA2/videollama2/eval/inference_video_mcqa_perception_test_mcqa.py CHANGED
@@ -129,7 +129,7 @@ def run_inference(args):
129
  output = output.replace('Answer', '')
130
  pred_answer = re.findall('\(*[A-C]\)*', output)
131
  try:
132
- assert len(pred_answer) >= 1, 'The video \"{}\" output \"{}\" is not in the expected format'.format(video_id, instruct + '\n' + output)
133
  pred_answer = pred_answer[0].strip()
134
  # if not pred_answer.startswith('('):
135
  pred_answer = pred_answer.strip('()')
 
129
  output = output.replace('Answer', '')
130
  pred_answer = re.findall('\(*[A-C]\)*', output)
131
  try:
132
+ assert len(pred_answer) >= 1, 'The video \"{}\" instruct: \n\"{}\"\n output: \n\"{}\"\n is not in the expected format'.format(video_id, instruct, output)
133
  pred_answer = pred_answer[0].strip()
134
  # if not pred_answer.startswith('('):
135
  pred_answer = pred_answer.strip('()')
VideoLLaMA2/videollama2/eval/inference_video_mcqa_videomme.py CHANGED
@@ -219,7 +219,7 @@ def videomme_dump(record, instruct, options, output):
219
  pred_idx = letters.index(pred_answer)
220
  find_flag = True
221
 
222
- assert find_flag, 'The video \"{}\" output: \n\"{}\" is not in the expected format'.format(record['youtube_id'], instruct + '\n' + output)
223
  except:
224
  traceback.print_exc()
225
  pred_idx = 2
 
219
  pred_idx = letters.index(pred_answer)
220
  find_flag = True
221
 
222
+ assert find_flag, 'The video \"{}\" instruct: \n\"{}\"\n output: \n\"{}\"\n is not in the expected format'.format(record['youtube_id'], instruct, output)
223
  except:
224
  traceback.print_exc()
225
  pred_idx = 2
VideoLLaMA2/videollama2/eval/inference_video_oqa_activitynet.py CHANGED
@@ -49,6 +49,7 @@ class ActivitynetDataset(Dataset):
49
  question_id = sample['question_id']
50
  answer = answer['answer']
51
 
 
52
  for fmt in self.video_formats: # Added this line
53
  temp_path = os.path.join(args.video_folder, f"v_{video_name}{fmt}")
54
  if os.path.exists(temp_path):
@@ -60,6 +61,9 @@ class ActivitynetDataset(Dataset):
60
  video_path = temp_path
61
  break
62
 
 
 
 
63
  video_tensor = self.processor(video_path)
64
 
65
  return {
@@ -109,14 +113,18 @@ def run_inference(args):
109
 
110
  # question = question + '\n' + 'Answer the question using a single word or a short phrase with multiple words.'
111
 
112
- output = mm_infer(
113
- video_tensor,
114
- question,
115
- model=model,
116
- tokenizer=tokenizer,
117
- modal='video',
118
- do_sample=False,
119
- )
 
 
 
 
120
 
121
  sample_set = {'id': question_id, 'question': question, 'answer': answer, 'pred': output}
122
  ans_file.write(json.dumps(sample_set) + "\n")
 
49
  question_id = sample['question_id']
50
  answer = answer['answer']
51
 
52
+ video_path = None
53
  for fmt in self.video_formats: # Added this line
54
  temp_path = os.path.join(args.video_folder, f"v_{video_name}{fmt}")
55
  if os.path.exists(temp_path):
 
61
  video_path = temp_path
62
  break
63
 
64
+ if video_path is None:
65
+ raise FileNotFoundError(f"Video file not found for {os.path.join(args.video_folder, video_name)}")
66
+
67
  video_tensor = self.processor(video_path)
68
 
69
  return {
 
113
 
114
  # question = question + '\n' + 'Answer the question using a single word or a short phrase with multiple words.'
115
 
116
+ try:
117
+ output = mm_infer(
118
+ video_tensor,
119
+ question,
120
+ model=model,
121
+ tokenizer=tokenizer,
122
+ modal='video',
123
+ do_sample=False,
124
+ )
125
+ except:
126
+ traceback.print_exc()
127
+ output = "error"
128
 
129
  sample_set = {'id': question_id, 'question': question, 'answer': answer, 'pred': output}
130
  ans_file.write(json.dumps(sample_set) + "\n")
VideoLLaMA2/videollama2/mm_utils.py CHANGED
@@ -5,6 +5,7 @@ import base64
5
  import traceback
6
  from io import BytesIO
7
 
 
8
  import torch
9
  import imageio
10
  import numpy as np
@@ -172,7 +173,7 @@ def process_video(video_path, processor, s=None, e=None, aspect_ratio='pad', num
172
  if os.path.isdir(video_path):
173
  video_data = [Image.open(os.path.join(video_path, frame_files[f_idx])) for f_idx in sampled_frame_indices]
174
  elif video_path.endswith('.gif'):
175
- video_data = [Image.fromarray(frame) for idx, frame in enumerate(gif_reader) if idx in sampled_frame_indices]
176
  else:
177
  video_data = [Image.fromarray(frame) for frame in vreader.get_batch(sampled_frame_indices).asnumpy()]
178
 
 
5
  import traceback
6
  from io import BytesIO
7
 
8
+ import cv2
9
  import torch
10
  import imageio
11
  import numpy as np
 
173
  if os.path.isdir(video_path):
174
  video_data = [Image.open(os.path.join(video_path, frame_files[f_idx])) for f_idx in sampled_frame_indices]
175
  elif video_path.endswith('.gif'):
176
+ video_data = [Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_RGBA2RGB)) for idx, frame in enumerate(gif_reader) if idx in sampled_frame_indices]
177
  else:
178
  video_data = [Image.fromarray(frame) for frame in vreader.get_batch(sampled_frame_indices).asnumpy()]
179
 
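The new `cv2` import and `cvtColor` call exist because `imageio` returns 4-channel RGBA arrays for many GIFs, which the CLIP/SigLIP image processors do not expect; dropping the alpha channel up front keeps the rest of the frame pipeline unchanged. A small stand-alone sketch of the conversion (the GIF path is a placeholder, and the frame is only converted when it actually carries an alpha channel):

```python
import cv2
import imageio
from PIL import Image

reader = imageio.get_reader("assets/sample.gif")     # placeholder path
frame = reader.get_data(0)                           # often H x W x 4 (RGBA) for GIFs

if frame.ndim == 3 and frame.shape[-1] == 4:
    frame = cv2.cvtColor(frame, cv2.COLOR_RGBA2RGB)  # drop alpha -> H x W x 3

image = Image.fromarray(frame)
print(image.size, image.mode)
```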
VideoLLaMA2/videollama2/model/__init__.py CHANGED
@@ -22,12 +22,10 @@ import torch
22
  from transformers import PretrainedConfig, AutoTokenizer, AutoModelForCausalLM, AutoConfig, BitsAndBytesConfig
23
 
24
  from .projector import load_mm_projector
25
- from .videollama2_llama import Videollama2LlamaForCausalLM, Videollama2Config
26
  from .videollama2_mistral import Videollama2MistralForCausalLM, Videollama2MistralConfig
27
  from .videollama2_mixtral import Videollama2MixtralForCausalLM, Videollama2MixtralConfig
28
  from .videollama2_qwen2 import Videollama2Qwen2ForCausalLM, Videollama2Qwen2Config
29
- from .videollama2_gemma2 import Videollama2Gemma2ForCausalLM, Videollama2Gemma2Config
30
- from .videollama2_phi3 import Videollama2Phi3ForCausalLM, Videollama2Phi3Config
31
 
32
 
33
  VLLMs = {
@@ -36,8 +34,14 @@ VLLMs = {
36
  "videollama2_mistral": Videollama2MistralForCausalLM,
37
  "videollama2_mixtral": Videollama2MixtralForCausalLM,
38
  "videollama2_qwen2": Videollama2Qwen2ForCausalLM,
39
- "videollama2_gemma2": Videollama2Gemma2ForCausalLM,
40
- "videollama2_phi3": Videollama2Phi3ForCausalLM,
 
 
 
 
 
 
41
  }
42
 
43
 
@@ -69,137 +73,112 @@ def load_pretrained_model(model_path, model_base, model_name, load_8bit=False, l
69
  if use_flash_attn:
70
  kwargs['attn_implementation'] = 'flash_attention_2'
71
 
72
- if "videollama" in model_name.lower() or 'vlb' in model_name.lower():
73
- # NOTE: lora/qlora model loading
74
- if 'lora' in model_name.lower() or 'qlora' in model_name.lower():
75
- if model_base is None:
76
- cfg_pretrained = PretrainedConfig.from_pretrained(model_path, token=token)
77
- # NOTE: AutoConfig will modify `_name_or_path` property to `model_path` if `model_path` is not None.
78
- # cfg_pretrained = AutoConfig.from_pretrained(model_path, token=token)
79
- model_base = model_base if model_base is not None else cfg_pretrained._name_or_path
80
-
81
- lora_cfg_pretrained = AutoConfig.from_pretrained(model_path)
82
- # NOTE: remove qlora training quantization config
83
- if hasattr(lora_cfg_pretrained, 'quantization_config'):
84
- del lora_cfg_pretrained.quantization_config
85
- tokenizer = AutoTokenizer.from_pretrained(model_base, use_fast=False, token=token)
86
- print('Loading VideoLLaMA from base model...')
87
-
88
- if 'vicuna' in model_base.lower():
89
- model = Videollama2LlamaForCausalLM.from_pretrained(model_base, low_cpu_mem_usage=True, config=lora_cfg_pretrained, **kwargs)
90
- elif 'mistral' in model_base.lower():
91
- model = Videollama2MistralForCausalLM.from_pretrained(model_base, low_cpu_mem_usage=True, config=lora_cfg_pretrained, **kwargs)
92
- else:
93
- model = Videollama2MistralForCausalLM.from_pretrained(model_base, low_cpu_mem_usage=True, config=lora_cfg_pretrained, **kwargs)
94
-
95
- token_num, tokem_dim = model.lm_head.out_features, model.lm_head.in_features
96
- if model.lm_head.weight.shape[0] != token_num:
97
- model.lm_head.weight = torch.nn.Parameter(torch.empty(token_num, tokem_dim, device=model.device, dtype=model.dtype))
98
- model.model.embed_tokens.weight = torch.nn.Parameter(torch.empty(token_num, tokem_dim, device=model.device, dtype=model.dtype))
99
-
100
- print('Loading additional VideoLLaMA weights...')
101
- if os.path.exists(os.path.join(model_path, 'non_lora_trainables.bin')):
102
- non_lora_trainables = torch.load(os.path.join(model_path, 'non_lora_trainables.bin'), map_location='cpu')
103
- else:
104
- # this is probably from HF Hub
105
- from huggingface_hub import hf_hub_download
106
- def load_from_hf(repo_id, filename, subfolder=None):
107
- cache_file = hf_hub_download(
108
- repo_id=repo_id,
109
- filename=filename,
110
- subfolder=subfolder)
111
- return torch.load(cache_file, map_location='cpu')
112
- non_lora_trainables = load_from_hf(model_path, 'non_lora_trainables.bin')
113
- non_lora_trainables = {(k[11:] if k.startswith('base_model.') else k): v for k, v in non_lora_trainables.items()}
114
- if any(k.startswith('model.model.') for k in non_lora_trainables):
115
- non_lora_trainables = {(k[6:] if k.startswith('model.') else k): v for k, v in non_lora_trainables.items()}
116
- model.load_state_dict(non_lora_trainables, strict=False)
117
-
118
- from peft import PeftModel
119
- print('Loading LoRA weights...')
120
- model = PeftModel.from_pretrained(model, model_path)
121
- print('Merging LoRA weights...')
122
- model = model.merge_and_unload()
123
- print('Model is loaded...')
124
- elif model_base is not None or '-base' in model_name.lower():
125
- # NOTE: Base/Pretrain model loading
126
- print('Loading VideoLLaMA 2 from base model...')
127
- cfg_pretrained = PretrainedConfig.from_pretrained(model_path, token=token)
128
- # NOTE: AutoConfig will modify `_name_or_path` property to `model_path` if `model_path` is not None.
129
- # cfg_pretrained = AutoConfig.from_pretrained(model_path, token=token)
130
- model_base = model_base if model_base is not None else cfg_pretrained._name_or_path
131
-
132
- tokenizer = AutoTokenizer.from_pretrained(model_base, use_fast=False, token=token)
133
-
134
- if 'vicuna' in model_base.lower():
135
- model = Videollama2LlamaForCausalLM.from_pretrained(model_base, low_cpu_mem_usage=True, config=cfg_pretrained, **kwargs)
136
- elif 'mistral' in model_base.lower():
137
- model = Videollama2MistralForCausalLM.from_pretrained(model_base, low_cpu_mem_usage=True, config=cfg_pretrained, **kwargs)
138
- elif 'mixtral' in model_base.lower():
139
- model = Videollama2MixtralForCausalLM.from_pretrained(model_base, low_cpu_mem_usage=True, config=cfg_pretrained, **kwargs)
140
- elif 'qwen2' in model_base.lower():
141
- model = Videollama2Qwen2ForCausalLM.from_pretrained(model_base, low_cpu_mem_usage=True, config=cfg_pretrained, **kwargs)
142
- elif 'gemma2' in model_base.lower():
143
- model = Videollama2Gemma2ForCausalLM.from_pretrained(model_base, low_cpu_mem_usage=True, config=cfg_pretrained, **kwargs)
144
- elif 'phi3' in model_base.lower():
145
- model = Videollama2Phi3ForCausalLM.from_pretrained(model_base, low_cpu_mem_usage=True, config=cfg_pretrained, **kwargs)
146
- else:
147
- model = Videollama2MistralForCausalLM.from_pretrained(model_base, low_cpu_mem_usage=True, config=cfg_pretrained, **kwargs)
148
-
149
- # NOTE; loading vision-language projector
150
- # * old codes for loading local mm_projector.bin
151
- # mm_projector_weights = torch.load(os.path.join(model_path, 'mm_projector.bin'), map_location='cpu')
152
- # mm_projector_weights = {k: v.to(torch.float16) for k, v in mm_projector_weights.items()}
153
- # model.load_state_dict(mm_projector_weights, strict=False)
154
- # * new codes which supports loading mm_projector.bin both offline and online
155
- mm_projector_weights = load_mm_projector(model_path, token=token)
156
- model.load_state_dict(mm_projector_weights, strict=False)
157
  else:
158
- # NOTE: SFT model loading
159
- cfg_pretrained = PretrainedConfig.from_pretrained(model_path, token=token)
160
- model_base = cfg_pretrained._name_or_path
161
-
162
- if 'vicuna' in model_base.lower():
163
- tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=False, token=token)
164
- model = Videollama2LlamaForCausalLM.from_pretrained(model_path, low_cpu_mem_usage=True, **kwargs)
165
- elif 'mistral' in model_base.lower():
166
- tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=False, token=token)
167
- model = Videollama2MistralForCausalLM.from_pretrained(model_path, low_cpu_mem_usage=True, **kwargs)
168
- elif 'mixtral' in model_base.lower():
169
- tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=False, token=token)
170
- model = Videollama2MixtralForCausalLM.from_pretrained(model_path, low_cpu_mem_usage=True, **kwargs)
171
- elif 'qwen2' in model_base.lower():
172
- tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=False, token=token)
173
- model = Videollama2Qwen2ForCausalLM.from_pretrained(model_path, low_cpu_mem_usage=True, **kwargs)
174
- elif 'gemma2' in model_base.lower():
175
- model = Videollama2Gemma2ForCausalLM.from_pretrained(model_base, low_cpu_mem_usage=True, config=cfg_pretrained, **kwargs)
176
- elif 'phi3' in model_base.lower():
177
- model = Videollama2Phi3ForCausalLM.from_pretrained(model_base, low_cpu_mem_usage=True, config=cfg_pretrained, **kwargs)
178
- else:
179
- # NOTE: mistral-based model is our default model.
180
- tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=False, token=token)
181
- model = Videollama2MistralForCausalLM.from_pretrained(model_path, low_cpu_mem_usage=True, **kwargs)
182
- else:
183
- # Load language model
184
- if model_base is not None:
185
- # PEFT model
186
- from peft import PeftModel
187
- tokenizer = AutoTokenizer.from_pretrained(model_base, use_fast=False)
188
- model = AutoModelForCausalLM.from_pretrained(model_base, low_cpu_mem_usage=True, **kwargs)
189
- print(f"Loading LoRA weights from {model_path}")
190
- model = PeftModel.from_pretrained(model, model_path)
191
- print(f"Merging weights")
192
- model = model.merge_and_unload()
193
- print('Convert to FP16...')
194
- model.to(torch.float16)
 
 
 
 
 
 
 
 
 
 
195
  else:
196
- use_fast = False
197
- tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=False)
198
- model = AutoModelForCausalLM.from_pretrained(model_path, low_cpu_mem_usage=True, **kwargs)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
199
 
200
  processor = None
201
 
202
- if "videollama" in model_name.lower() or 'vlb' in model_name.lower():
203
  vision_tower = model.get_vision_tower()
204
  if not vision_tower.is_loaded:
205
  vision_tower.load_model()
 
22
  from transformers import PretrainedConfig, AutoTokenizer, AutoModelForCausalLM, AutoConfig, BitsAndBytesConfig
23
 
24
  from .projector import load_mm_projector
25
+ from .videollama2_llama import Videollama2LlamaForCausalLM, Videollama2LlamaConfig
26
  from .videollama2_mistral import Videollama2MistralForCausalLM, Videollama2MistralConfig
27
  from .videollama2_mixtral import Videollama2MixtralForCausalLM, Videollama2MixtralConfig
28
  from .videollama2_qwen2 import Videollama2Qwen2ForCausalLM, Videollama2Qwen2Config
 
 
29
 
30
 
31
  VLLMs = {
 
34
  "videollama2_mistral": Videollama2MistralForCausalLM,
35
  "videollama2_mixtral": Videollama2MixtralForCausalLM,
36
  "videollama2_qwen2": Videollama2Qwen2ForCausalLM,
37
+ }
38
+
39
+ VLLMConfigs = {
40
+ "videollama2": Videollama2MistralConfig,
41
+ "videollama2_llama": Videollama2LlamaConfig,
42
+ "videollama2_mistral": Videollama2MistralConfig,
43
+ "videollama2_mixtral": Videollama2MixtralConfig,
44
+ "videollama2_qwen2": Videollama2Qwen2Config,
45
  }
46
 
47
 
 
73
  if use_flash_attn:
74
  kwargs['attn_implementation'] = 'flash_attention_2'
75
 
76
+ config = AutoConfig.from_pretrained(model_path)
77
+
78
+ # judge model type
79
+ model_type = config.model_type
80
+
81
+ # judge pretrain/finetune
82
+ try:
83
+ is_pretraining = config.tune_mm_mlp_adapter
84
+ except:
85
+ is_pretraining = False
86
+
87
+ # NOTE: lora/qlora model loading
88
+ if 'lora' in model_name.lower() or 'qlora' in model_name.lower():
89
+ cfg_pretrained = PretrainedConfig.from_pretrained(model_path, token=token)
90
+ # NOTE: AutoConfig will modify `_name_or_path` property to `model_path` if `model_path` is not None.
91
+ # cfg_pretrained = AutoConfig.from_pretrained(model_path, token=token)
92
+ model_base = model_base if model_base is not None else cfg_pretrained._name_or_path
93
+
94
+ # NOTE: remove qlora training quantization config
95
+ if hasattr(config, 'quantization_config'):
96
+ del config.quantization_config
97
+ tokenizer = AutoTokenizer.from_pretrained(model_base, use_fast=False, token=token)
98
+ print('Loading VideoLLaMA lora model...')
99
+
100
+ if 'vicuna' in model_base.lower():
101
+ model = Videollama2LlamaForCausalLM.from_pretrained(model_base, low_cpu_mem_usage=True, config=config, **kwargs)
102
+ elif 'mistral' in model_base.lower():
103
+ model = Videollama2MistralForCausalLM.from_pretrained(model_base, low_cpu_mem_usage=True, config=config, **kwargs)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
104
  else:
105
+ model = Videollama2MistralForCausalLM.from_pretrained(model_base, low_cpu_mem_usage=True, config=config, **kwargs)
106
+
107
+ token_num, tokem_dim = model.lm_head.out_features, model.lm_head.in_features
108
+ if model.lm_head.weight.shape[0] != token_num:
109
+ model.lm_head.weight = torch.nn.Parameter(torch.empty(token_num, tokem_dim, device=model.device, dtype=model.dtype))
110
+ model.model.embed_tokens.weight = torch.nn.Parameter(torch.empty(token_num, tokem_dim, device=model.device, dtype=model.dtype))
111
+
112
+ print('Loading additional VideoLLaMA weights...')
113
+ if os.path.exists(os.path.join(model_path, 'non_lora_trainables.bin')):
114
+ non_lora_trainables = torch.load(os.path.join(model_path, 'non_lora_trainables.bin'), map_location='cpu')
115
+ else:
116
+ # this is probably from HF Hub
117
+ from huggingface_hub import hf_hub_download
118
+ def load_from_hf(repo_id, filename, subfolder=None):
119
+ cache_file = hf_hub_download(
120
+ repo_id=repo_id,
121
+ filename=filename,
122
+ subfolder=subfolder)
123
+ return torch.load(cache_file, map_location='cpu')
124
+ non_lora_trainables = load_from_hf(model_path, 'non_lora_trainables.bin')
125
+ non_lora_trainables = {(k[11:] if k.startswith('base_model.') else k): v for k, v in non_lora_trainables.items()}
126
+ if any(k.startswith('model.model.') for k in non_lora_trainables):
127
+ non_lora_trainables = {(k[6:] if k.startswith('model.') else k): v for k, v in non_lora_trainables.items()}
128
+ model.load_state_dict(non_lora_trainables, strict=False)
129
+
130
+ from peft import PeftModel
131
+ print('Loading LoRA weights...')
132
+ model = PeftModel.from_pretrained(model, model_path)
133
+ print('Merging LoRA weights...')
134
+ model = model.merge_and_unload()
135
+ print('Model is loaded...')
136
+ elif model_base is not None or is_pretraining:
137
+ # NOTE: Base/Pretrain model loading
138
+ print('Loading VideoLLaMA 2 from base model...')
139
+ cfg_pretrained = PretrainedConfig.from_pretrained(model_path, token=token)
140
+ # NOTE: AutoConfig will modify `_name_or_path` property to `model_path` if `model_path` is not None.
141
+ # cfg_pretrained = AutoConfig.from_pretrained(model_path, token=token)
142
+ model_base = model_base if model_base is not None else cfg_pretrained._name_or_path
143
+
144
+ tokenizer = AutoTokenizer.from_pretrained(model_base, use_fast=False, token=token)
145
+
146
+ if model_type in ['videollama2', 'videollama2_mistral']:
147
+ model = Videollama2MistralForCausalLM.from_pretrained(model_base, low_cpu_mem_usage=True, config=config, **kwargs)
148
+ elif model_type in ['videollama2_mixtral']:
149
+ model = Videollama2MixtralForCausalLM.from_pretrained(model_base, low_cpu_mem_usage=True, config=config, **kwargs)
150
+ elif model_type in ['videollama2_qwen2']:
151
+ model = Videollama2Qwen2ForCausalLM.from_pretrained(model_base, low_cpu_mem_usage=True, config=config, **kwargs)
152
  else:
153
+ model = Videollama2MistralForCausalLM.from_pretrained(model_base, low_cpu_mem_usage=True, config=config, **kwargs)
154
+
155
+ # NOTE; loading vision-language projector
156
+ # * old codes for loading local mm_projector.bin
157
+ # mm_projector_weights = torch.load(os.path.join(model_path, 'mm_projector.bin'), map_location='cpu')
158
+ # mm_projector_weights = {k: v.to(torch.float16) for k, v in mm_projector_weights.items()}
159
+ # model.load_state_dict(mm_projector_weights, strict=False)
160
+ # * new codes which supports loading mm_projector.bin both offline and online
161
+ mm_projector_weights = load_mm_projector(model_path, token=token)
162
+ model.load_state_dict(mm_projector_weights, strict=False)
163
+ elif 'videollama2' in model_type:
164
+ # NOTE: SFT model loading
165
+ tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=False, token=token)
166
+
167
+ if model_type in ['videollama2', 'videollama2_mistral']:
168
+ model = Videollama2MistralForCausalLM.from_pretrained(model_path, low_cpu_mem_usage=True, config=config, **kwargs)
169
+ elif model_type in ['videollama2_mixtral']:
170
+ model = Videollama2MixtralForCausalLM.from_pretrained(model_path, low_cpu_mem_usage=True, config=config, **kwargs)
171
+ elif model_type in ['videollama2_qwen2']:
172
+ model = Videollama2Qwen2ForCausalLM.from_pretrained(model_path, low_cpu_mem_usage=True, config=config, **kwargs)
173
+ else:
174
+ model = Videollama2MistralForCausalLM.from_pretrained(model_path, low_cpu_mem_usage=True, config=config, **kwargs)
175
+ else:
176
+ tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=True, token=token)
177
+ model = AutoModelForCausalLM.from_pretrained(model_path, config=config, **kwargs)
178
 
179
  processor = None
180
 
181
+ if "videollama" in model_type:
182
  vision_tower = model.get_vision_tower()
183
  if not vision_tower.is_loaded:
184
  vision_tower.load_model()
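The rewritten loader keys its branches on the `model_type` stored in the checkpoint's config (plus `tune_mm_mlp_adapter`, which distinguishes pretrain-stage adapter checkpoints from full SFT weights) instead of pattern-matching the path name, and the new `VLLMs` / `VLLMConfigs` dicts make that mapping explicit. A rough sketch of how the registry can be queried; the checkpoint directory is a placeholder, and loading a real model still goes through `load_pretrained_model`:

```python
from transformers import AutoConfig
from videollama2.model import VLLMs, VLLMConfigs   # importing also registers the custom configs

model_path = "work_dirs/videollama2qwen2_vllava/finetune_siglip_tcv35_7b_16f"  # placeholder
config = AutoConfig.from_pretrained(model_path)

model_cls = VLLMs[config.model_type]          # e.g. Videollama2Qwen2ForCausalLM
config_cls = VLLMConfigs[config.model_type]   # e.g. Videollama2Qwen2Config
is_pretraining = getattr(config, "tune_mm_mlp_adapter", False)
print(config.model_type, model_cls.__name__, config_cls.__name__, is_pretraining)
```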
VideoLLaMA2/videollama2/model/encoder.py CHANGED
@@ -88,6 +88,10 @@ class CLIPVisionTower(nn.Module):
88
  def num_patches_per_side(self):
89
  return self.config.image_size // self.config.patch_size
90
 
 
 
 
 
91
 
92
  class SiglipVisionTower(nn.Module):
93
 
@@ -165,7 +169,11 @@ class SiglipVisionTower(nn.Module):
165
  @property
166
  def num_patches_per_side(self):
167
  return self.config.image_size // self.config.patch_size
168
-
 
 
 
 
169
 
170
  def build_vision_tower(vision_tower_cfg, **kwargs):
171
  vision_tower = getattr(vision_tower_cfg, 'mm_vision_tower', getattr(vision_tower_cfg, 'vision_tower', None))
 
88
  def num_patches_per_side(self):
89
  return self.config.image_size // self.config.patch_size
90
 
91
+ @property
92
+ def image_size(self):
93
+ return self.config.image_size
94
+
95
 
96
  class SiglipVisionTower(nn.Module):
97
 
 
169
  @property
170
  def num_patches_per_side(self):
171
  return self.config.image_size // self.config.patch_size
172
+
173
+ @property
174
+ def image_size(self):
175
+ return self.config.image_size
176
+
177
 
178
  def build_vision_tower(vision_tower_cfg, **kwargs):
179
  vision_tower = getattr(vision_tower_cfg, 'mm_vision_tower', getattr(vision_tower_cfg, 'vision_tower', None))
VideoLLaMA2/videollama2/model/videollama2_arch.py CHANGED
@@ -117,6 +117,7 @@ class Videollama2MetaForCausalLM(ABC):
117
 
118
  data_batch = []
119
  for i, (data, modal) in enumerate(images):
 
120
  if modal == 'image':
121
  data = data.expand(num_frames, -1, -1, -1)
122
  else:
@@ -125,6 +126,8 @@ class Videollama2MetaForCausalLM(ABC):
125
 
126
  data_batch = torch.stack(data_batch, dim=0)
127
 
 
 
128
  assert len(data_batch.size()) == 5
129
  batch_size = data_batch.size(0)
130
 
 
117
 
118
  data_batch = []
119
  for i, (data, modal) in enumerate(images):
120
+ print(data, modal.shape)
121
  if modal == 'image':
122
  data = data.expand(num_frames, -1, -1, -1)
123
  else:
 
126
 
127
  data_batch = torch.stack(data_batch, dim=0)
128
 
129
+ print(data_batch.shape)
130
+
131
  assert len(data_batch.size()) == 5
132
  batch_size = data_batch.size(0)
133
 
VideoLLaMA2/videollama2/model/videollama2_gemma2.py DELETED
@@ -1,157 +0,0 @@
1
- # Adopted from: https://github.com/haotian-liu/LLaVA. Below is the original copyright:
2
- # Copyright 2023 Haotian Liu
3
- #
4
- # Licensed under the Apache License, Version 2.0 (the "License");
5
- # you may not use this file except in compliance with the License.
6
- # You may obtain a copy of the License at
7
- #
8
- # http://www.apache.org/licenses/LICENSE-2.0
9
- #
10
- # Unless required by applicable law or agreed to in writing, software
11
- # distributed under the License is distributed on an "AS IS" BASIS,
12
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
- # See the License for the specific language governing permissions and
14
- # limitations under the License.
15
-
16
-
17
- from typing import List, Optional, Tuple, Union
18
-
19
- import torch
20
- import torch.nn as nn
21
- from torch.nn import CrossEntropyLoss
22
-
23
- from transformers import AutoConfig, AutoModelForCausalLM, \
24
- Gemma2Config, Gemma2Model, Gemma2ForCausalLM
25
-
26
- from transformers.modeling_outputs import CausalLMOutputWithPast
27
- from transformers.generation.utils import GenerateOutput
28
-
29
- from .videollama2_arch import Videollama2MetaModel, Videollama2MetaForCausalLM
30
-
31
-
32
- class Videollama2Gemma2Config(Gemma2Config):
33
- model_type = "videollama2_gemma2"
34
-
35
- def __init__(self, **kwargs):
36
- super().__init__(**kwargs)
37
- self.model_type = "videollama2_gemma2"
38
-
39
-
40
- class Videollama2Gemma2Model(Videollama2MetaModel, Gemma2Model):
41
- config_class = Videollama2Gemma2Config
42
-
43
- def __init__(self, config: Gemma2Config):
44
- super(Videollama2Gemma2Model, self).__init__(config)
45
-
46
-
47
- class Videollama2Gemma2ForCausalLM(Gemma2ForCausalLM, Videollama2MetaForCausalLM):
48
- config_class = Videollama2Gemma2Config
49
-
50
- def __init__(self, config, **kwargs):
51
- super(Gemma2ForCausalLM, self).__init__(config)
52
- self.model = Videollama2Gemma2Model(config)
53
- # self.pretraining_tp = config.pretraining_tp
54
- self.vocab_size = config.vocab_size
55
- self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
56
-
57
- # Initialize weights and apply final processing
58
- self.post_init()
59
-
60
- def get_model(self):
61
- return self.model
62
-
63
- def forward(
64
- self,
65
- input_ids: torch.LongTensor = None,
66
- attention_mask: Optional[torch.Tensor] = None,
67
- position_ids: Optional[torch.LongTensor] = None,
68
- past_key_values: Optional[List[torch.FloatTensor]] = None,
69
- inputs_embeds: Optional[torch.FloatTensor] = None,
70
- labels: Optional[torch.LongTensor] = None,
71
- use_cache: Optional[bool] = None,
72
- output_attentions: Optional[bool] = None,
73
- output_hidden_states: Optional[bool] = None,
74
- images: Optional[torch.FloatTensor] = None,
75
- return_dict: Optional[bool] = None,
76
- **kwargs
77
- ) -> Union[Tuple, CausalLMOutputWithPast]:
78
-
79
- if inputs_embeds is None:
80
- (
81
- input_ids,
82
- attention_mask,
83
- past_key_values,
84
- inputs_embeds,
85
- labels
86
- ) = self.prepare_inputs_labels_for_multimodal(
87
- input_ids,
88
- attention_mask,
89
- past_key_values,
90
- labels,
91
- images
92
- )
93
-
94
- outputs = super().forward(
95
- input_ids=input_ids,
96
- attention_mask=attention_mask,
97
- past_key_values=past_key_values,
98
- inputs_embeds=inputs_embeds,
99
- labels=labels,
100
- use_cache=use_cache,
101
- output_attentions=output_attentions,
102
- output_hidden_states=output_hidden_states,
103
- return_dict=return_dict
104
- )
105
-
106
- outputs.labels = labels
107
-
108
- return outputs
109
-
110
- @torch.no_grad()
111
- def generate(
112
- self,
113
- inputs: Optional[torch.Tensor] = None,
114
- images: Optional[torch.Tensor] = None,
115
- **kwargs,
116
- ) -> Union[GenerateOutput, torch.LongTensor]:
117
- position_ids = kwargs.pop("position_ids", None)
118
- attention_mask = kwargs.pop("attention_mask", None)
119
- if "inputs_embeds" in kwargs:
120
- raise NotImplementedError("`inputs_embeds` is not supported")
121
-
122
- if images is not None:
123
- (
124
- input_ids,
125
- attention_mask,
126
- past_key_values,
127
- inputs_embeds,
128
- _
129
- ) = self.prepare_inputs_labels_for_multimodal(
130
- input_ids=inputs,
131
- attention_mask=attention_mask,
132
- past_key_values=None,
133
- labels=None,
134
- images=images
135
- )
136
- else:
137
- inputs_embeds = self.get_model().embed_tokens(inputs)
138
-
139
- return super().generate(
140
- position_ids=position_ids,
141
- attention_mask=attention_mask,
142
- inputs_embeds=inputs_embeds,
143
- **kwargs
144
- )
145
-
146
- def prepare_inputs_for_generation(self, input_ids, past_key_values=None, inputs_embeds=None, **kwargs):
147
- images = kwargs.pop("images", None)
148
- _inputs = super().prepare_inputs_for_generation(
149
- input_ids, past_key_values=past_key_values, inputs_embeds=inputs_embeds, **kwargs
150
- )
151
- if images is not None:
152
- _inputs['images'] = images
153
- return _inputs
154
-
155
-
156
- AutoConfig.register("videollama2_gemma2", Videollama2Gemma2Config)
157
- AutoModelForCausalLM.register(Videollama2Gemma2Config, Videollama2Gemma2ForCausalLM)
VideoLLaMA2/videollama2/model/videollama2_llama.py CHANGED
@@ -27,7 +27,7 @@ from transformers.generation.utils import GenerateOutput
27
  from .videollama2_arch import Videollama2MetaModel, Videollama2MetaForCausalLM
28
 
29
 
30
- class Videollama2Config(LlamaConfig):
31
  model_type = "videollama2_llama"
32
 
33
  def __init__(self, **kwargs):
@@ -36,14 +36,14 @@ class Videollama2Config(LlamaConfig):
36
 
37
 
38
  class Videollama2LlamaModel(Videollama2MetaModel, LlamaModel):
39
- config_class = Videollama2Config
40
 
41
  def __init__(self, config: LlamaConfig):
42
  super(Videollama2LlamaModel, self).__init__(config)
43
 
44
 
45
  class Videollama2LlamaForCausalLM(LlamaForCausalLM, Videollama2MetaForCausalLM):
46
- config_class = Videollama2Config
47
 
48
  def __init__(self, config, **kwargs):
49
  super(LlamaForCausalLM, self).__init__(config)
@@ -98,7 +98,7 @@ class Videollama2LlamaForCausalLM(LlamaForCausalLM, Videollama2MetaForCausalLM):
98
  use_cache=use_cache,
99
  output_attentions=output_attentions,
100
  output_hidden_states=output_hidden_states,
101
- return_dict=return_dict
102
  )
103
 
104
  outputs.labels = labels
@@ -151,5 +151,5 @@ class Videollama2LlamaForCausalLM(LlamaForCausalLM, Videollama2MetaForCausalLM):
151
  return _inputs
152
 
153
 
154
- AutoConfig.register("videollama2_llama", Videollama2Config)
155
- AutoModelForCausalLM.register(Videollama2Config, Videollama2LlamaForCausalLM)
 
27
  from .videollama2_arch import Videollama2MetaModel, Videollama2MetaForCausalLM
28
 
29
 
30
+ class Videollama2LlamaConfig(LlamaConfig):
31
  model_type = "videollama2_llama"
32
 
33
  def __init__(self, **kwargs):
 
36
 
37
 
38
  class Videollama2LlamaModel(Videollama2MetaModel, LlamaModel):
39
+ config_class = Videollama2LlamaConfig
40
 
41
  def __init__(self, config: LlamaConfig):
42
  super(Videollama2LlamaModel, self).__init__(config)
43
 
44
 
45
  class Videollama2LlamaForCausalLM(LlamaForCausalLM, Videollama2MetaForCausalLM):
46
+ config_class = Videollama2LlamaConfig
47
 
48
  def __init__(self, config, **kwargs):
49
  super(LlamaForCausalLM, self).__init__(config)
 
98
  use_cache=use_cache,
99
  output_attentions=output_attentions,
100
  output_hidden_states=output_hidden_states,
101
+ return_dict=return_dict,
102
  )
103
 
104
  outputs.labels = labels
 
151
  return _inputs
152
 
153
 
154
+ AutoConfig.register("videollama2_llama", Videollama2LlamaConfig)
155
+ AutoModelForCausalLM.register(Videollama2LlamaConfig, Videollama2LlamaForCausalLM)
VideoLLaMA2/videollama2/model/videollama2_mistral.py CHANGED
@@ -100,7 +100,7 @@ class Videollama2MistralForCausalLM(MistralForCausalLM, Videollama2MetaForCausal
100
  use_cache=use_cache,
101
  output_attentions=output_attentions,
102
  output_hidden_states=output_hidden_states,
103
- return_dict=return_dict
104
  )
105
 
106
  outputs.labels = labels
 
100
  use_cache=use_cache,
101
  output_attentions=output_attentions,
102
  output_hidden_states=output_hidden_states,
103
+ return_dict=return_dict,
104
  )
105
 
106
  outputs.labels = labels
VideoLLaMA2/videollama2/model/videollama2_mixtral.py CHANGED
@@ -99,7 +99,7 @@ class Videollama2MixtralForCausalLM(MixtralForCausalLM, Videollama2MetaForCausal
99
  use_cache=use_cache,
100
  output_attentions=output_attentions,
101
  output_hidden_states=output_hidden_states,
102
- return_dict=return_dict
103
  )
104
 
105
  @torch.no_grad()
 
99
  use_cache=use_cache,
100
  output_attentions=output_attentions,
101
  output_hidden_states=output_hidden_states,
102
+ return_dict=return_dict,
103
  )
104
 
105
  @torch.no_grad()
VideoLLaMA2/videollama2/model/videollama2_phi3.py DELETED
@@ -1,157 +0,0 @@
-# Adopted from: https://github.com/haotian-liu/LLaVA. Below is the original copyright:
-# Copyright 2023 Haotian Liu
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-
-from typing import List, Optional, Tuple, Union
-
-import torch
-import torch.nn as nn
-from torch.nn import CrossEntropyLoss
-
-from transformers import AutoConfig, AutoModelForCausalLM, PretrainedConfig, \
-    Phi3Config, Phi3Model, Phi3ForCausalLM
-
-from transformers.modeling_outputs import CausalLMOutputWithPast
-from transformers.generation.utils import GenerateOutput
-
-from .videollama2_arch import Videollama2MetaModel, Videollama2MetaForCausalLM
-
-
-class Videollama2Phi3Config(Phi3Config):
-    model_type = "videollama2_phi3"
-
-    def __init__(self, **kwargs):
-        super().__init__(**kwargs)
-        self.model_type = "videollama2_phi3"
-
-
-class Videollama2Phi3Model(Videollama2MetaModel, Phi3Model):
-    config_class = Videollama2Phi3Config
-
-    def __init__(self, config: Phi3Config):
-        super(Videollama2Phi3Model, self).__init__(config)
-
-
-class Videollama2Phi3ForCausalLM(Phi3ForCausalLM, Videollama2MetaForCausalLM):
-    config_class = Videollama2Phi3Config
-
-    def __init__(self, config, **kwargs):
-        super(Phi3ForCausalLM, self).__init__(config)
-        self.model = Videollama2Phi3Model(config)
-        # self.pretraining_tp = config.pretraining_tp
-        self.vocab_size = config.vocab_size
-        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
-
-        # Initialize weights and apply final processing
-        self.post_init()
-
-    def get_model(self):
-        return self.model
-
-    def forward(
-        self,
-        input_ids: torch.LongTensor = None,
-        attention_mask: Optional[torch.Tensor] = None,
-        position_ids: Optional[torch.LongTensor] = None,
-        past_key_values: Optional[List[torch.FloatTensor]] = None,
-        inputs_embeds: Optional[torch.FloatTensor] = None,
-        labels: Optional[torch.LongTensor] = None,
-        use_cache: Optional[bool] = None,
-        output_attentions: Optional[bool] = None,
-        output_hidden_states: Optional[bool] = None,
-        images: Optional[torch.FloatTensor] = None,
-        return_dict: Optional[bool] = None,
-        **kwargs
-    ) -> Union[Tuple, CausalLMOutputWithPast]:
-
-        if inputs_embeds is None:
-            (
-                input_ids,
-                attention_mask,
-                past_key_values,
-                inputs_embeds,
-                labels
-            ) = self.prepare_inputs_labels_for_multimodal(
-                input_ids,
-                attention_mask,
-                past_key_values,
-                labels,
-                images
-            )
-
-        outputs = super().forward(
-            input_ids=input_ids,
-            attention_mask=attention_mask,
-            past_key_values=past_key_values,
-            inputs_embeds=inputs_embeds,
-            labels=labels,
-            use_cache=use_cache,
-            output_attentions=output_attentions,
-            output_hidden_states=output_hidden_states,
-            return_dict=return_dict
-        )
-
-        outputs.labels = labels
-
-        return outputs
-
-    @torch.no_grad()
-    def generate(
-        self,
-        inputs: Optional[torch.Tensor] = None,
-        images: Optional[torch.Tensor] = None,
-        **kwargs,
-    ) -> Union[GenerateOutput, torch.LongTensor]:
-        position_ids = kwargs.pop("position_ids", None)
-        attention_mask = kwargs.pop("attention_mask", None)
-        if "inputs_embeds" in kwargs:
-            raise NotImplementedError("`inputs_embeds` is not supported")
-
-        if images is not None:
-            (
-                input_ids,
-                attention_mask,
-                past_key_values,
-                inputs_embeds,
-                _
-            ) = self.prepare_inputs_labels_for_multimodal(
-                input_ids=inputs,
-                attention_mask=attention_mask,
-                past_key_values=None,
-                labels=None,
-                images=images
-            )
-        else:
-            inputs_embeds = self.get_model().embed_tokens(inputs)
-
-        return super().generate(
-            position_ids=position_ids,
-            attention_mask=attention_mask,
-            inputs_embeds=inputs_embeds,
-            **kwargs
-        )
-
-    def prepare_inputs_for_generation(self, input_ids, past_key_values=None, inputs_embeds=None, **kwargs):
-        images = kwargs.pop("images", None)
-        _inputs = super().prepare_inputs_for_generation(
-            input_ids, past_key_values=past_key_values, inputs_embeds=inputs_embeds, **kwargs
-        )
-        if images is not None:
-            _inputs['images'] = images
-        return _inputs
-
-
-AutoConfig.register("videollama2_phi3", Videollama2Phi3Config)
-AutoModelForCausalLM.register(Videollama2Phi3Config, Videollama2Phi3ForCausalLM)
VideoLLaMA2/videollama2/model/videollama2_qwen2.py CHANGED
@@ -98,7 +98,7 @@ class Videollama2Qwen2ForCausalLM(Qwen2ForCausalLM, Videollama2MetaForCausalLM):
             use_cache=use_cache,
             output_attentions=output_attentions,
             output_hidden_states=output_hidden_states,
-            return_dict=return_dict
+            return_dict=return_dict,
         )
 
     @torch.no_grad()
VideoLLaMA2/videollama2/serve/gradio_web_server_adhoc.py CHANGED
@@ -129,20 +129,26 @@ def generate(image, video, message, chatbot, textbox_in, temperature, top_p, max_output_tokens, dtype=torch.float16):
         one_turn_chat[0] += "\n" + show_images
     # 2. not first run case
     else:
-        previous_image = re.findall(r'<img src="./file=(.+?)"', chatbot[0][0])
-        previous_video = re.findall(r'<video controls playsinline width="500" style="display: inline-block;" src="./file=(.+?)"', chatbot[0][0])
-        if len(previous_image) > 0:
-            previous_image = previous_image[0]
-            # 2.1 new image append or pure text input will start a new conversation
-            if previous_image != image:
-                message.clear()
-                one_turn_chat[0] += "\n" + show_images if image is not None else ""
-        elif len(previous_video) > 0:
-            previous_video = previous_video[0]
-            # 2.2 new video append or pure text input will start a new conversation
-            if previous_video != video:
-                message.clear()
-                one_turn_chat[0] += "\n" + show_images if video is not None else ""
+        # scanning the last image or video
+        length = len(chatbot)
+        for i in range(length - 1, -1, -1):
+            previous_image = re.findall(r'<img src="./file=(.+?)"', chatbot[i][0])
+            previous_video = re.findall(r'<video controls playsinline width="500" style="display: inline-block;" src="./file=(.+?)"', chatbot[i][0])
+
+            if len(previous_image) > 0:
+                previous_image = previous_image[-1]
+                # 2.1 new image append or pure text input will start a new conversation
+                if (video is not None) or (image is not None and os.path.basename(previous_image) != os.path.basename(image)):
+                    message.clear()
+                    one_turn_chat[0] += "\n" + show_images
+                break
+            elif len(previous_video) > 0:
+                previous_video = previous_video[-1]
+                # 2.2 new video append or pure text input will start a new conversation
+                if image is not None or (video is not None and os.path.basename(previous_video) != os.path.basename(video)):
+                    message.clear()
+                    one_turn_chat[0] += "\n" + show_images
+                break
 
     message.append({'role': 'user', 'content': textbox_in})
     text_en_out = handler.generate(data, message, temperature=temperature, top_p=top_p, max_output_tokens=max_output_tokens)
@@ -173,7 +179,7 @@ def clear_history(message, chatbot):
 # 2. The operation or tensor which requires cuda are limited in those functions wrapped via spaces.GPU
 # 3. The function can't return tensor or other cuda objects.
 
-model_path = 'DAMO-NLP-SG/VideoLLaMA2-7B-16F'
+model_path = 'DAMO-NLP-SG/VideoLLaMA2.1-7B-16F'
 
 handler = Chat(model_path, load_8bit=False, load_4bit=True)
 
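The revised logic above walks the chat history from newest to oldest and only resets the conversation when the attached media actually changes (compared by basename) or the modality switches. A stripped-down sketch of that scan, using a plain list of HTML strings as a stand-in for the Gradio chatbot state and a simplified `<video>` regex:

```python
import os
import re

# Stand-in for chatbot state: each turn's user message may embed an <img> or <video> tag.
history = [
    ('<img src="./file=/tmp/cat.jpg">', 'A cat.'),
    ('What breed is it?', 'Looks like a tabby.'),
]

def last_media(history):
    """Return the most recently attached (kind, path), scanning newest turn first."""
    for user_msg, _ in reversed(history):
        images = re.findall(r'<img src="./file=(.+?)"', user_msg)
        videos = re.findall(r'<video .*?src="./file=(.+?)"', user_msg)
        if images:
            return 'image', images[-1]
        if videos:
            return 'video', videos[-1]
    return None, None

kind, path = last_media(history)
new_image = '/tmp/dog.jpg'
# Same rule as the diff: a different basename (or a modality switch) starts a new conversation.
start_new = kind != 'image' or os.path.basename(path) != os.path.basename(new_image)
print(kind, path, start_new)  # image /tmp/cat.jpg True
```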
VideoLLaMA2/videollama2/train.py CHANGED
@@ -87,7 +87,7 @@ class ModelArguments:
 @dataclass
 class DataArguments:
     # Path Arguments
-    data_path: str = field(default=None, metadata={"help": "Path to the training data."})
+    data_path: List[str] = field(default=None, metadata={"help": "Path to the training data."})
     # image_folder: Optional[str] = field(default=None)
     # video_folder: Optional[str] = field(default=None)
     data_folder: Optional[str] = field(default=None)
@@ -105,7 +105,6 @@ class TrainingArguments(transformers.TrainingArguments):
     mm_projector_lr: Optional[float] = None
     freeze_mm_mlp_adapter: bool = field(default=False)
     remove_unused_columns: bool = field(default=False)
-    cache_dir: Optional[str] = field(default=None)
     # Training Data Arguments
     group_by_modality_length: bool = field(default=False)
     model_max_length: int = field(
@@ -153,23 +152,14 @@ def preprocess_plain(
            {'role': 'user', 'content': modal_token},
            {'role': 'assistant', 'content': source[1]['value']}
        ]
-        conversation = tokenizer.apply_chat_template(message, tokenize=False, add_generation_prompt=False)
-        # 2. tokenize conversations
-        input_ids.append(tokenizer_multimodal_token(conversation, tokenizer, modal_token, return_tensors='pt'))
-        # 3. make targets
-        targets.append(copy.deepcopy(input_ids[-1]))
-        instruction = tokenizer.apply_chat_template(message[:1], tokenize=False, add_generation_prompt=True)
-        instruction_len = len(tokenizer_multimodal_token(instruction, tokenizer, modal_token, return_tensors='pt'))
-        targets[-1][:instruction_len] = IGNORE_INDEX
-
-        # print("instruction: ----------------")
-        # print(instruction)
-        # print("conversation: ----------------")
-        # print(conversation)
-        # print("training targets: ----------------")
-        # print(tokenizer.decode(targets[-1][instruction_len:]))
-        # print(input_ids[-1])
-        # print(targets[-1])
+        conversation = " ".join([sentence['value'] for sentence in source])
+
+        input_id = tokenizer_multimodal_token(conversation, tokenizer, modal_token, return_tensors='pt')
+        target = copy.deepcopy(input_id)
+        target[input_id == MODAL_INDEX_MAP[modal_token]] = IGNORE_INDEX
+
+        input_ids.append(input_id)
+        targets.append(target)
 
     return dict(input_ids=input_ids, labels=targets)
 
@@ -251,7 +241,10 @@ class LazySupervisedDataset(Dataset):
                 tokenizer: transformers.PreTrainedTokenizer,
                 data_args: DataArguments):
        super(LazySupervisedDataset, self).__init__()
-        list_data_dict = json.load(open(data_path, "r"))
+        list_data_dict = []
+        for dp in data_path:
+            _datas = json.load(open(dp, "r"))
+            list_data_dict.extend(_datas)
 
        rank0_print("Formatting inputs...Skip in lazy mode")
        self.tokenizer = tokenizer
@@ -340,8 +333,7 @@ class LazySupervisedDataset(Dataset):
            data_dict['video'] = video
        elif self.data_args.is_multimodal:
            # image does not exist in the data, but the model is multimodal
-            crop_size = self.data_args.image_processor.crop_size
-            data_dict['image'] = torch.zeros(3, crop_size['height'], crop_size['width'])
+            data_dict['image'] = torch.zeros(3, self.data_args.image_size, self.data_args.image_size)
        return data_dict
 
 
@@ -429,18 +421,14 @@ def train(attn_implementation=None):
            bnb_4bit_quant_storage=compute_dtype,
        )
    ))
-
-    config = transformers.AutoConfig.from_pretrained(model_args.model_path, trust_remote_code=True)
-    if 'gemma2' in model_args.model_type:
-        config._attn_implementation = 'eager'
-    else:
-        config._attn_implementation = attn_implementation
+
+    config = VLLMConfigs[model_args.model_type].from_pretrained(model_args.model_path, trust_remote_code=True)
+    config._attn_implementation = attn_implementation
 
    if model_args.vision_tower is not None:
        model = VLLMs[model_args.model_type].from_pretrained(
            model_args.model_path,
            config=config,
-            cache_dir=training_args.cache_dir,
            torch_dtype=(torch.bfloat16 if training_args.bf16 else None),
            do_sample=True,
            **bnb_model_from_pretrained_args
@@ -452,7 +440,6 @@ def train(attn_implementation=None):
        model = transformers.LlamaForCausalLM.from_pretrained(
            model_args.model_path,
            config=config,
-            cache_dir=training_args.cache_dir,
            torch_dtype=(torch.bfloat16 if training_args.bf16 else None),
            do_sample=True,
            **bnb_model_from_pretrained_args
@@ -496,7 +483,6 @@ def train(attn_implementation=None):
 
    tokenizer = transformers.AutoTokenizer.from_pretrained(
        model_args.model_path,
-        cache_dir=training_args.cache_dir,
        model_max_length=training_args.model_max_length,
        padding_side="right",
        use_fast=True,
@@ -512,6 +498,8 @@ def train(attn_implementation=None):
        vision_tower = model.get_vision_tower()
        vision_tower.to(dtype=torch.bfloat16 if training_args.bf16 else torch.float16, device=training_args.device)
 
+        data_args.image_size = vision_tower.image_size
+
        data_args.image_processor = vision_tower.image_processor
        data_args.video_processor = vision_tower.video_processor if hasattr(vision_tower, "video_processor") else vision_tower.image_processor
 
@@ -581,4 +569,4 @@
 
 
 if __name__ == "__main__":
-    train()
+    train("flash_attention_2")
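With `data_path` now typed as `List[str]`, a single run can consume several annotation files, which the dataset constructor above simply concatenates. A small sketch of how such a field parses from the command line; the JSON file names are placeholders, not files shipped with the repo:

```python
# Sketch: HfArgumentParser turns a List[str] dataclass field into a multi-value CLI flag.
from dataclasses import dataclass, field
from typing import List, Optional

from transformers import HfArgumentParser

@dataclass
class DataArguments:
    data_path: List[str] = field(default=None, metadata={"help": "Path to the training data."})
    data_folder: Optional[str] = field(default=None)

parser = HfArgumentParser(DataArguments)
(data_args,) = parser.parse_args_into_dataclasses(
    args=["--data_path", "stage2_video.json", "stage2_image.json"]
)
print(data_args.data_path)  # ['stage2_video.json', 'stage2_image.json']

# The dataset constructor in the diff then merges every file into one annotation list:
# list_data_dict = []
# for dp in data_args.data_path:
#     list_data_dict.extend(json.load(open(dp, "r")))
```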
VideoLLaMA2/videollama2/train_flash_attn.py DELETED
@@ -1,12 +0,0 @@
1
- # Adopted from https://github.com/haotian-liu/LLaVA. Below is the original copyright:
2
- # Adopted from https://github.com/lm-sys/FastChat. Below is the original copyright:
3
- # Adopted from tatsu-lab@stanford_alpaca. Below is the original copyright:
4
- # Make it more memory efficient by monkey patching the LLaMA model with FlashAttn.
5
-
6
- import sys
7
- sys.path.append('./')
8
-
9
- from videollama2.train import train
10
-
11
- if __name__ == "__main__":
12
- train(attn_implementation="flash_attention_2")