Tags: Text Generation, Transformers, Safetensors, English, llava, multimodal, conversational, Eval Results, Inference Endpoints
ZhangYuanhan committed
Commit: 1bfe401
Parent: cc6c7b1

Update README.md

Files changed (1):
  README.md (+2 -1)
README.md CHANGED
@@ -199,6 +199,7 @@ video_path = "XXXX"
  max_frames_num = "64"
  video,frame_time,video_time = load_video(video_path, max_frames_num, 1, force_sample=True)
  video = image_processor.preprocess(video, return_tensors="pt")["pixel_values"].cuda().bfloat16()
+ video = [video]
  conv_template = "qwen_1_5" # Make sure you use correct chat template for different models
  question = DEFAULT_IMAGE_TOKEN + "\nPlease describe this video in detail."
  conv = copy.deepcopy(conv_templates[conv_template])
@@ -209,7 +210,7 @@ input_ids = tokenizer_image_token(prompt_question, tokenizer, IMAGE_TOKEN_INDEX,
  cont = model.generate(
      input_ids,
      images=video,
-     modalities="video"
+     modalities=["video"],
      do_sample=False,
      temperature=0,
      max_new_tokens=4096,
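For context, both hunks touch the same inference example: the commit wraps the preprocessed frames in a list (`video = [video]`) and passes `modalities=["video"],` instead of the bare string, so both arguments become lists with one entry per visual input (and the missing trailing comma after `modalities` is fixed). Below is a minimal sketch of the video-inference snippet with these changes applied. The lines shown in the diff are copied as-is; the surrounding pieces (the conversation construction via `conv.append_message`/`conv.get_prompt`, the `.unsqueeze(0).cuda()` on the prompt ids, and the final `tokenizer.batch_decode`) are filled in from the usual LLaVA conversation API and are assumptions, not part of this commit. `max_frames_num` is also written as an integer here rather than the string `"64"` that appears in the README.

```python
# Minimal sketch of the updated snippet; helpers such as load_video,
# image_processor, tokenizer, model, conv_templates, DEFAULT_IMAGE_TOKEN,
# IMAGE_TOKEN_INDEX and tokenizer_image_token are assumed to be set up
# earlier in the README.
import copy

video_path = "XXXX"                      # replace with the path to your video
max_frames_num = 64                      # assumption: integer frame count (README shows "64")
video, frame_time, video_time = load_video(video_path, max_frames_num, 1, force_sample=True)
video = image_processor.preprocess(video, return_tensors="pt")["pixel_values"].cuda().bfloat16()
video = [video]                          # change from this commit: one tensor per sample, in a list

conv_template = "qwen_1_5"               # make sure you use the correct chat template for your model
question = DEFAULT_IMAGE_TOKEN + "\nPlease describe this video in detail."
conv = copy.deepcopy(conv_templates[conv_template])
conv.append_message(conv.roles[0], question)   # assumption: standard LLaVA conversation API
conv.append_message(conv.roles[1], None)
prompt_question = conv.get_prompt()

input_ids = tokenizer_image_token(
    prompt_question, tokenizer, IMAGE_TOKEN_INDEX, return_tensors="pt"
).unsqueeze(0).cuda()

cont = model.generate(
    input_ids,
    images=video,
    modalities=["video"],                # change from this commit: list form, trailing comma added
    do_sample=False,
    temperature=0,
    max_new_tokens=4096,
)
print(tokenizer.batch_decode(cont, skip_special_tokens=True)[0].strip())
```

With both arguments in list form, `images` and `modalities` stay aligned element-by-element, which is presumably why the bare string was replaced.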