Tags: Text Generation, Transformers, Safetensors, English, llava, multimodal, conversational, Eval Results, Inference Endpoints
ZhangYuanhan committed
Commit: 1bfe401
Parent: cc6c7b1

Update README.md

Files changed (1):
  README.md (+2 -1)
README.md CHANGED
@@ -199,6 +199,7 @@ video_path = "XXXX"
  max_frames_num = "64"
  video,frame_time,video_time = load_video(video_path, max_frames_num, 1, force_sample=True)
  video = image_processor.preprocess(video, return_tensors="pt")["pixel_values"].cuda().bfloat16()
+ video = [video]
  conv_template = "qwen_1_5" # Make sure you use correct chat template for different models
  question = DEFAULT_IMAGE_TOKEN + "\nPlease describe this video in detail."
  conv = copy.deepcopy(conv_templates[conv_template])
@@ -209,7 +210,7 @@ input_ids = tokenizer_image_token(prompt_question, tokenizer, IMAGE_TOKEN_INDEX,
  cont = model.generate(
      input_ids,
      images=video,
-     modalities="video"
+     modalities=["video"],
      do_sample=False,
      temperature=0,
      max_new_tokens=4096,
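For context, both hunks touch the same inference example: the commit wraps the preprocessed frames in a list (`video = [video]`) and passes `modalities=["video"],` instead of the bare string, so both arguments become lists with one entry per visual input (and the missing trailing comma after `modalities` is fixed). Below is a minimal sketch of the video-inference snippet with these changes applied. The lines shown in the diff are copied as-is; the surrounding pieces (the conversation construction via `conv.append_message`/`conv.get_prompt`, the `.unsqueeze(0).cuda()` on the prompt ids, and the final `tokenizer.batch_decode`) are filled in from the usual LLaVA conversation API and are assumptions, not part of this commit. `max_frames_num` is also written as an integer here rather than the string `"64"` that appears in the README.

```python
# Minimal sketch of the updated snippet; helpers such as load_video,
# image_processor, tokenizer, model, conv_templates, DEFAULT_IMAGE_TOKEN,
# IMAGE_TOKEN_INDEX and tokenizer_image_token are assumed to be set up
# earlier in the README.
import copy

video_path = "XXXX"                      # replace with the path to your video
max_frames_num = 64                      # assumption: integer frame count (README shows "64")
video, frame_time, video_time = load_video(video_path, max_frames_num, 1, force_sample=True)
video = image_processor.preprocess(video, return_tensors="pt")["pixel_values"].cuda().bfloat16()
video = [video]                          # change from this commit: one tensor per sample, in a list

conv_template = "qwen_1_5"               # make sure you use the correct chat template for your model
question = DEFAULT_IMAGE_TOKEN + "\nPlease describe this video in detail."
conv = copy.deepcopy(conv_templates[conv_template])
conv.append_message(conv.roles[0], question)   # assumption: standard LLaVA conversation API
conv.append_message(conv.roles[1], None)
prompt_question = conv.get_prompt()

input_ids = tokenizer_image_token(
    prompt_question, tokenizer, IMAGE_TOKEN_INDEX, return_tensors="pt"
).unsqueeze(0).cuda()

cont = model.generate(
    input_ids,
    images=video,
    modalities=["video"],                # change from this commit: list form, trailing comma added
    do_sample=False,
    temperature=0,
    max_new_tokens=4096,
)
print(tokenizer.batch_decode(cont, skip_special_tokens=True)[0].strip())
```

With both arguments in list form, `images` and `modalities` stay aligned element-by-element, which is presumably why the bare string was replaced.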