llava-hf
/

LLaVA-NeXT-Video-7B-32K-hf

Image-Text-to-Text

llava_next_video

Inference Endpoints

Model card Files Files and versions Community

RaushanTurganbay HF staff committed on Aug 16

Commit

3c6d41a

•

1 Parent(s): 437026d

update processor kwargs

Files changed (1) hide show

README.md +2 -2

README.md CHANGED Viewed

@@ -89,7 +89,7 @@ def read_video_pyav(container, indices):
     return np.stack([x.to_ndarray(format="rgb24") for x in frames])
-# define a chat histiry and use `apply_chat_template` to get correctly formatted prompt
 # Each value in "content" has to be a list of dicts with types ("text", "image", "video")
 conversation = [
     {
@@ -138,7 +138,7 @@ prompt = processor.apply_chat_template(conversation, add_generation_prompt=True)
 image_file = "http://images.cocodataset.org/val2017/000000039769.jpg"
 raw_image = Image.open(requests.get(image_file, stream=True).raw)
-inputs_image = processor(prompt, images=raw_image, return_tensors='pt').to(0, torch.float16)
 output = model.generate(**inputs_video, max_new_tokens=100, do_sample=False)
 print(processor.decode(output[0][2:], skip_special_tokens=True))

     return np.stack([x.to_ndarray(format="rgb24") for x in frames])
+# define a chat history and use `apply_chat_template` to get correctly formatted prompt
 # Each value in "content" has to be a list of dicts with types ("text", "image", "video")
 conversation = [
     {
 image_file = "http://images.cocodataset.org/val2017/000000039769.jpg"
 raw_image = Image.open(requests.get(image_file, stream=True).raw)
+inputs_image = processor(text=prompt, images=raw_image, return_tensors='pt').to(0, torch.float16)
 output = model.generate(**inputs_video, max_new_tokens=100, do_sample=False)
 print(processor.decode(output[0][2:], skip_special_tokens=True))