RaushanTurganbay (HF staff) committed
Commit
e8266a5
1 Parent(s): 2233e6c

Update README.md

Files changed (1):
  1. README.md +68 -10
README.md CHANGED
@@ -5,6 +5,9 @@ license_link: LICENSE
pipeline_tag: image-text-to-text
language:
- en
+ tags:
+ - vision
+ - image-text-to-text
---

# LLaVA Interleave Model Card
@@ -35,17 +38,31 @@ The model supports multi-image and multi-prompt generation. Meaning that you can

Below we used the [`"llava-hf/llava-interleave-qwen-0.5b-hf"`](https://huggingface.co/llava-hf/llava-interleave-qwen-0.5b-hf) checkpoint.

```python
- from transformers import pipeline
+ from transformers import AutoProcessor, pipeline
from PIL import Image
import requests

- model_id = "llava-hf/llava-interleave-qwen-0.5b-hf"
+ model_id = "llava-hf/llava-interleave-qwen-0.5b-dpo-hf"
pipe = pipeline("image-to-text", model=model_id)
+ processor = AutoProcessor.from_pretrained(model_id)

- url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/ai2d-demo.jpg"
+ url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/ai2d-demo.jpg"
image = Image.open(requests.get(url, stream=True).raw)
- prompt = "<|im_start|>user <image>\nWhat does the label 15 represent? (1) lava (2) core (3) tunnel (4) ash cloud<|im_end|><|im_start|>assistant"
+
+ # Define a chat history and use `apply_chat_template` to get the correctly formatted prompt
+ # Each value in "content" has to be a list of dicts with types ("text", "image")
+ conversation = [
+     {
+         "role": "user",
+         "content": [
+             {"type": "text", "text": "What does the label 15 represent? (1) lava (2) core (3) tunnel (4) ash cloud"},
+             {"type": "image"},
+         ],
+     },
+ ]
+ prompt = processor.apply_chat_template(conversation, add_generation_prompt=True)

outputs = pipe(image, prompt=prompt, generate_kwargs={"max_new_tokens": 200})
print(outputs)
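
For convenience, the pipeline output can be post-processed so that only the assistant's reply is kept. The snippet below is a minimal sketch rather than part of the model card: the `extract_answer` helper is hypothetical, and it assumes the standard image-to-text output format, a list of dicts whose `generated_text` still echoes the chat-formatted prompt before the answer.

```python
# Hypothetical helper (not from the model card) to pull out the assistant's reply.
# Assumes the usual image-to-text output: [{"generated_text": "..."}], where the
# generated text still contains the chat-formatted prompt before the answer.
def extract_answer(outputs):
    text = outputs[0]["generated_text"]
    # Keep only what follows the final "assistant" turn marker.
    return text.split("assistant")[-1].strip()

print(extract_answer(outputs))
```
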
@@ -62,11 +79,7 @@ from PIL import Image
import torch
from transformers import AutoProcessor, LlavaForConditionalGeneration

- model_id = "llava-hf/llava-interleave-qwen-0.5b-hf"
-
- prompt = "<|im_start|>user <image>\nWhat are these?|im_end|><|im_start|>assistant"
- image_file = "http://images.cocodataset.org/val2017/000000039769.jpg"
-
+ model_id = "llava-hf/llava-interleave-qwen-0.5b-dpo-hf"
model = LlavaForConditionalGeneration.from_pretrained(
    model_id,
    torch_dtype=torch.float16,
@@ -75,7 +88,21 @@ model = LlavaForConditionalGeneration.from_pretrained(

processor = AutoProcessor.from_pretrained(model_id)

+ # Define a chat history and use `apply_chat_template` to get the correctly formatted prompt
+ # Each value in "content" has to be a list of dicts with types ("text", "image")
+ conversation = [
+     {
+         "role": "user",
+         "content": [
+             {"type": "text", "text": "What are these?"},
+             {"type": "image"},
+         ],
+     },
+ ]
+ prompt = processor.apply_chat_template(conversation, add_generation_prompt=True)

+ image_file = "http://images.cocodataset.org/val2017/000000039769.jpg"
raw_image = Image.open(requests.get(image_file, stream=True).raw)
inputs = processor(prompt, raw_image, return_tensors='pt').to(0, torch.float16)

@@ -83,24 +110,55 @@ output = model.generate(**inputs, max_new_tokens=200, do_sample=False)
print(processor.decode(output[0][2:], skip_special_tokens=True))
```
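
The hunk above closes the single-image example. Since the card states that the model supports multi-image and multi-prompt generation, a batched call can be sketched as below. This is illustrative and not part of the commit: it reuses the `model` and `processor` objects loaded above, the two image URLs are the same COCO and AI2D images used elsewhere in this card, and it assumes the processor forwards `padding=True` to its tokenizer so that the two prompts can share one batch.

```python
# Illustrative batched call: two chat-formatted prompts, one image each.
# Reuses `model` and `processor` from the snippet above.
import requests
import torch
from PIL import Image

urls = [
    "http://images.cocodataset.org/val2017/000000039769.jpg",
    "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/ai2d-demo.jpg",
]
images = [Image.open(requests.get(u, stream=True).raw) for u in urls]

questions = ["What are these?", "What does the label 15 represent?"]
prompts = []
for question in questions:
    conversation = [
        {
            "role": "user",
            "content": [
                {"type": "text", "text": question},
                {"type": "image"},
            ],
        },
    ]
    prompts.append(processor.apply_chat_template(conversation, add_generation_prompt=True))

# `padding=True` is assumed to be forwarded to the tokenizer so both prompts can
# be batched; images are matched to prompts in order.
inputs = processor(prompts, images, padding=True, return_tensors="pt").to(0, torch.float16)
output = model.generate(**inputs, max_new_tokens=200, do_sample=False)
print(processor.batch_decode(output, skip_special_tokens=True))
```
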

- When prompting with videos/3D data/multi-view data, prompt like following:
+ When prompting with videos/3D/multi-view input, prompt like the following:

```python
# if you downsampled n frames from the input
image_tokens = "<image>" * n
prompt = f"<|im_start|>user {image_tokens}\nWhat are these?<|im_end|><|im_start|>assistant"
+
+ # With the chat template: if you sampled n frames, include n image entries in the conversation turn (5 shown here)
+ conversation = [
+     {
+         "role": "user",
+         "content": [
+             {"type": "text", "text": "What are these?"},
+             {"type": "image"},
+             {"type": "image"},
+             {"type": "image"},
+             {"type": "image"},
+             {"type": "image"},
+         ],
+     },
+ ]
+ prompt = processor.apply_chat_template(conversation, add_generation_prompt=True)
```
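
To make the "downsampled n frames" step concrete, one possible implementation with OpenCV is sketched below. It is not part of the model card: `sample_frames` and the `video.mp4` path are placeholders, uniform sampling is only one possible strategy, and the snippet reuses the `model` and `processor` objects from the example above.

```python
# Frame-sampling sketch: uniformly pick n frames from a video and feed them to
# the processor as a list of images (one <image> placeholder per frame).
import cv2
import torch
from PIL import Image

def sample_frames(video_path, n):
    cap = cv2.VideoCapture(video_path)
    total = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
    indices = [int(i * total / n) for i in range(n)]
    frames = []
    for idx in indices:
        cap.set(cv2.CAP_PROP_POS_FRAMES, idx)
        ok, frame = cap.read()
        if ok:
            # OpenCV returns BGR arrays; convert to RGB PIL images.
            frames.append(Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)))
    cap.release()
    return frames

n = 5
frames = sample_frames("video.mp4", n)  # placeholder path
conversation = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "What are these?"}] + [{"type": "image"}] * n,
    },
]
prompt = processor.apply_chat_template(conversation, add_generation_prompt=True)
inputs = processor(prompt, frames, return_tensors="pt").to(0, torch.float16)
output = model.generate(**inputs, max_new_tokens=200, do_sample=False)
print(processor.decode(output[0][2:], skip_special_tokens=True))
```
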

When prompting with interleaved images and videos, prompt like the following:

```python
# two interleaved images
- prompt = "<|im_start|>user <image><image>\nWhat are these?|im_end|><|im_start|>assistant"
+ prompt = "<|im_start|>user <image><image>\nWhat is the difference between these two images?<|im_end|><|im_start|>assistant"

# two interleaved videos, if you downsampled n frames in total from both videos
image_tokens = "<image>" * n
prompt = f"<|im_start|>user {image_tokens}\nWhat are these?<|im_end|><|im_start|>assistant"
+
+ # The chat template works the same for interleaved inputs: just pass as many image entries as the prompt needs
+ conversation = [
+     {
+         "role": "user",
+         "content": [
+             {"type": "text", "text": "What is the difference between these two images?"},
+             {"type": "image"},
+             {"type": "image"},
+         ],
+     },
+ ]
+ prompt = processor.apply_chat_template(conversation, add_generation_prompt=True)
```
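
An end-to-end run of the two-image case could look like the sketch below. It is again illustrative rather than part of the commit: the COCO and AI2D URLs are reused from earlier snippets purely as stand-ins, and `model`/`processor` are the objects loaded above.

```python
# Two interleaved images, end to end: build the chat-formatted prompt with two
# image placeholders, then pass both PIL images to the processor in order.
import requests
import torch
from PIL import Image

image_1 = Image.open(requests.get("http://images.cocodataset.org/val2017/000000039769.jpg", stream=True).raw)
image_2 = Image.open(requests.get("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/ai2d-demo.jpg", stream=True).raw)

conversation = [
    {
        "role": "user",
        "content": [
            {"type": "text", "text": "What is the difference between these two images?"},
            {"type": "image"},
            {"type": "image"},
        ],
    },
]
prompt = processor.apply_chat_template(conversation, add_generation_prompt=True)

inputs = processor(prompt, [image_1, image_2], return_tensors="pt").to(0, torch.float16)
output = model.generate(**inputs, max_new_tokens=200, do_sample=False)
print(processor.decode(output[0][2:], skip_special_tokens=True))
```
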
 
  ### Model optimization
 