RaushanTurganbay committed
Commit 7d99e39
1 Parent(s): 24dbcca

Add chat template examples

Files changed (1)
  1. README.md +32 -10
README.md CHANGED
@@ -4,6 +4,9 @@ language:
 pipeline_tag: image-to-text
 inference: false
 arxiv: 2304.08485
+tags:
+- vision
+- image-text-to-text
 ---
 # VipLLaVA Model Card
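Applying this hunk leaves the card's front matter reading as below; the `language:` entries above the shown context are unchanged and elided here:

```yaml
language:
# ... (unchanged, not shown in the diff)
pipeline_tag: image-to-text
inference: false
arxiv: 2304.08485
tags:
- vision
- image-text-to-text
---
```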
 
@@ -43,8 +46,8 @@ A chat between a curious human and an artificial intelligence assistant. The ass
 
 Where `<prompt>` denotes the prompt asked by the user
 
-### Using `pipeline`:
 
+### Using `pipeline`:
 
 ```python
 from transformers import pipeline
@@ -54,10 +57,22 @@ import requests
 model_id = "llava-hf/vip-llava-13b-hf"
 pipe = pipeline("image-to-text", model=model_id)
 url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/ai2d-demo.jpg"
-
 image = Image.open(requests.get(url, stream=True).raw)
-question = "What does the label 15 represent? (1) lava (2) core (3) tunnel (4) ash cloud"
-prompt = f"A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.###Human: <image>\n{question}###Assistant:"
+
+# Define a chat history and use `apply_chat_template` to get the correctly formatted prompt
+# Each value in "content" has to be a list of dicts with types ("text", "image")
+conversation = [
+    {
+        "role": "user",
+        "content": [
+            {"type": "text", "text": "What does the label 15 represent? (1) lava (2) core (3) tunnel (4) ash cloud"},
+            {"type": "image"},
+        ],
+    },
+]
+from transformers import AutoProcessor  # the processor supplies the chat template
+processor = AutoProcessor.from_pretrained(model_id)
+prompt = processor.apply_chat_template(conversation, add_generation_prompt=True)
 
 outputs = pipe(image, prompt=prompt, generate_kwargs={"max_new_tokens": 200})
 print(outputs)
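Since this hunk swaps a hard-coded prompt string for `apply_chat_template`, one way to sanity-check the change is to render the template and compare it against the removed format. A minimal sketch, assuming the checkpoint ships VipLLaVA's chat template:

```python
from transformers import AutoProcessor

processor = AutoProcessor.from_pretrained("llava-hf/vip-llava-13b-hf")

conversation = [
    {
        "role": "user",
        "content": [
            {"type": "text", "text": "What does the label 15 represent? (1) lava (2) core (3) tunnel (4) ash cloud"},
            {"type": "image"},
        ],
    },
]

# With add_generation_prompt=True the rendered string should end with the assistant
# turn open, mirroring the hard-coded format this commit removes:
# "A chat between a curious human and an artificial intelligence assistant. [...]###Human: <image>\n<question>###Assistant:"
prompt = processor.apply_chat_template(conversation, add_generation_prompt=True)
print(prompt)
```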
@@ -75,12 +90,6 @@ import torch
 from transformers import AutoProcessor, VipLlavaForConditionalGeneration
 
 model_id = "llava-hf/vip-llava-13b-hf"
-
-question = "What are these?"
-prompt = f"A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.###Human: <image>\n{question}###Assistant:"
-
-image_file = "http://images.cocodataset.org/val2017/000000039769.jpg"
-
 model = VipLlavaForConditionalGeneration.from_pretrained(
     model_id,
     torch_dtype=torch.float16,
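The context here cuts off inside the `from_pretrained` call. For reference, a complete load consistent with the snippet's later `.to(0, torch.float16)` would look roughly like the sketch below; `low_cpu_mem_usage=True` and the `.to(0)` placement are assumptions, since the diff does not show how the call closes:

```python
import torch
from transformers import VipLlavaForConditionalGeneration

model_id = "llava-hf/vip-llava-13b-hf"
model = VipLlavaForConditionalGeneration.from_pretrained(
    model_id,
    torch_dtype=torch.float16,
    low_cpu_mem_usage=True,  # assumed flag; the hunk shows only the first two arguments
).to(0)  # device 0 matches the `.to(0, torch.float16)` applied to the inputs below
```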
@@ -89,7 +98,20 @@ model = VipLlavaForConditionalGeneration.from_pretrained(
 
 processor = AutoProcessor.from_pretrained(model_id)
 
+# Define a chat history and use `apply_chat_template` to get the correctly formatted prompt
+# Each value in "content" has to be a list of dicts with types ("text", "image")
+conversation = [
+    {
+        "role": "user",
+        "content": [
+            {"type": "text", "text": "What are these?"},
+            {"type": "image"},
+        ],
+    },
+]
+prompt = processor.apply_chat_template(conversation, add_generation_prompt=True)
 
+image_file = "http://images.cocodataset.org/val2017/000000039769.jpg"
 raw_image = Image.open(requests.get(image_file, stream=True).raw)
 inputs = processor(prompt, raw_image, return_tensors='pt').to(0, torch.float16)
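The changed range ends at the processor call, so generation itself is not shown. A minimal sketch of the step that presumably follows, using the standard `generate`/`decode` API (`max_new_tokens=200` mirrors the pipeline example above):

```python
# Hypothetical continuation, not part of this diff
output = model.generate(**inputs, max_new_tokens=200, do_sample=False)
print(processor.decode(output[0], skip_special_tokens=True))
```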
117