RaushanTurganbay (HF staff) committed
Commit
e8266a5
1 Parent(s): 2233e6c

Update README.md

Files changed (1):
  1. README.md +68 -10
README.md CHANGED
@@ -5,6 +5,9 @@ license_link: LICENSE
pipeline_tag: image-text-to-text
language:
- en
+ tags:
+ - vision
+ - image-text-to-text
---

# LLaVA Interleave Model Card
@@ -35,17 +38,31 @@ The model supports multi-image and multi-prompt generation. Meaning that you can

Below we used the [`"llava-hf/llava-interleave-qwen-0.5b-hf"`](https://huggingface.co/llava-hf/llava-interleave-qwen-0.5b-hf) checkpoint.

```python
- from transformers import pipeline
+ from transformers import AutoProcessor, pipeline
from PIL import Image
import requests

- model_id = "llava-hf/llava-interleave-qwen-0.5b-hf"
+ model_id = "llava-hf/llava-interleave-qwen-0.5b-dpo-hf"
pipe = pipeline("image-to-text", model=model_id)
+ processor = AutoProcessor.from_pretrained(model_id)

- url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/ai2d-demo.jpg"
+ url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/ai2d-demo.jpg"
image = Image.open(requests.get(url, stream=True).raw)
- prompt = "<|im_start|>user <image>\nWhat does the label 15 represent? (1) lava (2) core (3) tunnel (4) ash cloud<|im_end|><|im_start|>assistant"
+
+ # Define a chat history and use `apply_chat_template` to get the correctly formatted prompt
+ # Each value in "content" has to be a list of dicts with types ("text", "image")
+ conversation = [
+     {
+         "role": "user",
+         "content": [
+             {"type": "text", "text": "What does the label 15 represent? (1) lava (2) core (3) tunnel (4) ash cloud"},
+             {"type": "image"},
+         ],
+     },
+ ]
+ prompt = processor.apply_chat_template(conversation, add_generation_prompt=True)

outputs = pipe(image, prompt=prompt, generate_kwargs={"max_new_tokens": 200})
print(outputs)
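
For convenience, the pipeline output can be post-processed so that only the assistant's reply is kept. The snippet below is a minimal sketch rather than part of the model card: the `extract_answer` helper is hypothetical, and it assumes the standard image-to-text output format, a list of dicts whose `generated_text` still echoes the chat-formatted prompt before the answer.

```python
# Hypothetical helper (not from the model card) to pull out the assistant's reply.
# Assumes the usual image-to-text output: [{"generated_text": "..."}], where the
# generated text still contains the chat-formatted prompt before the answer.
def extract_answer(outputs):
    text = outputs[0]["generated_text"]
    # Keep only what follows the final "assistant" turn marker.
    return text.split("assistant")[-1].strip()

print(extract_answer(outputs))
```
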
@@ -62,11 +79,7 @@ from PIL import Image
import torch
from transformers import AutoProcessor, LlavaForConditionalGeneration

- model_id = "llava-hf/llava-interleave-qwen-0.5b-hf"
-
- prompt = "<|im_start|>user <image>\nWhat are these?|im_end|><|im_start|>assistant"
- image_file = "http://images.cocodataset.org/val2017/000000039769.jpg"
-
+ model_id = "llava-hf/llava-interleave-qwen-0.5b-dpo-hf"
model = LlavaForConditionalGeneration.from_pretrained(
    model_id,
    torch_dtype=torch.float16,
@@ -75,7 +88,21 @@ model = LlavaForConditionalGeneration.from_pretrained(

processor = AutoProcessor.from_pretrained(model_id)

+ # Define a chat history and use `apply_chat_template` to get the correctly formatted prompt
+ # Each value in "content" has to be a list of dicts with types ("text", "image")
+ conversation = [
+     {
+         "role": "user",
+         "content": [
+             {"type": "text", "text": "What are these?"},
+             {"type": "image"},
+         ],
+     },
+ ]
+ prompt = processor.apply_chat_template(conversation, add_generation_prompt=True)

+ image_file = "http://images.cocodataset.org/val2017/000000039769.jpg"
raw_image = Image.open(requests.get(image_file, stream=True).raw)
inputs = processor(prompt, raw_image, return_tensors='pt').to(0, torch.float16)

@@ -83,24 +110,55 @@ output = model.generate(**inputs, max_new_tokens=200, do_sample=False)
print(processor.decode(output[0][2:], skip_special_tokens=True))
```
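
The hunk above closes the single-image example. Since the card states that the model supports multi-image and multi-prompt generation, a batched call can be sketched as below. This is illustrative and not part of the commit: it reuses the `model` and `processor` objects loaded above, the two image URLs are the same COCO and AI2D images used elsewhere in this card, and it assumes the processor forwards `padding=True` to its tokenizer so that the two prompts can share one batch.

```python
# Illustrative batched call: two chat-formatted prompts, one image each.
# Reuses `model` and `processor` from the snippet above.
import requests
import torch
from PIL import Image

urls = [
    "http://images.cocodataset.org/val2017/000000039769.jpg",
    "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/ai2d-demo.jpg",
]
images = [Image.open(requests.get(u, stream=True).raw) for u in urls]

questions = ["What are these?", "What does the label 15 represent?"]
prompts = []
for question in questions:
    conversation = [
        {
            "role": "user",
            "content": [
                {"type": "text", "text": question},
                {"type": "image"},
            ],
        },
    ]
    prompts.append(processor.apply_chat_template(conversation, add_generation_prompt=True))

# `padding=True` is assumed to be forwarded to the tokenizer so both prompts can
# be batched; images are matched to prompts in order.
inputs = processor(prompts, images, padding=True, return_tensors="pt").to(0, torch.float16)
output = model.generate(**inputs, max_new_tokens=200, do_sample=False)
print(processor.batch_decode(output, skip_special_tokens=True))
```
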

- When prompting with videos/3D data/multi-view data, prompt like following:
+ When prompting with videos/3D/multi-view input, prompt like the following:

```python
# if you downsampled n frames from the input
image_tokens = "<image>" * n
prompt = f"<|im_start|>user {image_tokens}\nWhat are these?<|im_end|><|im_start|>assistant"
+
+ # With the chat template: if you sampled n frames, include n image entries in the conversation turn (5 shown here)
+ conversation = [
+     {
+         "role": "user",
+         "content": [
+             {"type": "text", "text": "What are these?"},
+             {"type": "image"},
+             {"type": "image"},
+             {"type": "image"},
+             {"type": "image"},
+             {"type": "image"},
+         ],
+     },
+ ]
+ prompt = processor.apply_chat_template(conversation, add_generation_prompt=True)
```
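
To make the "downsampled n frames" step concrete, one possible implementation with OpenCV is sketched below. It is not part of the model card: `sample_frames` and the `video.mp4` path are placeholders, uniform sampling is only one possible strategy, and the snippet reuses the `model` and `processor` objects from the example above.

```python
# Frame-sampling sketch: uniformly pick n frames from a video and feed them to
# the processor as a list of images (one <image> placeholder per frame).
import cv2
import torch
from PIL import Image

def sample_frames(video_path, n):
    cap = cv2.VideoCapture(video_path)
    total = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
    indices = [int(i * total / n) for i in range(n)]
    frames = []
    for idx in indices:
        cap.set(cv2.CAP_PROP_POS_FRAMES, idx)
        ok, frame = cap.read()
        if ok:
            # OpenCV returns BGR arrays; convert to RGB PIL images.
            frames.append(Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)))
    cap.release()
    return frames

n = 5
frames = sample_frames("video.mp4", n)  # placeholder path
conversation = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "What are these?"}] + [{"type": "image"}] * n,
    },
]
prompt = processor.apply_chat_template(conversation, add_generation_prompt=True)
inputs = processor(prompt, frames, return_tensors="pt").to(0, torch.float16)
output = model.generate(**inputs, max_new_tokens=200, do_sample=False)
print(processor.decode(output[0][2:], skip_special_tokens=True))
```
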

When prompting with interleaved images and videos, prompt like the following:

```python
# two interleaved images
- prompt = "<|im_start|>user <image><image>\nWhat are these?|im_end|><|im_start|>assistant"
+ prompt = "<|im_start|>user <image><image>\nWhat is the difference between these two images?<|im_end|><|im_start|>assistant"

# two interleaved videos, if you downsampled n frames in total from both videos
image_tokens = "<image>" * n
prompt = f"<|im_start|>user {image_tokens}\nWhat are these?<|im_end|><|im_start|>assistant"
+
+ # The chat template works the same for interleaved inputs: just pass as many image entries as the prompt needs
+ conversation = [
+     {
+         "role": "user",
+         "content": [
+             {"type": "text", "text": "What is the difference between these two images?"},
+             {"type": "image"},
+             {"type": "image"},
+         ],
+     },
+ ]
+ prompt = processor.apply_chat_template(conversation, add_generation_prompt=True)
```
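
An end-to-end run of the two-image case could look like the sketch below. It is again illustrative rather than part of the commit: the COCO and AI2D URLs are reused from earlier snippets purely as stand-ins, and `model`/`processor` are the objects loaded above.

```python
# Two interleaved images, end to end: build the chat-formatted prompt with two
# image placeholders, then pass both PIL images to the processor in order.
import requests
import torch
from PIL import Image

image_1 = Image.open(requests.get("http://images.cocodataset.org/val2017/000000039769.jpg", stream=True).raw)
image_2 = Image.open(requests.get("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/ai2d-demo.jpg", stream=True).raw)

conversation = [
    {
        "role": "user",
        "content": [
            {"type": "text", "text": "What is the difference between these two images?"},
            {"type": "image"},
            {"type": "image"},
        ],
    },
]
prompt = processor.apply_chat_template(conversation, add_generation_prompt=True)

inputs = processor(prompt, [image_1, image_2], return_tensors="pt").to(0, torch.float16)
output = model.generate(**inputs, max_new_tokens=200, do_sample=False)
print(processor.decode(output[0][2:], skip_special_tokens=True))
```
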
 
  ### Model optimization
 