added_tokens.json ADDED
@@ -0,0 +1,16 @@
+ {
+ "<|box_end|>": 151649,
+ "<|box_start|>": 151648,
+ "<|endoftext|>": 151643,
+ "<|im_end|>": 151645,
+ "<|im_start|>": 151644,
+ "<|image_pad|>": 151655,
+ "<|object_ref_end|>": 151647,
+ "<|object_ref_start|>": 151646,
+ "<|quad_end|>": 151651,
+ "<|quad_start|>": 151650,
+ "<|video_pad|>": 151656,
+ "<|vision_end|>": 151653,
+ "<|vision_pad|>": 151654,
+ "<|vision_start|>": 151652
+ }
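
The file above registers fourteen special tokens at fixed vocabulary IDs (151643-151656): the ChatML markers (`<|im_start|>`, `<|im_end|>`) plus the vision placeholders the chat template relies on. A minimal sketch of spot-checking those IDs after loading the tokenizer; the `Qwen/Qwen2-VL-7B-Instruct` repo id is an assumption, not something this diff names:

```python
# Sketch: verify the special-token IDs registered in added_tokens.json.
# The repo id is an assumption; substitute a local checkout if needed.
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("Qwen/Qwen2-VL-7B-Instruct")

# Each token string should map to the fixed ID listed in the diff above.
assert tok.convert_tokens_to_ids("<|image_pad|>") == 151655
assert tok.convert_tokens_to_ids("<|video_pad|>") == 151656
assert tok.convert_tokens_to_ids("<|vision_start|>") == 151652
assert tok.convert_tokens_to_ids("<|vision_end|>") == 151653
```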
chat_template.json CHANGED
@@ -1,3 +1,3 @@
  {
- "chat_template": "{% set image_count = namespace(value=0) %}{% set video_count = namespace(value=0) %}{% for message in messages %}{% if loop.first and message['role'] != 'system' %}<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n{% endif %}<|im_start|>{{ message['role'] }}\n{% if message['content'] is string %}{{ message['content'] }}<|im_end|>\n{% else %}{% for content in message['content'] %}{% if content['type'] == 'image' or 'image' in content or 'image_url' in content %}{% set image_count.value = image_count.value + 1 %}{% if add_vision_id %}Picture {{ image_count.value }}: {% endif %}<|vision_start|><|image_pad|><|vision_end|>{% elif content['type'] == 'video' or 'video' in content %}{% set video_count.value = video_count.value + 1 %}{% if add_vision_id %}Video {{ video_count.value }}: {% endif %}<|vision_start|><|video_pad|><|vision_end|>{% elif 'text' in content %}{{ content['text'] }}{% endif %}{% endfor %}<|im_end|>\n{% endif %}{% endfor %}{% if add_generation_prompt %}<|im_start|>assistant\n{% endif %}"
- }
  {
+ "chat_template": "{% set image_count = namespace(value=0) %}{% set video_count = namespace(value=0) %}{% for message in messages %}{% if loop.first and message['role'] != 'system' %}<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n{% endif %}<|im_start|>{{ message['role'] }}\n{% if message['content'] is string %}{{ message['content'] }}<|im_end|>\n{% else %}{% for content in message['content'] %}{% if content['type'] == 'image' or 'image' in content or 'image_url' in content %}{% set image_count.value = image_count.value + 1 %}{% if add_vision_id %}Picture {{ image_count.value }}: {% endif %}<|vision_start|><|image_pad|><|vision_end|>{% elif content['type'] == 'video' or 'video' in content %}{% set video_count.value = video_count.value + 1 %}{% if add_vision_id %}Video {{ video_count.value }}: {% endif %}<|vision_start|><|video_pad|><|vision_end|>{% elif 'text' in content %}{{ content['text'] }}{% endif %}{% endfor %}<|im_end|>\n{% endif %}{% endfor %}{% if add_generation_prompt %}<|im_start|>assistant\n{% endif %}"
+ }
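
The old and new template strings are identical as rendered here, so the change is likely whitespace or serialization in the JSON rather than template logic. Functionally, the template renders each image entry as `<|vision_start|><|image_pad|><|vision_end|>`, each video as `<|vision_start|><|video_pad|><|vision_end|>`, and injects a default system prompt when the conversation does not start with one. A minimal sketch of rendering it, again assuming the `Qwen/Qwen2-VL-7B-Instruct` repo id:

```python
# Sketch: render the chat template above for a one-image user message.
from transformers import AutoProcessor

processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-7B-Instruct")

messages = [
    {"role": "user", "content": [
        {"type": "image"},
        {"type": "text", "text": "Describe this image."},
    ]},
]

# add_generation_prompt=True appends "<|im_start|>assistant\n" so the model
# begins an assistant turn; the image entry becomes
# <|vision_start|><|image_pad|><|vision_end|> per the template.
prompt = processor.apply_chat_template(messages, add_generation_prompt=True)
print(prompt)
```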
config.json CHANGED
@@ -1,17 +1,14 @@
  {
  "architectures": [
  "Qwen2VLForConditionalGeneration"
  ],
  "attention_dropout": 0.0,
  "bos_token_id": 151643,
  "eos_token_id": 151645,
- "vision_start_token_id": 151652,
- "vision_end_token_id": 151653,
- "vision_token_id": 151654,
- "image_token_id": 151655,
- "video_token_id": 151656,
  "hidden_act": "silu",
  "hidden_size": 3584,
  "initializer_range": 0.02,
  "intermediate_size": 18944,
  "max_position_embeddings": 32768,
@@ -21,32 +18,29 @@
  "num_hidden_layers": 28,
  "num_key_value_heads": 4,
  "rms_norm_eps": 1e-06,
  "rope_theta": 1000000.0,
  "sliding_window": 32768,
  "tie_word_embeddings": false,
  "torch_dtype": "bfloat16",
- "transformers_version": "4.41.2",
- "use_cache": true,
  "use_sliding_window": false,
  "vision_config": {
- "depth": 32,
- "embed_dim": 1280,
- "mlp_ratio": 4,
- "num_heads": 16,
  "in_chans": 3,
- "hidden_size": 3584,
- "patch_size": 14,
- "spatial_merge_size": 2,
- "spatial_patch_size": 14,
- "temporal_patch_size": 2
- },
- "rope_scaling": {
- "type": "mrope",
- "mrope_section": [
- 16,
- 24,
- 24
- ]
  },
  "vocab_size": 152064
- }
  {
+ "_name_or_path": "/home/tom/fssd/Qwen2-VL-7B-Instruct",
  "architectures": [
  "Qwen2VLForConditionalGeneration"
  ],
  "attention_dropout": 0.0,
  "bos_token_id": 151643,
  "eos_token_id": 151645,
  "hidden_act": "silu",
  "hidden_size": 3584,
+ "image_token_id": 151655,
  "initializer_range": 0.02,
  "intermediate_size": 18944,
  "max_position_embeddings": 32768,
  "num_hidden_layers": 28,
  "num_key_value_heads": 4,
  "rms_norm_eps": 1e-06,
+ "rope_scaling": {
+ "mrope_section": [
+ 16,
+ 24,
+ 24
+ ],
+ "type": "mrope"
+ },
  "rope_theta": 1000000.0,
  "sliding_window": 32768,
  "tie_word_embeddings": false,
  "torch_dtype": "bfloat16",
+ "transformers_version": "4.45.0.dev0",
+ "use_cache": false,
  "use_sliding_window": false,
+ "video_token_id": 151656,
  "vision_config": {
  "in_chans": 3,
+ "model_type": "qwen2_vl",
+ "spatial_patch_size": 14
  },
+ "vision_end_token_id": 151653,
+ "vision_start_token_id": 151652,
+ "vision_token_id": 151654,
  "vocab_size": 152064
+ }
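
Net effect of the config diff: keys are re-sorted alphabetically, `use_cache` flips to false, the `transformers_version` stamp moves to 4.45.0.dev0, M-RoPE settings move into a nested `rope_scaling` block, and `vision_config` is slimmed to the fields that differ from the typed `qwen2_vl` sub-config; the dropped keys (`depth: 32`, `embed_dim: 1280`, etc.) appear to match that sub-config's defaults, so nothing should be lost on load. Note that the `mrope_section` values satisfy 16 + 24 + 24 = 64, half the 128-dim head (3584 hidden / 28 heads), splitting the rotary frequency pairs across temporal, height, and width position axes. A sketch of reading the new layout back, with the repo id again an assumption:

```python
# Sketch: read the reorganized config back; the repo id is an assumption.
from transformers import AutoConfig

cfg = AutoConfig.from_pretrained("Qwen/Qwen2-VL-7B-Instruct")

# 16 + 24 + 24 = 64 = head_dim // 2: the sections partition the rotary
# frequency pairs across temporal, height, and width position indices.
print(cfg.rope_scaling)              # e.g. {'mrope_section': [16, 24, 24], 'type': 'mrope'}
print(cfg.vision_config.model_type)  # "qwen2_vl"
```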
configuration.json ADDED
@@ -0,0 +1 @@
+ {"framework": "pytorch", "task": "image-text-to-text", "allow_remote": true}
generation_config.json CHANGED
@@ -1,14 +1,12 @@
  {
  "bos_token_id": 151643,
- "pad_token_id": 151643,
  "do_sample": true,
- "eos_token_id": [
- 151645,
- 151643
- ],
- "repetition_penalty": 1.0,
- "temperature": 0.01,
- "top_p": 0.001,
  "top_k": 1,
- "transformers_version": "4.37.0"
  }
  {
  "bos_token_id": 151643,
  "do_sample": true,
+ "eos_token_id": 151645,
+ "max_new_tokens": 2048,
+ "pad_token_id": 151643,
+ "repetition_penalty": 1.05,
+ "temperature": 0.1,
  "top_k": 1,
+ "top_p": 0.001,
+ "transformers_version": "4.45.0.dev0"
  }
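
The rewritten generation defaults are close to deterministic: `top_k: 1` keeps only the single most likely token per step, so decoding is effectively greedy even with `do_sample: true`, and `temperature: 0.1` with `top_p: 0.001` collapses the distribution further; `repetition_penalty: 1.05` and `max_new_tokens: 2048` round out the profile. The old list-valued `eos_token_id` becomes the single ID 151645 (`<|im_end|>`). A sketch of loading these defaults, repo id assumed as before:

```python
# Sketch: load the updated generation defaults; the repo id is an assumption.
from transformers import GenerationConfig

gen = GenerationConfig.from_pretrained("Qwen/Qwen2-VL-7B-Instruct")

# top_k=1 restricts sampling to the argmax token, so the run is
# effectively greedy even though do_sample is true.
print(gen.top_k, gen.temperature, gen.top_p)        # 1 0.1 0.001
print(gen.max_new_tokens, gen.repetition_penalty)   # 2048 1.05
```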
merges.txt CHANGED
@@ -1,3 +1,4 @@
  Ġ Ġ
  ĠĠ ĠĠ
  i n
+ #version: 0.2
  Ġ Ġ
  ĠĠ ĠĠ
  i n
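
The only change to merges.txt is the new `#version: 0.2` first line, the standard header for GPT-2 style BPE merge files; the lines after it are merge rules in priority order, with `Ġ` encoding a leading space. A sketch of consuming the file with the `tokenizers` library; the sibling `vocab.json` filename is an assumption:

```python
# Sketch: build a BPE model from the repo's vocab/merges pair.
# "vocab.json" as the sibling file name is an assumption.
from tokenizers import Tokenizer
from tokenizers.models import BPE

# The loader should recognize and skip the "#version: 0.2" header; the
# rules that follow are applied in file order (highest priority first).
tokenizer = Tokenizer(BPE.from_file("vocab.json", "merges.txt"))
```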
model.safetensors.index.json CHANGED
The diff for this file is too large to render. See raw diff