Antonio88 committed on
Commit
b737943
1 Parent(s): 5e07f71

Upload tokenizer

Browse files
Files changed (3) hide show
  1. README.md +2 -3
  2. tokenizer.json +1 -1
  3. tokenizer_config.json +8 -27
README.md CHANGED
@@ -2,12 +2,11 @@
2
  language:
3
  - en
4
  license: apache-2.0
 
 
5
  base_model: mistralai/Mistral-7B-Instruct-v0.2
6
-
7
  datasets:
8
  - Antonio88/TaliStran-DataSet
9
- tags:
10
- - Conversational
11
  ---
12
 
13
 
 
2
  language:
3
  - en
4
  license: apache-2.0
5
+ tags:
6
+ - Conversational
7
  base_model: mistralai/Mistral-7B-Instruct-v0.2
 
8
  datasets:
9
  - Antonio88/TaliStran-DataSet
 
 
10
  ---
11
 
12
 
tokenizer.json CHANGED
@@ -2,7 +2,7 @@
2
  "version": "1.0",
3
  "truncation": {
4
  "direction": "Right",
5
- "max_length": 256,
6
  "strategy": "LongestFirst",
7
  "stride": 0
8
  },
 
2
  "version": "1.0",
3
  "truncation": {
4
  "direction": "Right",
5
+ "max_length": 512,
6
  "strategy": "LongestFirst",
7
  "stride": 0
8
  },
tokenizer_config.json CHANGED
@@ -4,58 +4,39 @@
4
  "added_tokens_decoder": {
5
  "0": {
6
  "content": "<unk>",
7
- "lstrip": true,
8
  "normalized": false,
9
- "rstrip": true,
10
  "single_word": false,
11
  "special": true
12
  },
13
  "1": {
14
  "content": "<s>",
15
- "lstrip": true,
16
- "normalized": false,
17
- "rstrip": true,
18
- "single_word": false,
19
- "special": true
20
- },
21
- "2": {
22
- "content": "</s>",
23
  "lstrip": false,
24
  "normalized": false,
25
  "rstrip": false,
26
  "single_word": false,
27
  "special": true
28
  },
29
- "32000": {
30
- "content": "<|im_end|>",
31
  "lstrip": false,
32
  "normalized": false,
33
  "rstrip": false,
34
  "single_word": false,
35
  "special": true
36
- },
37
- "32001": {
38
- "content": "<|im_start|>",
39
- "lstrip": true,
40
- "normalized": false,
41
- "rstrip": true,
42
- "single_word": false,
43
- "special": true
44
  }
45
  },
46
  "additional_special_tokens": [],
47
  "bos_token": "<s>",
48
- "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}",
49
  "clean_up_tokenization_spaces": false,
50
- "eos_token": "<|im_end|>",
51
  "legacy": true,
52
  "model_max_length": 1000000000000000019884624838656,
53
- "pad_token": null,
54
  "sp_model_kwargs": {},
55
  "spaces_between_special_tokens": false,
56
  "tokenizer_class": "LlamaTokenizer",
57
- "trust_remote_code": false,
58
  "unk_token": "<unk>",
59
- "use_default_system_prompt": true,
60
- "use_fast": true
61
- }
 
4
  "added_tokens_decoder": {
5
  "0": {
6
  "content": "<unk>",
7
+ "lstrip": false,
8
  "normalized": false,
9
+ "rstrip": false,
10
  "single_word": false,
11
  "special": true
12
  },
13
  "1": {
14
  "content": "<s>",
 
 
 
 
 
 
 
 
15
  "lstrip": false,
16
  "normalized": false,
17
  "rstrip": false,
18
  "single_word": false,
19
  "special": true
20
  },
21
+ "2": {
22
+ "content": "</s>",
23
  "lstrip": false,
24
  "normalized": false,
25
  "rstrip": false,
26
  "single_word": false,
27
  "special": true
 
 
 
 
 
 
 
 
28
  }
29
  },
30
  "additional_special_tokens": [],
31
  "bos_token": "<s>",
 
32
  "clean_up_tokenization_spaces": false,
33
+ "eos_token": "</s>",
34
  "legacy": true,
35
  "model_max_length": 1000000000000000019884624838656,
36
+ "pad_token": "</s>",
37
  "sp_model_kwargs": {},
38
  "spaces_between_special_tokens": false,
39
  "tokenizer_class": "LlamaTokenizer",
 
40
  "unk_token": "<unk>",
41
+ "use_default_system_prompt": false
42
+ }