shirzady1934 committed on
Commit
8aa42ae
1 Parent(s): bd0b7ee

Upload tokenizer

Browse files
added_tokens.json CHANGED
@@ -1,3 +1,7 @@
1
  {
2
- "<|endoftext|>": 6881
 
 
 
 
3
  }
 
1
  {
2
+ "</s>": 6882,
3
+ "<mask>": 6885,
4
+ "<pad>": 6884,
5
+ "<s>": 6881,
6
+ "<unk>": 6883
7
  }
special_tokens_map.json CHANGED
@@ -1,7 +1,7 @@
1
  {
2
- "bos_token": "<|endoftext|>",
3
  "cls_token": "[CLS]",
4
- "eos_token": "<|endoftext|>",
5
  "mask_token": "[MASK]",
6
  "pad_token": "[PAD]",
7
  "sep_token": "[SEP]",
 
1
  {
2
+ "bos_token": "<s>",
3
  "cls_token": "[CLS]",
4
+ "eos_token": "</s>",
5
  "mask_token": "[MASK]",
6
  "pad_token": "[PAD]",
7
  "sep_token": "[SEP]",
tokenizer.json CHANGED
@@ -50,12 +50,48 @@
50
  },
51
  {
52
  "id": 6881,
53
- "content": "<|endoftext|>",
54
  "single_word": false,
55
  "lstrip": false,
56
  "rstrip": false,
57
  "normalized": false,
58
  "special": true
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
59
  }
60
  ],
61
  "normalizer": null,
 
50
  },
51
  {
52
  "id": 6881,
53
+ "content": "<s>",
54
  "single_word": false,
55
  "lstrip": false,
56
  "rstrip": false,
57
  "normalized": false,
58
  "special": true
59
+ },
60
+ {
61
+ "id": 6882,
62
+ "content": "</s>",
63
+ "single_word": false,
64
+ "lstrip": false,
65
+ "rstrip": false,
66
+ "normalized": false,
67
+ "special": true
68
+ },
69
+ {
70
+ "id": 6883,
71
+ "content": "<unk>",
72
+ "single_word": false,
73
+ "lstrip": false,
74
+ "rstrip": false,
75
+ "normalized": false,
76
+ "special": true
77
+ },
78
+ {
79
+ "id": 6884,
80
+ "content": "<pad>",
81
+ "single_word": false,
82
+ "lstrip": false,
83
+ "rstrip": false,
84
+ "normalized": false,
85
+ "special": true
86
+ },
87
+ {
88
+ "id": 6885,
89
+ "content": "<mask>",
90
+ "single_word": false,
91
+ "lstrip": true,
92
+ "rstrip": false,
93
+ "normalized": false,
94
+ "special": true
95
  }
96
  ],
97
  "normalizer": null,
tokenizer_config.json CHANGED
@@ -1,11 +1,15 @@
1
  {
2
  "add_prefix_space": false,
3
- "bos_token": "<|endoftext|>",
4
  "clean_up_tokenization_spaces": true,
5
- "eos_token": "<|endoftext|>",
6
- "max_length": 100,
 
 
7
  "model_max_length": 1000000000000000019884624838656,
8
- "padding_side": "right",
9
- "tokenizer_class": "GPT2Tokenizer",
10
- "unk_token": "<|endoftext|>"
 
 
11
  }
 
1
  {
2
  "add_prefix_space": false,
3
+ "bos_token": "<s>",
4
  "clean_up_tokenization_spaces": true,
5
+ "cls_token": "<s>",
6
+ "eos_token": "</s>",
7
+ "errors": "replace",
8
+ "mask_token": "<mask>",
9
  "model_max_length": 1000000000000000019884624838656,
10
+ "pad_token": "<pad>",
11
+ "sep_token": "</s>",
12
+ "tokenizer_class": "RobertaTokenizer",
13
+ "trim_offsets": true,
14
+ "unk_token": "<unk>"
15
  }