Exqrch committed on
Commit
ebf742c
1 Parent(s): 8ab71f7

Upload tokenizer

Browse files
Files changed (4) hide show
  1. special_tokens_map.json +7 -0
  2. tokenizer.json +450 -0
  3. tokenizer_config.json +13 -0
  4. vocab.txt +300 -0
special_tokens_map.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "cls_token": "[CLS]",
3
+ "mask_token": "[MASK]",
4
+ "pad_token": "[PAD]",
5
+ "sep_token": "[SEP]",
6
+ "unk_token": "[UNK]"
7
+ }
tokenizer.json ADDED
@@ -0,0 +1,450 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "version": "1.0",
3
+ "truncation": null,
4
+ "padding": null,
5
+ "added_tokens": [
6
+ {
7
+ "id": 0,
8
+ "content": "[PAD]",
9
+ "single_word": false,
10
+ "lstrip": false,
11
+ "rstrip": false,
12
+ "normalized": false,
13
+ "special": true
14
+ },
15
+ {
16
+ "id": 1,
17
+ "content": "[UNK]",
18
+ "single_word": false,
19
+ "lstrip": false,
20
+ "rstrip": false,
21
+ "normalized": false,
22
+ "special": true
23
+ },
24
+ {
25
+ "id": 2,
26
+ "content": "[CLS]",
27
+ "single_word": false,
28
+ "lstrip": false,
29
+ "rstrip": false,
30
+ "normalized": false,
31
+ "special": true
32
+ },
33
+ {
34
+ "id": 3,
35
+ "content": "[SEP]",
36
+ "single_word": false,
37
+ "lstrip": false,
38
+ "rstrip": false,
39
+ "normalized": false,
40
+ "special": true
41
+ },
42
+ {
43
+ "id": 4,
44
+ "content": "[MASK]",
45
+ "single_word": false,
46
+ "lstrip": false,
47
+ "rstrip": false,
48
+ "normalized": false,
49
+ "special": true
50
+ }
51
+ ],
52
+ "normalizer": {
53
+ "type": "BertNormalizer",
54
+ "clean_text": true,
55
+ "handle_chinese_chars": true,
56
+ "strip_accents": null,
57
+ "lowercase": true
58
+ },
59
+ "pre_tokenizer": {
60
+ "type": "BertPreTokenizer"
61
+ },
62
+ "post_processor": {
63
+ "type": "TemplateProcessing",
64
+ "single": [
65
+ {
66
+ "SpecialToken": {
67
+ "id": "[CLS]",
68
+ "type_id": 0
69
+ }
70
+ },
71
+ {
72
+ "Sequence": {
73
+ "id": "A",
74
+ "type_id": 0
75
+ }
76
+ },
77
+ {
78
+ "SpecialToken": {
79
+ "id": "[SEP]",
80
+ "type_id": 0
81
+ }
82
+ }
83
+ ],
84
+ "pair": [
85
+ {
86
+ "SpecialToken": {
87
+ "id": "[CLS]",
88
+ "type_id": 0
89
+ }
90
+ },
91
+ {
92
+ "Sequence": {
93
+ "id": "A",
94
+ "type_id": 0
95
+ }
96
+ },
97
+ {
98
+ "SpecialToken": {
99
+ "id": "[SEP]",
100
+ "type_id": 0
101
+ }
102
+ },
103
+ {
104
+ "Sequence": {
105
+ "id": "B",
106
+ "type_id": 1
107
+ }
108
+ },
109
+ {
110
+ "SpecialToken": {
111
+ "id": "[SEP]",
112
+ "type_id": 1
113
+ }
114
+ }
115
+ ],
116
+ "special_tokens": {
117
+ "[CLS]": {
118
+ "id": "[CLS]",
119
+ "ids": [
120
+ 2
121
+ ],
122
+ "tokens": [
123
+ "[CLS]"
124
+ ]
125
+ },
126
+ "[SEP]": {
127
+ "id": "[SEP]",
128
+ "ids": [
129
+ 3
130
+ ],
131
+ "tokens": [
132
+ "[SEP]"
133
+ ]
134
+ }
135
+ }
136
+ },
137
+ "decoder": {
138
+ "type": "WordPiece",
139
+ "prefix": "##",
140
+ "cleanup": true
141
+ },
142
+ "model": {
143
+ "type": "WordPiece",
144
+ "unk_token": "[UNK]",
145
+ "continuing_subword_prefix": "##",
146
+ "max_input_chars_per_word": 100,
147
+ "vocab": {
148
+ "[PAD]": 0,
149
+ "[UNK]": 1,
150
+ "[CLS]": 2,
151
+ "[SEP]": 3,
152
+ "[MASK]": 4,
153
+ "!": 5,
154
+ ",": 6,
155
+ ".": 7,
156
+ "?": 8,
157
+ "a": 9,
158
+ "b": 10,
159
+ "c": 11,
160
+ "d": 12,
161
+ "e": 13,
162
+ "f": 14,
163
+ "g": 15,
164
+ "h": 16,
165
+ "i": 17,
166
+ "k": 18,
167
+ "l": 19,
168
+ "m": 20,
169
+ "n": 21,
170
+ "o": 22,
171
+ "p": 23,
172
+ "q": 24,
173
+ "r": 25,
174
+ "s": 26,
175
+ "t": 27,
176
+ "u": 28,
177
+ "v": 29,
178
+ "w": 30,
179
+ "x": 31,
180
+ "y": 32,
181
+ "z": 33,
182
+ "##o": 34,
183
+ "##r": 35,
184
+ "##g": 36,
185
+ "##t": 37,
186
+ "##i": 38,
187
+ "##l": 39,
188
+ "##e": 40,
189
+ "##s": 41,
190
+ "##a": 42,
191
+ "##n": 43,
192
+ "##u": 44,
193
+ "##c": 45,
194
+ "##h": 46,
195
+ "##v": 47,
196
+ "##y": 48,
197
+ "##k": 49,
198
+ "##p": 50,
199
+ "##w": 51,
200
+ "##d": 52,
201
+ "##m": 53,
202
+ "##b": 54,
203
+ "##f": 55,
204
+ "##x": 56,
205
+ "##z": 57,
206
+ "th": 58,
207
+ "##ou": 59,
208
+ "##re": 60,
209
+ "the": 61,
210
+ "##nd": 62,
211
+ "##is": 63,
212
+ "##es": 64,
213
+ "##er": 65,
214
+ "my": 66,
215
+ "##or": 67,
216
+ "##ve": 68,
217
+ "ha": 69,
218
+ "##it": 70,
219
+ "##ll": 71,
220
+ "to": 72,
221
+ "##nt": 73,
222
+ "and": 74,
223
+ "no": 75,
224
+ "##ed": 76,
225
+ "mo": 77,
226
+ "##st": 78,
227
+ "##at": 79,
228
+ "in": 80,
229
+ "thou": 81,
230
+ "##in": 82,
231
+ "##ea": 83,
232
+ "##me": 84,
233
+ "co": 85,
234
+ "of": 86,
235
+ "##ir": 87,
236
+ "wh": 88,
237
+ "##on": 89,
238
+ "##el": 90,
239
+ "not": 91,
240
+ "is": 92,
241
+ "wi": 93,
242
+ "##ee": 94,
243
+ "as": 95,
244
+ "##ra": 96,
245
+ "##th": 97,
246
+ "##ld": 98,
247
+ "most": 99,
248
+ "for": 100,
249
+ "##ri": 101,
250
+ "will": 102,
251
+ "be": 103,
252
+ "ca": 104,
253
+ "me": 105,
254
+ "so": 106,
255
+ "sh": 107,
256
+ "##ow": 108,
257
+ "##il": 109,
258
+ "##en": 110,
259
+ "##se": 111,
260
+ "##ch": 112,
261
+ "thy": 113,
262
+ "##est": 114,
263
+ "have": 115,
264
+ "what": 116,
265
+ "are": 117,
266
+ "it": 118,
267
+ "li": 119,
268
+ "sp": 120,
269
+ "you": 121,
270
+ "##oo": 122,
271
+ "##ty": 123,
272
+ "##la": 124,
273
+ "##ar": 125,
274
+ "##ay": 126,
275
+ "##ke": 127,
276
+ "this": 128,
277
+ "thee": 129,
278
+ "##ear": 130,
279
+ "come": 131,
280
+ "##irit": 132,
281
+ "spirit": 133,
282
+ "ch": 134,
283
+ "do": 135,
284
+ "his": 136,
285
+ "##ro": 137,
286
+ "##le": 138,
287
+ "##ly": 139,
288
+ "##ut": 140,
289
+ "##ith": 141,
290
+ "more": 142,
291
+ "##ment": 143,
292
+ "all": 144,
293
+ "ba": 145,
294
+ "but": 146,
295
+ "de": 147,
296
+ "lo": 148,
297
+ "st": 149,
298
+ "see": 150,
299
+ "we": 151,
300
+ "wor": 152,
301
+ "with": 153,
302
+ "##gh": 154,
303
+ "##ic": 155,
304
+ "##an": 156,
305
+ "##ake": 157,
306
+ "##ul": 158,
307
+ "##mp": 159,
308
+ "##ber": 160,
309
+ "that": 161,
310
+ "##ould": 162,
311
+ "##ist": 163,
312
+ "now": 164,
313
+ "##ing": 165,
314
+ "##eep": 166,
315
+ "am": 167,
316
+ "ari": 168,
317
+ "ex": 169,
318
+ "go": 170,
319
+ "ho": 171,
320
+ "he": 172,
321
+ "per": 173,
322
+ "re": 174,
323
+ "sou": 175,
324
+ "ser": 176,
325
+ "sla": 177,
326
+ "say": 178,
327
+ "tis": 179,
328
+ "wa": 180,
329
+ "##ge": 181,
330
+ "##id": 182,
331
+ "##et": 183,
332
+ "##ement": 184,
333
+ "##ne": 185,
334
+ "##nst": 186,
335
+ "##ure": 187,
336
+ "##ct": 188,
337
+ "##ves": 189,
338
+ "##ms": 190,
339
+ "##med": 191,
340
+ "##for": 192,
341
+ "##ful": 193,
342
+ "##our": 194,
343
+ "##ous": 195,
344
+ "they": 196,
345
+ "##ish": 197,
346
+ "##ess": 198,
347
+ "##nter": 199,
348
+ "##eas": 200,
349
+ "liber": 201,
350
+ "##ood": 202,
351
+ "char": 203,
352
+ "##rom": 204,
353
+ "ariel": 205,
354
+ "serv": 206,
355
+ "slave": 207,
356
+ "liberty": 208,
357
+ "ad": 209,
358
+ "bo": 210,
359
+ "br": 211,
360
+ "by": 212,
361
+ "bes": 213,
362
+ "bra": 214,
363
+ "bear": 215,
364
+ "du": 216,
365
+ "ear": 217,
366
+ "fre": 218,
367
+ "fir": 219,
368
+ "fri": 220,
369
+ "fly": 221,
370
+ "fet": 222,
371
+ "from": 223,
372
+ "gre": 224,
373
+ "gra": 225,
374
+ "good": 226,
375
+ "hon": 227,
376
+ "how": 228,
377
+ "kn": 229,
378
+ "king": 230,
379
+ "make": 231,
380
+ "mist": 232,
381
+ "ne": 233,
382
+ "on": 234,
383
+ "ou": 235,
384
+ "ow": 236,
385
+ "po": 237,
386
+ "pl": 238,
387
+ "qu": 239,
388
+ "rel": 240,
389
+ "sl": 241,
390
+ "sa": 242,
391
+ "su": 243,
392
+ "sw": 244,
393
+ "sen": 245,
394
+ "tr": 246,
395
+ "would": 247,
396
+ "wish": 248,
397
+ "##ore": 249,
398
+ "##te": 250,
399
+ "##tu": 251,
400
+ "##ip": 252,
401
+ "##ib": 253,
402
+ "##ion": 254,
403
+ "##igh": 255,
404
+ "##lib": 256,
405
+ "##ere": 257,
406
+ "##end": 258,
407
+ "##ss": 259,
408
+ "##ses": 260,
409
+ "##and": 261,
410
+ "##all": 262,
411
+ "##ant": 263,
412
+ "##ain": 264,
413
+ "##ue": 265,
414
+ "##us": 266,
415
+ "##ck": 267,
416
+ "##cit": 268,
417
+ "##ver": 269,
418
+ "##vil": 270,
419
+ "##fe": 271,
420
+ "##ress": 272,
421
+ "them": 273,
422
+ "there": 274,
423
+ "##nds": 275,
424
+ "##ven": 276,
425
+ "has": 277,
426
+ "hast": 278,
427
+ "hath": 279,
428
+ "##ease": 280,
429
+ "cont": 281,
430
+ "comp": 282,
431
+ "##one": 283,
432
+ "##elf": 284,
433
+ "forth": 285,
434
+ "canst": 286,
435
+ "calib": 287,
436
+ "mere": 288,
437
+ "ship": 289,
438
+ "shall": 290,
439
+ "##own": 291,
440
+ "like": 292,
441
+ "your": 293,
442
+ "##lete": 294,
443
+ "bad": 295,
444
+ "devil": 296,
445
+ "love": 297,
446
+ "seek": 298,
447
+ "work": 299
448
+ }
449
+ }
450
+ }
tokenizer_config.json ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cls_token": "[CLS]",
3
+ "do_lower_case": true,
4
+ "mask_token": "[MASK]",
5
+ "model_max_length": 512,
6
+ "pad_token": "[PAD]",
7
+ "sep_token": "[SEP]",
8
+ "special_tokens_map_file": null,
9
+ "strip_accents": null,
10
+ "tokenize_chinese_chars": true,
11
+ "tokenizer_class": "BertTokenizer",
12
+ "unk_token": "[UNK]"
13
+ }
vocab.txt ADDED
@@ -0,0 +1,300 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [PAD]
2
+ [UNK]
3
+ [CLS]
4
+ [SEP]
5
+ [MASK]
6
+ !
7
+ ,
8
+ .
9
+ ?
10
+ a
11
+ b
12
+ c
13
+ d
14
+ e
15
+ f
16
+ g
17
+ h
18
+ i
19
+ k
20
+ l
21
+ m
22
+ n
23
+ o
24
+ p
25
+ q
26
+ r
27
+ s
28
+ t
29
+ u
30
+ v
31
+ w
32
+ x
33
+ y
34
+ z
35
+ ##o
36
+ ##r
37
+ ##g
38
+ ##t
39
+ ##i
40
+ ##l
41
+ ##e
42
+ ##s
43
+ ##a
44
+ ##n
45
+ ##u
46
+ ##c
47
+ ##h
48
+ ##v
49
+ ##y
50
+ ##k
51
+ ##p
52
+ ##w
53
+ ##d
54
+ ##m
55
+ ##b
56
+ ##f
57
+ ##x
58
+ ##z
59
+ th
60
+ ##ou
61
+ ##re
62
+ the
63
+ ##nd
64
+ ##is
65
+ ##es
66
+ ##er
67
+ my
68
+ ##or
69
+ ##ve
70
+ ha
71
+ ##it
72
+ ##ll
73
+ to
74
+ ##nt
75
+ and
76
+ no
77
+ ##ed
78
+ mo
79
+ ##st
80
+ ##at
81
+ in
82
+ thou
83
+ ##in
84
+ ##ea
85
+ ##me
86
+ co
87
+ of
88
+ ##ir
89
+ wh
90
+ ##on
91
+ ##el
92
+ not
93
+ is
94
+ wi
95
+ ##ee
96
+ as
97
+ ##ra
98
+ ##th
99
+ ##ld
100
+ most
101
+ for
102
+ ##ri
103
+ will
104
+ be
105
+ ca
106
+ me
107
+ so
108
+ sh
109
+ ##ow
110
+ ##il
111
+ ##en
112
+ ##se
113
+ ##ch
114
+ thy
115
+ ##est
116
+ have
117
+ what
118
+ are
119
+ it
120
+ li
121
+ sp
122
+ you
123
+ ##oo
124
+ ##ty
125
+ ##la
126
+ ##ar
127
+ ##ay
128
+ ##ke
129
+ this
130
+ thee
131
+ ##ear
132
+ come
133
+ ##irit
134
+ spirit
135
+ ch
136
+ do
137
+ his
138
+ ##ro
139
+ ##le
140
+ ##ly
141
+ ##ut
142
+ ##ith
143
+ more
144
+ ##ment
145
+ all
146
+ ba
147
+ but
148
+ de
149
+ lo
150
+ st
151
+ see
152
+ we
153
+ wor
154
+ with
155
+ ##gh
156
+ ##ic
157
+ ##an
158
+ ##ake
159
+ ##ul
160
+ ##mp
161
+ ##ber
162
+ that
163
+ ##ould
164
+ ##ist
165
+ now
166
+ ##ing
167
+ ##eep
168
+ am
169
+ ari
170
+ ex
171
+ go
172
+ ho
173
+ he
174
+ per
175
+ re
176
+ sou
177
+ ser
178
+ sla
179
+ say
180
+ tis
181
+ wa
182
+ ##ge
183
+ ##id
184
+ ##et
185
+ ##ement
186
+ ##ne
187
+ ##nst
188
+ ##ure
189
+ ##ct
190
+ ##ves
191
+ ##ms
192
+ ##med
193
+ ##for
194
+ ##ful
195
+ ##our
196
+ ##ous
197
+ they
198
+ ##ish
199
+ ##ess
200
+ ##nter
201
+ ##eas
202
+ liber
203
+ ##ood
204
+ char
205
+ ##rom
206
+ ariel
207
+ serv
208
+ slave
209
+ liberty
210
+ ad
211
+ bo
212
+ br
213
+ by
214
+ bes
215
+ bra
216
+ bear
217
+ du
218
+ ear
219
+ fre
220
+ fir
221
+ fri
222
+ fly
223
+ fet
224
+ from
225
+ gre
226
+ gra
227
+ good
228
+ hon
229
+ how
230
+ kn
231
+ king
232
+ make
233
+ mist
234
+ ne
235
+ on
236
+ ou
237
+ ow
238
+ po
239
+ pl
240
+ qu
241
+ rel
242
+ sl
243
+ sa
244
+ su
245
+ sw
246
+ sen
247
+ tr
248
+ would
249
+ wish
250
+ ##ore
251
+ ##te
252
+ ##tu
253
+ ##ip
254
+ ##ib
255
+ ##ion
256
+ ##igh
257
+ ##lib
258
+ ##ere
259
+ ##end
260
+ ##ss
261
+ ##ses
262
+ ##and
263
+ ##all
264
+ ##ant
265
+ ##ain
266
+ ##ue
267
+ ##us
268
+ ##ck
269
+ ##cit
270
+ ##ver
271
+ ##vil
272
+ ##fe
273
+ ##ress
274
+ them
275
+ there
276
+ ##nds
277
+ ##ven
278
+ has
279
+ hast
280
+ hath
281
+ ##ease
282
+ cont
283
+ comp
284
+ ##one
285
+ ##elf
286
+ forth
287
+ canst
288
+ calib
289
+ mere
290
+ ship
291
+ shall
292
+ ##own
293
+ like
294
+ your
295
+ ##lete
296
+ bad
297
+ devil
298
+ love
299
+ seek
300
+ work