Exqrch commited on
Commit
dfb317b
1 Parent(s): ebf742c

Upload tokenizer

Browse files
Files changed (2) hide show
  1. tokenizer.json +159 -159
  2. vocab.txt +63 -63
tokenizer.json CHANGED
@@ -179,30 +179,30 @@
179
  "x": 31,
180
  "y": 32,
181
  "z": 33,
182
- "##o": 34,
183
- "##r": 35,
184
- "##g": 36,
185
- "##t": 37,
186
- "##i": 38,
187
- "##l": 39,
188
- "##e": 40,
189
- "##s": 41,
190
- "##a": 42,
191
- "##n": 43,
192
- "##u": 44,
193
- "##c": 45,
194
- "##h": 46,
195
- "##v": 47,
196
- "##y": 48,
197
- "##k": 49,
198
- "##p": 50,
199
- "##w": 51,
200
- "##d": 52,
201
- "##m": 53,
202
- "##b": 54,
203
- "##f": 55,
204
- "##x": 56,
205
- "##z": 57,
206
  "th": 58,
207
  "##ou": 59,
208
  "##re": 60,
@@ -215,8 +215,8 @@
215
  "##or": 67,
216
  "##ve": 68,
217
  "ha": 69,
218
- "##it": 70,
219
- "##ll": 71,
220
  "to": 72,
221
  "##nt": 73,
222
  "and": 74,
@@ -227,23 +227,23 @@
227
  "##at": 79,
228
  "in": 80,
229
  "thou": 81,
230
- "##in": 82,
231
- "##ea": 83,
232
  "##me": 84,
233
  "co": 85,
234
  "of": 86,
235
  "##ir": 87,
236
  "wh": 88,
237
- "##on": 89,
238
- "##el": 90,
239
  "not": 91,
240
  "is": 92,
241
  "wi": 93,
242
  "##ee": 94,
243
  "as": 95,
244
- "##ra": 96,
245
  "##th": 97,
246
- "##ld": 98,
247
  "most": 99,
248
  "for": 100,
249
  "##ri": 101,
@@ -253,23 +253,23 @@
253
  "me": 105,
254
  "so": 106,
255
  "sh": 107,
256
- "##ow": 108,
257
- "##il": 109,
258
- "##en": 110,
259
- "##se": 111,
260
  "##ch": 112,
261
  "thy": 113,
262
  "##est": 114,
263
  "have": 115,
264
  "what": 116,
265
  "are": 117,
266
- "it": 118,
267
- "li": 119,
268
- "sp": 120,
269
- "you": 121,
270
- "##oo": 122,
271
- "##ty": 123,
272
- "##la": 124,
273
  "##ar": 125,
274
  "##ay": 126,
275
  "##ke": 127,
@@ -280,44 +280,44 @@
280
  "##irit": 132,
281
  "spirit": 133,
282
  "ch": 134,
283
- "do": 135,
284
- "his": 136,
285
- "##ro": 137,
286
- "##le": 138,
287
- "##ly": 139,
288
- "##ut": 140,
289
- "##ith": 141,
290
- "more": 142,
291
- "##ment": 143,
292
- "all": 144,
293
- "ba": 145,
294
- "but": 146,
295
- "de": 147,
296
- "lo": 148,
297
  "st": 149,
298
  "see": 150,
299
  "we": 151,
300
  "wor": 152,
301
  "with": 153,
302
- "##gh": 154,
303
- "##ic": 155,
304
  "##an": 156,
305
  "##ake": 157,
306
  "##ul": 158,
307
- "##mp": 159,
308
- "##ber": 160,
309
- "that": 161,
310
- "##ould": 162,
311
- "##ist": 163,
312
- "now": 164,
313
- "##ing": 165,
314
- "##eep": 166,
315
- "am": 167,
316
- "ari": 168,
317
- "ex": 169,
318
- "go": 170,
319
- "ho": 171,
320
- "he": 172,
321
  "per": 173,
322
  "re": 174,
323
  "sou": 175,
@@ -326,97 +326,97 @@
326
  "say": 178,
327
  "tis": 179,
328
  "wa": 180,
329
- "##ge": 181,
330
- "##id": 182,
331
- "##et": 183,
332
- "##ement": 184,
333
- "##ne": 185,
334
- "##nst": 186,
335
- "##ure": 187,
336
- "##ct": 188,
337
- "##ves": 189,
338
- "##ms": 190,
339
- "##med": 191,
340
- "##for": 192,
341
- "##ful": 193,
342
- "##our": 194,
343
  "##ous": 195,
344
- "they": 196,
345
- "##ish": 197,
346
- "##ess": 198,
347
- "##nter": 199,
348
- "##eas": 200,
349
- "liber": 201,
350
- "##ood": 202,
351
- "char": 203,
352
- "##rom": 204,
353
  "ariel": 205,
354
  "serv": 206,
355
  "slave": 207,
356
  "liberty": 208,
357
  "ad": 209,
358
- "bo": 210,
359
- "br": 211,
360
  "by": 212,
361
  "bes": 213,
362
  "bra": 214,
363
  "bear": 215,
364
  "du": 216,
365
- "ear": 217,
366
- "fre": 218,
367
- "fir": 219,
368
- "fri": 220,
369
- "fly": 221,
370
- "fet": 222,
371
- "from": 223,
372
- "gre": 224,
373
- "gra": 225,
374
- "good": 226,
375
  "hon": 227,
376
- "how": 228,
377
- "kn": 229,
378
- "king": 230,
379
- "make": 231,
380
- "mist": 232,
381
- "ne": 233,
382
  "on": 234,
383
- "ou": 235,
384
- "ow": 236,
385
- "po": 237,
386
- "pl": 238,
387
- "qu": 239,
388
- "rel": 240,
389
- "sl": 241,
390
- "sa": 242,
391
- "su": 243,
392
- "sw": 244,
393
- "sen": 245,
394
- "tr": 246,
395
- "would": 247,
396
- "wish": 248,
397
- "##ore": 249,
398
- "##te": 250,
399
- "##tu": 251,
400
- "##ip": 252,
401
- "##ib": 253,
402
- "##ion": 254,
403
- "##igh": 255,
404
- "##lib": 256,
405
- "##ere": 257,
406
- "##end": 258,
407
- "##ss": 259,
408
- "##ses": 260,
409
- "##and": 261,
410
- "##all": 262,
411
- "##ant": 263,
412
- "##ain": 264,
413
  "##ue": 265,
414
  "##us": 266,
415
- "##ck": 267,
416
- "##cit": 268,
417
- "##ver": 269,
418
- "##vil": 270,
419
- "##fe": 271,
420
  "##ress": 272,
421
  "them": 273,
422
  "there": 274,
@@ -428,23 +428,23 @@
428
  "##ease": 280,
429
  "cont": 281,
430
  "comp": 282,
431
- "##one": 283,
432
- "##elf": 284,
433
  "forth": 285,
434
  "canst": 286,
435
- "calib": 287,
436
  "mere": 288,
437
  "ship": 289,
438
  "shall": 290,
439
- "##own": 291,
440
  "like": 292,
441
  "your": 293,
442
- "##lete": 294,
443
- "bad": 295,
444
- "devil": 296,
445
- "love": 297,
446
- "seek": 298,
447
- "work": 299
448
  }
449
  }
450
  }
 
179
  "x": 31,
180
  "y": 32,
181
  "z": 33,
182
+ "##l": 34,
183
+ "##e": 35,
184
+ "##p": 36,
185
+ "##i": 37,
186
+ "##s": 38,
187
+ "##t": 39,
188
+ "##r": 40,
189
+ "##f": 41,
190
+ "##w": 42,
191
+ "##a": 43,
192
+ "##o": 44,
193
+ "##u": 45,
194
+ "##n": 46,
195
+ "##d": 47,
196
+ "##g": 48,
197
+ "##h": 49,
198
+ "##v": 50,
199
+ "##k": 51,
200
+ "##m": 52,
201
+ "##z": 53,
202
+ "##y": 54,
203
+ "##c": 55,
204
+ "##b": 56,
205
+ "##x": 57,
206
  "th": 58,
207
  "##ou": 59,
208
  "##re": 60,
 
215
  "##or": 67,
216
  "##ve": 68,
217
  "ha": 69,
218
+ "##ll": 70,
219
+ "##it": 71,
220
  "to": 72,
221
  "##nt": 73,
222
  "and": 74,
 
227
  "##at": 79,
228
  "in": 80,
229
  "thou": 81,
230
+ "##ea": 82,
231
+ "##in": 83,
232
  "##me": 84,
233
  "co": 85,
234
  "of": 86,
235
  "##ir": 87,
236
  "wh": 88,
237
+ "##el": 89,
238
+ "##on": 90,
239
  "not": 91,
240
  "is": 92,
241
  "wi": 93,
242
  "##ee": 94,
243
  "as": 95,
244
+ "##ld": 96,
245
  "##th": 97,
246
+ "##ra": 98,
247
  "most": 99,
248
  "for": 100,
249
  "##ri": 101,
 
253
  "me": 105,
254
  "so": 106,
255
  "sh": 107,
256
+ "##la": 108,
257
+ "##en": 109,
258
+ "##se": 110,
259
+ "##ro": 111,
260
  "##ch": 112,
261
  "thy": 113,
262
  "##est": 114,
263
  "have": 115,
264
  "what": 116,
265
  "are": 117,
266
+ "do": 118,
267
+ "it": 119,
268
+ "li": 120,
269
+ "sp": 121,
270
+ "you": 122,
271
+ "##il": 123,
272
+ "##ty": 124,
273
  "##ar": 125,
274
  "##ay": 126,
275
  "##ke": 127,
 
280
  "##irit": 132,
281
  "spirit": 133,
282
  "ch": 134,
283
+ "go": 135,
284
+ "ho": 136,
285
+ "his": 137,
286
+ "lo": 138,
287
+ "##le": 139,
288
+ "##ly": 140,
289
+ "##ut": 141,
290
+ "##ith": 142,
291
+ "more": 143,
292
+ "##ment": 144,
293
+ "all": 145,
294
+ "ba": 146,
295
+ "but": 147,
296
+ "de": 148,
297
  "st": 149,
298
  "see": 150,
299
  "we": 151,
300
  "wor": 152,
301
  "with": 153,
302
+ "##ic": 154,
303
+ "##wn": 155,
304
  "##an": 156,
305
  "##ake": 157,
306
  "##ul": 158,
307
+ "##gh": 159,
308
+ "##mp": 160,
309
+ "##ber": 161,
310
+ "that": 162,
311
+ "##ould": 163,
312
+ "##ist": 164,
313
+ "now": 165,
314
+ "##ing": 166,
315
+ "##eep": 167,
316
+ "am": 168,
317
+ "ari": 169,
318
+ "ex": 170,
319
+ "he": 171,
320
+ "po": 172,
321
  "per": 173,
322
  "re": 174,
323
  "sou": 175,
 
326
  "say": 178,
327
  "tis": 179,
328
  "wa": 180,
329
+ "##et": 181,
330
+ "##ement": 182,
331
+ "##id": 183,
332
+ "##for": 184,
333
+ "##ful": 185,
334
+ "##od": 186,
335
+ "##un": 187,
336
+ "##ure": 188,
337
+ "##nst": 189,
338
+ "##ge": 190,
339
+ "##ves": 191,
340
+ "##ms": 192,
341
+ "##med": 193,
342
+ "##ct": 194,
343
  "##ous": 195,
344
+ "##our": 196,
345
+ "they": 197,
346
+ "##ish": 198,
347
+ "##ess": 199,
348
+ "##nter": 200,
349
+ "##eas": 201,
350
+ "##rom": 202,
351
+ "liber": 203,
352
+ "char": 204,
353
  "ariel": 205,
354
  "serv": 206,
355
  "slave": 207,
356
  "liberty": 208,
357
  "ad": 209,
358
+ "br": 210,
359
+ "bo": 211,
360
  "by": 212,
361
  "bes": 213,
362
  "bra": 214,
363
  "bear": 215,
364
  "du": 216,
365
+ "en": 217,
366
+ "ear": 218,
367
+ "fre": 219,
368
+ "fir": 220,
369
+ "fri": 221,
370
+ "fly": 222,
371
+ "fet": 223,
372
+ "from": 224,
373
+ "gre": 225,
374
+ "gra": 226,
375
  "hon": 227,
376
+ "kn": 228,
377
+ "king": 229,
378
+ "make": 230,
379
+ "mist": 231,
380
+ "ne": 232,
381
+ "ou": 233,
382
  "on": 234,
383
+ "own": 235,
384
+ "pl": 236,
385
+ "qu": 237,
386
+ "rel": 238,
387
+ "sl": 239,
388
+ "sw": 240,
389
+ "sa": 241,
390
+ "su": 242,
391
+ "sen": 243,
392
+ "vi": 244,
393
+ "would": 245,
394
+ "wish": 246,
395
+ "##li": 247,
396
+ "##em": 248,
397
+ "##ere": 249,
398
+ "##end": 250,
399
+ "##ip": 251,
400
+ "##ion": 252,
401
+ "##igh": 253,
402
+ "##ss": 254,
403
+ "##ses": 255,
404
+ "##te": 256,
405
+ "##tun": 257,
406
+ "##fe": 258,
407
+ "##wer": 259,
408
+ "##and": 260,
409
+ "##all": 261,
410
+ "##ant": 262,
411
+ "##ow": 263,
412
+ "##ore": 264,
413
  "##ue": 265,
414
  "##us": 266,
415
+ "##ver": 267,
416
+ "##vil": 268,
417
+ "##ck": 269,
418
+ "##cit": 270,
419
+ "##ban": 271,
420
  "##ress": 272,
421
  "them": 273,
422
  "there": 274,
 
428
  "##ease": 280,
429
  "cont": 281,
430
  "comp": 282,
431
+ "##elf": 283,
432
+ "##one": 284,
433
  "forth": 285,
434
  "canst": 286,
435
+ "cali": 287,
436
  "mere": 288,
437
  "ship": 289,
438
  "shall": 290,
439
+ "##lain": 291,
440
  "like": 292,
441
  "your": 293,
442
+ "good": 294,
443
+ "how": 295,
444
+ "love": 296,
445
+ "##lete": 297,
446
+ "bad": 298,
447
+ "devil": 299
448
  }
449
  }
450
  }
vocab.txt CHANGED
@@ -32,30 +32,30 @@ w
32
  x
33
  y
34
  z
35
- ##o
36
- ##r
37
- ##g
38
- ##t
39
- ##i
40
  ##l
41
  ##e
 
 
42
  ##s
 
 
 
 
43
  ##a
44
- ##n
45
  ##u
46
- ##c
 
 
47
  ##h
48
  ##v
49
- ##y
50
  ##k
51
- ##p
52
- ##w
53
- ##d
54
  ##m
 
 
 
55
  ##b
56
- ##f
57
  ##x
58
- ##z
59
  th
60
  ##ou
61
  ##re
@@ -68,8 +68,8 @@ my
68
  ##or
69
  ##ve
70
  ha
71
- ##it
72
  ##ll
 
73
  to
74
  ##nt
75
  and
@@ -80,23 +80,23 @@ mo
80
  ##at
81
  in
82
  thou
83
- ##in
84
  ##ea
 
85
  ##me
86
  co
87
  of
88
  ##ir
89
  wh
90
- ##on
91
  ##el
 
92
  not
93
  is
94
  wi
95
  ##ee
96
  as
97
- ##ra
98
- ##th
99
  ##ld
 
 
100
  most
101
  for
102
  ##ri
@@ -106,23 +106,23 @@ ca
106
  me
107
  so
108
  sh
109
- ##ow
110
- ##il
111
  ##en
112
  ##se
 
113
  ##ch
114
  thy
115
  ##est
116
  have
117
  what
118
  are
 
119
  it
120
  li
121
  sp
122
  you
123
- ##oo
124
  ##ty
125
- ##la
126
  ##ar
127
  ##ay
128
  ##ke
@@ -133,9 +133,10 @@ come
133
  ##irit
134
  spirit
135
  ch
136
- do
 
137
  his
138
- ##ro
139
  ##le
140
  ##ly
141
  ##ut
@@ -146,17 +147,17 @@ all
146
  ba
147
  but
148
  de
149
- lo
150
  st
151
  see
152
  we
153
  wor
154
  with
155
- ##gh
156
  ##ic
 
157
  ##an
158
  ##ake
159
  ##ul
 
160
  ##mp
161
  ##ber
162
  that
@@ -168,9 +169,8 @@ now
168
  am
169
  ari
170
  ex
171
- go
172
- ho
173
  he
 
174
  per
175
  re
176
  sou
@@ -179,42 +179,43 @@ sla
179
  say
180
  tis
181
  wa
182
- ##ge
183
- ##id
184
  ##et
185
  ##ement
186
- ##ne
187
- ##nst
 
 
 
188
  ##ure
189
- ##ct
 
190
  ##ves
191
  ##ms
192
  ##med
193
- ##for
194
- ##ful
195
- ##our
196
  ##ous
 
197
  they
198
  ##ish
199
  ##ess
200
  ##nter
201
  ##eas
 
202
  liber
203
- ##ood
204
  char
205
- ##rom
206
  ariel
207
  serv
208
  slave
209
  liberty
210
  ad
211
- bo
212
  br
 
213
  by
214
  bes
215
  bra
216
  bear
217
  du
 
218
  ear
219
  fre
220
  fir
@@ -224,52 +225,51 @@ fet
224
  from
225
  gre
226
  gra
227
- good
228
  hon
229
- how
230
  kn
231
  king
232
  make
233
  mist
234
  ne
235
- on
236
  ou
237
- ow
238
- po
239
  pl
240
  qu
241
  rel
242
  sl
 
243
  sa
244
  su
245
- sw
246
  sen
247
- tr
248
  would
249
  wish
250
- ##ore
251
- ##te
252
- ##tu
 
253
  ##ip
254
- ##ib
255
  ##ion
256
  ##igh
257
- ##lib
258
- ##ere
259
- ##end
260
  ##ss
261
  ##ses
 
 
 
 
262
  ##and
263
  ##all
264
  ##ant
265
- ##ain
 
266
  ##ue
267
  ##us
268
- ##ck
269
- ##cit
270
  ##ver
271
  ##vil
272
- ##fe
 
 
273
  ##ress
274
  them
275
  there
@@ -281,20 +281,20 @@ hath
281
  ##ease
282
  cont
283
  comp
284
- ##one
285
  ##elf
 
286
  forth
287
  canst
288
- calib
289
  mere
290
  ship
291
  shall
292
- ##own
293
  like
294
  your
 
 
 
295
  ##lete
296
  bad
297
  devil
298
- love
299
- seek
300
- work
 
32
  x
33
  y
34
  z
 
 
 
 
 
35
  ##l
36
  ##e
37
+ ##p
38
+ ##i
39
  ##s
40
+ ##t
41
+ ##r
42
+ ##f
43
+ ##w
44
  ##a
45
+ ##o
46
  ##u
47
+ ##n
48
+ ##d
49
+ ##g
50
  ##h
51
  ##v
 
52
  ##k
 
 
 
53
  ##m
54
+ ##z
55
+ ##y
56
+ ##c
57
  ##b
 
58
  ##x
 
59
  th
60
  ##ou
61
  ##re
 
68
  ##or
69
  ##ve
70
  ha
 
71
  ##ll
72
+ ##it
73
  to
74
  ##nt
75
  and
 
80
  ##at
81
  in
82
  thou
 
83
  ##ea
84
+ ##in
85
  ##me
86
  co
87
  of
88
  ##ir
89
  wh
 
90
  ##el
91
+ ##on
92
  not
93
  is
94
  wi
95
  ##ee
96
  as
 
 
97
  ##ld
98
+ ##th
99
+ ##ra
100
  most
101
  for
102
  ##ri
 
106
  me
107
  so
108
  sh
109
+ ##la
 
110
  ##en
111
  ##se
112
+ ##ro
113
  ##ch
114
  thy
115
  ##est
116
  have
117
  what
118
  are
119
+ do
120
  it
121
  li
122
  sp
123
  you
124
+ ##il
125
  ##ty
 
126
  ##ar
127
  ##ay
128
  ##ke
 
133
  ##irit
134
  spirit
135
  ch
136
+ go
137
+ ho
138
  his
139
+ lo
140
  ##le
141
  ##ly
142
  ##ut
 
147
  ba
148
  but
149
  de
 
150
  st
151
  see
152
  we
153
  wor
154
  with
 
155
  ##ic
156
+ ##wn
157
  ##an
158
  ##ake
159
  ##ul
160
+ ##gh
161
  ##mp
162
  ##ber
163
  that
 
169
  am
170
  ari
171
  ex
 
 
172
  he
173
+ po
174
  per
175
  re
176
  sou
 
179
  say
180
  tis
181
  wa
 
 
182
  ##et
183
  ##ement
184
+ ##id
185
+ ##for
186
+ ##ful
187
+ ##od
188
+ ##un
189
  ##ure
190
+ ##nst
191
+ ##ge
192
  ##ves
193
  ##ms
194
  ##med
195
+ ##ct
 
 
196
  ##ous
197
+ ##our
198
  they
199
  ##ish
200
  ##ess
201
  ##nter
202
  ##eas
203
+ ##rom
204
  liber
 
205
  char
 
206
  ariel
207
  serv
208
  slave
209
  liberty
210
  ad
 
211
  br
212
+ bo
213
  by
214
  bes
215
  bra
216
  bear
217
  du
218
+ en
219
  ear
220
  fre
221
  fir
 
225
  from
226
  gre
227
  gra
 
228
  hon
 
229
  kn
230
  king
231
  make
232
  mist
233
  ne
 
234
  ou
235
+ on
236
+ own
237
  pl
238
  qu
239
  rel
240
  sl
241
+ sw
242
  sa
243
  su
 
244
  sen
245
+ vi
246
  would
247
  wish
248
+ ##li
249
+ ##em
250
+ ##ere
251
+ ##end
252
  ##ip
 
253
  ##ion
254
  ##igh
 
 
 
255
  ##ss
256
  ##ses
257
+ ##te
258
+ ##tun
259
+ ##fe
260
+ ##wer
261
  ##and
262
  ##all
263
  ##ant
264
+ ##ow
265
+ ##ore
266
  ##ue
267
  ##us
 
 
268
  ##ver
269
  ##vil
270
+ ##ck
271
+ ##cit
272
+ ##ban
273
  ##ress
274
  them
275
  there
 
281
  ##ease
282
  cont
283
  comp
 
284
  ##elf
285
+ ##one
286
  forth
287
  canst
288
+ cali
289
  mere
290
  ship
291
  shall
292
+ ##lain
293
  like
294
  your
295
+ good
296
+ how
297
+ love
298
  ##lete
299
  bad
300
  devil