Files changed (4) hide show
  1. special_tokens_map.json +7 -1
  2. tokenizer.json +650 -0
  3. tokenizer_config.json +15 -1
  4. vocab.txt +0 -0
special_tokens_map.json CHANGED
@@ -1 +1,7 @@
1
- {"unk_token": "[UNK]", "sep_token": "[SEP]", "pad_token": "[PAD]", "cls_token": "[CLS]", "mask_token": "[MASK]"}
 
 
 
 
 
 
 
1
+ {
2
+ "cls_token": "[CLS]",
3
+ "mask_token": "[MASK]",
4
+ "pad_token": "[PAD]",
5
+ "sep_token": "[SEP]",
6
+ "unk_token": "[UNK]"
7
+ }
tokenizer.json ADDED
@@ -0,0 +1,650 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "version": "1.0",
3
+ "truncation": null,
4
+ "padding": null,
5
+ "added_tokens": [
6
+ {
7
+ "id": 0,
8
+ "content": "[PAD]",
9
+ "single_word": false,
10
+ "lstrip": false,
11
+ "rstrip": false,
12
+ "normalized": false,
13
+ "special": true
14
+ },
15
+ {
16
+ "id": 1,
17
+ "content": "[UNK]",
18
+ "single_word": false,
19
+ "lstrip": false,
20
+ "rstrip": false,
21
+ "normalized": false,
22
+ "special": true
23
+ },
24
+ {
25
+ "id": 2,
26
+ "content": "[CLS]",
27
+ "single_word": false,
28
+ "lstrip": false,
29
+ "rstrip": false,
30
+ "normalized": false,
31
+ "special": true
32
+ },
33
+ {
34
+ "id": 3,
35
+ "content": "[SEP]",
36
+ "single_word": false,
37
+ "lstrip": false,
38
+ "rstrip": false,
39
+ "normalized": false,
40
+ "special": true
41
+ },
42
+ {
43
+ "id": 4,
44
+ "content": "[MASK]",
45
+ "single_word": false,
46
+ "lstrip": false,
47
+ "rstrip": false,
48
+ "normalized": false,
49
+ "special": true
50
+ }
51
+ ],
52
+ "normalizer": {
53
+ "type": "BertNormalizer",
54
+ "clean_text": true,
55
+ "handle_chinese_chars": true,
56
+ "strip_accents": null,
57
+ "lowercase": true
58
+ },
59
+ "pre_tokenizer": {
60
+ "type": "BertPreTokenizer"
61
+ },
62
+ "post_processor": {
63
+ "type": "TemplateProcessing",
64
+ "single": [
65
+ {
66
+ "SpecialToken": {
67
+ "id": "[CLS]",
68
+ "type_id": 0
69
+ }
70
+ },
71
+ {
72
+ "Sequence": {
73
+ "id": "A",
74
+ "type_id": 0
75
+ }
76
+ },
77
+ {
78
+ "SpecialToken": {
79
+ "id": "[SEP]",
80
+ "type_id": 0
81
+ }
82
+ }
83
+ ],
84
+ "pair": [
85
+ {
86
+ "SpecialToken": {
87
+ "id": "[CLS]",
88
+ "type_id": 0
89
+ }
90
+ },
91
+ {
92
+ "Sequence": {
93
+ "id": "A",
94
+ "type_id": 0
95
+ }
96
+ },
97
+ {
98
+ "SpecialToken": {
99
+ "id": "[SEP]",
100
+ "type_id": 0
101
+ }
102
+ },
103
+ {
104
+ "Sequence": {
105
+ "id": "B",
106
+ "type_id": 1
107
+ }
108
+ },
109
+ {
110
+ "SpecialToken": {
111
+ "id": "[SEP]",
112
+ "type_id": 1
113
+ }
114
+ }
115
+ ],
116
+ "special_tokens": {
117
+ "[CLS]": {
118
+ "id": "[CLS]",
119
+ "ids": [
120
+ 2
121
+ ],
122
+ "tokens": [
123
+ "[CLS]"
124
+ ]
125
+ },
126
+ "[SEP]": {
127
+ "id": "[SEP]",
128
+ "ids": [
129
+ 3
130
+ ],
131
+ "tokens": [
132
+ "[SEP]"
133
+ ]
134
+ }
135
+ }
136
+ },
137
+ "decoder": {
138
+ "type": "WordPiece",
139
+ "prefix": "##",
140
+ "cleanup": true
141
+ },
142
+ "model": {
143
+ "type": "WordPiece",
144
+ "unk_token": "[UNK]",
145
+ "continuing_subword_prefix": "##",
146
+ "max_input_chars_per_word": 100,
147
+ "vocab": {
148
+ "[PAD]": 0,
149
+ "[UNK]": 1,
150
+ "[CLS]": 2,
151
+ "[SEP]": 3,
152
+ "[MASK]": 4,
153
+ "ء": 5,
154
+ "ا": 6,
155
+ "ب": 7,
156
+ "ت": 8,
157
+ "ث": 9,
158
+ "ج": 10,
159
+ "ح": 11,
160
+ "خ": 12,
161
+ "د": 13,
162
+ "ذ": 14,
163
+ "ر": 15,
164
+ "ز": 16,
165
+ "س": 17,
166
+ "ش": 18,
167
+ "ص": 19,
168
+ "ض": 20,
169
+ "ط": 21,
170
+ "ظ": 22,
171
+ "ع": 23,
172
+ "غ": 24,
173
+ "ف": 25,
174
+ "ق": 26,
175
+ "ك": 27,
176
+ "ل": 28,
177
+ "م": 29,
178
+ "ن": 30,
179
+ "ه": 31,
180
+ "و": 32,
181
+ "ي": 33,
182
+ "پ": 34,
183
+ "ލ": 35,
184
+ "##ل": 36,
185
+ "##ب": 37,
186
+ "##ر": 38,
187
+ "##ق": 39,
188
+ "##ي": 40,
189
+ "##ه": 41,
190
+ "##ن": 42,
191
+ "##ج": 43,
192
+ "##غ": 44,
193
+ "##ع": 45,
194
+ "##ض": 46,
195
+ "##ح": 47,
196
+ "##ك": 48,
197
+ "##و": 49,
198
+ "##م": 50,
199
+ "##ت": 51,
200
+ "##ش": 52,
201
+ "##ا": 53,
202
+ "##خ": 54,
203
+ "##ف": 55,
204
+ "##ث": 56,
205
+ "##ز": 57,
206
+ "##د": 58,
207
+ "##ء": 59,
208
+ "##س": 60,
209
+ "##ظ": 61,
210
+ "##ط": 62,
211
+ "##ذ": 63,
212
+ "##ص": 64,
213
+ "ال": 65,
214
+ "##ال": 66,
215
+ "##يه": 67,
216
+ "الم": 68,
217
+ "الا": 69,
218
+ "##ات": 70,
219
+ "##ان": 71,
220
+ "##ري": 72,
221
+ "##لي": 73,
222
+ "##اء": 74,
223
+ "##ار": 75,
224
+ "##ام": 76,
225
+ "ان": 77,
226
+ "##ين": 78,
227
+ "##اد": 79,
228
+ "##ير": 80,
229
+ "##اب": 81,
230
+ "##ول": 82,
231
+ "علي": 83,
232
+ "##ون": 84,
233
+ "##ها": 85,
234
+ "الع": 86,
235
+ "##اع": 87,
236
+ "وال": 88,
237
+ "##ست": 89,
238
+ "الس": 90,
239
+ "الي": 91,
240
+ "لل": 92,
241
+ "الت": 93,
242
+ "##ور": 94,
243
+ "##اس": 95,
244
+ "##اف": 96,
245
+ "الج": 97,
246
+ "##مه": 98,
247
+ "##يد": 99,
248
+ "الح": 100,
249
+ "الق": 101,
250
+ "##رب": 102,
251
+ "##وا": 103,
252
+ "##يا": 104,
253
+ "الف": 105,
254
+ "##ره": 106,
255
+ "بال": 107,
256
+ "##له": 108,
257
+ "##ود": 109,
258
+ "##را": 110,
259
+ "##وم": 111,
260
+ "الد": 112,
261
+ "##لا": 113,
262
+ "##هم": 114,
263
+ "الش": 115,
264
+ "وا": 116,
265
+ "##حد": 117,
266
+ "##يس": 118,
267
+ "##نا": 119,
268
+ "الب": 120,
269
+ "##قه": 121,
270
+ "##يل": 122,
271
+ "##من": 123,
272
+ "##عه": 124,
273
+ "الر": 125,
274
+ "##قي": 126,
275
+ "##رك": 127,
276
+ "##نت": 128,
277
+ "##اه": 129,
278
+ "وت": 130,
279
+ "##مد": 131,
280
+ "##قد": 132,
281
+ "##في": 133,
282
+ "##وي": 134,
283
+ "الو": 135,
284
+ "##اره": 136,
285
+ "الن": 137,
286
+ "##مل": 138,
287
+ "##مر": 139,
288
+ "##ته": 140,
289
+ "##لس": 141,
290
+ "اع": 142,
291
+ "##يم": 143,
292
+ "وق": 144,
293
+ "الاس": 145,
294
+ "##راء": 146,
295
+ "##وري": 147,
296
+ "الخ": 148,
297
+ "مح": 149,
298
+ "##جه": 150,
299
+ "##ءيس": 151,
300
+ "##بي": 152,
301
+ "##به": 153,
302
+ "##ني": 154,
303
+ "##صر": 155,
304
+ "##عد": 156,
305
+ "##كن": 157,
306
+ "##وق": 158,
307
+ "ام": 159,
308
+ "##لال": 160,
309
+ "##هد": 161,
310
+ "##وس": 162,
311
+ "اس": 163,
312
+ "##بد": 164,
313
+ "##بر": 165,
314
+ "##حه": 166,
315
+ "##تي": 167,
316
+ "##لام": 168,
317
+ "##مال": 169,
318
+ "##لم": 170,
319
+ "الص": 171,
320
+ "الث": 172,
321
+ "##كر": 173,
322
+ "##تم": 174,
323
+ "##فا": 175,
324
+ "من": 176,
325
+ "##ده": 177,
326
+ "المت": 178,
327
+ "است": 179,
328
+ "الام": 180,
329
+ "##وله": 181,
330
+ "##اني": 182,
331
+ "##قت": 183,
332
+ "##ما": 184,
333
+ "##ريق": 185,
334
+ "##حت": 186,
335
+ "الك": 187,
336
+ "##سي": 188,
337
+ "اي": 189,
338
+ "##قل": 190,
339
+ "##جم": 191,
340
+ "##با": 192,
341
+ "##اص": 193,
342
+ "##دي": 194,
343
+ "##فه": 195,
344
+ "او": 196,
345
+ "##ضي": 197,
346
+ "##وع": 198,
347
+ "اك": 199,
348
+ "بن": 200,
349
+ "##وات": 201,
350
+ "##شر": 202,
351
+ "##طه": 203,
352
+ "##كه": 204,
353
+ "##بار": 205,
354
+ "##زي": 206,
355
+ "##نه": 207,
356
+ "مس": 208,
357
+ "##تح": 209,
358
+ "##لك": 210,
359
+ "وك": 211,
360
+ "وي": 212,
361
+ "اب": 213,
362
+ "اخ": 214,
363
+ "##وض": 215,
364
+ "خلال": 216,
365
+ "##ادي": 217,
366
+ "##عت": 218,
367
+ "##شار": 219,
368
+ "##صل": 220,
369
+ "##الي": 221,
370
+ "##قب": 222,
371
+ "سي": 223,
372
+ "##اله": 224,
373
+ "##رت": 225,
374
+ "##اري": 226,
375
+ "وم": 227,
376
+ "وقال": 228,
377
+ "الل": 229,
378
+ "المس": 230,
379
+ "الان": 231,
380
+ "##اده": 232,
381
+ "##ولي": 233,
382
+ "انه": 234,
383
+ "##خل": 235,
384
+ "##هر": 236,
385
+ "مد": 237,
386
+ "##اج": 238,
387
+ "عبد": 239,
388
+ "##دد": 240,
389
+ "##زاء": 241,
390
+ "##وب": 242,
391
+ "##يره": 243,
392
+ "اج": 244,
393
+ "##دم": 245,
394
+ "##عود": 246,
395
+ "مع": 247,
396
+ "مت": 248,
397
+ "##قا": 249,
398
+ "##وف": 250,
399
+ "##اي": 251,
400
+ "##وره": 252,
401
+ "##حي": 253,
402
+ "العام": 254,
403
+ "المن": 255,
404
+ "بر": 256,
405
+ "##رض": 257,
406
+ "##انيه": 258,
407
+ "لم": 259,
408
+ "##سم": 260,
409
+ "##صري": 261,
410
+ "وز": 262,
411
+ "تع": 263,
412
+ "##طر": 264,
413
+ "##كو": 265,
414
+ "##ديد": 266,
415
+ "بد": 267,
416
+ "##ضاف": 268,
417
+ "المد": 269,
418
+ "##كل": 270,
419
+ "الاخ": 271,
420
+ "##ريك": 272,
421
+ "##جلس": 273,
422
+ "##كون": 274,
423
+ "##اح": 275,
424
+ "##عب": 276,
425
+ "##تر": 277,
426
+ "##حده": 278,
427
+ "##انت": 279,
428
+ "اليوم": 280,
429
+ "##خص": 281,
430
+ "##طين": 282,
431
+ "والم": 283,
432
+ "##زه": 284,
433
+ "وب": 285,
434
+ "اف": 286,
435
+ "##ثر": 287,
436
+ "##سه": 288,
437
+ "العرب": 289,
438
+ "##ويه": 290,
439
+ "تم": 291,
440
+ "لت": 292,
441
+ "الرءيس": 293,
442
+ "الشر": 294,
443
+ "##طل": 295,
444
+ "##ينه": 296,
445
+ "##سب": 297,
446
+ "##ند": 298,
447
+ "محمد": 299,
448
+ "رءيس": 300,
449
+ "عام": 301,
450
+ "##عا": 302,
451
+ "##طقه": 303,
452
+ "##لسطين": 304,
453
+ "##عل": 305,
454
+ "##وج": 306,
455
+ "وان": 307,
456
+ "##الم": 308,
457
+ "##وز": 309,
458
+ "الجزاء": 310,
459
+ "بم": 311,
460
+ "##صد": 312,
461
+ "يت": 313,
462
+ "##قر": 314,
463
+ "##ابه": 315,
464
+ "##نظ": 316,
465
+ "##يش": 317,
466
+ "##اعه": 318,
467
+ "##يين": 319,
468
+ "##يب": 320,
469
+ "##اك": 321,
470
+ "اح": 322,
471
+ "##تها": 323,
472
+ "مر": 324,
473
+ "##اعب": 325,
474
+ "قال": 326,
475
+ "##ركه": 327,
476
+ "اله": 328,
477
+ "##از": 329,
478
+ "##طن": 330,
479
+ "الط": 331,
480
+ "##تل": 332,
481
+ "المح": 333,
482
+ "الز": 334,
483
+ "وح": 335,
484
+ "##عم": 336,
485
+ "وس": 337,
486
+ "الله": 338,
487
+ "الغ": 339,
488
+ "اد": 340,
489
+ "##قات": 341,
490
+ "##رف": 342,
491
+ "##وه": 343,
492
+ "المع": 344,
493
+ "##ارات": 345,
494
+ "يوم": 346,
495
+ "حس": 347,
496
+ "##وان": 348,
497
+ "وع": 349,
498
+ "##نتخ": 350,
499
+ "بان": 351,
500
+ "مء": 352,
501
+ "##ليه": 353,
502
+ "##ذا": 354,
503
+ "بت": 355,
504
+ "##رين": 356,
505
+ "##كومه": 357,
506
+ "##فر": 358,
507
+ "للم": 359,
508
+ "بش": 360,
509
+ "##كت": 361,
510
+ "تح": 362,
511
+ "##ذلك": 363,
512
+ "وج": 364,
513
+ "الما": 365,
514
+ "##ءه": 366,
515
+ "فر": 367,
516
+ "مست": 368,
517
+ "##يان": 369,
518
+ "##لاث": 370,
519
+ "##يلي": 371,
520
+ "الاست": 372,
521
+ "بل": 373,
522
+ "##ناء": 374,
523
+ "المتحده": 375,
524
+ "##قيه": 376,
525
+ "مش": 377,
526
+ "##قاء": 378,
527
+ "##زال": 379,
528
+ "الاه": 380,
529
+ "##يع": 381,
530
+ "##اخل": 382,
531
+ "##اليه": 383,
532
+ "السعود": 384,
533
+ "##وريا": 385,
534
+ "المر": 386,
535
+ "الامريك": 387,
536
+ "وه": 388,
537
+ "وفي": 389,
538
+ "##ضاء": 390,
539
+ "##فت": 391,
540
+ "##ارج": 392,
541
+ "با": 393,
542
+ "تق": 394,
543
+ "الوز": 395,
544
+ "##ضه": 396,
545
+ "##حدث": 397,
546
+ "ات": 398,
547
+ "البل": 399,
548
+ "الجم": 400,
549
+ "##باراه": 401,
550
+ "##اسه": 402,
551
+ "##رات": 403,
552
+ "##ابع": 404,
553
+ "الامن": 405,
554
+ "جم": 406,
555
+ "الاول": 407,
556
+ "بح": 408,
557
+ "وف": 409,
558
+ "##راءيل": 410,
559
+ "##عي": 411,
560
+ "##كري": 412,
561
+ "##كم": 413,
562
+ "مصر": 414,
563
+ "##قط": 415,
564
+ "##واجه": 416,
565
+ "##مالك": 417,
566
+ "##رد": 418,
567
+ "##وريه": 419,
568
+ "##بيه": 420,
569
+ "##نس": 421,
570
+ "احد": 422,
571
+ "الاسلام": 423,
572
+ "الاهلي": 424,
573
+ "عدد": 425,
574
+ "الاع": 426,
575
+ "اكثر": 427,
576
+ "الجزاءر": 428,
577
+ "##ليم": 429,
578
+ "قر": 430,
579
+ "يكن": 431,
580
+ "##ضيه": 432,
581
+ "##طال": 433,
582
+ "##اصه": 434,
583
+ "##ركز": 435,
584
+ "##بل": 436,
585
+ "العالم": 437,
586
+ "الفلسطين": 438,
587
+ "حتي": 439,
588
+ "يع": 440,
589
+ "##زب": 441,
590
+ "##بت": 442,
591
+ "##يران": 443,
592
+ "الفر": 444,
593
+ "اق": 445,
594
+ "والت": 446,
595
+ "##فع": 447,
596
+ "##هدف": 448,
597
+ "المست": 449,
598
+ "مجلس": 450,
599
+ "اعل": 451,
600
+ "امام": 452,
601
+ "##نظيم": 453,
602
+ "ون": 454,
603
+ "##دا": 455,
604
+ "##قبل": 456,
605
+ "لا": 457,
606
+ "ول": 458,
607
+ "اص": 459,
608
+ "موق": 460,
609
+ "##جل": 461,
610
+ "المصري": 462,
611
+ "الدول": 463,
612
+ "##تحاد": 464,
613
+ "##ربع": 465,
614
+ "##وت": 466,
615
+ "خط": 467,
616
+ "##نيه": 468,
617
+ "الماضي": 469,
618
+ "مص": 470,
619
+ "##جد": 471,
620
+ "##امه": 472,
621
+ "##دري": 473,
622
+ "##ضع": 474,
623
+ "واضاف": 475,
624
+ "التح": 476,
625
+ "##ابات": 477,
626
+ "السل": 478,
627
+ "حم": 479,
628
+ "##صف": 480,
629
+ "الاف": 481,
630
+ "##ظام": 482,
631
+ "##انه": 483,
632
+ "مخ": 484,
633
+ "##اءل": 485,
634
+ "##يف": 486,
635
+ "تر": 487,
636
+ "##ضا": 488,
637
+ "##غرب": 489,
638
+ "##ملكه": 490,
639
+ "قاء": 491,
640
+ "تش": 492,
641
+ "السي": 493,
642
+ "تزال": 494,
643
+ "سوريا": 495,
644
+ "##جي": 496,
645
+ "انت": 497,
646
+ "##اسيه": 498,
647
+ "##اسي": 499
648
+ }
649
+ }
650
+ }
tokenizer_config.json CHANGED
@@ -1 +1,15 @@
1
- {"do_lower_case": true, "do_basic_tokenize": true, "never_split": null, "unk_token": "[UNK]", "sep_token": "[SEP]", "pad_token": "[PAD]", "cls_token": "[CLS]", "mask_token": "[MASK]", "tokenize_chinese_chars": true, "strip_accents": null, "special_tokens_map_file": null, "tokenizer_file": null, "name_or_path": "/project/6007993/elmadany/Models/MARBERT_17M/pytorch_verison/"}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "clean_up_tokenization_spaces": true,
3
+ "cls_token": "[CLS]",
4
+ "do_basic_tokenize": true,
5
+ "do_lower_case": true,
6
+ "mask_token": "[MASK]",
7
+ "model_max_length": 1000000000000000019884624838656,
8
+ "never_split": null,
9
+ "pad_token": "[PAD]",
10
+ "sep_token": "[SEP]",
11
+ "strip_accents": null,
12
+ "tokenize_chinese_chars": true,
13
+ "tokenizer_class": "BertTokenizer",
14
+ "unk_token": "[UNK]"
15
+ }
vocab.txt CHANGED
The diff for this file is too large to render. See raw diff