Upload tokenizer
#6
by
lamaabdulaziz
- opened
- special_tokens_map.json +7 -1
- tokenizer.json +650 -0
- tokenizer_config.json +15 -1
- vocab.txt +0 -0
special_tokens_map.json
CHANGED
@@ -1 +1,7 @@
|
|
1 |
-
{
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"cls_token": "[CLS]",
|
3 |
+
"mask_token": "[MASK]",
|
4 |
+
"pad_token": "[PAD]",
|
5 |
+
"sep_token": "[SEP]",
|
6 |
+
"unk_token": "[UNK]"
|
7 |
+
}
|
tokenizer.json
ADDED
@@ -0,0 +1,650 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"version": "1.0",
|
3 |
+
"truncation": null,
|
4 |
+
"padding": null,
|
5 |
+
"added_tokens": [
|
6 |
+
{
|
7 |
+
"id": 0,
|
8 |
+
"content": "[PAD]",
|
9 |
+
"single_word": false,
|
10 |
+
"lstrip": false,
|
11 |
+
"rstrip": false,
|
12 |
+
"normalized": false,
|
13 |
+
"special": true
|
14 |
+
},
|
15 |
+
{
|
16 |
+
"id": 1,
|
17 |
+
"content": "[UNK]",
|
18 |
+
"single_word": false,
|
19 |
+
"lstrip": false,
|
20 |
+
"rstrip": false,
|
21 |
+
"normalized": false,
|
22 |
+
"special": true
|
23 |
+
},
|
24 |
+
{
|
25 |
+
"id": 2,
|
26 |
+
"content": "[CLS]",
|
27 |
+
"single_word": false,
|
28 |
+
"lstrip": false,
|
29 |
+
"rstrip": false,
|
30 |
+
"normalized": false,
|
31 |
+
"special": true
|
32 |
+
},
|
33 |
+
{
|
34 |
+
"id": 3,
|
35 |
+
"content": "[SEP]",
|
36 |
+
"single_word": false,
|
37 |
+
"lstrip": false,
|
38 |
+
"rstrip": false,
|
39 |
+
"normalized": false,
|
40 |
+
"special": true
|
41 |
+
},
|
42 |
+
{
|
43 |
+
"id": 4,
|
44 |
+
"content": "[MASK]",
|
45 |
+
"single_word": false,
|
46 |
+
"lstrip": false,
|
47 |
+
"rstrip": false,
|
48 |
+
"normalized": false,
|
49 |
+
"special": true
|
50 |
+
}
|
51 |
+
],
|
52 |
+
"normalizer": {
|
53 |
+
"type": "BertNormalizer",
|
54 |
+
"clean_text": true,
|
55 |
+
"handle_chinese_chars": true,
|
56 |
+
"strip_accents": null,
|
57 |
+
"lowercase": true
|
58 |
+
},
|
59 |
+
"pre_tokenizer": {
|
60 |
+
"type": "BertPreTokenizer"
|
61 |
+
},
|
62 |
+
"post_processor": {
|
63 |
+
"type": "TemplateProcessing",
|
64 |
+
"single": [
|
65 |
+
{
|
66 |
+
"SpecialToken": {
|
67 |
+
"id": "[CLS]",
|
68 |
+
"type_id": 0
|
69 |
+
}
|
70 |
+
},
|
71 |
+
{
|
72 |
+
"Sequence": {
|
73 |
+
"id": "A",
|
74 |
+
"type_id": 0
|
75 |
+
}
|
76 |
+
},
|
77 |
+
{
|
78 |
+
"SpecialToken": {
|
79 |
+
"id": "[SEP]",
|
80 |
+
"type_id": 0
|
81 |
+
}
|
82 |
+
}
|
83 |
+
],
|
84 |
+
"pair": [
|
85 |
+
{
|
86 |
+
"SpecialToken": {
|
87 |
+
"id": "[CLS]",
|
88 |
+
"type_id": 0
|
89 |
+
}
|
90 |
+
},
|
91 |
+
{
|
92 |
+
"Sequence": {
|
93 |
+
"id": "A",
|
94 |
+
"type_id": 0
|
95 |
+
}
|
96 |
+
},
|
97 |
+
{
|
98 |
+
"SpecialToken": {
|
99 |
+
"id": "[SEP]",
|
100 |
+
"type_id": 0
|
101 |
+
}
|
102 |
+
},
|
103 |
+
{
|
104 |
+
"Sequence": {
|
105 |
+
"id": "B",
|
106 |
+
"type_id": 1
|
107 |
+
}
|
108 |
+
},
|
109 |
+
{
|
110 |
+
"SpecialToken": {
|
111 |
+
"id": "[SEP]",
|
112 |
+
"type_id": 1
|
113 |
+
}
|
114 |
+
}
|
115 |
+
],
|
116 |
+
"special_tokens": {
|
117 |
+
"[CLS]": {
|
118 |
+
"id": "[CLS]",
|
119 |
+
"ids": [
|
120 |
+
2
|
121 |
+
],
|
122 |
+
"tokens": [
|
123 |
+
"[CLS]"
|
124 |
+
]
|
125 |
+
},
|
126 |
+
"[SEP]": {
|
127 |
+
"id": "[SEP]",
|
128 |
+
"ids": [
|
129 |
+
3
|
130 |
+
],
|
131 |
+
"tokens": [
|
132 |
+
"[SEP]"
|
133 |
+
]
|
134 |
+
}
|
135 |
+
}
|
136 |
+
},
|
137 |
+
"decoder": {
|
138 |
+
"type": "WordPiece",
|
139 |
+
"prefix": "##",
|
140 |
+
"cleanup": true
|
141 |
+
},
|
142 |
+
"model": {
|
143 |
+
"type": "WordPiece",
|
144 |
+
"unk_token": "[UNK]",
|
145 |
+
"continuing_subword_prefix": "##",
|
146 |
+
"max_input_chars_per_word": 100,
|
147 |
+
"vocab": {
|
148 |
+
"[PAD]": 0,
|
149 |
+
"[UNK]": 1,
|
150 |
+
"[CLS]": 2,
|
151 |
+
"[SEP]": 3,
|
152 |
+
"[MASK]": 4,
|
153 |
+
"ء": 5,
|
154 |
+
"ا": 6,
|
155 |
+
"ب": 7,
|
156 |
+
"ت": 8,
|
157 |
+
"ث": 9,
|
158 |
+
"ج": 10,
|
159 |
+
"ح": 11,
|
160 |
+
"خ": 12,
|
161 |
+
"د": 13,
|
162 |
+
"ذ": 14,
|
163 |
+
"ر": 15,
|
164 |
+
"ز": 16,
|
165 |
+
"س": 17,
|
166 |
+
"ش": 18,
|
167 |
+
"ص": 19,
|
168 |
+
"ض": 20,
|
169 |
+
"ط": 21,
|
170 |
+
"ظ": 22,
|
171 |
+
"ع": 23,
|
172 |
+
"غ": 24,
|
173 |
+
"ف": 25,
|
174 |
+
"ق": 26,
|
175 |
+
"ك": 27,
|
176 |
+
"ل": 28,
|
177 |
+
"م": 29,
|
178 |
+
"ن": 30,
|
179 |
+
"ه": 31,
|
180 |
+
"و": 32,
|
181 |
+
"ي": 33,
|
182 |
+
"پ": 34,
|
183 |
+
"ލ": 35,
|
184 |
+
"##ل": 36,
|
185 |
+
"##ب": 37,
|
186 |
+
"##ر": 38,
|
187 |
+
"##ق": 39,
|
188 |
+
"##ي": 40,
|
189 |
+
"##ه": 41,
|
190 |
+
"##ن": 42,
|
191 |
+
"##ج": 43,
|
192 |
+
"##غ": 44,
|
193 |
+
"##ع": 45,
|
194 |
+
"##ض": 46,
|
195 |
+
"##ح": 47,
|
196 |
+
"##ك": 48,
|
197 |
+
"##و": 49,
|
198 |
+
"##م": 50,
|
199 |
+
"##ت": 51,
|
200 |
+
"##ش": 52,
|
201 |
+
"##ا": 53,
|
202 |
+
"##خ": 54,
|
203 |
+
"##ف": 55,
|
204 |
+
"##ث": 56,
|
205 |
+
"##ز": 57,
|
206 |
+
"##د": 58,
|
207 |
+
"##ء": 59,
|
208 |
+
"##س": 60,
|
209 |
+
"##ظ": 61,
|
210 |
+
"##ط": 62,
|
211 |
+
"##ذ": 63,
|
212 |
+
"##ص": 64,
|
213 |
+
"ال": 65,
|
214 |
+
"##ال": 66,
|
215 |
+
"##يه": 67,
|
216 |
+
"الم": 68,
|
217 |
+
"الا": 69,
|
218 |
+
"##ات": 70,
|
219 |
+
"##ان": 71,
|
220 |
+
"##ري": 72,
|
221 |
+
"##لي": 73,
|
222 |
+
"##اء": 74,
|
223 |
+
"##ار": 75,
|
224 |
+
"##ام": 76,
|
225 |
+
"ان": 77,
|
226 |
+
"##ين": 78,
|
227 |
+
"##اد": 79,
|
228 |
+
"##ير": 80,
|
229 |
+
"##اب": 81,
|
230 |
+
"##ول": 82,
|
231 |
+
"علي": 83,
|
232 |
+
"##ون": 84,
|
233 |
+
"##ها": 85,
|
234 |
+
"الع": 86,
|
235 |
+
"##اع": 87,
|
236 |
+
"وال": 88,
|
237 |
+
"##ست": 89,
|
238 |
+
"الس": 90,
|
239 |
+
"الي": 91,
|
240 |
+
"لل": 92,
|
241 |
+
"الت": 93,
|
242 |
+
"##ور": 94,
|
243 |
+
"##اس": 95,
|
244 |
+
"##اف": 96,
|
245 |
+
"الج": 97,
|
246 |
+
"##مه": 98,
|
247 |
+
"##يد": 99,
|
248 |
+
"الح": 100,
|
249 |
+
"الق": 101,
|
250 |
+
"##رب": 102,
|
251 |
+
"##وا": 103,
|
252 |
+
"##يا": 104,
|
253 |
+
"الف": 105,
|
254 |
+
"##ره": 106,
|
255 |
+
"بال": 107,
|
256 |
+
"##له": 108,
|
257 |
+
"##ود": 109,
|
258 |
+
"##را": 110,
|
259 |
+
"##وم": 111,
|
260 |
+
"الد": 112,
|
261 |
+
"##لا": 113,
|
262 |
+
"##هم": 114,
|
263 |
+
"الش": 115,
|
264 |
+
"وا": 116,
|
265 |
+
"##حد": 117,
|
266 |
+
"##يس": 118,
|
267 |
+
"##نا": 119,
|
268 |
+
"الب": 120,
|
269 |
+
"##قه": 121,
|
270 |
+
"##يل": 122,
|
271 |
+
"##من": 123,
|
272 |
+
"##عه": 124,
|
273 |
+
"الر": 125,
|
274 |
+
"##قي": 126,
|
275 |
+
"##رك": 127,
|
276 |
+
"##نت": 128,
|
277 |
+
"##اه": 129,
|
278 |
+
"وت": 130,
|
279 |
+
"##مد": 131,
|
280 |
+
"##قد": 132,
|
281 |
+
"##في": 133,
|
282 |
+
"##وي": 134,
|
283 |
+
"الو": 135,
|
284 |
+
"##اره": 136,
|
285 |
+
"الن": 137,
|
286 |
+
"##مل": 138,
|
287 |
+
"##مر": 139,
|
288 |
+
"##ته": 140,
|
289 |
+
"##لس": 141,
|
290 |
+
"اع": 142,
|
291 |
+
"##يم": 143,
|
292 |
+
"وق": 144,
|
293 |
+
"الاس": 145,
|
294 |
+
"##راء": 146,
|
295 |
+
"##وري": 147,
|
296 |
+
"الخ": 148,
|
297 |
+
"مح": 149,
|
298 |
+
"##جه": 150,
|
299 |
+
"##ءيس": 151,
|
300 |
+
"##بي": 152,
|
301 |
+
"##به": 153,
|
302 |
+
"##ني": 154,
|
303 |
+
"##صر": 155,
|
304 |
+
"##عد": 156,
|
305 |
+
"##كن": 157,
|
306 |
+
"##وق": 158,
|
307 |
+
"ام": 159,
|
308 |
+
"##لال": 160,
|
309 |
+
"##هد": 161,
|
310 |
+
"##وس": 162,
|
311 |
+
"اس": 163,
|
312 |
+
"##بد": 164,
|
313 |
+
"##بر": 165,
|
314 |
+
"##حه": 166,
|
315 |
+
"##تي": 167,
|
316 |
+
"##لام": 168,
|
317 |
+
"##مال": 169,
|
318 |
+
"##لم": 170,
|
319 |
+
"الص": 171,
|
320 |
+
"الث": 172,
|
321 |
+
"##كر": 173,
|
322 |
+
"##تم": 174,
|
323 |
+
"##فا": 175,
|
324 |
+
"من": 176,
|
325 |
+
"##ده": 177,
|
326 |
+
"المت": 178,
|
327 |
+
"است": 179,
|
328 |
+
"الام": 180,
|
329 |
+
"##وله": 181,
|
330 |
+
"##اني": 182,
|
331 |
+
"##قت": 183,
|
332 |
+
"##ما": 184,
|
333 |
+
"##ريق": 185,
|
334 |
+
"##حت": 186,
|
335 |
+
"الك": 187,
|
336 |
+
"##سي": 188,
|
337 |
+
"اي": 189,
|
338 |
+
"##قل": 190,
|
339 |
+
"##جم": 191,
|
340 |
+
"##با": 192,
|
341 |
+
"##اص": 193,
|
342 |
+
"##دي": 194,
|
343 |
+
"##فه": 195,
|
344 |
+
"او": 196,
|
345 |
+
"##ضي": 197,
|
346 |
+
"##وع": 198,
|
347 |
+
"اك": 199,
|
348 |
+
"بن": 200,
|
349 |
+
"##وات": 201,
|
350 |
+
"##شر": 202,
|
351 |
+
"##طه": 203,
|
352 |
+
"##كه": 204,
|
353 |
+
"##بار": 205,
|
354 |
+
"##زي": 206,
|
355 |
+
"##نه": 207,
|
356 |
+
"مس": 208,
|
357 |
+
"##تح": 209,
|
358 |
+
"##لك": 210,
|
359 |
+
"وك": 211,
|
360 |
+
"وي": 212,
|
361 |
+
"اب": 213,
|
362 |
+
"اخ": 214,
|
363 |
+
"##وض": 215,
|
364 |
+
"خلال": 216,
|
365 |
+
"##ادي": 217,
|
366 |
+
"##عت": 218,
|
367 |
+
"##شار": 219,
|
368 |
+
"##صل": 220,
|
369 |
+
"##الي": 221,
|
370 |
+
"##قب": 222,
|
371 |
+
"سي": 223,
|
372 |
+
"##اله": 224,
|
373 |
+
"##رت": 225,
|
374 |
+
"##اري": 226,
|
375 |
+
"وم": 227,
|
376 |
+
"وقال": 228,
|
377 |
+
"الل": 229,
|
378 |
+
"المس": 230,
|
379 |
+
"الان": 231,
|
380 |
+
"##اده": 232,
|
381 |
+
"##ولي": 233,
|
382 |
+
"انه": 234,
|
383 |
+
"##خل": 235,
|
384 |
+
"##هر": 236,
|
385 |
+
"مد": 237,
|
386 |
+
"##اج": 238,
|
387 |
+
"عبد": 239,
|
388 |
+
"##دد": 240,
|
389 |
+
"##زاء": 241,
|
390 |
+
"##وب": 242,
|
391 |
+
"##يره": 243,
|
392 |
+
"اج": 244,
|
393 |
+
"##دم": 245,
|
394 |
+
"##عود": 246,
|
395 |
+
"مع": 247,
|
396 |
+
"مت": 248,
|
397 |
+
"##قا": 249,
|
398 |
+
"##وف": 250,
|
399 |
+
"##اي": 251,
|
400 |
+
"##وره": 252,
|
401 |
+
"##حي": 253,
|
402 |
+
"العام": 254,
|
403 |
+
"المن": 255,
|
404 |
+
"بر": 256,
|
405 |
+
"##رض": 257,
|
406 |
+
"##انيه": 258,
|
407 |
+
"لم": 259,
|
408 |
+
"##سم": 260,
|
409 |
+
"##صري": 261,
|
410 |
+
"وز": 262,
|
411 |
+
"تع": 263,
|
412 |
+
"##طر": 264,
|
413 |
+
"##كو": 265,
|
414 |
+
"##ديد": 266,
|
415 |
+
"بد": 267,
|
416 |
+
"##ضاف": 268,
|
417 |
+
"المد": 269,
|
418 |
+
"##كل": 270,
|
419 |
+
"الاخ": 271,
|
420 |
+
"##ريك": 272,
|
421 |
+
"##جلس": 273,
|
422 |
+
"##كون": 274,
|
423 |
+
"##اح": 275,
|
424 |
+
"##عب": 276,
|
425 |
+
"##تر": 277,
|
426 |
+
"##حده": 278,
|
427 |
+
"##انت": 279,
|
428 |
+
"اليوم": 280,
|
429 |
+
"##خص": 281,
|
430 |
+
"##طين": 282,
|
431 |
+
"والم": 283,
|
432 |
+
"##زه": 284,
|
433 |
+
"وب": 285,
|
434 |
+
"اف": 286,
|
435 |
+
"##ثر": 287,
|
436 |
+
"##سه": 288,
|
437 |
+
"العرب": 289,
|
438 |
+
"##ويه": 290,
|
439 |
+
"تم": 291,
|
440 |
+
"لت": 292,
|
441 |
+
"الرءيس": 293,
|
442 |
+
"الشر": 294,
|
443 |
+
"##طل": 295,
|
444 |
+
"##ينه": 296,
|
445 |
+
"##سب": 297,
|
446 |
+
"##ند": 298,
|
447 |
+
"محمد": 299,
|
448 |
+
"رءيس": 300,
|
449 |
+
"عام": 301,
|
450 |
+
"##عا": 302,
|
451 |
+
"##طقه": 303,
|
452 |
+
"##لسطين": 304,
|
453 |
+
"##عل": 305,
|
454 |
+
"##وج": 306,
|
455 |
+
"وان": 307,
|
456 |
+
"##الم": 308,
|
457 |
+
"##وز": 309,
|
458 |
+
"الجزاء": 310,
|
459 |
+
"بم": 311,
|
460 |
+
"##صد": 312,
|
461 |
+
"يت": 313,
|
462 |
+
"##قر": 314,
|
463 |
+
"##ابه": 315,
|
464 |
+
"##نظ": 316,
|
465 |
+
"##يش": 317,
|
466 |
+
"##اعه": 318,
|
467 |
+
"##يين": 319,
|
468 |
+
"##يب": 320,
|
469 |
+
"##اك": 321,
|
470 |
+
"اح": 322,
|
471 |
+
"##تها": 323,
|
472 |
+
"مر": 324,
|
473 |
+
"##اعب": 325,
|
474 |
+
"قال": 326,
|
475 |
+
"##ركه": 327,
|
476 |
+
"اله": 328,
|
477 |
+
"##از": 329,
|
478 |
+
"##طن": 330,
|
479 |
+
"الط": 331,
|
480 |
+
"##تل": 332,
|
481 |
+
"المح": 333,
|
482 |
+
"الز": 334,
|
483 |
+
"وح": 335,
|
484 |
+
"##عم": 336,
|
485 |
+
"وس": 337,
|
486 |
+
"الله": 338,
|
487 |
+
"الغ": 339,
|
488 |
+
"اد": 340,
|
489 |
+
"##قات": 341,
|
490 |
+
"##رف": 342,
|
491 |
+
"##وه": 343,
|
492 |
+
"المع": 344,
|
493 |
+
"##ارات": 345,
|
494 |
+
"يوم": 346,
|
495 |
+
"حس": 347,
|
496 |
+
"##وان": 348,
|
497 |
+
"وع": 349,
|
498 |
+
"##نتخ": 350,
|
499 |
+
"بان": 351,
|
500 |
+
"مء": 352,
|
501 |
+
"##ليه": 353,
|
502 |
+
"##ذا": 354,
|
503 |
+
"بت": 355,
|
504 |
+
"##رين": 356,
|
505 |
+
"##كومه": 357,
|
506 |
+
"##فر": 358,
|
507 |
+
"للم": 359,
|
508 |
+
"بش": 360,
|
509 |
+
"##كت": 361,
|
510 |
+
"تح": 362,
|
511 |
+
"##ذلك": 363,
|
512 |
+
"وج": 364,
|
513 |
+
"الما": 365,
|
514 |
+
"##ءه": 366,
|
515 |
+
"فر": 367,
|
516 |
+
"مست": 368,
|
517 |
+
"##يان": 369,
|
518 |
+
"##لاث": 370,
|
519 |
+
"##يلي": 371,
|
520 |
+
"الاست": 372,
|
521 |
+
"بل": 373,
|
522 |
+
"##ناء": 374,
|
523 |
+
"المتحده": 375,
|
524 |
+
"##قيه": 376,
|
525 |
+
"مش": 377,
|
526 |
+
"##قاء": 378,
|
527 |
+
"##زال": 379,
|
528 |
+
"الاه": 380,
|
529 |
+
"##يع": 381,
|
530 |
+
"##اخل": 382,
|
531 |
+
"##اليه": 383,
|
532 |
+
"السعود": 384,
|
533 |
+
"##وريا": 385,
|
534 |
+
"المر": 386,
|
535 |
+
"الامريك": 387,
|
536 |
+
"وه": 388,
|
537 |
+
"وفي": 389,
|
538 |
+
"##ضاء": 390,
|
539 |
+
"##فت": 391,
|
540 |
+
"##ارج": 392,
|
541 |
+
"با": 393,
|
542 |
+
"تق": 394,
|
543 |
+
"الوز": 395,
|
544 |
+
"##ضه": 396,
|
545 |
+
"##حدث": 397,
|
546 |
+
"ات": 398,
|
547 |
+
"البل": 399,
|
548 |
+
"الجم": 400,
|
549 |
+
"##باراه": 401,
|
550 |
+
"##اسه": 402,
|
551 |
+
"##رات": 403,
|
552 |
+
"##ابع": 404,
|
553 |
+
"الامن": 405,
|
554 |
+
"جم": 406,
|
555 |
+
"الاول": 407,
|
556 |
+
"بح": 408,
|
557 |
+
"وف": 409,
|
558 |
+
"##راءيل": 410,
|
559 |
+
"##عي": 411,
|
560 |
+
"##كري": 412,
|
561 |
+
"##كم": 413,
|
562 |
+
"مصر": 414,
|
563 |
+
"##قط": 415,
|
564 |
+
"##واجه": 416,
|
565 |
+
"##مالك": 417,
|
566 |
+
"##رد": 418,
|
567 |
+
"##وريه": 419,
|
568 |
+
"##بيه": 420,
|
569 |
+
"##نس": 421,
|
570 |
+
"احد": 422,
|
571 |
+
"الاسلام": 423,
|
572 |
+
"الاهلي": 424,
|
573 |
+
"عدد": 425,
|
574 |
+
"الاع": 426,
|
575 |
+
"اكثر": 427,
|
576 |
+
"الجزاءر": 428,
|
577 |
+
"##ليم": 429,
|
578 |
+
"قر": 430,
|
579 |
+
"يكن": 431,
|
580 |
+
"##ضيه": 432,
|
581 |
+
"##طال": 433,
|
582 |
+
"##اصه": 434,
|
583 |
+
"##ركز": 435,
|
584 |
+
"##بل": 436,
|
585 |
+
"العالم": 437,
|
586 |
+
"الفلسطين": 438,
|
587 |
+
"حتي": 439,
|
588 |
+
"يع": 440,
|
589 |
+
"##زب": 441,
|
590 |
+
"##بت": 442,
|
591 |
+
"##يران": 443,
|
592 |
+
"الفر": 444,
|
593 |
+
"اق": 445,
|
594 |
+
"والت": 446,
|
595 |
+
"##فع": 447,
|
596 |
+
"##هدف": 448,
|
597 |
+
"المست": 449,
|
598 |
+
"مجلس": 450,
|
599 |
+
"اعل": 451,
|
600 |
+
"امام": 452,
|
601 |
+
"##نظيم": 453,
|
602 |
+
"ون": 454,
|
603 |
+
"##دا": 455,
|
604 |
+
"##قبل": 456,
|
605 |
+
"لا": 457,
|
606 |
+
"ول": 458,
|
607 |
+
"اص": 459,
|
608 |
+
"موق": 460,
|
609 |
+
"##جل": 461,
|
610 |
+
"المصري": 462,
|
611 |
+
"الدول": 463,
|
612 |
+
"##تحاد": 464,
|
613 |
+
"##ربع": 465,
|
614 |
+
"##وت": 466,
|
615 |
+
"خط": 467,
|
616 |
+
"##نيه": 468,
|
617 |
+
"الماضي": 469,
|
618 |
+
"مص": 470,
|
619 |
+
"##جد": 471,
|
620 |
+
"##امه": 472,
|
621 |
+
"##دري": 473,
|
622 |
+
"##ضع": 474,
|
623 |
+
"واضاف": 475,
|
624 |
+
"التح": 476,
|
625 |
+
"##ابات": 477,
|
626 |
+
"السل": 478,
|
627 |
+
"حم": 479,
|
628 |
+
"##صف": 480,
|
629 |
+
"الاف": 481,
|
630 |
+
"##ظام": 482,
|
631 |
+
"##انه": 483,
|
632 |
+
"مخ": 484,
|
633 |
+
"##اءل": 485,
|
634 |
+
"##يف": 486,
|
635 |
+
"تر": 487,
|
636 |
+
"##ضا": 488,
|
637 |
+
"##غرب": 489,
|
638 |
+
"##ملكه": 490,
|
639 |
+
"قاء": 491,
|
640 |
+
"تش": 492,
|
641 |
+
"السي": 493,
|
642 |
+
"تزال": 494,
|
643 |
+
"سوريا": 495,
|
644 |
+
"##جي": 496,
|
645 |
+
"انت": 497,
|
646 |
+
"##اسيه": 498,
|
647 |
+
"##اسي": 499
|
648 |
+
}
|
649 |
+
}
|
650 |
+
}
|
tokenizer_config.json
CHANGED
@@ -1 +1,15 @@
|
|
1 |
-
{
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"clean_up_tokenization_spaces": true,
|
3 |
+
"cls_token": "[CLS]",
|
4 |
+
"do_basic_tokenize": true,
|
5 |
+
"do_lower_case": true,
|
6 |
+
"mask_token": "[MASK]",
|
7 |
+
"model_max_length": 1000000000000000019884624838656,
|
8 |
+
"never_split": null,
|
9 |
+
"pad_token": "[PAD]",
|
10 |
+
"sep_token": "[SEP]",
|
11 |
+
"strip_accents": null,
|
12 |
+
"tokenize_chinese_chars": true,
|
13 |
+
"tokenizer_class": "BertTokenizer",
|
14 |
+
"unk_token": "[UNK]"
|
15 |
+
}
|
vocab.txt
CHANGED
The diff for this file is too large to render.
See raw diff
|
|