strickvl committed on
Commit
57c829b
1 Parent(s): 4e1f514

update tokenizer script

Browse files
Files changed (1) hide show
  1. src/train_tokenizer.ipynb +1047 -18
src/train_tokenizer.ipynb CHANGED
@@ -30,7 +30,7 @@
30
  },
31
  {
32
  "cell_type": "code",
33
- "execution_count": 13,
34
  "metadata": {},
35
  "outputs": [
36
  {
@@ -39,7 +39,7 @@
39
  "4294"
40
  ]
41
  },
42
- "execution_count": 13,
43
  "metadata": {},
44
  "output_type": "execute_result"
45
  }
@@ -65,7 +65,7 @@
65
  },
66
  {
67
  "cell_type": "code",
68
- "execution_count": 17,
69
  "metadata": {},
70
  "outputs": [],
71
  "source": [
@@ -87,7 +87,7 @@
87
  },
88
  {
89
  "cell_type": "code",
90
- "execution_count": 18,
91
  "metadata": {},
92
  "outputs": [],
93
  "source": [
@@ -102,7 +102,7 @@
102
  },
103
  {
104
  "cell_type": "code",
105
- "execution_count": 19,
106
  "metadata": {},
107
  "outputs": [],
108
  "source": [
@@ -114,16 +114,25 @@
114
  },
115
  {
116
  "cell_type": "code",
117
- "execution_count": 27,
 
 
 
 
 
 
 
 
 
 
118
  "metadata": {},
119
  "outputs": [],
120
  "source": [
121
  "from tokenizers.trainers import BpeTrainer\n",
122
  "\n",
123
- "# trainer = BpeTrainer(vocab_size=25000, min_frequency=2)\n",
124
  "trainer = BpeTrainer(\n",
125
  " min_frequency=2,\n",
126
- " vocab_size=100000,\n",
127
  " special_tokens=[\"[UNK]\", \"[CLS]\", \"[SEP]\", \"[PAD]\", \"[MASK]\"],\n",
128
  " show_progress=True,\n",
129
  ")"
@@ -131,7 +140,7 @@
131
  },
132
  {
133
  "cell_type": "code",
134
- "execution_count": 22,
135
  "metadata": {},
136
  "outputs": [
137
  {
@@ -140,7 +149,7 @@
140
  "4294"
141
  ]
142
  },
143
- "execution_count": 22,
144
  "metadata": {},
145
  "output_type": "execute_result"
146
  }
@@ -156,7 +165,7 @@
156
  },
157
  {
158
  "cell_type": "code",
159
- "execution_count": 28,
160
  "metadata": {},
161
  "outputs": [
162
  {
@@ -175,16 +184,16 @@
175
  },
176
  {
177
  "cell_type": "code",
178
- "execution_count": 29,
179
  "metadata": {},
180
  "outputs": [
181
  {
182
  "data": {
183
  "text/plain": [
184
- "<tokenizers.models.BPE at 0x140d828f0>"
185
  ]
186
  },
187
- "execution_count": 29,
188
  "metadata": {},
189
  "output_type": "execute_result"
190
  }
@@ -195,16 +204,16 @@
195
  },
196
  {
197
  "cell_type": "code",
198
- "execution_count": 30,
199
  "metadata": {},
200
  "outputs": [
201
  {
202
  "data": {
203
  "text/plain": [
204
- "100000"
205
  ]
206
  },
207
- "execution_count": 30,
208
  "metadata": {},
209
  "output_type": "execute_result"
210
  }
@@ -215,7 +224,1027 @@
215
  },
216
  {
217
  "cell_type": "code",
218
- "execution_count": 31,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
219
  "metadata": {},
220
  "outputs": [],
221
  "source": [
 
30
  },
31
  {
32
  "cell_type": "code",
33
+ "execution_count": 1,
34
  "metadata": {},
35
  "outputs": [
36
  {
 
39
  "4294"
40
  ]
41
  },
42
+ "execution_count": 1,
43
  "metadata": {},
44
  "output_type": "execute_result"
45
  }
 
65
  },
66
  {
67
  "cell_type": "code",
68
+ "execution_count": 2,
69
  "metadata": {},
70
  "outputs": [],
71
  "source": [
 
87
  },
88
  {
89
  "cell_type": "code",
90
+ "execution_count": 3,
91
  "metadata": {},
92
  "outputs": [],
93
  "source": [
 
102
  },
103
  {
104
  "cell_type": "code",
105
+ "execution_count": 4,
106
  "metadata": {},
107
  "outputs": [],
108
  "source": [
 
114
  },
115
  {
116
  "cell_type": "code",
117
+ "execution_count": 5,
118
+ "metadata": {},
119
+ "outputs": [],
120
+ "source": [
121
+ "from tokenizers.pre_tokenizers import Whitespace\n",
122
+ "tokenizer.pre_tokenizer = Whitespace()"
123
+ ]
124
+ },
125
+ {
126
+ "cell_type": "code",
127
+ "execution_count": 6,
128
  "metadata": {},
129
  "outputs": [],
130
  "source": [
131
  "from tokenizers.trainers import BpeTrainer\n",
132
  "\n",
 
133
  "trainer = BpeTrainer(\n",
134
  " min_frequency=2,\n",
135
+ " vocab_size=40000,\n",
136
  " special_tokens=[\"[UNK]\", \"[CLS]\", \"[SEP]\", \"[PAD]\", \"[MASK]\"],\n",
137
  " show_progress=True,\n",
138
  ")"
 
140
  },
141
  {
142
  "cell_type": "code",
143
+ "execution_count": 7,
144
  "metadata": {},
145
  "outputs": [
146
  {
 
149
  "4294"
150
  ]
151
  },
152
+ "execution_count": 7,
153
  "metadata": {},
154
  "output_type": "execute_result"
155
  }
 
165
  },
166
  {
167
  "cell_type": "code",
168
+ "execution_count": 8,
169
  "metadata": {},
170
  "outputs": [
171
  {
 
184
  },
185
  {
186
  "cell_type": "code",
187
+ "execution_count": 9,
188
  "metadata": {},
189
  "outputs": [
190
  {
191
  "data": {
192
  "text/plain": [
193
+ "<tokenizers.models.BPE at 0x114910dd0>"
194
  ]
195
  },
196
+ "execution_count": 9,
197
  "metadata": {},
198
  "output_type": "execute_result"
199
  }
 
204
  },
205
  {
206
  "cell_type": "code",
207
+ "execution_count": 10,
208
  "metadata": {},
209
  "outputs": [
210
  {
211
  "data": {
212
  "text/plain": [
213
+ "40000"
214
  ]
215
  },
216
+ "execution_count": 10,
217
  "metadata": {},
218
  "output_type": "execute_result"
219
  }
 
224
  },
225
  {
226
  "cell_type": "code",
227
+ "execution_count": 11,
228
+ "metadata": {},
229
+ "outputs": [
230
+ {
231
+ "data": {
232
+ "text/plain": [
233
+ "{'وداع': 21045,\n",
234
+ " 'وسیل': 16315,\n",
235
+ " 'گھنٹہ': 15020,\n",
236
+ " 'لسانیں': 24958,\n",
237
+ " 'نیکی': 4830,\n",
238
+ " 'پیِ': 34528,\n",
239
+ " 'ہد': 20306,\n",
240
+ " 'ریموٹ': 39651,\n",
241
+ " 'ولک': 35099,\n",
242
+ " 'مّا': 21551,\n",
243
+ " 'هال': 2349,\n",
244
+ " 'دیئیں': 39545,\n",
245
+ " 'همگرنچ': 19468,\n",
246
+ " 'ربیدگءَ': 26168,\n",
247
+ " 'ۓِ': 13276,\n",
248
+ " 'اَہ': 10481,\n",
249
+ " 'پمیش': 1235,\n",
250
+ " 'علاقہ': 6854,\n",
251
+ " 'زمانگءَ': 3377,\n",
252
+ " 'مزھرا': 10075,\n",
253
+ " 'مُک': 4909,\n",
254
+ " 'وتسر': 13772,\n",
255
+ " 'بندگءَ': 14795,\n",
256
+ " 'شن': 1353,\n",
257
+ " 'آنگو': 7345,\n",
258
+ " 'پْروشی': 29354,\n",
259
+ " 'كپت': 19130,\n",
260
+ " 'ﮕﺎﺭ': 30181,\n",
261
+ " 'کُشان': 30562,\n",
262
+ " 'لسانی': 5701,\n",
263
+ " 'لاطاکیه': 38356,\n",
264
+ " 'قربانیءِ': 37641,\n",
265
+ " 'وانس': 25383,\n",
266
+ " 'بےاںت': 7657,\n",
267
+ " 'گناہ': 4020,\n",
268
+ " 'ںْ': 14704,\n",
269
+ " 'بوسگے': 32604,\n",
270
+ " 'بیسیمہءِ': 22887,\n",
271
+ " 'śéń': 38410,\n",
272
+ " 'طلا': 20225,\n",
273
+ " 'نڈُک': 19500,\n",
274
+ " 'ٹہینتگ': 20009,\n",
275
+ " 'ترس': 1897,\n",
276
+ " 'ﮎ': 376,\n",
277
+ " 'السلام': 25174,\n",
278
+ " 'کماتگ': 12586,\n",
279
+ " 'پدریچءِ': 26446,\n",
280
+ " 'ﻤﺎﺭ': 27352,\n",
281
+ " 'تَہار': 22655,\n",
282
+ " 'دیّت': 30338,\n",
283
+ " 'کشّگ': 3752,\n",
284
+ " 'باریں': 1136,\n",
285
+ " 'نَدر': 22263,\n",
286
+ " 'سِکّ': 22182,\n",
287
+ " 'کودکی': 12401,\n",
288
+ " 'جتاءُ': 31366,\n",
289
+ " 'لّت': 11285,\n",
290
+ " 'ستانے': 31050,\n",
291
+ " 'دێمروی': 21246,\n",
292
+ " 'ملامت': 5761,\n",
293
+ " 'آا��ں': 24805,\n",
294
+ " 'پون': 1567,\n",
295
+ " 'مـئـے': 25996,\n",
296
+ " 'سنج': 3757,\n",
297
+ " 'مدرسہءِ': 24556,\n",
298
+ " 'سارتءُ': 26450,\n",
299
+ " 'موہن': 10299,\n",
300
+ " 'اُسمان': 32999,\n",
301
+ " 'داتءُ': 5529,\n",
302
+ " 'ونگز': 32683,\n",
303
+ " 'انگہ': 2559,\n",
304
+ " 'آہانی': 6378,\n",
305
+ " 'چپّی': 7261,\n",
306
+ " 'ءَگْور': 21671,\n",
307
+ " 'تورینگ': 29157,\n",
308
+ " 'سیاسی': 1754,\n",
309
+ " 'پیشدار': 8423,\n",
310
+ " 'نوبال': 23546,\n",
311
+ " 'منگہیں': 27389,\n",
312
+ " 'درکتگ': 27581,\n",
313
+ " 'پارست': 35605,\n",
314
+ " 'لکاں': 12145,\n",
315
+ " 'پیرُ': 14777,\n",
316
+ " 'ھَڈّ': 16897,\n",
317
+ " 'رماں': 24885,\n",
318
+ " 'عبڈ': 18713,\n",
319
+ " 'هژڈری': 21287,\n",
320
+ " 'سھب': 3174,\n",
321
+ " 'پمّنءَ': 37260,\n",
322
+ " 'همائی': 9730,\n",
323
+ " 'وشدل': 11919,\n",
324
+ " 'نصیحت': 21355,\n",
325
+ " 'ڈپارٹمنٹءَ': 22903,\n",
326
+ " 'صڑے': 38719,\n",
327
+ " 'آدمانی': 7250,\n",
328
+ " 'یونان': 8745,\n",
329
+ " 'سرکاری': 2748,\n",
330
+ " 'وارتگیں': 8980,\n",
331
+ " 'کومے': 19359,\n",
332
+ " 'مکتب': 4857,\n",
333
+ " 'کرنءِ': 9363,\n",
334
+ " '🚌': 542,\n",
335
+ " '۔۔۔۔۔۔۔۔۔۔۔۔۔۔': 12288,\n",
336
+ " 'اگریچ': 21875,\n",
337
+ " 'بِن': 22113,\n",
338
+ " 'ءِته': 23290,\n",
339
+ " 'مزدور': 23637,\n",
340
+ " 'کرچک': 25434,\n",
341
+ " 'کـنـت': 11046,\n",
342
+ " 'ڈَنڈ': 17814,\n",
343
+ " 'سائیں': 21848,\n",
344
+ " 'بدانی': 16772,\n",
345
+ " 'درھیں': 14767,\n",
346
+ " 'ایدگه': 27557,\n",
347
+ " 'دستے': 4286,\n",
348
+ " 'ورنایے': 8293,\n",
349
+ " 'خیمہ': 28657,\n",
350
+ " 'سَپَر': 31146,\n",
351
+ " 'بکشائیت': 21396,\n",
352
+ " 'گْومے': 18521,\n",
353
+ " 'هڑڈو': 28797,\n",
354
+ " 'انگلستان': 14582,\n",
355
+ " 'نکشاں': 36952,\n",
356
+ " 'پارم': 13934,\n",
357
+ " 'جُہدءِ': 24247,\n",
358
+ " 'اۆگان': 3995,\n",
359
+ " 'ئےِ': 1121,\n",
360
+ " 'اسپیتیں': 3589,\n",
361
+ " 'جناور': 6627,\n",
362
+ " 'عطاء': 7573,\n",
363
+ " 'بَھم': 25588,\n",
364
+ " 'ﻣﺎﺕ': 26668,\n",
365
+ " 'لِز': 28525,\n",
366
+ " 'مِیران': 35688,\n",
367
+ " 'چکّاں': 14324,\n",
368
+ " 'لِنگ': 15575,\n",
369
+ " 'بھم': 12435,\n",
370
+ " 'گرتاں': 8684,\n",
371
+ " 'آپءَ': 21082,\n",
372
+ " 'ڑیاں': 22059,\n",
373
+ " 'مودر': 25456,\n",
374
+ " 'چَمّگ': 19943,\n",
375
+ " 'دوینانی': 25334,\n",
376
+ " 'چمگاں': 20646,\n",
377
+ " 'شاعرءِ': 9187,\n",
378
+ " 'تچ': 2567,\n",
379
+ " 'سوارگ': 31752,\n",
380
+ " 'کپتگءُ': 15489,\n",
381
+ " 'میچینتگ': 33236,\n",
382
+ " 'گوشان': 7777,\n",
383
+ " 'لُمب': 19656,\n",
384
+ " 'ھمالیہ': 39796,\n",
385
+ " 'بورژ': 26074,\n",
386
+ " 'ردگار': 20409,\n",
387
+ " 'سیراں': 27598,\n",
388
+ " 'ﭼﯿﺎﮐﮧ': 20111,\n",
389
+ " 'ثما': 33534,\n",
390
+ " 'کارگالءِ': 36680,\n",
391
+ " 'همدپ': 22025,\n",
392
+ " 'یکاں': 21732,\n",
393
+ " 'ﭼﻪ': 8659,\n",
394
+ " 'شِرد': 25959,\n",
395
+ " 'بوائے': 29755,\n",
396
+ " 'لێ': 29936,\n",
397
+ " 'جِکّ': 14457,\n",
398
+ " 'سیچی': 16674,\n",
399
+ " 'اهوال': 18625,\n",
400
+ " 'ڈرانی': 31758,\n",
401
+ " 'شهیڈ': 14374,\n",
402
+ " '۱۴۰۱': 20054,\n",
403
+ " 'ثا': 6276,\n",
404
+ " 'جُہد': 3189,\n",
405
+ " 'ءُسر': 20338,\n",
406
+ " 'انگتہ': 7055,\n",
407
+ " 'طاک': 12308,\n",
408
+ " 'لگام': 15895,\n",
409
+ " 'عیسوی': 16935,\n",
410
+ " 'درکت': 20474,\n",
411
+ " 'ملپداں': 25550,\n",
412
+ " 'مُھک': 27852,\n",
413
+ " 'داہ': 14248,\n",
414
+ " 'گرئیت': 21915,\n",
415
+ " 'دکاناں': 26267,\n",
416
+ " 'نویسانی': 28504,\n",
417
+ " 'کاربند': 25262,\n",
418
+ " 'جُھدکار': 12366,\n",
419
+ " 'بہ': 601,\n",
420
+ " 'ترانءَ': 20550,\n",
421
+ " 'آتـ': 13202,\n",
422
+ " 'چشین': 27240,\n",
423
+ " 'کُن': 4902,\n",
424
+ " 'دوتکگ': 26462,\n",
425
+ " 'عبدو': 24154,\n",
426
+ " 'نِشت': 4427,\n",
427
+ " 'ھاڑ': 16108,\n",
428
+ " 'مَرے': 28744,\n",
429
+ " 'گپ': 638,\n",
430
+ " 'غالبءَ': 38224,\n",
431
+ " 'اُگد': 23712,\n",
432
+ " 'نَیکہ': 14110,\n",
433
+ " 'کُلّیں': 8191,\n",
434
+ " '؍': 159,\n",
435
+ " 'آزادی': 8932,\n",
436
+ " 'آئرا': 2147,\n",
437
+ " 'بیزاری': 10496,\n",
438
+ " 'لاء': 25365,\n",
439
+ " 'رکّینتگ': 31791,\n",
440
+ " 'فرنگ': 29916,\n",
441
+ " 'ڈنے': 15272,\n",
442
+ " 'َرَغ': 30832,\n",
443
+ " 'پیسرگیں': 31525,\n",
444
+ " 'سقراط': 10224,\n",
445
+ " 'الکاپیں': 8332,\n",
446
+ " 'دپترے': 10116,\n",
447
+ " 'چوناها': 16254,\n",
448
+ " 'جوانسال': 31527,\n",
449
+ " 'دارگے': 17462,\n",
450
+ " 'ئان': 24818,\n",
451
+ " 'بدیان': 29753,\n",
452
+ " 'کوئیلھو': 12112,\n",
453
+ " 'نڈی': 4103,\n",
454
+ " 'بُگٹی': 4979,\n",
455
+ " 'لّو': 5387,\n",
456
+ " 'ٹیلی': 6043,\n",
457
+ " 'کامی': 3485,\n",
458
+ " 'ڑالر': 39079,\n",
459
+ " 'لٹریچر': 15030,\n",
460
+ " 'سرجمین': 18040,\n",
461
+ " 'سَـوَ': 15098,\n",
462
+ " 'ءُدگہ': 27368,\n",
463
+ " 'ریمانی': 39646,\n",
464
+ " 'شیدران': 30630,\n",
465
+ " 'ﺟﮧ': 23267,\n",
466
+ " 'ِیر': 7501,\n",
467
+ " 'گفت': 19176,\n",
468
+ " 'ناکوءَ': 14443,\n",
469
+ " 'نارد': 31985,\n",
470
+ " 'جَس': 14890,\n",
471
+ " 'سہمی': 12663,\n",
472
+ " 'ترّءُ': 19870,\n",
473
+ " 'خانؔءِ': 25728,\n",
474
+ " 'رۏچ': 33611,\n",
475
+ " 'ایوب': 8239,\n",
476
+ " 'وهدیکہ': 22778,\n",
477
+ " 'زُرتگ': 2271,\n",
478
+ " 'مُہ': 8018,\n",
479
+ " 'گوپت': 34184,\n",
480
+ " 'پلہ': 7110,\n",
481
+ " 'بندین': 34692,\n",
482
+ " 'مام': 18247,\n",
483
+ " 'ﮓﺀَ': 25100,\n",
484
+ " 'ہمودءَ': 24730,\n",
485
+ " 'فلو': 38790,\n",
486
+ " 'ھامین': 6528,\n",
487
+ " 'تول': 8132,\n",
488
+ " 'شهید': 7673,\n",
489
+ " 'عہدے': 17780,\n",
490
+ " 'وداریگاں': 22881,\n",
491
+ " 'همایانی': 25975,\n",
492
+ " 'تَھار': 28195,\n",
493
+ " 'گّ': 4966,\n",
494
+ " 'نوشی': 10064,\n",
495
+ " 'نیچہ': 25224,\n",
496
+ " 'بیران': 2676,\n",
497
+ " 'حلب': 21475,\n",
498
+ " 'اَھت': 27432,\n",
499
+ " 'سُرگی': 28626,\n",
500
+ " 'زورتگ': 28143,\n",
501
+ " 'پوشان': 36639,\n",
502
+ " 'بلیں': 8819,\n",
503
+ " 'جنائی': 13699,\n",
504
+ " 'مٰ': 18145,\n",
505
+ " 'کُھتیگ': 37555,\n",
506
+ " 'زَد': 13514,\n",
507
+ " 'ئشت': 17240,\n",
508
+ " 'اَبا': 39515,\n",
509
+ " 'نیستءُ': 31063,\n",
510
+ " 'زیبک': 11564,\n",
511
+ " 'مِین': 16365,\n",
512
+ " 'فکڑے': 29919,\n",
513
+ " 'اِشکُن': 22142,\n",
514
+ " 'کلمپوگ': 32156,\n",
515
+ " 'بانداتگیں': 31823,\n",
516
+ " 'سرسر': 27507,\n",
517
+ " 'مدد': 22063,\n",
518
+ " 'ﻣﺮﺩ': 11414,\n",
519
+ " 'تُنی': 25891,\n",
520
+ " 'کَجّ': 27948,\n",
521
+ " 'منجل': 14217,\n",
522
+ " 'گُلنام': 35957,\n",
523
+ " 'بَـنـت': 5401,\n",
524
+ " 'شُبینگ': 17569,\n",
525
+ " 'شپادیں': 14603,\n",
526
+ " ':’’': 20134,\n",
527
+ " 'روگایاں': 20841,\n",
528
+ " 'سازانی': 10592,\n",
529
+ " 'آخر': 6418,\n",
530
+ " 'رملچہ': 11762,\n",
531
+ " 'هـُ': 28664,\n",
532
+ " 'گِرانیں': 28439,\n",
533
+ " 'گانہ': 31708,\n",
534
+ " 'ریکانی': 7585,\n",
535
+ " 'چیزےءِ': 16867,\n",
536
+ " 'زونڈ': 17268,\n",
537
+ " 'دپترءُ': 35578,\n",
538
+ " 'شُـت': 13884,\n",
539
+ " 'بایڈ': 5894,\n",
540
+ " 'پـــ': 18179,\n",
541
+ " 'اشکریں': 21293,\n",
542
+ " 'ھبرءِ': 16912,\n",
543
+ " 'طِ': 10680,\n",
544
+ " 'پتن': 18444,\n",
545
+ " 'نکال': 28085,\n",
546
+ " 'کُجائے': 13524,\n",
547
+ " 'ایکس': 13099,\n",
548
+ " 'ھسین': 13271,\n",
549
+ " 'جاهان': 25934,\n",
550
+ " 'هفطه': 26821,\n",
551
+ " 'برزگ': 21799,\n",
552
+ " 'جوابش': 36551,\n",
553
+ " 'شاعرو': 30696,\n",
554
+ " 'کنئگا': 20016,\n",
555
+ " 'پورت': 39000,\n",
556
+ " 'کُچکاں': 21829,\n",
557
+ " 'ہؤ': 12698,\n",
558
+ " 'شکون': 19107,\n",
559
+ " 'نـوک': 16235,\n",
560
+ " 'ں': 265,\n",
561
+ " 'ّو': 1250,\n",
562
+ " 'ٹیبل': 5694,\n",
563
+ " 'میناب': 18256,\n",
564
+ " 'بُوتنت': 16922,\n",
565
+ " 'شهی': 19535,\n",
566
+ " 'کڑو': 31633,\n",
567
+ " 'لُنٹانی': 5356,\n",
568
+ " 'زگاری': 5261,\n",
569
+ " 'رالن': 39450,\n",
570
+ " 'همرا': 35052,\n",
571
+ " 'شارءِ': 16990,\n",
572
+ " 'دشیاری': 38619,\n",
573
+ " 'نِ': 1063,\n",
574
+ " 'دْراجیں': 5270,\n",
575
+ " 'پردوس': 36917,\n",
576
+ " 'انوں': 2458,\n",
577
+ " 'ماریا': 5384,\n",
578
+ " 'داز': 9652,\n",
579
+ " 'بیبت': 16011,\n",
580
+ " 'ڈیموکڑی': 33271,\n",
581
+ " 'سرگوستاں': 36480,\n",
582
+ " 'ﻮ': 506,\n",
583
+ " 'عقید': 19117,\n",
584
+ " 'کاٹاراں': 37187,\n",
585
+ " 'ٹیچر': 8267,\n",
586
+ " 'لیوار': 25373,\n",
587
+ " 'مّد': 36059,\n",
588
+ " 'کائیں': 6825,\n",
589
+ " 'سّا': 7721,\n",
590
+ " 'سرپداں': 11309,\n",
591
+ " 'مانپو': 23431,\n",
592
+ " 'بےتوار': 23526,\n",
593
+ " 'بُتگر': 25511,\n",
594
+ " 'چمیں': 30969,\n",
595
+ " 'زورگءُ': 25779,\n",
596
+ " 'پلو': 8788,\n",
597
+ " 'وڑا': 1345,\n",
598
+ " 'جمل': 33543,\n",
599
+ " 'نہَ': 39636,\n",
600
+ " 'دپیں': 15445,\n",
601
+ " 'اوت': 2898,\n",
602
+ " 'الکاپی': 6421,\n",
603
+ " 'بَگَل': 37304,\n",
604
+ " 'سیمسریں': 16976,\n",
605
+ " 'زمانگی': 27847,\n",
606
+ " 'پیپلز': 21422,\n",
607
+ " 'ساچیں': 8432,\n",
608
+ " 'کنیت': 20458,\n",
609
+ " 'دکش': 33575,\n",
610
+ " 'یکٹر': 5834,\n",
611
+ " 'چَمے': 35642,\n",
612
+ " 'اَھدءِ': 37437,\n",
613
+ " 'روشن': 8386,\n",
614
+ " 'بَز': 6240,\n",
615
+ " 'اَںط': 6020,\n",
616
+ " 'رِدانکی': 8443,\n",
617
+ " 'ﺯﯾﺮ': 24701,\n",
618
+ " 'کٹار': 31861,\n",
619
+ " 'مُلاّ': 10866,\n",
620
+ " 'بہاول': 23412,\n",
621
+ " 'رکھینت': 24114,\n",
622
+ " 'ذباں': 29819,\n",
623
+ " 'ریس': 3551,\n",
624
+ " 'شبیرءِ': 29100,\n",
625
+ " 'مالءُ': 11321,\n",
626
+ " 'کوشءَ': 16262,\n",
627
+ " 'بُنز': 11260,\n",
628
+ " 'ّکی': 27208,\n",
629
+ " 'ذرا': 38621,\n",
630
+ " '۲۰۲': 26498,\n",
631
+ " 'نوپ': 30765,\n",
632
+ " 'کومءُ': 37210,\n",
633
+ " 'حیالانی': 8594,\n",
634
+ " 'مسکیں': 11171,\n",
635
+ " 'ُکےءَ': 30910,\n",
636
+ " 'داشتاں': 24027,\n",
637
+ " '؟۔۔۔۔۔': 14629,\n",
638
+ " 'ھکیم': 10468,\n",
639
+ " 'فا': 1616,\n",
640
+ " 'لیپٹ': 30602,\n",
641
+ " 'تپاکیں': 36699,\n",
642
+ " 'تپاس': 5874,\n",
643
+ " 'نامداری': 9874,\n",
644
+ " 'ﺭﺍﺟﯽ': 22932,\n",
645
+ " 'تیارنہ': 36353,\n",
646
+ " 'باد': 929,\n",
647
+ " 'چکاّ': 32037,\n",
648
+ " 'روسی': 4970,\n",
649
+ " 'بھادری': 24095,\n",
650
+ " 'چلتن': 19887,\n",
651
+ " 'منزلءِ': 24162,\n",
652
+ " 'ژانگ': 23210,\n",
653
+ " 'ںێم': 25068,\n",
654
+ " 'اَلگ': 34214,\n",
655
+ " 'مسیبت': 32742,\n",
656
+ " 'ملما': 15419,\n",
657
+ " 'کرشن': 27729,\n",
658
+ " 'رستگ': 2379,\n",
659
+ " 'عید': 3384,\n",
660
+ " 'چُپی': 28138,\n",
661
+ " 'بُلبل': 24338,\n",
662
+ " 'رجـانـکـار': 17200,\n",
663
+ " 'وکیلے': 19795,\n",
664
+ " 'درؔ': 39897,\n",
665
+ " 'رَمگ': 15474,\n",
666
+ " 'پںچ': 33105,\n",
667
+ " 'جَل': 4988,\n",
668
+ " 'پیشیمءِ': 28869,\n",
669
+ " 'جیڑیت': 6467,\n",
670
+ " 'كُمْ': 18134,\n",
671
+ " 'پرسی': 12935,\n",
672
+ " 'پيرُك': 32737,\n",
673
+ " 'دێمداری': 32483,\n",
674
+ " 'ھپط': 33981,\n",
675
+ " 'اێر': 6557,\n",
676
+ " 'یکّر': 17390,\n",
677
+ " 'ءَچم': 34108,\n",
678
+ " 'دوچءِ': 36759,\n",
679
+ " 'ِنٹ': 38945,\n",
680
+ " 'کَنگی': 35081,\n",
681
+ " 'دینے': 15314,\n",
682
+ " 'گْر': 3356,\n",
683
+ " 'مھریءِ': 29301,\n",
684
+ " 'اندرو': 30233,\n",
685
+ " 'زالے': 7070,\n",
686
+ " 'صابر': 6873,\n",
687
+ " 'سربوت': 7756,\n",
688
+ " 'اُک': 21449,\n",
689
+ " 'لھتی': 24489,\n",
690
+ " 'ڑل': 25040,\n",
691
+ " 'لھڑ': 10451,\n",
692
+ " 'بـَر': 11653,\n",
693
+ " 'پِلّی': 31333,\n",
694
+ " 'مسکی': 12650,\n",
695
+ " 'پێم': 3889,\n",
696
+ " 'لانکی': 19133,\n",
697
+ " 'مکّهین': 21319,\n",
698
+ " '//“': 31994,\n",
699
+ " 'کــــ': 30093,\n",
700
+ " 'ورناءِ': 23919,\n",
701
+ " 'عبدالعزیز': 5101,\n",
702
+ " 'پندءَ': 26204,\n",
703
+ " 'پروا': 16161,\n",
704
+ " 'سمجھ': 19599,\n",
705
+ " 'چراغ': 7450,\n",
706
+ " 'پُجّ': 13881,\n",
707
+ " 'ثنت': 9984,\n",
708
+ " 'پتریت': 26321,\n",
709
+ " 'دُرہیں': 31724,\n",
710
+ " 'ڈیزائن': 33350,\n",
711
+ " 'پیڈیاءِ': 37840,\n",
712
+ " 'مڑوی': 14961,\n",
713
+ " 'پی': 621,\n",
714
+ " 'چُکءَ': 7678,\n",
715
+ " 'ڈاشته': 24612,\n",
716
+ " 'سْیاھگ': 3292,\n",
717
+ " 'وفاق': 8072,\n",
718
+ " 'انقلاب': 5661,\n",
719
+ " '۱۹۴': 18856,\n",
720
+ " 'لازُمی': 33390,\n",
721
+ " 'لوگی': 4180,\n",
722
+ " 'زہرشانی': 13596,\n",
723
+ " 'ﻣﻦ': 3327,\n",
724
+ " 'ردوست': 8314,\n",
725
+ " 'اۆدے': 31874,\n",
726
+ " 'ڈریا': 39056,\n",
727
+ " 'شیمیں': 19342,\n",
728
+ " 'کورسءِ': 19631,\n",
729
+ " 'لبزانکاں': 23787,\n",
730
+ " 'اوشتارین': 26754,\n",
731
+ " 'حجا': 23049,\n",
732
+ " 'دزگھاراں': 24494,\n",
733
+ " 'پرچاکہ': 2019,\n",
734
+ " 'آهان': 18775,\n",
735
+ " 'دؤ': 19077,\n",
736
+ " 'جاهانی': 25935,\n",
737
+ " 'مردمانی': 1155,\n",
738
+ " 'مَڑاہ': 23722,\n",
739
+ " 'صۆج': 6939,\n",
740
+ " 'دِیوان': 10871,\n",
741
+ " 'راچنے': 30296,\n",
742
+ " 'راہءَ': 4489,\n",
743
+ " 'سبزلءِ': 28442,\n",
744
+ " 'تاںی': 17610,\n",
745
+ " 'بیسٹ': 34381,\n",
746
+ " 'ڈێمطڑ': 37043,\n",
747
+ " 'ھنداں': 8715,\n",
748
+ " 'منّانکاں': 26721,\n",
749
+ " 'دووار': 30552,\n",
750
+ " 'پیشکان': 11294,\n",
751
+ " 'سدانی': 10126,\n",
752
+ " 'نادْ': 3154,\n",
753
+ " 'مداری': 38835,\n",
754
+ " 'گیشیناں': 24183,\n",
755
+ " 'زھران': 24129,\n",
756
+ " 'پازگاہ': 24671,\n",
757
+ " 'پادان': 15466,\n",
758
+ " 'مثالے': 15622,\n",
759
+ " 'لونجان': 6422,\n",
760
+ " 'دْروت': 8708,\n",
761
+ " 'رپتگاں': 20007,\n",
762
+ " 'بانڈُ': 30650,\n",
763
+ " 'ڈوڑه': 33036,\n",
764
+ " 'مروارد': 14394,\n",
765
+ " 'وکءُ': 16687,\n",
766
+ " 'بسطری': 28034,\n",
767
+ " 'نوریں': 26014,\n",
768
+ " 'تیڑ': 20610,\n",
769
+ " 'کِشتگیں': 25749,\n",
770
+ " 'اھوالے': 24319,\n",
771
+ " 'کھول': 6408,\n",
772
+ " 'بِچکند': 18979,\n",
773
+ " 'ٹیو': 6526,\n",
774
+ " 'انساں': 34116,\n",
775
+ " 'دگءُ': 25401,\n",
776
+ " 'میچینت': 38013,\n",
777
+ " 'دینی': 2809,\n",
778
+ " 'فن': 4862,\n",
779
+ " 'نکن': 8108,\n",
780
+ " 'بدبہتی': 30915,\n",
781
+ " 'مچءُ': 32224,\n",
782
+ " 'انچوشکہ': 3710,\n",
783
+ " 'ھزم': 17334,\n",
784
+ " 'کڑڈاڑ': 17746,\n",
785
+ " 'گوکرت': 39478,\n",
786
+ " 'چرائیءَ': 8727,\n",
787
+ " 'ازمان': 38483,\n",
788
+ " 'ﺋﺪ': 39265,\n",
789
+ " 'انَچو': 14557,\n",
790
+ " 'نکشون': 36953,\n",
791
+ " 'سےئیں': 35487,\n",
792
+ " 'ششمی': 5281,\n",
793
+ " 'رَنگءَ': 23753,\n",
794
+ " 'لُڈگ': 31552,\n",
795
+ " 'فان': 10228,\n",
796
+ " 'میلانی': 34283,\n",
797
+ " 'گلوبلائزیشن': 19989,\n",
798
+ " 'ژند': 5513,\n",
799
+ " 'كاری': 29931,\n",
800
+ " 'آہُو': 35877,\n",
801
+ " 'ساوڑ': 4759,\n",
802
+ " 'ھرابیں': 6258,\n",
803
+ " 'کشتہ': 30097,\n",
804
+ " 'باپور': 34297,\n",
805
+ " 'بلندیں': 23334,\n",
806
+ " 'پنتانی': 36904,\n",
807
+ " '۔()': 23252,\n",
808
+ " 'پنجگور': 2876,\n",
809
+ " 'بـاں': 31572,\n",
810
+ " '؟!': 4257,\n",
811
+ " 'آف': 2896,\n",
812
+ " 'آستون': 18893,\n",
813
+ " 'اِسـ': 30301,\n",
814
+ " 'چشم': 6952,\n",
815
+ " 'ءِ': 547,\n",
816
+ " 'جتی': 15853,\n",
817
+ " 'زاط': 28030,\n",
818
+ " 'چاگردء': 31247,\n",
819
+ " 'یونٹ': 14537,\n",
820
+ " 'ناگت': 6892,\n",
821
+ " 'اھم': 7960,\n",
822
+ " 'راجدپتراں': 31659,\n",
823
+ " 'ڈکگ': 39054,\n",
824
+ " 'شیطان': 7405,\n",
825
+ " 'غُر': 24943,\n",
826
+ " 'زیارت': 3516,\n",
827
+ " 'کّرا': 17599,\n",
828
+ " 'کڈّک': 33180,\n",
829
+ " 'نبشتگ': 1883,\n",
830
+ " 'نـڈی': 28463,\n",
831
+ " 'شِی': 4774,\n",
832
+ " 'لُوٹ': 10361,\n",
833
+ " 'وزیڑ': 37164,\n",
834
+ " 'میکسیکو': 19914,\n",
835
+ " 'نُگرہ': 21389,\n",
836
+ " 'ود': 1419,\n",
837
+ " 'وانوکءِ': 22231,\n",
838
+ " 'سنگیں': 18512,\n",
839
+ " 'کاری': 940,\n",
840
+ " 'طهڑ': 27120,\n",
841
+ " 'ناکس': 17410,\n",
842
+ " 'ہدوناک': 39175,\n",
843
+ " 'مَلِک': 25674,\n",
844
+ " 'نِبشتہ': 13012,\n",
845
+ " 'بولے': 5250,\n",
846
+ " 'بندن': 5907,\n",
847
+ " 'گبرّ': 33952,\n",
848
+ " 'کمّێں': 32499,\n",
849
+ " 'زمانگاں': 5693,\n",
850
+ " 'کالوجی': 24065,\n",
851
+ " 'ھژ': 3448,\n",
852
+ " 'وزی': 4831,\n",
853
+ " 'ٹکا': 7288,\n",
854
+ " 'عکسکاری': 26143,\n",
855
+ " 'گشتگیں': 16171,\n",
856
+ " 'نَی': 11340,\n",
857
+ " 'عینک': 13058,\n",
858
+ " 'جینز': 15394,\n",
859
+ " 'آمر': 26991,\n",
860
+ " 'ڈیں': 4783,\n",
861
+ " 'ءّکه': 31673,\n",
862
+ " 'بیگُل': 34380,\n",
863
+ " 'میس': 18254,\n",
864
+ " 'ﺑﻮﺗﮓ': 17163,\n",
865
+ " 'گَپّے': 32799,\n",
866
+ " 'چوھ': 34494,\n",
867
+ " 'گنجاں': 24182,\n",
868
+ " 'ديميں': 38024,\n",
869
+ " 'لاچاری': 6257,\n",
870
+ " 'پرومی': 11964,\n",
871
+ " 'آلا': 15824,\n",
872
+ " 'کارکنوکیں': 20455,\n",
873
+ " 'احمدءَ': 15604,\n",
874
+ " 'چنچک': 11673,\n",
875
+ " 'گـ': 1406,\n",
876
+ " 'موڈ': 10767,\n",
877
+ " 'مشھور': 14904,\n",
878
+ " 'صباءِ': 21049,\n",
879
+ " '...!': 37403,\n",
880
+ " 'تُنّیگ': 12065,\n",
881
+ " 'رَند': 2534,\n",
882
+ " 'کردءِ': 19761,\n",
883
+ " 'جرا': 13697,\n",
884
+ " 'گنداِیت': 22033,\n",
885
+ " 'نابزانتی': 25235,\n",
886
+ " 'ﺯﺍ': 10472,\n",
887
+ " 'اسپینی': 32697,\n",
888
+ " 'حاموشی': 18559,\n",
889
+ " 'انساپءِ': 37554,\n",
890
+ " 'آ': 647,\n",
891
+ " 'اَبرم': 15139,\n",
892
+ " 'گُڈّی': 3164,\n",
893
+ " 'ھُورت': 22192,\n",
894
+ " 'سکری': 8413,\n",
895
+ " 'مولاناءَ': 8722,\n",
896
+ " 'واھشت': 5473,\n",
897
+ " '****//': 14978,\n",
898
+ " 'نـوں': 17751,\n",
899
+ " 'فیر': 14669,\n",
900
+ " 'جهلی': 28114,\n",
901
+ " 'شِکار': 31579,\n",
902
+ " 'کلاشنکوف': 32020,\n",
903
+ " 'بُرَّگ': 32203,\n",
904
+ " 'میھ': 21735,\n",
905
+ " 'ای': 599,\n",
906
+ " 'نندان': 14926,\n",
907
+ " 'برچانک': 34445,\n",
908
+ " 'پریت': 34740,\n",
909
+ " 'ھمراہءُ': 36143,\n",
910
+ " 'لُوک': 23985,\n",
911
+ " 'رکّگ': 26594,\n",
912
+ " 'باورءَ': 36440,\n",
913
+ " 'نێمگا': 33198,\n",
914
+ " 'نچُک': 10562,\n",
915
+ " 'اسلا': 13859,\n",
916
+ " 'مکبول': 31884,\n",
917
+ " '۵۷': 27312,\n",
918
+ " 'کتگ': 725,\n",
919
+ " 'فث': 38787,\n",
920
+ " 'کـَـنَـگ': 37838,\n",
921
+ " 'گوزگءَ': 9945,\n",
922
+ " 'روکءَ': 14946,\n",
923
+ " 'مِیدانی': 29006,\n",
924
+ " 'بیح': 9274,\n",
925
+ " 'بیوانکی': 16422,\n",
926
+ " 'قط': 5506,\n",
927
+ " 'کشاب': 20023,\n",
928
+ " 'پھر': 4365,\n",
929
+ " 'طُر': 38738,\n",
930
+ " 'بلاهیں': 13662,\n",
931
+ " 'وارد': 30782,\n",
932
+ " 'شڑیعت': 29513,\n",
933
+ " 'ششتگیں': 38698,\n",
934
+ " 'مھلبیں': 14453,\n",
935
+ " 'یاکوت': 34708,\n",
936
+ " 'اِستین': 32668,\n",
937
+ " 'جدائی': 23046,\n",
938
+ " 'رانکی': 11160,\n",
939
+ " 'گرے': 4402,\n",
940
+ " 'برانچ': 39886,\n",
941
+ " '١': 225,\n",
942
+ " 'رازیگ': 7087,\n",
943
+ " 'مارشتے': 10143,\n",
944
+ " 'اۆگانثطان': 17918,\n",
945
+ " 'ابیڈ': 21207,\n",
946
+ " 'آگه': 38452,\n",
947
+ " 'تورو': 39868,\n",
948
+ " 'جیپ': 30754,\n",
949
+ " 'ٹلو': 30030,\n",
950
+ " 'دروشم': 1395,\n",
951
+ " 'همین': 9810,\n",
952
+ " 'بنایا': 36421,\n",
953
+ " 'تولگے': 32898,\n",
954
+ " 'وتساپ': 21683,\n",
955
+ " 'اٹ': 5501,\n",
956
+ " 'بالاہ': 12623,\n",
957
+ " 'چاچ': 7887,\n",
958
+ " 'جَنّتی': 33251,\n",
959
+ " 'چونائیں': 36129,\n",
960
+ " 'سیپ': 39940,\n",
961
+ " 'گالریچ': 8690,\n",
962
+ " 'ﯿﻦ': 8660,\n",
963
+ " 'فِری': 32576,\n",
964
+ " 'وَردِن': 37758,\n",
965
+ " 'دومیگ': 6241,\n",
966
+ " 'اَنچش': 13297,\n",
967
+ " 'ú': 68,\n",
968
+ " 'نسب': 10355,\n",
969
+ " 'ﮒ': 380,\n",
970
+ " 'گدارک': 28144,\n",
971
+ " 'صدق': 29073,\n",
972
+ " 'ایکٹی': 36504,\n",
973
+ " 'فاء': 17778,\n",
974
+ " 'بازینے': 3432,\n",
975
+ " 'نێ': 4322,\n",
976
+ " 'گڑی': 4784,\n",
977
+ " 'نوکر': 6314,\n",
978
+ " 'تامرانی': 7254,\n",
979
+ " 'هاشمی': 9612,\n",
980
+ " 'بریس': 28662,\n",
981
+ " 'نِمَک': 35588,\n",
982
+ " 'دُوز': 23750,\n",
983
+ " 'اسٹینڈر': 14536,\n",
984
+ " 'واکدار': 15641,\n",
985
+ " 'ـوک': 17277,\n",
986
+ " 'ﻫﺴﺖ': 25120,\n",
987
+ " 'اِسرار': 39471,\n",
988
+ " 'بُڈّ': 16738,\n",
989
+ " 'ھورءُ': 20825,\n",
990
+ " 'تَپّاس': 28197,\n",
991
+ " 'دینکے': 8170,\n",
992
+ " 'عبدوست': 19754,\n",
993
+ " 'پڑا': 5566,\n",
994
+ " 'چارمکنڈ': 26936,\n",
995
+ " 'زورگا': 31289,\n",
996
+ " 'بیتگان': 35133,\n",
997
+ " 'دوں': 11224,\n",
998
+ " 'رهادگ': 17020,\n",
999
+ " 'کۆ': 11502,\n",
1000
+ " 'ﺸﺘ': 8515,\n",
1001
+ " 'نبشط': 9144,\n",
1002
+ " 'بیاںیه': 20124,\n",
1003
+ " 'وکیلاں': 29064,\n",
1004
+ " 'نہادیں': 37578,\n",
1005
+ " 'بیریں': 30454,\n",
1006
+ " 'تولیدر': 38354,\n",
1007
+ " '😁': 535,\n",
1008
+ " 'کبلیں': 39100,\n",
1009
+ " 'گِپتگیں': 15678,\n",
1010
+ " 'امریکا': 5354,\n",
1011
+ " 'کشّی': 7320,\n",
1012
+ " 'آڑ': 17233,\n",
1013
+ " 'بستگی': 25864,\n",
1014
+ " 'اماڑط': 24192,\n",
1015
+ " 'گێشتر': 8342,\n",
1016
+ " 'مسٹر': 10578,\n",
1017
+ " 'گازی': 12245,\n",
1018
+ " 'شناسگال': 32865,\n",
1019
+ " 'پیشال': 31103,\n",
1020
+ " 'بےبڑمش': 38359,\n",
1021
+ " 'ڈگڑاں': 39060,\n",
1022
+ " 'الناس': 39502,\n",
1023
+ " 'تکیں': 38527,\n",
1024
+ " 'ردر': 39653,\n",
1025
+ " 'ﻖ': 482,\n",
1026
+ " 'ابکہ': 12673,\n",
1027
+ " 'زہر': 3113,\n",
1028
+ " 'ضنڈ': 33688,\n",
1029
+ " 'گیانی': 39931,\n",
1030
+ " 'ﮑﯿﻦ': 30178,\n",
1031
+ " '👆🏻': 39324,\n",
1032
+ " 'ﺜ': 431,\n",
1033
+ " 'راجدپترءِ': 8057,\n",
1034
+ " 'کائے': 4351,\n",
1035
+ " 'دَگّ': 18847,\n",
1036
+ " 'يناں': 17300,\n",
1037
+ " 'دیَـگ': 34236,\n",
1038
+ " 'لمح': 38815,\n",
1039
+ " 'گوم': 6886,\n",
1040
+ " 'راہی': 3881,\n",
1041
+ " 'شیدائی': 10510,\n",
1042
+ " 'ُمانی': 38934,\n",
1043
+ " 'کناں': 889,\n",
1044
+ " 'دوہزار': 10043,\n",
1045
+ " 'مھرے': 9917,\n",
1046
+ " 'کارو': 5198,\n",
1047
+ " 'رستگیں': 13519,\n",
1048
+ " 'ھمیش': 1504,\n",
1049
+ " 'مانائیں': 20928,\n",
1050
+ " 'غن': 24941,\n",
1051
+ " 'ﭼﻮ': 7509,\n",
1052
+ " 'کاڑمڑذ': 37964,\n",
1053
+ " 'ﺷﻤﺎ': 39298,\n",
1054
+ " 'تاکدیمے': 24128,\n",
1055
+ " 'َگا': 23163,\n",
1056
+ " 'بیچار': 6448,\n",
1057
+ " 'سلاماں': 21151,\n",
1058
+ " 'آبادی': 11902,\n",
1059
+ " 'روگن': 7005,\n",
1060
+ " 'رُپت': 11322,\n",
1061
+ " 'ﺩﺍﺕ': 17122,\n",
1062
+ " 'Ц': 121,\n",
1063
+ " 'ٹہک': 11686,\n",
1064
+ " 'گلُ': 35261,\n",
1065
+ " 'تُور': 35774,\n",
1066
+ " 'گلا': 1990,\n",
1067
+ " 'مروشی': 28191,\n",
1068
+ " 'کنَگی': 30471,\n",
1069
+ " 'ﯿﺖ': 6648,\n",
1070
+ " 'جمّ': 12963,\n",
1071
+ " 'شہناز': 25833,\n",
1072
+ " 'غفار': 9801,\n",
1073
+ " 'سَرپَد': 35888,\n",
1074
+ " 'درخت': 39913,\n",
1075
+ " 'جیڑایت': 38137,\n",
1076
+ " 'نگیگی': 14944,\n",
1077
+ " 'وھدئے': 13868,\n",
1078
+ " 'ستءَ': 16611,\n",
1079
+ " 'اسپر': 20590,\n",
1080
+ " 'مقبروں': 36964,\n",
1081
+ " 'منگو': 13292,\n",
1082
+ " 'برءِ': 39883,\n",
1083
+ " 'اثرمند': 17896,\n",
1084
+ " 'نگاراں': 27455,\n",
1085
+ " 'بلند': 14234,\n",
1086
+ " 'اجکّھی': 38244,\n",
1087
+ " 'زراں': 5207,\n",
1088
+ " 'نش': 3107,\n",
1089
+ " 'بلخ': 30351,\n",
1090
+ " 'ظۆر': 33698,\n",
1091
+ " 'ھنچشیں': 10635,\n",
1092
+ " 'پئیر': 38998,\n",
1093
+ " 'اح': 1077,\n",
1094
+ " 'درباریاں': 13597,\n",
1095
+ " 'پارلمان': 9711,\n",
1096
+ " 'ھمساھگانی': 17953,\n",
1097
+ " 'اوژناگ': 8139,\n",
1098
+ " 'ﻧﮑ': 18221,\n",
1099
+ " 'رَکّینگ': 26835,\n",
1100
+ " 'الَگ': 27421,\n",
1101
+ " 'ﭼﯿﺰ': 13190,\n",
1102
+ " 'عمر': 1823,\n",
1103
+ " 'رکانی': 24888,\n",
1104
+ " 'کمانستان': 17875,\n",
1105
+ " 'دِرتگ': 15657,\n",
1106
+ " 'پمشا': 29017,\n",
1107
+ " 'زات': 2738,\n",
1108
+ " 'رَوا': 15473,\n",
1109
+ " 'ناں': 880,\n",
1110
+ " 'وَشّی': 15057,\n",
1111
+ " 'عربی': 2469,\n",
1112
+ " 'زوراکیں': 7257,\n",
1113
+ " 'ماتا': 11020,\n",
1114
+ " 'رندترا': 22056,\n",
1115
+ " 'تَئی': 14395,\n",
1116
+ " 'سرکشی': 27514,\n",
1117
+ " 'پروفیسر': 3344,\n",
1118
+ " 'جِییَند': 32828,\n",
1119
+ " 'شھار': 7616,\n",
1120
+ " 'اُردو': 4139,\n",
1121
+ " 'آؤر': 2294,\n",
1122
+ " 'مُش': 4879,\n",
1123
+ " 'ھَند': 6759,\n",
1124
+ " 'چیردستی': 18998,\n",
1125
+ " 'کنجل': 27542,\n",
1126
+ " 'لچھکارانی': 37638,\n",
1127
+ " 'سولر': 32004,\n",
1128
+ " 'کَنداں': 35087,\n",
1129
+ " 'راحتی': 34168,\n",
1130
+ " 'رایت': 17371,\n",
1131
+ " 'سس': 2980,\n",
1132
+ " 'لنجیں': 19135,\n",
1133
+ " 'اوستءُ': 28792,\n",
1134
+ " 'پوراں': 26087,\n",
1135
+ " 'وشتریں': 8489,\n",
1136
+ " 'نفی': 13730,\n",
1137
+ " 'بُرگ': 19542,\n",
1138
+ " 'ﻫﺎﺷﻤﯽ': 33247,\n",
1139
+ " 'پاساں': 5865,\n",
1140
+ " 'اُم': 12977,\n",
1141
+ " 'ﺯﻧﺪ': 12501,\n",
1142
+ " 'نتگ': 1008,\n",
1143
+ " 'کُلان': 8834,\n",
1144
+ " 'کُتگءُ': 19526,\n",
1145
+ " 'شُنا': 22041,\n",
1146
+ " 'گُدءِ': 34606,\n",
1147
+ " 'ترانی': 34695,\n",
1148
+ " 'ـزل': 38785,\n",
1149
+ " 'سیگ': 16673,\n",
1150
+ " 'بھاراں': 18814,\n",
1151
+ " 'شیپاں': 27660,\n",
1152
+ " 'جالبی': 28546,\n",
1153
+ " 'چھاپءُ': 36989,\n",
1154
+ " 'اِنگا': 30302,\n",
1155
+ " 'ڈںیا': 8513,\n",
1156
+ " 'دلبڑی': 32471,\n",
1157
+ " 'معت': 35920,\n",
1158
+ " 'شیکگ': 22702,\n",
1159
+ " 'زنگی': 11468,\n",
1160
+ " 'چِلّین': 37322,\n",
1161
+ " 'ڈِی': 11819,\n",
1162
+ " 'گُن': 21855,\n",
1163
+ " 'هـَـ': 18714,\n",
1164
+ " 'پرتگیزی': 30751,\n",
1165
+ " 'گُشادی': 13823,\n",
1166
+ " 'آسءَ': 7442,\n",
1167
+ " 'دوسری': 21816,\n",
1168
+ " 'ڈگار': 1476,\n",
1169
+ " 'ھمسائی': 34367,\n",
1170
+ " 'کھچر': 14105,\n",
1171
+ " 'جھگیر': 13554,\n",
1172
+ " 'توس': 5783,\n",
1173
+ " 'لقاءَهُ': 26903,\n",
1174
+ " 'کپگءِ': 23571,\n",
1175
+ " 'بزّگی': 6487,\n",
1176
+ " 'ھاھا': 2914,\n",
1177
+ " 'آرٹیکل': 31337,\n",
1178
+ " 'زئ': 15867,\n",
1179
+ " 'اِن': 3550,\n",
1180
+ " 'کاریت': 2814,\n",
1181
+ " 'داثہ': 20450,\n",
1182
+ " 'روایتے': 28934,\n",
1183
+ " 'زھگ': 2016,\n",
1184
+ " 'دیستگیں': 9507,\n",
1185
+ " 'دبیر': 38616,\n",
1186
+ " 'نبشتاری': 6899,\n",
1187
+ " 'شرمند': 34960,\n",
1188
+ " 'شاڑ': 25429,\n",
1189
+ " 'حوالکاری': 24438,\n",
1190
+ " 'ھِل': 31780,\n",
1191
+ " 'بَشار': 35022,\n",
1192
+ " 'ــــــــــــــــــــــــــــــــ': 15109,\n",
1193
+ " 'پیل': 9456,\n",
1194
+ " 'نوبتے': 12380,\n",
1195
+ " 'وڑےءَ': 13855,\n",
1196
+ " 'منکہ': 23303,\n",
1197
+ " 'وپسگ': 6859,\n",
1198
+ " 'درا': 874,\n",
1199
+ " 'مئیگ': 8106,\n",
1200
+ " 'کمائی': 17323,\n",
1201
+ " 'بولیءِ': 32396,\n",
1202
+ " 'گورگند': 35423,\n",
1203
+ " 'دعا': 3495,\n",
1204
+ " 'ماتے': 5914,\n",
1205
+ " 'رُدگ': 28241,\n",
1206
+ " 'دوازہ': 37069,\n",
1207
+ " 'تش': 21460,\n",
1208
+ " 'ﻛﺖ': 30210,\n",
1209
+ " 'باریگے': 20993,\n",
1210
+ " 'شربت': 11930,\n",
1211
+ " 'پـگـر': 21393,\n",
1212
+ " 'ھبءُ': 26010,\n",
1213
+ " 'کا': 687,\n",
1214
+ " 'مرغ': 7411,\n",
1215
+ " 'انٹیلی': 30239,\n",
1216
+ " 'ڈیوان': 4658,\n",
1217
+ " 'كے': 10688,\n",
1218
+ " 'کَورءِ': 21161,\n",
1219
+ " 'ﻧﮧ': 6443,\n",
1220
+ " 'منءٙ': 34149,\n",
1221
+ " 'ﺑُﺘﺎﻧﯽ': 37421,\n",
1222
+ " 'یکّوی': 36168,\n",
1223
+ " 'К': 116,\n",
1224
+ " 'نیستیءِ': 17955,\n",
1225
+ " 'ںڈای': 32520,\n",
1226
+ " 'نرگس': 19708,\n",
1227
+ " 'آوارانی': 37020,\n",
1228
+ " 'کسمان': 3664,\n",
1229
+ " 'دُزی': 7906,\n",
1230
+ " 'عطاءِ': 8599,\n",
1231
+ " 'مرچگیں': 10200,\n",
1232
+ " 'ﺑﺰ': 11830,\n",
1233
+ " ...}"
1234
+ ]
1235
+ },
1236
+ "execution_count": 11,
1237
+ "metadata": {},
1238
+ "output_type": "execute_result"
1239
+ }
1240
+ ],
1241
+ "source": [
1242
+ "tokenizer.get_vocab()"
1243
+ ]
1244
+ },
1245
+ {
1246
+ "cell_type": "code",
1247
+ "execution_count": 12,
1248
  "metadata": {},
1249
  "outputs": [],
1250
  "source": [