strickvl committed
Commit
24469e0
1 Parent(s): 1380cef

add tokenization process

Files changed (1):
  1. src/train_tokenizer.ipynb +122 -0

src/train_tokenizer.ipynb CHANGED
@@ -100,6 +100,128 @@
  "    file.write(cleaned_text)\n"
  ]
  },
+ {
+ "cell_type": "code",
+ "execution_count": 19,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from tokenizers import Tokenizer\n",
+ "from tokenizers.models import BPE\n",
+ "\n",
+ "tokenizer = Tokenizer(BPE(unk_token=\"[UNK]\"))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 27,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from tokenizers.trainers import BpeTrainer\n",
+ "\n",
+ "# trainer = BpeTrainer(vocab_size=25000, min_frequency=2)\n",
+ "trainer = BpeTrainer(\n",
+ "    min_frequency=2,\n",
+ "    vocab_size=100000,\n",
+ "    special_tokens=[\"[UNK]\", \"[CLS]\", \"[SEP]\", \"[PAD]\", \"[MASK]\"],\n",
+ "    show_progress=True,\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 22,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "4294"
+ ]
+ },
+ "execution_count": 22,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# get a list of all the txt files in\n",
+ "# '/Users/strickvl/balochi/balochi-tokenizer/data/processed_text'\n",
+ "\n",
+ "processed_files = get_txt_file_paths(\"../data/processed_text\")\n",
+ "assert len(processed_files) == len(txt_paths)\n",
+ "len(processed_files)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 28,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\n",
+ "\n",
+ "\n"
+ ]
+ }
+ ],
+ "source": [
+ "tokenizer.train(processed_files, trainer)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 29,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "<tokenizers.models.BPE at 0x140d828f0>"
+ ]
+ },
+ "execution_count": 29,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "tokenizer.model"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 30,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "100000"
+ ]
+ },
+ "execution_count": 30,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "tokenizer.get_vocab_size()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 31,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "tokenizer.save(\"../models/balochi-tokenizer.json\")"
+ ]
+ },
  {
  "cell_type": "code",
  "execution_count": null,