add tokenization process
src/train_tokenizer.ipynb  +122 -0  CHANGED
@@ -100,6 +100,128 @@
     " file.write(cleaned_text)\n"
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": 19,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from tokenizers import Tokenizer\n",
+    "from tokenizers.models import BPE\n",
+    "\n",
+    "tokenizer = Tokenizer(BPE(unk_token=\"[UNK]\"))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 27,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from tokenizers.trainers import BpeTrainer\n",
+    "\n",
+    "# trainer = BpeTrainer(vocab_size=25000, min_frequency=2)\n",
+    "trainer = BpeTrainer(\n",
+    "    min_frequency=2,\n",
+    "    vocab_size=100000,\n",
+    "    special_tokens=[\"[UNK]\", \"[CLS]\", \"[SEP]\", \"[PAD]\", \"[MASK]\"],\n",
+    "    show_progress=True,\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 22,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "4294"
+      ]
+     },
+     "execution_count": 22,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# get a list of all the txt files in\n",
+    "# '/Users/strickvl/balochi/balochi-tokenizer/data/processed_text'\n",
+    "\n",
+    "processed_files = get_txt_file_paths(\"../data/processed_text\")\n",
+    "assert len(processed_files) == len(txt_paths)\n",
+    "len(processed_files)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 28,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "\n",
+      "\n"
+     ]
+    }
+   ],
+   "source": [
+    "tokenizer.train(processed_files, trainer)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 29,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "<tokenizers.models.BPE at 0x140d828f0>"
+      ]
+     },
+     "execution_count": 29,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "tokenizer.model"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 30,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "100000"
+      ]
+     },
+     "execution_count": 30,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "tokenizer.get_vocab_size()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 31,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "tokenizer.save(\"../models/balochi-tokenizer.json\")"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": null,
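The commit stops at saving the tokenizer. A plausible follow-up smoke test, not part of this diff, is to load the saved file back with Tokenizer.from_file and encode a sample string, along the lines of the sketch below.

from tokenizers import Tokenizer

# Reload the tokenizer saved by the last added cell
tokenizer = Tokenizer.from_file("../models/balochi-tokenizer.json")

# Encode any sample sentence; .tokens gives the subword pieces,
# .ids their positions in the 100k-entry vocabulary.
encoding = tokenizer.encode("sample text")
print(encoding.tokens)
print(encoding.ids)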