strickvl committed
Commit
24469e0
1 Parent(s): 1380cef

add tokenization process

Files changed (1):
  1. src/train_tokenizer.ipynb +122 -0

src/train_tokenizer.ipynb CHANGED
@@ -100,6 +100,128 @@
  "    file.write(cleaned_text)\n"
  ]
  },
+ {
+ "cell_type": "code",
+ "execution_count": 19,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from tokenizers import Tokenizer\n",
+ "from tokenizers.models import BPE\n",
+ "\n",
+ "tokenizer = Tokenizer(BPE(unk_token=\"[UNK]\"))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 27,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from tokenizers.trainers import BpeTrainer\n",
+ "\n",
+ "# trainer = BpeTrainer(vocab_size=25000, min_frequency=2)\n",
+ "trainer = BpeTrainer(\n",
+ "    min_frequency=2,\n",
+ "    vocab_size=100000,\n",
+ "    special_tokens=[\"[UNK]\", \"[CLS]\", \"[SEP]\", \"[PAD]\", \"[MASK]\"],\n",
+ "    show_progress=True,\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 22,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "4294"
+ ]
+ },
+ "execution_count": 22,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# get a list of all the txt files in\n",
+ "# '/Users/strickvl/balochi/balochi-tokenizer/data/processed_text'\n",
+ "\n",
+ "processed_files = get_txt_file_paths(\"../data/processed_text\")\n",
+ "assert len(processed_files) == len(txt_paths)\n",
+ "len(processed_files)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 28,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\n",
+ "\n",
+ "\n"
+ ]
+ }
+ ],
+ "source": [
+ "tokenizer.train(processed_files, trainer)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 29,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "<tokenizers.models.BPE at 0x140d828f0>"
+ ]
+ },
+ "execution_count": 29,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "tokenizer.model"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 30,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "100000"
+ ]
+ },
+ "execution_count": 30,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "tokenizer.get_vocab_size()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 31,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "tokenizer.save(\"../models/balochi-tokenizer.json\")"
+ ]
+ },
  {
  "cell_type": "code",
  "execution_count": null,