Upload train_sentencepiece.py
Browse files- train_sentencepiece.py +6 -0
train_sentencepiece.py
ADDED
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import sentencepiece as spm

# Train a SentencePiece tokenizer on the full-sentence training corpus.
# Output files are written using the "spiece" prefix (per the SentencePiece
# docs this yields `spiece.model` and `spiece.vocab` in the working directory).
spm.SentencePieceTrainer.train(
    # --- corpus and output ---
    input="/researchdisk/lm_training_dataset_full_sentences/train.txt",
    model_prefix="spiece",
    # --- vocabulary ---
    vocab_size=32000,
    character_coverage=1.0,  # keep every character seen in the corpus
    # --- special token ids: pad=0, eos=1, unk=2; bos_id=-1 disables BOS ---
    pad_id=0,
    unk_id=2,
    eos_id=1,
    bos_id=-1,
    # --- scaling options for a very large input file ---
    train_extremely_large_corpus=True,
    num_threads=96,  # NOTE(review): sized for the original training host
    input_sentence_size=50000000,  # cap on sentences loaded for training (50M)
    shuffle_input_sentence=True,  # sample that cap randomly, not from the head
)
|