aapot committed on
Commit
333c248
1 Parent(s): ba320be

Upload train_sentencepiece.py

Browse files
Files changed (1) hide show
  1. train_sentencepiece.py +6 -0
train_sentencepiece.py ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ import sentencepiece as spm
2
+
3
+ spm.SentencePieceTrainer.train(input="/researchdisk/lm_training_dataset_full_sentences/train.txt", model_prefix='spiece', vocab_size=32000, character_coverage=1.0,
4
+ pad_id=0, unk_id=2, eos_id=1, bos_id=-1,
5
+ train_extremely_large_corpus=True,
6
+ num_threads=96, input_sentence_size=50000000, shuffle_input_sentence=True)