import os

import datasets

from t5_tokenizer_model import SentencePieceUnigramTokenizer


# Target vocabulary size handed to the tokenizer trainer.
vocab_size = 32_000
# Number of dataset rows to feed to training; None means use the whole split.
input_sentence_size = None
# Directory that receives the trained tokenizer.json.
model_dir = "."

# Load the training split of the OSCAR "unshuffled_deduplicated_sv" configuration
# (presumably the Swedish subset -- confirm against the dataset card).
dataset = datasets.load_dataset("oscar", name="unshuffled_deduplicated_sv", split="train")

# Tokenizer instance to be trained, configured with its special tokens.
tokenizer = SentencePieceUnigramTokenizer(unk_token="<unk>", eos_token="</s>", pad_token="<pad>")


def batch_iterator(input_sentence_size=None, batch_length=100):
    """Yield successive batches of raw text from the module-level ``dataset``.

    Args:
        input_sentence_size: Maximum number of rows to read, counted from
            the start of the dataset. ``None`` reads every row. A value
            larger than the dataset is capped at ``len(dataset)``.
        batch_length: Maximum number of rows per yielded batch.

    Yields:
        The ``"text"`` column of each consecutive slice of ``dataset``.
        The final batch may be shorter than ``batch_length``.

    Raises:
        ValueError: If ``batch_length`` is less than 1 (raised when
            iteration starts, because this is a generator).
    """
    if batch_length < 1:
        raise ValueError(f"batch_length must be a positive integer, got {batch_length}")

    total_rows = len(dataset)
    if input_sentence_size is None:
        limit = total_rows
    else:
        # Never ask for more rows than the dataset actually holds.
        limit = min(input_sentence_size, total_rows)

    for start in range(0, limit, batch_length):
        # Clip the slice end so the last batch never runs past the limit.
        stop = min(start + batch_length, limit)
        yield dataset[start:stop]["text"]


# Train the tokenizer on batches of text drawn from the dataset.
tokenizer.train_from_iterator(
    iterator=batch_iterator(input_sentence_size=input_sentence_size),
    vocab_size=vocab_size,
    show_progress=True,
)

# Create the output directory if needed so the save cannot fail on a missing
# path after training has already finished, then write the tokenizer file.
os.makedirs(model_dir, exist_ok=True)
tokenizer.save(os.path.join(model_dir, "tokenizer.json"))