bayartsogt committed on
Commit
2ddcd1e
1 Parent(s): 8637116

adding code

Files changed (2)
  1. create_mn_gpt2_config.py +6 -0
  2. train_mn_tokenizer.py +26 -0
create_mn_gpt2_config.py ADDED
@@ -0,0 +1,6 @@
+ from transformers import GPT2Config
+
+ model_dir = "./" # ${MODEL_DIR}
+
+ config = GPT2Config.from_pretrained("gpt2", resid_pdrop=0.0, embd_pdrop=0.0, attn_pdrop=0.0)
+ config.save_pretrained(model_dir)
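
For context (not part of this commit): a minimal sketch of how the config written by create_mn_gpt2_config.py could be consumed later, assuming a training script reads it back from the same model_dir; the GPT2LMHeadModel usage and the print call are illustrative choices, not taken from this repository.

from transformers import GPT2Config, GPT2LMHeadModel

# Re-load the config saved above (path assumed to match model_dir in the commit)
config = GPT2Config.from_pretrained("./")

# Fresh, randomly initialized GPT-2 whose dropout probabilities are all 0.0
model = GPT2LMHeadModel(config)
print(model.num_parameters())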
train_mn_tokenizer.py ADDED
@@ -0,0 +1,26 @@
+ from datasets import load_dataset
+ from tokenizers import trainers, Tokenizer, normalizers, ByteLevelBPETokenizer
+
+ model_dir = "./" # ${MODEL_DIR}
+
+ # load dataset
+ dataset = load_dataset("oscar", "unshuffled_deduplicated_mn", split="train")
+
+ # Instantiate tokenizer
+ tokenizer = ByteLevelBPETokenizer()
+
+ def batch_iterator(batch_size=1000):
+     for i in range(0, len(dataset), batch_size):
+         yield dataset[i: i + batch_size]["text"]
+
+ # Customized training
+ tokenizer.train_from_iterator(batch_iterator(), vocab_size=50265, min_frequency=2, special_tokens=[
+     "<s>",
+     "<pad>",
+     "</s>",
+     "<unk>",
+     "<mask>",
+ ])
+
+ # Save files to disk
+ tokenizer.save(f"{model_dir}/tokenizer.json")
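
As a hedged follow-up (not included in the commit): one way the saved tokenizer.json might be loaded back for use with transformers is to wrap it in PreTrainedTokenizerFast, re-declaring the same special tokens that were passed to train_from_iterator; the sample sentence below is only an illustration.

from transformers import PreTrainedTokenizerFast

# Wrap the trained byte-level BPE tokenizer for use with transformers;
# special-token names mirror those listed in train_mn_tokenizer.py.
tokenizer = PreTrainedTokenizerFast(
    tokenizer_file="./tokenizer.json",  # {model_dir}/tokenizer.json from the commit
    bos_token="<s>",
    eos_token="</s>",
    unk_token="<unk>",
    pad_token="<pad>",
    mask_token="<mask>",
)

# Token ids for a short Mongolian greeting ("Hello, how are you?")
print(tokenizer.encode("Сайн байна уу?"))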