from transformers import AutoTokenizer
from vocab import TokenizerType

tokenizer = AutoTokenizer.from_pretrained("baichuan-inc/Baichuan2-7B-Chat", trust_remote_code=True)


# byte-level BPE tokenizer (SentencePiece-based)
tokenizer.type = TokenizerType.ByteBPE

tokenizer.comments = "expands the vocabulary size from 64000 in Baichuan1 to 125696"
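
# Optional sanity check: a minimal sketch, assuming the remote tokenizer loaded via
# trust_remote_code exposes the standard PreTrainedTokenizer interface and that the
# vocabulary size stated in the comment above (125696) is current.
if __name__ == "__main__":
    print(len(tokenizer))                          # expected: 125696
    print(tokenizer.tokenize("hello, Baichuan2"))  # byte-level BPE pieces for a short sample string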