xu-song committed
Commit: a6aee1d
Parent: 6b70021

add zephyr

vocab/__init__.py CHANGED
@@ -70,7 +70,7 @@ uniq_tokenizers = [
     ""
 ]
 
-# TODO: alias/abbr, hf_path, tokenizer_class/type, comments,
+# TODO: alias/abbr, description, hf_path, tokenizer_class/type, comments, Organization
 all_tokenizers = [
     ##### bert family
     ("bert_base_cased", "", "bert"),
@@ -99,7 +99,7 @@ all_tokenizers = [
     ("chatyuan_large_v2", "", "sentencepiece"),
     ("prompt_clue", "", "sentencepiece"),
 
-    ("llama", "", "sentencepiece"),  # 'single-char Chinese tokens': 700, 'multi-char Chinese tokens': 0
+    ("llama", "", "sentencepiece", "llama uses single digits and thus needs 4 tokens to encode the number 1000"),  # 'single-char Chinese tokens': 700, 'multi-char Chinese tokens': 0
     ("llama2", "", "sentencepiece"),
     ("chinese_llama", "", "sentencepiece"),
     ("chinese_llama2", "", "sentencepiece"),
@@ -168,6 +168,7 @@ all_tokenizers = [
     ("gemma_7b",),
     ("olmo_7b",),
     ("aya_101",),
+    ("zephyr_7b_beta",)
 ]
 
 all_tokenizers = [tokenizer[0] for tokenizer in all_tokenizers]
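
The inline note on the llama entry above is checkable. Here is a minimal sketch (not part of this commit), assuming transformers is installed; huggyllama/llama-7b is used purely as an illustrative, openly hosted Llama tokenizer:

from transformers import AutoTokenizer

# Sketch only: huggyllama/llama-7b stands in for any Llama-family
# SentencePiece tokenizer, which splits numbers into single digits.
tok = AutoTokenizer.from_pretrained("huggyllama/llama-7b")

pieces = tok.tokenize("1000")
print(pieces)  # expected: one piece per digit (plus a possible leading '▁'),
               # i.e. the four digit tokens behind the comment above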
vocab/zephyr_7b_beta/__init__.py ADDED
@@ -0,0 +1,5 @@
+
+
+from transformers import AutoTokenizer
+
+tokenizer = AutoTokenizer.from_pretrained("HuggingFaceH4/zephyr-7b-beta")
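
For context, a minimal usage sketch (not part of the commit) of the tokenizer this new module exposes; the import path assumes the repository layout shown above:

from vocab.zephyr_7b_beta import tokenizer

# Round-trip a sample string through the newly added tokenizer.
ids = tokenizer.encode("Hello, world!")
print(ids)
print(tokenizer.convert_ids_to_tokens(ids))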