xu-song committed on
Commit
a37f943
1 Parent(s): 0415b36
Files changed (2)
  1. examples.py +2 -2
  2. util.py +1 -1
examples.py CHANGED
@@ -1,6 +1,6 @@
 examples = {
     "en": [
-        ["spaces: 2spaces 8spaces\t1tab\t\t2tab\n1newline", "llama", "chatglm_6b"],  # chatglm has blank_n
+        ["spaces: 2spaces 8spaces\t1tab\t\t2tab\n1newline", "llama", "chatglm2_6b"],  # chatglm has blank_n
         # !?。"#$%&'()*+,-/:;<=>@[\]^_`{|}~⦅⦆「」、、〃》「」『』【】〔〕〖〗〘〙〚〛〜〝〞〟〰〾〿–—‘’‛“”„‟…‧﹏.
         ["punctuations: ,.:/?+=\",。!?;【】〔〕〖〗", "baichuan", "llama"],
         ["symbols: 🦙❤❥웃유♋☮✊☏☢☚✔☑♚▢♪✈✞÷↑↓▤▥⊙■□▣▽¿─│♥❣▬▫☿Ⓐ ✋✉☣☤", "baichuan", "llama"],
@@ -8,7 +8,7 @@ examples = {
     ]
     ,
     "zh": [
-        ["空格测试: 2个空格 8个空格", "llama", "chatglm_6b"],  # chatglm has blank_n
+        ["空格测试: 2个空格 8个空格", "llama", "chatglm2_6b"],  # chatglm has blank_n
         ["标点测试:,。!?;", "baichuan_7b", "llama"],
         ["符号测试:🦙❤❥웃유♋☮✊☏☢☚✔☑♚▢♪✈✞÷↑↓▤▥⊙■□▣▽¿─│♥❣▬▫☿Ⓐ ✋✉☣☤", "baichuan_7b", "llama"],
         ["数字测试:(10086 + 98) = 100184", "baichuan_7b", "llama"],
util.py CHANGED
@@ -31,7 +31,7 @@ def tokenize(text, tokenizer_type, color_num=5):
             token_str = token.decode("utf-8")
         except:
             token_str = token.decode("utf-8", errors="ignore")
-            logger.error("decode_error: " + json.dumps(
+            logger.error("decode_error: " + json.dumps(  # gpt_35_turbo tokens often hit decode errors; log them here
                 {"tokenizer_type": tokenizer_type, "token": str(token), "token_str": token_str},
                 ensure_ascii=False))
 
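The util.py change only adds a comment, but the surrounding try/except is the decode-with-fallback pattern: attempt a strict UTF-8 decode, and on failure decode lossily and log the offending token. A self-contained sketch of that pattern, assuming a byte-level token and standard logging; the decode_token name is illustrative, not from the repo:

    import json
    import logging

    logger = logging.getLogger(__name__)

    def decode_token(token: bytes, tokenizer_type: str) -> str:
        """Decode token bytes, falling back to lossy decoding and logging the failure."""
        try:
            return token.decode("utf-8")
        except UnicodeDecodeError:
            token_str = token.decode("utf-8", errors="ignore")
            # Byte-level BPE tokenizers such as gpt_35_turbo often emit tokens that
            # are not valid UTF-8 on their own, so the error is expected and logged.
            logger.error("decode_error: " + json.dumps(
                {"tokenizer_type": tokenizer_type, "token": str(token), "token_str": token_str},
                ensure_ascii=False))
            return token_str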