diff --git "a/stats/character_stats.json" "b/stats/character_stats.json" --- "a/stats/character_stats.json" +++ "b/stats/character_stats.json" @@ -1,1769 +1,1788 @@ -{ - "FacebookAI/xlm-roberta-base": { - "tokenizer": "xlm-roberta-base", - "organization": "Facebook", - "vocab_size": 250002, - "num(digit)": 2728, - "len(digit)": "1,3,9", - "num(space)": 1, - "len(space)": "1,1,1", - "num(ar)": 14644, - "len(ar)": "1,4,16", - "num(zh)": 18457, - "len(zh)": "1,2,16", - "num(ja)": 20572, - "len(ja)": "1,2,16", - "num(ja-kana)": 3434, - "len(ja-kana)": "1,3,12", - "num(ko)": 5373, - "len(ko)": "1,2,8" - }, - "clue/roberta_chinese_clue_tiny": { - "tokenizer": "roberta-chinese-clue", - "organization": "CLUE", - "vocab_size": 8021, - "num(digit)": 230, - "len(digit)": "1,4,10", - "num(space)": 0, - "len(space)": "-", - "num(ar)": 30, - "len(ar)": "1,2,3", - "num(zh)": 5689, - "len(zh)": "1,1,1", - "num(ja)": 5691, - "len(ja)": "1,1,3", - "num(ja-kana)": 0, - "len(ja-kana)": "-", - "num(ko)": 0, - "len(ko)": "-" - }, - "dbmdz/bert-base-german-uncased": { - "tokenizer": "bert-base-german-uncased", - "organization": "dbmdz", - "vocab_size": 31102, - "num(digit)": 1733, - "len(digit)": "1,4,12", - "num(space)": 0, - "len(space)": "-", - "num(ar)": 0, - "len(ar)": "-", - "num(zh)": 0, - "len(zh)": "-", - "num(ja)": 0, - "len(ja)": "-", - "num(ja-kana)": 0, - "len(ja-kana)": "-", - "num(ko)": 0, - "len(ko)": "-" - }, - "google-bert/bert-base-cased": { - "tokenizer": "bert-base-cased", - "organization": "Google", - "vocab_size": 28996, - "num(digit)": 926, - "len(digit)": "1,4,11", - "num(space)": 0, - "len(space)": "-", - "num(ar)": 94, - "len(ar)": "1,3,4", - "num(zh)": 226, - "len(zh)": "1,2,3", - "num(ja)": 390, - "len(ja)": "1,2,3", - "num(ja-kana)": 164, - "len(ja-kana)": "1,2,3", - "num(ko)": 10, - "len(ko)": "1,2,3" - }, - "google-bert/bert-base-chinese": { - "tokenizer": "bert-base-chinese", - "organization": "Google", - "vocab_size": 21128, - "num(digit)": 1451, - "len(digit)": "1,3,12", - "num(space)": 2, - "len(space)": "1,2,3", - "num(ar)": 30, - "len(ar)": "1,2,3", - "num(zh)": 14642, - "len(zh)": "1,2,3", - "num(ja)": 15197, - "len(ja)": "1,3,15", - "num(ja-kana)": 553, - "len(ja-kana)": "1,3,15", - "num(ko)": 0, - "len(ko)": "-" - }, - "google-bert/bert-base-german-cased": { - "tokenizer": "bert-base-german-cased", - "organization": "Google", - "vocab_size": 30000, - "num(digit)": 4065, - "len(digit)": "1,11,22", - "num(space)": 0, - "len(space)": "-", - "num(ar)": 0, - "len(ar)": "-", - "num(zh)": 0, - "len(zh)": "-", - "num(ja)": 0, - "len(ja)": "-", - "num(ja-kana)": 0, - "len(ja-kana)": "-", - "num(ko)": 0, - "len(ko)": "-" - }, - "google-bert/bert-base-multilingual-cased": { - "tokenizer": "bert-base-multilingual-cased", - "organization": "Google", - "vocab_size": 119547, - "num(digit)": 2583, - "len(digit)": "1,3,13", - "num(space)": 0, - "len(space)": "-", - "num(ar)": 4873, - "len(ar)": "1,5,14", - "num(zh)": 13542, - "len(zh)": "1,2,3", - "num(ja)": 14880, - "len(ja)": "1,3,10", - "num(ja-kana)": 1336, - "len(ja-kana)": "1,4,10", - "num(ko)": 3271, - "len(ko)": "1,3,6" - }, - "google-bert/bert-base-multilingual-uncased": { - "tokenizer": "bert-base-multilingual-uncased", - "organization": "Google", - "vocab_size": 105879, - "num(digit)": 2510, - "len(digit)": "1,3,13", - "num(space)": 2, - "len(space)": "1,2,3", - "num(ar)": 4530, - "len(ar)": "1,5,13", - "num(zh)": 16658, - "len(zh)": "1,2,3", - "num(ja)": 17858, - "len(ja)": "1,3,10", - "num(ja-kana)": 1188, - "len(ja-kana)": "1,4,10", - "num(ko)": 0, - "len(ko)": "-" - }, - "google-bert/bert-base-uncased": { - "tokenizer": "bert-base-uncased", - "organization": "Google", - "vocab_size": 30522, - "num(digit)": 2056, - "len(digit)": "1,4,11", - "num(space)": 0, - "len(space)": "-", - "num(ar)": 88, - "len(ar)": "1,3,5", - "num(zh)": 488, - "len(zh)": "1,2,3", - "num(ja)": 676, - "len(ja)": "1,2,3", - "num(ja-kana)": 188, - "len(ja-kana)": "1,2,3", - "num(ko)": 0, - "len(ko)": "-" - }, - "google/mobilebert-uncased": { - "tokenizer": "mobilebert-uncased", - "organization": "Google", - "vocab_size": 30522, - "num(digit)": 2056, - "len(digit)": "1,4,11", - "num(space)": 0, - "len(space)": "-", - "num(ar)": 88, - "len(ar)": "1,3,5", - "num(zh)": 488, - "len(zh)": "1,2,3", - "num(ja)": 676, - "len(ja)": "1,2,3", - "num(ja-kana)": 188, - "len(ja-kana)": "1,2,3", - "num(ko)": 0, - "len(ko)": "-" - }, - "tohoku-nlp/bert-base-japanese": { - "tokenizer": "bert-base-japanese", - "organization": "Tohoku", - "vocab_size": 32000, - "num(digit)": 669, - "len(digit)": "1,3,5", - "num(space)": 0, - "len(space)": "-", - "num(ar)": 10, - "len(ar)": "1,3,3", - "num(zh)": 18792, - "len(zh)": "1,2,11", - "num(ja)": 28367, - "len(ja)": "1,2,13", - "num(ja-kana)": 12359, - "len(ja-kana)": "1,4,13", - "num(ko)": 0, - "len(ko)": "-" - }, - "gpt-4": { - "tokenizer": "gpt-4", - "organization": "OpenAI", - "vocab_size": 100277, - "num(digit)": 1110, - "len(digit)": "1,3,3", - "num(space)": 47472, - "len(space)": "1,7,128", - "num(ar)": 113, - "len(ar)": "1,2,10", - "num(zh)": 868, - "len(zh)": "1,1,7", - "num(ja)": 1035, - "len(ja)": "1,1,7", - "num(ja-kana)": 169, - "len(ja-kana)": "1,1,7", - "num(ko)": 299, - "len(ko)": "1,2,4" - }, - "llama3": { - "tokenizer": "llama3", - "organization": "Meta", - "vocab_size": 128256, - "num(digit)": 1110, - "len(digit)": "1,3,3", - "num(space)": 60860, - "len(space)": "1,6,128", - "num(ar)": 3810, - "len(ar)": "1,4,11", - "num(zh)": 4424, - "len(zh)": "1,1,7", - "num(ja)": 5387, - "len(ja)": "1,2,8", - "num(ja-kana)": 1086, - "len(ja-kana)": "1,2,8", - "num(ko)": 2281, - "len(ko)": "1,2,6" - }, - "google-t5/t5-large": { - "tokenizer": "t5", - "organization": "Google", - "vocab_size": 32100, - "num(digit)": 1133, - "len(digit)": "1,3,13", - "num(space)": 0, - "len(space)": "-", - "num(ar)": 0, - "len(ar)": "-", - "num(zh)": 0, - "len(zh)": "-", - "num(ja)": 0, - "len(ja)": "-", - "num(ja-kana)": 0, - "len(ja-kana)": "-", - "num(ko)": 0, - "len(ko)": "-" - }, - "google/byt5-small": { - "tokenizer": "byt5-small", - "organization": "Google", - "vocab_size": 384, - "num(digit)": 10, - "len(digit)": "1,1,1", - "num(space)": 10, - "len(space)": "1,1,1", - "num(ar)": 0, - "len(ar)": "-", - "num(zh)": 0, - "len(zh)": "-", - "num(ja)": 0, - "len(ja)": "-", - "num(ja-kana)": 0, - "len(ja-kana)": "-", - "num(ko)": 0, - "len(ko)": "-" - }, - "google/mt5-large": { - "tokenizer": "mt5-large", - "organization": "Google", - "vocab_size": 250100, - "num(digit)": 16829, - "len(digit)": "1,4,16", - "num(space)": 1, - "len(space)": "1,1,1", - "num(ar)": 7459, - "len(ar)": "1,3,16", - "num(zh)": 21489, - "len(zh)": "1,2,16", - "num(ja)": 27078, - "len(ja)": "1,2,16", - "num(ja-kana)": 9160, - "len(ja-kana)": "1,3,14", - "num(ko)": 4041, - "len(ko)": "1,1,10" - }, - "lmsys/fastchat-t5-3b-v1.0": { - "tokenizer": "fastchat-t5-3b-v1.0", - "organization": "LMSYS", - "vocab_size": 32110, - "num(digit)": 1033, - "len(digit)": "1,3,8", - "num(space)": 0, - "len(space)": "-", - "num(ar)": 0, - "len(ar)": "-", - "num(zh)": 0, - "len(zh)": "-", - "num(ja)": 0, - "len(ja)": "-", - "num(ja-kana)": 0, - "len(ja-kana)": "-", - "num(ko)": 0, - "len(ko)": "-" - }, - "paust/pko-t5-large": { - "tokenizer": "pko-t5-large", - "organization": "PAUST", - "vocab_size": 50358, - "num(digit)": 51, - "len(digit)": "1,2,3", - "num(space)": 10, - "len(space)": "1,1,1", - "num(ar)": 0, - "len(ar)": "-", - "num(zh)": 0, - "len(zh)": "-", - "num(ja)": 0, - "len(ja)": "-", - "num(ja-kana)": 0, - "len(ja-kana)": "-", - "num(ko)": 49050, - "len(ko)": "1,2,16" - }, - "bloom": { - "tokenizer": "bloom", - "organization": "BigScience", - "vocab_size": 250680, - "num(digit)": 6629, - "len(digit)": "1,4,50", - "num(space)": 140180, - "len(space)": "1,6,600", - "num(ar)": 20854, - "len(ar)": "1,5,16", - "num(zh)": 30603, - "len(zh)": "1,2,23", - "num(ja)": 30816, - "len(ja)": "1,2,23", - "num(ja-kana)": 214, - "len(ja-kana)": "1,1,3", - "num(ko)": 338, - "len(ko)": "1,1,3" - }, - "llama": { - "tokenizer": "llama", - "organization": "Meta", - "vocab_size": 32000, - "num(digit)": 20, - "len(digit)": "1,1,1", - "num(space)": 61, - "len(space)": "1,2,15", - "num(ar)": 55, - "len(ar)": "1,1,2", - "num(zh)": 700, - "len(zh)": "1,1,1", - "num(ja)": 837, - "len(ja)": "1,1,1", - "num(ja-kana)": 137, - "len(ja-kana)": "1,1,1", - "num(ko)": 111, - "len(ko)": "1,1,1" - }, - "ClueAI/ChatYuan-large-v2": { - "tokenizer": "ChatYuan-large-v2", - "organization": "CLUE", - "vocab_size": 32128, - "num(digit)": 740, - "len(digit)": "1,3,9", - "num(space)": 0, - "len(space)": "-", - "num(ar)": 2, - "len(ar)": "1,1,1", - "num(zh)": 29591, - "len(zh)": "1,2,16", - "num(ja)": 29736, - "len(ja)": "1,2,16", - "num(ja-kana)": 145, - "len(ja-kana)": "1,1,2", - "num(ko)": 0, - "len(ko)": "-" - }, - "Meta/llama3": { - "tokenizer": "llama3", - "organization": "Meta", - "vocab_size": 128256, - "num(digit)": 1110, - "len(digit)": "1,3,3", - "num(space)": 60860, - "len(space)": "1,6,128", - "num(ar)": 3810, - "len(ar)": "1,4,11", - "num(zh)": 4424, - "len(zh)": "1,1,7", - "num(ja)": 5387, - "len(ja)": "1,2,8", - "num(ja-kana)": 1086, - "len(ja-kana)": "1,2,8", - "num(ko)": 2281, - "len(ko)": "1,2,6" - }, - "openai/gpt-4": { - "tokenizer": "gpt-4", - "organization": "OpenAI", - "vocab_size": 100277, - "num(digit)": 1110, - "len(digit)": "1,3,3", - "num(space)": 47472, - "len(space)": "1,7,128", - "num(ar)": 113, - "len(ar)": "1,2,10", - "num(zh)": 868, - "len(zh)": "1,1,7", - "num(ja)": 1035, - "len(ja)": "1,1,7", - "num(ja-kana)": 169, - "len(ja-kana)": "1,1,7", - "num(ko)": 299, - "len(ko)": "1,2,4" - }, - "gradientai/Llama-3-8B-Instruct-Gradient-1048k": { - "tokenizer": "llama3", - "organization": "Meta", - "vocab_size": 128256, - "num(digit)": 1110, - "len(digit)": "1,3,3", - "num(space)": 60860, - "len(space)": "1,6,128", - "num(ar)": 3810, - "len(ar)": "1,4,11", - "num(zh)": 4424, - "len(zh)": "1,1,7", - "num(ja)": 5387, - "len(ja)": "1,2,8", - "num(ja-kana)": 1086, - "len(ja-kana)": "1,2,8", - "num(ko)": 2281, - "len(ko)": "1,2,6" - }, - "bigscience/bloom": { - "tokenizer": "bloom", - "organization": "BigScience", - "vocab_size": 250680, - "num(digit)": 6629, - "len(digit)": "1,4,50", - "num(space)": 140180, - "len(space)": "1,6,600", - "num(ar)": 20854, - "len(ar)": "1,5,16", - "num(zh)": 30603, - "len(zh)": "1,2,23", - "num(ja)": 30816, - "len(ja)": "1,2,23", - "num(ja-kana)": 214, - "len(ja-kana)": "1,1,3", - "num(ko)": 338, - "len(ko)": "1,1,3" - }, - "huggyllama/llama-7b": { - "tokenizer": "llama", - "organization": "Meta", - "vocab_size": 32000, - "num(digit)": 20, - "len(digit)": "1,1,1", - "num(space)": 61, - "len(space)": "1,2,15", - "num(ar)": 55, - "len(ar)": "1,1,2", - "num(zh)": 700, - "len(zh)": "1,1,1", - "num(ja)": 837, - "len(ja)": "1,1,1", - "num(ja-kana)": 137, - "len(ja-kana)": "1,1,1", - "num(ko)": 111, - "len(ko)": "1,1,1" - }, - "baichuan-inc/Baichuan-7B": { - "tokenizer": "baichuan", - "organization": "Baichuan", - "vocab_size": 64000, - "num(digit)": 335, - "len(digit)": "1,14,14", - "num(space)": 13, - "len(space)": "1,1,1", - "num(ar)": 299, - "len(ar)": "1,1,2", - "num(zh)": 27676, - "len(zh)": "1,1,9", - "num(ja)": 28522, - "len(ja)": "1,1,9", - "num(ja-kana)": 178, - "len(ja-kana)": "1,1,1", - "num(ko)": 1591, - "len(ko)": "1,1,1" - }, - "01-ai/Yi-34B": { - "tokenizer": "Yi-34B", - "organization": "Yi", - "vocab_size": 64000, - "num(digit)": 200, - "len(digit)": "1,13,15", - "num(space)": 24274, - "len(space)": "1,7,16", - "num(ar)": 18, - "len(ar)": "1,1,4", - "num(zh)": 21356, - "len(zh)": "1,2,12", - "num(ja)": 21407, - "len(ja)": "1,2,12", - "num(ja-kana)": 51, - "len(ja-kana)": "1,1,2", - "num(ko)": 28, - "len(ko)": "1,1,2" - }, - "01-ai/Yi-6B": { - "tokenizer": "Yi-6B", - "organization": "Yi", - "vocab_size": 64000, - "num(digit)": 200, - "len(digit)": "1,13,15", - "num(space)": 24274, - "len(space)": "1,7,16", - "num(ar)": 18, - "len(ar)": "1,1,4", - "num(zh)": 21356, - "len(zh)": "1,2,12", - "num(ja)": 21407, - "len(ja)": "1,2,12", - "num(ja-kana)": 51, - "len(ja-kana)": "1,1,2", - "num(ko)": 28, - "len(ko)": "1,1,2" - }, - "01-ai/Yi-VL-34B": { - "tokenizer": "Yi-VL-34B", - "organization": "Yi", - "vocab_size": 64000, - "num(digit)": 200, - "len(digit)": "1,13,15", - "num(space)": 43, - "len(space)": "1,2,15", - "num(ar)": 18, - "len(ar)": "1,1,4", - "num(zh)": 21356, - "len(zh)": "1,2,12", - "num(ja)": 21407, - "len(ja)": "1,2,12", - "num(ja-kana)": 51, - "len(ja-kana)": "1,1,2", - "num(ko)": 28, - "len(ko)": "1,1,2" - }, - "ClassCat/gpt2-base-french": { - "tokenizer": "gpt2-base-french", - "organization": "ClassCat", - "vocab_size": 50000, - "num(digit)": 1833, - "len(digit)": "1,4,5", - "num(space)": 31889, - "len(space)": "1,7,32", - "num(ar)": 41, - "len(ar)": "1,1,4", - "num(zh)": 27, - "len(zh)": "1,1,1", - "num(ja)": 46, - "len(ja)": "1,1,2", - "num(ja-kana)": 19, - "len(ja-kana)": "1,1,2", - "num(ko)": 0, - "len(ko)": "-" - }, - "ClassCat/gpt2-base-spanish": { - "tokenizer": "gpt2-base-spanish", - "organization": "ClassCat", - "vocab_size": 50000, - "num(digit)": 1492, - "len(digit)": "1,4,9", - "num(space)": 34496, - "len(space)": "1,8,32", - "num(ar)": 36, - "len(ar)": "1,1,4", - "num(zh)": 13, - "len(zh)": "1,1,1", - "num(ja)": 36, - "len(ja)": "1,1,2", - "num(ja-kana)": 23, - "len(ja-kana)": "1,1,2", - "num(ko)": 0, - "len(ko)": "-" - }, - "ClueAI/PromptCLUE-base": { - "tokenizer": "PromptCLUE-base", - "organization": "CLUE", - "vocab_size": 32128, - "num(digit)": 740, - "len(digit)": "1,3,9", - "num(space)": 0, - "len(space)": "-", - "num(ar)": 2, - "len(ar)": "1,1,1", - "num(zh)": 29591, - "len(zh)": "1,2,16", - "num(ja)": 29736, - "len(ja)": "1,2,16", - "num(ja-kana)": 145, - "len(ja-kana)": "1,1,2", - "num(ko)": 0, - "len(ko)": "-" - }, - "CohereForAI/aya-101": { - "tokenizer": "aya-101", - "organization": "Cohere For AI", - "vocab_size": 250100, - "num(digit)": 16829, - "len(digit)": "1,4,16", - "num(space)": 1, - "len(space)": "1,1,1", - "num(ar)": 7459, - "len(ar)": "1,3,16", - "num(zh)": 21489, - "len(zh)": "1,2,16", - "num(ja)": 27078, - "len(ja)": "1,2,16", - "num(ja-kana)": 9160, - "len(ja-kana)": "1,3,14", - "num(ko)": 4041, - "len(ko)": "1,1,10" - }, - "EleutherAI/gpt-neox-20b": { - "tokenizer": "gpt-neox-20b", - "organization": "EleutherAI", - "vocab_size": 50277, - "num(digit)": 2036, - "len(digit)": "1,3,35", - "num(space)": 28996, - "len(space)": "1,7,512", - "num(ar)": 94, - "len(ar)": "1,2,4", - "num(zh)": 313, - "len(zh)": "1,1,2", - "num(ja)": 480, - "len(ja)": "1,1,4", - "num(ja-kana)": 167, - "len(ja-kana)": "1,1,4", - "num(ko)": 25, - "len(ko)": "1,1,2" - }, - "HuggingFaceH4/starchat-alpha": { - "tokenizer": "starchat-alpha", - "organization": "-", - "vocab_size": 49156, - "num(digit)": 10, - "len(digit)": "1,1,1", - "num(space)": 16515, - "len(space)": "1,6,256", - "num(ar)": 84, - "len(ar)": "1,2,4", - "num(zh)": 2030, - "len(zh)": "1,1,7", - "num(ja)": 2368, - "len(ja)": "1,1,8", - "num(ja-kana)": 360, - "len(ja-kana)": "1,2,8", - "num(ko)": 491, - "len(ko)": "1,2,5" - }, - "HuggingFaceH4/zephyr-7b-beta": { - "tokenizer": "zephyr-7b-beta", - "organization": "HuggingFace", - "vocab_size": 32000, - "num(digit)": 20, - "len(digit)": "1,1,1", - "num(space)": 85, - "len(space)": "1,3,15", - "num(ar)": 71, - "len(ar)": "1,1,2", - "num(zh)": 1459, - "len(zh)": "1,1,2", - "num(ja)": 1593, - "len(ja)": "1,1,2", - "num(ja-kana)": 134, - "len(ja-kana)": "1,1,1", - "num(ko)": 346, - "len(ko)": "1,1,1" - }, - "LLM360/CrystalCoder": { - "tokenizer": "CrystalCoder", - "organization": "MBZUAI", - "vocab_size": 32022, - "num(digit)": 20, - "len(digit)": "1,1,1", - "num(space)": 61, - "len(space)": "1,2,15", - "num(ar)": 55, - "len(ar)": "1,1,2", - "num(zh)": 700, - "len(zh)": "1,1,1", - "num(ja)": 837, - "len(ja)": "1,1,1", - "num(ja-kana)": 137, - "len(ja-kana)": "1,1,1", - "num(ko)": 111, - "len(ko)": "1,1,1" - }, - "NousResearch/Llama-2-7b-chat-hf": { - "tokenizer": "llama2", - "organization": "Meta", - "vocab_size": 32001, - "num(digit)": 20, - "len(digit)": "1,1,1", - "num(space)": 61, - "len(space)": "1,2,15", - "num(ar)": 55, - "len(ar)": "1,1,2", - "num(zh)": 700, - "len(zh)": "1,1,1", - "num(ja)": 837, - "len(ja)": "1,1,1", - "num(ja-kana)": 137, - "len(ja-kana)": "1,1,1", - "num(ko)": 111, - "len(ko)": "1,1,1" - }, - "OrionStarAI/Orion-14B-Chat": { - "tokenizer": "Orion-14B-Chat", - "organization": "OrionStar", - "vocab_size": 84608, - "num(digit)": 1559, - "len(digit)": "1,4,14", - "num(space)": 18383, - "len(space)": "1,6,16", - "num(ar)": 102, - "len(ar)": "1,1,1", - "num(zh)": 46998, - "len(zh)": "1,2,16", - "num(ja)": 49644, - "len(ja)": "1,2,16", - "num(ja-kana)": 2987, - "len(ja-kana)": "1,3,11", - "num(ko)": 5110, - "len(ko)": "1,2,7" - }, - "Qwen/Qwen-7B-Chat": { - "tokenizer": "Qwen", - "organization": "Alibaba", - "vocab_size": 151851, - "num(digit)": 10, - "len(digit)": "1,1,1", - "num(space)": 55883, - "len(space)": "1,6,128", - "num(ar)": 4018, - "len(ar)": "1,3,12", - "num(zh)": 25557, - "len(zh)": "1,2,7", - "num(ja)": 27206, - "len(ja)": "1,2,11", - "num(ja-kana)": 2089, - "len(ja-kana)": "1,3,11", - "num(ko)": 3495, - "len(ko)": "1,1,5" - }, - "Qwen/Qwen1.5-14B-Chat": { - "tokenizer": "Qwen1.5", - "organization": "Alibaba", - "vocab_size": 151646, - "num(digit)": 10, - "len(digit)": "1,1,1", - "num(space)": 55883, - "len(space)": "1,6,128", - "num(ar)": 4018, - "len(ar)": "1,3,12", - "num(zh)": 25557, - "len(zh)": "1,2,7", - "num(ja)": 27206, - "len(ja)": "1,2,11", - "num(ja-kana)": 2089, - "len(ja-kana)": "1,3,11", - "num(ko)": 3495, - "len(ko)": "1,1,5" - }, - "Skywork/Skywork-13B-Math": { - "tokenizer": "Skywork-13B-Math", - "organization": "Kunlun", - "vocab_size": 65519, - "num(digit)": 20, - "len(digit)": "1,1,1", - "num(space)": 62, - "len(space)": "1,2,15", - "num(ar)": 56, - "len(ar)": "1,1,2", - "num(zh)": 33913, - "len(zh)": "1,2,5", - "num(ja)": 34064, - "len(ja)": "1,2,5", - "num(ja-kana)": 150, - "len(ja-kana)": "1,1,1", - "num(ko)": 111, - "len(ko)": "1,1,1" - }, - "Skywork/Skywork-13B-base": { - "tokenizer": "Skywork-13B-base", - "organization": "Kunlun", - "vocab_size": 65519, - "num(digit)": 20, - "len(digit)": "1,1,1", - "num(space)": 62, - "len(space)": "1,2,15", - "num(ar)": 56, - "len(ar)": "1,1,2", - "num(zh)": 33913, - "len(zh)": "1,2,5", - "num(ja)": 34064, - "len(ja)": "1,2,5", - "num(ja-kana)": 150, - "len(ja-kana)": "1,1,1", - "num(ko)": 111, - "len(ko)": "1,1,1" - }, - "THUDM/chatglm-6b": { - "tokenizer": "chatglm-6b", - "organization": "Tsinghua", - "vocab_size": 130344, - "num(digit)": 20, - "len(digit)": "1,1,1", - "num(space)": 93, - "len(space)": "1,34,80", - "num(ar)": 137, - "len(ar)": "1,2,4", - "num(zh)": 61358, - "len(zh)": "1,2,16", - "num(ja)": 61784, - "len(ja)": "1,2,16", - "num(ja-kana)": 439, - "len(ja-kana)": "1,2,5", - "num(ko)": 114, - "len(ko)": "1,1,3" - }, - "THUDM/chatglm2-6b": { - "tokenizer": "chatglm2-6b", - "organization": "Tsinghua", - "vocab_size": 64787, - "num(digit)": 20, - "len(digit)": "1,1,1", - "num(space)": 67, - "len(space)": "1,2,15", - "num(ar)": 57, - "len(ar)": "1,1,2", - "num(zh)": 30922, - "len(zh)": "1,2,16", - "num(ja)": 31065, - "len(ja)": "1,2,16", - "num(ja-kana)": 143, - "len(ja-kana)": "1,1,1", - "num(ko)": 604, - "len(ko)": "1,1,1" - }, - "THUDM/chatglm3-6b": { - "tokenizer": "chatglm3-6b", - "organization": "Tsinghua", - "vocab_size": 64796, - "num(digit)": 20, - "len(digit)": "1,1,1", - "num(space)": 67, - "len(space)": "1,2,15", - "num(ar)": 57, - "len(ar)": "1,1,2", - "num(zh)": 30922, - "len(zh)": "1,2,16", - "num(ja)": 31065, - "len(ja)": "1,2,16", - "num(ja-kana)": 143, - "len(ja-kana)": "1,1,1", - "num(ko)": 604, - "len(ko)": "1,1,1" - }, - "TigerResearch/tigerbot-13b-chat-v2": { - "tokenizer": "tigerbot-13b-chat-v2", - "organization": "Tigerobo", - "vocab_size": 60515, - "num(digit)": 20, - "len(digit)": "1,1,1", - "num(space)": 61, - "len(space)": "1,2,15", - "num(ar)": 55, - "len(ar)": "1,1,2", - "num(zh)": 28603, - "len(zh)": "1,2,16", - "num(ja)": 28770, - "len(ja)": "1,2,16", - "num(ja-kana)": 167, - "len(ja-kana)": "1,1,2", - "num(ko)": 261, - "len(ko)": "1,1,1" - }, - "TigerResearch/tigerbot-70b-chat-v4-4k": { - "tokenizer": "tigerbot-70b-chat-v4-4k", - "organization": "Tigerobo", - "vocab_size": 65110, - "num(digit)": 20, - "len(digit)": "1,1,1", - "num(space)": 61, - "len(space)": "1,2,15", - "num(ar)": 55, - "len(ar)": "1,1,2", - "num(zh)": 30509, - "len(zh)": "1,2,16", - "num(ja)": 32061, - "len(ja)": "1,2,16", - "num(ja-kana)": 2071, - "len(ja-kana)": "1,2,8", - "num(ko)": 1504, - "len(ko)": "1,1,5" - }, - "Upstage/SOLAR-10.7B-v1.0": { - "tokenizer": "SOLAR-10.7B-v1.0", - "organization": "-", - "vocab_size": 32000, - "num(digit)": 20, - "len(digit)": "1,1,1", - "num(space)": 85, - "len(space)": "1,3,15", - "num(ar)": 71, - "len(ar)": "1,1,2", - "num(zh)": 1459, - "len(zh)": "1,1,2", - "num(ja)": 1593, - "len(ja)": "1,1,2", - "num(ja-kana)": 134, - "len(ja-kana)": "1,1,1", - "num(ko)": 346, - "len(ko)": "1,1,1" - }, - "WizardLM/WizardCoder-15B-V1.0": { - "tokenizer": "WizardCoder-15B-V1.0", - "organization": "Microsoft", - "vocab_size": 49153, - "num(digit)": 10, - "len(digit)": "1,1,1", - "num(space)": 16515, - "len(space)": "1,6,256", - "num(ar)": 84, - "len(ar)": "1,2,4", - "num(zh)": 2030, - "len(zh)": "1,1,7", - "num(ja)": 2368, - "len(ja)": "1,1,8", - "num(ja-kana)": 360, - "len(ja-kana)": "1,2,8", - "num(ko)": 491, - "len(ko)": "1,2,5" - }, - "WizardLM/WizardCoder-Python-7B-V1.0": { - "tokenizer": "WizardCoder-Python-7B-V1.0", - "organization": "Microsoft", - "vocab_size": 32001, - "num(digit)": 20, - "len(digit)": "1,1,1", - "num(space)": 61, - "len(space)": "1,2,15", - "num(ar)": 55, - "len(ar)": "1,1,2", - "num(zh)": 700, - "len(zh)": "1,1,1", - "num(ja)": 837, - "len(ja)": "1,1,1", - "num(ja-kana)": 137, - "len(ja-kana)": "1,1,1", - "num(ko)": 111, - "len(ko)": "1,1,1" - }, - "WizardLM/WizardLM-7B-V1.0": { - "tokenizer": "WizardLM-7B-V1.0", - "organization": "Microsoft", - "vocab_size": 32001, - "num(digit)": 20, - "len(digit)": "1,1,1", - "num(space)": 61, - "len(space)": "1,2,15", - "num(ar)": 55, - "len(ar)": "1,1,2", - "num(zh)": 700, - "len(zh)": "1,1,1", - "num(ja)": 837, - "len(ja)": "1,1,1", - "num(ja-kana)": 137, - "len(ja-kana)": "1,1,1", - "num(ko)": 111, - "len(ko)": "1,1,1" - }, - "WizardLM/WizardMath-70B-V1.0": { - "tokenizer": "WizardMath-70B-V1.0", - "organization": "Microsoft", - "vocab_size": 32002, - "num(digit)": 20, - "len(digit)": "1,1,1", - "num(space)": 61, - "len(space)": "1,2,15", - "num(ar)": 55, - "len(ar)": "1,1,2", - "num(zh)": 700, - "len(zh)": "1,1,1", - "num(ja)": 837, - "len(ja)": "1,1,1", - "num(ja-kana)": 137, - "len(ja-kana)": "1,1,1", - "num(ko)": 111, - "len(ko)": "1,1,1" - }, - "abeja/gpt-neox-japanese-2.7b": { - "tokenizer": "gpt-neox-japanese-2.7b", - "organization": "ABEJA", - "vocab_size": 32000, - "num(digit)": 20, - "len(digit)": "1,1,1", - "num(space)": 0, - "len(space)": "-", - "num(ar)": 0, - "len(ar)": "-", - "num(zh)": 15176, - "len(zh)": "1,2,2", - "num(ja)": 31482, - "len(ja)": "1,2,3", - "num(ja-kana)": 16306, - "len(ja-kana)": "1,3,3", - "num(ko)": 0, - "len(ko)": "-" - }, - "ai21labs/Jamba-v0.1": { - "tokenizer": "Jamba-v0.1", - "organization": "AI21", - "vocab_size": 65536, - "num(digit)": 1556, - "len(digit)": "1,16,17", - "num(space)": 39501, - "len(space)": "1,7,32", - "num(ar)": 867, - "len(ar)": "1,3,8", - "num(zh)": 1157, - "len(zh)": "1,1,2", - "num(ja)": 1287, - "len(ja)": "1,1,2", - "num(ja-kana)": 130, - "len(ja-kana)": "1,1,2", - "num(ko)": 312, - "len(ko)": "1,1,2" - }, - "allenai/OLMo-7B": { - "tokenizer": "OLMo-7B", - "organization": "Allen AI", - "vocab_size": 50280, - "num(digit)": 2036, - "len(digit)": "1,3,35", - "num(space)": 29019, - "len(space)": "1,7,512", - "num(ar)": 94, - "len(ar)": "1,2,4", - "num(zh)": 313, - "len(zh)": "1,1,2", - "num(ja)": 480, - "len(ja)": "1,1,4", - "num(ja-kana)": 167, - "len(ja-kana)": "1,1,4", - "num(ko)": 25, - "len(ko)": "1,1,2" - }, - "baichuan-inc/Baichuan2-7B-Chat": { - "tokenizer": "baichuan2", - "organization": "Baichuan", - "vocab_size": 125696, - "num(digit)": 1023, - "len(digit)": "1,14,14", - "num(space)": 26013, - "len(space)": "1,7,32", - "num(ar)": 335, - "len(ar)": "1,1,27", - "num(zh)": 70398, - "len(zh)": "1,2,32", - "num(ja)": 71269, - "len(ja)": "1,2,32", - "num(ja-kana)": 206, - "len(ja-kana)": "1,1,9", - "num(ko)": 1595, - "len(ko)": "1,1,2" - }, - "ckiplab/gpt2-base-chinese": { - "tokenizer": "gpt2-base-chinese", - "organization": "SINICA", - "vocab_size": 21128, - "num(digit)": 1451, - "len(digit)": "1,3,12", - "num(space)": 2, - "len(space)": "1,2,3", - "num(ar)": 30, - "len(ar)": "1,2,3", - "num(zh)": 14642, - "len(zh)": "1,2,3", - "num(ja)": 15197, - "len(ja)": "1,3,15", - "num(ja-kana)": 553, - "len(ja-kana)": "1,3,15", - "num(ko)": 0, - "len(ko)": "-" - }, - "cyberagent/open-calm-7b": { - "tokenizer": "open-calm-7b", - "organization": "CyberAgent", - "vocab_size": 52000, - "num(digit)": 690, - "len(digit)": "1,3,5", - "num(space)": 1698, - "len(space)": "1,4,33", - "num(ar)": 10, - "len(ar)": "1,1,4", - "num(zh)": 30775, - "len(zh)": "1,3,31", - "num(ja)": 45790, - "len(ja)": "1,3,31", - "num(ja-kana)": 32535, - "len(ja-kana)": "1,3,31", - "num(ko)": 0, - "len(ko)": "-" - }, - "databricks/dbrx-instruct": { - "tokenizer": "dbrx-instruct", - "organization": "Databricks", - "vocab_size": 100280, - "num(digit)": 1126, - "len(digit)": "1,3,17", - "num(space)": 47400, - "len(space)": "1,7,128", - "num(ar)": 113, - "len(ar)": "1,2,10", - "num(zh)": 868, - "len(zh)": "1,1,7", - "num(ja)": 1035, - "len(ja)": "1,1,7", - "num(ja-kana)": 169, - "len(ja-kana)": "1,1,7", - "num(ko)": 299, - "len(ko)": "1,2,4" - }, - "deepseek-ai/DeepSeek-V2": { - "tokenizer": "DeepSeek-V2", - "organization": "DeepSeek", - "vocab_size": 100002, - "num(digit)": 10, - "len(digit)": "1,1,1", - "num(space)": 48073, - "len(space)": "1,7,128", - "num(ar)": 48, - "len(ar)": "1,1,4", - "num(zh)": 18052, - "len(zh)": "1,2,16", - "num(ja)": 18090, - "len(ja)": "1,2,16", - "num(ja-kana)": 38, - "len(ja-kana)": "1,1,2", - "num(ko)": 16, - "len(ko)": "1,1,2" - }, - "deepseek-ai/deepseek-coder-33b-instruct": { - "tokenizer": "deepseek-coder-33b-instruct", - "organization": "DeepSeek", - "vocab_size": 32022, - "num(digit)": 10, - "len(digit)": "1,1,1", - "num(space)": 15254, - "len(space)": "1,6,65", - "num(ar)": 12, - "len(ar)": "1,1,2", - "num(zh)": 4803, - "len(zh)": "1,2,4", - "num(ja)": 4804, - "len(ja)": "1,2,4", - "num(ja-kana)": 1, - "len(ja-kana)": "1,1,1", - "num(ko)": 0, - "len(ko)": "-" - }, - "deepseek-ai/deepseek-llm-7b-base": { - "tokenizer": "deepseek-llm-7b-base", - "organization": "DeepSeek", - "vocab_size": 100015, - "num(digit)": 10, - "len(digit)": "1,1,1", - "num(space)": 48073, - "len(space)": "1,7,128", - "num(ar)": 48, - "len(ar)": "1,1,4", - "num(zh)": 18052, - "len(zh)": "1,2,16", - "num(ja)": 18090, - "len(ja)": "1,2,16", - "num(ja-kana)": 38, - "len(ja-kana)": "1,1,2", - "num(ko)": 16, - "len(ko)": "1,1,2" - }, - "eson/kplug-base-encoder": { - "tokenizer": "kplug", - "organization": "JD", - "vocab_size": 10261, - "num(digit)": 420, - "len(digit)": "1,3,12", - "num(space)": 0, - "len(space)": "-", - "num(ar)": 0, - "len(ar)": "-", - "num(zh)": 5764, - "len(zh)": "1,1,1", - "num(ja)": 5766, - "len(ja)": "1,1,3", - "num(ja-kana)": 0, - "len(ja-kana)": "-", - "num(ko)": 0, - "len(ko)": "-" - }, - "fnlp/moss-moon-003-sft": { - "tokenizer": "moss-moon-003-sft", - "organization": "Fudan", - "vocab_size": 106072, - "num(digit)": 1848, - "len(digit)": "1,3,16", - "num(space)": 33566, - "len(space)": "1,7,102", - "num(ar)": 25, - "len(ar)": "1,1,4", - "num(zh)": 54230, - "len(zh)": "1,2,15", - "num(ja)": 54381, - "len(ja)": "1,2,15", - "num(ja-kana)": 152, - "len(ja-kana)": "1,1,7", - "num(ko)": 0, - "len(ko)": "-" - }, - "google/gemma-7b": { - "tokenizer": "gemma-7b", - "organization": "Google", - "vocab_size": 256000, - "num(digit)": 134, - "len(digit)": "1,10,12", - "num(space)": 125662, - "len(space)": "1,7,31", - "num(ar)": 6274, - "len(ar)": "1,4,15", - "num(zh)": 23767, - "len(zh)": "1,2,12", - "num(ja)": 28852, - "len(ja)": "1,2,12", - "num(ja-kana)": 7061, - "len(ja-kana)": "1,3,12", - "num(ko)": 2295, - "len(ko)": "1,1,5" - }, - "google/switch-c-2048": { - "tokenizer": "switch-c-2048", - "organization": "Google", - "vocab_size": 32100, - "num(digit)": 1133, - "len(digit)": "1,3,13", - "num(space)": 0, - "len(space)": "-", - "num(ar)": 0, - "len(ar)": "-", - "num(zh)": 0, - "len(zh)": "-", - "num(ja)": 0, - "len(ja)": "-", - "num(ja-kana)": 0, - "len(ja-kana)": "-", - "num(ko)": 0, - "len(ko)": "-" - }, - "hfl/chinese-alpaca-lora-7b": { - "tokenizer": "chinese-alpaca-lora-7b", - "organization": "-", - "vocab_size": 49954, - "num(digit)": 614, - "len(digit)": "1,3,5", - "num(space)": 61, - "len(space)": "1,2,15", - "num(ar)": 55, - "len(ar)": "1,1,2", - "num(zh)": 17839, - "len(zh)": "1,2,13", - "num(ja)": 17993, - "len(ja)": "1,2,13", - "num(ja-kana)": 154, - "len(ja-kana)": "1,1,1", - "num(ko)": 135, - "len(ko)": "1,1,1" - }, - "hfl/chinese-llama-2-7b": { - "tokenizer": "chinese-llama-2-7b", - "organization": "-", - "vocab_size": 55296, - "num(digit)": 20, - "len(digit)": "1,1,1", - "num(space)": 61, - "len(space)": "1,2,15", - "num(ar)": 55, - "len(ar)": "1,1,2", - "num(zh)": 23974, - "len(zh)": "1,2,16", - "num(ja)": 24111, - "len(ja)": "1,2,16", - "num(ja-kana)": 137, - "len(ja-kana)": "1,1,1", - "num(ko)": 111, - "len(ko)": "1,1,1" - }, - "hfl/chinese-llama-lora-7b": { - "tokenizer": "chinese-llama-lora-7b", - "organization": "-", - "vocab_size": 49953, - "num(digit)": 614, - "len(digit)": "1,3,5", - "num(space)": 61, - "len(space)": "1,2,15", - "num(ar)": 55, - "len(ar)": "1,1,2", - "num(zh)": 17839, - "len(zh)": "1,2,13", - "num(ja)": 17993, - "len(ja)": "1,2,13", - "num(ja-kana)": 154, - "len(ja-kana)": "1,1,1", - "num(ko)": 135, - "len(ko)": "1,1,1" - }, - "hfl/llama-3-chinese-8b": { - "tokenizer": "llama-3-chinese-8b", - "organization": "-", - "vocab_size": 128256, - "num(digit)": 1110, - "len(digit)": "1,3,3", - "num(space)": 60860, - "len(space)": "1,6,128", - "num(ar)": 3810, - "len(ar)": "1,4,11", - "num(zh)": 4424, - "len(zh)": "1,1,7", - "num(ja)": 5387, - "len(ja)": "1,2,8", - "num(ja-kana)": 1086, - "len(ja-kana)": "1,2,8", - "num(ko)": 2281, - "len(ko)": "1,2,6" - }, - "hpcai-tech/grok-1": { - "tokenizer": "grok-1", - "organization": "xAI", - "vocab_size": 131072, - "num(digit)": 40, - "len(digit)": "1,6,13", - "num(space)": 399, - "len(space)": "1,3,16", - "num(ar)": 69, - "len(ar)": "1,2,4", - "num(zh)": 1626, - "len(zh)": "1,2,7", - "num(ja)": 3118, - "len(ja)": "1,2,8", - "num(ja-kana)": 1908, - "len(ja-kana)": "1,2,8", - "num(ko)": 67, - "len(ko)": "1,1,2" - }, - "internlm/internlm-chat-7b": { - "tokenizer": "internlm-chat-7b", - "organization": "Shanghai AI Lab", - "vocab_size": 103168, - "num(digit)": 1259, - "len(digit)": "1,3,19", - "num(space)": 33008, - "len(space)": "1,6,128", - "num(ar)": 6702, - "len(ar)": "1,4,16", - "num(zh)": 32000, - "len(zh)": "1,2,15", - "num(ja)": 32866, - "len(ja)": "1,2,15", - "num(ja-kana)": 864, - "len(ja-kana)": "1,2,9", - "num(ko)": 298, - "len(ko)": "1,1,1" - }, - "internlm/internlm-xcomposer-7b": { - "tokenizer": "internlm-xcomposer-7b", - "organization": "Shanghai AI Lab", - "vocab_size": 103168, - "num(digit)": 1261, - "len(digit)": "1,3,19", - "num(space)": 33008, - "len(space)": "1,6,128", - "num(ar)": 6702, - "len(ar)": "1,4,16", - "num(zh)": 32000, - "len(zh)": "1,2,15", - "num(ja)": 32866, - "len(ja)": "1,2,15", - "num(ja-kana)": 864, - "len(ja-kana)": "1,2,9", - "num(ko)": 298, - "len(ko)": "1,1,1" - }, - "internlm/internlm2-chat-7b": { - "tokenizer": "internlm2-chat-7b", - "organization": "Shanghai AI Lab", - "vocab_size": 92544, - "num(digit)": 1261, - "len(digit)": "1,3,18", - "num(space)": 28681, - "len(space)": "1,7,128", - "num(ar)": 30, - "len(ar)": "1,1,1", - "num(zh)": 31148, - "len(zh)": "1,2,15", - "num(ja)": 31296, - "len(ja)": "1,2,15", - "num(ja-kana)": 148, - "len(ja-kana)": "1,1,1", - "num(ko)": 83, - "len(ko)": "1,1,1" - }, - "internlm/internlm2-math-7b": { - "tokenizer": "internlm2-math-7b", - "organization": "Shanghai AI Lab", - "vocab_size": 92544, - "num(digit)": 1261, - "len(digit)": "1,3,18", - "num(space)": 28681, - "len(space)": "1,7,128", - "num(ar)": 30, - "len(ar)": "1,1,1", - "num(zh)": 31148, - "len(zh)": "1,2,15", - "num(ja)": 31296, - "len(ja)": "1,2,15", - "num(ja-kana)": 148, - "len(ja-kana)": "1,1,1", - "num(ko)": 83, - "len(ko)": "1,1,1" - }, - "microsoft/Phi-3-mini-4k-instruct": { - "tokenizer": "Phi-3-mini-4k-instruct", - "organization": "Microsoft", - "vocab_size": 32011, - "num(digit)": 20, - "len(digit)": "1,1,1", - "num(space)": 61, - "len(space)": "1,2,15", - "num(ar)": 55, - "len(ar)": "1,1,2", - "num(zh)": 700, - "len(zh)": "1,1,1", - "num(ja)": 837, - "len(ja)": "1,1,1", - "num(ja-kana)": 137, - "len(ja-kana)": "1,1,1", - "num(ko)": 111, - "len(ko)": "1,1,1" - }, - "microsoft/phi-1": { - "tokenizer": "phi-1", - "organization": "Microsoft", - "vocab_size": 50295, - "num(digit)": 1691, - "len(digit)": "1,3,16", - "num(space)": 33129, - "len(space)": "1,7,66", - "num(ar)": 22, - "len(ar)": "1,1,3", - "num(zh)": 51, - "len(zh)": "1,1,4", - "num(ja)": 183, - "len(ja)": "1,1,7", - "num(ja-kana)": 133, - "len(ja-kana)": "1,1,7", - "num(ko)": 0, - "len(ko)": "-" - }, - "microsoft/phi-2": { - "tokenizer": "phi-2", - "organization": "Microsoft", - "vocab_size": 50295, - "num(digit)": 1691, - "len(digit)": "1,3,16", - "num(space)": 33129, - "len(space)": "1,7,66", - "num(ar)": 22, - "len(ar)": "1,1,3", - "num(zh)": 51, - "len(zh)": "1,1,4", - "num(ja)": 183, - "len(ja)": "1,1,7", - "num(ja-kana)": 133, - "len(ja-kana)": "1,1,7", - "num(ko)": 0, - "len(ko)": "-" - }, - "mistralai/Mistral-7B-v0.1": { - "tokenizer": "Mistral-7B-v0.1", - "organization": "Mistral", - "vocab_size": 32000, - "num(digit)": 20, - "len(digit)": "1,1,1", - "num(space)": 85, - "len(space)": "1,3,15", - "num(ar)": 71, - "len(ar)": "1,1,2", - "num(zh)": 1459, - "len(zh)": "1,1,2", - "num(ja)": 1593, - "len(ja)": "1,1,2", - "num(ja-kana)": 134, - "len(ja-kana)": "1,1,1", - "num(ko)": 346, - "len(ko)": "1,1,1" - }, - "mistralai/Mixtral-8x7B-v0.1": { - "tokenizer": "Mixtral-8x7B-v0.1", - "organization": "Mistral", - "vocab_size": 32000, - "num(digit)": 20, - "len(digit)": "1,1,1", - "num(space)": 85, - "len(space)": "1,3,15", - "num(ar)": 71, - "len(ar)": "1,1,2", - "num(zh)": 1459, - "len(zh)": "1,1,2", - "num(ja)": 1593, - "len(ja)": "1,1,2", - "num(ja-kana)": 134, - "len(ja-kana)": "1,1,1", - "num(ko)": 346, - "len(ko)": "1,1,1" - }, - "openai-community/gpt2": { - "tokenizer": "gpt2", - "organization": "OpenAI", - "vocab_size": 50257, - "num(digit)": 1691, - "len(digit)": "1,3,16", - "num(space)": 33129, - "len(space)": "1,7,66", - "num(ar)": 22, - "len(ar)": "1,1,3", - "num(zh)": 51, - "len(zh)": "1,1,4", - "num(ja)": 183, - "len(ja)": "1,1,7", - "num(ja-kana)": 133, - "len(ja-kana)": "1,1,7", - "num(ko)": 0, - "len(ko)": "-" - }, - "openai/code-davinci-002": { - "tokenizer": "code-davinci-002", - "organization": "OpenAI", - "vocab_size": 50281, - "num(digit)": 1691, - "len(digit)": "1,3,16", - "num(space)": 33175, - "len(space)": "1,7,66", - "num(ar)": 22, - "len(ar)": "1,1,3", - "num(zh)": 51, - "len(zh)": "1,1,4", - "num(ja)": 183, - "len(ja)": "1,1,7", - "num(ja-kana)": 133, - "len(ja-kana)": "1,1,7", - "num(ko)": 0, - "len(ko)": "-" - }, - "openai/gpt-3.5-turbo": { - "tokenizer": "gpt-3.5-turbo", - "organization": "OpenAI", - "vocab_size": 100277, - "num(digit)": 1110, - "len(digit)": "1,3,3", - "num(space)": 47472, - "len(space)": "1,7,128", - "num(ar)": 113, - "len(ar)": "1,2,10", - "num(zh)": 868, - "len(zh)": "1,1,7", - "num(ja)": 1035, - "len(ja)": "1,1,7", - "num(ja-kana)": 169, - "len(ja-kana)": "1,1,7", - "num(ko)": 299, - "len(ko)": "1,2,4" - }, - "openai/gpt-4o": { - "tokenizer": "gpt-4o", - "organization": "OpenAI", - "vocab_size": 200019, - "num(digit)": 1110, - "len(digit)": "1,3,3", - "num(space)": 109316, - "len(space)": "1,6,128", - "num(ar)": 8055, - "len(ar)": "1,4,12", - "num(zh)": 7563, - "len(zh)": "1,2,11", - "num(ja)": 8292, - "len(ja)": "1,2,11", - "num(ja-kana)": 809, - "len(ja-kana)": "1,2,11", - "num(ko)": 2365, - "len(ko)": "1,2,8" - }, - "openai/text-davinci-003": { - "tokenizer": "text-davinci-003", - "organization": "OpenAI", - "vocab_size": 50281, - "num(digit)": 1691, - "len(digit)": "1,3,16", - "num(space)": 33175, - "len(space)": "1,7,66", - "num(ar)": 22, - "len(ar)": "1,1,3", - "num(zh)": 51, - "len(zh)": "1,1,4", - "num(ja)": 183, - "len(ja)": "1,1,7", - "num(ja-kana)": 133, - "len(ja-kana)": "1,1,7", - "num(ko)": 0, - "len(ko)": "-" - }, - "thu-coai/CharacterGLM-6B": { - "tokenizer": "CharacterGLM-6B", - "organization": "Tsinghua", - "vocab_size": 64789, - "num(digit)": 20, - "len(digit)": "1,1,1", - "num(space)": 67, - "len(space)": "1,2,15", - "num(ar)": 57, - "len(ar)": "1,1,2", - "num(zh)": 30922, - "len(zh)": "1,2,16", - "num(ja)": 31065, - "len(ja)": "1,2,16", - "num(ja-kana)": 143, - "len(ja-kana)": "1,1,1", - "num(ko)": 604, - "len(ko)": "1,1,1" - }, - "tiiuae/falcon-180b": { - "tokenizer": "falcon-180b", - "organization": "TII", - "vocab_size": 65024, - "num(digit)": 1108, - "len(digit)": "1,3,3", - "num(space)": 40202, - "len(space)": "1,7,65", - "num(ar)": 21, - "len(ar)": "1,1,4", - "num(zh)": 1627, - "len(zh)": "1,1,3", - "num(ja)": 1652, - "len(ja)": "1,1,3", - "num(ja-kana)": 25, - "len(ja-kana)": "1,1,1", - "num(ko)": 1, - "len(ko)": "1,1,1" - }, - "tiiuae/falcon-7b": { - "tokenizer": "falcon-7b", - "organization": "TII", - "vocab_size": 65024, - "num(digit)": 1108, - "len(digit)": "1,3,3", - "num(space)": 40202, - "len(space)": "1,7,65", - "num(ar)": 21, - "len(ar)": "1,1,4", - "num(zh)": 1627, - "len(zh)": "1,1,3", - "num(ja)": 1652, - "len(ja)": "1,1,3", - "num(ja-kana)": 25, - "len(ja-kana)": "1,1,1", - "num(ko)": 1, - "len(ko)": "1,1,1" - }, - "Qwen/Qwen1.5-1.8B": { - "tokenizer": "Qwen1.5-1.8B", - "organization": "Alibaba", - "vocab_size": 151646, - "num(digit)": 10, - "len(digit)": "1,1,1", - "num(space)": 55883, - "len(space)": "1,6,128", - "num(ar)": 4018, - "len(ar)": "1,3,12", - "num(zh)": 25557, - "len(zh)": "1,2,7", - "num(ja)": 27206, - "len(ja)": "1,2,11", - "num(ja-kana)": 2089, - "len(ja-kana)": "1,3,11", - "num(ko)": 3495, - "len(ko)": "1,1,5" - }, - "Qwen/Qwen1.5-110B": { - "tokenizer": "Qwen1.5-110B", - "organization": "Alibaba", - "vocab_size": 151646, - "num(digit)": 10, - "len(digit)": "1,1,1", - "num(space)": 55883, - "len(space)": "1,6,128", - "num(ar)": 4018, - "len(ar)": "1,3,12", - "num(zh)": 25557, - "len(zh)": "1,2,7", - "num(ja)": 27206, - "len(ja)": "1,2,11", - "num(ja-kana)": 2089, - "len(ja-kana)": "1,3,11", - "num(ko)": 3495, - "len(ko)": "1,1,5" - }, - "Qwen/Qwen1.5-14B": { - "tokenizer": "Qwen1.5-14B", - "organization": "Alibaba", - "vocab_size": 151646, - "num(digit)": 10, - "len(digit)": "1,1,1", - "num(space)": 55883, - "len(space)": "1,6,128", - "num(ar)": 4018, - "len(ar)": "1,3,12", - "num(zh)": 25557, - "len(zh)": "1,2,7", - "num(ja)": 27206, - "len(ja)": "1,2,11", - "num(ja-kana)": 2089, - "len(ja-kana)": "1,3,11", - "num(ko)": 3495, - "len(ko)": "1,1,5" - } +{ + "FacebookAI/xlm-roberta-base": { + "tokenizer": "xlm-roberta-base", + "organization": "Facebook", + "vocab_size": 250002, + "num(digit)": 2728, + "len(digit)": "1,3,9", + "num(space)": 1, + "len(space)": "1,1,1", + "num(ar)": 14644, + "len(ar)": "1,4,16", + "num(zh)": 18457, + "len(zh)": "1,2,16", + "num(ja)": 20572, + "len(ja)": "1,2,16", + "num(ja-kana)": 3434, + "len(ja-kana)": "1,3,12", + "num(ko)": 5373, + "len(ko)": "1,2,8" + }, + "clue/roberta_chinese_clue_tiny": { + "tokenizer": "roberta-chinese-clue", + "organization": "CLUE", + "vocab_size": 8021, + "num(digit)": 230, + "len(digit)": "1,4,10", + "num(space)": 0, + "len(space)": "-", + "num(ar)": 30, + "len(ar)": "1,2,3", + "num(zh)": 5689, + "len(zh)": "1,1,1", + "num(ja)": 5691, + "len(ja)": "1,1,3", + "num(ja-kana)": 0, + "len(ja-kana)": "-", + "num(ko)": 0, + "len(ko)": "-" + }, + "dbmdz/bert-base-german-uncased": { + "tokenizer": "bert-base-german-uncased", + "organization": "dbmdz", + "vocab_size": 31102, + "num(digit)": 1733, + "len(digit)": "1,4,12", + "num(space)": 0, + "len(space)": "-", + "num(ar)": 0, + "len(ar)": "-", + "num(zh)": 0, + "len(zh)": "-", + "num(ja)": 0, + "len(ja)": "-", + "num(ja-kana)": 0, + "len(ja-kana)": "-", + "num(ko)": 0, + "len(ko)": "-" + }, + "google-bert/bert-base-cased": { + "tokenizer": "bert-base-cased", + "organization": "Google", + "vocab_size": 28996, + "num(digit)": 926, + "len(digit)": "1,4,11", + "num(space)": 0, + "len(space)": "-", + "num(ar)": 94, + "len(ar)": "1,3,4", + "num(zh)": 226, + "len(zh)": "1,2,3", + "num(ja)": 390, + "len(ja)": "1,2,3", + "num(ja-kana)": 164, + "len(ja-kana)": "1,2,3", + "num(ko)": 10, + "len(ko)": "1,2,3" + }, + "google-bert/bert-base-chinese": { + "tokenizer": "bert-base-chinese", + "organization": "Google", + "vocab_size": 21128, + "num(digit)": 1451, + "len(digit)": "1,3,12", + "num(space)": 2, + "len(space)": "1,2,3", + "num(ar)": 30, + "len(ar)": "1,2,3", + "num(zh)": 14642, + "len(zh)": "1,2,3", + "num(ja)": 15197, + "len(ja)": "1,3,15", + "num(ja-kana)": 553, + "len(ja-kana)": "1,3,15", + "num(ko)": 0, + "len(ko)": "-" + }, + "google-bert/bert-base-german-cased": { + "tokenizer": "bert-base-german-cased", + "organization": "Google", + "vocab_size": 30000, + "num(digit)": 4065, + "len(digit)": "1,11,22", + "num(space)": 0, + "len(space)": "-", + "num(ar)": 0, + "len(ar)": "-", + "num(zh)": 0, + "len(zh)": "-", + "num(ja)": 0, + "len(ja)": "-", + "num(ja-kana)": 0, + "len(ja-kana)": "-", + "num(ko)": 0, + "len(ko)": "-" + }, + "google-bert/bert-base-multilingual-cased": { + "tokenizer": "bert-base-multilingual-cased", + "organization": "Google", + "vocab_size": 119547, + "num(digit)": 2583, + "len(digit)": "1,3,13", + "num(space)": 0, + "len(space)": "-", + "num(ar)": 4873, + "len(ar)": "1,5,14", + "num(zh)": 13542, + "len(zh)": "1,2,3", + "num(ja)": 14880, + "len(ja)": "1,3,10", + "num(ja-kana)": 1336, + "len(ja-kana)": "1,4,10", + "num(ko)": 3271, + "len(ko)": "1,3,6" + }, + "google-bert/bert-base-multilingual-uncased": { + "tokenizer": "bert-base-multilingual-uncased", + "organization": "Google", + "vocab_size": 105879, + "num(digit)": 2510, + "len(digit)": "1,3,13", + "num(space)": 2, + "len(space)": "1,2,3", + "num(ar)": 4530, + "len(ar)": "1,5,13", + "num(zh)": 16658, + "len(zh)": "1,2,3", + "num(ja)": 17858, + "len(ja)": "1,3,10", + "num(ja-kana)": 1188, + "len(ja-kana)": "1,4,10", + "num(ko)": 0, + "len(ko)": "-" + }, + "google-bert/bert-base-uncased": { + "tokenizer": "bert-base-uncased", + "organization": "Google", + "vocab_size": 30522, + "num(digit)": 2056, + "len(digit)": "1,4,11", + "num(space)": 0, + "len(space)": "-", + "num(ar)": 88, + "len(ar)": "1,3,5", + "num(zh)": 488, + "len(zh)": "1,2,3", + "num(ja)": 676, + "len(ja)": "1,2,3", + "num(ja-kana)": 188, + "len(ja-kana)": "1,2,3", + "num(ko)": 0, + "len(ko)": "-" + }, + "google/mobilebert-uncased": { + "tokenizer": "mobilebert-uncased", + "organization": "Google", + "vocab_size": 30522, + "num(digit)": 2056, + "len(digit)": "1,4,11", + "num(space)": 0, + "len(space)": "-", + "num(ar)": 88, + "len(ar)": "1,3,5", + "num(zh)": 488, + "len(zh)": "1,2,3", + "num(ja)": 676, + "len(ja)": "1,2,3", + "num(ja-kana)": 188, + "len(ja-kana)": "1,2,3", + "num(ko)": 0, + "len(ko)": "-" + }, + "tohoku-nlp/bert-base-japanese": { + "tokenizer": "bert-base-japanese", + "organization": "Tohoku", + "vocab_size": 32000, + "num(digit)": 669, + "len(digit)": "1,3,5", + "num(space)": 0, + "len(space)": "-", + "num(ar)": 10, + "len(ar)": "1,3,3", + "num(zh)": 18792, + "len(zh)": "1,2,11", + "num(ja)": 28367, + "len(ja)": "1,2,13", + "num(ja-kana)": 12359, + "len(ja-kana)": "1,4,13", + "num(ko)": 0, + "len(ko)": "-" + }, + "gpt-4": { + "tokenizer": "gpt-4", + "organization": "OpenAI", + "vocab_size": 100277, + "num(digit)": 1110, + "len(digit)": "1,3,3", + "num(space)": 47472, + "len(space)": "1,7,128", + "num(ar)": 113, + "len(ar)": "1,2,10", + "num(zh)": 868, + "len(zh)": "1,1,7", + "num(ja)": 1035, + "len(ja)": "1,1,7", + "num(ja-kana)": 169, + "len(ja-kana)": "1,1,7", + "num(ko)": 299, + "len(ko)": "1,2,4" + }, + "llama3": { + "tokenizer": "llama3", + "organization": "Meta", + "vocab_size": 128256, + "num(digit)": 1110, + "len(digit)": "1,3,3", + "num(space)": 60860, + "len(space)": "1,6,128", + "num(ar)": 3810, + "len(ar)": "1,4,11", + "num(zh)": 4424, + "len(zh)": "1,1,7", + "num(ja)": 5387, + "len(ja)": "1,2,8", + "num(ja-kana)": 1086, + "len(ja-kana)": "1,2,8", + "num(ko)": 2281, + "len(ko)": "1,2,6" + }, + "google-t5/t5-large": { + "tokenizer": "t5", + "organization": "Google", + "vocab_size": 32100, + "num(digit)": 1133, + "len(digit)": "1,3,13", + "num(space)": 0, + "len(space)": "-", + "num(ar)": 0, + "len(ar)": "-", + "num(zh)": 0, + "len(zh)": "-", + "num(ja)": 0, + "len(ja)": "-", + "num(ja-kana)": 0, + "len(ja-kana)": "-", + "num(ko)": 0, + "len(ko)": "-" + }, + "google/byt5-small": { + "tokenizer": "byt5-small", + "organization": "Google", + "vocab_size": 384, + "num(digit)": 10, + "len(digit)": "1,1,1", + "num(space)": 10, + "len(space)": "1,1,1", + "num(ar)": 0, + "len(ar)": "-", + "num(zh)": 0, + "len(zh)": "-", + "num(ja)": 0, + "len(ja)": "-", + "num(ja-kana)": 0, + "len(ja-kana)": "-", + "num(ko)": 0, + "len(ko)": "-" + }, + "google/mt5-large": { + "tokenizer": "mt5-large", + "organization": "Google", + "vocab_size": 250100, + "num(digit)": 16829, + "len(digit)": "1,4,16", + "num(space)": 1, + "len(space)": "1,1,1", + "num(ar)": 7459, + "len(ar)": "1,3,16", + "num(zh)": 21489, + "len(zh)": "1,2,16", + "num(ja)": 27078, + "len(ja)": "1,2,16", + "num(ja-kana)": 9160, + "len(ja-kana)": "1,3,14", + "num(ko)": 4041, + "len(ko)": "1,1,10" + }, + "lmsys/fastchat-t5-3b-v1.0": { + "tokenizer": "fastchat-t5-3b-v1.0", + "organization": "LMSYS", + "vocab_size": 32110, + "num(digit)": 1033, + "len(digit)": "1,3,8", + "num(space)": 0, + "len(space)": "-", + "num(ar)": 0, + "len(ar)": "-", + "num(zh)": 0, + "len(zh)": "-", + "num(ja)": 0, + "len(ja)": "-", + "num(ja-kana)": 0, + "len(ja-kana)": "-", + "num(ko)": 0, + "len(ko)": "-" + }, + "paust/pko-t5-large": { + "tokenizer": "pko-t5-large", + "organization": "PAUST", + "vocab_size": 50358, + "num(digit)": 51, + "len(digit)": "1,2,3", + "num(space)": 10, + "len(space)": "1,1,1", + "num(ar)": 0, + "len(ar)": "-", + "num(zh)": 0, + "len(zh)": "-", + "num(ja)": 0, + "len(ja)": "-", + "num(ja-kana)": 0, + "len(ja-kana)": "-", + "num(ko)": 49050, + "len(ko)": "1,2,16" + }, + "bloom": { + "tokenizer": "bloom", + "organization": "BigScience", + "vocab_size": 250680, + "num(digit)": 6629, + "len(digit)": "1,4,50", + "num(space)": 140180, + "len(space)": "1,6,600", + "num(ar)": 20854, + "len(ar)": "1,5,16", + "num(zh)": 30603, + "len(zh)": "1,2,23", + "num(ja)": 30816, + "len(ja)": "1,2,23", + "num(ja-kana)": 214, + "len(ja-kana)": "1,1,3", + "num(ko)": 338, + "len(ko)": "1,1,3" + }, + "llama": { + "tokenizer": "llama", + "organization": "Meta", + "vocab_size": 32000, + "num(digit)": 20, + "len(digit)": "1,1,1", + "num(space)": 61, + "len(space)": "1,2,15", + "num(ar)": 55, + "len(ar)": "1,1,2", + "num(zh)": 700, + "len(zh)": "1,1,1", + "num(ja)": 837, + "len(ja)": "1,1,1", + "num(ja-kana)": 137, + "len(ja-kana)": "1,1,1", + "num(ko)": 111, + "len(ko)": "1,1,1" + }, + "ClueAI/ChatYuan-large-v2": { + "tokenizer": "ChatYuan-large-v2", + "organization": "CLUE", + "vocab_size": 32128, + "num(digit)": 740, + "len(digit)": "1,3,9", + "num(space)": 0, + "len(space)": "-", + "num(ar)": 2, + "len(ar)": "1,1,1", + "num(zh)": 29591, + "len(zh)": "1,2,16", + "num(ja)": 29736, + "len(ja)": "1,2,16", + "num(ja-kana)": 145, + "len(ja-kana)": "1,1,2", + "num(ko)": 0, + "len(ko)": "-" + }, + "Meta/llama3": { + "tokenizer": "llama3", + "organization": "Meta", + "vocab_size": 128256, + "num(digit)": 1110, + "len(digit)": "1,3,3", + "num(space)": 60860, + "len(space)": "1,6,128", + "num(ar)": 3810, + "len(ar)": "1,4,11", + "num(zh)": 4424, + "len(zh)": "1,1,7", + "num(ja)": 5387, + "len(ja)": "1,2,8", + "num(ja-kana)": 1086, + "len(ja-kana)": "1,2,8", + "num(ko)": 2281, + "len(ko)": "1,2,6" + }, + "openai/gpt-4": { + "tokenizer": "gpt-4", + "organization": "OpenAI", + "vocab_size": 100277, + "num(digit)": 1110, + "len(digit)": "1,3,3", + "num(space)": 47472, + "len(space)": "1,7,128", + "num(ar)": 113, + "len(ar)": "1,2,10", + "num(zh)": 868, + "len(zh)": "1,1,7", + "num(ja)": 1035, + "len(ja)": "1,1,7", + "num(ja-kana)": 169, + "len(ja-kana)": "1,1,7", + "num(ko)": 299, + "len(ko)": "1,2,4" + }, + "gradientai/Llama-3-8B-Instruct-Gradient-1048k": { + "tokenizer": "llama3", + "organization": "Meta", + "vocab_size": 128256, + "num(digit)": 1110, + "len(digit)": "1,3,3", + "num(space)": 60860, + "len(space)": "1,6,128", + "num(ar)": 3810, + "len(ar)": "1,4,11", + "num(zh)": 4424, + "len(zh)": "1,1,7", + "num(ja)": 5387, + "len(ja)": "1,2,8", + "num(ja-kana)": 1086, + "len(ja-kana)": "1,2,8", + "num(ko)": 2281, + "len(ko)": "1,2,6" + }, + "bigscience/bloom": { + "tokenizer": "bloom", + "organization": "BigScience", + "vocab_size": 250680, + "num(digit)": 6629, + "len(digit)": "1,4,50", + "num(space)": 140180, + "len(space)": "1,6,600", + "num(ar)": 20854, + "len(ar)": "1,5,16", + "num(zh)": 30603, + "len(zh)": "1,2,23", + "num(ja)": 30816, + "len(ja)": "1,2,23", + "num(ja-kana)": 214, + "len(ja-kana)": "1,1,3", + "num(ko)": 338, + "len(ko)": "1,1,3" + }, + "huggyllama/llama-7b": { + "tokenizer": "llama", + "organization": "Meta", + "vocab_size": 32000, + "num(digit)": 20, + "len(digit)": "1,1,1", + "num(space)": 61, + "len(space)": "1,2,15", + "num(ar)": 55, + "len(ar)": "1,1,2", + "num(zh)": 700, + "len(zh)": "1,1,1", + "num(ja)": 837, + "len(ja)": "1,1,1", + "num(ja-kana)": 137, + "len(ja-kana)": "1,1,1", + "num(ko)": 111, + "len(ko)": "1,1,1" + }, + "baichuan-inc/Baichuan-7B": { + "tokenizer": "baichuan", + "organization": "Baichuan", + "vocab_size": 64000, + "num(digit)": 335, + "len(digit)": "1,14,14", + "num(space)": 13, + "len(space)": "1,1,1", + "num(ar)": 299, + "len(ar)": "1,1,2", + "num(zh)": 27676, + "len(zh)": "1,1,9", + "num(ja)": 28522, + "len(ja)": "1,1,9", + "num(ja-kana)": 178, + "len(ja-kana)": "1,1,1", + "num(ko)": 1591, + "len(ko)": "1,1,1" + }, + "01-ai/Yi-34B": { + "tokenizer": "Yi-34B", + "organization": "Yi", + "vocab_size": 64000, + "num(digit)": 200, + "len(digit)": "1,13,15", + "num(space)": 24274, + "len(space)": "1,7,16", + "num(ar)": 18, + "len(ar)": "1,1,4", + "num(zh)": 21356, + "len(zh)": "1,2,12", + "num(ja)": 21407, + "len(ja)": "1,2,12", + "num(ja-kana)": 51, + "len(ja-kana)": "1,1,2", + "num(ko)": 28, + "len(ko)": "1,1,2" + }, + "01-ai/Yi-6B": { + "tokenizer": "Yi-6B", + "organization": "Yi", + "vocab_size": 64000, + "num(digit)": 200, + "len(digit)": "1,13,15", + "num(space)": 24274, + "len(space)": "1,7,16", + "num(ar)": 18, + "len(ar)": "1,1,4", + "num(zh)": 21356, + "len(zh)": "1,2,12", + "num(ja)": 21407, + "len(ja)": "1,2,12", + "num(ja-kana)": 51, + "len(ja-kana)": "1,1,2", + "num(ko)": 28, + "len(ko)": "1,1,2" + }, + "01-ai/Yi-VL-34B": { + "tokenizer": "Yi-VL-34B", + "organization": "Yi", + "vocab_size": 64000, + "num(digit)": 200, + "len(digit)": "1,13,15", + "num(space)": 43, + "len(space)": "1,2,15", + "num(ar)": 18, + "len(ar)": "1,1,4", + "num(zh)": 21356, + "len(zh)": "1,2,12", + "num(ja)": 21407, + "len(ja)": "1,2,12", + "num(ja-kana)": 51, + "len(ja-kana)": "1,1,2", + "num(ko)": 28, + "len(ko)": "1,1,2" + }, + "ClassCat/gpt2-base-french": { + "tokenizer": "gpt2-base-french", + "organization": "ClassCat", + "vocab_size": 50000, + "num(digit)": 1833, + "len(digit)": "1,4,5", + "num(space)": 31889, + "len(space)": "1,7,32", + "num(ar)": 41, + "len(ar)": "1,1,4", + "num(zh)": 27, + "len(zh)": "1,1,1", + "num(ja)": 46, + "len(ja)": "1,1,2", + "num(ja-kana)": 19, + "len(ja-kana)": "1,1,2", + "num(ko)": 0, + "len(ko)": "-" + }, + "ClassCat/gpt2-base-spanish": { + "tokenizer": "gpt2-base-spanish", + "organization": "ClassCat", + "vocab_size": 50000, + "num(digit)": 1492, + "len(digit)": "1,4,9", + "num(space)": 34496, + "len(space)": "1,8,32", + "num(ar)": 36, + "len(ar)": "1,1,4", + "num(zh)": 13, + "len(zh)": "1,1,1", + "num(ja)": 36, + "len(ja)": "1,1,2", + "num(ja-kana)": 23, + "len(ja-kana)": "1,1,2", + "num(ko)": 0, + "len(ko)": "-" + }, + "ClueAI/PromptCLUE-base": { + "tokenizer": "PromptCLUE-base", + "organization": "CLUE", + "vocab_size": 32128, + "num(digit)": 740, + "len(digit)": "1,3,9", + "num(space)": 0, + "len(space)": "-", + "num(ar)": 2, + "len(ar)": "1,1,1", + "num(zh)": 29591, + "len(zh)": "1,2,16", + "num(ja)": 29736, + "len(ja)": "1,2,16", + "num(ja-kana)": 145, + "len(ja-kana)": "1,1,2", + "num(ko)": 0, + "len(ko)": "-" + }, + "CohereForAI/aya-101": { + "tokenizer": "aya-101", + "organization": "Cohere For AI", + "vocab_size": 250100, + "num(digit)": 16829, + "len(digit)": "1,4,16", + "num(space)": 1, + "len(space)": "1,1,1", + "num(ar)": 7459, + "len(ar)": "1,3,16", + "num(zh)": 21489, + "len(zh)": "1,2,16", + "num(ja)": 27078, + "len(ja)": "1,2,16", + "num(ja-kana)": 9160, + "len(ja-kana)": "1,3,14", + "num(ko)": 4041, + "len(ko)": "1,1,10" + }, + "EleutherAI/gpt-neox-20b": { + "tokenizer": "gpt-neox-20b", + "organization": "EleutherAI", + "vocab_size": 50277, + "num(digit)": 2036, + "len(digit)": "1,3,35", + "num(space)": 28996, + "len(space)": "1,7,512", + "num(ar)": 94, + "len(ar)": "1,2,4", + "num(zh)": 313, + "len(zh)": "1,1,2", + "num(ja)": 480, + "len(ja)": "1,1,4", + "num(ja-kana)": 167, + "len(ja-kana)": "1,1,4", + "num(ko)": 25, + "len(ko)": "1,1,2" + }, + "HuggingFaceH4/starchat-alpha": { + "tokenizer": "starchat-alpha", + "organization": "-", + "vocab_size": 49156, + "num(digit)": 10, + "len(digit)": "1,1,1", + "num(space)": 16515, + "len(space)": "1,6,256", + "num(ar)": 84, + "len(ar)": "1,2,4", + "num(zh)": 2030, + "len(zh)": "1,1,7", + "num(ja)": 2368, + "len(ja)": "1,1,8", + "num(ja-kana)": 360, + "len(ja-kana)": "1,2,8", + "num(ko)": 491, + "len(ko)": "1,2,5" + }, + "HuggingFaceH4/zephyr-7b-beta": { + "tokenizer": "zephyr-7b-beta", + "organization": "HuggingFace", + "vocab_size": 32000, + "num(digit)": 20, + "len(digit)": "1,1,1", + "num(space)": 85, + "len(space)": "1,3,15", + "num(ar)": 71, + "len(ar)": "1,1,2", + "num(zh)": 1459, + "len(zh)": "1,1,2", + "num(ja)": 1593, + "len(ja)": "1,1,2", + "num(ja-kana)": 134, + "len(ja-kana)": "1,1,1", + "num(ko)": 346, + "len(ko)": "1,1,1" + }, + "LLM360/CrystalCoder": { + "tokenizer": "CrystalCoder", + "organization": "MBZUAI", + "vocab_size": 32022, + "num(digit)": 20, + "len(digit)": "1,1,1", + "num(space)": 61, + "len(space)": "1,2,15", + "num(ar)": 55, + "len(ar)": "1,1,2", + "num(zh)": 700, + "len(zh)": "1,1,1", + "num(ja)": 837, + "len(ja)": "1,1,1", + "num(ja-kana)": 137, + "len(ja-kana)": "1,1,1", + "num(ko)": 111, + "len(ko)": "1,1,1" + }, + "NousResearch/Llama-2-7b-chat-hf": { + "tokenizer": "llama2", + "organization": "Meta", + "vocab_size": 32001, + "num(digit)": 20, + "len(digit)": "1,1,1", + "num(space)": 61, + "len(space)": "1,2,15", + "num(ar)": 55, + "len(ar)": "1,1,2", + "num(zh)": 700, + "len(zh)": "1,1,1", + "num(ja)": 837, + "len(ja)": "1,1,1", + "num(ja-kana)": 137, + "len(ja-kana)": "1,1,1", + "num(ko)": 111, + "len(ko)": "1,1,1" + }, + "OrionStarAI/Orion-14B-Chat": { + "tokenizer": "Orion-14B-Chat", + "organization": "OrionStar", + "vocab_size": 84608, + "num(digit)": 1559, + "len(digit)": "1,4,14", + "num(space)": 18383, + "len(space)": "1,6,16", + "num(ar)": 102, + "len(ar)": "1,1,1", + "num(zh)": 46998, + "len(zh)": "1,2,16", + "num(ja)": 49644, + "len(ja)": "1,2,16", + "num(ja-kana)": 2987, + "len(ja-kana)": "1,3,11", + "num(ko)": 5110, + "len(ko)": "1,2,7" + }, + "Qwen/Qwen-7B-Chat": { + "tokenizer": "Qwen", + "organization": "Alibaba", + "vocab_size": 151851, + "num(digit)": 10, + "len(digit)": "1,1,1", + "num(space)": 55883, + "len(space)": "1,6,128", + "num(ar)": 4018, + "len(ar)": "1,3,12", + "num(zh)": 25557, + "len(zh)": "1,2,7", + "num(ja)": 27206, + "len(ja)": "1,2,11", + "num(ja-kana)": 2089, + "len(ja-kana)": "1,3,11", + "num(ko)": 3495, + "len(ko)": "1,1,5" + }, + "Qwen/Qwen1.5-14B-Chat": { + "tokenizer": "Qwen1.5", + "organization": "Alibaba", + "vocab_size": 151646, + "num(digit)": 10, + "len(digit)": "1,1,1", + "num(space)": 55883, + "len(space)": "1,6,128", + "num(ar)": 4018, + "len(ar)": "1,3,12", + "num(zh)": 25557, + "len(zh)": "1,2,7", + "num(ja)": 27206, + "len(ja)": "1,2,11", + "num(ja-kana)": 2089, + "len(ja-kana)": "1,3,11", + "num(ko)": 3495, + "len(ko)": "1,1,5" + }, + "Skywork/Skywork-13B-Math": { + "tokenizer": "Skywork-13B-Math", + "organization": "Kunlun", + "vocab_size": 65519, + "num(digit)": 20, + "len(digit)": "1,1,1", + "num(space)": 62, + "len(space)": "1,2,15", + "num(ar)": 56, + "len(ar)": "1,1,2", + "num(zh)": 33913, + "len(zh)": "1,2,5", + "num(ja)": 34064, + "len(ja)": "1,2,5", + "num(ja-kana)": 150, + "len(ja-kana)": "1,1,1", + "num(ko)": 111, + "len(ko)": "1,1,1" + }, + "Skywork/Skywork-13B-base": { + "tokenizer": "Skywork-13B-base", + "organization": "Kunlun", + "vocab_size": 65519, + "num(digit)": 20, + "len(digit)": "1,1,1", + "num(space)": 62, + "len(space)": "1,2,15", + "num(ar)": 56, + "len(ar)": "1,1,2", + "num(zh)": 33913, + "len(zh)": "1,2,5", + "num(ja)": 34064, + "len(ja)": "1,2,5", + "num(ja-kana)": 150, + "len(ja-kana)": "1,1,1", + "num(ko)": 111, + "len(ko)": "1,1,1" + }, + "THUDM/chatglm-6b": { + "tokenizer": "chatglm-6b", + "organization": "Tsinghua", + "vocab_size": 130344, + "num(digit)": 20, + "len(digit)": "1,1,1", + "num(space)": 93, + "len(space)": "1,34,80", + "num(ar)": 137, + "len(ar)": "1,2,4", + "num(zh)": 61358, + "len(zh)": "1,2,16", + "num(ja)": 61784, + "len(ja)": "1,2,16", + "num(ja-kana)": 439, + "len(ja-kana)": "1,2,5", + "num(ko)": 114, + "len(ko)": "1,1,3" + }, + "THUDM/chatglm2-6b": { + "tokenizer": "chatglm2-6b", + "organization": "Tsinghua", + "vocab_size": 64787, + "num(digit)": 20, + "len(digit)": "1,1,1", + "num(space)": 67, + "len(space)": "1,2,15", + "num(ar)": 57, + "len(ar)": "1,1,2", + "num(zh)": 30922, + "len(zh)": "1,2,16", + "num(ja)": 31065, + "len(ja)": "1,2,16", + "num(ja-kana)": 143, + "len(ja-kana)": "1,1,1", + "num(ko)": 604, + "len(ko)": "1,1,1" + }, + "THUDM/chatglm3-6b": { + "tokenizer": "chatglm3-6b", + "organization": "Tsinghua", + "vocab_size": 64796, + "num(digit)": 20, + "len(digit)": "1,1,1", + "num(space)": 67, + "len(space)": "1,2,15", + "num(ar)": 57, + "len(ar)": "1,1,2", + "num(zh)": 30922, + "len(zh)": "1,2,16", + "num(ja)": 31065, + "len(ja)": "1,2,16", + "num(ja-kana)": 143, + "len(ja-kana)": "1,1,1", + "num(ko)": 604, + "len(ko)": "1,1,1" + }, + "TigerResearch/tigerbot-13b-chat-v2": { + "tokenizer": "tigerbot-13b-chat-v2", + "organization": "Tigerobo", + "vocab_size": 60515, + "num(digit)": 20, + "len(digit)": "1,1,1", + "num(space)": 61, + "len(space)": "1,2,15", + "num(ar)": 55, + "len(ar)": "1,1,2", + "num(zh)": 28603, + "len(zh)": "1,2,16", + "num(ja)": 28770, + "len(ja)": "1,2,16", + "num(ja-kana)": 167, + "len(ja-kana)": "1,1,2", + "num(ko)": 261, + "len(ko)": "1,1,1" + }, + "TigerResearch/tigerbot-70b-chat-v4-4k": { + "tokenizer": "tigerbot-70b-chat-v4-4k", + "organization": "Tigerobo", + "vocab_size": 65110, + "num(digit)": 20, + "len(digit)": "1,1,1", + "num(space)": 61, + "len(space)": "1,2,15", + "num(ar)": 55, + "len(ar)": "1,1,2", + "num(zh)": 30509, + "len(zh)": "1,2,16", + "num(ja)": 32061, + "len(ja)": "1,2,16", + "num(ja-kana)": 2071, + "len(ja-kana)": "1,2,8", + "num(ko)": 1504, + "len(ko)": "1,1,5" + }, + "Upstage/SOLAR-10.7B-v1.0": { + "tokenizer": "SOLAR-10.7B-v1.0", + "organization": "-", + "vocab_size": 32000, + "num(digit)": 20, + "len(digit)": "1,1,1", + "num(space)": 85, + "len(space)": "1,3,15", + "num(ar)": 71, + "len(ar)": "1,1,2", + "num(zh)": 1459, + "len(zh)": "1,1,2", + "num(ja)": 1593, + "len(ja)": "1,1,2", + "num(ja-kana)": 134, + "len(ja-kana)": "1,1,1", + "num(ko)": 346, + "len(ko)": "1,1,1" + }, + "WizardLM/WizardCoder-15B-V1.0": { + "tokenizer": "WizardCoder-15B-V1.0", + "organization": "Microsoft", + "vocab_size": 49153, + "num(digit)": 10, + "len(digit)": "1,1,1", + "num(space)": 16515, + "len(space)": "1,6,256", + "num(ar)": 84, + "len(ar)": "1,2,4", + "num(zh)": 2030, + "len(zh)": "1,1,7", + "num(ja)": 2368, + "len(ja)": "1,1,8", + "num(ja-kana)": 360, + "len(ja-kana)": "1,2,8", + "num(ko)": 491, + "len(ko)": "1,2,5" + }, + "WizardLM/WizardCoder-Python-7B-V1.0": { + "tokenizer": "WizardCoder-Python-7B-V1.0", + "organization": "Microsoft", + "vocab_size": 32001, + "num(digit)": 20, + "len(digit)": "1,1,1", + "num(space)": 61, + "len(space)": "1,2,15", + "num(ar)": 55, + "len(ar)": "1,1,2", + "num(zh)": 700, + "len(zh)": "1,1,1", + "num(ja)": 837, + "len(ja)": "1,1,1", + "num(ja-kana)": 137, + "len(ja-kana)": "1,1,1", + "num(ko)": 111, + "len(ko)": "1,1,1" + }, + "WizardLM/WizardLM-7B-V1.0": { + "tokenizer": "WizardLM-7B-V1.0", + "organization": "Microsoft", + "vocab_size": 32001, + "num(digit)": 20, + "len(digit)": "1,1,1", + "num(space)": 61, + "len(space)": "1,2,15", + "num(ar)": 55, + "len(ar)": "1,1,2", + "num(zh)": 700, + "len(zh)": "1,1,1", + "num(ja)": 837, + "len(ja)": "1,1,1", + "num(ja-kana)": 137, + "len(ja-kana)": "1,1,1", + "num(ko)": 111, + "len(ko)": "1,1,1" + }, + "WizardLM/WizardMath-70B-V1.0": { + "tokenizer": "WizardMath-70B-V1.0", + "organization": "Microsoft", + "vocab_size": 32002, + "num(digit)": 20, + "len(digit)": "1,1,1", + "num(space)": 61, + "len(space)": "1,2,15", + "num(ar)": 55, + "len(ar)": "1,1,2", + "num(zh)": 700, + "len(zh)": "1,1,1", + "num(ja)": 837, + "len(ja)": "1,1,1", + "num(ja-kana)": 137, + "len(ja-kana)": "1,1,1", + "num(ko)": 111, + "len(ko)": "1,1,1" + }, + "abeja/gpt-neox-japanese-2.7b": { + "tokenizer": "gpt-neox-japanese-2.7b", + "organization": "ABEJA", + "vocab_size": 32000, + "num(digit)": 20, + "len(digit)": "1,1,1", + "num(space)": 0, + "len(space)": "-", + "num(ar)": 0, + "len(ar)": "-", + "num(zh)": 15176, + "len(zh)": "1,2,2", + "num(ja)": 31482, + "len(ja)": "1,2,3", + "num(ja-kana)": 16306, + "len(ja-kana)": "1,3,3", + "num(ko)": 0, + "len(ko)": "-" + }, + "ai21labs/Jamba-v0.1": { + "tokenizer": "Jamba-v0.1", + "organization": "AI21", + "vocab_size": 65536, + "num(digit)": 1556, + "len(digit)": "1,16,17", + "num(space)": 39501, + "len(space)": "1,7,32", + "num(ar)": 867, + "len(ar)": "1,3,8", + "num(zh)": 1157, + "len(zh)": "1,1,2", + "num(ja)": 1287, + "len(ja)": "1,1,2", + "num(ja-kana)": 130, + "len(ja-kana)": "1,1,2", + "num(ko)": 312, + "len(ko)": "1,1,2" + }, + "allenai/OLMo-7B": { + "tokenizer": "OLMo-7B", + "organization": "Allen AI", + "vocab_size": 50280, + "num(digit)": 2036, + "len(digit)": "1,3,35", + "num(space)": 29019, + "len(space)": "1,7,512", + "num(ar)": 94, + "len(ar)": "1,2,4", + "num(zh)": 313, + "len(zh)": "1,1,2", + "num(ja)": 480, + "len(ja)": "1,1,4", + "num(ja-kana)": 167, + "len(ja-kana)": "1,1,4", + "num(ko)": 25, + "len(ko)": "1,1,2" + }, + "baichuan-inc/Baichuan2-7B-Chat": { + "tokenizer": "baichuan2", + "organization": "Baichuan", + "vocab_size": 125696, + "num(digit)": 1023, + "len(digit)": "1,14,14", + "num(space)": 26013, + "len(space)": "1,7,32", + "num(ar)": 335, + "len(ar)": "1,1,27", + "num(zh)": 70398, + "len(zh)": "1,2,32", + "num(ja)": 71269, + "len(ja)": "1,2,32", + "num(ja-kana)": 206, + "len(ja-kana)": "1,1,9", + "num(ko)": 1595, + "len(ko)": "1,1,2" + }, + "ckiplab/gpt2-base-chinese": { + "tokenizer": "gpt2-base-chinese", + "organization": "SINICA", + "vocab_size": 21128, + "num(digit)": 1451, + "len(digit)": "1,3,12", + "num(space)": 2, + "len(space)": "1,2,3", + "num(ar)": 30, + "len(ar)": "1,2,3", + "num(zh)": 14642, + "len(zh)": "1,2,3", + "num(ja)": 15197, + "len(ja)": "1,3,15", + "num(ja-kana)": 553, + "len(ja-kana)": "1,3,15", + "num(ko)": 0, + "len(ko)": "-" + }, + "cyberagent/open-calm-7b": { + "tokenizer": "open-calm-7b", + "organization": "CyberAgent", + "vocab_size": 52000, + "num(digit)": 690, + "len(digit)": "1,3,5", + "num(space)": 1698, + "len(space)": "1,4,33", + "num(ar)": 10, + "len(ar)": "1,1,4", + "num(zh)": 30775, + "len(zh)": "1,3,31", + "num(ja)": 45790, + "len(ja)": "1,3,31", + "num(ja-kana)": 32535, + "len(ja-kana)": "1,3,31", + "num(ko)": 0, + "len(ko)": "-" + }, + "databricks/dbrx-instruct": { + "tokenizer": "dbrx-instruct", + "organization": "Databricks", + "vocab_size": 100280, + "num(digit)": 1126, + "len(digit)": "1,3,17", + "num(space)": 47400, + "len(space)": "1,7,128", + "num(ar)": 113, + "len(ar)": "1,2,10", + "num(zh)": 868, + "len(zh)": "1,1,7", + "num(ja)": 1035, + "len(ja)": "1,1,7", + "num(ja-kana)": 169, + "len(ja-kana)": "1,1,7", + "num(ko)": 299, + "len(ko)": "1,2,4" + }, + "deepseek-ai/DeepSeek-V2": { + "tokenizer": "DeepSeek-V2", + "organization": "DeepSeek", + "vocab_size": 100002, + "num(digit)": 10, + "len(digit)": "1,1,1", + "num(space)": 48073, + "len(space)": "1,7,128", + "num(ar)": 48, + "len(ar)": "1,1,4", + "num(zh)": 18052, + "len(zh)": "1,2,16", + "num(ja)": 18090, + "len(ja)": "1,2,16", + "num(ja-kana)": 38, + "len(ja-kana)": "1,1,2", + "num(ko)": 16, + "len(ko)": "1,1,2" + }, + "deepseek-ai/deepseek-coder-33b-instruct": { + "tokenizer": "deepseek-coder-33b-instruct", + "organization": "DeepSeek", + "vocab_size": 32022, + "num(digit)": 10, + "len(digit)": "1,1,1", + "num(space)": 15254, + "len(space)": "1,6,65", + "num(ar)": 12, + "len(ar)": "1,1,2", + "num(zh)": 4803, + "len(zh)": "1,2,4", + "num(ja)": 4804, + "len(ja)": "1,2,4", + "num(ja-kana)": 1, + "len(ja-kana)": "1,1,1", + "num(ko)": 0, + "len(ko)": "-" + }, + "deepseek-ai/deepseek-llm-7b-base": { + "tokenizer": "deepseek-llm-7b-base", + "organization": "DeepSeek", + "vocab_size": 100015, + "num(digit)": 10, + "len(digit)": "1,1,1", + "num(space)": 48073, + "len(space)": "1,7,128", + "num(ar)": 48, + "len(ar)": "1,1,4", + "num(zh)": 18052, + "len(zh)": "1,2,16", + "num(ja)": 18090, + "len(ja)": "1,2,16", + "num(ja-kana)": 38, + "len(ja-kana)": "1,1,2", + "num(ko)": 16, + "len(ko)": "1,1,2" + }, + "eson/kplug-base-encoder": { + "tokenizer": "kplug", + "organization": "JD", + "vocab_size": 10261, + "num(digit)": 420, + "len(digit)": "1,3,12", + "num(space)": 0, + "len(space)": "-", + "num(ar)": 0, + "len(ar)": "-", + "num(zh)": 5764, + "len(zh)": "1,1,1", + "num(ja)": 5766, + "len(ja)": "1,1,3", + "num(ja-kana)": 0, + "len(ja-kana)": "-", + "num(ko)": 0, + "len(ko)": "-" + }, + "fnlp/moss-moon-003-sft": { + "tokenizer": "moss-moon-003-sft", + "organization": "Fudan", + "vocab_size": 106072, + "num(digit)": 1848, + "len(digit)": "1,3,16", + "num(space)": 33566, + "len(space)": "1,7,102", + "num(ar)": 25, + "len(ar)": "1,1,4", + "num(zh)": 54230, + "len(zh)": "1,2,15", + "num(ja)": 54381, + "len(ja)": "1,2,15", + "num(ja-kana)": 152, + "len(ja-kana)": "1,1,7", + "num(ko)": 0, + "len(ko)": "-" + }, + "google/gemma-7b": { + "tokenizer": "gemma-7b", + "organization": "Google", + "vocab_size": 256000, + "num(digit)": 134, + "len(digit)": "1,10,12", + "num(space)": 125662, + "len(space)": "1,7,31", + "num(ar)": 6274, + "len(ar)": "1,4,15", + "num(zh)": 23767, + "len(zh)": "1,2,12", + "num(ja)": 28852, + "len(ja)": "1,2,12", + "num(ja-kana)": 7061, + "len(ja-kana)": "1,3,12", + "num(ko)": 2295, + "len(ko)": "1,1,5" + }, + "google/switch-c-2048": { + "tokenizer": "switch-c-2048", + "organization": "Google", + "vocab_size": 32100, + "num(digit)": 1133, + "len(digit)": "1,3,13", + "num(space)": 0, + "len(space)": "-", + "num(ar)": 0, + "len(ar)": "-", + "num(zh)": 0, + "len(zh)": "-", + "num(ja)": 0, + "len(ja)": "-", + "num(ja-kana)": 0, + "len(ja-kana)": "-", + "num(ko)": 0, + "len(ko)": "-" + }, + "hfl/chinese-alpaca-lora-7b": { + "tokenizer": "chinese-alpaca-lora-7b", + "organization": "-", + "vocab_size": 49954, + "num(digit)": 614, + "len(digit)": "1,3,5", + "num(space)": 61, + "len(space)": "1,2,15", + "num(ar)": 55, + "len(ar)": "1,1,2", + "num(zh)": 17839, + "len(zh)": "1,2,13", + "num(ja)": 17993, + "len(ja)": "1,2,13", + "num(ja-kana)": 154, + "len(ja-kana)": "1,1,1", + "num(ko)": 135, + "len(ko)": "1,1,1" + }, + "hfl/chinese-llama-2-7b": { + "tokenizer": "chinese-llama-2-7b", + "organization": "-", + "vocab_size": 55296, + "num(digit)": 20, + "len(digit)": "1,1,1", + "num(space)": 61, + "len(space)": "1,2,15", + "num(ar)": 55, + "len(ar)": "1,1,2", + "num(zh)": 23974, + "len(zh)": "1,2,16", + "num(ja)": 24111, + "len(ja)": "1,2,16", + "num(ja-kana)": 137, + "len(ja-kana)": "1,1,1", + "num(ko)": 111, + "len(ko)": "1,1,1" + }, + "hfl/chinese-llama-lora-7b": { + "tokenizer": "chinese-llama-lora-7b", + "organization": "-", + "vocab_size": 49953, + "num(digit)": 614, + "len(digit)": "1,3,5", + "num(space)": 61, + "len(space)": "1,2,15", + "num(ar)": 55, + "len(ar)": "1,1,2", + "num(zh)": 17839, + "len(zh)": "1,2,13", + "num(ja)": 17993, + "len(ja)": "1,2,13", + "num(ja-kana)": 154, + "len(ja-kana)": "1,1,1", + "num(ko)": 135, + "len(ko)": "1,1,1" + }, + "hfl/llama-3-chinese-8b": { + "tokenizer": "llama-3-chinese-8b", + "organization": "-", + "vocab_size": 128256, + "num(digit)": 1110, + "len(digit)": "1,3,3", + "num(space)": 60860, + "len(space)": "1,6,128", + "num(ar)": 3810, + "len(ar)": "1,4,11", + "num(zh)": 4424, + "len(zh)": "1,1,7", + "num(ja)": 5387, + "len(ja)": "1,2,8", + "num(ja-kana)": 1086, + "len(ja-kana)": "1,2,8", + "num(ko)": 2281, + "len(ko)": "1,2,6" + }, + "hpcai-tech/grok-1": { + "tokenizer": "grok-1", + "organization": "xAI", + "vocab_size": 131072, + "num(digit)": 40, + "len(digit)": "1,6,13", + "num(space)": 399, + "len(space)": "1,3,16", + "num(ar)": 69, + "len(ar)": "1,2,4", + "num(zh)": 1626, + "len(zh)": "1,2,7", + "num(ja)": 3118, + "len(ja)": "1,2,8", + "num(ja-kana)": 1908, + "len(ja-kana)": "1,2,8", + "num(ko)": 67, + "len(ko)": "1,1,2" + }, + "internlm/internlm-chat-7b": { + "tokenizer": "internlm-chat-7b", + "organization": "Shanghai AI Lab", + "vocab_size": 103168, + "num(digit)": 1259, + "len(digit)": "1,3,19", + "num(space)": 33008, + "len(space)": "1,6,128", + "num(ar)": 6702, + "len(ar)": "1,4,16", + "num(zh)": 32000, + "len(zh)": "1,2,15", + "num(ja)": 32866, + "len(ja)": "1,2,15", + "num(ja-kana)": 864, + "len(ja-kana)": "1,2,9", + "num(ko)": 298, + "len(ko)": "1,1,1" + }, + "internlm/internlm-xcomposer-7b": { + "tokenizer": "internlm-xcomposer-7b", + "organization": "Shanghai AI Lab", + "vocab_size": 103168, + "num(digit)": 1261, + "len(digit)": "1,3,19", + "num(space)": 33008, + "len(space)": "1,6,128", + "num(ar)": 6702, + "len(ar)": "1,4,16", + "num(zh)": 32000, + "len(zh)": "1,2,15", + "num(ja)": 32866, + "len(ja)": "1,2,15", + "num(ja-kana)": 864, + "len(ja-kana)": "1,2,9", + "num(ko)": 298, + "len(ko)": "1,1,1" + }, + "internlm/internlm2-chat-7b": { + "tokenizer": "internlm2-chat-7b", + "organization": "Shanghai AI Lab", + "vocab_size": 92544, + "num(digit)": 1261, + "len(digit)": "1,3,18", + "num(space)": 28681, + "len(space)": "1,7,128", + "num(ar)": 30, + "len(ar)": "1,1,1", + "num(zh)": 31148, + "len(zh)": "1,2,15", + "num(ja)": 31296, + "len(ja)": "1,2,15", + "num(ja-kana)": 148, + "len(ja-kana)": "1,1,1", + "num(ko)": 83, + "len(ko)": "1,1,1" + }, + "internlm/internlm2-math-7b": { + "tokenizer": "internlm2-math-7b", + "organization": "Shanghai AI Lab", + "vocab_size": 92544, + "num(digit)": 1261, + "len(digit)": "1,3,18", + "num(space)": 28681, + "len(space)": "1,7,128", + "num(ar)": 30, + "len(ar)": "1,1,1", + "num(zh)": 31148, + "len(zh)": "1,2,15", + "num(ja)": 31296, + "len(ja)": "1,2,15", + "num(ja-kana)": 148, + "len(ja-kana)": "1,1,1", + "num(ko)": 83, + "len(ko)": "1,1,1" + }, + "microsoft/Phi-3-mini-4k-instruct": { + "tokenizer": "Phi-3-mini-4k-instruct", + "organization": "Microsoft", + "vocab_size": 32011, + "num(digit)": 20, + "len(digit)": "1,1,1", + "num(space)": 61, + "len(space)": "1,2,15", + "num(ar)": 55, + "len(ar)": "1,1,2", + "num(zh)": 700, + "len(zh)": "1,1,1", + "num(ja)": 837, + "len(ja)": "1,1,1", + "num(ja-kana)": 137, + "len(ja-kana)": "1,1,1", + "num(ko)": 111, + "len(ko)": "1,1,1" + }, + "microsoft/phi-1": { + "tokenizer": "phi-1", + "organization": "Microsoft", + "vocab_size": 50295, + "num(digit)": 1691, + "len(digit)": "1,3,16", + "num(space)": 33129, + "len(space)": "1,7,66", + "num(ar)": 22, + "len(ar)": "1,1,3", + "num(zh)": 51, + "len(zh)": "1,1,4", + "num(ja)": 183, + "len(ja)": "1,1,7", + "num(ja-kana)": 133, + "len(ja-kana)": "1,1,7", + "num(ko)": 0, + "len(ko)": "-" + }, + "microsoft/phi-2": { + "tokenizer": "phi-2", + "organization": "Microsoft", + "vocab_size": 50295, + "num(digit)": 1691, + "len(digit)": "1,3,16", + "num(space)": 33129, + "len(space)": "1,7,66", + "num(ar)": 22, + "len(ar)": "1,1,3", + "num(zh)": 51, + "len(zh)": "1,1,4", + "num(ja)": 183, + "len(ja)": "1,1,7", + "num(ja-kana)": 133, + "len(ja-kana)": "1,1,7", + "num(ko)": 0, + "len(ko)": "-" + }, + "mistralai/Mistral-7B-v0.1": { + "tokenizer": "Mistral-7B-v0.1", + "organization": "Mistral", + "vocab_size": 32000, + "num(digit)": 20, + "len(digit)": "1,1,1", + "num(space)": 85, + "len(space)": "1,3,15", + "num(ar)": 71, + "len(ar)": "1,1,2", + "num(zh)": 1459, + "len(zh)": "1,1,2", + "num(ja)": 1593, + "len(ja)": "1,1,2", + "num(ja-kana)": 134, + "len(ja-kana)": "1,1,1", + "num(ko)": 346, + "len(ko)": "1,1,1" + }, + "mistralai/Mixtral-8x7B-v0.1": { + "tokenizer": "Mixtral-8x7B-v0.1", + "organization": "Mistral", + "vocab_size": 32000, + "num(digit)": 20, + "len(digit)": "1,1,1", + "num(space)": 85, + "len(space)": "1,3,15", + "num(ar)": 71, + "len(ar)": "1,1,2", + "num(zh)": 1459, + "len(zh)": "1,1,2", + "num(ja)": 1593, + "len(ja)": "1,1,2", + "num(ja-kana)": 134, + "len(ja-kana)": "1,1,1", + "num(ko)": 346, + "len(ko)": "1,1,1" + }, + "openai-community/gpt2": { + "tokenizer": "gpt2", + "organization": "OpenAI", + "vocab_size": 50257, + "num(digit)": 1691, + "len(digit)": "1,3,16", + "num(space)": 33129, + "len(space)": "1,7,66", + "num(ar)": 22, + "len(ar)": "1,1,3", + "num(zh)": 51, + "len(zh)": "1,1,4", + "num(ja)": 183, + "len(ja)": "1,1,7", + "num(ja-kana)": 133, + "len(ja-kana)": "1,1,7", + "num(ko)": 0, + "len(ko)": "-" + }, + "openai/code-davinci-002": { + "tokenizer": "code-davinci-002", + "organization": "OpenAI", + "vocab_size": 50281, + "num(digit)": 1691, + "len(digit)": "1,3,16", + "num(space)": 33175, + "len(space)": "1,7,66", + "num(ar)": 22, + "len(ar)": "1,1,3", + "num(zh)": 51, + "len(zh)": "1,1,4", + "num(ja)": 183, + "len(ja)": "1,1,7", + "num(ja-kana)": 133, + "len(ja-kana)": "1,1,7", + "num(ko)": 0, + "len(ko)": "-" + }, + "openai/gpt-3.5-turbo": { + "tokenizer": "gpt-3.5-turbo", + "organization": "OpenAI", + "vocab_size": 100277, + "num(digit)": 1110, + "len(digit)": "1,3,3", + "num(space)": 47472, + "len(space)": "1,7,128", + "num(ar)": 113, + "len(ar)": "1,2,10", + "num(zh)": 868, + "len(zh)": "1,1,7", + "num(ja)": 1035, + "len(ja)": "1,1,7", + "num(ja-kana)": 169, + "len(ja-kana)": "1,1,7", + "num(ko)": 299, + "len(ko)": "1,2,4" + }, + "openai/gpt-4o": { + "tokenizer": "gpt-4o", + "organization": "OpenAI", + "vocab_size": 200019, + "num(digit)": 1110, + "len(digit)": "1,3,3", + "num(space)": 109316, + "len(space)": "1,6,128", + "num(ar)": 8055, + "len(ar)": "1,4,12", + "num(zh)": 7563, + "len(zh)": "1,2,11", + "num(ja)": 8292, + "len(ja)": "1,2,11", + "num(ja-kana)": 809, + "len(ja-kana)": "1,2,11", + "num(ko)": 2365, + "len(ko)": "1,2,8" + }, + "openai/text-davinci-003": { + "tokenizer": "text-davinci-003", + "organization": "OpenAI", + "vocab_size": 50281, + "num(digit)": 1691, + "len(digit)": "1,3,16", + "num(space)": 33175, + "len(space)": "1,7,66", + "num(ar)": 22, + "len(ar)": "1,1,3", + "num(zh)": 51, + "len(zh)": "1,1,4", + "num(ja)": 183, + "len(ja)": "1,1,7", + "num(ja-kana)": 133, + "len(ja-kana)": "1,1,7", + "num(ko)": 0, + "len(ko)": "-" + }, + "thu-coai/CharacterGLM-6B": { + "tokenizer": "CharacterGLM-6B", + "organization": "Tsinghua", + "vocab_size": 64789, + "num(digit)": 20, + "len(digit)": "1,1,1", + "num(space)": 67, + "len(space)": "1,2,15", + "num(ar)": 57, + "len(ar)": "1,1,2", + "num(zh)": 30922, + "len(zh)": "1,2,16", + "num(ja)": 31065, + "len(ja)": "1,2,16", + "num(ja-kana)": 143, + "len(ja-kana)": "1,1,1", + "num(ko)": 604, + "len(ko)": "1,1,1" + }, + "tiiuae/falcon-180b": { + "tokenizer": "falcon-180b", + "organization": "TII", + "vocab_size": 65024, + "num(digit)": 1108, + "len(digit)": "1,3,3", + "num(space)": 40202, + "len(space)": "1,7,65", + "num(ar)": 21, + "len(ar)": "1,1,4", + "num(zh)": 1627, + "len(zh)": "1,1,3", + "num(ja)": 1652, + "len(ja)": "1,1,3", + "num(ja-kana)": 25, + "len(ja-kana)": "1,1,1", + "num(ko)": 1, + "len(ko)": "1,1,1" + }, + "tiiuae/falcon-7b": { + "tokenizer": "falcon-7b", + "organization": "TII", + "vocab_size": 65024, + "num(digit)": 1108, + "len(digit)": "1,3,3", + "num(space)": 40202, + "len(space)": "1,7,65", + "num(ar)": 21, + "len(ar)": "1,1,4", + "num(zh)": 1627, + "len(zh)": "1,1,3", + "num(ja)": 1652, + "len(ja)": "1,1,3", + "num(ja-kana)": 25, + "len(ja-kana)": "1,1,1", + "num(ko)": 1, + "len(ko)": "1,1,1" + }, + "Qwen/Qwen1.5-1.8B": { + "tokenizer": "Qwen1.5-1.8B", + "organization": "Alibaba", + "vocab_size": 151646, + "num(digit)": 10, + "len(digit)": "1,1,1", + "num(space)": 55883, + "len(space)": "1,6,128", + "num(ar)": 4018, + "len(ar)": "1,3,12", + "num(zh)": 25557, + "len(zh)": "1,2,7", + "num(ja)": 27206, + "len(ja)": "1,2,11", + "num(ja-kana)": 2089, + "len(ja-kana)": "1,3,11", + "num(ko)": 3495, + "len(ko)": "1,1,5" + }, + "Qwen/Qwen1.5-110B": { + "tokenizer": "Qwen1.5-110B", + "organization": "Alibaba", + "vocab_size": 151646, + "num(digit)": 10, + "len(digit)": "1,1,1", + "num(space)": 55883, + "len(space)": "1,6,128", + "num(ar)": 4018, + "len(ar)": "1,3,12", + "num(zh)": 25557, + "len(zh)": "1,2,7", + "num(ja)": 27206, + "len(ja)": "1,2,11", + "num(ja-kana)": 2089, + "len(ja-kana)": "1,3,11", + "num(ko)": 3495, + "len(ko)": "1,1,5" + }, + "Qwen/Qwen1.5-14B": { + "tokenizer": "Qwen1.5-14B", + "organization": "Alibaba", + "vocab_size": 151646, + "num(digit)": 10, + "len(digit)": "1,1,1", + "num(space)": 55883, + "len(space)": "1,6,128", + "num(ar)": 4018, + "len(ar)": "1,3,12", + "num(zh)": 25557, + "len(zh)": "1,2,7", + "num(ja)": 27206, + "len(ja)": "1,2,11", + "num(ja-kana)": 2089, + "len(ja-kana)": "1,3,11", + "num(ko)": 3495, + "len(ko)": "1,1,5" + }, + "asafaya/bert-base-arabic": { + "tokenizer": "bert-base-arabic", + "organization": "-", + "vocab_size": 32000, + "num(digit)": 507, + "len(digit)": "1,3,21", + "num(space)": 0, + "len(space)": "-", + "num(ar)": 28367, + "len(ar)": "1,5,34", + "num(zh)": 180, + "len(zh)": "1,1,1", + "num(ja)": 333, + "len(ja)": "1,1,3", + "num(ja-kana)": 153, + "len(ja-kana)": "1,1,3", + "num(ko)": 0, + "len(ko)": "-" + } } \ No newline at end of file