diff --git "a/stats/character_stats.json" "b/stats/character_stats.json"
--- "a/stats/character_stats.json"
+++ "b/stats/character_stats.json"
@@ -1,1769 +1,1788 @@
-{
- "FacebookAI/xlm-roberta-base": {
- "tokenizer": "xlm-roberta-base",
- "organization": "Facebook",
- "vocab_size": 250002,
- "num(digit)": 2728,
- "len(digit)": "1,3,9",
- "num(space)": 1,
- "len(space)": "1,1,1",
- "num(ar)": 14644,
- "len(ar)": "1,4,16",
- "num(zh)": 18457,
- "len(zh)": "1,2,16",
- "num(ja)": 20572,
- "len(ja)": "1,2,16",
- "num(ja-kana)": 3434,
- "len(ja-kana)": "1,3,12",
- "num(ko)": 5373,
- "len(ko)": "1,2,8"
- },
- "clue/roberta_chinese_clue_tiny": {
- "tokenizer": "roberta-chinese-clue",
- "organization": "CLUE",
- "vocab_size": 8021,
- "num(digit)": 230,
- "len(digit)": "1,4,10",
- "num(space)": 0,
- "len(space)": "-",
- "num(ar)": 30,
- "len(ar)": "1,2,3",
- "num(zh)": 5689,
- "len(zh)": "1,1,1",
- "num(ja)": 5691,
- "len(ja)": "1,1,3",
- "num(ja-kana)": 0,
- "len(ja-kana)": "-",
- "num(ko)": 0,
- "len(ko)": "-"
- },
- "dbmdz/bert-base-german-uncased": {
- "tokenizer": "bert-base-german-uncased",
- "organization": "dbmdz",
- "vocab_size": 31102,
- "num(digit)": 1733,
- "len(digit)": "1,4,12",
- "num(space)": 0,
- "len(space)": "-",
- "num(ar)": 0,
- "len(ar)": "-",
- "num(zh)": 0,
- "len(zh)": "-",
- "num(ja)": 0,
- "len(ja)": "-",
- "num(ja-kana)": 0,
- "len(ja-kana)": "-",
- "num(ko)": 0,
- "len(ko)": "-"
- },
- "google-bert/bert-base-cased": {
- "tokenizer": "bert-base-cased",
- "organization": "Google",
- "vocab_size": 28996,
- "num(digit)": 926,
- "len(digit)": "1,4,11",
- "num(space)": 0,
- "len(space)": "-",
- "num(ar)": 94,
- "len(ar)": "1,3,4",
- "num(zh)": 226,
- "len(zh)": "1,2,3",
- "num(ja)": 390,
- "len(ja)": "1,2,3",
- "num(ja-kana)": 164,
- "len(ja-kana)": "1,2,3",
- "num(ko)": 10,
- "len(ko)": "1,2,3"
- },
- "google-bert/bert-base-chinese": {
- "tokenizer": "bert-base-chinese",
- "organization": "Google",
- "vocab_size": 21128,
- "num(digit)": 1451,
- "len(digit)": "1,3,12",
- "num(space)": 2,
- "len(space)": "1,2,3",
- "num(ar)": 30,
- "len(ar)": "1,2,3",
- "num(zh)": 14642,
- "len(zh)": "1,2,3",
- "num(ja)": 15197,
- "len(ja)": "1,3,15",
- "num(ja-kana)": 553,
- "len(ja-kana)": "1,3,15",
- "num(ko)": 0,
- "len(ko)": "-"
- },
- "google-bert/bert-base-german-cased": {
- "tokenizer": "bert-base-german-cased",
- "organization": "Google",
- "vocab_size": 30000,
- "num(digit)": 4065,
- "len(digit)": "1,11,22",
- "num(space)": 0,
- "len(space)": "-",
- "num(ar)": 0,
- "len(ar)": "-",
- "num(zh)": 0,
- "len(zh)": "-",
- "num(ja)": 0,
- "len(ja)": "-",
- "num(ja-kana)": 0,
- "len(ja-kana)": "-",
- "num(ko)": 0,
- "len(ko)": "-"
- },
- "google-bert/bert-base-multilingual-cased": {
- "tokenizer": "bert-base-multilingual-cased",
- "organization": "Google",
- "vocab_size": 119547,
- "num(digit)": 2583,
- "len(digit)": "1,3,13",
- "num(space)": 0,
- "len(space)": "-",
- "num(ar)": 4873,
- "len(ar)": "1,5,14",
- "num(zh)": 13542,
- "len(zh)": "1,2,3",
- "num(ja)": 14880,
- "len(ja)": "1,3,10",
- "num(ja-kana)": 1336,
- "len(ja-kana)": "1,4,10",
- "num(ko)": 3271,
- "len(ko)": "1,3,6"
- },
- "google-bert/bert-base-multilingual-uncased": {
- "tokenizer": "bert-base-multilingual-uncased",
- "organization": "Google",
- "vocab_size": 105879,
- "num(digit)": 2510,
- "len(digit)": "1,3,13",
- "num(space)": 2,
- "len(space)": "1,2,3",
- "num(ar)": 4530,
- "len(ar)": "1,5,13",
- "num(zh)": 16658,
- "len(zh)": "1,2,3",
- "num(ja)": 17858,
- "len(ja)": "1,3,10",
- "num(ja-kana)": 1188,
- "len(ja-kana)": "1,4,10",
- "num(ko)": 0,
- "len(ko)": "-"
- },
- "google-bert/bert-base-uncased": {
- "tokenizer": "bert-base-uncased",
- "organization": "Google",
- "vocab_size": 30522,
- "num(digit)": 2056,
- "len(digit)": "1,4,11",
- "num(space)": 0,
- "len(space)": "-",
- "num(ar)": 88,
- "len(ar)": "1,3,5",
- "num(zh)": 488,
- "len(zh)": "1,2,3",
- "num(ja)": 676,
- "len(ja)": "1,2,3",
- "num(ja-kana)": 188,
- "len(ja-kana)": "1,2,3",
- "num(ko)": 0,
- "len(ko)": "-"
- },
- "google/mobilebert-uncased": {
- "tokenizer": "mobilebert-uncased",
- "organization": "Google",
- "vocab_size": 30522,
- "num(digit)": 2056,
- "len(digit)": "1,4,11",
- "num(space)": 0,
- "len(space)": "-",
- "num(ar)": 88,
- "len(ar)": "1,3,5",
- "num(zh)": 488,
- "len(zh)": "1,2,3",
- "num(ja)": 676,
- "len(ja)": "1,2,3",
- "num(ja-kana)": 188,
- "len(ja-kana)": "1,2,3",
- "num(ko)": 0,
- "len(ko)": "-"
- },
- "tohoku-nlp/bert-base-japanese": {
- "tokenizer": "bert-base-japanese",
- "organization": "Tohoku",
- "vocab_size": 32000,
- "num(digit)": 669,
- "len(digit)": "1,3,5",
- "num(space)": 0,
- "len(space)": "-",
- "num(ar)": 10,
- "len(ar)": "1,3,3",
- "num(zh)": 18792,
- "len(zh)": "1,2,11",
- "num(ja)": 28367,
- "len(ja)": "1,2,13",
- "num(ja-kana)": 12359,
- "len(ja-kana)": "1,4,13",
- "num(ko)": 0,
- "len(ko)": "-"
- },
- "gpt-4": {
- "tokenizer": "gpt-4",
- "organization": "OpenAI",
- "vocab_size": 100277,
- "num(digit)": 1110,
- "len(digit)": "1,3,3",
- "num(space)": 47472,
- "len(space)": "1,7,128",
- "num(ar)": 113,
- "len(ar)": "1,2,10",
- "num(zh)": 868,
- "len(zh)": "1,1,7",
- "num(ja)": 1035,
- "len(ja)": "1,1,7",
- "num(ja-kana)": 169,
- "len(ja-kana)": "1,1,7",
- "num(ko)": 299,
- "len(ko)": "1,2,4"
- },
- "llama3": {
- "tokenizer": "llama3",
- "organization": "Meta",
- "vocab_size": 128256,
- "num(digit)": 1110,
- "len(digit)": "1,3,3",
- "num(space)": 60860,
- "len(space)": "1,6,128",
- "num(ar)": 3810,
- "len(ar)": "1,4,11",
- "num(zh)": 4424,
- "len(zh)": "1,1,7",
- "num(ja)": 5387,
- "len(ja)": "1,2,8",
- "num(ja-kana)": 1086,
- "len(ja-kana)": "1,2,8",
- "num(ko)": 2281,
- "len(ko)": "1,2,6"
- },
- "google-t5/t5-large": {
- "tokenizer": "t5",
- "organization": "Google",
- "vocab_size": 32100,
- "num(digit)": 1133,
- "len(digit)": "1,3,13",
- "num(space)": 0,
- "len(space)": "-",
- "num(ar)": 0,
- "len(ar)": "-",
- "num(zh)": 0,
- "len(zh)": "-",
- "num(ja)": 0,
- "len(ja)": "-",
- "num(ja-kana)": 0,
- "len(ja-kana)": "-",
- "num(ko)": 0,
- "len(ko)": "-"
- },
- "google/byt5-small": {
- "tokenizer": "byt5-small",
- "organization": "Google",
- "vocab_size": 384,
- "num(digit)": 10,
- "len(digit)": "1,1,1",
- "num(space)": 10,
- "len(space)": "1,1,1",
- "num(ar)": 0,
- "len(ar)": "-",
- "num(zh)": 0,
- "len(zh)": "-",
- "num(ja)": 0,
- "len(ja)": "-",
- "num(ja-kana)": 0,
- "len(ja-kana)": "-",
- "num(ko)": 0,
- "len(ko)": "-"
- },
- "google/mt5-large": {
- "tokenizer": "mt5-large",
- "organization": "Google",
- "vocab_size": 250100,
- "num(digit)": 16829,
- "len(digit)": "1,4,16",
- "num(space)": 1,
- "len(space)": "1,1,1",
- "num(ar)": 7459,
- "len(ar)": "1,3,16",
- "num(zh)": 21489,
- "len(zh)": "1,2,16",
- "num(ja)": 27078,
- "len(ja)": "1,2,16",
- "num(ja-kana)": 9160,
- "len(ja-kana)": "1,3,14",
- "num(ko)": 4041,
- "len(ko)": "1,1,10"
- },
- "lmsys/fastchat-t5-3b-v1.0": {
- "tokenizer": "fastchat-t5-3b-v1.0",
- "organization": "LMSYS",
- "vocab_size": 32110,
- "num(digit)": 1033,
- "len(digit)": "1,3,8",
- "num(space)": 0,
- "len(space)": "-",
- "num(ar)": 0,
- "len(ar)": "-",
- "num(zh)": 0,
- "len(zh)": "-",
- "num(ja)": 0,
- "len(ja)": "-",
- "num(ja-kana)": 0,
- "len(ja-kana)": "-",
- "num(ko)": 0,
- "len(ko)": "-"
- },
- "paust/pko-t5-large": {
- "tokenizer": "pko-t5-large",
- "organization": "PAUST",
- "vocab_size": 50358,
- "num(digit)": 51,
- "len(digit)": "1,2,3",
- "num(space)": 10,
- "len(space)": "1,1,1",
- "num(ar)": 0,
- "len(ar)": "-",
- "num(zh)": 0,
- "len(zh)": "-",
- "num(ja)": 0,
- "len(ja)": "-",
- "num(ja-kana)": 0,
- "len(ja-kana)": "-",
- "num(ko)": 49050,
- "len(ko)": "1,2,16"
- },
- "bloom": {
- "tokenizer": "bloom",
- "organization": "BigScience",
- "vocab_size": 250680,
- "num(digit)": 6629,
- "len(digit)": "1,4,50",
- "num(space)": 140180,
- "len(space)": "1,6,600",
- "num(ar)": 20854,
- "len(ar)": "1,5,16",
- "num(zh)": 30603,
- "len(zh)": "1,2,23",
- "num(ja)": 30816,
- "len(ja)": "1,2,23",
- "num(ja-kana)": 214,
- "len(ja-kana)": "1,1,3",
- "num(ko)": 338,
- "len(ko)": "1,1,3"
- },
- "llama": {
- "tokenizer": "llama",
- "organization": "Meta",
- "vocab_size": 32000,
- "num(digit)": 20,
- "len(digit)": "1,1,1",
- "num(space)": 61,
- "len(space)": "1,2,15",
- "num(ar)": 55,
- "len(ar)": "1,1,2",
- "num(zh)": 700,
- "len(zh)": "1,1,1",
- "num(ja)": 837,
- "len(ja)": "1,1,1",
- "num(ja-kana)": 137,
- "len(ja-kana)": "1,1,1",
- "num(ko)": 111,
- "len(ko)": "1,1,1"
- },
- "ClueAI/ChatYuan-large-v2": {
- "tokenizer": "ChatYuan-large-v2",
- "organization": "CLUE",
- "vocab_size": 32128,
- "num(digit)": 740,
- "len(digit)": "1,3,9",
- "num(space)": 0,
- "len(space)": "-",
- "num(ar)": 2,
- "len(ar)": "1,1,1",
- "num(zh)": 29591,
- "len(zh)": "1,2,16",
- "num(ja)": 29736,
- "len(ja)": "1,2,16",
- "num(ja-kana)": 145,
- "len(ja-kana)": "1,1,2",
- "num(ko)": 0,
- "len(ko)": "-"
- },
- "Meta/llama3": {
- "tokenizer": "llama3",
- "organization": "Meta",
- "vocab_size": 128256,
- "num(digit)": 1110,
- "len(digit)": "1,3,3",
- "num(space)": 60860,
- "len(space)": "1,6,128",
- "num(ar)": 3810,
- "len(ar)": "1,4,11",
- "num(zh)": 4424,
- "len(zh)": "1,1,7",
- "num(ja)": 5387,
- "len(ja)": "1,2,8",
- "num(ja-kana)": 1086,
- "len(ja-kana)": "1,2,8",
- "num(ko)": 2281,
- "len(ko)": "1,2,6"
- },
- "openai/gpt-4": {
- "tokenizer": "gpt-4",
- "organization": "OpenAI",
- "vocab_size": 100277,
- "num(digit)": 1110,
- "len(digit)": "1,3,3",
- "num(space)": 47472,
- "len(space)": "1,7,128",
- "num(ar)": 113,
- "len(ar)": "1,2,10",
- "num(zh)": 868,
- "len(zh)": "1,1,7",
- "num(ja)": 1035,
- "len(ja)": "1,1,7",
- "num(ja-kana)": 169,
- "len(ja-kana)": "1,1,7",
- "num(ko)": 299,
- "len(ko)": "1,2,4"
- },
- "gradientai/Llama-3-8B-Instruct-Gradient-1048k": {
- "tokenizer": "llama3",
- "organization": "Meta",
- "vocab_size": 128256,
- "num(digit)": 1110,
- "len(digit)": "1,3,3",
- "num(space)": 60860,
- "len(space)": "1,6,128",
- "num(ar)": 3810,
- "len(ar)": "1,4,11",
- "num(zh)": 4424,
- "len(zh)": "1,1,7",
- "num(ja)": 5387,
- "len(ja)": "1,2,8",
- "num(ja-kana)": 1086,
- "len(ja-kana)": "1,2,8",
- "num(ko)": 2281,
- "len(ko)": "1,2,6"
- },
- "bigscience/bloom": {
- "tokenizer": "bloom",
- "organization": "BigScience",
- "vocab_size": 250680,
- "num(digit)": 6629,
- "len(digit)": "1,4,50",
- "num(space)": 140180,
- "len(space)": "1,6,600",
- "num(ar)": 20854,
- "len(ar)": "1,5,16",
- "num(zh)": 30603,
- "len(zh)": "1,2,23",
- "num(ja)": 30816,
- "len(ja)": "1,2,23",
- "num(ja-kana)": 214,
- "len(ja-kana)": "1,1,3",
- "num(ko)": 338,
- "len(ko)": "1,1,3"
- },
- "huggyllama/llama-7b": {
- "tokenizer": "llama",
- "organization": "Meta",
- "vocab_size": 32000,
- "num(digit)": 20,
- "len(digit)": "1,1,1",
- "num(space)": 61,
- "len(space)": "1,2,15",
- "num(ar)": 55,
- "len(ar)": "1,1,2",
- "num(zh)": 700,
- "len(zh)": "1,1,1",
- "num(ja)": 837,
- "len(ja)": "1,1,1",
- "num(ja-kana)": 137,
- "len(ja-kana)": "1,1,1",
- "num(ko)": 111,
- "len(ko)": "1,1,1"
- },
- "baichuan-inc/Baichuan-7B": {
- "tokenizer": "baichuan",
- "organization": "Baichuan",
- "vocab_size": 64000,
- "num(digit)": 335,
- "len(digit)": "1,14,14",
- "num(space)": 13,
- "len(space)": "1,1,1",
- "num(ar)": 299,
- "len(ar)": "1,1,2",
- "num(zh)": 27676,
- "len(zh)": "1,1,9",
- "num(ja)": 28522,
- "len(ja)": "1,1,9",
- "num(ja-kana)": 178,
- "len(ja-kana)": "1,1,1",
- "num(ko)": 1591,
- "len(ko)": "1,1,1"
- },
- "01-ai/Yi-34B": {
- "tokenizer": "Yi-34B",
- "organization": "Yi",
- "vocab_size": 64000,
- "num(digit)": 200,
- "len(digit)": "1,13,15",
- "num(space)": 24274,
- "len(space)": "1,7,16",
- "num(ar)": 18,
- "len(ar)": "1,1,4",
- "num(zh)": 21356,
- "len(zh)": "1,2,12",
- "num(ja)": 21407,
- "len(ja)": "1,2,12",
- "num(ja-kana)": 51,
- "len(ja-kana)": "1,1,2",
- "num(ko)": 28,
- "len(ko)": "1,1,2"
- },
- "01-ai/Yi-6B": {
- "tokenizer": "Yi-6B",
- "organization": "Yi",
- "vocab_size": 64000,
- "num(digit)": 200,
- "len(digit)": "1,13,15",
- "num(space)": 24274,
- "len(space)": "1,7,16",
- "num(ar)": 18,
- "len(ar)": "1,1,4",
- "num(zh)": 21356,
- "len(zh)": "1,2,12",
- "num(ja)": 21407,
- "len(ja)": "1,2,12",
- "num(ja-kana)": 51,
- "len(ja-kana)": "1,1,2",
- "num(ko)": 28,
- "len(ko)": "1,1,2"
- },
- "01-ai/Yi-VL-34B": {
- "tokenizer": "Yi-VL-34B",
- "organization": "Yi",
- "vocab_size": 64000,
- "num(digit)": 200,
- "len(digit)": "1,13,15",
- "num(space)": 43,
- "len(space)": "1,2,15",
- "num(ar)": 18,
- "len(ar)": "1,1,4",
- "num(zh)": 21356,
- "len(zh)": "1,2,12",
- "num(ja)": 21407,
- "len(ja)": "1,2,12",
- "num(ja-kana)": 51,
- "len(ja-kana)": "1,1,2",
- "num(ko)": 28,
- "len(ko)": "1,1,2"
- },
- "ClassCat/gpt2-base-french": {
- "tokenizer": "gpt2-base-french",
- "organization": "ClassCat",
- "vocab_size": 50000,
- "num(digit)": 1833,
- "len(digit)": "1,4,5",
- "num(space)": 31889,
- "len(space)": "1,7,32",
- "num(ar)": 41,
- "len(ar)": "1,1,4",
- "num(zh)": 27,
- "len(zh)": "1,1,1",
- "num(ja)": 46,
- "len(ja)": "1,1,2",
- "num(ja-kana)": 19,
- "len(ja-kana)": "1,1,2",
- "num(ko)": 0,
- "len(ko)": "-"
- },
- "ClassCat/gpt2-base-spanish": {
- "tokenizer": "gpt2-base-spanish",
- "organization": "ClassCat",
- "vocab_size": 50000,
- "num(digit)": 1492,
- "len(digit)": "1,4,9",
- "num(space)": 34496,
- "len(space)": "1,8,32",
- "num(ar)": 36,
- "len(ar)": "1,1,4",
- "num(zh)": 13,
- "len(zh)": "1,1,1",
- "num(ja)": 36,
- "len(ja)": "1,1,2",
- "num(ja-kana)": 23,
- "len(ja-kana)": "1,1,2",
- "num(ko)": 0,
- "len(ko)": "-"
- },
- "ClueAI/PromptCLUE-base": {
- "tokenizer": "PromptCLUE-base",
- "organization": "CLUE",
- "vocab_size": 32128,
- "num(digit)": 740,
- "len(digit)": "1,3,9",
- "num(space)": 0,
- "len(space)": "-",
- "num(ar)": 2,
- "len(ar)": "1,1,1",
- "num(zh)": 29591,
- "len(zh)": "1,2,16",
- "num(ja)": 29736,
- "len(ja)": "1,2,16",
- "num(ja-kana)": 145,
- "len(ja-kana)": "1,1,2",
- "num(ko)": 0,
- "len(ko)": "-"
- },
- "CohereForAI/aya-101": {
- "tokenizer": "aya-101",
- "organization": "Cohere For AI",
- "vocab_size": 250100,
- "num(digit)": 16829,
- "len(digit)": "1,4,16",
- "num(space)": 1,
- "len(space)": "1,1,1",
- "num(ar)": 7459,
- "len(ar)": "1,3,16",
- "num(zh)": 21489,
- "len(zh)": "1,2,16",
- "num(ja)": 27078,
- "len(ja)": "1,2,16",
- "num(ja-kana)": 9160,
- "len(ja-kana)": "1,3,14",
- "num(ko)": 4041,
- "len(ko)": "1,1,10"
- },
- "EleutherAI/gpt-neox-20b": {
- "tokenizer": "gpt-neox-20b",
- "organization": "EleutherAI",
- "vocab_size": 50277,
- "num(digit)": 2036,
- "len(digit)": "1,3,35",
- "num(space)": 28996,
- "len(space)": "1,7,512",
- "num(ar)": 94,
- "len(ar)": "1,2,4",
- "num(zh)": 313,
- "len(zh)": "1,1,2",
- "num(ja)": 480,
- "len(ja)": "1,1,4",
- "num(ja-kana)": 167,
- "len(ja-kana)": "1,1,4",
- "num(ko)": 25,
- "len(ko)": "1,1,2"
- },
- "HuggingFaceH4/starchat-alpha": {
- "tokenizer": "starchat-alpha",
- "organization": "-",
- "vocab_size": 49156,
- "num(digit)": 10,
- "len(digit)": "1,1,1",
- "num(space)": 16515,
- "len(space)": "1,6,256",
- "num(ar)": 84,
- "len(ar)": "1,2,4",
- "num(zh)": 2030,
- "len(zh)": "1,1,7",
- "num(ja)": 2368,
- "len(ja)": "1,1,8",
- "num(ja-kana)": 360,
- "len(ja-kana)": "1,2,8",
- "num(ko)": 491,
- "len(ko)": "1,2,5"
- },
- "HuggingFaceH4/zephyr-7b-beta": {
- "tokenizer": "zephyr-7b-beta",
- "organization": "HuggingFace",
- "vocab_size": 32000,
- "num(digit)": 20,
- "len(digit)": "1,1,1",
- "num(space)": 85,
- "len(space)": "1,3,15",
- "num(ar)": 71,
- "len(ar)": "1,1,2",
- "num(zh)": 1459,
- "len(zh)": "1,1,2",
- "num(ja)": 1593,
- "len(ja)": "1,1,2",
- "num(ja-kana)": 134,
- "len(ja-kana)": "1,1,1",
- "num(ko)": 346,
- "len(ko)": "1,1,1"
- },
- "LLM360/CrystalCoder": {
- "tokenizer": "CrystalCoder",
- "organization": "MBZUAI",
- "vocab_size": 32022,
- "num(digit)": 20,
- "len(digit)": "1,1,1",
- "num(space)": 61,
- "len(space)": "1,2,15",
- "num(ar)": 55,
- "len(ar)": "1,1,2",
- "num(zh)": 700,
- "len(zh)": "1,1,1",
- "num(ja)": 837,
- "len(ja)": "1,1,1",
- "num(ja-kana)": 137,
- "len(ja-kana)": "1,1,1",
- "num(ko)": 111,
- "len(ko)": "1,1,1"
- },
- "NousResearch/Llama-2-7b-chat-hf": {
- "tokenizer": "llama2",
- "organization": "Meta",
- "vocab_size": 32001,
- "num(digit)": 20,
- "len(digit)": "1,1,1",
- "num(space)": 61,
- "len(space)": "1,2,15",
- "num(ar)": 55,
- "len(ar)": "1,1,2",
- "num(zh)": 700,
- "len(zh)": "1,1,1",
- "num(ja)": 837,
- "len(ja)": "1,1,1",
- "num(ja-kana)": 137,
- "len(ja-kana)": "1,1,1",
- "num(ko)": 111,
- "len(ko)": "1,1,1"
- },
- "OrionStarAI/Orion-14B-Chat": {
- "tokenizer": "Orion-14B-Chat",
- "organization": "OrionStar",
- "vocab_size": 84608,
- "num(digit)": 1559,
- "len(digit)": "1,4,14",
- "num(space)": 18383,
- "len(space)": "1,6,16",
- "num(ar)": 102,
- "len(ar)": "1,1,1",
- "num(zh)": 46998,
- "len(zh)": "1,2,16",
- "num(ja)": 49644,
- "len(ja)": "1,2,16",
- "num(ja-kana)": 2987,
- "len(ja-kana)": "1,3,11",
- "num(ko)": 5110,
- "len(ko)": "1,2,7"
- },
- "Qwen/Qwen-7B-Chat": {
- "tokenizer": "Qwen",
- "organization": "Alibaba",
- "vocab_size": 151851,
- "num(digit)": 10,
- "len(digit)": "1,1,1",
- "num(space)": 55883,
- "len(space)": "1,6,128",
- "num(ar)": 4018,
- "len(ar)": "1,3,12",
- "num(zh)": 25557,
- "len(zh)": "1,2,7",
- "num(ja)": 27206,
- "len(ja)": "1,2,11",
- "num(ja-kana)": 2089,
- "len(ja-kana)": "1,3,11",
- "num(ko)": 3495,
- "len(ko)": "1,1,5"
- },
- "Qwen/Qwen1.5-14B-Chat": {
- "tokenizer": "Qwen1.5",
- "organization": "Alibaba",
- "vocab_size": 151646,
- "num(digit)": 10,
- "len(digit)": "1,1,1",
- "num(space)": 55883,
- "len(space)": "1,6,128",
- "num(ar)": 4018,
- "len(ar)": "1,3,12",
- "num(zh)": 25557,
- "len(zh)": "1,2,7",
- "num(ja)": 27206,
- "len(ja)": "1,2,11",
- "num(ja-kana)": 2089,
- "len(ja-kana)": "1,3,11",
- "num(ko)": 3495,
- "len(ko)": "1,1,5"
- },
- "Skywork/Skywork-13B-Math": {
- "tokenizer": "Skywork-13B-Math",
- "organization": "Kunlun",
- "vocab_size": 65519,
- "num(digit)": 20,
- "len(digit)": "1,1,1",
- "num(space)": 62,
- "len(space)": "1,2,15",
- "num(ar)": 56,
- "len(ar)": "1,1,2",
- "num(zh)": 33913,
- "len(zh)": "1,2,5",
- "num(ja)": 34064,
- "len(ja)": "1,2,5",
- "num(ja-kana)": 150,
- "len(ja-kana)": "1,1,1",
- "num(ko)": 111,
- "len(ko)": "1,1,1"
- },
- "Skywork/Skywork-13B-base": {
- "tokenizer": "Skywork-13B-base",
- "organization": "Kunlun",
- "vocab_size": 65519,
- "num(digit)": 20,
- "len(digit)": "1,1,1",
- "num(space)": 62,
- "len(space)": "1,2,15",
- "num(ar)": 56,
- "len(ar)": "1,1,2",
- "num(zh)": 33913,
- "len(zh)": "1,2,5",
- "num(ja)": 34064,
- "len(ja)": "1,2,5",
- "num(ja-kana)": 150,
- "len(ja-kana)": "1,1,1",
- "num(ko)": 111,
- "len(ko)": "1,1,1"
- },
- "THUDM/chatglm-6b": {
- "tokenizer": "chatglm-6b",
- "organization": "Tsinghua",
- "vocab_size": 130344,
- "num(digit)": 20,
- "len(digit)": "1,1,1",
- "num(space)": 93,
- "len(space)": "1,34,80",
- "num(ar)": 137,
- "len(ar)": "1,2,4",
- "num(zh)": 61358,
- "len(zh)": "1,2,16",
- "num(ja)": 61784,
- "len(ja)": "1,2,16",
- "num(ja-kana)": 439,
- "len(ja-kana)": "1,2,5",
- "num(ko)": 114,
- "len(ko)": "1,1,3"
- },
- "THUDM/chatglm2-6b": {
- "tokenizer": "chatglm2-6b",
- "organization": "Tsinghua",
- "vocab_size": 64787,
- "num(digit)": 20,
- "len(digit)": "1,1,1",
- "num(space)": 67,
- "len(space)": "1,2,15",
- "num(ar)": 57,
- "len(ar)": "1,1,2",
- "num(zh)": 30922,
- "len(zh)": "1,2,16",
- "num(ja)": 31065,
- "len(ja)": "1,2,16",
- "num(ja-kana)": 143,
- "len(ja-kana)": "1,1,1",
- "num(ko)": 604,
- "len(ko)": "1,1,1"
- },
- "THUDM/chatglm3-6b": {
- "tokenizer": "chatglm3-6b",
- "organization": "Tsinghua",
- "vocab_size": 64796,
- "num(digit)": 20,
- "len(digit)": "1,1,1",
- "num(space)": 67,
- "len(space)": "1,2,15",
- "num(ar)": 57,
- "len(ar)": "1,1,2",
- "num(zh)": 30922,
- "len(zh)": "1,2,16",
- "num(ja)": 31065,
- "len(ja)": "1,2,16",
- "num(ja-kana)": 143,
- "len(ja-kana)": "1,1,1",
- "num(ko)": 604,
- "len(ko)": "1,1,1"
- },
- "TigerResearch/tigerbot-13b-chat-v2": {
- "tokenizer": "tigerbot-13b-chat-v2",
- "organization": "Tigerobo",
- "vocab_size": 60515,
- "num(digit)": 20,
- "len(digit)": "1,1,1",
- "num(space)": 61,
- "len(space)": "1,2,15",
- "num(ar)": 55,
- "len(ar)": "1,1,2",
- "num(zh)": 28603,
- "len(zh)": "1,2,16",
- "num(ja)": 28770,
- "len(ja)": "1,2,16",
- "num(ja-kana)": 167,
- "len(ja-kana)": "1,1,2",
- "num(ko)": 261,
- "len(ko)": "1,1,1"
- },
- "TigerResearch/tigerbot-70b-chat-v4-4k": {
- "tokenizer": "tigerbot-70b-chat-v4-4k",
- "organization": "Tigerobo",
- "vocab_size": 65110,
- "num(digit)": 20,
- "len(digit)": "1,1,1",
- "num(space)": 61,
- "len(space)": "1,2,15",
- "num(ar)": 55,
- "len(ar)": "1,1,2",
- "num(zh)": 30509,
- "len(zh)": "1,2,16",
- "num(ja)": 32061,
- "len(ja)": "1,2,16",
- "num(ja-kana)": 2071,
- "len(ja-kana)": "1,2,8",
- "num(ko)": 1504,
- "len(ko)": "1,1,5"
- },
- "Upstage/SOLAR-10.7B-v1.0": {
- "tokenizer": "SOLAR-10.7B-v1.0",
- "organization": "-",
- "vocab_size": 32000,
- "num(digit)": 20,
- "len(digit)": "1,1,1",
- "num(space)": 85,
- "len(space)": "1,3,15",
- "num(ar)": 71,
- "len(ar)": "1,1,2",
- "num(zh)": 1459,
- "len(zh)": "1,1,2",
- "num(ja)": 1593,
- "len(ja)": "1,1,2",
- "num(ja-kana)": 134,
- "len(ja-kana)": "1,1,1",
- "num(ko)": 346,
- "len(ko)": "1,1,1"
- },
- "WizardLM/WizardCoder-15B-V1.0": {
- "tokenizer": "WizardCoder-15B-V1.0",
- "organization": "Microsoft",
- "vocab_size": 49153,
- "num(digit)": 10,
- "len(digit)": "1,1,1",
- "num(space)": 16515,
- "len(space)": "1,6,256",
- "num(ar)": 84,
- "len(ar)": "1,2,4",
- "num(zh)": 2030,
- "len(zh)": "1,1,7",
- "num(ja)": 2368,
- "len(ja)": "1,1,8",
- "num(ja-kana)": 360,
- "len(ja-kana)": "1,2,8",
- "num(ko)": 491,
- "len(ko)": "1,2,5"
- },
- "WizardLM/WizardCoder-Python-7B-V1.0": {
- "tokenizer": "WizardCoder-Python-7B-V1.0",
- "organization": "Microsoft",
- "vocab_size": 32001,
- "num(digit)": 20,
- "len(digit)": "1,1,1",
- "num(space)": 61,
- "len(space)": "1,2,15",
- "num(ar)": 55,
- "len(ar)": "1,1,2",
- "num(zh)": 700,
- "len(zh)": "1,1,1",
- "num(ja)": 837,
- "len(ja)": "1,1,1",
- "num(ja-kana)": 137,
- "len(ja-kana)": "1,1,1",
- "num(ko)": 111,
- "len(ko)": "1,1,1"
- },
- "WizardLM/WizardLM-7B-V1.0": {
- "tokenizer": "WizardLM-7B-V1.0",
- "organization": "Microsoft",
- "vocab_size": 32001,
- "num(digit)": 20,
- "len(digit)": "1,1,1",
- "num(space)": 61,
- "len(space)": "1,2,15",
- "num(ar)": 55,
- "len(ar)": "1,1,2",
- "num(zh)": 700,
- "len(zh)": "1,1,1",
- "num(ja)": 837,
- "len(ja)": "1,1,1",
- "num(ja-kana)": 137,
- "len(ja-kana)": "1,1,1",
- "num(ko)": 111,
- "len(ko)": "1,1,1"
- },
- "WizardLM/WizardMath-70B-V1.0": {
- "tokenizer": "WizardMath-70B-V1.0",
- "organization": "Microsoft",
- "vocab_size": 32002,
- "num(digit)": 20,
- "len(digit)": "1,1,1",
- "num(space)": 61,
- "len(space)": "1,2,15",
- "num(ar)": 55,
- "len(ar)": "1,1,2",
- "num(zh)": 700,
- "len(zh)": "1,1,1",
- "num(ja)": 837,
- "len(ja)": "1,1,1",
- "num(ja-kana)": 137,
- "len(ja-kana)": "1,1,1",
- "num(ko)": 111,
- "len(ko)": "1,1,1"
- },
- "abeja/gpt-neox-japanese-2.7b": {
- "tokenizer": "gpt-neox-japanese-2.7b",
- "organization": "ABEJA",
- "vocab_size": 32000,
- "num(digit)": 20,
- "len(digit)": "1,1,1",
- "num(space)": 0,
- "len(space)": "-",
- "num(ar)": 0,
- "len(ar)": "-",
- "num(zh)": 15176,
- "len(zh)": "1,2,2",
- "num(ja)": 31482,
- "len(ja)": "1,2,3",
- "num(ja-kana)": 16306,
- "len(ja-kana)": "1,3,3",
- "num(ko)": 0,
- "len(ko)": "-"
- },
- "ai21labs/Jamba-v0.1": {
- "tokenizer": "Jamba-v0.1",
- "organization": "AI21",
- "vocab_size": 65536,
- "num(digit)": 1556,
- "len(digit)": "1,16,17",
- "num(space)": 39501,
- "len(space)": "1,7,32",
- "num(ar)": 867,
- "len(ar)": "1,3,8",
- "num(zh)": 1157,
- "len(zh)": "1,1,2",
- "num(ja)": 1287,
- "len(ja)": "1,1,2",
- "num(ja-kana)": 130,
- "len(ja-kana)": "1,1,2",
- "num(ko)": 312,
- "len(ko)": "1,1,2"
- },
- "allenai/OLMo-7B": {
- "tokenizer": "OLMo-7B",
- "organization": "Allen AI",
- "vocab_size": 50280,
- "num(digit)": 2036,
- "len(digit)": "1,3,35",
- "num(space)": 29019,
- "len(space)": "1,7,512",
- "num(ar)": 94,
- "len(ar)": "1,2,4",
- "num(zh)": 313,
- "len(zh)": "1,1,2",
- "num(ja)": 480,
- "len(ja)": "1,1,4",
- "num(ja-kana)": 167,
- "len(ja-kana)": "1,1,4",
- "num(ko)": 25,
- "len(ko)": "1,1,2"
- },
- "baichuan-inc/Baichuan2-7B-Chat": {
- "tokenizer": "baichuan2",
- "organization": "Baichuan",
- "vocab_size": 125696,
- "num(digit)": 1023,
- "len(digit)": "1,14,14",
- "num(space)": 26013,
- "len(space)": "1,7,32",
- "num(ar)": 335,
- "len(ar)": "1,1,27",
- "num(zh)": 70398,
- "len(zh)": "1,2,32",
- "num(ja)": 71269,
- "len(ja)": "1,2,32",
- "num(ja-kana)": 206,
- "len(ja-kana)": "1,1,9",
- "num(ko)": 1595,
- "len(ko)": "1,1,2"
- },
- "ckiplab/gpt2-base-chinese": {
- "tokenizer": "gpt2-base-chinese",
- "organization": "SINICA",
- "vocab_size": 21128,
- "num(digit)": 1451,
- "len(digit)": "1,3,12",
- "num(space)": 2,
- "len(space)": "1,2,3",
- "num(ar)": 30,
- "len(ar)": "1,2,3",
- "num(zh)": 14642,
- "len(zh)": "1,2,3",
- "num(ja)": 15197,
- "len(ja)": "1,3,15",
- "num(ja-kana)": 553,
- "len(ja-kana)": "1,3,15",
- "num(ko)": 0,
- "len(ko)": "-"
- },
- "cyberagent/open-calm-7b": {
- "tokenizer": "open-calm-7b",
- "organization": "CyberAgent",
- "vocab_size": 52000,
- "num(digit)": 690,
- "len(digit)": "1,3,5",
- "num(space)": 1698,
- "len(space)": "1,4,33",
- "num(ar)": 10,
- "len(ar)": "1,1,4",
- "num(zh)": 30775,
- "len(zh)": "1,3,31",
- "num(ja)": 45790,
- "len(ja)": "1,3,31",
- "num(ja-kana)": 32535,
- "len(ja-kana)": "1,3,31",
- "num(ko)": 0,
- "len(ko)": "-"
- },
- "databricks/dbrx-instruct": {
- "tokenizer": "dbrx-instruct",
- "organization": "Databricks",
- "vocab_size": 100280,
- "num(digit)": 1126,
- "len(digit)": "1,3,17",
- "num(space)": 47400,
- "len(space)": "1,7,128",
- "num(ar)": 113,
- "len(ar)": "1,2,10",
- "num(zh)": 868,
- "len(zh)": "1,1,7",
- "num(ja)": 1035,
- "len(ja)": "1,1,7",
- "num(ja-kana)": 169,
- "len(ja-kana)": "1,1,7",
- "num(ko)": 299,
- "len(ko)": "1,2,4"
- },
- "deepseek-ai/DeepSeek-V2": {
- "tokenizer": "DeepSeek-V2",
- "organization": "DeepSeek",
- "vocab_size": 100002,
- "num(digit)": 10,
- "len(digit)": "1,1,1",
- "num(space)": 48073,
- "len(space)": "1,7,128",
- "num(ar)": 48,
- "len(ar)": "1,1,4",
- "num(zh)": 18052,
- "len(zh)": "1,2,16",
- "num(ja)": 18090,
- "len(ja)": "1,2,16",
- "num(ja-kana)": 38,
- "len(ja-kana)": "1,1,2",
- "num(ko)": 16,
- "len(ko)": "1,1,2"
- },
- "deepseek-ai/deepseek-coder-33b-instruct": {
- "tokenizer": "deepseek-coder-33b-instruct",
- "organization": "DeepSeek",
- "vocab_size": 32022,
- "num(digit)": 10,
- "len(digit)": "1,1,1",
- "num(space)": 15254,
- "len(space)": "1,6,65",
- "num(ar)": 12,
- "len(ar)": "1,1,2",
- "num(zh)": 4803,
- "len(zh)": "1,2,4",
- "num(ja)": 4804,
- "len(ja)": "1,2,4",
- "num(ja-kana)": 1,
- "len(ja-kana)": "1,1,1",
- "num(ko)": 0,
- "len(ko)": "-"
- },
- "deepseek-ai/deepseek-llm-7b-base": {
- "tokenizer": "deepseek-llm-7b-base",
- "organization": "DeepSeek",
- "vocab_size": 100015,
- "num(digit)": 10,
- "len(digit)": "1,1,1",
- "num(space)": 48073,
- "len(space)": "1,7,128",
- "num(ar)": 48,
- "len(ar)": "1,1,4",
- "num(zh)": 18052,
- "len(zh)": "1,2,16",
- "num(ja)": 18090,
- "len(ja)": "1,2,16",
- "num(ja-kana)": 38,
- "len(ja-kana)": "1,1,2",
- "num(ko)": 16,
- "len(ko)": "1,1,2"
- },
- "eson/kplug-base-encoder": {
- "tokenizer": "kplug",
- "organization": "JD",
- "vocab_size": 10261,
- "num(digit)": 420,
- "len(digit)": "1,3,12",
- "num(space)": 0,
- "len(space)": "-",
- "num(ar)": 0,
- "len(ar)": "-",
- "num(zh)": 5764,
- "len(zh)": "1,1,1",
- "num(ja)": 5766,
- "len(ja)": "1,1,3",
- "num(ja-kana)": 0,
- "len(ja-kana)": "-",
- "num(ko)": 0,
- "len(ko)": "-"
- },
- "fnlp/moss-moon-003-sft": {
- "tokenizer": "moss-moon-003-sft",
- "organization": "Fudan",
- "vocab_size": 106072,
- "num(digit)": 1848,
- "len(digit)": "1,3,16",
- "num(space)": 33566,
- "len(space)": "1,7,102",
- "num(ar)": 25,
- "len(ar)": "1,1,4",
- "num(zh)": 54230,
- "len(zh)": "1,2,15",
- "num(ja)": 54381,
- "len(ja)": "1,2,15",
- "num(ja-kana)": 152,
- "len(ja-kana)": "1,1,7",
- "num(ko)": 0,
- "len(ko)": "-"
- },
- "google/gemma-7b": {
- "tokenizer": "gemma-7b",
- "organization": "Google",
- "vocab_size": 256000,
- "num(digit)": 134,
- "len(digit)": "1,10,12",
- "num(space)": 125662,
- "len(space)": "1,7,31",
- "num(ar)": 6274,
- "len(ar)": "1,4,15",
- "num(zh)": 23767,
- "len(zh)": "1,2,12",
- "num(ja)": 28852,
- "len(ja)": "1,2,12",
- "num(ja-kana)": 7061,
- "len(ja-kana)": "1,3,12",
- "num(ko)": 2295,
- "len(ko)": "1,1,5"
- },
- "google/switch-c-2048": {
- "tokenizer": "switch-c-2048",
- "organization": "Google",
- "vocab_size": 32100,
- "num(digit)": 1133,
- "len(digit)": "1,3,13",
- "num(space)": 0,
- "len(space)": "-",
- "num(ar)": 0,
- "len(ar)": "-",
- "num(zh)": 0,
- "len(zh)": "-",
- "num(ja)": 0,
- "len(ja)": "-",
- "num(ja-kana)": 0,
- "len(ja-kana)": "-",
- "num(ko)": 0,
- "len(ko)": "-"
- },
- "hfl/chinese-alpaca-lora-7b": {
- "tokenizer": "chinese-alpaca-lora-7b",
- "organization": "-",
- "vocab_size": 49954,
- "num(digit)": 614,
- "len(digit)": "1,3,5",
- "num(space)": 61,
- "len(space)": "1,2,15",
- "num(ar)": 55,
- "len(ar)": "1,1,2",
- "num(zh)": 17839,
- "len(zh)": "1,2,13",
- "num(ja)": 17993,
- "len(ja)": "1,2,13",
- "num(ja-kana)": 154,
- "len(ja-kana)": "1,1,1",
- "num(ko)": 135,
- "len(ko)": "1,1,1"
- },
- "hfl/chinese-llama-2-7b": {
- "tokenizer": "chinese-llama-2-7b",
- "organization": "-",
- "vocab_size": 55296,
- "num(digit)": 20,
- "len(digit)": "1,1,1",
- "num(space)": 61,
- "len(space)": "1,2,15",
- "num(ar)": 55,
- "len(ar)": "1,1,2",
- "num(zh)": 23974,
- "len(zh)": "1,2,16",
- "num(ja)": 24111,
- "len(ja)": "1,2,16",
- "num(ja-kana)": 137,
- "len(ja-kana)": "1,1,1",
- "num(ko)": 111,
- "len(ko)": "1,1,1"
- },
- "hfl/chinese-llama-lora-7b": {
- "tokenizer": "chinese-llama-lora-7b",
- "organization": "-",
- "vocab_size": 49953,
- "num(digit)": 614,
- "len(digit)": "1,3,5",
- "num(space)": 61,
- "len(space)": "1,2,15",
- "num(ar)": 55,
- "len(ar)": "1,1,2",
- "num(zh)": 17839,
- "len(zh)": "1,2,13",
- "num(ja)": 17993,
- "len(ja)": "1,2,13",
- "num(ja-kana)": 154,
- "len(ja-kana)": "1,1,1",
- "num(ko)": 135,
- "len(ko)": "1,1,1"
- },
- "hfl/llama-3-chinese-8b": {
- "tokenizer": "llama-3-chinese-8b",
- "organization": "-",
- "vocab_size": 128256,
- "num(digit)": 1110,
- "len(digit)": "1,3,3",
- "num(space)": 60860,
- "len(space)": "1,6,128",
- "num(ar)": 3810,
- "len(ar)": "1,4,11",
- "num(zh)": 4424,
- "len(zh)": "1,1,7",
- "num(ja)": 5387,
- "len(ja)": "1,2,8",
- "num(ja-kana)": 1086,
- "len(ja-kana)": "1,2,8",
- "num(ko)": 2281,
- "len(ko)": "1,2,6"
- },
- "hpcai-tech/grok-1": {
- "tokenizer": "grok-1",
- "organization": "xAI",
- "vocab_size": 131072,
- "num(digit)": 40,
- "len(digit)": "1,6,13",
- "num(space)": 399,
- "len(space)": "1,3,16",
- "num(ar)": 69,
- "len(ar)": "1,2,4",
- "num(zh)": 1626,
- "len(zh)": "1,2,7",
- "num(ja)": 3118,
- "len(ja)": "1,2,8",
- "num(ja-kana)": 1908,
- "len(ja-kana)": "1,2,8",
- "num(ko)": 67,
- "len(ko)": "1,1,2"
- },
- "internlm/internlm-chat-7b": {
- "tokenizer": "internlm-chat-7b",
- "organization": "Shanghai AI Lab",
- "vocab_size": 103168,
- "num(digit)": 1259,
- "len(digit)": "1,3,19",
- "num(space)": 33008,
- "len(space)": "1,6,128",
- "num(ar)": 6702,
- "len(ar)": "1,4,16",
- "num(zh)": 32000,
- "len(zh)": "1,2,15",
- "num(ja)": 32866,
- "len(ja)": "1,2,15",
- "num(ja-kana)": 864,
- "len(ja-kana)": "1,2,9",
- "num(ko)": 298,
- "len(ko)": "1,1,1"
- },
- "internlm/internlm-xcomposer-7b": {
- "tokenizer": "internlm-xcomposer-7b",
- "organization": "Shanghai AI Lab",
- "vocab_size": 103168,
- "num(digit)": 1261,
- "len(digit)": "1,3,19",
- "num(space)": 33008,
- "len(space)": "1,6,128",
- "num(ar)": 6702,
- "len(ar)": "1,4,16",
- "num(zh)": 32000,
- "len(zh)": "1,2,15",
- "num(ja)": 32866,
- "len(ja)": "1,2,15",
- "num(ja-kana)": 864,
- "len(ja-kana)": "1,2,9",
- "num(ko)": 298,
- "len(ko)": "1,1,1"
- },
- "internlm/internlm2-chat-7b": {
- "tokenizer": "internlm2-chat-7b",
- "organization": "Shanghai AI Lab",
- "vocab_size": 92544,
- "num(digit)": 1261,
- "len(digit)": "1,3,18",
- "num(space)": 28681,
- "len(space)": "1,7,128",
- "num(ar)": 30,
- "len(ar)": "1,1,1",
- "num(zh)": 31148,
- "len(zh)": "1,2,15",
- "num(ja)": 31296,
- "len(ja)": "1,2,15",
- "num(ja-kana)": 148,
- "len(ja-kana)": "1,1,1",
- "num(ko)": 83,
- "len(ko)": "1,1,1"
- },
- "internlm/internlm2-math-7b": {
- "tokenizer": "internlm2-math-7b",
- "organization": "Shanghai AI Lab",
- "vocab_size": 92544,
- "num(digit)": 1261,
- "len(digit)": "1,3,18",
- "num(space)": 28681,
- "len(space)": "1,7,128",
- "num(ar)": 30,
- "len(ar)": "1,1,1",
- "num(zh)": 31148,
- "len(zh)": "1,2,15",
- "num(ja)": 31296,
- "len(ja)": "1,2,15",
- "num(ja-kana)": 148,
- "len(ja-kana)": "1,1,1",
- "num(ko)": 83,
- "len(ko)": "1,1,1"
- },
- "microsoft/Phi-3-mini-4k-instruct": {
- "tokenizer": "Phi-3-mini-4k-instruct",
- "organization": "Microsoft",
- "vocab_size": 32011,
- "num(digit)": 20,
- "len(digit)": "1,1,1",
- "num(space)": 61,
- "len(space)": "1,2,15",
- "num(ar)": 55,
- "len(ar)": "1,1,2",
- "num(zh)": 700,
- "len(zh)": "1,1,1",
- "num(ja)": 837,
- "len(ja)": "1,1,1",
- "num(ja-kana)": 137,
- "len(ja-kana)": "1,1,1",
- "num(ko)": 111,
- "len(ko)": "1,1,1"
- },
- "microsoft/phi-1": {
- "tokenizer": "phi-1",
- "organization": "Microsoft",
- "vocab_size": 50295,
- "num(digit)": 1691,
- "len(digit)": "1,3,16",
- "num(space)": 33129,
- "len(space)": "1,7,66",
- "num(ar)": 22,
- "len(ar)": "1,1,3",
- "num(zh)": 51,
- "len(zh)": "1,1,4",
- "num(ja)": 183,
- "len(ja)": "1,1,7",
- "num(ja-kana)": 133,
- "len(ja-kana)": "1,1,7",
- "num(ko)": 0,
- "len(ko)": "-"
- },
- "microsoft/phi-2": {
- "tokenizer": "phi-2",
- "organization": "Microsoft",
- "vocab_size": 50295,
- "num(digit)": 1691,
- "len(digit)": "1,3,16",
- "num(space)": 33129,
- "len(space)": "1,7,66",
- "num(ar)": 22,
- "len(ar)": "1,1,3",
- "num(zh)": 51,
- "len(zh)": "1,1,4",
- "num(ja)": 183,
- "len(ja)": "1,1,7",
- "num(ja-kana)": 133,
- "len(ja-kana)": "1,1,7",
- "num(ko)": 0,
- "len(ko)": "-"
- },
- "mistralai/Mistral-7B-v0.1": {
- "tokenizer": "Mistral-7B-v0.1",
- "organization": "Mistral",
- "vocab_size": 32000,
- "num(digit)": 20,
- "len(digit)": "1,1,1",
- "num(space)": 85,
- "len(space)": "1,3,15",
- "num(ar)": 71,
- "len(ar)": "1,1,2",
- "num(zh)": 1459,
- "len(zh)": "1,1,2",
- "num(ja)": 1593,
- "len(ja)": "1,1,2",
- "num(ja-kana)": 134,
- "len(ja-kana)": "1,1,1",
- "num(ko)": 346,
- "len(ko)": "1,1,1"
- },
- "mistralai/Mixtral-8x7B-v0.1": {
- "tokenizer": "Mixtral-8x7B-v0.1",
- "organization": "Mistral",
- "vocab_size": 32000,
- "num(digit)": 20,
- "len(digit)": "1,1,1",
- "num(space)": 85,
- "len(space)": "1,3,15",
- "num(ar)": 71,
- "len(ar)": "1,1,2",
- "num(zh)": 1459,
- "len(zh)": "1,1,2",
- "num(ja)": 1593,
- "len(ja)": "1,1,2",
- "num(ja-kana)": 134,
- "len(ja-kana)": "1,1,1",
- "num(ko)": 346,
- "len(ko)": "1,1,1"
- },
- "openai-community/gpt2": {
- "tokenizer": "gpt2",
- "organization": "OpenAI",
- "vocab_size": 50257,
- "num(digit)": 1691,
- "len(digit)": "1,3,16",
- "num(space)": 33129,
- "len(space)": "1,7,66",
- "num(ar)": 22,
- "len(ar)": "1,1,3",
- "num(zh)": 51,
- "len(zh)": "1,1,4",
- "num(ja)": 183,
- "len(ja)": "1,1,7",
- "num(ja-kana)": 133,
- "len(ja-kana)": "1,1,7",
- "num(ko)": 0,
- "len(ko)": "-"
- },
- "openai/code-davinci-002": {
- "tokenizer": "code-davinci-002",
- "organization": "OpenAI",
- "vocab_size": 50281,
- "num(digit)": 1691,
- "len(digit)": "1,3,16",
- "num(space)": 33175,
- "len(space)": "1,7,66",
- "num(ar)": 22,
- "len(ar)": "1,1,3",
- "num(zh)": 51,
- "len(zh)": "1,1,4",
- "num(ja)": 183,
- "len(ja)": "1,1,7",
- "num(ja-kana)": 133,
- "len(ja-kana)": "1,1,7",
- "num(ko)": 0,
- "len(ko)": "-"
- },
- "openai/gpt-3.5-turbo": {
- "tokenizer": "gpt-3.5-turbo",
- "organization": "OpenAI",
- "vocab_size": 100277,
- "num(digit)": 1110,
- "len(digit)": "1,3,3",
- "num(space)": 47472,
- "len(space)": "1,7,128",
- "num(ar)": 113,
- "len(ar)": "1,2,10",
- "num(zh)": 868,
- "len(zh)": "1,1,7",
- "num(ja)": 1035,
- "len(ja)": "1,1,7",
- "num(ja-kana)": 169,
- "len(ja-kana)": "1,1,7",
- "num(ko)": 299,
- "len(ko)": "1,2,4"
- },
- "openai/gpt-4o": {
- "tokenizer": "gpt-4o",
- "organization": "OpenAI",
- "vocab_size": 200019,
- "num(digit)": 1110,
- "len(digit)": "1,3,3",
- "num(space)": 109316,
- "len(space)": "1,6,128",
- "num(ar)": 8055,
- "len(ar)": "1,4,12",
- "num(zh)": 7563,
- "len(zh)": "1,2,11",
- "num(ja)": 8292,
- "len(ja)": "1,2,11",
- "num(ja-kana)": 809,
- "len(ja-kana)": "1,2,11",
- "num(ko)": 2365,
- "len(ko)": "1,2,8"
- },
- "openai/text-davinci-003": {
- "tokenizer": "text-davinci-003",
- "organization": "OpenAI",
- "vocab_size": 50281,
- "num(digit)": 1691,
- "len(digit)": "1,3,16",
- "num(space)": 33175,
- "len(space)": "1,7,66",
- "num(ar)": 22,
- "len(ar)": "1,1,3",
- "num(zh)": 51,
- "len(zh)": "1,1,4",
- "num(ja)": 183,
- "len(ja)": "1,1,7",
- "num(ja-kana)": 133,
- "len(ja-kana)": "1,1,7",
- "num(ko)": 0,
- "len(ko)": "-"
- },
- "thu-coai/CharacterGLM-6B": {
- "tokenizer": "CharacterGLM-6B",
- "organization": "Tsinghua",
- "vocab_size": 64789,
- "num(digit)": 20,
- "len(digit)": "1,1,1",
- "num(space)": 67,
- "len(space)": "1,2,15",
- "num(ar)": 57,
- "len(ar)": "1,1,2",
- "num(zh)": 30922,
- "len(zh)": "1,2,16",
- "num(ja)": 31065,
- "len(ja)": "1,2,16",
- "num(ja-kana)": 143,
- "len(ja-kana)": "1,1,1",
- "num(ko)": 604,
- "len(ko)": "1,1,1"
- },
- "tiiuae/falcon-180b": {
- "tokenizer": "falcon-180b",
- "organization": "TII",
- "vocab_size": 65024,
- "num(digit)": 1108,
- "len(digit)": "1,3,3",
- "num(space)": 40202,
- "len(space)": "1,7,65",
- "num(ar)": 21,
- "len(ar)": "1,1,4",
- "num(zh)": 1627,
- "len(zh)": "1,1,3",
- "num(ja)": 1652,
- "len(ja)": "1,1,3",
- "num(ja-kana)": 25,
- "len(ja-kana)": "1,1,1",
- "num(ko)": 1,
- "len(ko)": "1,1,1"
- },
- "tiiuae/falcon-7b": {
- "tokenizer": "falcon-7b",
- "organization": "TII",
- "vocab_size": 65024,
- "num(digit)": 1108,
- "len(digit)": "1,3,3",
- "num(space)": 40202,
- "len(space)": "1,7,65",
- "num(ar)": 21,
- "len(ar)": "1,1,4",
- "num(zh)": 1627,
- "len(zh)": "1,1,3",
- "num(ja)": 1652,
- "len(ja)": "1,1,3",
- "num(ja-kana)": 25,
- "len(ja-kana)": "1,1,1",
- "num(ko)": 1,
- "len(ko)": "1,1,1"
- },
- "Qwen/Qwen1.5-1.8B": {
- "tokenizer": "Qwen1.5-1.8B",
- "organization": "Alibaba",
- "vocab_size": 151646,
- "num(digit)": 10,
- "len(digit)": "1,1,1",
- "num(space)": 55883,
- "len(space)": "1,6,128",
- "num(ar)": 4018,
- "len(ar)": "1,3,12",
- "num(zh)": 25557,
- "len(zh)": "1,2,7",
- "num(ja)": 27206,
- "len(ja)": "1,2,11",
- "num(ja-kana)": 2089,
- "len(ja-kana)": "1,3,11",
- "num(ko)": 3495,
- "len(ko)": "1,1,5"
- },
- "Qwen/Qwen1.5-110B": {
- "tokenizer": "Qwen1.5-110B",
- "organization": "Alibaba",
- "vocab_size": 151646,
- "num(digit)": 10,
- "len(digit)": "1,1,1",
- "num(space)": 55883,
- "len(space)": "1,6,128",
- "num(ar)": 4018,
- "len(ar)": "1,3,12",
- "num(zh)": 25557,
- "len(zh)": "1,2,7",
- "num(ja)": 27206,
- "len(ja)": "1,2,11",
- "num(ja-kana)": 2089,
- "len(ja-kana)": "1,3,11",
- "num(ko)": 3495,
- "len(ko)": "1,1,5"
- },
- "Qwen/Qwen1.5-14B": {
- "tokenizer": "Qwen1.5-14B",
- "organization": "Alibaba",
- "vocab_size": 151646,
- "num(digit)": 10,
- "len(digit)": "1,1,1",
- "num(space)": 55883,
- "len(space)": "1,6,128",
- "num(ar)": 4018,
- "len(ar)": "1,3,12",
- "num(zh)": 25557,
- "len(zh)": "1,2,7",
- "num(ja)": 27206,
- "len(ja)": "1,2,11",
- "num(ja-kana)": 2089,
- "len(ja-kana)": "1,3,11",
- "num(ko)": 3495,
- "len(ko)": "1,1,5"
- }
+{
+ "FacebookAI/xlm-roberta-base": {
+ "tokenizer": "xlm-roberta-base",
+ "organization": "Facebook",
+ "vocab_size": 250002,
+ "num(digit)": 2728,
+ "len(digit)": "1,3,9",
+ "num(space)": 1,
+ "len(space)": "1,1,1",
+ "num(ar)": 14644,
+ "len(ar)": "1,4,16",
+ "num(zh)": 18457,
+ "len(zh)": "1,2,16",
+ "num(ja)": 20572,
+ "len(ja)": "1,2,16",
+ "num(ja-kana)": 3434,
+ "len(ja-kana)": "1,3,12",
+ "num(ko)": 5373,
+ "len(ko)": "1,2,8"
+ },
+ "clue/roberta_chinese_clue_tiny": {
+ "tokenizer": "roberta-chinese-clue",
+ "organization": "CLUE",
+ "vocab_size": 8021,
+ "num(digit)": 230,
+ "len(digit)": "1,4,10",
+ "num(space)": 0,
+ "len(space)": "-",
+ "num(ar)": 30,
+ "len(ar)": "1,2,3",
+ "num(zh)": 5689,
+ "len(zh)": "1,1,1",
+ "num(ja)": 5691,
+ "len(ja)": "1,1,3",
+ "num(ja-kana)": 0,
+ "len(ja-kana)": "-",
+ "num(ko)": 0,
+ "len(ko)": "-"
+ },
+ "dbmdz/bert-base-german-uncased": {
+ "tokenizer": "bert-base-german-uncased",
+ "organization": "dbmdz",
+ "vocab_size": 31102,
+ "num(digit)": 1733,
+ "len(digit)": "1,4,12",
+ "num(space)": 0,
+ "len(space)": "-",
+ "num(ar)": 0,
+ "len(ar)": "-",
+ "num(zh)": 0,
+ "len(zh)": "-",
+ "num(ja)": 0,
+ "len(ja)": "-",
+ "num(ja-kana)": 0,
+ "len(ja-kana)": "-",
+ "num(ko)": 0,
+ "len(ko)": "-"
+ },
+ "google-bert/bert-base-cased": {
+ "tokenizer": "bert-base-cased",
+ "organization": "Google",
+ "vocab_size": 28996,
+ "num(digit)": 926,
+ "len(digit)": "1,4,11",
+ "num(space)": 0,
+ "len(space)": "-",
+ "num(ar)": 94,
+ "len(ar)": "1,3,4",
+ "num(zh)": 226,
+ "len(zh)": "1,2,3",
+ "num(ja)": 390,
+ "len(ja)": "1,2,3",
+ "num(ja-kana)": 164,
+ "len(ja-kana)": "1,2,3",
+ "num(ko)": 10,
+ "len(ko)": "1,2,3"
+ },
+ "google-bert/bert-base-chinese": {
+ "tokenizer": "bert-base-chinese",
+ "organization": "Google",
+ "vocab_size": 21128,
+ "num(digit)": 1451,
+ "len(digit)": "1,3,12",
+ "num(space)": 2,
+ "len(space)": "1,2,3",
+ "num(ar)": 30,
+ "len(ar)": "1,2,3",
+ "num(zh)": 14642,
+ "len(zh)": "1,2,3",
+ "num(ja)": 15197,
+ "len(ja)": "1,3,15",
+ "num(ja-kana)": 553,
+ "len(ja-kana)": "1,3,15",
+ "num(ko)": 0,
+ "len(ko)": "-"
+ },
+ "google-bert/bert-base-german-cased": {
+ "tokenizer": "bert-base-german-cased",
+ "organization": "Google",
+ "vocab_size": 30000,
+ "num(digit)": 4065,
+ "len(digit)": "1,11,22",
+ "num(space)": 0,
+ "len(space)": "-",
+ "num(ar)": 0,
+ "len(ar)": "-",
+ "num(zh)": 0,
+ "len(zh)": "-",
+ "num(ja)": 0,
+ "len(ja)": "-",
+ "num(ja-kana)": 0,
+ "len(ja-kana)": "-",
+ "num(ko)": 0,
+ "len(ko)": "-"
+ },
+ "google-bert/bert-base-multilingual-cased": {
+ "tokenizer": "bert-base-multilingual-cased",
+ "organization": "Google",
+ "vocab_size": 119547,
+ "num(digit)": 2583,
+ "len(digit)": "1,3,13",
+ "num(space)": 0,
+ "len(space)": "-",
+ "num(ar)": 4873,
+ "len(ar)": "1,5,14",
+ "num(zh)": 13542,
+ "len(zh)": "1,2,3",
+ "num(ja)": 14880,
+ "len(ja)": "1,3,10",
+ "num(ja-kana)": 1336,
+ "len(ja-kana)": "1,4,10",
+ "num(ko)": 3271,
+ "len(ko)": "1,3,6"
+ },
+ "google-bert/bert-base-multilingual-uncased": {
+ "tokenizer": "bert-base-multilingual-uncased",
+ "organization": "Google",
+ "vocab_size": 105879,
+ "num(digit)": 2510,
+ "len(digit)": "1,3,13",
+ "num(space)": 2,
+ "len(space)": "1,2,3",
+ "num(ar)": 4530,
+ "len(ar)": "1,5,13",
+ "num(zh)": 16658,
+ "len(zh)": "1,2,3",
+ "num(ja)": 17858,
+ "len(ja)": "1,3,10",
+ "num(ja-kana)": 1188,
+ "len(ja-kana)": "1,4,10",
+ "num(ko)": 0,
+ "len(ko)": "-"
+ },
+ "google-bert/bert-base-uncased": {
+ "tokenizer": "bert-base-uncased",
+ "organization": "Google",
+ "vocab_size": 30522,
+ "num(digit)": 2056,
+ "len(digit)": "1,4,11",
+ "num(space)": 0,
+ "len(space)": "-",
+ "num(ar)": 88,
+ "len(ar)": "1,3,5",
+ "num(zh)": 488,
+ "len(zh)": "1,2,3",
+ "num(ja)": 676,
+ "len(ja)": "1,2,3",
+ "num(ja-kana)": 188,
+ "len(ja-kana)": "1,2,3",
+ "num(ko)": 0,
+ "len(ko)": "-"
+ },
+ "google/mobilebert-uncased": {
+ "tokenizer": "mobilebert-uncased",
+ "organization": "Google",
+ "vocab_size": 30522,
+ "num(digit)": 2056,
+ "len(digit)": "1,4,11",
+ "num(space)": 0,
+ "len(space)": "-",
+ "num(ar)": 88,
+ "len(ar)": "1,3,5",
+ "num(zh)": 488,
+ "len(zh)": "1,2,3",
+ "num(ja)": 676,
+ "len(ja)": "1,2,3",
+ "num(ja-kana)": 188,
+ "len(ja-kana)": "1,2,3",
+ "num(ko)": 0,
+ "len(ko)": "-"
+ },
+ "tohoku-nlp/bert-base-japanese": {
+ "tokenizer": "bert-base-japanese",
+ "organization": "Tohoku",
+ "vocab_size": 32000,
+ "num(digit)": 669,
+ "len(digit)": "1,3,5",
+ "num(space)": 0,
+ "len(space)": "-",
+ "num(ar)": 10,
+ "len(ar)": "1,3,3",
+ "num(zh)": 18792,
+ "len(zh)": "1,2,11",
+ "num(ja)": 28367,
+ "len(ja)": "1,2,13",
+ "num(ja-kana)": 12359,
+ "len(ja-kana)": "1,4,13",
+ "num(ko)": 0,
+ "len(ko)": "-"
+ },
+ "gpt-4": {
+ "tokenizer": "gpt-4",
+ "organization": "OpenAI",
+ "vocab_size": 100277,
+ "num(digit)": 1110,
+ "len(digit)": "1,3,3",
+ "num(space)": 47472,
+ "len(space)": "1,7,128",
+ "num(ar)": 113,
+ "len(ar)": "1,2,10",
+ "num(zh)": 868,
+ "len(zh)": "1,1,7",
+ "num(ja)": 1035,
+ "len(ja)": "1,1,7",
+ "num(ja-kana)": 169,
+ "len(ja-kana)": "1,1,7",
+ "num(ko)": 299,
+ "len(ko)": "1,2,4"
+ },
+ "llama3": {
+ "tokenizer": "llama3",
+ "organization": "Meta",
+ "vocab_size": 128256,
+ "num(digit)": 1110,
+ "len(digit)": "1,3,3",
+ "num(space)": 60860,
+ "len(space)": "1,6,128",
+ "num(ar)": 3810,
+ "len(ar)": "1,4,11",
+ "num(zh)": 4424,
+ "len(zh)": "1,1,7",
+ "num(ja)": 5387,
+ "len(ja)": "1,2,8",
+ "num(ja-kana)": 1086,
+ "len(ja-kana)": "1,2,8",
+ "num(ko)": 2281,
+ "len(ko)": "1,2,6"
+ },
+ "google-t5/t5-large": {
+ "tokenizer": "t5",
+ "organization": "Google",
+ "vocab_size": 32100,
+ "num(digit)": 1133,
+ "len(digit)": "1,3,13",
+ "num(space)": 0,
+ "len(space)": "-",
+ "num(ar)": 0,
+ "len(ar)": "-",
+ "num(zh)": 0,
+ "len(zh)": "-",
+ "num(ja)": 0,
+ "len(ja)": "-",
+ "num(ja-kana)": 0,
+ "len(ja-kana)": "-",
+ "num(ko)": 0,
+ "len(ko)": "-"
+ },
+ "google/byt5-small": {
+ "tokenizer": "byt5-small",
+ "organization": "Google",
+ "vocab_size": 384,
+ "num(digit)": 10,
+ "len(digit)": "1,1,1",
+ "num(space)": 10,
+ "len(space)": "1,1,1",
+ "num(ar)": 0,
+ "len(ar)": "-",
+ "num(zh)": 0,
+ "len(zh)": "-",
+ "num(ja)": 0,
+ "len(ja)": "-",
+ "num(ja-kana)": 0,
+ "len(ja-kana)": "-",
+ "num(ko)": 0,
+ "len(ko)": "-"
+ },
+ "google/mt5-large": {
+ "tokenizer": "mt5-large",
+ "organization": "Google",
+ "vocab_size": 250100,
+ "num(digit)": 16829,
+ "len(digit)": "1,4,16",
+ "num(space)": 1,
+ "len(space)": "1,1,1",
+ "num(ar)": 7459,
+ "len(ar)": "1,3,16",
+ "num(zh)": 21489,
+ "len(zh)": "1,2,16",
+ "num(ja)": 27078,
+ "len(ja)": "1,2,16",
+ "num(ja-kana)": 9160,
+ "len(ja-kana)": "1,3,14",
+ "num(ko)": 4041,
+ "len(ko)": "1,1,10"
+ },
+ "lmsys/fastchat-t5-3b-v1.0": {
+ "tokenizer": "fastchat-t5-3b-v1.0",
+ "organization": "LMSYS",
+ "vocab_size": 32110,
+ "num(digit)": 1033,
+ "len(digit)": "1,3,8",
+ "num(space)": 0,
+ "len(space)": "-",
+ "num(ar)": 0,
+ "len(ar)": "-",
+ "num(zh)": 0,
+ "len(zh)": "-",
+ "num(ja)": 0,
+ "len(ja)": "-",
+ "num(ja-kana)": 0,
+ "len(ja-kana)": "-",
+ "num(ko)": 0,
+ "len(ko)": "-"
+ },
+ "paust/pko-t5-large": {
+ "tokenizer": "pko-t5-large",
+ "organization": "PAUST",
+ "vocab_size": 50358,
+ "num(digit)": 51,
+ "len(digit)": "1,2,3",
+ "num(space)": 10,
+ "len(space)": "1,1,1",
+ "num(ar)": 0,
+ "len(ar)": "-",
+ "num(zh)": 0,
+ "len(zh)": "-",
+ "num(ja)": 0,
+ "len(ja)": "-",
+ "num(ja-kana)": 0,
+ "len(ja-kana)": "-",
+ "num(ko)": 49050,
+ "len(ko)": "1,2,16"
+ },
+ "bloom": {
+ "tokenizer": "bloom",
+ "organization": "BigScience",
+ "vocab_size": 250680,
+ "num(digit)": 6629,
+ "len(digit)": "1,4,50",
+ "num(space)": 140180,
+ "len(space)": "1,6,600",
+ "num(ar)": 20854,
+ "len(ar)": "1,5,16",
+ "num(zh)": 30603,
+ "len(zh)": "1,2,23",
+ "num(ja)": 30816,
+ "len(ja)": "1,2,23",
+ "num(ja-kana)": 214,
+ "len(ja-kana)": "1,1,3",
+ "num(ko)": 338,
+ "len(ko)": "1,1,3"
+ },
+ "llama": {
+ "tokenizer": "llama",
+ "organization": "Meta",
+ "vocab_size": 32000,
+ "num(digit)": 20,
+ "len(digit)": "1,1,1",
+ "num(space)": 61,
+ "len(space)": "1,2,15",
+ "num(ar)": 55,
+ "len(ar)": "1,1,2",
+ "num(zh)": 700,
+ "len(zh)": "1,1,1",
+ "num(ja)": 837,
+ "len(ja)": "1,1,1",
+ "num(ja-kana)": 137,
+ "len(ja-kana)": "1,1,1",
+ "num(ko)": 111,
+ "len(ko)": "1,1,1"
+ },
+ "ClueAI/ChatYuan-large-v2": {
+ "tokenizer": "ChatYuan-large-v2",
+ "organization": "CLUE",
+ "vocab_size": 32128,
+ "num(digit)": 740,
+ "len(digit)": "1,3,9",
+ "num(space)": 0,
+ "len(space)": "-",
+ "num(ar)": 2,
+ "len(ar)": "1,1,1",
+ "num(zh)": 29591,
+ "len(zh)": "1,2,16",
+ "num(ja)": 29736,
+ "len(ja)": "1,2,16",
+ "num(ja-kana)": 145,
+ "len(ja-kana)": "1,1,2",
+ "num(ko)": 0,
+ "len(ko)": "-"
+ },
+ "Meta/llama3": {
+ "tokenizer": "llama3",
+ "organization": "Meta",
+ "vocab_size": 128256,
+ "num(digit)": 1110,
+ "len(digit)": "1,3,3",
+ "num(space)": 60860,
+ "len(space)": "1,6,128",
+ "num(ar)": 3810,
+ "len(ar)": "1,4,11",
+ "num(zh)": 4424,
+ "len(zh)": "1,1,7",
+ "num(ja)": 5387,
+ "len(ja)": "1,2,8",
+ "num(ja-kana)": 1086,
+ "len(ja-kana)": "1,2,8",
+ "num(ko)": 2281,
+ "len(ko)": "1,2,6"
+ },
+ "openai/gpt-4": {
+ "tokenizer": "gpt-4",
+ "organization": "OpenAI",
+ "vocab_size": 100277,
+ "num(digit)": 1110,
+ "len(digit)": "1,3,3",
+ "num(space)": 47472,
+ "len(space)": "1,7,128",
+ "num(ar)": 113,
+ "len(ar)": "1,2,10",
+ "num(zh)": 868,
+ "len(zh)": "1,1,7",
+ "num(ja)": 1035,
+ "len(ja)": "1,1,7",
+ "num(ja-kana)": 169,
+ "len(ja-kana)": "1,1,7",
+ "num(ko)": 299,
+ "len(ko)": "1,2,4"
+ },
+ "gradientai/Llama-3-8B-Instruct-Gradient-1048k": {
+ "tokenizer": "llama3",
+ "organization": "Meta",
+ "vocab_size": 128256,
+ "num(digit)": 1110,
+ "len(digit)": "1,3,3",
+ "num(space)": 60860,
+ "len(space)": "1,6,128",
+ "num(ar)": 3810,
+ "len(ar)": "1,4,11",
+ "num(zh)": 4424,
+ "len(zh)": "1,1,7",
+ "num(ja)": 5387,
+ "len(ja)": "1,2,8",
+ "num(ja-kana)": 1086,
+ "len(ja-kana)": "1,2,8",
+ "num(ko)": 2281,
+ "len(ko)": "1,2,6"
+ },
+ "bigscience/bloom": {
+ "tokenizer": "bloom",
+ "organization": "BigScience",
+ "vocab_size": 250680,
+ "num(digit)": 6629,
+ "len(digit)": "1,4,50",
+ "num(space)": 140180,
+ "len(space)": "1,6,600",
+ "num(ar)": 20854,
+ "len(ar)": "1,5,16",
+ "num(zh)": 30603,
+ "len(zh)": "1,2,23",
+ "num(ja)": 30816,
+ "len(ja)": "1,2,23",
+ "num(ja-kana)": 214,
+ "len(ja-kana)": "1,1,3",
+ "num(ko)": 338,
+ "len(ko)": "1,1,3"
+ },
+ "huggyllama/llama-7b": {
+ "tokenizer": "llama",
+ "organization": "Meta",
+ "vocab_size": 32000,
+ "num(digit)": 20,
+ "len(digit)": "1,1,1",
+ "num(space)": 61,
+ "len(space)": "1,2,15",
+ "num(ar)": 55,
+ "len(ar)": "1,1,2",
+ "num(zh)": 700,
+ "len(zh)": "1,1,1",
+ "num(ja)": 837,
+ "len(ja)": "1,1,1",
+ "num(ja-kana)": 137,
+ "len(ja-kana)": "1,1,1",
+ "num(ko)": 111,
+ "len(ko)": "1,1,1"
+ },
+ "baichuan-inc/Baichuan-7B": {
+ "tokenizer": "baichuan",
+ "organization": "Baichuan",
+ "vocab_size": 64000,
+ "num(digit)": 335,
+ "len(digit)": "1,14,14",
+ "num(space)": 13,
+ "len(space)": "1,1,1",
+ "num(ar)": 299,
+ "len(ar)": "1,1,2",
+ "num(zh)": 27676,
+ "len(zh)": "1,1,9",
+ "num(ja)": 28522,
+ "len(ja)": "1,1,9",
+ "num(ja-kana)": 178,
+ "len(ja-kana)": "1,1,1",
+ "num(ko)": 1591,
+ "len(ko)": "1,1,1"
+ },
+ "01-ai/Yi-34B": {
+ "tokenizer": "Yi-34B",
+ "organization": "Yi",
+ "vocab_size": 64000,
+ "num(digit)": 200,
+ "len(digit)": "1,13,15",
+ "num(space)": 24274,
+ "len(space)": "1,7,16",
+ "num(ar)": 18,
+ "len(ar)": "1,1,4",
+ "num(zh)": 21356,
+ "len(zh)": "1,2,12",
+ "num(ja)": 21407,
+ "len(ja)": "1,2,12",
+ "num(ja-kana)": 51,
+ "len(ja-kana)": "1,1,2",
+ "num(ko)": 28,
+ "len(ko)": "1,1,2"
+ },
+ "01-ai/Yi-6B": {
+ "tokenizer": "Yi-6B",
+ "organization": "Yi",
+ "vocab_size": 64000,
+ "num(digit)": 200,
+ "len(digit)": "1,13,15",
+ "num(space)": 24274,
+ "len(space)": "1,7,16",
+ "num(ar)": 18,
+ "len(ar)": "1,1,4",
+ "num(zh)": 21356,
+ "len(zh)": "1,2,12",
+ "num(ja)": 21407,
+ "len(ja)": "1,2,12",
+ "num(ja-kana)": 51,
+ "len(ja-kana)": "1,1,2",
+ "num(ko)": 28,
+ "len(ko)": "1,1,2"
+ },
+ "01-ai/Yi-VL-34B": {
+ "tokenizer": "Yi-VL-34B",
+ "organization": "Yi",
+ "vocab_size": 64000,
+ "num(digit)": 200,
+ "len(digit)": "1,13,15",
+ "num(space)": 43,
+ "len(space)": "1,2,15",
+ "num(ar)": 18,
+ "len(ar)": "1,1,4",
+ "num(zh)": 21356,
+ "len(zh)": "1,2,12",
+ "num(ja)": 21407,
+ "len(ja)": "1,2,12",
+ "num(ja-kana)": 51,
+ "len(ja-kana)": "1,1,2",
+ "num(ko)": 28,
+ "len(ko)": "1,1,2"
+ },
+ "ClassCat/gpt2-base-french": {
+ "tokenizer": "gpt2-base-french",
+ "organization": "ClassCat",
+ "vocab_size": 50000,
+ "num(digit)": 1833,
+ "len(digit)": "1,4,5",
+ "num(space)": 31889,
+ "len(space)": "1,7,32",
+ "num(ar)": 41,
+ "len(ar)": "1,1,4",
+ "num(zh)": 27,
+ "len(zh)": "1,1,1",
+ "num(ja)": 46,
+ "len(ja)": "1,1,2",
+ "num(ja-kana)": 19,
+ "len(ja-kana)": "1,1,2",
+ "num(ko)": 0,
+ "len(ko)": "-"
+ },
+ "ClassCat/gpt2-base-spanish": {
+ "tokenizer": "gpt2-base-spanish",
+ "organization": "ClassCat",
+ "vocab_size": 50000,
+ "num(digit)": 1492,
+ "len(digit)": "1,4,9",
+ "num(space)": 34496,
+ "len(space)": "1,8,32",
+ "num(ar)": 36,
+ "len(ar)": "1,1,4",
+ "num(zh)": 13,
+ "len(zh)": "1,1,1",
+ "num(ja)": 36,
+ "len(ja)": "1,1,2",
+ "num(ja-kana)": 23,
+ "len(ja-kana)": "1,1,2",
+ "num(ko)": 0,
+ "len(ko)": "-"
+ },
+ "ClueAI/PromptCLUE-base": {
+ "tokenizer": "PromptCLUE-base",
+ "organization": "CLUE",
+ "vocab_size": 32128,
+ "num(digit)": 740,
+ "len(digit)": "1,3,9",
+ "num(space)": 0,
+ "len(space)": "-",
+ "num(ar)": 2,
+ "len(ar)": "1,1,1",
+ "num(zh)": 29591,
+ "len(zh)": "1,2,16",
+ "num(ja)": 29736,
+ "len(ja)": "1,2,16",
+ "num(ja-kana)": 145,
+ "len(ja-kana)": "1,1,2",
+ "num(ko)": 0,
+ "len(ko)": "-"
+ },
+ "CohereForAI/aya-101": {
+ "tokenizer": "aya-101",
+ "organization": "Cohere For AI",
+ "vocab_size": 250100,
+ "num(digit)": 16829,
+ "len(digit)": "1,4,16",
+ "num(space)": 1,
+ "len(space)": "1,1,1",
+ "num(ar)": 7459,
+ "len(ar)": "1,3,16",
+ "num(zh)": 21489,
+ "len(zh)": "1,2,16",
+ "num(ja)": 27078,
+ "len(ja)": "1,2,16",
+ "num(ja-kana)": 9160,
+ "len(ja-kana)": "1,3,14",
+ "num(ko)": 4041,
+ "len(ko)": "1,1,10"
+ },
+ "EleutherAI/gpt-neox-20b": {
+ "tokenizer": "gpt-neox-20b",
+ "organization": "EleutherAI",
+ "vocab_size": 50277,
+ "num(digit)": 2036,
+ "len(digit)": "1,3,35",
+ "num(space)": 28996,
+ "len(space)": "1,7,512",
+ "num(ar)": 94,
+ "len(ar)": "1,2,4",
+ "num(zh)": 313,
+ "len(zh)": "1,1,2",
+ "num(ja)": 480,
+ "len(ja)": "1,1,4",
+ "num(ja-kana)": 167,
+ "len(ja-kana)": "1,1,4",
+ "num(ko)": 25,
+ "len(ko)": "1,1,2"
+ },
+ "HuggingFaceH4/starchat-alpha": {
+ "tokenizer": "starchat-alpha",
+ "organization": "-",
+ "vocab_size": 49156,
+ "num(digit)": 10,
+ "len(digit)": "1,1,1",
+ "num(space)": 16515,
+ "len(space)": "1,6,256",
+ "num(ar)": 84,
+ "len(ar)": "1,2,4",
+ "num(zh)": 2030,
+ "len(zh)": "1,1,7",
+ "num(ja)": 2368,
+ "len(ja)": "1,1,8",
+ "num(ja-kana)": 360,
+ "len(ja-kana)": "1,2,8",
+ "num(ko)": 491,
+ "len(ko)": "1,2,5"
+ },
+ "HuggingFaceH4/zephyr-7b-beta": {
+ "tokenizer": "zephyr-7b-beta",
+ "organization": "HuggingFace",
+ "vocab_size": 32000,
+ "num(digit)": 20,
+ "len(digit)": "1,1,1",
+ "num(space)": 85,
+ "len(space)": "1,3,15",
+ "num(ar)": 71,
+ "len(ar)": "1,1,2",
+ "num(zh)": 1459,
+ "len(zh)": "1,1,2",
+ "num(ja)": 1593,
+ "len(ja)": "1,1,2",
+ "num(ja-kana)": 134,
+ "len(ja-kana)": "1,1,1",
+ "num(ko)": 346,
+ "len(ko)": "1,1,1"
+ },
+ "LLM360/CrystalCoder": {
+ "tokenizer": "CrystalCoder",
+ "organization": "MBZUAI",
+ "vocab_size": 32022,
+ "num(digit)": 20,
+ "len(digit)": "1,1,1",
+ "num(space)": 61,
+ "len(space)": "1,2,15",
+ "num(ar)": 55,
+ "len(ar)": "1,1,2",
+ "num(zh)": 700,
+ "len(zh)": "1,1,1",
+ "num(ja)": 837,
+ "len(ja)": "1,1,1",
+ "num(ja-kana)": 137,
+ "len(ja-kana)": "1,1,1",
+ "num(ko)": 111,
+ "len(ko)": "1,1,1"
+ },
+ "NousResearch/Llama-2-7b-chat-hf": {
+ "tokenizer": "llama2",
+ "organization": "Meta",
+ "vocab_size": 32001,
+ "num(digit)": 20,
+ "len(digit)": "1,1,1",
+ "num(space)": 61,
+ "len(space)": "1,2,15",
+ "num(ar)": 55,
+ "len(ar)": "1,1,2",
+ "num(zh)": 700,
+ "len(zh)": "1,1,1",
+ "num(ja)": 837,
+ "len(ja)": "1,1,1",
+ "num(ja-kana)": 137,
+ "len(ja-kana)": "1,1,1",
+ "num(ko)": 111,
+ "len(ko)": "1,1,1"
+ },
+ "OrionStarAI/Orion-14B-Chat": {
+ "tokenizer": "Orion-14B-Chat",
+ "organization": "OrionStar",
+ "vocab_size": 84608,
+ "num(digit)": 1559,
+ "len(digit)": "1,4,14",
+ "num(space)": 18383,
+ "len(space)": "1,6,16",
+ "num(ar)": 102,
+ "len(ar)": "1,1,1",
+ "num(zh)": 46998,
+ "len(zh)": "1,2,16",
+ "num(ja)": 49644,
+ "len(ja)": "1,2,16",
+ "num(ja-kana)": 2987,
+ "len(ja-kana)": "1,3,11",
+ "num(ko)": 5110,
+ "len(ko)": "1,2,7"
+ },
+ "Qwen/Qwen-7B-Chat": {
+ "tokenizer": "Qwen",
+ "organization": "Alibaba",
+ "vocab_size": 151851,
+ "num(digit)": 10,
+ "len(digit)": "1,1,1",
+ "num(space)": 55883,
+ "len(space)": "1,6,128",
+ "num(ar)": 4018,
+ "len(ar)": "1,3,12",
+ "num(zh)": 25557,
+ "len(zh)": "1,2,7",
+ "num(ja)": 27206,
+ "len(ja)": "1,2,11",
+ "num(ja-kana)": 2089,
+ "len(ja-kana)": "1,3,11",
+ "num(ko)": 3495,
+ "len(ko)": "1,1,5"
+ },
+ "Qwen/Qwen1.5-14B-Chat": {
+ "tokenizer": "Qwen1.5",
+ "organization": "Alibaba",
+ "vocab_size": 151646,
+ "num(digit)": 10,
+ "len(digit)": "1,1,1",
+ "num(space)": 55883,
+ "len(space)": "1,6,128",
+ "num(ar)": 4018,
+ "len(ar)": "1,3,12",
+ "num(zh)": 25557,
+ "len(zh)": "1,2,7",
+ "num(ja)": 27206,
+ "len(ja)": "1,2,11",
+ "num(ja-kana)": 2089,
+ "len(ja-kana)": "1,3,11",
+ "num(ko)": 3495,
+ "len(ko)": "1,1,5"
+ },
+ "Skywork/Skywork-13B-Math": {
+ "tokenizer": "Skywork-13B-Math",
+ "organization": "Kunlun",
+ "vocab_size": 65519,
+ "num(digit)": 20,
+ "len(digit)": "1,1,1",
+ "num(space)": 62,
+ "len(space)": "1,2,15",
+ "num(ar)": 56,
+ "len(ar)": "1,1,2",
+ "num(zh)": 33913,
+ "len(zh)": "1,2,5",
+ "num(ja)": 34064,
+ "len(ja)": "1,2,5",
+ "num(ja-kana)": 150,
+ "len(ja-kana)": "1,1,1",
+ "num(ko)": 111,
+ "len(ko)": "1,1,1"
+ },
+ "Skywork/Skywork-13B-base": {
+ "tokenizer": "Skywork-13B-base",
+ "organization": "Kunlun",
+ "vocab_size": 65519,
+ "num(digit)": 20,
+ "len(digit)": "1,1,1",
+ "num(space)": 62,
+ "len(space)": "1,2,15",
+ "num(ar)": 56,
+ "len(ar)": "1,1,2",
+ "num(zh)": 33913,
+ "len(zh)": "1,2,5",
+ "num(ja)": 34064,
+ "len(ja)": "1,2,5",
+ "num(ja-kana)": 150,
+ "len(ja-kana)": "1,1,1",
+ "num(ko)": 111,
+ "len(ko)": "1,1,1"
+ },
+ "THUDM/chatglm-6b": {
+ "tokenizer": "chatglm-6b",
+ "organization": "Tsinghua",
+ "vocab_size": 130344,
+ "num(digit)": 20,
+ "len(digit)": "1,1,1",
+ "num(space)": 93,
+ "len(space)": "1,34,80",
+ "num(ar)": 137,
+ "len(ar)": "1,2,4",
+ "num(zh)": 61358,
+ "len(zh)": "1,2,16",
+ "num(ja)": 61784,
+ "len(ja)": "1,2,16",
+ "num(ja-kana)": 439,
+ "len(ja-kana)": "1,2,5",
+ "num(ko)": 114,
+ "len(ko)": "1,1,3"
+ },
+ "THUDM/chatglm2-6b": {
+ "tokenizer": "chatglm2-6b",
+ "organization": "Tsinghua",
+ "vocab_size": 64787,
+ "num(digit)": 20,
+ "len(digit)": "1,1,1",
+ "num(space)": 67,
+ "len(space)": "1,2,15",
+ "num(ar)": 57,
+ "len(ar)": "1,1,2",
+ "num(zh)": 30922,
+ "len(zh)": "1,2,16",
+ "num(ja)": 31065,
+ "len(ja)": "1,2,16",
+ "num(ja-kana)": 143,
+ "len(ja-kana)": "1,1,1",
+ "num(ko)": 604,
+ "len(ko)": "1,1,1"
+ },
+ "THUDM/chatglm3-6b": {
+ "tokenizer": "chatglm3-6b",
+ "organization": "Tsinghua",
+ "vocab_size": 64796,
+ "num(digit)": 20,
+ "len(digit)": "1,1,1",
+ "num(space)": 67,
+ "len(space)": "1,2,15",
+ "num(ar)": 57,
+ "len(ar)": "1,1,2",
+ "num(zh)": 30922,
+ "len(zh)": "1,2,16",
+ "num(ja)": 31065,
+ "len(ja)": "1,2,16",
+ "num(ja-kana)": 143,
+ "len(ja-kana)": "1,1,1",
+ "num(ko)": 604,
+ "len(ko)": "1,1,1"
+ },
+ "TigerResearch/tigerbot-13b-chat-v2": {
+ "tokenizer": "tigerbot-13b-chat-v2",
+ "organization": "Tigerobo",
+ "vocab_size": 60515,
+ "num(digit)": 20,
+ "len(digit)": "1,1,1",
+ "num(space)": 61,
+ "len(space)": "1,2,15",
+ "num(ar)": 55,
+ "len(ar)": "1,1,2",
+ "num(zh)": 28603,
+ "len(zh)": "1,2,16",
+ "num(ja)": 28770,
+ "len(ja)": "1,2,16",
+ "num(ja-kana)": 167,
+ "len(ja-kana)": "1,1,2",
+ "num(ko)": 261,
+ "len(ko)": "1,1,1"
+ },
+ "TigerResearch/tigerbot-70b-chat-v4-4k": {
+ "tokenizer": "tigerbot-70b-chat-v4-4k",
+ "organization": "Tigerobo",
+ "vocab_size": 65110,
+ "num(digit)": 20,
+ "len(digit)": "1,1,1",
+ "num(space)": 61,
+ "len(space)": "1,2,15",
+ "num(ar)": 55,
+ "len(ar)": "1,1,2",
+ "num(zh)": 30509,
+ "len(zh)": "1,2,16",
+ "num(ja)": 32061,
+ "len(ja)": "1,2,16",
+ "num(ja-kana)": 2071,
+ "len(ja-kana)": "1,2,8",
+ "num(ko)": 1504,
+ "len(ko)": "1,1,5"
+ },
+ "Upstage/SOLAR-10.7B-v1.0": {
+ "tokenizer": "SOLAR-10.7B-v1.0",
+ "organization": "-",
+ "vocab_size": 32000,
+ "num(digit)": 20,
+ "len(digit)": "1,1,1",
+ "num(space)": 85,
+ "len(space)": "1,3,15",
+ "num(ar)": 71,
+ "len(ar)": "1,1,2",
+ "num(zh)": 1459,
+ "len(zh)": "1,1,2",
+ "num(ja)": 1593,
+ "len(ja)": "1,1,2",
+ "num(ja-kana)": 134,
+ "len(ja-kana)": "1,1,1",
+ "num(ko)": 346,
+ "len(ko)": "1,1,1"
+ },
+ "WizardLM/WizardCoder-15B-V1.0": {
+ "tokenizer": "WizardCoder-15B-V1.0",
+ "organization": "Microsoft",
+ "vocab_size": 49153,
+ "num(digit)": 10,
+ "len(digit)": "1,1,1",
+ "num(space)": 16515,
+ "len(space)": "1,6,256",
+ "num(ar)": 84,
+ "len(ar)": "1,2,4",
+ "num(zh)": 2030,
+ "len(zh)": "1,1,7",
+ "num(ja)": 2368,
+ "len(ja)": "1,1,8",
+ "num(ja-kana)": 360,
+ "len(ja-kana)": "1,2,8",
+ "num(ko)": 491,
+ "len(ko)": "1,2,5"
+ },
+ "WizardLM/WizardCoder-Python-7B-V1.0": {
+ "tokenizer": "WizardCoder-Python-7B-V1.0",
+ "organization": "Microsoft",
+ "vocab_size": 32001,
+ "num(digit)": 20,
+ "len(digit)": "1,1,1",
+ "num(space)": 61,
+ "len(space)": "1,2,15",
+ "num(ar)": 55,
+ "len(ar)": "1,1,2",
+ "num(zh)": 700,
+ "len(zh)": "1,1,1",
+ "num(ja)": 837,
+ "len(ja)": "1,1,1",
+ "num(ja-kana)": 137,
+ "len(ja-kana)": "1,1,1",
+ "num(ko)": 111,
+ "len(ko)": "1,1,1"
+ },
+ "WizardLM/WizardLM-7B-V1.0": {
+ "tokenizer": "WizardLM-7B-V1.0",
+ "organization": "Microsoft",
+ "vocab_size": 32001,
+ "num(digit)": 20,
+ "len(digit)": "1,1,1",
+ "num(space)": 61,
+ "len(space)": "1,2,15",
+ "num(ar)": 55,
+ "len(ar)": "1,1,2",
+ "num(zh)": 700,
+ "len(zh)": "1,1,1",
+ "num(ja)": 837,
+ "len(ja)": "1,1,1",
+ "num(ja-kana)": 137,
+ "len(ja-kana)": "1,1,1",
+ "num(ko)": 111,
+ "len(ko)": "1,1,1"
+ },
+ "WizardLM/WizardMath-70B-V1.0": {
+ "tokenizer": "WizardMath-70B-V1.0",
+ "organization": "Microsoft",
+ "vocab_size": 32002,
+ "num(digit)": 20,
+ "len(digit)": "1,1,1",
+ "num(space)": 61,
+ "len(space)": "1,2,15",
+ "num(ar)": 55,
+ "len(ar)": "1,1,2",
+ "num(zh)": 700,
+ "len(zh)": "1,1,1",
+ "num(ja)": 837,
+ "len(ja)": "1,1,1",
+ "num(ja-kana)": 137,
+ "len(ja-kana)": "1,1,1",
+ "num(ko)": 111,
+ "len(ko)": "1,1,1"
+ },
+ "abeja/gpt-neox-japanese-2.7b": {
+ "tokenizer": "gpt-neox-japanese-2.7b",
+ "organization": "ABEJA",
+ "vocab_size": 32000,
+ "num(digit)": 20,
+ "len(digit)": "1,1,1",
+ "num(space)": 0,
+ "len(space)": "-",
+ "num(ar)": 0,
+ "len(ar)": "-",
+ "num(zh)": 15176,
+ "len(zh)": "1,2,2",
+ "num(ja)": 31482,
+ "len(ja)": "1,2,3",
+ "num(ja-kana)": 16306,
+ "len(ja-kana)": "1,3,3",
+ "num(ko)": 0,
+ "len(ko)": "-"
+ },
+ "ai21labs/Jamba-v0.1": {
+ "tokenizer": "Jamba-v0.1",
+ "organization": "AI21",
+ "vocab_size": 65536,
+ "num(digit)": 1556,
+ "len(digit)": "1,16,17",
+ "num(space)": 39501,
+ "len(space)": "1,7,32",
+ "num(ar)": 867,
+ "len(ar)": "1,3,8",
+ "num(zh)": 1157,
+ "len(zh)": "1,1,2",
+ "num(ja)": 1287,
+ "len(ja)": "1,1,2",
+ "num(ja-kana)": 130,
+ "len(ja-kana)": "1,1,2",
+ "num(ko)": 312,
+ "len(ko)": "1,1,2"
+ },
+ "allenai/OLMo-7B": {
+ "tokenizer": "OLMo-7B",
+ "organization": "Allen AI",
+ "vocab_size": 50280,
+ "num(digit)": 2036,
+ "len(digit)": "1,3,35",
+ "num(space)": 29019,
+ "len(space)": "1,7,512",
+ "num(ar)": 94,
+ "len(ar)": "1,2,4",
+ "num(zh)": 313,
+ "len(zh)": "1,1,2",
+ "num(ja)": 480,
+ "len(ja)": "1,1,4",
+ "num(ja-kana)": 167,
+ "len(ja-kana)": "1,1,4",
+ "num(ko)": 25,
+ "len(ko)": "1,1,2"
+ },
+ "baichuan-inc/Baichuan2-7B-Chat": {
+ "tokenizer": "baichuan2",
+ "organization": "Baichuan",
+ "vocab_size": 125696,
+ "num(digit)": 1023,
+ "len(digit)": "1,14,14",
+ "num(space)": 26013,
+ "len(space)": "1,7,32",
+ "num(ar)": 335,
+ "len(ar)": "1,1,27",
+ "num(zh)": 70398,
+ "len(zh)": "1,2,32",
+ "num(ja)": 71269,
+ "len(ja)": "1,2,32",
+ "num(ja-kana)": 206,
+ "len(ja-kana)": "1,1,9",
+ "num(ko)": 1595,
+ "len(ko)": "1,1,2"
+ },
+ "ckiplab/gpt2-base-chinese": {
+ "tokenizer": "gpt2-base-chinese",
+ "organization": "SINICA",
+ "vocab_size": 21128,
+ "num(digit)": 1451,
+ "len(digit)": "1,3,12",
+ "num(space)": 2,
+ "len(space)": "1,2,3",
+ "num(ar)": 30,
+ "len(ar)": "1,2,3",
+ "num(zh)": 14642,
+ "len(zh)": "1,2,3",
+ "num(ja)": 15197,
+ "len(ja)": "1,3,15",
+ "num(ja-kana)": 553,
+ "len(ja-kana)": "1,3,15",
+ "num(ko)": 0,
+ "len(ko)": "-"
+ },
+ "cyberagent/open-calm-7b": {
+ "tokenizer": "open-calm-7b",
+ "organization": "CyberAgent",
+ "vocab_size": 52000,
+ "num(digit)": 690,
+ "len(digit)": "1,3,5",
+ "num(space)": 1698,
+ "len(space)": "1,4,33",
+ "num(ar)": 10,
+ "len(ar)": "1,1,4",
+ "num(zh)": 30775,
+ "len(zh)": "1,3,31",
+ "num(ja)": 45790,
+ "len(ja)": "1,3,31",
+ "num(ja-kana)": 32535,
+ "len(ja-kana)": "1,3,31",
+ "num(ko)": 0,
+ "len(ko)": "-"
+ },
+ "databricks/dbrx-instruct": {
+ "tokenizer": "dbrx-instruct",
+ "organization": "Databricks",
+ "vocab_size": 100280,
+ "num(digit)": 1126,
+ "len(digit)": "1,3,17",
+ "num(space)": 47400,
+ "len(space)": "1,7,128",
+ "num(ar)": 113,
+ "len(ar)": "1,2,10",
+ "num(zh)": 868,
+ "len(zh)": "1,1,7",
+ "num(ja)": 1035,
+ "len(ja)": "1,1,7",
+ "num(ja-kana)": 169,
+ "len(ja-kana)": "1,1,7",
+ "num(ko)": 299,
+ "len(ko)": "1,2,4"
+ },
+ "deepseek-ai/DeepSeek-V2": {
+ "tokenizer": "DeepSeek-V2",
+ "organization": "DeepSeek",
+ "vocab_size": 100002,
+ "num(digit)": 10,
+ "len(digit)": "1,1,1",
+ "num(space)": 48073,
+ "len(space)": "1,7,128",
+ "num(ar)": 48,
+ "len(ar)": "1,1,4",
+ "num(zh)": 18052,
+ "len(zh)": "1,2,16",
+ "num(ja)": 18090,
+ "len(ja)": "1,2,16",
+ "num(ja-kana)": 38,
+ "len(ja-kana)": "1,1,2",
+ "num(ko)": 16,
+ "len(ko)": "1,1,2"
+ },
+ "deepseek-ai/deepseek-coder-33b-instruct": {
+ "tokenizer": "deepseek-coder-33b-instruct",
+ "organization": "DeepSeek",
+ "vocab_size": 32022,
+ "num(digit)": 10,
+ "len(digit)": "1,1,1",
+ "num(space)": 15254,
+ "len(space)": "1,6,65",
+ "num(ar)": 12,
+ "len(ar)": "1,1,2",
+ "num(zh)": 4803,
+ "len(zh)": "1,2,4",
+ "num(ja)": 4804,
+ "len(ja)": "1,2,4",
+ "num(ja-kana)": 1,
+ "len(ja-kana)": "1,1,1",
+ "num(ko)": 0,
+ "len(ko)": "-"
+ },
+ "deepseek-ai/deepseek-llm-7b-base": {
+ "tokenizer": "deepseek-llm-7b-base",
+ "organization": "DeepSeek",
+ "vocab_size": 100015,
+ "num(digit)": 10,
+ "len(digit)": "1,1,1",
+ "num(space)": 48073,
+ "len(space)": "1,7,128",
+ "num(ar)": 48,
+ "len(ar)": "1,1,4",
+ "num(zh)": 18052,
+ "len(zh)": "1,2,16",
+ "num(ja)": 18090,
+ "len(ja)": "1,2,16",
+ "num(ja-kana)": 38,
+ "len(ja-kana)": "1,1,2",
+ "num(ko)": 16,
+ "len(ko)": "1,1,2"
+ },
+ "eson/kplug-base-encoder": {
+ "tokenizer": "kplug",
+ "organization": "JD",
+ "vocab_size": 10261,
+ "num(digit)": 420,
+ "len(digit)": "1,3,12",
+ "num(space)": 0,
+ "len(space)": "-",
+ "num(ar)": 0,
+ "len(ar)": "-",
+ "num(zh)": 5764,
+ "len(zh)": "1,1,1",
+ "num(ja)": 5766,
+ "len(ja)": "1,1,3",
+ "num(ja-kana)": 0,
+ "len(ja-kana)": "-",
+ "num(ko)": 0,
+ "len(ko)": "-"
+ },
+ "fnlp/moss-moon-003-sft": {
+ "tokenizer": "moss-moon-003-sft",
+ "organization": "Fudan",
+ "vocab_size": 106072,
+ "num(digit)": 1848,
+ "len(digit)": "1,3,16",
+ "num(space)": 33566,
+ "len(space)": "1,7,102",
+ "num(ar)": 25,
+ "len(ar)": "1,1,4",
+ "num(zh)": 54230,
+ "len(zh)": "1,2,15",
+ "num(ja)": 54381,
+ "len(ja)": "1,2,15",
+ "num(ja-kana)": 152,
+ "len(ja-kana)": "1,1,7",
+ "num(ko)": 0,
+ "len(ko)": "-"
+ },
+ "google/gemma-7b": {
+ "tokenizer": "gemma-7b",
+ "organization": "Google",
+ "vocab_size": 256000,
+ "num(digit)": 134,
+ "len(digit)": "1,10,12",
+ "num(space)": 125662,
+ "len(space)": "1,7,31",
+ "num(ar)": 6274,
+ "len(ar)": "1,4,15",
+ "num(zh)": 23767,
+ "len(zh)": "1,2,12",
+ "num(ja)": 28852,
+ "len(ja)": "1,2,12",
+ "num(ja-kana)": 7061,
+ "len(ja-kana)": "1,3,12",
+ "num(ko)": 2295,
+ "len(ko)": "1,1,5"
+ },
+ "google/switch-c-2048": {
+ "tokenizer": "switch-c-2048",
+ "organization": "Google",
+ "vocab_size": 32100,
+ "num(digit)": 1133,
+ "len(digit)": "1,3,13",
+ "num(space)": 0,
+ "len(space)": "-",
+ "num(ar)": 0,
+ "len(ar)": "-",
+ "num(zh)": 0,
+ "len(zh)": "-",
+ "num(ja)": 0,
+ "len(ja)": "-",
+ "num(ja-kana)": 0,
+ "len(ja-kana)": "-",
+ "num(ko)": 0,
+ "len(ko)": "-"
+ },
+ "hfl/chinese-alpaca-lora-7b": {
+ "tokenizer": "chinese-alpaca-lora-7b",
+ "organization": "-",
+ "vocab_size": 49954,
+ "num(digit)": 614,
+ "len(digit)": "1,3,5",
+ "num(space)": 61,
+ "len(space)": "1,2,15",
+ "num(ar)": 55,
+ "len(ar)": "1,1,2",
+ "num(zh)": 17839,
+ "len(zh)": "1,2,13",
+ "num(ja)": 17993,
+ "len(ja)": "1,2,13",
+ "num(ja-kana)": 154,
+ "len(ja-kana)": "1,1,1",
+ "num(ko)": 135,
+ "len(ko)": "1,1,1"
+ },
+ "hfl/chinese-llama-2-7b": {
+ "tokenizer": "chinese-llama-2-7b",
+ "organization": "-",
+ "vocab_size": 55296,
+ "num(digit)": 20,
+ "len(digit)": "1,1,1",
+ "num(space)": 61,
+ "len(space)": "1,2,15",
+ "num(ar)": 55,
+ "len(ar)": "1,1,2",
+ "num(zh)": 23974,
+ "len(zh)": "1,2,16",
+ "num(ja)": 24111,
+ "len(ja)": "1,2,16",
+ "num(ja-kana)": 137,
+ "len(ja-kana)": "1,1,1",
+ "num(ko)": 111,
+ "len(ko)": "1,1,1"
+ },
+ "hfl/chinese-llama-lora-7b": {
+ "tokenizer": "chinese-llama-lora-7b",
+ "organization": "-",
+ "vocab_size": 49953,
+ "num(digit)": 614,
+ "len(digit)": "1,3,5",
+ "num(space)": 61,
+ "len(space)": "1,2,15",
+ "num(ar)": 55,
+ "len(ar)": "1,1,2",
+ "num(zh)": 17839,
+ "len(zh)": "1,2,13",
+ "num(ja)": 17993,
+ "len(ja)": "1,2,13",
+ "num(ja-kana)": 154,
+ "len(ja-kana)": "1,1,1",
+ "num(ko)": 135,
+ "len(ko)": "1,1,1"
+ },
+ "hfl/llama-3-chinese-8b": {
+ "tokenizer": "llama-3-chinese-8b",
+ "organization": "-",
+ "vocab_size": 128256,
+ "num(digit)": 1110,
+ "len(digit)": "1,3,3",
+ "num(space)": 60860,
+ "len(space)": "1,6,128",
+ "num(ar)": 3810,
+ "len(ar)": "1,4,11",
+ "num(zh)": 4424,
+ "len(zh)": "1,1,7",
+ "num(ja)": 5387,
+ "len(ja)": "1,2,8",
+ "num(ja-kana)": 1086,
+ "len(ja-kana)": "1,2,8",
+ "num(ko)": 2281,
+ "len(ko)": "1,2,6"
+ },
+ "hpcai-tech/grok-1": {
+ "tokenizer": "grok-1",
+ "organization": "xAI",
+ "vocab_size": 131072,
+ "num(digit)": 40,
+ "len(digit)": "1,6,13",
+ "num(space)": 399,
+ "len(space)": "1,3,16",
+ "num(ar)": 69,
+ "len(ar)": "1,2,4",
+ "num(zh)": 1626,
+ "len(zh)": "1,2,7",
+ "num(ja)": 3118,
+ "len(ja)": "1,2,8",
+ "num(ja-kana)": 1908,
+ "len(ja-kana)": "1,2,8",
+ "num(ko)": 67,
+ "len(ko)": "1,1,2"
+ },
+ "internlm/internlm-chat-7b": {
+ "tokenizer": "internlm-chat-7b",
+ "organization": "Shanghai AI Lab",
+ "vocab_size": 103168,
+ "num(digit)": 1259,
+ "len(digit)": "1,3,19",
+ "num(space)": 33008,
+ "len(space)": "1,6,128",
+ "num(ar)": 6702,
+ "len(ar)": "1,4,16",
+ "num(zh)": 32000,
+ "len(zh)": "1,2,15",
+ "num(ja)": 32866,
+ "len(ja)": "1,2,15",
+ "num(ja-kana)": 864,
+ "len(ja-kana)": "1,2,9",
+ "num(ko)": 298,
+ "len(ko)": "1,1,1"
+ },
+ "internlm/internlm-xcomposer-7b": {
+ "tokenizer": "internlm-xcomposer-7b",
+ "organization": "Shanghai AI Lab",
+ "vocab_size": 103168,
+ "num(digit)": 1261,
+ "len(digit)": "1,3,19",
+ "num(space)": 33008,
+ "len(space)": "1,6,128",
+ "num(ar)": 6702,
+ "len(ar)": "1,4,16",
+ "num(zh)": 32000,
+ "len(zh)": "1,2,15",
+ "num(ja)": 32866,
+ "len(ja)": "1,2,15",
+ "num(ja-kana)": 864,
+ "len(ja-kana)": "1,2,9",
+ "num(ko)": 298,
+ "len(ko)": "1,1,1"
+ },
+ "internlm/internlm2-chat-7b": {
+ "tokenizer": "internlm2-chat-7b",
+ "organization": "Shanghai AI Lab",
+ "vocab_size": 92544,
+ "num(digit)": 1261,
+ "len(digit)": "1,3,18",
+ "num(space)": 28681,
+ "len(space)": "1,7,128",
+ "num(ar)": 30,
+ "len(ar)": "1,1,1",
+ "num(zh)": 31148,
+ "len(zh)": "1,2,15",
+ "num(ja)": 31296,
+ "len(ja)": "1,2,15",
+ "num(ja-kana)": 148,
+ "len(ja-kana)": "1,1,1",
+ "num(ko)": 83,
+ "len(ko)": "1,1,1"
+ },
+ "internlm/internlm2-math-7b": {
+ "tokenizer": "internlm2-math-7b",
+ "organization": "Shanghai AI Lab",
+ "vocab_size": 92544,
+ "num(digit)": 1261,
+ "len(digit)": "1,3,18",
+ "num(space)": 28681,
+ "len(space)": "1,7,128",
+ "num(ar)": 30,
+ "len(ar)": "1,1,1",
+ "num(zh)": 31148,
+ "len(zh)": "1,2,15",
+ "num(ja)": 31296,
+ "len(ja)": "1,2,15",
+ "num(ja-kana)": 148,
+ "len(ja-kana)": "1,1,1",
+ "num(ko)": 83,
+ "len(ko)": "1,1,1"
+ },
+ "microsoft/Phi-3-mini-4k-instruct": {
+ "tokenizer": "Phi-3-mini-4k-instruct",
+ "organization": "Microsoft",
+ "vocab_size": 32011,
+ "num(digit)": 20,
+ "len(digit)": "1,1,1",
+ "num(space)": 61,
+ "len(space)": "1,2,15",
+ "num(ar)": 55,
+ "len(ar)": "1,1,2",
+ "num(zh)": 700,
+ "len(zh)": "1,1,1",
+ "num(ja)": 837,
+ "len(ja)": "1,1,1",
+ "num(ja-kana)": 137,
+ "len(ja-kana)": "1,1,1",
+ "num(ko)": 111,
+ "len(ko)": "1,1,1"
+ },
+ "microsoft/phi-1": {
+ "tokenizer": "phi-1",
+ "organization": "Microsoft",
+ "vocab_size": 50295,
+ "num(digit)": 1691,
+ "len(digit)": "1,3,16",
+ "num(space)": 33129,
+ "len(space)": "1,7,66",
+ "num(ar)": 22,
+ "len(ar)": "1,1,3",
+ "num(zh)": 51,
+ "len(zh)": "1,1,4",
+ "num(ja)": 183,
+ "len(ja)": "1,1,7",
+ "num(ja-kana)": 133,
+ "len(ja-kana)": "1,1,7",
+ "num(ko)": 0,
+ "len(ko)": "-"
+ },
+ "microsoft/phi-2": {
+ "tokenizer": "phi-2",
+ "organization": "Microsoft",
+ "vocab_size": 50295,
+ "num(digit)": 1691,
+ "len(digit)": "1,3,16",
+ "num(space)": 33129,
+ "len(space)": "1,7,66",
+ "num(ar)": 22,
+ "len(ar)": "1,1,3",
+ "num(zh)": 51,
+ "len(zh)": "1,1,4",
+ "num(ja)": 183,
+ "len(ja)": "1,1,7",
+ "num(ja-kana)": 133,
+ "len(ja-kana)": "1,1,7",
+ "num(ko)": 0,
+ "len(ko)": "-"
+ },
+ "mistralai/Mistral-7B-v0.1": {
+ "tokenizer": "Mistral-7B-v0.1",
+ "organization": "Mistral",
+ "vocab_size": 32000,
+ "num(digit)": 20,
+ "len(digit)": "1,1,1",
+ "num(space)": 85,
+ "len(space)": "1,3,15",
+ "num(ar)": 71,
+ "len(ar)": "1,1,2",
+ "num(zh)": 1459,
+ "len(zh)": "1,1,2",
+ "num(ja)": 1593,
+ "len(ja)": "1,1,2",
+ "num(ja-kana)": 134,
+ "len(ja-kana)": "1,1,1",
+ "num(ko)": 346,
+ "len(ko)": "1,1,1"
+ },
+ "mistralai/Mixtral-8x7B-v0.1": {
+ "tokenizer": "Mixtral-8x7B-v0.1",
+ "organization": "Mistral",
+ "vocab_size": 32000,
+ "num(digit)": 20,
+ "len(digit)": "1,1,1",
+ "num(space)": 85,
+ "len(space)": "1,3,15",
+ "num(ar)": 71,
+ "len(ar)": "1,1,2",
+ "num(zh)": 1459,
+ "len(zh)": "1,1,2",
+ "num(ja)": 1593,
+ "len(ja)": "1,1,2",
+ "num(ja-kana)": 134,
+ "len(ja-kana)": "1,1,1",
+ "num(ko)": 346,
+ "len(ko)": "1,1,1"
+ },
+ "openai-community/gpt2": {
+ "tokenizer": "gpt2",
+ "organization": "OpenAI",
+ "vocab_size": 50257,
+ "num(digit)": 1691,
+ "len(digit)": "1,3,16",
+ "num(space)": 33129,
+ "len(space)": "1,7,66",
+ "num(ar)": 22,
+ "len(ar)": "1,1,3",
+ "num(zh)": 51,
+ "len(zh)": "1,1,4",
+ "num(ja)": 183,
+ "len(ja)": "1,1,7",
+ "num(ja-kana)": 133,
+ "len(ja-kana)": "1,1,7",
+ "num(ko)": 0,
+ "len(ko)": "-"
+ },
+ "openai/code-davinci-002": {
+ "tokenizer": "code-davinci-002",
+ "organization": "OpenAI",
+ "vocab_size": 50281,
+ "num(digit)": 1691,
+ "len(digit)": "1,3,16",
+ "num(space)": 33175,
+ "len(space)": "1,7,66",
+ "num(ar)": 22,
+ "len(ar)": "1,1,3",
+ "num(zh)": 51,
+ "len(zh)": "1,1,4",
+ "num(ja)": 183,
+ "len(ja)": "1,1,7",
+ "num(ja-kana)": 133,
+ "len(ja-kana)": "1,1,7",
+ "num(ko)": 0,
+ "len(ko)": "-"
+ },
+ "openai/gpt-3.5-turbo": {
+ "tokenizer": "gpt-3.5-turbo",
+ "organization": "OpenAI",
+ "vocab_size": 100277,
+ "num(digit)": 1110,
+ "len(digit)": "1,3,3",
+ "num(space)": 47472,
+ "len(space)": "1,7,128",
+ "num(ar)": 113,
+ "len(ar)": "1,2,10",
+ "num(zh)": 868,
+ "len(zh)": "1,1,7",
+ "num(ja)": 1035,
+ "len(ja)": "1,1,7",
+ "num(ja-kana)": 169,
+ "len(ja-kana)": "1,1,7",
+ "num(ko)": 299,
+ "len(ko)": "1,2,4"
+ },
+ "openai/gpt-4o": {
+ "tokenizer": "gpt-4o",
+ "organization": "OpenAI",
+ "vocab_size": 200019,
+ "num(digit)": 1110,
+ "len(digit)": "1,3,3",
+ "num(space)": 109316,
+ "len(space)": "1,6,128",
+ "num(ar)": 8055,
+ "len(ar)": "1,4,12",
+ "num(zh)": 7563,
+ "len(zh)": "1,2,11",
+ "num(ja)": 8292,
+ "len(ja)": "1,2,11",
+ "num(ja-kana)": 809,
+ "len(ja-kana)": "1,2,11",
+ "num(ko)": 2365,
+ "len(ko)": "1,2,8"
+ },
+ "openai/text-davinci-003": {
+ "tokenizer": "text-davinci-003",
+ "organization": "OpenAI",
+ "vocab_size": 50281,
+ "num(digit)": 1691,
+ "len(digit)": "1,3,16",
+ "num(space)": 33175,
+ "len(space)": "1,7,66",
+ "num(ar)": 22,
+ "len(ar)": "1,1,3",
+ "num(zh)": 51,
+ "len(zh)": "1,1,4",
+ "num(ja)": 183,
+ "len(ja)": "1,1,7",
+ "num(ja-kana)": 133,
+ "len(ja-kana)": "1,1,7",
+ "num(ko)": 0,
+ "len(ko)": "-"
+ },
+ "thu-coai/CharacterGLM-6B": {
+ "tokenizer": "CharacterGLM-6B",
+ "organization": "Tsinghua",
+ "vocab_size": 64789,
+ "num(digit)": 20,
+ "len(digit)": "1,1,1",
+ "num(space)": 67,
+ "len(space)": "1,2,15",
+ "num(ar)": 57,
+ "len(ar)": "1,1,2",
+ "num(zh)": 30922,
+ "len(zh)": "1,2,16",
+ "num(ja)": 31065,
+ "len(ja)": "1,2,16",
+ "num(ja-kana)": 143,
+ "len(ja-kana)": "1,1,1",
+ "num(ko)": 604,
+ "len(ko)": "1,1,1"
+ },
+ "tiiuae/falcon-180b": {
+ "tokenizer": "falcon-180b",
+ "organization": "TII",
+ "vocab_size": 65024,
+ "num(digit)": 1108,
+ "len(digit)": "1,3,3",
+ "num(space)": 40202,
+ "len(space)": "1,7,65",
+ "num(ar)": 21,
+ "len(ar)": "1,1,4",
+ "num(zh)": 1627,
+ "len(zh)": "1,1,3",
+ "num(ja)": 1652,
+ "len(ja)": "1,1,3",
+ "num(ja-kana)": 25,
+ "len(ja-kana)": "1,1,1",
+ "num(ko)": 1,
+ "len(ko)": "1,1,1"
+ },
+ "tiiuae/falcon-7b": {
+ "tokenizer": "falcon-7b",
+ "organization": "TII",
+ "vocab_size": 65024,
+ "num(digit)": 1108,
+ "len(digit)": "1,3,3",
+ "num(space)": 40202,
+ "len(space)": "1,7,65",
+ "num(ar)": 21,
+ "len(ar)": "1,1,4",
+ "num(zh)": 1627,
+ "len(zh)": "1,1,3",
+ "num(ja)": 1652,
+ "len(ja)": "1,1,3",
+ "num(ja-kana)": 25,
+ "len(ja-kana)": "1,1,1",
+ "num(ko)": 1,
+ "len(ko)": "1,1,1"
+ },
+ "Qwen/Qwen1.5-1.8B": {
+ "tokenizer": "Qwen1.5-1.8B",
+ "organization": "Alibaba",
+ "vocab_size": 151646,
+ "num(digit)": 10,
+ "len(digit)": "1,1,1",
+ "num(space)": 55883,
+ "len(space)": "1,6,128",
+ "num(ar)": 4018,
+ "len(ar)": "1,3,12",
+ "num(zh)": 25557,
+ "len(zh)": "1,2,7",
+ "num(ja)": 27206,
+ "len(ja)": "1,2,11",
+ "num(ja-kana)": 2089,
+ "len(ja-kana)": "1,3,11",
+ "num(ko)": 3495,
+ "len(ko)": "1,1,5"
+ },
+ "Qwen/Qwen1.5-110B": {
+ "tokenizer": "Qwen1.5-110B",
+ "organization": "Alibaba",
+ "vocab_size": 151646,
+ "num(digit)": 10,
+ "len(digit)": "1,1,1",
+ "num(space)": 55883,
+ "len(space)": "1,6,128",
+ "num(ar)": 4018,
+ "len(ar)": "1,3,12",
+ "num(zh)": 25557,
+ "len(zh)": "1,2,7",
+ "num(ja)": 27206,
+ "len(ja)": "1,2,11",
+ "num(ja-kana)": 2089,
+ "len(ja-kana)": "1,3,11",
+ "num(ko)": 3495,
+ "len(ko)": "1,1,5"
+ },
+ "Qwen/Qwen1.5-14B": {
+ "tokenizer": "Qwen1.5-14B",
+ "organization": "Alibaba",
+ "vocab_size": 151646,
+ "num(digit)": 10,
+ "len(digit)": "1,1,1",
+ "num(space)": 55883,
+ "len(space)": "1,6,128",
+ "num(ar)": 4018,
+ "len(ar)": "1,3,12",
+ "num(zh)": 25557,
+ "len(zh)": "1,2,7",
+ "num(ja)": 27206,
+ "len(ja)": "1,2,11",
+ "num(ja-kana)": 2089,
+ "len(ja-kana)": "1,3,11",
+ "num(ko)": 3495,
+ "len(ko)": "1,1,5"
+ },
+ "asafaya/bert-base-arabic": {
+ "tokenizer": "bert-base-arabic",
+ "organization": "-",
+ "vocab_size": 32000,
+ "num(digit)": 507,
+ "len(digit)": "1,3,21",
+ "num(space)": 0,
+ "len(space)": "-",
+ "num(ar)": 28367,
+ "len(ar)": "1,5,34",
+ "num(zh)": 180,
+ "len(zh)": "1,1,1",
+ "num(ja)": 333,
+ "len(ja)": "1,1,3",
+ "num(ja-kana)": 153,
+ "len(ja-kana)": "1,1,3",
+ "num(ko)": 0,
+ "len(ko)": "-"
+ }
}
\ No newline at end of file