xu-song committed on
Commit
2bd606a
1 Parent(s): f331792

remove vocabs; update compression_app; add character_app;

This view is limited to 50 files because the commit contains too many changes.

Files changed (50)
  1. .gitattributes +0 -10
  2. .gitignore +5 -1
  3. vocab/README.md → README.2.md +3 -1
  4. README.md +16 -1
  5. app.py +7 -5
  6. character_app.py +80 -0
  7. character_util.py +213 -0
  8. app_compression.py → compression_app.py +42 -40
  9. utils/compression_util.py → compression_util.py +151 -61
  10. config.py +0 -20
  11. patcher/README.md +15 -0
  12. patcher/sptokenizer_patch_deprecated.py +0 -105
  13. patcher/sptokenizer_wrapper.py +0 -61
  14. patcher/tiktoken_patch.py +2 -2
  15. app_playground.py → playground_app.py +34 -19
  16. examples.py → playground_examples.py +9 -9
  17. util.py → playground_util.py +39 -35
  18. requirements.txt +3 -1
  19. stats/character_stats.json +1712 -0
  20. stats/compress_rate.json +0 -4286
  21. stats/compression_rate.json +0 -0
  22. utils/byte_util.py +0 -0
  23. utils/character_util.py +0 -231
  24. utils/convert_sp_to_json.py +0 -4
  25. utils/fn_util.py +0 -0
  26. utils/lang_util.py +26 -30
  27. utils/lang_util_2.py +0 -115
  28. utils/oov.md +202 -0
  29. utils/oov_util.py +109 -3
  30. utils/speed_util.py +0 -9
  31. utils/symbol.py +0 -35
  32. utils/text_util.py +12 -1
  33. utils/vocab.jd.txt.v2 +0 -10268
  34. vocab.py +453 -0
  35. vocab/Intern_gpt/README.md +0 -0
  36. vocab/__init__.py +0 -260
  37. vocab/_alpaca_7b/README.md +0 -0
  38. vocab/_goat/README.md +0 -0
  39. vocab/_goat/__init__.py +0 -0
  40. vocab/albert/__init__.py +0 -6
  41. vocab/aya_101/__init__.py +0 -5
  42. vocab/baichuan/Baichuan-7B/config.json +0 -26
  43. vocab/baichuan/Baichuan-7B/configuration_baichuan.py +0 -66
  44. vocab/baichuan/Baichuan-7B/special_tokens_map.json +0 -23
  45. vocab/baichuan/Baichuan-7B/tokenization_baichuan.py +0 -250
  46. vocab/baichuan/Baichuan-7B/tokenizer.model +0 -3
  47. vocab/baichuan/Baichuan-7B/tokenizer_config.json +0 -35
  48. vocab/baichuan/__init__.py +0 -19
  49. vocab/baichuan/demo.py +0 -6
  50. vocab/baichuan/error.md +0 -8
.gitattributes CHANGED
@@ -33,13 +33,3 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
-vocab/belle_7b_2m/belle-7b-2m/tokenizer.json filter=lfs diff=lfs merge=lfs -text
-vocab/bloom/tokenizer/tokenizer.json filter=lfs diff=lfs merge=lfs -text
-vocab/gemma_7b/gemma-7b/tokenizer.model filter=lfs diff=lfs merge=lfs -text
-vocab/gemma_7b/gemma-7b/tokenizer.json filter=lfs diff=lfs merge=lfs -text
-vocab/grok_1/tokenizer.model filter=lfs diff=lfs merge=lfs -text
-vocab/llama3/Meta-Llama-3-70B/tokenizer.json filter=lfs diff=lfs merge=lfs -text
-vocab/mistral_7b/Mistral-7B-v0.1/tokenizer.json filter=lfs diff=lfs merge=lfs -text
-vocab/mistral_7b/Mistral-7B-v0.1/tokenizer.model filter=lfs diff=lfs merge=lfs -text
-vocab/mixtral_8_7b/Mixtral-8x7B-v0.1/tokenizer.json filter=lfs diff=lfs merge=lfs -text
-vocab/mixtral_8_7b/Mixtral-8x7B-v0.1/tokenizer.model filter=lfs diff=lfs merge=lfs -text
.gitignore CHANGED
@@ -14,4 +14,8 @@ downloads/
 eggs/
 .eggs/
 .idea/
-gradio_cached_examples
+gradio_cached_examples
+stats/
+test/
+wip/
+tools/
vocab/README.md → README.2.md RENAMED
@@ -67,7 +67,7 @@ carol
 ```
 
 
-##
+## @@
 
 https://github.com/pytorch/fairseq/blob/master/tests/test_noising.py#L37
 
@@ -77,6 +77,8 @@ https://github.com/pytorch/fairseq/blob/master/tests/test_noising.py#L37
 
 Similar to BERT, except that BERT marks word suffixes, whereas here the marker is on word prefixes.
 
+This appears to be https://github.com/rsennrich/subword-nmt
+
 
 ## GPT2
 
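A hypothetical illustration of the two continuation markers (example segmentation, not taken from the repository):

```
subword-nmt / fairseq style ("@@" suffixed to non-final pieces):
    unbelievable -> un@@ believ@@ able
BERT WordPiece style ("##" prefixed to non-initial pieces):
    unbelievable -> un ##beli ##evable
```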
README.md CHANGED
@@ -7,6 +7,8 @@ sdk: gradio
 sdk_version: 4.28.3
 app_file: app.py
 pinned: false
+datasets:
+- cc100
 ---
 
 
@@ -210,4 +212,17 @@ python utils/compress_rate_util.py
 
 - Getting the most out of your tokenizer for pre-training and domain adaptation
 - Efficient and Effective Text Encoding for Chinese LLaMA and Alpaca
-- https://huggingface.co/spaces/Xenova/the-tokenizer-playground
+- blog
+  - https://help.openai.com/en/articles/4936856-what-are-tokens-and-how-to-count-them
+  - https://huggingface.co/docs/transformers/tokenizer_summary#sentencepiece
+  - https://www.huaxiaozhuan.com/%E5%B7%A5%E5%85%B7/huggingface_transformer/chapters/1_tokenizer.html
+  - https://zhuanlan.zhihu.com/p/652520262
+  - https://github.com/QwenLM/Qwen/blob/main/tokenization_note_zh.md
+- demo
+  - https://huggingface.co/spaces/Xenova/the-tokenizer-playground
+  - https://github.com/dqbd/tiktokenizer
+  - https://chat.lmsys.org/?leaderboard
+  - https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard
+- paper
+  - ss
+  -
app.py CHANGED
@@ -1,16 +1,18 @@
 
-import gradio as gr
-from app_playground import demo as tab_playground
-from app_compression import demo as tab_compression
+from playground_app import demo as playground_tab
+from compression_app import demo as compression_tab
+from character_app import demo as character_tab
 from patcher.gr_interface import TabbedInterface
 
 
 demo = TabbedInterface(
-    [tab_playground, tab_compression],
-    [" ⚔️ Playground", "🏆 Compression Leaderboard",],  # encoding speed, decoding speed, character classes (zh, num, etc.; regex supported), supported languages, organization, ...
+    [playground_tab, compression_tab, character_tab],
+    [" ⚔️ Playground", "🏆 Compression Leaderboard", "📊 Character Statistics"],  # encoding speed, decoding speed, character classes (zh, num, etc.; regex supported), supported languages, organization, ...
     title='<div align="center">Tokenizer Arena ⚔️</div>',
     css="css/style.css"
 )
 
+demo.load(js=open("js/onload.js", "r", encoding="utf-8").read())
+
 if __name__ == "__main__":
     demo.launch()
character_app.py ADDED
@@ -0,0 +1,80 @@
+import gradio as gr
+from character_util import get_character_table
+
+all_columns = [
+    ("digit", "digit"),
+    ("space", "space"),
+    ("lang-chinese", 'zh'),
+    ("lang-korea", 'ko'),
+    ("lang-japanese", 'ja'),
+    # ("byte", "byte"),
+    # ("oov", "oov")
+]
+default_columns = ["digit", "zh"]
+
+# columns = ["lang-zh", "lang-korea", "lang-japanese", "number", "space", "bytes", "oov"]
+
+abbr2name = {column[1]: column[0].split('-')[-1] for column in all_columns}
+
+
+def get_column_info(columns):
+    print(columns)
+    markdown = ""
+    for column in columns:
+        markdown += f"- `num({column})`: number of tokens containing {abbr2name[column]} characters\n" \
+                    f"- `len({column})`: `min,median,max` length of tokens containing {abbr2name[column]} characters\n"
+    return markdown
+
+
+with gr.Blocks() as demo:
+    gr.Markdown("## 🛠️ Setting")  # ⚙
+    with gr.Accordion("Please select the type of character you want to count.", open=True):
+        # file size 💽 🖴, tokens 🧮
+        with gr.Row():
+            with gr.Column():
+                columns = gr.Checkboxgroup(
+                    all_columns,
+                    value=default_columns,
+                    label="character type",
+                    # info=""
+                )
+                gr.Markdown(
+                    "To count other types of characters, you can modify [character_util.py]"
+                    "(https://huggingface.co/spaces/eson/tokenizer-arena/blob/main/character_util.py). "
+                )
+            column_info = gr.Markdown(
+                get_column_info(default_columns)
+            )
+
+    gr.Markdown("## 📊 Character Statistics")
+    search_bar = gr.Textbox(
+        placeholder="🔍 Search by tokenizer or organization (e.g., 'llama', 'openai') and press ENTER...",
+        show_label=False,
+        elem_id="search-bar",
+    )
+    compress_rate_table = gr.Dataframe(datatype="html", wrap=True)
+
+    search_bar.submit(
+        get_character_table,
+        inputs=[search_bar, columns],
+        outputs=compress_rate_table
+    )
+    columns.change(
+        get_character_table,
+        inputs=[search_bar, columns],
+        outputs=compress_rate_table
+    )
+    columns.change(
+        get_column_info,
+        inputs=[columns],
+        outputs=column_info
+    )
+
+    demo.load(
+        get_character_table,
+        inputs=[search_bar, columns],
+        outputs=compress_rate_table
+    )
+
+if __name__ == "__main__":
+    demo.launch()
character_util.py ADDED
@@ -0,0 +1,213 @@
+"""
+TODO:
+1. traditional vs. simplified Chinese, language detection
+2. verify the number of space tokens in bert
+3. add token_impl
+4.
+"""
+import os
+import json
+import numpy as np
+import pandas as pd
+from collections import Counter, defaultdict
+from vocab import tokenizer_factory
+from typing import Optional, Union, Literal
+from utils.log_util import logger
+from utils.text_util import contains_digit, get_space_count
+from utils.lang_util import detect_language, language_ranges
+
+CURRENT_DIR = os.path.dirname(os.path.abspath(__file__))
+
+
+def _to_unicode(text):
+    return ''.join(r'\u{:04X}'.format(ord(char)) for char in text)
+
+
+def _get_coding_length(tokenizer, vocab, filter=None):
+    """
+    An oov character may be tokenized into more than one token.
+    """
+    all_length = []
+    for word in vocab:
+        if len(word) > 1:
+            continue
+        if filter is not None and filter(word):
+            continue
+        try:
+            tokens = tokenizer.encode(word)
+        except Exception as e:
+            print(e)
+            continue  # skip words that fail to encode; `tokens` would be undefined below
+
+        all_length.append(len(tokens))
+        # if len(tokens.ids) > 1:
+        # if len(tokens) > 3:
+        #     print(word, tokens)
+
+    dist_length = Counter(all_length)
+    mean_length = round(sum(all_length) / len(all_length), 2)
+    return dist_length, mean_length
+
+
+cache = {}
+
+
+def _dist(token_lens):
+    """
+    :param token_lens:
+    :return: min,median,max of token_lens
+    """
+    if not token_lens:
+        return "-"
+    return f"{min(token_lens)},{round(np.median(token_lens))},{max(token_lens)}"
+
+
+def iter_vocab(
+        tokenizer_name: str,
+        from_cache: bool = True,
+        cache_dir: str = "stats",
+) -> Union[pd.DataFrame, dict]:
+    """
+    :param tokenizer_name:
+    :param from_cache:
+    :param cache_dir:
+    :return:
+    """
+    tokenizer_config = tokenizer_factory.get_tokenizer_config(tokenizer_name)
+
+    cache_dir = os.path.join(CURRENT_DIR, cache_dir)
+    os.makedirs(cache_dir, exist_ok=True)
+
+    # load from cache
+    cache_path = os.path.join(cache_dir, "character_stats.json")
+    if not cache and os.path.exists(cache_path):
+        with open(cache_path, "r", encoding="utf-8") as f_tmp:
+            cache.update(json.load(f_tmp))
+    if from_cache and tokenizer_name in cache:
+        logger.info(f"load {tokenizer_config.name_or_path} from cache")
+        return cache[tokenizer_name]
+
+    tokenizer = tokenizer_factory.get_tokenizer(tokenizer_name)
+
+    tokens_by_lang = {lang[1]: [] for lang in language_ranges.keys()}
+    digit_tokens = []
+    space_tokens = []
+    byte_tokens = []
+
+    buffer = []
+    for token_id in range(tokenizer.vocab_size):
+        # for token_id in tokenizer.get_vocab():
+        # for token_id in range(len(tokenizer)):
+        decode_str = tokenizer.decode([token_id], skip_special_tokens=False)
+        token = tokenizer.convert_ids_to_tokens([token_id], skip_special_tokens=False)[0]
+        tags = []
+        if token is None:  # some vocabularies have empty ids (non-contiguous)
+            continue
+        if isinstance(token, bytes):
+            token = token.decode("utf-8", errors="ignore")
+
+        if hasattr(tokenizer, "sp_model"):  # backed by the sentencepiece package
+            if tokenizer.sp_model.is_byte(token_id):
+                tags.append("is_byte")
+                byte_tokens.append(token)
+
+        language_tags = detect_language(decode_str)
+        for language in language_tags:
+            tokens_by_lang[language[1]].append(decode_str)
+
+        if contains_digit(decode_str):
+            tags.append("digit")
+            digit_tokens.append(decode_str)
+
+        space_count = get_space_count(decode_str)
+        if space_count > 0:
+            space_tokens.append(decode_str)
+
+        buffer.append(json.dumps(
+            {
+                "id": token_id,
+                "token": token,
+                "token_decode": decode_str,
+                "token_dumps": json.dumps(token),
+                "token_unicode": _to_unicode(token),
+                "token_len": len(decode_str),
+            },
+            ensure_ascii=False) + "\n")
+
+    result = {
+        "tokenizer": tokenizer_factory.get_name_with_hyperlink(tokenizer_name),
+        "organization": tokenizer_config.org,
+        # "impl": str(tokenizer.__class__),
+        # "vocab_size-": tokenizer.vocab_size,  # vocab_size_without_added_token
+        "vocab_size": len(tokenizer),
+
+        # "mean encoding length of Chinese characters": mean_length,  # no need to report: a vocab covering many Chinese characters generally implies short encodings for Chinese text
+        # "encoding length distribution of Chinese characters": json.dumps(dist_length),
+
+        "num(digit)": len(digit_tokens),
+        "len(digit)": _dist([len(token) for token in digit_tokens]),
+        "num(space)": len(space_tokens),
+        "len(space)": _dist([len(token) for token in space_tokens]),
+
+        # "num(byte)": len(byte_tokens)
+    }
+
+    for lang, tokens in tokens_by_lang.items():
+        result[f"num({lang})"] = len(tokens)
+        result["len(" + lang + ")"] = _dist([len(token) for token in tokens])
+
+    out_path = os.path.join(cache_dir, f"iter_vocab/{tokenizer_name.replace('/', '_')}.vocab.jsonl")
+    os.makedirs(os.path.dirname(out_path), exist_ok=True)  # the iter_vocab/ subdirectory may not exist yet
+    with open(out_path, "w", encoding="utf-8") as f_out:
+        for line in buffer:
+            f_out.write(line)
+    len_before = len(cache)
+    cache[tokenizer_name] = result
+    len_after = len(cache)
+    logger.info(f"saving {tokenizer_name} to memory and file cache: {len_before}->{len_after}")
+    with open(cache_path, "w", encoding="utf-8") as f_out:
+        f_out.write(json.dumps(cache, ensure_ascii=False, indent=2))
+    return result
+
+
+def to_dataframe(stats, columns):
+    table = []
+    for stat in stats.values():
+        filtered_stat = {}
+        for k, v in stat.items():
+            if not k.startswith("num") and not k.startswith("len"):
+                filtered_stat[k] = v
+            if any(column in k for column in columns):
+                k = k.replace("ja-kana", "kana")
+                filtered_stat[k] = v
+        table.append(filtered_stat)
+    df = pd.DataFrame(table)
+    return df
+
+
+def get_character_table(
+        tokenizer_filter: Optional[str] = None,
+        columns: Optional[str] = None,
+        return_type: Optional[Literal["dict", "dataframe"]] = "dataframe"
+) -> Union[pd.DataFrame, dict]:
+    """
+    """
+    logger.info(f"columns: {columns}, tokenizer_filter: {tokenizer_filter}")
+    stats = {}
+    if tokenizer_filter is not None:
+        tokenizer_names = [tokenizer_config.name_or_path for tokenizer_config in tokenizer_factory.all_tokenizer_configs
+                           if tokenizer_filter.lower() in tokenizer_config.name_or_path.lower()]
+    else:
+        tokenizer_names = tokenizer_factory.all_tokenizer_names
+
+    for tokenizer_name in tokenizer_names:
+        stat = iter_vocab(tokenizer_name)
+        stats[tokenizer_name] = stat
+
+    if return_type == "dataframe":
+        stats = to_dataframe(stats, columns)
+    return stats
+
+
+if __name__ == "__main__":
+    # aa = get_character_table(tokenizer_filter="baichuan")
+    df = get_character_table()
+    logger.info(f"\n{df.to_markdown(index=False)}")
app_compression.py → compression_app.py RENAMED
@@ -1,6 +1,14 @@
+"""
+TODO:
+- report tokenizer_impl
+- report OOV
+- report reversibility
+- add math and code corpora
+"""
+
 import gradio as gr
-from utils.compression_util import get_compression_leaderboard
-from utils.compression_util import common_corpuses
+from compression_util import get_compression_leaderboard, common_corpuses
+
 
 with gr.Blocks() as demo:
     # gr.Markdown("## Convertor")
@@ -44,63 +52,56 @@ with gr.Blocks() as demo:
     # )
 
     gr.Markdown("## 🛠️ Setting")  # ⚙
-    with gr.Accordion("Please select corpus and measure of compression rate ...", open=True):
+    with gr.Accordion("Please select the corpus and measure of compression rate.", open=True):
         # file size 💽 🖴, tokens 🧮
-        # gr.Markdown(
-        #     "Please select corpus and measure of compression rate.\n"
-        #     "`num_of_trillion_tokens` `num_of_billion_tokens`\n"
-        #     "- `b_tokens/g_bytes` measures how many billion tokens per gigabyte of corpus. \n"
-        #     "- `t_tokens/t_bytes` measures how many trillion tokens per terabyte of corpus. \n"
-        #     "- `n_chars/n_tokens` measures how many chars per token in the current corpus. \n\n"
-        #     "All the above measures depend on the corpus. You can reproduce this "
-        #     "procedure at [github](https://github.com/xu-song/tokenizer-arena/)."
-        # )
-
+        # Total amount of disk used
         with gr.Row():
-            compress_rate_corpus = gr.Dropdown(
-                common_corpuses,  # , "code"
-                value=["cc100-en", "cc100-zh-Hans"],
-                label="corpus",
-                multiselect=True
-                # info=""
-            )
+            with gr.Column():
+                compress_rate_corpus = gr.Dropdown(
+                    common_corpuses,  # , "code"
+                    value=["cc100/en", "cc100/zh-Hans", "cc100/fr", "cc100/es"],
+                    label="corpus",
+                    multiselect=True
+                    # info=""
+                )
 
-            # unit of file_size: gigabyte terabyte
-            # unit of token_num: million billion trillion
-            # The most common units of measurement include length (meter, inch, foot), weight (gram, kilogram, pound), volume (liter, gallon, milliliter), time (second, minute, hour)
-            compress_rate_unit = gr.Radio(
-                ["b_tokens/g_bytes", "t_tokens/t_bytes"],
-                value="b_tokens/g_bytes",
-                label="measure",
-            )
+                # unit of file_size: gigabyte terabyte
+                # unit of token_num: million billion trillion
+                # The most common units of measurement include length (meter, inch, foot), weight (gram, kilogram, pound), volume (liter, gallon, milliliter), time (second, minute, hour)
+                compress_rate_unit = gr.Radio(
+                    ["b_tokens/g_bytes", "t_tokens/t_bytes"],
+                    value="b_tokens/g_bytes",
+                    label="measure",  # evaluation metric
+                )
 
-        gr.Markdown(
-            # "`num_of_trillion_tokens` `num_of_billion_tokens`\n"
-            "- `b_tokens/g_bytes` measures how many billion tokens per gigabyte of corpus. \n"
-            "- `t_tokens/t_bytes` measures how many trillion tokens per terabyte of corpus. \n"
-            "- `n_chars/n_tokens` measures how many chars per token in the tokenized corpus. \n"
-            # "\nAll the above measures depend on the corpus. You can reproduce this "
-            # "procedure at [github](https://github.com/xu-song/tokenizer-arena/)."
-        )
+        gr.Markdown(
+            "- `corpus`: tokenization is performed on the selected subsets of the [cc100](https://huggingface.co/datasets/cc100) corpus.\n"
+            "- `b_tokens/g_bytes` measures how many billion tokens per gigabyte of corpus.\n"
+            "- `t_tokens/t_bytes` measures how many trillion tokens per terabyte of corpus.\n"
+            # "- `g_bytes/b_tokens` measures how many gigabytes of corpus per billion tokens.\n"
+            # "- `t_bytes/t_tokens` measures how many terabytes of corpus per trillion tokens.\n"
+            "- `char/token` measures how many chars per token in the tokenized corpus.\n"
+            "- `oov_ratio`: out-of-vocabulary ratio on the selected corpus. 👉 get [oov charset](https://huggingface.co/spaces/eson/tokenizer-arena/blob/main/stats/compression_rate.json)\n\n"
+            "You can reproduce this procedure with [compression_util.py](https://huggingface.co/spaces/eson/tokenizer-arena/blob/main/compression_util.py)."
+        )
 
     gr.Markdown("## 🏆 Compression Rate Leaderboard")
     search_bar = gr.Textbox(
-        placeholder="🔍 Search tokenizers (e.g., 'llama') and press ENTER...",
+        placeholder="🔍 Search by tokenizer or organization (e.g., 'llama', 'openai') and press ENTER...",
        show_label=False,
        elem_id="search-bar",
    )
-    compress_rate_table = gr.Dataframe()
+    compress_rate_table = gr.Dataframe(datatype="html")
 
     # func call
     compress_rate_corpus.change(
         get_compression_leaderboard,
-        inputs=[compress_rate_corpus, compress_rate_unit],
+        inputs=[compress_rate_corpus, compress_rate_unit, search_bar],
         outputs=compress_rate_table
     )
     compress_rate_unit.change(
         get_compression_leaderboard,
-        inputs=[compress_rate_corpus, compress_rate_unit],
+        inputs=[compress_rate_corpus, compress_rate_unit, search_bar],
         outputs=compress_rate_table
     )
     # file_size.change(
@@ -123,5 +124,6 @@ with gr.Blocks() as demo:
         inputs=[compress_rate_corpus, compress_rate_unit],
         outputs=compress_rate_table
     )
+
 if __name__ == "__main__":
     demo.launch()
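A worked example of the two measures (illustrative numbers only, not taken from the leaderboard):

```python
# illustrative numbers only: 1 GB of raw text tokenized into 0.25 billion tokens
n_bytes = 1_000_000_000
n_tokens = 250_000_000

# b_tokens/g_bytes: fewer tokens per byte means better compression
print((n_tokens / 1e9) / (n_bytes / 1e9))    # 0.25
# t_tokens/t_bytes: the same ratio expressed at terabyte scale
print((n_tokens / 1e12) / (n_bytes / 1e12))  # 0.25
```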
utils/compression_util.py → compression_util.py RENAMED
@@ -2,8 +2,8 @@
 
 Chinese data: clue superclue
 English data: glue cnn_dailymail gigaword
-code data:
-numbers:
+code:
+math:
 
 """
 
@@ -13,15 +13,15 @@ import sys
 import pandas as pd
 from datasets import load_dataset
 from utils.log_util import logger
-from vocab import load_tokener
-from vocab import all_tokenizers
+from vocab import tokenizer_factory, TokenizerConfig
 from typing import List, Optional, Union, Literal
 
 CURRENT_DIR = os.path.dirname(os.path.abspath(__file__))
 
 common_units = ["g_bytes/b_tokens", "b_tokens/g_bytes", "t_bytes/t_tokens", "t_tokens/t_bytes", "n_chars/n_tokens", ]
-common_corpuses = sorted(["cc100-en", "cc100-zh-Hans", "cc100-es", "cc100-fr", "cc100-de", "cc100-ko",
-                          "cc100-fa", "cc100-ar", "cc100-ja"])
+
+common_corpuses = sorted(["cc100/en", "cc100/zh-Hans", "cc100/es", "cc100/fr", "cc100/de", "cc100/ko",
+                          "cc100/fa", "cc100/ar", "cc100/ja"])
 
 VALID_CODES_CC100 = [
     "am", "ar", "as", "az", "be", "bg", "bn", "bn_rom", "br", "bs", "ca", "cs", "cy", "da", "de",
@@ -44,9 +44,12 @@ def get_n_bytes_of_string(string_text):
 
 
 def unit_convertor(stat, unit):
-    n_tokens = stat["n_tokens"]
-    n_chars = stat["n_chars"]
-    n_bytes = stat["n_bytes"]
+    n_tokens = stat["_n_tokens"]
+    n_chars = stat["_n_chars"]
+    n_bytes = stat["_n_bytes"]
+
+    if n_tokens is None:
+        return None
 
     n_tokens_in_billion = n_tokens / (1000 * 1000 * 1000)
     n_tokens_in_trillion = n_tokens / (1000 * 1000 * 1000 * 1000)
@@ -57,11 +60,9 @@ def unit_convertor(stat, unit):
 
     if unit == "n_tokens/n_bytes":
         value = n_tokens / n_bytes
-
-    # the average number of characters per token
-    elif unit in ["n_chars/n_tokens", "chars_per_token"]:  # important: how many characters an average token contains
+    elif unit in ["char/token", "chars_per_token"]:  # important: how many characters an average token contains
         value = n_chars / n_tokens
-    elif unit == "n_tokens/n_chars":  # how many tokens does one Chinese character take?
+    elif unit in ["token/char", "tokens_per_char"]:  # how many tokens does one Chinese character take?
         value = n_tokens / n_chars
     elif unit == "g_bytes/b_tokens":
         value = n_bytes_in_gb / n_tokens_in_billion
@@ -76,14 +77,48 @@ def unit_convertor(stat, unit):
     return round(value, 3)
 
 
+def _merge_stats_by_corpus(stats_by_corpus, oov_threshold=0.3):
+    """
+    """
+    all_stats = list(stats_by_corpus.values())
+    assert len(set([stats["tokenizer"] for stats in all_stats])) == 1
+    reversible = all(stat['reversible'] for stat in all_stats)
+    is_support = all(stat['oov_ratio'] < oov_threshold for stat in all_stats)
+
+    merged_stats = {
+        "tokenizer": all_stats[0]["tokenizer"],
+        "organization": all_stats[0]["organization"],
+        "vocab_size": all_stats[0]["vocab_size"],
+        "_n_bytes": 0,
+        "_n_tokens": 0 if is_support else None,
+        "_n_chars": 0,
+        "_n_oov_chars": 0,
+        "reversible": True,
+    }
+    for stats in all_stats:
+        merged_stats["_n_bytes"] += stats["_n_bytes"]
+        merged_stats["_n_chars"] += stats["_n_chars"]
+        if is_support:  # the number of tokens cannot be accurately counted when there are too many UNKs
+            merged_stats["_n_tokens"] += stats["_n_tokens"]
+        merged_stats["_n_oov_chars"] += stats["_n_oov_chars"]
+        merged_stats["reversible"] &= stats['reversible']
+
+    merged_stats.update({
+        # use the merged totals, not the stats of the last corpus in the loop
+        "oov_ratio": float("%.4g" % (merged_stats["_n_oov_chars"] / merged_stats["_n_chars"])),
+        "reversible": reversible
+    })
+    return merged_stats
+
+
 def to_dataframe(stats, units=None):
     if units is None:
         units = common_units
     elif not isinstance(units, list):
         units = [units]
     table = []
-    for tokenizer_name, stat in stats.items():
-        columns = {"tokenizer": tokenizer_name, "vocab_size": stat["vocab_size"]}
+
+    for stat in stats.values():
+        columns = {k: v for k, v in stat.items() if not k.startswith("_")}
         for unit in units:
             if unit not in stat:
                 columns[unit] = unit_convertor(stat, unit)
@@ -98,105 +133,159 @@ cache = {}
 
 
 def tokenize_corpus(
-        tokenizer_name: str,
+        tokenizer_name: str,  # results can be served from cache without loading the tokenizer
        corpuses: List[str],
-        cache_path: str = "stats/compress_rate.json"
+        cache_dir: str = "stats"
 ) -> dict:
     """
     This needs its own cache because it is slow.
-    :param tokenizer_name:
+    :param tokenizer_config: can be used without loading the tokenizer
     :param corpuses:
     :param cache_path:
     :return:
     """
 
-    def _tokenize(tokenizer, datasets):
+    def _char_based_oov(src_text, decode_text):
+        oov_chars = []
+        for char in src_text:
+            if char not in decode_text:
+                oov_chars.append(char)
+
+        n_oov_chars = len(oov_chars)
+        oov_charset = list(dict.fromkeys(oov_chars))
+        return n_oov_chars, oov_charset
+
+    def _tokenize(tokenizer, datasets, detail_path=None):
+        """
+        export_diff: true | false
+        :param tokenizer:
+        :param datasets:
+        :param detail_path:
+        :return:
+        """
+        n_bytes = 0
         n_tokens = 0
         n_chars = 0
-        n_bytes = 0
+        n_oov_chars = 0
+        diff_details = []
+        oov_charset = set()
+        unk_token_id = None
+        if hasattr(tokenizer, "unk_token"):
+            unk_token_id = tokenizer.unk_token_id
         for dataset in datasets:
             for item in dataset:
                 text = item["text"]
                 n_bytes += get_n_bytes_of_string(text)
                 n_chars += len(text)
-                encodings = tokenizer.encode(text)
-                n_tokens += len(encodings)
+                ids = tokenizer.encode(text, add_special_tokens=False)
+
+                # detect oov
+                decode_text = tokenizer.decode(ids)
+                decode_text_without_unk = tokenizer.decode([token_id for token_id in ids if token_id != unk_token_id])
+                if decode_text != text:
+                    _n_oov_chars, _oov_charset = _char_based_oov(text, decode_text_without_unk)
+                    diff_details.append(
+                        {
+                            "text": text,
+                            "decode_text": decode_text,
+                            "decode_text_without_unk": decode_text_without_unk,
+                            "n_oov_chars": _n_oov_chars,
+                            'oov_ratio': _n_oov_chars / len(text),
+                            'oov_charset': json.dumps(_oov_charset, ensure_ascii=False),
+                        }
+                    )
+                    n_oov_chars += _n_oov_chars
+                    oov_charset.update(_oov_charset)
+                n_tokens += len(ids)
         stat = {
-            # "vocab_size": len(tokenizer.vocab_size,
-            "vocab_size": len(tokenizer),
-            "n_bytes": n_bytes,
-            "n_tokens": n_tokens,
-            "n_chars": n_chars,
+            "_n_bytes": n_bytes,
+            "_n_tokens": n_tokens,
+            "_n_chars": n_chars,
+            "_n_oov_chars": n_oov_chars,
+            "oov_ratio": n_oov_chars / n_chars,
+            '_oov_charset': json.dumps(list(oov_charset), ensure_ascii=False),
+            "reversible": len(diff_details) == 0
        }
+
+        if detail_path and diff_details:
+            logger.info(f"saving tokenization detail to '{detail_path}'")
+            with open(detail_path, "w", encoding="utf-8") as f:
+                f.write(json.dumps(diff_details, ensure_ascii=False, indent=2))
+            # print(f"{tokenizer_config.name_or_path}, {infer_tokenizer_type(tokenizer_config)}\n"
+            #       f"reversible: false; unk_token: {get_unk(tokenizer_config)},"
+            #       f" unk_ratio: {unk_count / len(encoding):.4f}; oov: []")
+            # for diff_detail in diff_details:
+            #     # print(f"text[{i}] = {str(bytes(text[i:], 'utf-8'))}\n"
+            #     #       f"decoding[{i}] = {str(bytes(decoding[i:], 'utf-8'))}")
+            #     f.write(f"text= {json.dumps(text[i:], ensure_ascii=False)}, \n"
+            #             f"decoding[{i}] = {json.dumps(decoding[i:], ensure_ascii=False)}")
        return stat
 
     # load from cache
-    cache_id = f"{tokenizer_name}.{'.'.join(corpuses)}"
+    cache_id = f"{tokenizer_name} @ {'.'.join(corpuses)}"
+    cache_path = os.path.join(cache_dir, "compression_rate.json")
     if not cache and os.path.exists(cache_path):
         with open(cache_path, "r", encoding="utf-8") as f_tmp:
             cache.update(json.load(f_tmp))
     if cache_id in cache:
-        logger.info(f"loading {cache_id} from in-memory cache")
+        # logger.info(f"loading {cache_id} from in-memory cache")
         return cache[cache_id]
 
     # tokenize corpus
-    tokenizer = load_tokener(tokenizer_name)
-    datasets = [load_dataset("eson/cc100-samples", corpus.replace("cc100-", ""), split="train") for corpus in corpuses]
-    stat = _tokenize(tokenizer, datasets)
+    tokenizer = tokenizer_factory.get_tokenizer(tokenizer_name)
+    datasets = [load_dataset("eson/cc100-samples", corpus.replace("cc100/", ""), split="train") for corpus in corpuses]
+
+    stat = {
+        "tokenizer": tokenizer_factory.get_name_with_hyperlink(tokenizer_name),
+        "organization": tokenizer_factory.get_tokenizer_config(tokenizer_name).org,
+        "vocab_size": len(tokenizer),
+    }
+    tokenize_detail_dir = os.path.join(cache_dir, "compression_rate")
+    os.makedirs(tokenize_detail_dir, exist_ok=True)
+    tokenize_detail_path = os.path.join(tokenize_detail_dir, cache_id.replace("/", ".") + ".diff.json")
+    stat.update(_tokenize(tokenizer, datasets, detail_path=tokenize_detail_path))
+    # add basic info
 
     # save to cache
     len_before = len(cache)
     cache[cache_id] = stat
     len_after = len(cache)
-    logger.info(f"saving {cache_id} to in-memory and file cache: {len_before}->{len_after}")
+    logger.info(f"saving '{cache_id}' to memory and file cache '{cache_path}': {len_before}->{len_after}")
     with open(cache_path, "w", encoding="utf-8") as f_tmp:
-        json.dump(cache, f_tmp, indent=2)
+        json.dump(cache, f_tmp, ensure_ascii=False, indent=2)
     return stat
 
 
 def get_compression_leaderboard(
-        corpuses: List[str] = ['cc100-en'],
+        corpuses: List[str] = ['cc100/en'],
         unit: str = "b_tokens/g_bytes",
         tokenizer_filter: Optional[str] = None,
         return_type: Optional[Literal["dict", "dataframe"]] = "dataframe"
 ) -> Union[pd.DataFrame, dict]:
     """
-    ## TODO
-    - search by organization,
     """
     logger.info(f"corpuses: {corpuses}; unit: {unit}; tokenizer_filter: {tokenizer_filter}")
     stats = {}
     if tokenizer_filter is not None:
-        tokenizers = [tokenizer_name for tokenizer_name in all_tokenizers if tokenizer_filter in tokenizer_name]
+        tokenizer_names = [tokenizer_name for tokenizer_name in tokenizer_factory.all_tokenizer_names
+                           if tokenizer_filter.lower() in tokenizer_name.lower()]
     else:
-        tokenizers = all_tokenizers
-    for lang in corpuses:
-        for tokenizer_name in tokenizers:
-            stat = tokenize_corpus(tokenizer_name, [lang])
-            stats[tokenizer_name] = stat
+        tokenizer_names = tokenizer_factory.all_tokenizer_names
+    for tokenizer_name in tokenizer_names:
+        stats_by_corpus = {}
+        for corpus in corpuses:
+            stats_by_corpus[corpus] = tokenize_corpus(tokenizer_name, [corpus])
+        stats[tokenizer_name] = _merge_stats_by_corpus(stats_by_corpus)
 
     if return_type == "dataframe":
         token_number_unit, file_size_unit = unit.split("/")
         reverse_unit = f"{file_size_unit}/{token_number_unit}"
-        stats = to_dataframe(stats, [unit, reverse_unit, "n_chars/n_tokens"])
-        stats = stats.sort_values(unit)
-        stats = stats.rename(columns={unit: f' ⬆️{unit}'})
+        stats = to_dataframe(stats, [unit, reverse_unit, "char/token"])
+        stats = stats.sort_values(["oov_ratio", unit], ascending=[True, True])
+        stats = stats.rename(columns={"oov_ratio": ' ⬆️oov_ratio'}).rename(columns={unit: f' ⬆️{unit}'})  # ⬇
     return stats
 
 
-def update_compress_rate():
-    pass
-
-
-def test():
-    tokenizer_name = "gpt_4"
-    tokenizer = load_tokener(tokenizer_name)
-    stats = {tokenizer_name: tokenize_corpus(tokenizer, ["cc100-en", "cc100-zh-Hans"])}
-    df = to_dataframe(stats)
-    # print(df.to_markdown(index=False, tablefmt='fancy_grid'))
-    logger.info(f"\n{df.to_markdown(index=False)}")
-
-
 def main():
     if len(sys.argv) == 3:
         tokenizer_filter = [sys.argv[1]]
@@ -204,11 +293,12 @@ def main():
     else:
         tokenizer_filter = None
         corpuses = common_corpuses
-    df = get_compression_leaderboard(corpuses)
+    # tokenizer_filter = "openai"
+    # corpuses = ["cc100/en", "cc100/zh-Hans"]
+    df = get_compression_leaderboard(corpuses, tokenizer_filter=tokenizer_filter)
     # print(df.to_markdown(index=False, tablefmt='fancy_grid'))
     logger.info(f"\n{df.to_markdown(index=False)}")
 
 
 if __name__ == "__main__":
     main()
-    # test()
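The `oov_ratio` above comes from a character-level round trip: encode, decode, then count source characters missing from the decoded text. A minimal standalone sketch of the same idea (any Hugging Face tokenizer works here; `bert-base-cased` is only an example):

```python
from transformers import AutoTokenizer

# example tokenizer; swap in any model id
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
text = "Hola señor 华为"
ids = tokenizer.encode(text, add_special_tokens=False)
decoded = tokenizer.decode(ids)

# characters lost in the round trip count as OOV; reversible == (decoded == text)
oov_chars = [c for c in text if c not in decoded]
print(decoded, oov_chars, len(oov_chars) / len(text))
```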
 
config.py DELETED
@@ -1,20 +0,0 @@
-USE_REMOTE = False  # use remote tokenizer or local tokenizer
-
-# load_vocab_with_SPECIAL_TOKEN = True  # otherwise vocab size is computed incorrectly and overlap_token counts become inconsistent
-
-# encoding config
-ADD_SPECIAL_TOKEN = False
-
-#
-LAZY_IMPORT = True
-
-# DEBUG: set the environment variable RUST_BACKTRACE=full
-#
-
-default_user_input = """\
-Replace this text in the input field to see how tokenization works.
-Buenos días!
-华为发布Mate60手机。
-ラグビーワールドカップ2023フランス"""
-default_tokenizer_type_1 = "llama3"
-default_tokenizer_type_2 = "gpt_4"
patcher/README.md ADDED
@@ -0,0 +1,15 @@
+
+
+## The vocab-size inconsistency problem
+
+
+- .vocab_size
+  - Size of the base vocabulary (without the added tokens)
+  - from https://huggingface.co/transformers/v2.11.0/main_classes/tokenizer.html
+- len(tokenizer)
+  - Size of the full vocabulary with the added tokens.
+  - https://github.com/huggingface/transformers/issues/12632
+- max(tokenizer.get_vocab().values())
+  - includes non-contiguous token_ids
+  - https://github.com/huggingface/transformers/issues/4875
+
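A minimal sketch of the three measures (assuming the `transformers` package; the model name is only an example):

```python
from transformers import AutoTokenizer

# any HF tokenizer works here; bert-base-cased is just an example
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
tokenizer.add_tokens(["<my_new_token>"])  # hypothetical added token

print(tokenizer.vocab_size)                     # base vocabulary, added tokens excluded
print(len(tokenizer))                           # full vocabulary, added tokens included
print(max(tokenizer.get_vocab().values()) + 1)  # highest id + 1; may differ if ids are non-contiguous
```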
patcher/sptokenizer_patch_deprecated.py DELETED
@@ -1,105 +0,0 @@
-"""
-
-## adapt to transformers tokenizer
-
-https://github.com/huggingface/transformers/blob/v4.40.1/src/transformers/tokenization_utils.py#L379
-
-## usage
-
-- grok
-
-## Risk assessment
-
-- May interfere with normal use of sentencepiece.SentencePieceProcessor; e.g. .vocab_size used to be a method and becomes a property after patching.
-
-
-## TODO
-
-Use a wrapper instead of a patch. Common tokenizers usually wrap sentencepiece anyway.
-"""
-
-import sentencepiece
-
-
-@property
-def vocab_size(self):
-    """Returns vocab size"""
-    return self.get_piece_size()
-
-
-def get_vocab(self):
-    """Returns vocab as a dict"""
-    vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)}
-    # vocab.update(self.added_tokens_encoder)
-    return vocab
-
-
-def _tokenize(self, text):
-    """Returns a tokenized string."""
-    return self.encode(text, out_type=str)
-
-
-def _convert_token_to_id(self, token):
-    """Converts a token (str) in an id using the vocab."""
-    return self.piece_to_id(token)
-
-
-def _convert_id_to_token(self, index):
-    """Converts an index (integer) in a token (str) using the vocab."""
-    token = self.IdToPiece(index)
-    return token
-
-
-def convert_ids_to_tokens(self, ids, skip_special_tokens=False):
-    """ copy from transformers.PreTrainedTokenizer
-    Converts a single index or a sequence of indices in a token or a sequence of tokens, using the vocabulary and
-    added tokens.
-
-    Args:
-        ids (`int` or `List[int]`):
-            The token id (or token ids) to convert to tokens.
-        skip_special_tokens (`bool`, *optional*, defaults to `False`):
-            Whether or not to remove special tokens in the decoding.
-
-    Returns:
-        `str` or `List[str]`: The decoded token(s).
-    """
-    self._added_tokens_decoder = {}  # added by xs
-    if isinstance(ids, int):
-        if ids in self._added_tokens_decoder:
-            return self._added_tokens_decoder[ids].content
-        else:
-            return self._convert_id_to_token(ids)
-    tokens = []
-    for index in ids:
-        index = int(index)
-        if skip_special_tokens and index in self.all_special_ids:
-            continue
-        if index in self._added_tokens_decoder:
-            tokens.append(self._added_tokens_decoder[index].content)
-        else:
-            tokens.append(self._convert_id_to_token(index))
-    return tokens
-
-
-def encode(self, *args, **kwargs):
-    """
-    add_special_tokens is popped for compatibility with hf tokenizers
-    """
-    kwargs.pop("add_special_tokens", None)
-    kwargs.pop("allowed_special", None)
-    return self.Encode(*args, **kwargs)
-
-
-def decode(self, *args, **kwargs):
-    kwargs.pop("skip_special_tokens", None)
-    return self.Decode(*args, **kwargs)
-
-
-sentencepiece.SentencePieceProcessor.vocab_size = vocab_size
-sentencepiece.SentencePieceProcessor.get_vocab = get_vocab
-sentencepiece.SentencePieceProcessor._convert_id_to_token = _convert_id_to_token
-sentencepiece.SentencePieceProcessor.convert_ids_to_tokens = convert_ids_to_tokens
-# sentencepiece.SentencePieceProcessor.tokenize = _tokenize
-sentencepiece.SentencePieceProcessor.encode = encode
-sentencepiece.SentencePieceProcessor.decode = decode
patcher/sptokenizer_wrapper.py DELETED
@@ -1,61 +0,0 @@
-"""Wrap sentencepiece.SentencePieceProcessor so that it conforms to the tokenizer interface in transformers.
-
-## reference
-
-
-## usage
-
-- grok
-
-"""
-
-import sentencepiece as spm
-from transformers import PreTrainedTokenizer
-
-
-class SPTokenizerWrapper(PreTrainedTokenizer):
-    """
-
-    ## implemented in PreTrainedTokenizer
-    - convert_ids_to_tokens
-    """
-
-    def __init__(self, vocab_file):
-        self.vocab_file = vocab_file
-        self.sp_model = spm.SentencePieceProcessor(self.vocab_file)
-        super().__init__()
-
-    @property
-    def vocab_size(self):
-        """Returns vocab size"""
-        return self.sp_model.get_piece_size()
-
-    def get_vocab(self):
-        """Returns vocab as a dict"""
-        vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)}
-        return vocab
-
-    def _convert_token_to_id(self, token):
-        """Converts a token (str) in an id using the vocab."""
-        return self.sp_model.piece_to_id(token)
-
-    def _convert_id_to_token(self, index):
-        """Converts an index (integer) in a token (str) using the vocab."""
-        token = self.sp_model.IdToPiece(index)
-        return token
-
-    # def (self, ids, skip_special_tokens=False):  # implemented in PreTrainedTokenizer
-
-
-    def encode(self, *args, **kwargs):
-        kwargs.pop("add_special_tokens", None)
-        kwargs.pop("allowed_special", None)
-        return self.sp_model.Encode(*args, **kwargs)
-
-    def decode(self, *args, **kwargs):
-        kwargs.pop("skip_special_tokens", None)
-        return self.sp_model.Decode(*args, **kwargs)
-
-
-
-# PreTrainedTokenizer.convert_ids_to_tokens
patcher/tiktoken_patch.py CHANGED
@@ -70,8 +70,8 @@ def get_vocab(self, token_type="str"):
 
 @property
 def vocab_size(self):
-    """Returns vocab size"""
-    return self.n_vocab
+    """Returns vocab size without special tokens"""
+    return len(self._mergeable_ranks)
 
 
 def encode(self, *args, **kwargs):
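For context, a small check of why `n_vocab` differs (assuming the `tiktoken` package; `_mergeable_ranks` and `_special_tokens` are internal attributes of `tiktoken.Encoding`):

```python
import tiktoken

enc = tiktoken.get_encoding("cl100k_base")
# n_vocab is derived from the highest token id, so it also covers special tokens
print(enc.n_vocab)                # 100277 for cl100k_base
print(len(enc._mergeable_ranks))  # 100256: BPE merge table only, no special tokens
print(enc._special_tokens)        # e.g. {'<|endoftext|>': 100257, ...}
```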
app_playground.py → playground_app.py RENAMED
@@ -36,9 +36,12 @@ table
 """
 
 import gradio as gr
-from vocab import all_tokenizers
-from util import *
-from examples import example_fn, example_types
+from vocab import tokenizer_factory
+from playground_examples import example_types, example_fn
+from playground_util import tokenize, tokenize_pair, basic_count, get_overlap_token_size, on_load
+
+
+
 
 get_window_url_params = """
 function(url_params) {
@@ -48,6 +51,8 @@ get_window_url_params = """
 }
 """
 
+all_tokenizer_name = [(config.name_display, config.name_or_path) for config in tokenizer_factory.all_tokenizer_configs]
+
 with gr.Blocks() as demo:
     # links: https://www.coderstool.com/utf8-encoding-decoding
     # feature: enter text and tokenize it
@@ -60,6 +65,7 @@ with gr.Blocks() as demo:
         example_types,
         value="Examples",
         type="index",
+        allow_custom_value=True,
        show_label=False,
        container=False,
        scale=0,
@@ -102,21 +108,26 @@ with gr.Blocks() as demo:
         with gr.Column(scale=6):
             with gr.Group():
                 tokenizer_name_1 = gr.Dropdown(
-                    all_tokenizers,
+                    all_tokenizer_name,
                    label="Tokenizer 1",
                )
            with gr.Group():
                with gr.Row():
+                    organization_1 = gr.TextArea(
+                        label="Organization",
+                        lines=1,
+                        elem_classes="statistics",
+                    )
                    stats_vocab_size_1 = gr.TextArea(
                        label="Vocab Size",
                        lines=1,
                        elem_classes="statistics"
                    )
-                    stats_zh_token_size_1 = gr.TextArea(
-                        label="ZH char/word",
-                        lines=1,
-                        elem_classes="statistics",
-                    )
+                    # stats_zh_token_size_1 = gr.TextArea(
+                    #     label="ZH char/word",
+                    #     lines=1,
+                    #     elem_classes="statistics",
+                    # )
                    # stats_compress_rate_1 = gr.TextArea(
                    #     label="Compress Rate",
                    #     lines=1,
@@ -140,21 +151,26 @@ with gr.Blocks() as demo:
         with gr.Column(scale=6):
             with gr.Group():
                 tokenizer_name_2 = gr.Dropdown(
-                    all_tokenizers,
+                    all_tokenizer_name,
                    label="Tokenizer 2",
                )
            with gr.Group():
                with gr.Row():
-                    stats_vocab_size_2 = gr.TextArea(
-                        label="VocabSize",
+                    organization_2 = gr.TextArea(
+                        label="Organization",
                        lines=1,
-                        elem_classes="statistics"
+                        elem_classes="statistics",
                    )
-                    stats_zh_token_size_2 = gr.TextArea(
-                        label="ZH char/word",  # Chinese chars/words
+                    stats_vocab_size_2 = gr.TextArea(
+                        label="Vocab Size",
                        lines=1,
-                        elem_classes="statistics",
+                        elem_classes="statistics"
                    )
+                    # stats_zh_token_size_2 = gr.TextArea(
+                    #     label="ZH char/word",  # Chinese chars/words
+                    #     lines=1,
+                    #     elem_classes="statistics",
+                    # )
                    # stats_compress_rate_2 = gr.TextArea(
                    #     label="Compress Rate",
                    #     lines=1,
@@ -196,7 +212,7 @@ with gr.Blocks() as demo:
 
     tokenizer_name_1.change(tokenize, [user_input, tokenizer_name_1],
                             [output_text_1, output_table_1])
-    tokenizer_name_1.change(basic_count, [tokenizer_name_1], [stats_vocab_size_1, stats_zh_token_size_1])
+    tokenizer_name_1.change(basic_count, [tokenizer_name_1], [stats_vocab_size_1, organization_1])
     tokenizer_name_1.change(get_overlap_token_size, [tokenizer_name_1, tokenizer_name_2],
                             [stats_overlap_token_size_1, stats_overlap_token_size_2])
     # tokenizer_type_1.change(get_compress_rate, [tokenizer_type_1, compress_rate_corpus, compress_rate_unit],
@@ -209,7 +225,7 @@ with gr.Blocks() as demo:
 
     tokenizer_name_2.change(tokenize, [user_input, tokenizer_name_2],
                             [output_text_2, output_table_2])
-    tokenizer_name_2.change(basic_count, [tokenizer_name_2], [stats_vocab_size_2, stats_zh_token_size_2])
+    tokenizer_name_2.change(basic_count, [tokenizer_name_2], [stats_vocab_size_2, organization_2])
     tokenizer_name_2.change(get_overlap_token_size, [tokenizer_name_1, tokenizer_name_2],
                             [stats_overlap_token_size_1, stats_overlap_token_size_2])
     # tokenizer_type_2.change(get_compress_rate,
@@ -235,7 +251,6 @@ with gr.Blocks() as demo:
         [user_input, tokenizer_name_1, tokenizer_name_2]
     )
 
-    demo.load(js=open("js/onload.js", "r", encoding="utf-8").read())
     demo.load(
         fn=on_load,
         inputs=[user_input],  # just pass an empty object here
examples.py → playground_examples.py RENAMED
@@ -19,11 +19,11 @@ https://www.computerhope.com/jargon/s/specchar.htm
 
 examples = {
     "en": [
-        ["number: (10086 + 98) = 100184", "llama", "bloom"],  #
-        ["whitespace: 2spaces  8spaces\t1tab\t\t2tab\n1newline", "llama", "bert_base_cased"],  # chatglm has blank_n; bert drops whitespace
+        ["number: (10086 + 98) = 100184", "huggyllama/llama-7b", "bigscience/bloom"],  #
+        ["whitespace: 2spaces  8spaces\t1tab\t\t2tab\n1newline", "huggyllama/llama-7b", "google-bert/bert-base-cased"],  # chatglm has blank_n; bert drops whitespace
         # !?。"#$%&'()*+,-/:;<=>@[\]^_`{|}~⦅⦆「」、、〃》「」『』【】〔〕〖〗〘〙〚〛〜〝〞〟〰〾〿–—‘’‛“”„‟…‧﹏.
-        ["punctuation: ,.:/?+=\",。!?;【】〔〕〖〗", "gemma_7b", "llama"],  # the llama vocab is rather small
-        ["symbol: 🦙❤❥웃유♋☮✊☏☢☚✔☑♚▢♪✈✞÷↑↓▤▥⊙■□▣▽¿─│♥❣▬▫☿Ⓐ ✋✉☣☤", "baichuan", "llama"],
+        ["punctuation: ,.:/?+=\",。!?;【】〔〕〖〗", "google/gemma-7b", "huggyllama/llama-7b"],  # the llama vocab is rather small
+        ["symbol: 🦙❤❥웃유♋☮✊☏☢☚✔☑♚▢♪✈✞÷↑↓▤▥⊙■□▣▽¿─│♥❣▬▫☿Ⓐ ✋✉☣☤", "baichuan-inc/Baichuan-7B", "huggyllama/llama-7b"],
         # ["special: [PAD] [UNK] [CLS] [SEP] [MASK] <|system|> <|user|> <|assistant|> <|endoftext|>", "", ""],
     ],
     "zh": [
@@ -37,16 +37,16 @@ examples = {
 
 more_examples = [
     # bert family
-    ("bert_base_cased", "bert_base_uncased", "", ""),  # clue VS kplug, bert VS clue
-    ("bert_base_cased", "clue", "", "adds []()"),
-    ("clue", "kplug", "", ""),
+    ("bert-base-cased", "bert-base-uncased", "", ""),  # clue VS kplug, bert VS clue
+    ("bert-base-cased", "clue", "", "adds []()"),
+    ("roberta-chinese-clue", "kplug", "", ""),
 
     # llama family (sentencepiece-based)
     ("baichuan", "baichuan2", "baichuan2 supports multiple spaces  and multiple newlines\n\n\n; do not add dummy prefix as Baichuan1"),
     ("llama", "baichuan2", "baichuan2 supports multiple spaces  and multiple newlines\n\n"),
-    ("llama", "chinese_llama2", ""),
+    ("llama", "chinese-llama-2-7b", ""),
     ("llama", "llama3", "expanded vocabulary"),
-    ("chinese_llama", "chinese_llama2", ""),
+    ("chinese-llama-lora-7b", "chinese-llama-2-7b", ""),
 
     # glm family (sentencepiece-based)
     ("glm", "chatglm1", ""),
util.py → playground_util.py RENAMED
@@ -1,22 +1,33 @@
1
  import gradio as gr
2
  import json
3
  import pandas as pd
4
- import config
5
- from vocab import load_tokener
6
- from utils.character_util import iter_vocab
7
  from utils.log_util import logger
8
- from utils.compression_util import tokenize_corpus, unit_convertor
9
  from functools import lru_cache
10
 
 
 
 
 
 
 
 
 
11
 
12
  @lru_cache
13
- def tokenize(text, tokenizer_name, color_num=5):
 
 
 
 
 
14
  """
15
  """
16
  logger.info("param=" + json.dumps({"text": text, "tokenizer_type": tokenizer_name}, ensure_ascii=False))
17
  pos_tokens = []
18
- tokenizer = load_tokener(tokenizer_name)
19
- if config.ADD_SPECIAL_TOKEN:
20
  encoding = tokenizer.encode(text, add_special_tokens=True)
21
  else:
22
  encoding = tokenizer.encode(text, add_special_tokens=False)
@@ -34,7 +45,7 @@ def tokenize(text, tokenizer_name, color_num=5):
34
  token_str = token.decode("utf-8")
35
  except:
36
  token_str = token.decode("utf-8", errors="ignore")
37
- logger.error(f"{idx}: decode_error: " + json.dumps( # gpt_35_turbo 经常有token会decode error,这里用来记录一下
38
  {"tokenizer_type": tokenizer_name, "token": str(token), "token_str": token_str},
39
  ensure_ascii=False))
40
 
@@ -45,7 +56,8 @@ def tokenize(text, tokenizer_name, color_num=5):
45
  token_bytes = bytes(token_str, "utf-8")
46
  # json_dumps = json.dumps(token_str)
47
  else:
48
- logger.error(f"{idx}: wrong type for token {token_id} {type(token)} " + json.dumps({"text": text, "tokenizer_type": tokenizer_name}, ensure_ascii=False))
 
49
  token_str = token
50
  token_bytes = token
51
  # continue
@@ -82,30 +94,22 @@ def tokenize_pair(text, tokenizer_type_1, tokenizer_type_2):
82
  @lru_cache
83
  def basic_count(tokenizer_name):
84
  stats = iter_vocab(tokenizer_name)
85
- return stats['vocab_size'], f'{stats["中文token数"]}'
86
  # return tokenizer.vocab_size, f'{stats["中文汉字数"]["中文单字"]}/{stats["中文汉字数"]["中文多字"]}'
87
 
88
- def get_compress_rate(tokenizer_type, all_corpus, unit):
89
- tokenizer = load_tokener(tokenizer_type)
90
- compress_rate_stats = tokenize_corpus(tokenizer, all_corpus)
91
- compress_rate = unit_convertor(compress_rate_stats, unit)
92
- return compress_rate
93
 
94
- # def get_all_compress_rate(corpuses, unit):
95
- # stats = {}
96
- # for lang in corpuses:
97
- # print("###" * 10 + lang)
98
- # for tokenizer_name in tokenizers:
99
- # tokenizer = load_tokener(tokenizer_name)
100
- # stat = tokenize_corpus(tokenizer, [lang])
101
- # stats[tokenizer_name] = stat
102
- # pprint(stats)
103
 
104
 
105
  @lru_cache
106
- def get_overlap_token_size(tokenizer_type_1, tokenizer_type_2):
107
- tokenizer1 = load_tokener(tokenizer_type_1)
108
- tokenizer2 = load_tokener(tokenizer_type_2)
109
 
110
  vocab_set_1 = tokenizer1.get_vocab().keys()
111
  vocab_set_2 = tokenizer2.get_vocab().keys()
@@ -121,11 +125,10 @@ def get_overlap_token_size(tokenizer_type_1, tokenizer_type_2):
121
  overlap_tokens = vocab_set_1 & vocab_set_2
122
  overlap_token_size = len(overlap_tokens)
123
  logger.info(
124
- f"{overlap_token_size} OverlapTokens of {tokenizer_type_1} {tokenizer_type_2}: {list(overlap_tokens)[:10]}")
125
  return overlap_token_size, overlap_token_size
126
 
127
 
128
-
129
  def on_load(url_params, request: gr.Request):
130
  """
131
  onLoad
@@ -148,15 +151,16 @@ def on_load(url_params, request: gr.Request):
148
  # if "referer" in request.headers: # not work for huggingface-space
149
  # url_params = parse_qs(urlparse(request.headers["referer"]).query)
150
  # url_params = {k: v[0] for k, v in url_params.items() if len(v) > 0}
151
- tokenizer_type_1 = url_params.get("tokenizer1", config.default_tokenizer_type_1)
152
- tokenizer_type_2 = url_params.get("tokenizer2", config.default_tokenizer_type_2)
153
- text = url_params.get("text", config.default_user_input)
154
  logger.info(f"client_ip: {client_ip}; params: {url_params}")
155
  return text, tokenizer_type_1, tokenizer_type_2
156
 
157
 
158
- def compress_rate_unit_change(unit):
159
- return gr.update(label=f"Compress Rate: {unit}"), gr.update(label=f"Compress Rate: {unit}"),
 
160
 
161
  def test_coding():
162
  bytes1 = b'\xe4\xb8\xad'
@@ -164,5 +168,5 @@ def test_coding():
164
 
165
 
166
  if __name__ == "__main__":
167
- print(get_overlap_token_size("gpt_35_turbo", "gpt_4"))
168
  # print(basic_count("internlm_chat_7b"))
 
1
  import gradio as gr
2
  import json
3
  import pandas as pd
4
+ from vocab import tokenizer_factory
5
+ from character_util import iter_vocab
 
6
  from utils.log_util import logger
 
7
  from functools import lru_cache
8
 
9
+ default_user_input = """\
10
+ Replace this text in the input field to see how tokenization works.
11
+ Buenos días!
12
+ 华为发布Mate60手机。
13
+ ラグビーワールドカップ2023フランス"""
14
+ # default_tokenizer_name_1 = "Meta/llama3"
15
+ default_tokenizer_name_1 = "gradientai/Llama-3-8B-Instruct-Gradient-1048k"
16
+ default_tokenizer_name_2 = "openai/gpt-4"
17
 
18
  @lru_cache
19
+ def tokenize(
20
+ text: str,
21
+ tokenizer_name: str,
22
+ color_num: int = 5,
23
+ add_special_token: bool = False
24
+ ):
25
  """
26
  """
27
  logger.info("param=" + json.dumps({"text": text, "tokenizer_type": tokenizer_name}, ensure_ascii=False))
28
  pos_tokens = []
29
+ tokenizer = tokenizer_factory.get_tokenizer(tokenizer_name)
30
+ if add_special_token:
31
  encoding = tokenizer.encode(text, add_special_tokens=True)
32
  else:
33
  encoding = tokenizer.encode(text, add_special_tokens=False)
 
45
  token_str = token.decode("utf-8")
46
  except:
47
  token_str = token.decode("utf-8", errors="ignore")
48
+ logger.error(f"{idx}: decode_error: " + json.dumps( # gpt_35_turbo 经常有token会decode error,这里用来记录一下
49
  {"tokenizer_type": tokenizer_name, "token": str(token), "token_str": token_str},
50
  ensure_ascii=False))
51
 
 
56
  token_bytes = bytes(token_str, "utf-8")
57
  # json_dumps = json.dumps(token_str)
58
  else:
59
+ logger.error(f"{idx}: wrong type for token {token_id} {type(token)} " + json.dumps(
60
+ {"text": text, "tokenizer_type": tokenizer_name}, ensure_ascii=False))
61
  token_str = token
62
  token_bytes = token
63
  # continue
 
94
  @lru_cache
95
  def basic_count(tokenizer_name):
96
  stats = iter_vocab(tokenizer_name)
97
+ return stats['vocab_size'], f'{stats["organization"]}'
98
  # return tokenizer.vocab_size, f'{stats["中文汉字数"]["中文单字"]}/{stats["中文汉字数"]["中文多字"]}'
99
 
 
 
 
 
 
100
 
101
+ # def get_compress_rate(tokenizer_name, all_corpus, unit):
102
+ # tokenizer = tokenizer_factory.get_tokenizer(tokenizer_name)
103
+ # compress_rate_stats = tokenize_corpus(tokenizer, all_corpus)
104
+ # compress_rate = unit_convertor(compress_rate_stats, unit)
105
+ # return compress_rate
106
+
 
 
 
107
 
108
 
109
  @lru_cache
110
+ def get_overlap_token_size(tokenizer_name_1, tokenizer_name_2):
111
+ tokenizer1 = tokenizer_factory.get_tokenizer(tokenizer_name_1)
112
+ tokenizer2 = tokenizer_factory.get_tokenizer(tokenizer_name_2)
113
 
114
  vocab_set_1 = tokenizer1.get_vocab().keys()
115
  vocab_set_2 = tokenizer2.get_vocab().keys()
 
125
  overlap_tokens = vocab_set_1 & vocab_set_2
126
  overlap_token_size = len(overlap_tokens)
127
  logger.info(
128
+ f"{overlap_token_size} OverlapTokens of {tokenizer_name_1} {tokenizer_name_2}: {list(overlap_tokens)[:10]}")
129
  return overlap_token_size, overlap_token_size
130
 
131
 
 
132
  def on_load(url_params, request: gr.Request):
133
  """
134
  onLoad
 
151
  # if "referer" in request.headers: # not work for huggingface-space
152
  # url_params = parse_qs(urlparse(request.headers["referer"]).query)
153
  # url_params = {k: v[0] for k, v in url_params.items() if len(v) > 0}
154
+ tokenizer_type_1 = url_params.get("tokenizer1", default_tokenizer_name_1)
155
+ tokenizer_type_2 = url_params.get("tokenizer2", default_tokenizer_name_2)
156
+ text = url_params.get("text", default_user_input)
157
  logger.info(f"client_ip: {client_ip}; params: {url_params}")
158
  return text, tokenizer_type_1, tokenizer_type_2
159
 
160
 
161
+ # def compress_rate_unit_change(unit):
162
+ # return gr.update(label=f"Compress Rate: {unit}"), gr.update(label=f"Compress Rate: {unit}"),
163
+
164
 
165
  def test_coding():
166
  bytes1 = b'\xe4\xb8\xad'
 
168
 
169
 
170
  if __name__ == "__main__":
171
+ print(get_overlap_token_size("gpt-35-turbo", "gpt-4"))
172
  # print(basic_count("internlm_chat_7b"))
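
As a standalone illustration of `get_overlap_token_size`, the sketch below computes vocabulary overlap with plain `transformers` instead of the project's `tokenizer_factory` (an assumption made for self-containedness; the set intersection is the same operation).

```python
# Standalone sketch of the overlap computation in get_overlap_token_size,
# using transformers directly instead of the project's tokenizer_factory.
from transformers import AutoTokenizer

def vocab_overlap(name_1: str, name_2: str) -> int:
    vocab_1 = AutoTokenizer.from_pretrained(name_1).get_vocab().keys()
    vocab_2 = AutoTokenizer.from_pretrained(name_2).get_vocab().keys()
    return len(vocab_1 & vocab_2)  # tokens present in both vocabularies

if __name__ == "__main__":
    print(vocab_overlap("gpt2", "bert-base-cased"))
```
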
requirements.txt CHANGED
@@ -6,4 +6,6 @@ torch
6
  zhon
7
  nltk
8
  boto3
9
- ai2-olmo==0.2.4
 
 
 
6
  zhon
7
  nltk
8
  boto3
9
+ ai2-olmo
10
+ ipadic
11
+ fugashi
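
The new `fugashi` (a MeCab wrapper) and `ipadic` (its dictionary) entries are runtime requirements of `transformers`' BertJapaneseTokenizer, which the character stats below cover; a quick sanity-check sketch:

```python
# Quick sanity check for the new Japanese dependencies: loading
# tohoku-nlp/bert-base-japanese pulls in fugashi + ipadic via
# BertJapaneseTokenizer's MeCab-based word segmentation.
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("tohoku-nlp/bert-base-japanese")
print(tok.tokenize("ラグビーワールドカップ2023フランス"))
```
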
stats/character_stats.json ADDED
@@ -0,0 +1,1712 @@
1
+ {
2
+ "FacebookAI/xlm-roberta-base": {
3
+ "tokenizer": "<a target=\"_blank\" href=\"https://huggingface.co/FacebookAI/xlm-roberta-base\" style=\"color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;\">xlm-roberta-base</a>",
4
+ "organization": "Facebook",
5
+ "vocab_size": 250002,
6
+ "num(digit)": 2728,
7
+ "len(digit)": "1,3,9",
8
+ "num(space)": 1,
9
+ "len(space)": "1,1,1",
10
+ "num(ar)": 14644,
11
+ "len(ar)": "1,4,16",
12
+ "num(zh)": 18457,
13
+ "len(zh)": "1,2,16",
14
+ "num(ja)": 20572,
15
+ "len(ja)": "1,2,16",
16
+ "num(ja-kana)": 3434,
17
+ "len(ja-kana)": "1,3,12",
18
+ "num(ko)": 5373,
19
+ "len(ko)": "1,2,8"
20
+ },
21
+ "clue/roberta_chinese_clue_tiny": {
22
+ "tokenizer": "<a target=\"_blank\" href=\"https://huggingface.co/clue/roberta_chinese_clue_tiny\" style=\"color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;\">roberta-chinese-clue</a>",
23
+ "organization": "CLUE",
24
+ "vocab_size": 8021,
25
+ "num(digit)": 230,
26
+ "len(digit)": "1,4,10",
27
+ "num(space)": 0,
28
+ "len(space)": "-",
29
+ "num(ar)": 30,
30
+ "len(ar)": "1,2,3",
31
+ "num(zh)": 5689,
32
+ "len(zh)": "1,1,1",
33
+ "num(ja)": 5691,
34
+ "len(ja)": "1,1,3",
35
+ "num(ja-kana)": 0,
36
+ "len(ja-kana)": "-",
37
+ "num(ko)": 0,
38
+ "len(ko)": "-"
39
+ },
40
+ "dbmdz/bert-base-german-uncased": {
41
+ "tokenizer": "<a target=\"_blank\" href=\"https://huggingface.co/dbmdz/bert-base-german-uncased\" style=\"color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;\">bert-base-german-uncased</a>",
42
+ "organization": "dbmdz",
43
+ "vocab_size": 31102,
44
+ "num(digit)": 1733,
45
+ "len(digit)": "1,4,12",
46
+ "num(space)": 0,
47
+ "len(space)": "-",
48
+ "num(ar)": 0,
49
+ "len(ar)": "-",
50
+ "num(zh)": 0,
51
+ "len(zh)": "-",
52
+ "num(ja)": 0,
53
+ "len(ja)": "-",
54
+ "num(ja-kana)": 0,
55
+ "len(ja-kana)": "-",
56
+ "num(ko)": 0,
57
+ "len(ko)": "-"
58
+ },
59
+ "google-bert/bert-base-cased": {
60
+ "tokenizer": "<a target=\"_blank\" href=\"https://huggingface.co/google-bert/bert-base-cased\" style=\"color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;\">bert-base-cased</a>",
61
+ "organization": "Google",
62
+ "vocab_size": 28996,
63
+ "num(digit)": 926,
64
+ "len(digit)": "1,4,11",
65
+ "num(space)": 0,
66
+ "len(space)": "-",
67
+ "num(ar)": 94,
68
+ "len(ar)": "1,3,4",
69
+ "num(zh)": 226,
70
+ "len(zh)": "1,2,3",
71
+ "num(ja)": 390,
72
+ "len(ja)": "1,2,3",
73
+ "num(ja-kana)": 164,
74
+ "len(ja-kana)": "1,2,3",
75
+ "num(ko)": 10,
76
+ "len(ko)": "1,2,3"
77
+ },
78
+ "google-bert/bert-base-chinese": {
79
+ "tokenizer": "<a target=\"_blank\" href=\"https://huggingface.co/google-bert/bert-base-chinese\" style=\"color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;\">bert-base-chinese</a>",
80
+ "organization": "Google",
81
+ "vocab_size": 21128,
82
+ "num(digit)": 1451,
83
+ "len(digit)": "1,3,12",
84
+ "num(space)": 2,
85
+ "len(space)": "1,2,3",
86
+ "num(ar)": 30,
87
+ "len(ar)": "1,2,3",
88
+ "num(zh)": 14642,
89
+ "len(zh)": "1,2,3",
90
+ "num(ja)": 15197,
91
+ "len(ja)": "1,3,15",
92
+ "num(ja-kana)": 553,
93
+ "len(ja-kana)": "1,3,15",
94
+ "num(ko)": 0,
95
+ "len(ko)": "-"
96
+ },
97
+ "google-bert/bert-base-german-cased": {
98
+ "tokenizer": "<a target=\"_blank\" href=\"https://huggingface.co/google-bert/bert-base-german-cased\" style=\"color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;\">bert-base-german-cased</a>",
99
+ "organization": "Google",
100
+ "vocab_size": 30000,
101
+ "num(digit)": 4065,
102
+ "len(digit)": "1,11,22",
103
+ "num(space)": 0,
104
+ "len(space)": "-",
105
+ "num(ar)": 0,
106
+ "len(ar)": "-",
107
+ "num(zh)": 0,
108
+ "len(zh)": "-",
109
+ "num(ja)": 0,
110
+ "len(ja)": "-",
111
+ "num(ja-kana)": 0,
112
+ "len(ja-kana)": "-",
113
+ "num(ko)": 0,
114
+ "len(ko)": "-"
115
+ },
116
+ "google-bert/bert-base-multilingual-cased": {
117
+ "tokenizer": "<a target=\"_blank\" href=\"https://huggingface.co/google-bert/bert-base-multilingual-cased\" style=\"color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;\">bert-base-multilingual-cased</a>",
118
+ "organization": "Google",
119
+ "vocab_size": 119547,
120
+ "num(digit)": 2583,
121
+ "len(digit)": "1,3,13",
122
+ "num(space)": 0,
123
+ "len(space)": "-",
124
+ "num(ar)": 4873,
125
+ "len(ar)": "1,5,14",
126
+ "num(zh)": 13542,
127
+ "len(zh)": "1,2,3",
128
+ "num(ja)": 14880,
129
+ "len(ja)": "1,3,10",
130
+ "num(ja-kana)": 1336,
131
+ "len(ja-kana)": "1,4,10",
132
+ "num(ko)": 3271,
133
+ "len(ko)": "1,3,6"
134
+ },
135
+ "google-bert/bert-base-multilingual-uncased": {
136
+ "tokenizer": "<a target=\"_blank\" href=\"https://huggingface.co/google-bert/bert-base-multilingual-uncased\" style=\"color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;\">bert-base-multilingual-uncased</a>",
137
+ "organization": "Google",
138
+ "vocab_size": 105879,
139
+ "num(digit)": 2510,
140
+ "len(digit)": "1,3,13",
141
+ "num(space)": 2,
142
+ "len(space)": "1,2,3",
143
+ "num(ar)": 4530,
144
+ "len(ar)": "1,5,13",
145
+ "num(zh)": 16658,
146
+ "len(zh)": "1,2,3",
147
+ "num(ja)": 17858,
148
+ "len(ja)": "1,3,10",
149
+ "num(ja-kana)": 1188,
150
+ "len(ja-kana)": "1,4,10",
151
+ "num(ko)": 0,
152
+ "len(ko)": "-"
153
+ },
154
+ "google-bert/bert-base-uncased": {
155
+ "tokenizer": "<a target=\"_blank\" href=\"https://huggingface.co/google-bert/bert-base-uncased\" style=\"color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;\">bert-base-uncased</a>",
156
+ "organization": "Google",
157
+ "vocab_size": 30522,
158
+ "num(digit)": 2056,
159
+ "len(digit)": "1,4,11",
160
+ "num(space)": 0,
161
+ "len(space)": "-",
162
+ "num(ar)": 88,
163
+ "len(ar)": "1,3,5",
164
+ "num(zh)": 488,
165
+ "len(zh)": "1,2,3",
166
+ "num(ja)": 676,
167
+ "len(ja)": "1,2,3",
168
+ "num(ja-kana)": 188,
169
+ "len(ja-kana)": "1,2,3",
170
+ "num(ko)": 0,
171
+ "len(ko)": "-"
172
+ },
173
+ "google/mobilebert-uncased": {
174
+ "tokenizer": "<a target=\"_blank\" href=\"https://huggingface.co/google/mobilebert-uncased\" style=\"color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;\">mobilebert-uncased</a>",
175
+ "organization": "Google",
176
+ "vocab_size": 30522,
177
+ "num(digit)": 2056,
178
+ "len(digit)": "1,4,11",
179
+ "num(space)": 0,
180
+ "len(space)": "-",
181
+ "num(ar)": 88,
182
+ "len(ar)": "1,3,5",
183
+ "num(zh)": 488,
184
+ "len(zh)": "1,2,3",
185
+ "num(ja)": 676,
186
+ "len(ja)": "1,2,3",
187
+ "num(ja-kana)": 188,
188
+ "len(ja-kana)": "1,2,3",
189
+ "num(ko)": 0,
190
+ "len(ko)": "-"
191
+ },
192
+ "tohoku-nlp/bert-base-japanese": {
193
+ "tokenizer": "<a target=\"_blank\" href=\"https://huggingface.co/tohoku-nlp/bert-base-japanese\" style=\"color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;\">bert-base-japanese</a>",
194
+ "organization": "Tohoku",
195
+ "vocab_size": 32000,
196
+ "num(digit)": 669,
197
+ "len(digit)": "1,3,5",
198
+ "num(space)": 0,
199
+ "len(space)": "-",
200
+ "num(ar)": 10,
201
+ "len(ar)": "1,3,3",
202
+ "num(zh)": 18792,
203
+ "len(zh)": "1,2,11",
204
+ "num(ja)": 28367,
205
+ "len(ja)": "1,2,13",
206
+ "num(ja-kana)": 12359,
207
+ "len(ja-kana)": "1,4,13",
208
+ "num(ko)": 0,
209
+ "len(ko)": "-"
210
+ },
211
+ "gpt-4": {
212
+ "tokenizer": "<a target=\"_blank\" href=\"https://github.com/openai/tiktoken\" style=\"color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;\">gpt-4</a>",
213
+ "organization": "OpenAI",
214
+ "vocab_size": 100277,
215
+ "num(digit)": 1110,
216
+ "len(digit)": "1,3,3",
217
+ "num(space)": 47472,
218
+ "len(space)": "1,7,128",
219
+ "num(ar)": 113,
220
+ "len(ar)": "1,2,10",
221
+ "num(zh)": 868,
222
+ "len(zh)": "1,1,7",
223
+ "num(ja)": 1035,
224
+ "len(ja)": "1,1,7",
225
+ "num(ja-kana)": 169,
226
+ "len(ja-kana)": "1,1,7",
227
+ "num(ko)": 299,
228
+ "len(ko)": "1,2,4"
229
+ },
230
+ "llama3": {
231
+ "tokenizer": "<a target=\"_blank\" href=\"https://huggingface.co/gradientai/Llama-3-8B-Instruct-Gradient-1048k\" style=\"color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;\">llama3</a>",
232
+ "organization": "Meta",
233
+ "vocab_size": 128256,
234
+ "num(digit)": 1110,
235
+ "len(digit)": "1,3,3",
236
+ "num(space)": 60860,
237
+ "len(space)": "1,6,128",
238
+ "num(ar)": 3810,
239
+ "len(ar)": "1,4,11",
240
+ "num(zh)": 4424,
241
+ "len(zh)": "1,1,7",
242
+ "num(ja)": 5387,
243
+ "len(ja)": "1,2,8",
244
+ "num(ja-kana)": 1086,
245
+ "len(ja-kana)": "1,2,8",
246
+ "num(ko)": 2281,
247
+ "len(ko)": "1,2,6"
248
+ },
249
+ "google-t5/t5-large": {
250
+ "tokenizer": "<a target=\"_blank\" href=\"https://huggingface.co/google-t5/t5-large\" style=\"color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;\">t5</a>",
251
+ "organization": "Google",
252
+ "vocab_size": 32100,
253
+ "num(digit)": 1133,
254
+ "len(digit)": "1,3,13",
255
+ "num(space)": 0,
256
+ "len(space)": "-",
257
+ "num(ar)": 0,
258
+ "len(ar)": "-",
259
+ "num(zh)": 0,
260
+ "len(zh)": "-",
261
+ "num(ja)": 0,
262
+ "len(ja)": "-",
263
+ "num(ja-kana)": 0,
264
+ "len(ja-kana)": "-",
265
+ "num(ko)": 0,
266
+ "len(ko)": "-"
267
+ },
268
+ "google/byt5-small": {
269
+ "tokenizer": "<a target=\"_blank\" href=\"https://huggingface.co/google/byt5-small\" style=\"color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;\">byt5-small</a>",
270
+ "organization": "Google",
271
+ "vocab_size": 384,
272
+ "num(digit)": 10,
273
+ "len(digit)": "1,1,1",
274
+ "num(space)": 10,
275
+ "len(space)": "1,1,1",
276
+ "num(ar)": 0,
277
+ "len(ar)": "-",
278
+ "num(zh)": 0,
279
+ "len(zh)": "-",
280
+ "num(ja)": 0,
281
+ "len(ja)": "-",
282
+ "num(ja-kana)": 0,
283
+ "len(ja-kana)": "-",
284
+ "num(ko)": 0,
285
+ "len(ko)": "-"
286
+ },
287
+ "google/mt5-large": {
288
+ "tokenizer": "<a target=\"_blank\" href=\"https://huggingface.co/google/mt5-large\" style=\"color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;\">mt5-large</a>",
289
+ "organization": "Google",
290
+ "vocab_size": 250100,
291
+ "num(digit)": 16829,
292
+ "len(digit)": "1,4,16",
293
+ "num(space)": 1,
294
+ "len(space)": "1,1,1",
295
+ "num(ar)": 7459,
296
+ "len(ar)": "1,3,16",
297
+ "num(zh)": 21489,
298
+ "len(zh)": "1,2,16",
299
+ "num(ja)": 27078,
300
+ "len(ja)": "1,2,16",
301
+ "num(ja-kana)": 9160,
302
+ "len(ja-kana)": "1,3,14",
303
+ "num(ko)": 4041,
304
+ "len(ko)": "1,1,10"
305
+ },
306
+ "lmsys/fastchat-t5-3b-v1.0": {
307
+ "tokenizer": "<a target=\"_blank\" href=\"https://huggingface.co/lmsys/fastchat-t5-3b-v1.0\" style=\"color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;\">fastchat-t5-3b-v1.0</a>",
308
+ "organization": "LMSYS",
309
+ "vocab_size": 32110,
310
+ "num(digit)": 1033,
311
+ "len(digit)": "1,3,8",
312
+ "num(space)": 0,
313
+ "len(space)": "-",
314
+ "num(ar)": 0,
315
+ "len(ar)": "-",
316
+ "num(zh)": 0,
317
+ "len(zh)": "-",
318
+ "num(ja)": 0,
319
+ "len(ja)": "-",
320
+ "num(ja-kana)": 0,
321
+ "len(ja-kana)": "-",
322
+ "num(ko)": 0,
323
+ "len(ko)": "-"
324
+ },
325
+ "paust/pko-t5-large": {
326
+ "tokenizer": "<a target=\"_blank\" href=\"https://huggingface.co/paust/pko-t5-large\" style=\"color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;\">pko-t5-large</a>",
327
+ "organization": "PAUST",
328
+ "vocab_size": 50358,
329
+ "num(digit)": 51,
330
+ "len(digit)": "1,2,3",
331
+ "num(space)": 10,
332
+ "len(space)": "1,1,1",
333
+ "num(ar)": 0,
334
+ "len(ar)": "-",
335
+ "num(zh)": 0,
336
+ "len(zh)": "-",
337
+ "num(ja)": 0,
338
+ "len(ja)": "-",
339
+ "num(ja-kana)": 0,
340
+ "len(ja-kana)": "-",
341
+ "num(ko)": 49050,
342
+ "len(ko)": "1,2,16"
343
+ },
344
+ "bloom": {
345
+ "tokenizer": "<a target=\"_blank\" href=\"https://huggingface.co/bigscience/bloom\" style=\"color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;\">bloom</a>",
346
+ "organization": "BigScience",
347
+ "vocab_size": 250680,
348
+ "num(digit)": 6629,
349
+ "len(digit)": "1,4,50",
350
+ "num(space)": 140180,
351
+ "len(space)": "1,6,600",
352
+ "num(ar)": 20854,
353
+ "len(ar)": "1,5,16",
354
+ "num(zh)": 30603,
355
+ "len(zh)": "1,2,23",
356
+ "num(ja)": 30816,
357
+ "len(ja)": "1,2,23",
358
+ "num(ja-kana)": 214,
359
+ "len(ja-kana)": "1,1,3",
360
+ "num(ko)": 338,
361
+ "len(ko)": "1,1,3"
362
+ },
363
+ "llama": {
364
+ "tokenizer": "<a target=\"_blank\" href=\"https://huggingface.co/huggyllama/llama-7b\" style=\"color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;\">llama</a>",
365
+ "organization": "Meta",
366
+ "vocab_size": 32000,
367
+ "num(digit)": 20,
368
+ "len(digit)": "1,1,1",
369
+ "num(space)": 61,
370
+ "len(space)": "1,2,15",
371
+ "num(ar)": 55,
372
+ "len(ar)": "1,1,2",
373
+ "num(zh)": 700,
374
+ "len(zh)": "1,1,1",
375
+ "num(ja)": 837,
376
+ "len(ja)": "1,1,1",
377
+ "num(ja-kana)": 137,
378
+ "len(ja-kana)": "1,1,1",
379
+ "num(ko)": 111,
380
+ "len(ko)": "1,1,1"
381
+ },
382
+ "ClueAI/ChatYuan-large-v2": {
383
+ "tokenizer": "<a target=\"_blank\" href=\"https://huggingface.co/ClueAI/ChatYuan-large-v2\" style=\"color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;\">ChatYuan-large-v2</a>",
384
+ "organization": "CLUE",
385
+ "vocab_size": 32128,
386
+ "num(digit)": 740,
387
+ "len(digit)": "1,3,9",
388
+ "num(space)": 0,
389
+ "len(space)": "-",
390
+ "num(ar)": 2,
391
+ "len(ar)": "1,1,1",
392
+ "num(zh)": 29591,
393
+ "len(zh)": "1,2,16",
394
+ "num(ja)": 29736,
395
+ "len(ja)": "1,2,16",
396
+ "num(ja-kana)": 145,
397
+ "len(ja-kana)": "1,1,2",
398
+ "num(ko)": 0,
399
+ "len(ko)": "-"
400
+ },
401
+ "Meta/llama3": {
402
+ "tokenizer": "<a target=\"_blank\" href=\"https://huggingface.co/gradientai/Llama-3-8B-Instruct-Gradient-1048k\" style=\"color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;\">llama3</a>",
403
+ "organization": "Meta",
404
+ "vocab_size": 128256,
405
+ "num(digit)": 1110,
406
+ "len(digit)": "1,3,3",
407
+ "num(space)": 60860,
408
+ "len(space)": "1,6,128",
409
+ "num(ar)": 3810,
410
+ "len(ar)": "1,4,11",
411
+ "num(zh)": 4424,
412
+ "len(zh)": "1,1,7",
413
+ "num(ja)": 5387,
414
+ "len(ja)": "1,2,8",
415
+ "num(ja-kana)": 1086,
416
+ "len(ja-kana)": "1,2,8",
417
+ "num(ko)": 2281,
418
+ "len(ko)": "1,2,6"
419
+ },
420
+ "openai/gpt-4": {
421
+ "tokenizer": "<a target=\"_blank\" href=\"https://github.com/openai/tiktoken\" style=\"color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;\">gpt-4</a>",
422
+ "organization": "OpenAI",
423
+ "vocab_size": 100277,
424
+ "num(digit)": 1110,
425
+ "len(digit)": "1,3,3",
426
+ "num(space)": 47472,
427
+ "len(space)": "1,7,128",
428
+ "num(ar)": 113,
429
+ "len(ar)": "1,2,10",
430
+ "num(zh)": 868,
431
+ "len(zh)": "1,1,7",
432
+ "num(ja)": 1035,
433
+ "len(ja)": "1,1,7",
434
+ "num(ja-kana)": 169,
435
+ "len(ja-kana)": "1,1,7",
436
+ "num(ko)": 299,
437
+ "len(ko)": "1,2,4"
438
+ },
439
+ "gradientai/Llama-3-8B-Instruct-Gradient-1048k": {
440
+ "tokenizer": "<a target=\"_blank\" href=\"https://huggingface.co/gradientai/Llama-3-8B-Instruct-Gradient-1048k\" style=\"color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;\">llama3</a>",
441
+ "organization": "Meta",
442
+ "vocab_size": 128256,
443
+ "num(digit)": 1110,
444
+ "len(digit)": "1,3,3",
445
+ "num(space)": 60860,
446
+ "len(space)": "1,6,128",
447
+ "num(ar)": 3810,
448
+ "len(ar)": "1,4,11",
449
+ "num(zh)": 4424,
450
+ "len(zh)": "1,1,7",
451
+ "num(ja)": 5387,
452
+ "len(ja)": "1,2,8",
453
+ "num(ja-kana)": 1086,
454
+ "len(ja-kana)": "1,2,8",
455
+ "num(ko)": 2281,
456
+ "len(ko)": "1,2,6"
457
+ },
458
+ "bigscience/bloom": {
459
+ "tokenizer": "<a target=\"_blank\" href=\"https://huggingface.co/bigscience/bloom\" style=\"color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;\">bloom</a>",
460
+ "organization": "BigScience",
461
+ "vocab_size": 250680,
462
+ "num(digit)": 6629,
463
+ "len(digit)": "1,4,50",
464
+ "num(space)": 140180,
465
+ "len(space)": "1,6,600",
466
+ "num(ar)": 20854,
467
+ "len(ar)": "1,5,16",
468
+ "num(zh)": 30603,
469
+ "len(zh)": "1,2,23",
470
+ "num(ja)": 30816,
471
+ "len(ja)": "1,2,23",
472
+ "num(ja-kana)": 214,
473
+ "len(ja-kana)": "1,1,3",
474
+ "num(ko)": 338,
475
+ "len(ko)": "1,1,3"
476
+ },
477
+ "huggyllama/llama-7b": {
478
+ "tokenizer": "<a target=\"_blank\" href=\"https://huggingface.co/huggyllama/llama-7b\" style=\"color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;\">llama</a>",
479
+ "organization": "Meta",
480
+ "vocab_size": 32000,
481
+ "num(digit)": 20,
482
+ "len(digit)": "1,1,1",
483
+ "num(space)": 61,
484
+ "len(space)": "1,2,15",
485
+ "num(ar)": 55,
486
+ "len(ar)": "1,1,2",
487
+ "num(zh)": 700,
488
+ "len(zh)": "1,1,1",
489
+ "num(ja)": 837,
490
+ "len(ja)": "1,1,1",
491
+ "num(ja-kana)": 137,
492
+ "len(ja-kana)": "1,1,1",
493
+ "num(ko)": 111,
494
+ "len(ko)": "1,1,1"
495
+ },
496
+ "baichuan-inc/Baichuan-7B": {
497
+ "tokenizer": "<a target=\"_blank\" href=\"https://huggingface.co/baichuan-inc/Baichuan-7B\" style=\"color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;\">baichuan</a>",
498
+ "organization": "Baichuan",
499
+ "vocab_size": 64000,
500
+ "num(digit)": 335,
501
+ "len(digit)": "1,14,14",
502
+ "num(space)": 13,
503
+ "len(space)": "1,1,1",
504
+ "num(ar)": 299,
505
+ "len(ar)": "1,1,2",
506
+ "num(zh)": 27676,
507
+ "len(zh)": "1,1,9",
508
+ "num(ja)": 28522,
509
+ "len(ja)": "1,1,9",
510
+ "num(ja-kana)": 178,
511
+ "len(ja-kana)": "1,1,1",
512
+ "num(ko)": 1591,
513
+ "len(ko)": "1,1,1"
514
+ },
515
+ "01-ai/Yi-34B": {
516
+ "tokenizer": "<a target=\"_blank\" href=\"https://huggingface.co/01-ai/Yi-34B\" style=\"color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;\">Yi-34B</a>",
517
+ "organization": "Yi",
518
+ "vocab_size": 64000,
519
+ "num(digit)": 200,
520
+ "len(digit)": "1,13,15",
521
+ "num(space)": 24274,
522
+ "len(space)": "1,7,16",
523
+ "num(ar)": 18,
524
+ "len(ar)": "1,1,4",
525
+ "num(zh)": 21356,
526
+ "len(zh)": "1,2,12",
527
+ "num(ja)": 21407,
528
+ "len(ja)": "1,2,12",
529
+ "num(ja-kana)": 51,
530
+ "len(ja-kana)": "1,1,2",
531
+ "num(ko)": 28,
532
+ "len(ko)": "1,1,2"
533
+ },
534
+ "01-ai/Yi-6B": {
535
+ "tokenizer": "<a target=\"_blank\" href=\"https://huggingface.co/01-ai/Yi-6B\" style=\"color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;\">Yi-6B</a>",
536
+ "organization": "Yi",
537
+ "vocab_size": 64000,
538
+ "num(digit)": 200,
539
+ "len(digit)": "1,13,15",
540
+ "num(space)": 24274,
541
+ "len(space)": "1,7,16",
542
+ "num(ar)": 18,
543
+ "len(ar)": "1,1,4",
544
+ "num(zh)": 21356,
545
+ "len(zh)": "1,2,12",
546
+ "num(ja)": 21407,
547
+ "len(ja)": "1,2,12",
548
+ "num(ja-kana)": 51,
549
+ "len(ja-kana)": "1,1,2",
550
+ "num(ko)": 28,
551
+ "len(ko)": "1,1,2"
552
+ },
553
+ "01-ai/Yi-VL-34B": {
554
+ "tokenizer": "<a target=\"_blank\" href=\"https://huggingface.co/01-ai/Yi-VL-34B\" style=\"color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;\">Yi-VL-34B</a>",
555
+ "organization": "Yi",
556
+ "vocab_size": 64000,
557
+ "num(digit)": 200,
558
+ "len(digit)": "1,13,15",
559
+ "num(space)": 43,
560
+ "len(space)": "1,2,15",
561
+ "num(ar)": 18,
562
+ "len(ar)": "1,1,4",
563
+ "num(zh)": 21356,
564
+ "len(zh)": "1,2,12",
565
+ "num(ja)": 21407,
566
+ "len(ja)": "1,2,12",
567
+ "num(ja-kana)": 51,
568
+ "len(ja-kana)": "1,1,2",
569
+ "num(ko)": 28,
570
+ "len(ko)": "1,1,2"
571
+ },
572
+ "ClassCat/gpt2-base-french": {
573
+ "tokenizer": "<a target=\"_blank\" href=\"https://huggingface.co/ClassCat/gpt2-base-french\" style=\"color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;\">gpt2-base-french</a>",
574
+ "organization": "ClassCat",
575
+ "vocab_size": 50000,
576
+ "num(digit)": 1833,
577
+ "len(digit)": "1,4,5",
578
+ "num(space)": 31889,
579
+ "len(space)": "1,7,32",
580
+ "num(ar)": 41,
581
+ "len(ar)": "1,1,4",
582
+ "num(zh)": 27,
583
+ "len(zh)": "1,1,1",
584
+ "num(ja)": 46,
585
+ "len(ja)": "1,1,2",
586
+ "num(ja-kana)": 19,
587
+ "len(ja-kana)": "1,1,2",
588
+ "num(ko)": 0,
589
+ "len(ko)": "-"
590
+ },
591
+ "ClassCat/gpt2-base-spanish": {
592
+ "tokenizer": "<a target=\"_blank\" href=\"https://huggingface.co/ClassCat/gpt2-base-spanish\" style=\"color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;\">gpt2-base-spanish</a>",
593
+ "organization": "ClassCat",
594
+ "vocab_size": 50000,
595
+ "num(digit)": 1492,
596
+ "len(digit)": "1,4,9",
597
+ "num(space)": 34496,
598
+ "len(space)": "1,8,32",
599
+ "num(ar)": 36,
600
+ "len(ar)": "1,1,4",
601
+ "num(zh)": 13,
602
+ "len(zh)": "1,1,1",
603
+ "num(ja)": 36,
604
+ "len(ja)": "1,1,2",
605
+ "num(ja-kana)": 23,
606
+ "len(ja-kana)": "1,1,2",
607
+ "num(ko)": 0,
608
+ "len(ko)": "-"
609
+ },
610
+ "ClueAI/PromptCLUE-base": {
611
+ "tokenizer": "<a target=\"_blank\" href=\"https://huggingface.co/ClueAI/PromptCLUE-base\" style=\"color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;\">PromptCLUE-base</a>",
612
+ "organization": "CLUE",
613
+ "vocab_size": 32128,
614
+ "num(digit)": 740,
615
+ "len(digit)": "1,3,9",
616
+ "num(space)": 0,
617
+ "len(space)": "-",
618
+ "num(ar)": 2,
619
+ "len(ar)": "1,1,1",
620
+ "num(zh)": 29591,
621
+ "len(zh)": "1,2,16",
622
+ "num(ja)": 29736,
623
+ "len(ja)": "1,2,16",
624
+ "num(ja-kana)": 145,
625
+ "len(ja-kana)": "1,1,2",
626
+ "num(ko)": 0,
627
+ "len(ko)": "-"
628
+ },
629
+ "CohereForAI/aya-101": {
630
+ "tokenizer": "<a target=\"_blank\" href=\"https://huggingface.co/CohereForAI/aya-101\" style=\"color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;\">aya-101</a>",
631
+ "organization": "Cohere For AI",
632
+ "vocab_size": 250100,
633
+ "num(digit)": 16829,
634
+ "len(digit)": "1,4,16",
635
+ "num(space)": 1,
636
+ "len(space)": "1,1,1",
637
+ "num(ar)": 7459,
638
+ "len(ar)": "1,3,16",
639
+ "num(zh)": 21489,
640
+ "len(zh)": "1,2,16",
641
+ "num(ja)": 27078,
642
+ "len(ja)": "1,2,16",
643
+ "num(ja-kana)": 9160,
644
+ "len(ja-kana)": "1,3,14",
645
+ "num(ko)": 4041,
646
+ "len(ko)": "1,1,10"
647
+ },
648
+ "EleutherAI/gpt-neox-20b": {
649
+ "tokenizer": "<a target=\"_blank\" href=\"https://huggingface.co/EleutherAI/gpt-neox-20b\" style=\"color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;\">gpt-neox-20b</a>",
650
+ "organization": "EleutherAI",
651
+ "vocab_size": 50277,
652
+ "num(digit)": 2036,
653
+ "len(digit)": "1,3,35",
654
+ "num(space)": 28996,
655
+ "len(space)": "1,7,512",
656
+ "num(ar)": 94,
657
+ "len(ar)": "1,2,4",
658
+ "num(zh)": 313,
659
+ "len(zh)": "1,1,2",
660
+ "num(ja)": 480,
661
+ "len(ja)": "1,1,4",
662
+ "num(ja-kana)": 167,
663
+ "len(ja-kana)": "1,1,4",
664
+ "num(ko)": 25,
665
+ "len(ko)": "1,1,2"
666
+ },
667
+ "HuggingFaceH4/starchat-alpha": {
668
+ "tokenizer": "<a target=\"_blank\" href=\"https://huggingface.co/HuggingFaceH4/starchat-alpha\" style=\"color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;\">starchat-alpha</a>",
669
+ "organization": "-",
670
+ "vocab_size": 49156,
671
+ "num(digit)": 10,
672
+ "len(digit)": "1,1,1",
673
+ "num(space)": 16515,
674
+ "len(space)": "1,6,256",
675
+ "num(ar)": 84,
676
+ "len(ar)": "1,2,4",
677
+ "num(zh)": 2030,
678
+ "len(zh)": "1,1,7",
679
+ "num(ja)": 2368,
680
+ "len(ja)": "1,1,8",
681
+ "num(ja-kana)": 360,
682
+ "len(ja-kana)": "1,2,8",
683
+ "num(ko)": 491,
684
+ "len(ko)": "1,2,5"
685
+ },
686
+ "HuggingFaceH4/zephyr-7b-beta": {
687
+ "tokenizer": "<a target=\"_blank\" href=\"https://huggingface.co/HuggingFaceH4/zephyr-7b-beta\" style=\"color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;\">zephyr-7b-beta</a>",
688
+ "organization": "HuggingFace",
689
+ "vocab_size": 32000,
690
+ "num(digit)": 20,
691
+ "len(digit)": "1,1,1",
692
+ "num(space)": 85,
693
+ "len(space)": "1,3,15",
694
+ "num(ar)": 71,
695
+ "len(ar)": "1,1,2",
696
+ "num(zh)": 1459,
697
+ "len(zh)": "1,1,2",
698
+ "num(ja)": 1593,
699
+ "len(ja)": "1,1,2",
700
+ "num(ja-kana)": 134,
701
+ "len(ja-kana)": "1,1,1",
702
+ "num(ko)": 346,
703
+ "len(ko)": "1,1,1"
704
+ },
705
+ "LLM360/CrystalCoder": {
706
+ "tokenizer": "<a target=\"_blank\" href=\"https://huggingface.co/LLM360/CrystalCoder\" style=\"color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;\">CrystalCoder</a>",
707
+ "organization": "MBZUAI",
708
+ "vocab_size": 32022,
709
+ "num(digit)": 20,
710
+ "len(digit)": "1,1,1",
711
+ "num(space)": 61,
712
+ "len(space)": "1,2,15",
713
+ "num(ar)": 55,
714
+ "len(ar)": "1,1,2",
715
+ "num(zh)": 700,
716
+ "len(zh)": "1,1,1",
717
+ "num(ja)": 837,
718
+ "len(ja)": "1,1,1",
719
+ "num(ja-kana)": 137,
720
+ "len(ja-kana)": "1,1,1",
721
+ "num(ko)": 111,
722
+ "len(ko)": "1,1,1"
723
+ },
724
+ "NousResearch/Llama-2-7b-chat-hf": {
725
+ "tokenizer": "<a target=\"_blank\" href=\"https://huggingface.co/NousResearch/Llama-2-7b-chat-hf\" style=\"color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;\">llama2</a>",
726
+ "organization": "Meta",
727
+ "vocab_size": 32001,
728
+ "num(digit)": 20,
729
+ "len(digit)": "1,1,1",
730
+ "num(space)": 61,
731
+ "len(space)": "1,2,15",
732
+ "num(ar)": 55,
733
+ "len(ar)": "1,1,2",
734
+ "num(zh)": 700,
735
+ "len(zh)": "1,1,1",
736
+ "num(ja)": 837,
737
+ "len(ja)": "1,1,1",
738
+ "num(ja-kana)": 137,
739
+ "len(ja-kana)": "1,1,1",
740
+ "num(ko)": 111,
741
+ "len(ko)": "1,1,1"
742
+ },
743
+ "OrionStarAI/Orion-14B-Chat": {
744
+ "tokenizer": "<a target=\"_blank\" href=\"https://huggingface.co/OrionStarAI/Orion-14B-Chat\" style=\"color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;\">Orion-14B-Chat</a>",
745
+ "organization": "OrionStar",
746
+ "vocab_size": 84608,
747
+ "num(digit)": 1559,
748
+ "len(digit)": "1,4,14",
749
+ "num(space)": 18383,
750
+ "len(space)": "1,6,16",
751
+ "num(ar)": 102,
752
+ "len(ar)": "1,1,1",
753
+ "num(zh)": 46998,
754
+ "len(zh)": "1,2,16",
755
+ "num(ja)": 49644,
756
+ "len(ja)": "1,2,16",
757
+ "num(ja-kana)": 2987,
758
+ "len(ja-kana)": "1,3,11",
759
+ "num(ko)": 5110,
760
+ "len(ko)": "1,2,7"
761
+ },
762
+ "Qwen/Qwen-7B-Chat": {
763
+ "tokenizer": "<a target=\"_blank\" href=\"https://huggingface.co/Qwen/Qwen-7B-Chat\" style=\"color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;\">Qwen</a>",
764
+ "organization": "Alibaba",
765
+ "vocab_size": 151851,
766
+ "num(digit)": 10,
767
+ "len(digit)": "1,1,1",
768
+ "num(space)": 55883,
769
+ "len(space)": "1,6,128",
770
+ "num(ar)": 4018,
771
+ "len(ar)": "1,3,12",
772
+ "num(zh)": 25557,
773
+ "len(zh)": "1,2,7",
774
+ "num(ja)": 27206,
775
+ "len(ja)": "1,2,11",
776
+ "num(ja-kana)": 2089,
777
+ "len(ja-kana)": "1,3,11",
778
+ "num(ko)": 3495,
779
+ "len(ko)": "1,1,5"
780
+ },
781
+ "Qwen/Qwen1.5-14B-Chat": {
782
+ "tokenizer": "<a target=\"_blank\" href=\"https://huggingface.co/Qwen/Qwen1.5-14B-Chat\" style=\"color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;\">Qwen1.5</a>",
783
+ "organization": "Alibaba",
784
+ "vocab_size": 151646,
785
+ "num(digit)": 10,
786
+ "len(digit)": "1,1,1",
787
+ "num(space)": 55883,
788
+ "len(space)": "1,6,128",
789
+ "num(ar)": 4018,
790
+ "len(ar)": "1,3,12",
791
+ "num(zh)": 25557,
792
+ "len(zh)": "1,2,7",
793
+ "num(ja)": 27206,
794
+ "len(ja)": "1,2,11",
795
+ "num(ja-kana)": 2089,
796
+ "len(ja-kana)": "1,3,11",
797
+ "num(ko)": 3495,
798
+ "len(ko)": "1,1,5"
799
+ },
800
+ "Skywork/Skywork-13B-Math": {
801
+ "tokenizer": "<a target=\"_blank\" href=\"https://huggingface.co/Skywork/Skywork-13B-Math\" style=\"color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;\">Skywork-13B-Math</a>",
802
+ "organization": "Kunlun",
803
+ "vocab_size": 65519,
804
+ "num(digit)": 20,
805
+ "len(digit)": "1,1,1",
806
+ "num(space)": 62,
807
+ "len(space)": "1,2,15",
808
+ "num(ar)": 56,
809
+ "len(ar)": "1,1,2",
810
+ "num(zh)": 33913,
811
+ "len(zh)": "1,2,5",
812
+ "num(ja)": 34064,
813
+ "len(ja)": "1,2,5",
814
+ "num(ja-kana)": 150,
815
+ "len(ja-kana)": "1,1,1",
816
+ "num(ko)": 111,
817
+ "len(ko)": "1,1,1"
818
+ },
819
+ "Skywork/Skywork-13B-base": {
820
+ "tokenizer": "<a target=\"_blank\" href=\"https://huggingface.co/Skywork/Skywork-13B-base\" style=\"color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;\">Skywork-13B-base</a>",
821
+ "organization": "Kunlun",
822
+ "vocab_size": 65519,
823
+ "num(digit)": 20,
824
+ "len(digit)": "1,1,1",
825
+ "num(space)": 62,
826
+ "len(space)": "1,2,15",
827
+ "num(ar)": 56,
828
+ "len(ar)": "1,1,2",
829
+ "num(zh)": 33913,
830
+ "len(zh)": "1,2,5",
831
+ "num(ja)": 34064,
832
+ "len(ja)": "1,2,5",
833
+ "num(ja-kana)": 150,
834
+ "len(ja-kana)": "1,1,1",
835
+ "num(ko)": 111,
836
+ "len(ko)": "1,1,1"
837
+ },
838
+ "THUDM/chatglm-6b": {
839
+ "tokenizer": "<a target=\"_blank\" href=\"https://huggingface.co/THUDM/chatglm-6b\" style=\"color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;\">chatglm-6b</a>",
840
+ "organization": "Tsinghua",
841
+ "vocab_size": 130344,
842
+ "num(digit)": 20,
843
+ "len(digit)": "1,1,1",
844
+ "num(space)": 93,
845
+ "len(space)": "1,34,80",
846
+ "num(ar)": 137,
847
+ "len(ar)": "1,2,4",
848
+ "num(zh)": 61358,
849
+ "len(zh)": "1,2,16",
850
+ "num(ja)": 61784,
851
+ "len(ja)": "1,2,16",
852
+ "num(ja-kana)": 439,
853
+ "len(ja-kana)": "1,2,5",
854
+ "num(ko)": 114,
855
+ "len(ko)": "1,1,3"
856
+ },
857
+ "THUDM/chatglm2-6b": {
858
+ "tokenizer": "<a target=\"_blank\" href=\"https://huggingface.co/THUDM/chatglm2-6b\" style=\"color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;\">chatglm2-6b</a>",
859
+ "organization": "Tsinghua",
860
+ "vocab_size": 64787,
861
+ "num(digit)": 20,
862
+ "len(digit)": "1,1,1",
863
+ "num(space)": 67,
864
+ "len(space)": "1,2,15",
865
+ "num(ar)": 57,
866
+ "len(ar)": "1,1,2",
867
+ "num(zh)": 30922,
868
+ "len(zh)": "1,2,16",
869
+ "num(ja)": 31065,
870
+ "len(ja)": "1,2,16",
871
+ "num(ja-kana)": 143,
872
+ "len(ja-kana)": "1,1,1",
873
+ "num(ko)": 604,
874
+ "len(ko)": "1,1,1"
875
+ },
876
+ "THUDM/chatglm3-6b": {
877
+ "tokenizer": "<a target=\"_blank\" href=\"https://huggingface.co/THUDM/chatglm3-6b\" style=\"color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;\">chatglm3-6b</a>",
878
+ "organization": "Tsinghua",
879
+ "vocab_size": 64796,
880
+ "num(digit)": 20,
881
+ "len(digit)": "1,1,1",
882
+ "num(space)": 67,
883
+ "len(space)": "1,2,15",
884
+ "num(ar)": 57,
885
+ "len(ar)": "1,1,2",
886
+ "num(zh)": 30922,
887
+ "len(zh)": "1,2,16",
888
+ "num(ja)": 31065,
889
+ "len(ja)": "1,2,16",
890
+ "num(ja-kana)": 143,
891
+ "len(ja-kana)": "1,1,1",
892
+ "num(ko)": 604,
893
+ "len(ko)": "1,1,1"
894
+ },
895
+ "TigerResearch/tigerbot-13b-chat-v2": {
896
+ "tokenizer": "<a target=\"_blank\" href=\"https://huggingface.co/TigerResearch/tigerbot-13b-chat-v2\" style=\"color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;\">tigerbot-13b-chat-v2</a>",
897
+ "organization": "Tigerobo",
898
+ "vocab_size": 60515,
899
+ "num(digit)": 20,
900
+ "len(digit)": "1,1,1",
901
+ "num(space)": 61,
902
+ "len(space)": "1,2,15",
903
+ "num(ar)": 55,
904
+ "len(ar)": "1,1,2",
905
+ "num(zh)": 28603,
906
+ "len(zh)": "1,2,16",
907
+ "num(ja)": 28770,
908
+ "len(ja)": "1,2,16",
909
+ "num(ja-kana)": 167,
910
+ "len(ja-kana)": "1,1,2",
911
+ "num(ko)": 261,
912
+ "len(ko)": "1,1,1"
913
+ },
914
+ "TigerResearch/tigerbot-70b-chat-v4-4k": {
915
+ "tokenizer": "<a target=\"_blank\" href=\"https://huggingface.co/TigerResearch/tigerbot-70b-chat-v4-4k\" style=\"color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;\">tigerbot-70b-chat-v4-4k</a>",
916
+ "organization": "Tigerobo",
917
+ "vocab_size": 65110,
918
+ "num(digit)": 20,
919
+ "len(digit)": "1,1,1",
920
+ "num(space)": 61,
921
+ "len(space)": "1,2,15",
922
+ "num(ar)": 55,
923
+ "len(ar)": "1,1,2",
924
+ "num(zh)": 30509,
925
+ "len(zh)": "1,2,16",
926
+ "num(ja)": 32061,
927
+ "len(ja)": "1,2,16",
928
+ "num(ja-kana)": 2071,
929
+ "len(ja-kana)": "1,2,8",
930
+ "num(ko)": 1504,
931
+ "len(ko)": "1,1,5"
932
+ },
933
+ "Upstage/SOLAR-10.7B-v1.0": {
934
+ "tokenizer": "<a target=\"_blank\" href=\"https://huggingface.co/Upstage/SOLAR-10.7B-v1.0\" style=\"color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;\">SOLAR-10.7B-v1.0</a>",
935
+ "organization": "-",
936
+ "vocab_size": 32000,
937
+ "num(digit)": 20,
938
+ "len(digit)": "1,1,1",
939
+ "num(space)": 85,
940
+ "len(space)": "1,3,15",
941
+ "num(ar)": 71,
942
+ "len(ar)": "1,1,2",
943
+ "num(zh)": 1459,
944
+ "len(zh)": "1,1,2",
945
+ "num(ja)": 1593,
946
+ "len(ja)": "1,1,2",
947
+ "num(ja-kana)": 134,
948
+ "len(ja-kana)": "1,1,1",
949
+ "num(ko)": 346,
950
+ "len(ko)": "1,1,1"
951
+ },
952
+ "WizardLM/WizardCoder-15B-V1.0": {
953
+ "tokenizer": "<a target=\"_blank\" href=\"https://huggingface.co/WizardLM/WizardCoder-15B-V1.0\" style=\"color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;\">WizardCoder-15B-V1.0</a>",
954
+ "organization": "Microsoft",
955
+ "vocab_size": 49153,
956
+ "num(digit)": 10,
957
+ "len(digit)": "1,1,1",
958
+ "num(space)": 16515,
959
+ "len(space)": "1,6,256",
960
+ "num(ar)": 84,
961
+ "len(ar)": "1,2,4",
962
+ "num(zh)": 2030,
963
+ "len(zh)": "1,1,7",
964
+ "num(ja)": 2368,
965
+ "len(ja)": "1,1,8",
966
+ "num(ja-kana)": 360,
967
+ "len(ja-kana)": "1,2,8",
968
+ "num(ko)": 491,
969
+ "len(ko)": "1,2,5"
970
+ },
971
+ "WizardLM/WizardCoder-Python-7B-V1.0": {
972
+ "tokenizer": "<a target=\"_blank\" href=\"https://huggingface.co/WizardLM/WizardCoder-Python-7B-V1.0\" style=\"color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;\">WizardCoder-Python-7B-V1.0</a>",
973
+ "organization": "Microsoft",
974
+ "vocab_size": 32001,
975
+ "num(digit)": 20,
976
+ "len(digit)": "1,1,1",
977
+ "num(space)": 61,
978
+ "len(space)": "1,2,15",
979
+ "num(ar)": 55,
980
+ "len(ar)": "1,1,2",
981
+ "num(zh)": 700,
982
+ "len(zh)": "1,1,1",
983
+ "num(ja)": 837,
984
+ "len(ja)": "1,1,1",
985
+ "num(ja-kana)": 137,
986
+ "len(ja-kana)": "1,1,1",
987
+ "num(ko)": 111,
988
+ "len(ko)": "1,1,1"
989
+ },
990
+ "WizardLM/WizardLM-7B-V1.0": {
991
+ "tokenizer": "<a target=\"_blank\" href=\"https://huggingface.co/WizardLM/WizardLM-7B-V1.0\" style=\"color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;\">WizardLM-7B-V1.0</a>",
992
+ "organization": "Microsoft",
993
+ "vocab_size": 32001,
994
+ "num(digit)": 20,
995
+ "len(digit)": "1,1,1",
996
+ "num(space)": 61,
997
+ "len(space)": "1,2,15",
998
+ "num(ar)": 55,
999
+ "len(ar)": "1,1,2",
1000
+ "num(zh)": 700,
1001
+ "len(zh)": "1,1,1",
1002
+ "num(ja)": 837,
1003
+ "len(ja)": "1,1,1",
1004
+ "num(ja-kana)": 137,
1005
+ "len(ja-kana)": "1,1,1",
1006
+ "num(ko)": 111,
1007
+ "len(ko)": "1,1,1"
1008
+ },
1009
+ "WizardLM/WizardMath-70B-V1.0": {
1010
+ "tokenizer": "<a target=\"_blank\" href=\"https://huggingface.co/WizardLM/WizardMath-70B-V1.0\" style=\"color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;\">WizardMath-70B-V1.0</a>",
1011
+ "organization": "Microsoft",
1012
+ "vocab_size": 32002,
1013
+ "num(digit)": 20,
1014
+ "len(digit)": "1,1,1",
1015
+ "num(space)": 61,
1016
+ "len(space)": "1,2,15",
1017
+ "num(ar)": 55,
1018
+ "len(ar)": "1,1,2",
1019
+ "num(zh)": 700,
1020
+ "len(zh)": "1,1,1",
1021
+ "num(ja)": 837,
1022
+ "len(ja)": "1,1,1",
1023
+ "num(ja-kana)": 137,
1024
+ "len(ja-kana)": "1,1,1",
1025
+ "num(ko)": 111,
1026
+ "len(ko)": "1,1,1"
1027
+ },
1028
+ "abeja/gpt-neox-japanese-2.7b": {
1029
+ "tokenizer": "<a target=\"_blank\" href=\"https://huggingface.co/abeja/gpt-neox-japanese-2.7b\" style=\"color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;\">gpt-neox-japanese-2.7b</a>",
1030
+ "organization": "ABEJA",
1031
+ "vocab_size": 32000,
1032
+ "num(digit)": 20,
1033
+ "len(digit)": "1,1,1",
1034
+ "num(space)": 0,
1035
+ "len(space)": "-",
1036
+ "num(ar)": 0,
1037
+ "len(ar)": "-",
1038
+ "num(zh)": 15176,
1039
+ "len(zh)": "1,2,2",
1040
+ "num(ja)": 31482,
1041
+ "len(ja)": "1,2,3",
1042
+ "num(ja-kana)": 16306,
1043
+ "len(ja-kana)": "1,3,3",
1044
+ "num(ko)": 0,
1045
+ "len(ko)": "-"
1046
+ },
1047
+ "ai21labs/Jamba-v0.1": {
1048
+ "tokenizer": "<a target=\"_blank\" href=\"https://huggingface.co/ai21labs/Jamba-v0.1\" style=\"color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;\">Jamba-v0.1</a>",
1049
+ "organization": "AI21",
1050
+ "vocab_size": 65536,
1051
+ "num(digit)": 1556,
1052
+ "len(digit)": "1,16,17",
1053
+ "num(space)": 39501,
1054
+ "len(space)": "1,7,32",
1055
+ "num(ar)": 867,
1056
+ "len(ar)": "1,3,8",
1057
+ "num(zh)": 1157,
1058
+ "len(zh)": "1,1,2",
1059
+ "num(ja)": 1287,
1060
+ "len(ja)": "1,1,2",
1061
+ "num(ja-kana)": 130,
1062
+ "len(ja-kana)": "1,1,2",
1063
+ "num(ko)": 312,
1064
+ "len(ko)": "1,1,2"
1065
+ },
1066
+ "allenai/OLMo-7B": {
1067
+ "tokenizer": "<a target=\"_blank\" href=\"https://huggingface.co/allenai/OLMo-7B\" style=\"color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;\">OLMo-7B</a>",
1068
+ "organization": "Allen AI",
1069
+ "vocab_size": 50280,
1070
+ "num(digit)": 2036,
1071
+ "len(digit)": "1,3,35",
1072
+ "num(space)": 29019,
1073
+ "len(space)": "1,7,512",
1074
+ "num(ar)": 94,
1075
+ "len(ar)": "1,2,4",
1076
+ "num(zh)": 313,
1077
+ "len(zh)": "1,1,2",
1078
+ "num(ja)": 480,
1079
+ "len(ja)": "1,1,4",
1080
+ "num(ja-kana)": 167,
1081
+ "len(ja-kana)": "1,1,4",
1082
+ "num(ko)": 25,
1083
+ "len(ko)": "1,1,2"
1084
+ },
1085
+ "baichuan-inc/Baichuan2-7B-Chat": {
1086
+ "tokenizer": "<a target=\"_blank\" href=\"https://huggingface.co/baichuan-inc/Baichuan2-7B-Chat\" style=\"color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;\">baichuan2</a>",
1087
+ "organization": "Baichuan",
1088
+ "vocab_size": 125696,
1089
+ "num(digit)": 1023,
1090
+ "len(digit)": "1,14,14",
1091
+ "num(space)": 26013,
1092
+ "len(space)": "1,7,32",
1093
+ "num(ar)": 335,
1094
+ "len(ar)": "1,1,27",
1095
+ "num(zh)": 70398,
1096
+ "len(zh)": "1,2,32",
1097
+ "num(ja)": 71269,
1098
+ "len(ja)": "1,2,32",
1099
+ "num(ja-kana)": 206,
1100
+ "len(ja-kana)": "1,1,9",
1101
+ "num(ko)": 1595,
1102
+ "len(ko)": "1,1,2"
1103
+ },
1104
+ "ckiplab/gpt2-base-chinese": {
1105
+ "tokenizer": "<a target=\"_blank\" href=\"https://huggingface.co/ckiplab/gpt2-base-chinese\" style=\"color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;\">gpt2-base-chinese</a>",
1106
+ "organization": "SINICA",
1107
+ "vocab_size": 21128,
1108
+ "num(digit)": 1451,
1109
+ "len(digit)": "1,3,12",
1110
+ "num(space)": 2,
1111
+ "len(space)": "1,2,3",
1112
+ "num(ar)": 30,
1113
+ "len(ar)": "1,2,3",
1114
+ "num(zh)": 14642,
1115
+ "len(zh)": "1,2,3",
1116
+ "num(ja)": 15197,
1117
+ "len(ja)": "1,3,15",
1118
+ "num(ja-kana)": 553,
1119
+ "len(ja-kana)": "1,3,15",
1120
+ "num(ko)": 0,
1121
+ "len(ko)": "-"
1122
+ },
1123
+ "cyberagent/open-calm-7b": {
1124
+ "tokenizer": "<a target=\"_blank\" href=\"https://huggingface.co/cyberagent/open-calm-7b\" style=\"color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;\">open-calm-7b</a>",
1125
+ "organization": "CyberAgent",
1126
+ "vocab_size": 52000,
1127
+ "num(digit)": 690,
1128
+ "len(digit)": "1,3,5",
1129
+ "num(space)": 1698,
1130
+ "len(space)": "1,4,33",
1131
+ "num(ar)": 10,
1132
+ "len(ar)": "1,1,4",
1133
+ "num(zh)": 30775,
1134
+ "len(zh)": "1,3,31",
1135
+ "num(ja)": 45790,
1136
+ "len(ja)": "1,3,31",
1137
+ "num(ja-kana)": 32535,
1138
+ "len(ja-kana)": "1,3,31",
1139
+ "num(ko)": 0,
1140
+ "len(ko)": "-"
1141
+ },
1142
+ "databricks/dbrx-instruct": {
1143
+ "tokenizer": "<a target=\"_blank\" href=\"https://huggingface.co/databricks/dbrx-instruct\" style=\"color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;\">dbrx-instruct</a>",
1144
+ "organization": "Databricks",
1145
+ "vocab_size": 100280,
1146
+ "num(digit)": 1126,
1147
+ "len(digit)": "1,3,17",
1148
+ "num(space)": 47400,
1149
+ "len(space)": "1,7,128",
1150
+ "num(ar)": 113,
1151
+ "len(ar)": "1,2,10",
1152
+ "num(zh)": 868,
1153
+ "len(zh)": "1,1,7",
1154
+ "num(ja)": 1035,
1155
+ "len(ja)": "1,1,7",
1156
+ "num(ja-kana)": 169,
1157
+ "len(ja-kana)": "1,1,7",
1158
+ "num(ko)": 299,
1159
+ "len(ko)": "1,2,4"
1160
+ },
1161
+ "deepseek-ai/DeepSeek-V2": {
1162
+ "tokenizer": "<a target=\"_blank\" href=\"https://huggingface.co/deepseek-ai/DeepSeek-V2\" style=\"color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;\">DeepSeek-V2</a>",
1163
+ "organization": "DeepSeek",
1164
+ "vocab_size": 100002,
1165
+ "num(digit)": 10,
1166
+ "len(digit)": "1,1,1",
1167
+ "num(space)": 48073,
1168
+ "len(space)": "1,7,128",
1169
+ "num(ar)": 48,
1170
+ "len(ar)": "1,1,4",
1171
+ "num(zh)": 18052,
1172
+ "len(zh)": "1,2,16",
1173
+ "num(ja)": 18090,
1174
+ "len(ja)": "1,2,16",
1175
+ "num(ja-kana)": 38,
1176
+ "len(ja-kana)": "1,1,2",
1177
+ "num(ko)": 16,
1178
+ "len(ko)": "1,1,2"
1179
+ },
1180
+ "deepseek-ai/deepseek-coder-33b-instruct": {
1181
+ "tokenizer": "<a target=\"_blank\" href=\"https://huggingface.co/deepseek-ai/deepseek-coder-33b-instruct\" style=\"color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;\">deepseek-coder-33b-instruct</a>",
1182
+ "organization": "DeepSeek",
1183
+ "vocab_size": 32022,
1184
+ "num(digit)": 10,
1185
+ "len(digit)": "1,1,1",
1186
+ "num(space)": 15254,
1187
+ "len(space)": "1,6,65",
1188
+ "num(ar)": 12,
1189
+ "len(ar)": "1,1,2",
1190
+ "num(zh)": 4803,
1191
+ "len(zh)": "1,2,4",
1192
+ "num(ja)": 4804,
1193
+ "len(ja)": "1,2,4",
1194
+ "num(ja-kana)": 1,
1195
+ "len(ja-kana)": "1,1,1",
1196
+ "num(ko)": 0,
1197
+ "len(ko)": "-"
1198
+ },
1199
+ "deepseek-ai/deepseek-llm-7b-base": {
1200
+ "tokenizer": "<a target=\"_blank\" href=\"https://huggingface.co/deepseek-ai/deepseek-llm-7b-base\" style=\"color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;\">deepseek-llm-7b-base</a>",
1201
+ "organization": "DeepSeek",
1202
+ "vocab_size": 100015,
1203
+ "num(digit)": 10,
1204
+ "len(digit)": "1,1,1",
1205
+ "num(space)": 48073,
1206
+ "len(space)": "1,7,128",
1207
+ "num(ar)": 48,
1208
+ "len(ar)": "1,1,4",
1209
+ "num(zh)": 18052,
1210
+ "len(zh)": "1,2,16",
1211
+ "num(ja)": 18090,
+ "len(ja)": "1,2,16",
+ "num(ja-kana)": 38,
+ "len(ja-kana)": "1,1,2",
+ "num(ko)": 16,
+ "len(ko)": "1,1,2"
+ },
+ "eson/kplug-base-encoder": {
+ "tokenizer": "<a target=\"_blank\" href=\"https://huggingface.co/eson/kplug-base-encoder\" style=\"color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;\">kplug</a>",
+ "organization": "JD",
+ "vocab_size": 10261,
+ "num(digit)": 420,
+ "len(digit)": "1,3,12",
+ "num(space)": 0,
+ "len(space)": "-",
+ "num(ar)": 0,
+ "len(ar)": "-",
+ "num(zh)": 5764,
+ "len(zh)": "1,1,1",
+ "num(ja)": 5766,
+ "len(ja)": "1,1,3",
+ "num(ja-kana)": 0,
+ "len(ja-kana)": "-",
+ "num(ko)": 0,
+ "len(ko)": "-"
+ },
+ "fnlp/moss-moon-003-sft": {
+ "tokenizer": "<a target=\"_blank\" href=\"https://huggingface.co/fnlp/moss-moon-003-sft\" style=\"color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;\">moss-moon-003-sft</a>",
+ "organization": "Fudan",
+ "vocab_size": 106072,
+ "num(digit)": 1848,
+ "len(digit)": "1,3,16",
+ "num(space)": 33566,
+ "len(space)": "1,7,102",
+ "num(ar)": 25,
+ "len(ar)": "1,1,4",
+ "num(zh)": 54230,
+ "len(zh)": "1,2,15",
+ "num(ja)": 54381,
+ "len(ja)": "1,2,15",
+ "num(ja-kana)": 152,
+ "len(ja-kana)": "1,1,7",
+ "num(ko)": 0,
+ "len(ko)": "-"
+ },
+ "google/gemma-7b": {
+ "tokenizer": "<a target=\"_blank\" href=\"https://huggingface.co/google/gemma-7b\" style=\"color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;\">gemma-7b</a>",
+ "organization": "Google",
+ "vocab_size": 256000,
+ "num(digit)": 134,
+ "len(digit)": "1,10,12",
+ "num(space)": 125662,
+ "len(space)": "1,7,31",
+ "num(ar)": 6274,
+ "len(ar)": "1,4,15",
+ "num(zh)": 23767,
+ "len(zh)": "1,2,12",
+ "num(ja)": 28852,
+ "len(ja)": "1,2,12",
+ "num(ja-kana)": 7061,
+ "len(ja-kana)": "1,3,12",
+ "num(ko)": 2295,
+ "len(ko)": "1,1,5"
+ },
+ "google/switch-c-2048": {
+ "tokenizer": "<a target=\"_blank\" href=\"https://huggingface.co/google/switch-c-2048\" style=\"color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;\">switch-c-2048</a>",
+ "organization": "Google",
+ "vocab_size": 32100,
+ "num(digit)": 1133,
+ "len(digit)": "1,3,13",
+ "num(space)": 0,
+ "len(space)": "-",
+ "num(ar)": 0,
+ "len(ar)": "-",
+ "num(zh)": 0,
+ "len(zh)": "-",
+ "num(ja)": 0,
+ "len(ja)": "-",
+ "num(ja-kana)": 0,
+ "len(ja-kana)": "-",
+ "num(ko)": 0,
+ "len(ko)": "-"
+ },
+ "hfl/chinese-alpaca-lora-7b": {
+ "tokenizer": "<a target=\"_blank\" href=\"https://huggingface.co/hfl/chinese-alpaca-lora-7b\" style=\"color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;\">chinese-alpaca-lora-7b</a>",
+ "organization": "-",
+ "vocab_size": 49954,
+ "num(digit)": 614,
+ "len(digit)": "1,3,5",
+ "num(space)": 61,
+ "len(space)": "1,2,15",
+ "num(ar)": 55,
+ "len(ar)": "1,1,2",
+ "num(zh)": 17839,
+ "len(zh)": "1,2,13",
+ "num(ja)": 17993,
+ "len(ja)": "1,2,13",
+ "num(ja-kana)": 154,
+ "len(ja-kana)": "1,1,1",
+ "num(ko)": 135,
+ "len(ko)": "1,1,1"
+ },
+ "hfl/chinese-llama-2-7b": {
+ "tokenizer": "<a target=\"_blank\" href=\"https://huggingface.co/hfl/chinese-llama-2-7b\" style=\"color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;\">chinese-llama-2-7b</a>",
+ "organization": "-",
+ "vocab_size": 55296,
+ "num(digit)": 20,
+ "len(digit)": "1,1,1",
+ "num(space)": 61,
+ "len(space)": "1,2,15",
+ "num(ar)": 55,
+ "len(ar)": "1,1,2",
+ "num(zh)": 23974,
+ "len(zh)": "1,2,16",
+ "num(ja)": 24111,
+ "len(ja)": "1,2,16",
+ "num(ja-kana)": 137,
+ "len(ja-kana)": "1,1,1",
+ "num(ko)": 111,
+ "len(ko)": "1,1,1"
+ },
+ "hfl/chinese-llama-lora-7b": {
+ "tokenizer": "<a target=\"_blank\" href=\"https://huggingface.co/hfl/chinese-llama-lora-7b\" style=\"color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;\">chinese-llama-lora-7b</a>",
+ "organization": "-",
+ "vocab_size": 49953,
+ "num(digit)": 614,
+ "len(digit)": "1,3,5",
+ "num(space)": 61,
+ "len(space)": "1,2,15",
+ "num(ar)": 55,
+ "len(ar)": "1,1,2",
+ "num(zh)": 17839,
+ "len(zh)": "1,2,13",
+ "num(ja)": 17993,
+ "len(ja)": "1,2,13",
+ "num(ja-kana)": 154,
+ "len(ja-kana)": "1,1,1",
+ "num(ko)": 135,
+ "len(ko)": "1,1,1"
+ },
+ "hfl/llama-3-chinese-8b": {
+ "tokenizer": "<a target=\"_blank\" href=\"https://huggingface.co/hfl/llama-3-chinese-8b\" style=\"color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;\">llama-3-chinese-8b</a>",
+ "organization": "-",
+ "vocab_size": 128256,
+ "num(digit)": 1110,
+ "len(digit)": "1,3,3",
+ "num(space)": 60860,
+ "len(space)": "1,6,128",
+ "num(ar)": 3810,
+ "len(ar)": "1,4,11",
+ "num(zh)": 4424,
+ "len(zh)": "1,1,7",
+ "num(ja)": 5387,
+ "len(ja)": "1,2,8",
+ "num(ja-kana)": 1086,
+ "len(ja-kana)": "1,2,8",
+ "num(ko)": 2281,
+ "len(ko)": "1,2,6"
+ },
+ "hpcai-tech/grok-1": {
+ "tokenizer": "<a target=\"_blank\" href=\"https://huggingface.co/hpcai-tech/grok-1\" style=\"color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;\">grok-1</a>",
+ "organization": "xAI",
+ "vocab_size": 131072,
+ "num(digit)": 40,
+ "len(digit)": "1,6,13",
+ "num(space)": 399,
+ "len(space)": "1,3,16",
+ "num(ar)": 69,
+ "len(ar)": "1,2,4",
+ "num(zh)": 1626,
+ "len(zh)": "1,2,7",
+ "num(ja)": 3118,
+ "len(ja)": "1,2,8",
+ "num(ja-kana)": 1908,
+ "len(ja-kana)": "1,2,8",
+ "num(ko)": 67,
+ "len(ko)": "1,1,2"
+ },
+ "internlm/internlm-chat-7b": {
+ "tokenizer": "<a target=\"_blank\" href=\"https://huggingface.co/internlm/internlm-chat-7b\" style=\"color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;\">internlm-chat-7b</a>",
+ "organization": "Shanghai AI Lab",
+ "vocab_size": 103168,
+ "num(digit)": 1259,
+ "len(digit)": "1,3,19",
+ "num(space)": 33008,
+ "len(space)": "1,6,128",
+ "num(ar)": 6702,
+ "len(ar)": "1,4,16",
+ "num(zh)": 32000,
+ "len(zh)": "1,2,15",
+ "num(ja)": 32866,
+ "len(ja)": "1,2,15",
+ "num(ja-kana)": 864,
+ "len(ja-kana)": "1,2,9",
+ "num(ko)": 298,
+ "len(ko)": "1,1,1"
+ },
+ "internlm/internlm-xcomposer-7b": {
+ "tokenizer": "<a target=\"_blank\" href=\"https://huggingface.co/internlm/internlm-xcomposer-7b\" style=\"color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;\">internlm-xcomposer-7b</a>",
+ "organization": "Shanghai AI Lab",
+ "vocab_size": 103168,
+ "num(digit)": 1261,
+ "len(digit)": "1,3,19",
+ "num(space)": 33008,
+ "len(space)": "1,6,128",
+ "num(ar)": 6702,
+ "len(ar)": "1,4,16",
+ "num(zh)": 32000,
+ "len(zh)": "1,2,15",
+ "num(ja)": 32866,
+ "len(ja)": "1,2,15",
+ "num(ja-kana)": 864,
+ "len(ja-kana)": "1,2,9",
+ "num(ko)": 298,
+ "len(ko)": "1,1,1"
+ },
+ "internlm/internlm2-chat-7b": {
+ "tokenizer": "<a target=\"_blank\" href=\"https://huggingface.co/internlm/internlm2-chat-7b\" style=\"color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;\">internlm2-chat-7b</a>",
+ "organization": "Shanghai AI Lab",
+ "vocab_size": 92544,
+ "num(digit)": 1261,
+ "len(digit)": "1,3,18",
+ "num(space)": 28681,
+ "len(space)": "1,7,128",
+ "num(ar)": 30,
+ "len(ar)": "1,1,1",
+ "num(zh)": 31148,
+ "len(zh)": "1,2,15",
+ "num(ja)": 31296,
+ "len(ja)": "1,2,15",
+ "num(ja-kana)": 148,
+ "len(ja-kana)": "1,1,1",
+ "num(ko)": 83,
+ "len(ko)": "1,1,1"
+ },
+ "internlm/internlm2-math-7b": {
+ "tokenizer": "<a target=\"_blank\" href=\"https://huggingface.co/internlm/internlm2-math-7b\" style=\"color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;\">internlm2-math-7b</a>",
+ "organization": "Shanghai AI Lab",
+ "vocab_size": 92544,
+ "num(digit)": 1261,
+ "len(digit)": "1,3,18",
+ "num(space)": 28681,
+ "len(space)": "1,7,128",
+ "num(ar)": 30,
+ "len(ar)": "1,1,1",
+ "num(zh)": 31148,
+ "len(zh)": "1,2,15",
+ "num(ja)": 31296,
+ "len(ja)": "1,2,15",
+ "num(ja-kana)": 148,
+ "len(ja-kana)": "1,1,1",
+ "num(ko)": 83,
+ "len(ko)": "1,1,1"
+ },
+ "microsoft/Phi-3-mini-4k-instruct": {
+ "tokenizer": "<a target=\"_blank\" href=\"https://huggingface.co/microsoft/Phi-3-mini-4k-instruct\" style=\"color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;\">Phi-3-mini-4k-instruct</a>",
+ "organization": "Microsoft",
+ "vocab_size": 32011,
+ "num(digit)": 20,
+ "len(digit)": "1,1,1",
+ "num(space)": 61,
+ "len(space)": "1,2,15",
+ "num(ar)": 55,
+ "len(ar)": "1,1,2",
+ "num(zh)": 700,
+ "len(zh)": "1,1,1",
+ "num(ja)": 837,
+ "len(ja)": "1,1,1",
+ "num(ja-kana)": 137,
+ "len(ja-kana)": "1,1,1",
+ "num(ko)": 111,
+ "len(ko)": "1,1,1"
+ },
+ "microsoft/phi-1": {
+ "tokenizer": "<a target=\"_blank\" href=\"https://huggingface.co/microsoft/phi-1\" style=\"color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;\">phi-1</a>",
+ "organization": "Microsoft",
+ "vocab_size": 50295,
+ "num(digit)": 1691,
+ "len(digit)": "1,3,16",
+ "num(space)": 33129,
+ "len(space)": "1,7,66",
+ "num(ar)": 22,
+ "len(ar)": "1,1,3",
+ "num(zh)": 51,
+ "len(zh)": "1,1,4",
+ "num(ja)": 183,
+ "len(ja)": "1,1,7",
+ "num(ja-kana)": 133,
+ "len(ja-kana)": "1,1,7",
+ "num(ko)": 0,
+ "len(ko)": "-"
+ },
+ "microsoft/phi-2": {
+ "tokenizer": "<a target=\"_blank\" href=\"https://huggingface.co/microsoft/phi-2\" style=\"color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;\">phi-2</a>",
+ "organization": "Microsoft",
+ "vocab_size": 50295,
+ "num(digit)": 1691,
+ "len(digit)": "1,3,16",
+ "num(space)": 33129,
+ "len(space)": "1,7,66",
+ "num(ar)": 22,
+ "len(ar)": "1,1,3",
+ "num(zh)": 51,
+ "len(zh)": "1,1,4",
+ "num(ja)": 183,
+ "len(ja)": "1,1,7",
+ "num(ja-kana)": 133,
+ "len(ja-kana)": "1,1,7",
+ "num(ko)": 0,
+ "len(ko)": "-"
+ },
+ "mistralai/Mistral-7B-v0.1": {
+ "tokenizer": "<a target=\"_blank\" href=\"https://huggingface.co/mistralai/Mistral-7B-v0.1\" style=\"color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;\">Mistral-7B-v0.1</a>",
+ "organization": "Mistral",
+ "vocab_size": 32000,
+ "num(digit)": 20,
+ "len(digit)": "1,1,1",
+ "num(space)": 85,
+ "len(space)": "1,3,15",
+ "num(ar)": 71,
+ "len(ar)": "1,1,2",
+ "num(zh)": 1459,
+ "len(zh)": "1,1,2",
+ "num(ja)": 1593,
+ "len(ja)": "1,1,2",
+ "num(ja-kana)": 134,
+ "len(ja-kana)": "1,1,1",
+ "num(ko)": 346,
+ "len(ko)": "1,1,1"
+ },
+ "mistralai/Mixtral-8x7B-v0.1": {
+ "tokenizer": "<a target=\"_blank\" href=\"https://huggingface.co/mistralai/Mixtral-8x7B-v0.1\" style=\"color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;\">Mixtral-8x7B-v0.1</a>",
+ "organization": "Mistral",
+ "vocab_size": 32000,
+ "num(digit)": 20,
+ "len(digit)": "1,1,1",
+ "num(space)": 85,
+ "len(space)": "1,3,15",
+ "num(ar)": 71,
+ "len(ar)": "1,1,2",
+ "num(zh)": 1459,
+ "len(zh)": "1,1,2",
+ "num(ja)": 1593,
+ "len(ja)": "1,1,2",
+ "num(ja-kana)": 134,
+ "len(ja-kana)": "1,1,1",
+ "num(ko)": 346,
+ "len(ko)": "1,1,1"
+ },
+ "openai-community/gpt2": {
+ "tokenizer": "<a target=\"_blank\" href=\"https://huggingface.co/openai-community/gpt2\" style=\"color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;\">gpt2</a>",
+ "organization": "OpenAI",
+ "vocab_size": 50257,
+ "num(digit)": 1691,
+ "len(digit)": "1,3,16",
+ "num(space)": 33129,
+ "len(space)": "1,7,66",
+ "num(ar)": 22,
+ "len(ar)": "1,1,3",
+ "num(zh)": 51,
+ "len(zh)": "1,1,4",
+ "num(ja)": 183,
+ "len(ja)": "1,1,7",
+ "num(ja-kana)": 133,
+ "len(ja-kana)": "1,1,7",
+ "num(ko)": 0,
+ "len(ko)": "-"
+ },
+ "openai/code-davinci-002": {
+ "tokenizer": "<a target=\"_blank\" href=\"https://github.com/openai/tiktoken\" style=\"color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;\">code-davinci-002</a>",
+ "organization": "OpenAI",
+ "vocab_size": 50281,
+ "num(digit)": 1691,
+ "len(digit)": "1,3,16",
+ "num(space)": 33175,
+ "len(space)": "1,7,66",
+ "num(ar)": 22,
+ "len(ar)": "1,1,3",
+ "num(zh)": 51,
+ "len(zh)": "1,1,4",
+ "num(ja)": 183,
+ "len(ja)": "1,1,7",
+ "num(ja-kana)": 133,
+ "len(ja-kana)": "1,1,7",
+ "num(ko)": 0,
+ "len(ko)": "-"
+ },
+ "openai/gpt-3.5-turbo": {
+ "tokenizer": "<a target=\"_blank\" href=\"https://github.com/openai/tiktoken\" style=\"color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;\">gpt-3.5-turbo</a>",
+ "organization": "OpenAI",
+ "vocab_size": 100277,
+ "num(digit)": 1110,
+ "len(digit)": "1,3,3",
+ "num(space)": 47472,
+ "len(space)": "1,7,128",
+ "num(ar)": 113,
+ "len(ar)": "1,2,10",
+ "num(zh)": 868,
+ "len(zh)": "1,1,7",
+ "num(ja)": 1035,
+ "len(ja)": "1,1,7",
+ "num(ja-kana)": 169,
+ "len(ja-kana)": "1,1,7",
+ "num(ko)": 299,
+ "len(ko)": "1,2,4"
+ },
+ "openai/gpt-4o": {
+ "tokenizer": "<a target=\"_blank\" href=\"https://github.com/openai/tiktoken\" style=\"color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;\">gpt-4o</a>",
+ "organization": "OpenAI",
+ "vocab_size": 200019,
+ "num(digit)": 1110,
+ "len(digit)": "1,3,3",
+ "num(space)": 109316,
+ "len(space)": "1,6,128",
+ "num(ar)": 8055,
+ "len(ar)": "1,4,12",
+ "num(zh)": 7563,
+ "len(zh)": "1,2,11",
+ "num(ja)": 8292,
+ "len(ja)": "1,2,11",
+ "num(ja-kana)": 809,
+ "len(ja-kana)": "1,2,11",
+ "num(ko)": 2365,
+ "len(ko)": "1,2,8"
+ },
+ "openai/text-davinci-003": {
+ "tokenizer": "<a target=\"_blank\" href=\"https://github.com/openai/tiktoken\" style=\"color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;\">text-davinci-003</a>",
+ "organization": "OpenAI",
+ "vocab_size": 50281,
+ "num(digit)": 1691,
+ "len(digit)": "1,3,16",
+ "num(space)": 33175,
+ "len(space)": "1,7,66",
+ "num(ar)": 22,
+ "len(ar)": "1,1,3",
+ "num(zh)": 51,
+ "len(zh)": "1,1,4",
+ "num(ja)": 183,
+ "len(ja)": "1,1,7",
+ "num(ja-kana)": 133,
+ "len(ja-kana)": "1,1,7",
+ "num(ko)": 0,
+ "len(ko)": "-"
+ },
+ "thu-coai/CharacterGLM-6B": {
+ "tokenizer": "<a target=\"_blank\" href=\"https://huggingface.co/thu-coai/CharacterGLM-6B\" style=\"color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;\">CharacterGLM-6B</a>",
+ "organization": "Tsinghua",
+ "vocab_size": 64789,
+ "num(digit)": 20,
+ "len(digit)": "1,1,1",
+ "num(space)": 67,
+ "len(space)": "1,2,15",
+ "num(ar)": 57,
+ "len(ar)": "1,1,2",
+ "num(zh)": 30922,
+ "len(zh)": "1,2,16",
+ "num(ja)": 31065,
+ "len(ja)": "1,2,16",
+ "num(ja-kana)": 143,
+ "len(ja-kana)": "1,1,1",
+ "num(ko)": 604,
+ "len(ko)": "1,1,1"
+ },
+ "tiiuae/falcon-180b": {
+ "tokenizer": "<a target=\"_blank\" href=\"https://huggingface.co/tiiuae/falcon-180b\" style=\"color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;\">falcon-180b</a>",
+ "organization": "TII",
+ "vocab_size": 65024,
+ "num(digit)": 1108,
+ "len(digit)": "1,3,3",
+ "num(space)": 40202,
+ "len(space)": "1,7,65",
+ "num(ar)": 21,
+ "len(ar)": "1,1,4",
+ "num(zh)": 1627,
+ "len(zh)": "1,1,3",
+ "num(ja)": 1652,
+ "len(ja)": "1,1,3",
+ "num(ja-kana)": 25,
+ "len(ja-kana)": "1,1,1",
+ "num(ko)": 1,
+ "len(ko)": "1,1,1"
+ },
+ "tiiuae/falcon-7b": {
+ "tokenizer": "<a target=\"_blank\" href=\"https://huggingface.co/tiiuae/falcon-7b\" style=\"color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;\">falcon-7b</a>",
+ "organization": "TII",
+ "vocab_size": 65024,
+ "num(digit)": 1108,
+ "len(digit)": "1,3,3",
+ "num(space)": 40202,
+ "len(space)": "1,7,65",
+ "num(ar)": 21,
+ "len(ar)": "1,1,4",
+ "num(zh)": 1627,
+ "len(zh)": "1,1,3",
+ "num(ja)": 1652,
+ "len(ja)": "1,1,3",
+ "num(ja-kana)": 25,
+ "len(ja-kana)": "1,1,1",
+ "num(ko)": 1,
+ "len(ko)": "1,1,1"
+ }
+ }
stats/compress_rate.json DELETED
@@ -1,4286 +0,0 @@
- {
- "amber.cc100-en": {
- "vocab_size": 32000,
- "n_bytes": 1124813,
- "n_tokens": 294627,
- "n_chars": 1121360
- },
- "aya_101.cc100-en": {
- "vocab_size": 250100,
- "n_bytes": 1124813,
- "n_tokens": 317881,
- "n_chars": 1121360
- },
- "baichuan.cc100-en": {
- "vocab_size": 64000,
- "n_bytes": 1124813,
- "n_tokens": 280108,
- "n_chars": 1121360
- },
- "baichuan2.cc100-en": {
- "vocab_size": 125696,
- "n_bytes": 1124813,
- "n_tokens": 269011,
- "n_chars": 1121360
- },
- "bert_base_cased.cc100-en": {
- "vocab_size": 28996,
- "n_bytes": 1124813,
- "n_tokens": 288022,
- "n_chars": 1121360
- },
- "bert_base_chinese.cc100-en": {
- "vocab_size": 21128,
- "n_bytes": 1124813,
- "n_tokens": 377068,
- "n_chars": 1121360
- },
- "bert_base_uncased.cc100-en": {
- "vocab_size": 30522,
- "n_bytes": 1124813,
- "n_tokens": 280575,
- "n_chars": 1121360
- },
- "bloom.cc100-en": {
- "vocab_size": 250680,
- "n_bytes": 1124813,
- "n_tokens": 257405,
- "n_chars": 1121360
- },
- "byt5_small.cc100-en": {
- "vocab_size": 384,
- "n_bytes": 1124813,
- "n_tokens": 1134813,
- "n_chars": 1121360
- },
- "character_glm_6b.cc100-en": {
- "vocab_size": 64789,
- "n_bytes": 1124813,
- "n_tokens": 289347,
- "n_chars": 1121360
- },
- "chatglm2_6b.cc100-en": {
- "vocab_size": 64787,
- "n_bytes": 1124813,
- "n_tokens": 289329,
- "n_chars": 1121360
- },
- "chatglm3_6b.cc100-en": {
- "vocab_size": 64796,
- "n_bytes": 1124813,
- "n_tokens": 289347,
- "n_chars": 1121360
- },
- "chatglm_6b.cc100-en": {
- "vocab_size": 150344,
- "n_bytes": 1124813,
- "n_tokens": 284761,
- "n_chars": 1121360
- },
- "chatyuan_large_v2.cc100-en": {
- "vocab_size": 32128,
- "n_bytes": 1124813,
- "n_tokens": 536033,
- "n_chars": 1121360
- },
- "chinese_llama.cc100-en": {
- "vocab_size": 49953,
- "n_bytes": 1124813,
- "n_tokens": 291514,
- "n_chars": 1121360
- },
- "chinese_llama2.cc100-en": {
- "vocab_size": 55296,
- "n_bytes": 1124813,
- "n_tokens": 294627,
- "n_chars": 1121360
- },
- "code_davinci_002.cc100-en": {
- "vocab_size": 50281,
- "n_bytes": 1124813,
- "n_tokens": 258403,
- "n_chars": 1121360
- },
- "crystal_coder.cc100-en": {
- "vocab_size": 32022,
- "n_bytes": 1124813,
- "n_tokens": 284627,
- "n_chars": 1121360
- },
- "dbrx_instruct.cc100-en": {
- "vocab_size": 100280,
- "n_bytes": 1124813,
- "n_tokens": 254985,
- "n_chars": 1121360
- },
- "deepseek_coder_33b_instruct.cc100-en": {
- "vocab_size": 32022,
- "n_bytes": 1124813,
- "n_tokens": 287408,
- "n_chars": 1121360
- },
- "deepseek_llm_7b_base.cc100-en": {
- "vocab_size": 100015,
- "n_bytes": 1124813,
- "n_tokens": 272324,
- "n_chars": 1121360
- },
- "falcon_180b.cc100-en": {
- "vocab_size": 65024,
- "n_bytes": 1124813,
- "n_tokens": 262509,
- "n_chars": 1121360
- },
- "falcon_7b.cc100-en": {
- "vocab_size": 65024,
- "n_bytes": 1124813,
- "n_tokens": 262509,
- "n_chars": 1121360
- },
- "fastchat_t5_3b.cc100-en": {
- "vocab_size": 32110,
- "n_bytes": 1124813,
- "n_tokens": 484941,
- "n_chars": 1121360
- },
- "flan_t5_base.cc100-en": {
- "vocab_size": 32100,
- "n_bytes": 1124813,
- "n_tokens": 290104,
- "n_chars": 1121360
- },
- "gemma_7b.cc100-en": {
- "vocab_size": 256000,
- "n_bytes": 1124813,
- "n_tokens": 268010,
- "n_chars": 1121360
- },
- "gpt2.cc100-en": {
- "vocab_size": 50257,
- "n_bytes": 1124813,
- "n_tokens": 258428,
- "n_chars": 1121360
- },
- "gpt2_chinese.cc100-en": {
- "vocab_size": 21128,
- "n_bytes": 1124813,
- "n_tokens": 392641,
- "n_chars": 1121360
- },
- "gpt_35_turbo.cc100-en": {
- "vocab_size": 100277,
- "n_bytes": 1124813,
- "n_tokens": 254985,
- "n_chars": 1121360
- },
- "gpt_4.cc100-en": {
- "vocab_size": 100277,
- "n_bytes": 1124813,
- "n_tokens": 254985,
- "n_chars": 1121360
- },
- "gpt_nexo_20b.cc100-en": {
- "vocab_size": 50277,
- "n_bytes": 1124813,
- "n_tokens": 259357,
- "n_chars": 1121360
- },
- "grok_1.cc100-en": {
- "vocab_size": 131072,
- "n_bytes": 1124813,
- "n_tokens": 258048,
- "n_chars": 1121360
- },
- "internlm2_chat_7b.cc100-en": {
- "vocab_size": 92544,
- "n_bytes": 1124813,
- "n_tokens": 271583,
- "n_chars": 1121360
- },
- "internlm2_math_7b.cc100-en": {
- "vocab_size": 92544,
- "n_bytes": 1124813,
- "n_tokens": 271583,
- "n_chars": 1121360
- },
- "internlm_chat_7b.cc100-en": {
- "vocab_size": 103168,
- "n_bytes": 1124813,
- "n_tokens": 271293,
- "n_chars": 1121360
- },
- "internlm_xcomposer_7b.cc100-en": {
- "vocab_size": 103168,
- "n_bytes": 1124813,
- "n_tokens": 271293,
- "n_chars": 1121360
- },
- "jamba_v0_1.cc100-en": {
- "vocab_size": 65536,
- "n_bytes": 1124813,
- "n_tokens": 274242,
- "n_chars": 1121360
- },
- "kplug.cc100-en": {
- "vocab_size": 10261,
- "n_bytes": 1124813,
- "n_tokens": 393564,
- "n_chars": 1121360
- },
- "llama.cc100-en": {
- "vocab_size": 32000,
- "n_bytes": 1124813,
- "n_tokens": 294627,
- "n_chars": 1121360
- },
- "llama2.cc100-en": {
- "vocab_size": 32001,
- "n_bytes": 1124813,
- "n_tokens": 294627,
- "n_chars": 1121360
- },
- "llama3.cc100-en": {
- "vocab_size": 128256,
- "n_bytes": 1124813,
- "n_tokens": 254944,
- "n_chars": 1121360
- },
- "mistral_7b.cc100-en": {
- "vocab_size": 32000,
- "n_bytes": 1124813,
- "n_tokens": 285801,
- "n_chars": 1121360
- },
- "mixtral_8_7b.cc100-en": {
- "vocab_size": 32000,
- "n_bytes": 1124813,
- "n_tokens": 285801,
- "n_chars": 1121360
- },
- "mobilebert_uncased.cc100-en": {
- "vocab_size": 30522,
- "n_bytes": 1124813,
- "n_tokens": 280575,
- "n_chars": 1121360
- },
- "moss.cc100-en": {
- "vocab_size": 106072,
- "n_bytes": 1124813,
- "n_tokens": 257070,
- "n_chars": 1121360
- },
- "mt5_large.cc100-en": {
- "vocab_size": 250100,
- "n_bytes": 1124813,
- "n_tokens": 317881,
- "n_chars": 1121360
- },
- "olmo_7b.cc100-en": {
- "vocab_size": 50280,
- "n_bytes": 1124813,
- "n_tokens": 259357,
- "n_chars": 1121360
- },
- "orion_14b_chat.cc100-en": {
- "vocab_size": 84608,
- "n_bytes": 1124813,
- "n_tokens": 265948,
- "n_chars": 1121360
- },
- "phi_1.cc100-en": {
- "vocab_size": 50295,
- "n_bytes": 1124813,
- "n_tokens": 258409,
- "n_chars": 1121360
- },
- "phi_2.cc100-en": {
- "vocab_size": 50295,
- "n_bytes": 1124813,
- "n_tokens": 258409,
- "n_chars": 1121360
- },
- "phi_3_mini.cc100-en": {
- "vocab_size": 32011,
- "n_bytes": 1124813,
- "n_tokens": 294627,
- "n_chars": 1121360
- },
- "pko_t5_large.cc100-en": {
- "vocab_size": 50358,
- "n_bytes": 1124813,
- "n_tokens": 658985,
- "n_chars": 1121360
- },
- "prompt_clue.cc100-en": {
- "vocab_size": 32128,
- "n_bytes": 1124813,
- "n_tokens": 536033,
- "n_chars": 1121360
- },
- "qwen1_5_14b_chat.cc100-en": {
- "vocab_size": 151646,
- "n_bytes": 1124813,
- "n_tokens": 257983,
- "n_chars": 1121360
- },
- "qwen_1_8b_chat.cc100-en": {
- "vocab_size": 151851,
- "n_bytes": 1124813,
- "n_tokens": 257983,
- "n_chars": 1121360
- },
- "qwen_72b_chat.cc100-en": {
- "vocab_size": 151851,
- "n_bytes": 1124813,
- "n_tokens": 257983,
- "n_chars": 1121360
- },
- "qwen_7b_chat.cc100-en": {
- "vocab_size": 151851,
- "n_bytes": 1124813,
- "n_tokens": 257983,
- "n_chars": 1121360
- },
- "roberta_chinese_clue.cc100-en": {
- "vocab_size": 8021,
- "n_bytes": 1124813,
- "n_tokens": 583058,
- "n_chars": 1121360
- },
- "skywork_13b_base.cc100-en": {
- "vocab_size": 65519,
- "n_bytes": 1124813,
- "n_tokens": 294617,
- "n_chars": 1121360
- },
- "skywork_13b_math.cc100-en": {
- "vocab_size": 65519,
- "n_bytes": 1124813,
- "n_tokens": 294617,
- "n_chars": 1121360
- },
- "solar_10_7b.cc100-en": {
- "vocab_size": 32000,
- "n_bytes": 1124813,
- "n_tokens": 285801,
- "n_chars": 1121360
- },
- "starchat_alpha.cc100-en": {
- "vocab_size": 49156,
- "n_bytes": 1124813,
- "n_tokens": 288965,
- "n_chars": 1121360
- },
- "switch_c_2048.cc100-en": {
- "vocab_size": 32100,
- "n_bytes": 1124813,
- "n_tokens": 290104,
- "n_chars": 1121360
- },
- "t5_base.cc100-en": {
- "vocab_size": 32100,
- "n_bytes": 1124813,
- "n_tokens": 290104,
- "n_chars": 1121360
- },
- "t5_large.cc100-en": {
- "vocab_size": 32100,
- "n_bytes": 1124813,
- "n_tokens": 290104,
- "n_chars": 1121360
- },
- "t5_small.cc100-en": {
- "vocab_size": 32100,
- "n_bytes": 1124813,
- "n_tokens": 290104,
- "n_chars": 1121360
- },
- "text_davinci_003.cc100-en": {
- "vocab_size": 50281,
- "n_bytes": 1124813,
- "n_tokens": 258403,
- "n_chars": 1121360
- },
- "tigerbot_13b_chat_v2.cc100-en": {
- "vocab_size": 60515,
- "n_bytes": 1124813,
- "n_tokens": 285652,
- "n_chars": 1121360
- },
- "tigerbot_70b_chat_v4_4k.cc100-en": {
- "vocab_size": 65110,
- "n_bytes": 1124813,
- "n_tokens": 286946,
- "n_chars": 1121360
- },
- "wizardcoder_15b_v1.cc100-en": {
- "vocab_size": 49153,
- "n_bytes": 1124813,
- "n_tokens": 288965,
- "n_chars": 1121360
- },
- "wizardcoder_python_7b_v1.cc100-en": {
- "vocab_size": 32001,
- "n_bytes": 1124813,
- "n_tokens": 294627,
- "n_chars": 1121360
- },
- "wizardlm_7b_v1.cc100-en": {
- "vocab_size": 32001,
- "n_bytes": 1124813,
- "n_tokens": 294627,
- "n_chars": 1121360
- },
- "wizardmath_70b_v1.cc100-en": {
- "vocab_size": 32002,
- "n_bytes": 1124813,
- "n_tokens": 294627,
- "n_chars": 1121360
- },
- "xlm_roberta.cc100-en": {
- "vocab_size": 250002,
- "n_bytes": 1124813,
- "n_tokens": 300026,
- "n_chars": 1121360
- },
- "yi_34b.cc100-en": {
- "vocab_size": 64000,
- "n_bytes": 1124813,
- "n_tokens": 270400,
- "n_chars": 1121360
- },
- "yi_6b.cc100-en": {
- "vocab_size": 64000,
- "n_bytes": 1124813,
- "n_tokens": 270400,
- "n_chars": 1121360
- },
- "yi_vl34b.cc100-en": {
- "vocab_size": 64000,
- "n_bytes": 1124813,
- "n_tokens": 269738,
- "n_chars": 1121360
- },
- "zephyr_7b_beta.cc100-en": {
- "vocab_size": 32000,
- "n_bytes": 1124813,
- "n_tokens": 285801,
- "n_chars": 1121360
- },
- "amber.cc100-zh-Hans": {
- "vocab_size": 32000,
- "n_bytes": 2633047,
- "n_tokens": 1330093,
- "n_chars": 927311
- },
- "aya_101.cc100-zh-Hans": {
- "vocab_size": 250100,
- "n_bytes": 2633047,
- "n_tokens": 631182,
- "n_chars": 927311
- },
- "baichuan.cc100-zh-Hans": {
- "vocab_size": 64000,
- "n_bytes": 2633047,
- "n_tokens": 626117,
- "n_chars": 927311
- },
- "baichuan2.cc100-zh-Hans": {
- "vocab_size": 125696,
- "n_bytes": 2633047,
- "n_tokens": 541464,
- "n_chars": 927311
- },
- "bert_base_cased.cc100-zh-Hans": {
- "vocab_size": 28996,
- "n_bytes": 2633047,
- "n_tokens": 899709,
- "n_chars": 927311
- },
- "bert_base_chinese.cc100-zh-Hans": {
- "vocab_size": 21128,
- "n_bytes": 2633047,
- "n_tokens": 896599,
- "n_chars": 927311
- },
- "bert_base_uncased.cc100-zh-Hans": {
- "vocab_size": 30522,
- "n_bytes": 2633047,
- "n_tokens": 898554,
- "n_chars": 927311
- },
- "bloom.cc100-zh-Hans": {
- "vocab_size": 250680,
- "n_bytes": 2633047,
- "n_tokens": 573008,
- "n_chars": 927311
- },
- "byt5_small.cc100-zh-Hans": {
- "vocab_size": 384,
- "n_bytes": 2633047,
- "n_tokens": 2643047,
- "n_chars": 927311
- },
- "character_glm_6b.cc100-zh-Hans": {
- "vocab_size": 64789,
- "n_bytes": 2633047,
- "n_tokens": 583646,
- "n_chars": 927311
- },
- "chatglm2_6b.cc100-zh-Hans": {
- "vocab_size": 64787,
- "n_bytes": 2633047,
- "n_tokens": 583646,
- "n_chars": 927311
- },
- "chatglm3_6b.cc100-zh-Hans": {
- "vocab_size": 64796,
- "n_bytes": 2633047,
- "n_tokens": 583646,
- "n_chars": 927311
- },
- "chatglm_6b.cc100-zh-Hans": {
- "vocab_size": 150344,
- "n_bytes": 2633047,
- "n_tokens": 527384,
- "n_chars": 927311
- },
- "chatyuan_large_v2.cc100-zh-Hans": {
- "vocab_size": 32128,
- "n_bytes": 2633047,
- "n_tokens": 564905,
- "n_chars": 927311
- },
- "chinese_llama.cc100-zh-Hans": {
- "vocab_size": 49953,
- "n_bytes": 2633047,
- "n_tokens": 623219,
- "n_chars": 927311
- },
- "chinese_llama2.cc100-zh-Hans": {
- "vocab_size": 55296,
- "n_bytes": 2633047,
- "n_tokens": 625766,
- "n_chars": 927311
- },
- "code_davinci_002.cc100-zh-Hans": {
- "vocab_size": 50281,
- "n_bytes": 2633047,
- "n_tokens": 1876809,
- "n_chars": 927311
- },
- "crystal_coder.cc100-zh-Hans": {
- "vocab_size": 32022,
- "n_bytes": 2633047,
- "n_tokens": 1320093,
- "n_chars": 927311
- },
- "dbrx_instruct.cc100-zh-Hans": {
- "vocab_size": 100280,
- "n_bytes": 2633047,
- "n_tokens": 1084939,
- "n_chars": 927311
- },
- "deepseek_coder_33b_instruct.cc100-zh-Hans": {
- "vocab_size": 32022,
- "n_bytes": 2633047,
- "n_tokens": 720577,
- "n_chars": 927311
- },
- "deepseek_llm_7b_base.cc100-zh-Hans": {
- "vocab_size": 100015,
- "n_bytes": 2633047,
- "n_tokens": 605081,
- "n_chars": 927311
- },
- "falcon_180b.cc100-zh-Hans": {
- "vocab_size": 65024,
- "n_bytes": 2633047,
- "n_tokens": 1124681,
- "n_chars": 927311
- },
- "falcon_7b.cc100-zh-Hans": {
- "vocab_size": 65024,
- "n_bytes": 2633047,
- "n_tokens": 1124681,
- "n_chars": 927311
- },
- "fastchat_t5_3b.cc100-zh-Hans": {
- "vocab_size": 32110,
- "n_bytes": 2633047,
- "n_tokens": 178974,
- "n_chars": 927311
- },
- "flan_t5_base.cc100-zh-Hans": {
- "vocab_size": 32100,
- "n_bytes": 2633047,
- "n_tokens": 173520,
- "n_chars": 927311
- },
- "gemma_7b.cc100-zh-Hans": {
- "vocab_size": 256000,
- "n_bytes": 2633047,
- "n_tokens": 641795,
- "n_chars": 927311
- },
- "gpt2.cc100-zh-Hans": {
- "vocab_size": 50257,
- "n_bytes": 2633047,
- "n_tokens": 1876809,
- "n_chars": 927311
- },
- "gpt2_chinese.cc100-zh-Hans": {
- "vocab_size": 21128,
- "n_bytes": 2633047,
- "n_tokens": 899506,
- "n_chars": 927311
- },
- "gpt_35_turbo.cc100-zh-Hans": {
- "vocab_size": 100277,
- "n_bytes": 2633047,
- "n_tokens": 1084939,
- "n_chars": 927311
- },
- "gpt_4.cc100-zh-Hans": {
- "vocab_size": 100277,
- "n_bytes": 2633047,
- "n_tokens": 1084939,
- "n_chars": 927311
- },
- "gpt_nexo_20b.cc100-zh-Hans": {
- "vocab_size": 50277,
- "n_bytes": 2633047,
- "n_tokens": 1220529,
- "n_chars": 927311
- },
- "grok_1.cc100-zh-Hans": {
- "vocab_size": 131072,
- "n_bytes": 2633047,
- "n_tokens": 1414508,
- "n_chars": 927311
- },
- "internlm2_chat_7b.cc100-zh-Hans": {
- "vocab_size": 92544,
- "n_bytes": 2633047,
- "n_tokens": 579976,
- "n_chars": 927311
- },
- "internlm2_math_7b.cc100-zh-Hans": {
- "vocab_size": 92544,
- "n_bytes": 2633047,
- "n_tokens": 579976,
- "n_chars": 927311
- },
- "internlm_chat_7b.cc100-zh-Hans": {
- "vocab_size": 103168,
- "n_bytes": 2633047,
- "n_tokens": 579109,
- "n_chars": 927311
- },
- "internlm_xcomposer_7b.cc100-zh-Hans": {
- "vocab_size": 103168,
- "n_bytes": 2633047,
- "n_tokens": 579109,
- "n_chars": 927311
- },
- "jamba_v0_1.cc100-zh-Hans": {
- "vocab_size": 65536,
- "n_bytes": 2633047,
- "n_tokens": 1067054,
- "n_chars": 927311
- },
- "kplug.cc100-zh-Hans": {
- "vocab_size": 10261,
- "n_bytes": 2633047,
- "n_tokens": 902451,
- "n_chars": 927311
- },
- "llama.cc100-zh-Hans": {
- "vocab_size": 32000,
- "n_bytes": 2633047,
- "n_tokens": 1330093,
- "n_chars": 927311
- },
- "llama2.cc100-zh-Hans": {
- "vocab_size": 32001,
- "n_bytes": 2633047,
- "n_tokens": 1330093,
- "n_chars": 927311
- },
- "llama3.cc100-zh-Hans": {
- "vocab_size": 128256,
- "n_bytes": 2633047,
- "n_tokens": 747405,
- "n_chars": 927311
- },
- "mistral_7b.cc100-zh-Hans": {
- "vocab_size": 32000,
- "n_bytes": 2633047,
- "n_tokens": 1041023,
- "n_chars": 927311
- },
- "mixtral_8_7b.cc100-zh-Hans": {
- "vocab_size": 32000,
- "n_bytes": 2633047,
- "n_tokens": 1041023,
- "n_chars": 927311
- },
- "mobilebert_uncased.cc100-zh-Hans": {
- "vocab_size": 30522,
- "n_bytes": 2633047,
- "n_tokens": 898554,
- "n_chars": 927311
- },
- "moss.cc100-zh-Hans": {
- "vocab_size": 106072,
- "n_bytes": 2633047,
- "n_tokens": 557455,
- "n_chars": 927311
- },
- "mt5_large.cc100-zh-Hans": {
- "vocab_size": 250100,
- "n_bytes": 2633047,
- "n_tokens": 631182,
- "n_chars": 927311
- },
- "olmo_7b.cc100-zh-Hans": {
- "vocab_size": 50280,
- "n_bytes": 2633047,
- "n_tokens": 1220529,
- "n_chars": 927311
- },
- "orion_14b_chat.cc100-zh-Hans": {
- "vocab_size": 84608,
- "n_bytes": 2633047,
- "n_tokens": 529926,
- "n_chars": 927311
- },
- "phi_1.cc100-zh-Hans": {
- "vocab_size": 50295,
- "n_bytes": 2633047,
- "n_tokens": 1876809,
- "n_chars": 927311
- },
- "phi_2.cc100-zh-Hans": {
- "vocab_size": 50295,
- "n_bytes": 2633047,
- "n_tokens": 1876809,
- "n_chars": 927311
- },
- "phi_3_mini.cc100-zh-Hans": {
- "vocab_size": 32011,
- "n_bytes": 2633047,
- "n_tokens": 1330093,
- "n_chars": 927311
- },
- "pko_t5_large.cc100-zh-Hans": {
- "vocab_size": 50358,
- "n_bytes": 2633047,
- "n_tokens": 2533519,
- "n_chars": 927311
- },
- "prompt_clue.cc100-zh-Hans": {
- "vocab_size": 32128,
- "n_bytes": 2633047,
- "n_tokens": 564905,
- "n_chars": 927311
- },
- "qwen1_5_14b_chat.cc100-zh-Hans": {
- "vocab_size": 151646,
- "n_bytes": 2633047,
- "n_tokens": 589211,
- "n_chars": 927311
- },
- "qwen_1_8b_chat.cc100-zh-Hans": {
- "vocab_size": 151851,
- "n_bytes": 2633047,
- "n_tokens": 589211,
- "n_chars": 927311
- },
- "qwen_72b_chat.cc100-zh-Hans": {
- "vocab_size": 151851,
- "n_bytes": 2633047,
- "n_tokens": 589211,
- "n_chars": 927311
- },
- "qwen_7b_chat.cc100-zh-Hans": {
- "vocab_size": 151851,
- "n_bytes": 2633047,
- "n_tokens": 589211,
- "n_chars": 927311
- },
- "roberta_chinese_clue.cc100-zh-Hans": {
- "vocab_size": 8021,
- "n_bytes": 2633047,
- "n_tokens": 907144,
- "n_chars": 927311
- },
- "skywork_13b_base.cc100-zh-Hans": {
- "vocab_size": 65519,
- "n_bytes": 2633047,
- "n_tokens": 663923,
- "n_chars": 927311
- },
- "skywork_13b_math.cc100-zh-Hans": {
- "vocab_size": 65519,
- "n_bytes": 2633047,
- "n_tokens": 663923,
- "n_chars": 927311
- },
- "solar_10_7b.cc100-zh-Hans": {
- "vocab_size": 32000,
- "n_bytes": 2633047,
- "n_tokens": 1041023,
- "n_chars": 927311
- },
- "starchat_alpha.cc100-zh-Hans": {
- "vocab_size": 49156,
- "n_bytes": 2633047,
- "n_tokens": 882018,
- "n_chars": 927311
- },
- "switch_c_2048.cc100-zh-Hans": {
- "vocab_size": 32100,
- "n_bytes": 2633047,
- "n_tokens": 173519,
- "n_chars": 927311
- },
- "t5_base.cc100-zh-Hans": {
- "vocab_size": 32100,
- "n_bytes": 2633047,
- "n_tokens": 173519,
- "n_chars": 927311
- },
- "t5_large.cc100-zh-Hans": {
- "vocab_size": 32100,
- "n_bytes": 2633047,
- "n_tokens": 173519,
- "n_chars": 927311
- },
- "t5_small.cc100-zh-Hans": {
- "vocab_size": 32100,
- "n_bytes": 2633047,
- "n_tokens": 173519,
- "n_chars": 927311
- },
- "text_davinci_003.cc100-zh-Hans": {
- "vocab_size": 50281,
- "n_bytes": 2633047,
- "n_tokens": 1876809,
- "n_chars": 927311
- },
- "tigerbot_13b_chat_v2.cc100-zh-Hans": {
- "vocab_size": 60515,
- "n_bytes": 2633047,
- "n_tokens": 577385,
- "n_chars": 927311
- },
- "tigerbot_70b_chat_v4_4k.cc100-zh-Hans": {
- "vocab_size": 65110,
- "n_bytes": 2633047,
- "n_tokens": 577211,
- "n_chars": 927311
- },
- "wizardcoder_15b_v1.cc100-zh-Hans": {
- "vocab_size": 49153,
- "n_bytes": 2633047,
- "n_tokens": 882018,
- "n_chars": 927311
- },
- "wizardcoder_python_7b_v1.cc100-zh-Hans": {
- "vocab_size": 32001,
- "n_bytes": 2633047,
- "n_tokens": 1330093,
- "n_chars": 927311
- },
- "wizardlm_7b_v1.cc100-zh-Hans": {
- "vocab_size": 32001,
- "n_bytes": 2633047,
- "n_tokens": 1330093,
- "n_chars": 927311
- },
- "wizardmath_70b_v1.cc100-zh-Hans": {
- "vocab_size": 32002,
- "n_bytes": 2633047,
- "n_tokens": 1330093,
- "n_chars": 927311
- },
- "xlm_roberta.cc100-zh-Hans": {
- "vocab_size": 250002,
- "n_bytes": 2633047,
- "n_tokens": 619844,
- "n_chars": 927311
- },
- "yi_34b.cc100-zh-Hans": {
- "vocab_size": 64000,
- "n_bytes": 2633047,
- "n_tokens": 588729,
- "n_chars": 927311
- },
- "yi_6b.cc100-zh-Hans": {
- "vocab_size": 64000,
- "n_bytes": 2633047,
- "n_tokens": 588729,
- "n_chars": 927311
- },
- "yi_vl34b.cc100-zh-Hans": {
- "vocab_size": 64000,
- "n_bytes": 2633047,
- "n_tokens": 596166,
- "n_chars": 927311
- },
- "zephyr_7b_beta.cc100-zh-Hans": {
- "vocab_size": 32000,
- "n_bytes": 2633047,
- "n_tokens": 1041023,
- "n_chars": 927311
- },
- "amber.cc100-es": {
- "vocab_size": 32000,
- "n_bytes": 1664455,
- "n_tokens": 492235,
- "n_chars": 1630297
- },
- "aya_101.cc100-es": {
- "vocab_size": 250100,
- "n_bytes": 1664455,
- "n_tokens": 472231,
- "n_chars": 1630297
- },
- "baichuan.cc100-es": {
- "vocab_size": 64000,
- "n_bytes": 1664455,
- "n_tokens": 585804,
- "n_chars": 1630297
- },
- "baichuan2.cc100-es": {
- "vocab_size": 125696,
- "n_bytes": 1664455,
- "n_tokens": 551326,
- "n_chars": 1630297
- },
- "bert_base_cased.cc100-es": {
- "vocab_size": 28996,
- "n_bytes": 1664455,
- "n_tokens": 630231,
- "n_chars": 1630297
- },
- "bert_base_chinese.cc100-es": {
- "vocab_size": 21128,
- "n_bytes": 1664455,
- "n_tokens": 609419,
- "n_chars": 1630297
- },
- "bert_base_uncased.cc100-es": {
- "vocab_size": 30522,
- "n_bytes": 1664455,
- "n_tokens": 558042,
- "n_chars": 1630297
- },
- "bloom.cc100-es": {
- "vocab_size": 250680,
- "n_bytes": 1664455,
- "n_tokens": 350793,
- "n_chars": 1630297
- },
- "byt5_small.cc100-es": {
- "vocab_size": 384,
- "n_bytes": 1664455,
- "n_tokens": 1674455,
- "n_chars": 1630297
- },
- "character_glm_6b.cc100-es": {
- "vocab_size": 64789,
- "n_bytes": 1664455,
- "n_tokens": 566501,
- "n_chars": 1630297
- },
- "chatglm2_6b.cc100-es": {
- "vocab_size": 64787,
- "n_bytes": 1664455,
- "n_tokens": 566476,
- "n_chars": 1630297
- },
- "chatglm3_6b.cc100-es": {
- "vocab_size": 64796,
- "n_bytes": 1664455,
- "n_tokens": 566501,
- "n_chars": 1630297
- },
- "chatglm_6b.cc100-es": {
- "vocab_size": 150344,
- "n_bytes": 1664455,
- "n_tokens": 514848,
- "n_chars": 1630297
- },
- "chatyuan_large_v2.cc100-es": {
- "vocab_size": 32128,
- "n_bytes": 1664455,
- "n_tokens": 889530,
- "n_chars": 1630297
- },
- "chinese_llama.cc100-es": {
- "vocab_size": 49953,
- "n_bytes": 1664455,
- "n_tokens": 486672,
- "n_chars": 1630297
- },
- "chinese_llama2.cc100-es": {
- "vocab_size": 55296,
- "n_bytes": 1664455,
- "n_tokens": 492235,
- "n_chars": 1630297
- },
- "code_davinci_002.cc100-es": {
- "vocab_size": 50281,
- "n_bytes": 1664455,
- "n_tokens": 569853,
- "n_chars": 1630297
- },
- "crystal_coder.cc100-es": {
- "vocab_size": 32022,
- "n_bytes": 1664455,
- "n_tokens": 482235,
- "n_chars": 1630297
- },
- "dbrx_instruct.cc100-es": {
- "vocab_size": 100280,
- "n_bytes": 1664455,
- "n_tokens": 433875,
- "n_chars": 1630297
- },
- "deepseek_coder_33b_instruct.cc100-es": {
- "vocab_size": 32022,
- "n_bytes": 1664455,
- "n_tokens": 523884,
- "n_chars": 1630297
- },
- "deepseek_llm_7b_base.cc100-es": {
- "vocab_size": 100015,
- "n_bytes": 1664455,
- "n_tokens": 480877,
- "n_chars": 1630297
- },
- "falcon_180b.cc100-es": {
- "vocab_size": 65024,
- "n_bytes": 1664455,
- "n_tokens": 442138,
- "n_chars": 1630297
- },
- "falcon_7b.cc100-es": {
- "vocab_size": 65024,
- "n_bytes": 1664455,
- "n_tokens": 442138,
- "n_chars": 1630297
- },
- "fastchat_t5_3b.cc100-es": {
- "vocab_size": 32110,
- "n_bytes": 1664455,
- "n_tokens": 970105,
- "n_chars": 1630297
- },
- "flan_t5_base.cc100-es": {
- "vocab_size": 32100,
- "n_bytes": 1664455,
- "n_tokens": 706405,
- "n_chars": 1630297
- },
- "gemma_7b.cc100-es": {
- "vocab_size": 256000,
- "n_bytes": 1664455,
- "n_tokens": 371321,
- "n_chars": 1630297
- },
- "gpt2.cc100-es": {
- "vocab_size": 50257,
- "n_bytes": 1664455,
- "n_tokens": 569853,
- "n_chars": 1630297
- },
- "gpt2_chinese.cc100-es": {
- "vocab_size": 21128,
- "n_bytes": 1664455,
- "n_tokens": 703390,
- "n_chars": 1630297
- },
- "gpt_35_turbo.cc100-es": {
- "vocab_size": 100277,
- "n_bytes": 1664455,
- "n_tokens": 433875,
- "n_chars": 1630297
- },
- "gpt_4.cc100-es": {
- "vocab_size": 100277,
- "n_bytes": 1664455,
- "n_tokens": 433875,
- "n_chars": 1630297
- },
- "gpt_nexo_20b.cc100-es": {
- "vocab_size": 50277,
- "n_bytes": 1664455,
- "n_tokens": 494577,
- "n_chars": 1630297
- },
- "grok_1.cc100-es": {
- "vocab_size": 131072,
- "n_bytes": 1664455,
- "n_tokens": 449392,
- "n_chars": 1630297
- },
- "internlm2_chat_7b.cc100-es": {
- "vocab_size": 92544,
- "n_bytes": 1664455,
- "n_tokens": 518871,
- "n_chars": 1630297
- },
- "internlm2_math_7b.cc100-es": {
- "vocab_size": 92544,
- "n_bytes": 1664455,
- "n_tokens": 518871,
- "n_chars": 1630297
- },
- "internlm_chat_7b.cc100-es": {
- "vocab_size": 103168,
- "n_bytes": 1664455,
- "n_tokens": 516572,
- "n_chars": 1630297
- },
- "internlm_xcomposer_7b.cc100-es": {
- "vocab_size": 103168,
- "n_bytes": 1664455,
- "n_tokens": 516572,
- "n_chars": 1630297
- },
- "jamba_v0_1.cc100-es": {
- "vocab_size": 65536,
- "n_bytes": 1664455,
- "n_tokens": 420883,
- "n_chars": 1630297
- },
- "kplug.cc100-es": {
- "vocab_size": 10261,
- "n_bytes": 1664455,
- "n_tokens": 704804,
- "n_chars": 1630297
- },
- "llama.cc100-es": {
- "vocab_size": 32000,
- "n_bytes": 1664455,
- "n_tokens": 492235,
- "n_chars": 1630297
- },
- "llama2.cc100-es": {
- "vocab_size": 32001,
- "n_bytes": 1664455,
- "n_tokens": 492235,
- "n_chars": 1630297
- },
- "llama3.cc100-es": {
- "vocab_size": 128256,
- "n_bytes": 1664455,
- "n_tokens": 433289,
- "n_chars": 1630297
- },
- "mistral_7b.cc100-es": {
- "vocab_size": 32000,
- "n_bytes": 1664455,
- "n_tokens": 513915,
- "n_chars": 1630297
- },
- "mixtral_8_7b.cc100-es": {
- "vocab_size": 32000,
- "n_bytes": 1664455,
- "n_tokens": 513915,
- "n_chars": 1630297
- },
- "mobilebert_uncased.cc100-es": {
- "vocab_size": 30522,
- "n_bytes": 1664455,
- "n_tokens": 558042,
- "n_chars": 1630297
- },
- "moss.cc100-es": {
- "vocab_size": 106072,
- "n_bytes": 1664455,
- "n_tokens": 568539,
- "n_chars": 1630297
- },
- "mt5_large.cc100-es": {
- "vocab_size": 250100,
- "n_bytes": 1664455,
- "n_tokens": 472231,
- "n_chars": 1630297
- },
- "olmo_7b.cc100-es": {
- "vocab_size": 50280,
- "n_bytes": 1664455,
- "n_tokens": 494577,
- "n_chars": 1630297
- },
- "orion_14b_chat.cc100-es": {
- "vocab_size": 84608,
- "n_bytes": 1664455,
- "n_tokens": 628571,
- "n_chars": 1630297
- },
- "phi_1.cc100-es": {
- "vocab_size": 50295,
- "n_bytes": 1664455,
- "n_tokens": 569853,
- "n_chars": 1630297
- },
- "phi_2.cc100-es": {
- "vocab_size": 50295,
- "n_bytes": 1664455,
- "n_tokens": 569853,
- "n_chars": 1630297
- },
- "phi_3_mini.cc100-es": {
- "vocab_size": 32011,
- "n_bytes": 1664455,
- "n_tokens": 492235,
- "n_chars": 1630297
- },
- "pko_t5_large.cc100-es": {
- "vocab_size": 50358,
- "n_bytes": 1664455,
- "n_tokens": 1134056,
- "n_chars": 1630297
- },
- "prompt_clue.cc100-es": {
- "vocab_size": 32128,
- "n_bytes": 1664455,
- "n_tokens": 889530,
- "n_chars": 1630297
- },
- "qwen1_5_14b_chat.cc100-es": {
- "vocab_size": 151646,
- "n_bytes": 1664455,
- "n_tokens": 434264,
- "n_chars": 1630297
- },
- "qwen_1_8b_chat.cc100-es": {
- "vocab_size": 151851,
- "n_bytes": 1664455,
- "n_tokens": 434264,
- "n_chars": 1630297
- },
- "qwen_72b_chat.cc100-es": {
- "vocab_size": 151851,
- "n_bytes": 1664455,
- "n_tokens": 434264,
- "n_chars": 1630297
- },
- "qwen_7b_chat.cc100-es": {
- "vocab_size": 151851,
- "n_bytes": 1664455,
- "n_tokens": 434264,
- "n_chars": 1630297
- },
- "roberta_chinese_clue.cc100-es": {
- "vocab_size": 8021,
- "n_bytes": 1664455,
- "n_tokens": 866564,
- "n_chars": 1630297
- },
- "skywork_13b_base.cc100-es": {
- "vocab_size": 65519,
- "n_bytes": 1664455,
- "n_tokens": 492211,
- "n_chars": 1630297
- },
- "skywork_13b_math.cc100-es": {
- "vocab_size": 65519,
- "n_bytes": 1664455,
- "n_tokens": 492211,
- "n_chars": 1630297
- },
- "solar_10_7b.cc100-es": {
- "vocab_size": 32000,
- "n_bytes": 1664455,
- "n_tokens": 513915,
- "n_chars": 1630297
- },
- "starchat_alpha.cc100-es": {
- "vocab_size": 49156,
- "n_bytes": 1664455,
- "n_tokens": 530592,
- "n_chars": 1630297
- },
- "switch_c_2048.cc100-es": {
- "vocab_size": 32100,
- "n_bytes": 1664455,
- "n_tokens": 706400,
- "n_chars": 1630297
- },
- "t5_base.cc100-es": {
- "vocab_size": 32100,
- "n_bytes": 1664455,
- "n_tokens": 706400,
- "n_chars": 1630297
- },
- "t5_large.cc100-es": {
- "vocab_size": 32100,
- "n_bytes": 1664455,
- "n_tokens": 706400,
- "n_chars": 1630297
- },
- "t5_small.cc100-es": {
- "vocab_size": 32100,
- "n_bytes": 1664455,
- "n_tokens": 706400,
- "n_chars": 1630297
- },
- "text_davinci_003.cc100-es": {
- "vocab_size": 50281,
- "n_bytes": 1664455,
- "n_tokens": 569853,
- "n_chars": 1630297
- },
- "tigerbot_13b_chat_v2.cc100-es": {
- "vocab_size": 60515,
- "n_bytes": 1664455,
- "n_tokens": 482553,
- "n_chars": 1630297
- },
- "tigerbot_70b_chat_v4_4k.cc100-es": {
- "vocab_size": 65110,
- "n_bytes": 1664455,
- "n_tokens": 484099,
- "n_chars": 1630297
- },
- "wizardcoder_15b_v1.cc100-es": {
- "vocab_size": 49153,
- "n_bytes": 1664455,
- "n_tokens": 530592,
- "n_chars": 1630297
- },
- "wizardcoder_python_7b_v1.cc100-es": {
- "vocab_size": 32001,
- "n_bytes": 1664455,
- "n_tokens": 492235,
- "n_chars": 1630297
- },
- "wizardlm_7b_v1.cc100-es": {
- "vocab_size": 32001,
- "n_bytes": 1664455,
- "n_tokens": 492235,
- "n_chars": 1630297
- },
- "wizardmath_70b_v1.cc100-es": {
- "vocab_size": 32002,
- "n_bytes": 1664455,
- "n_tokens": 492235,
- "n_chars": 1630297
- },
- "xlm_roberta.cc100-es": {
- "vocab_size": 250002,
- "n_bytes": 1664455,
- "n_tokens": 399850,
- "n_chars": 1630297
- },
- "yi_34b.cc100-es": {
- "vocab_size": 64000,
- "n_bytes": 1664455,
- "n_tokens": 577018,
- "n_chars": 1630297
- },
- "yi_6b.cc100-es": {
- "vocab_size": 64000,
- "n_bytes": 1664455,
- "n_tokens": 577018,
- "n_chars": 1630297
- },
- "yi_vl34b.cc100-es": {
- "vocab_size": 64000,
- "n_bytes": 1664455,
- "n_tokens": 576794,
- "n_chars": 1630297
- },
- "zephyr_7b_beta.cc100-es": {
- "vocab_size": 32000,
- "n_bytes": 1664455,
- "n_tokens": 513915,
- "n_chars": 1630297
- },
- "aya_101.cc100-fr": {
- "vocab_size": 250100,
- "n_bytes": 1540504,
- "n_tokens": 470944,
- "n_chars": 1484970
- },
- "baichuan.cc100-fr": {
- "vocab_size": 64000,
- "n_bytes": 1540504,
- "n_tokens": 540430,
- "n_chars": 1484970
- },
- "baichuan2.cc100-fr": {
- "vocab_size": 125696,
- "n_bytes": 1540504,
- "n_tokens": 512313,
- "n_chars": 1484970
- },
- "bert_base_cased.cc100-fr": {
- "vocab_size": 28996,
- "n_bytes": 1540504,
- "n_tokens": 583210,
- "n_chars": 1484970
- },
- "bert_base_chinese.cc100-fr": {
- "vocab_size": 21128,
- "n_bytes": 1540504,
- "n_tokens": 553134,
- "n_chars": 1484970
- },
- "bert_base_uncased.cc100-fr": {
- "vocab_size": 30522,
- "n_bytes": 1540504,
- "n_tokens": 504075,
- "n_chars": 1484970
- },
- "bloom.cc100-fr": {
- "vocab_size": 250680,
- "n_bytes": 1540504,
- "n_tokens": 321639,
- "n_chars": 1484970
- },
- "byt5_small.cc100-fr": {
- "vocab_size": 384,
- "n_bytes": 1540504,
- "n_tokens": 1550504,
- "n_chars": 1484970
- },
- "character_glm_6b.cc100-fr": {
- "vocab_size": 64789,
- "n_bytes": 1540504,
- "n_tokens": 515052,
- "n_chars": 1484970
- },
- "chatglm2_6b.cc100-fr": {
- "vocab_size": 64787,
- "n_bytes": 1540504,
- "n_tokens": 515028,
- "n_chars": 1484970
- },
- "chatglm3_6b.cc100-fr": {
- "vocab_size": 64796,
- "n_bytes": 1540504,
- "n_tokens": 515052,
- "n_chars": 1484970
- },
- "chatglm_6b.cc100-fr": {
- "vocab_size": 150344,
- "n_bytes": 1540504,
- "n_tokens": 499261,
- "n_chars": 1484970
- },
- "chatyuan_large_v2.cc100-fr": {
- "vocab_size": 32128,
- "n_bytes": 1540504,
- "n_tokens": 822012,
- "n_chars": 1484970
- },
- "chinese_llama.cc100-fr": {
- "vocab_size": 49953,
- "n_bytes": 1540504,
- "n_tokens": 450352,
- "n_chars": 1484970
- },
- "chinese_llama2.cc100-fr": {
- "vocab_size": 55296,
- "n_bytes": 1540504,
- "n_tokens": 457243,
- "n_chars": 1484970
- },
- "code_davinci_002.cc100-fr": {
- "vocab_size": 50281,
- "n_bytes": 1540504,
- "n_tokens": 521776,
- "n_chars": 1484970
- },
- "crystal_coder.cc100-fr": {
- "vocab_size": 32022,
- "n_bytes": 1540504,
- "n_tokens": 447243,
- "n_chars": 1484970
- },
- "dbrx_instruct.cc100-fr": {
- "vocab_size": 100280,
- "n_bytes": 1540504,
- "n_tokens": 412685,
- "n_chars": 1484970
- },
- "deepseek_coder_33b_instruct.cc100-fr": {
- "vocab_size": 32022,
- "n_bytes": 1540504,
- "n_tokens": 537538,
- "n_chars": 1484970
- },
- "deepseek_llm_7b_base.cc100-fr": {
- "vocab_size": 100015,
- "n_bytes": 1540504,
- "n_tokens": 507693,
- "n_chars": 1484970
- },
- "falcon_180b.cc100-fr": {
- "vocab_size": 65024,
- "n_bytes": 1540504,
- "n_tokens": 407853,
- "n_chars": 1484970
- },
- "falcon_7b.cc100-fr": {
- "vocab_size": 65024,
- "n_bytes": 1540504,
- "n_tokens": 407853,
- "n_chars": 1484970
- },
- "fastchat_t5_3b.cc100-fr": {
- "vocab_size": 32110,
- "n_bytes": 1540504,
- "n_tokens": 717675,
- "n_chars": 1484970
- },
- "flan_t5_base.cc100-fr": {
- "vocab_size": 32100,
- "n_bytes": 1540504,
- "n_tokens": 476135,
- "n_chars": 1484970
- },
- "gemma_7b.cc100-fr": {
- "vocab_size": 256000,
- "n_bytes": 1540504,
- "n_tokens": 374551,
- "n_chars": 1484970
- },
- "gpt2.cc100-fr": {
- "vocab_size": 50257,
- "n_bytes": 1540504,
- "n_tokens": 521776,
- "n_chars": 1484970
- },
- "gpt2_chinese.cc100-fr": {
- "vocab_size": 21128,
- "n_bytes": 1540504,
- "n_tokens": 636442,
- "n_chars": 1484970
- },
- "gpt_35_turbo.cc100-fr": {
- "vocab_size": 100277,
- "n_bytes": 1540504,
- "n_tokens": 412685,
- "n_chars": 1484970
- },
- "gpt_4.cc100-fr": {
- "vocab_size": 100277,
- "n_bytes": 1540504,
- "n_tokens": 412685,
- "n_chars": 1484970
- },
- "gpt_nexo_20b.cc100-fr": {
- "vocab_size": 50277,
- "n_bytes": 1540504,
- "n_tokens": 458961,
- "n_chars": 1484970
- },
- "grok_1.cc100-fr": {
- "vocab_size": 131072,
- "n_bytes": 1540504,
- "n_tokens": 428298,
- "n_chars": 1484970
- },
- "internlm2_chat_7b.cc100-fr": {
- "vocab_size": 92544,
- "n_bytes": 1540504,
- "n_tokens": 496629,
- "n_chars": 1484970
- },
- "internlm2_math_7b.cc100-fr": {
- "vocab_size": 92544,
- "n_bytes": 1540504,
- "n_tokens": 496629,
- "n_chars": 1484970
- },
- "internlm_chat_7b.cc100-fr": {
- "vocab_size": 103168,
- "n_bytes": 1540504,
- "n_tokens": 495045,
- "n_chars": 1484970
- },
- "internlm_xcomposer_7b.cc100-fr": {
- "vocab_size": 103168,
- "n_bytes": 1540504,
- "n_tokens": 495045,
- "n_chars": 1484970
- },
- "jamba_v0_1.cc100-fr": {
- "vocab_size": 65536,
- "n_bytes": 1540504,
- "n_tokens": 412899,
- "n_chars": 1484970
- },
- "kplug.cc100-fr": {
- "vocab_size": 10261,
- "n_bytes": 1540504,
- "n_tokens": 638107,
- "n_chars": 1484970
- },
- "llama.cc100-fr": {
- "vocab_size": 32000,
- "n_bytes": 1540504,
- "n_tokens": 457243,
- "n_chars": 1484970
- },
- "llama2.cc100-fr": {
- "vocab_size": 32001,
- "n_bytes": 1540504,
- "n_tokens": 457243,
- "n_chars": 1484970
- },
- "llama3.cc100-fr": {
- "vocab_size": 128256,
- "n_bytes": 1540504,
- "n_tokens": 412146,
- "n_chars": 1484970
- },
- "mistral_7b.cc100-fr": {
- "vocab_size": 32000,
- "n_bytes": 1540504,
- "n_tokens": 476666,
- "n_chars": 1484970
- },
- "mixtral_8_7b.cc100-fr": {
- "vocab_size": 32000,
- "n_bytes": 1540504,
- "n_tokens": 476666,
- "n_chars": 1484970
- },
- "mobilebert_uncased.cc100-fr": {
- "vocab_size": 30522,
- "n_bytes": 1540504,
- "n_tokens": 504075,
- "n_chars": 1484970
- },
- "moss.cc100-fr": {
- "vocab_size": 106072,
- "n_bytes": 1540504,
- "n_tokens": 515669,
- "n_chars": 1484970
- },
- "mt5_large.cc100-fr": {
- "vocab_size": 250100,
- "n_bytes": 1540504,
- "n_tokens": 470944,
- "n_chars": 1484970
- },
- "olmo_7b.cc100-fr": {
- "vocab_size": 50280,
- "n_bytes": 1540504,
- "n_tokens": 458961,
- "n_chars": 1484970
- },
- "orion_14b_chat.cc100-fr": {
- "vocab_size": 84608,
- "n_bytes": 1540504,
- "n_tokens": 564107,
- "n_chars": 1484970
- },
- "phi_1.cc100-fr": {
- "vocab_size": 50295,
- "n_bytes": 1540504,
- "n_tokens": 521776,
- "n_chars": 1484970
- },
- "phi_2.cc100-fr": {
- "vocab_size": 50295,
- "n_bytes": 1540504,
- "n_tokens": 521776,
- "n_chars": 1484970
- },
- "phi_3_mini.cc100-fr": {
- "vocab_size": 32011,
- "n_bytes": 1540504,
- "n_tokens": 457243,
- "n_chars": 1484970
- },
- "pko_t5_large.cc100-fr": {
- "vocab_size": 50358,
- "n_bytes": 1540504,
- "n_tokens": 1044665,
- "n_chars": 1484970
- },
- "prompt_clue.cc100-fr": {
- "vocab_size": 32128,
- "n_bytes": 1540504,
- "n_tokens": 822012,
- "n_chars": 1484970
- },
- "qwen1_5_14b_chat.cc100-fr": {
- "vocab_size": 151646,
- "n_bytes": 1540504,
- "n_tokens": 413637,
- "n_chars": 1484970
- },
- "qwen_1_8b_chat.cc100-fr": {
- "vocab_size": 151851,
- "n_bytes": 1540504,
- "n_tokens": 413637,
- "n_chars": 1484970
- },
- "qwen_72b_chat.cc100-fr": {
- "vocab_size": 151851,
- "n_bytes": 1540504,
- "n_tokens": 413637,
- "n_chars": 1484970
- },
- "qwen_7b_chat.cc100-fr": {
- "vocab_size": 151851,
- "n_bytes": 1540504,
- "n_tokens": 413637,
- "n_chars": 1484970
- },
- "roberta_chinese_clue.cc100-fr": {
- "vocab_size": 8021,
- "n_bytes": 1540504,
- "n_tokens": 787363,
- "n_chars": 1484970
- },
- "skywork_13b_base.cc100-fr": {
- "vocab_size": 65519,
- "n_bytes": 1540504,
- "n_tokens": 457233,
- "n_chars": 1484970
- },
- "skywork_13b_math.cc100-fr": {
- "vocab_size": 65519,
- "n_bytes": 1540504,
- "n_tokens": 457233,
- "n_chars": 1484970
- },
- "solar_10_7b.cc100-fr": {
- "vocab_size": 32000,
- "n_bytes": 1540504,
- "n_tokens": 476666,
- "n_chars": 1484970
- },
- "starchat_alpha.cc100-fr": {
- "vocab_size": 49156,
- "n_bytes": 1540504,
- "n_tokens": 509958,
- "n_chars": 1484970
- },
- "switch_c_2048.cc100-fr": {
- "vocab_size": 32100,
- "n_bytes": 1540504,
- "n_tokens": 476133,
- "n_chars": 1484970
- },
- "t5_base.cc100-fr": {
- "vocab_size": 32100,
- "n_bytes": 1540504,
- "n_tokens": 476133,
- "n_chars": 1484970
- },
- "t5_large.cc100-fr": {
- "vocab_size": 32100,
- "n_bytes": 1540504,
- "n_tokens": 476133,
- "n_chars": 1484970
- },
- "t5_small.cc100-fr": {
- "vocab_size": 32100,
- "n_bytes": 1540504,
- "n_tokens": 476133,
- "n_chars": 1484970
- },
- "text_davinci_003.cc100-fr": {
- "vocab_size": 50281,
- "n_bytes": 1540504,
- "n_tokens": 521776,
- "n_chars": 1484970
- },
- "tigerbot_13b_chat_v2.cc100-fr": {
- "vocab_size": 60515,
- "n_bytes": 1540504,
- "n_tokens": 447372,
- "n_chars": 1484970
- },
- "tigerbot_70b_chat_v4_4k.cc100-fr": {
- "vocab_size": 65110,
- "n_bytes": 1540504,
- "n_tokens": 448567,
- "n_chars": 1484970
- },
- "wizardcoder_15b_v1.cc100-fr": {
- "vocab_size": 49153,
- "n_bytes": 1540504,
- "n_tokens": 509958,
- "n_chars": 1484970
- },
- "wizardcoder_python_7b_v1.cc100-fr": {
- "vocab_size": 32001,
- "n_bytes": 1540504,
- "n_tokens": 457243,
- "n_chars": 1484970
- },
- "wizardlm_7b_v1.cc100-fr": {
- "vocab_size": 32001,
- "n_bytes": 1540504,
- "n_tokens": 457243,
- "n_chars": 1484970
- },
- "wizardmath_70b_v1.cc100-fr": {
1833
- "vocab_size": 32002,
1834
- "n_bytes": 1540504,
1835
- "n_tokens": 457243,
1836
- "n_chars": 1484970
1837
- },
1838
- "xlm_roberta.cc100-fr": {
1839
- "vocab_size": 250002,
1840
- "n_bytes": 1540504,
1841
- "n_tokens": 405041,
1842
- "n_chars": 1484970
1843
- },
1844
- "yi_34b.cc100-fr": {
1845
- "vocab_size": 64000,
1846
- "n_bytes": 1540504,
1847
- "n_tokens": 533106,
1848
- "n_chars": 1484970
1849
- },
1850
- "yi_6b.cc100-fr": {
1851
- "vocab_size": 64000,
1852
- "n_bytes": 1540504,
1853
- "n_tokens": 533106,
1854
- "n_chars": 1484970
1855
- },
1856
- "yi_vl34b.cc100-fr": {
1857
- "vocab_size": 64000,
1858
- "n_bytes": 1540504,
1859
- "n_tokens": 532288,
1860
- "n_chars": 1484970
1861
- },
1862
- "zephyr_7b_beta.cc100-fr": {
1863
- "vocab_size": 32000,
1864
- "n_bytes": 1540504,
1865
- "n_tokens": 476666,
1866
- "n_chars": 1484970
1867
- },
1868
- "gpt_neox_japanese_2_7b.cc100-en": {
1869
- "vocab_size": 32000,
1870
- "n_bytes": 1124813,
1871
- "n_tokens": 1121413,
1872
- "n_chars": 1121360
1873
- },
1874
- "gpt_neox_japanese_2_7b.cc100-zh-Hans": {
1875
- "vocab_size": 32000,
1876
- "n_bytes": 2633047,
1877
- "n_tokens": 1049033,
1878
- "n_chars": 927311
1879
- },
1880
- "aya_101.cc100-ja": {
1881
- "vocab_size": 250100,
1882
- "n_bytes": 1774770,
1883
- "n_tokens": 300542,
1884
- "n_chars": 603065
1885
- },
1886
- "baichuan.cc100-ja": {
1887
- "vocab_size": 64000,
1888
- "n_bytes": 1774770,
1889
- "n_tokens": 591656,
1890
- "n_chars": 603065
1891
- },
1892
- "baichuan2.cc100-ja": {
1893
- "vocab_size": 125696,
1894
- "n_bytes": 1774770,
1895
- "n_tokens": 554936,
1896
- "n_chars": 603065
1897
- },
1898
- "bert_base_cased.cc100-ja": {
1899
- "vocab_size": 28996,
1900
- "n_bytes": 1774770,
1901
- "n_tokens": 410492,
1902
- "n_chars": 603065
1903
- },
1904
- "bert_base_chinese.cc100-ja": {
1905
- "vocab_size": 21128,
1906
- "n_bytes": 1774770,
1907
- "n_tokens": 396831,
1908
- "n_chars": 603065
1909
- },
1910
- "bert_base_uncased.cc100-ja": {
1911
- "vocab_size": 30522,
1912
- "n_bytes": 1774770,
1913
- "n_tokens": 580634,
1914
- "n_chars": 603065
1915
- },
1916
- "bloom.cc100-ja": {
1917
- "vocab_size": 250680,
1918
- "n_bytes": 1774770,
1919
- "n_tokens": 523592,
1920
- "n_chars": 603065
1921
- },
1922
- "byt5_small.cc100-ja": {
1923
- "vocab_size": 384,
1924
- "n_bytes": 1774770,
1925
- "n_tokens": 1784770,
1926
- "n_chars": 603065
1927
- },
1928
- "aya_101.cc100-ar": {
1929
- "vocab_size": 250100,
1930
- "n_bytes": 2813283,
1931
- "n_tokens": 631736,
1932
- "n_chars": 1560987
1933
- },
1934
- "baichuan.cc100-ar": {
1935
- "vocab_size": 64000,
1936
- "n_bytes": 2813283,
1937
- "n_tokens": 1422976,
1938
- "n_chars": 1560987
1939
- },
1940
- "baichuan2.cc100-ar": {
1941
- "vocab_size": 125696,
1942
- "n_bytes": 2813283,
1943
- "n_tokens": 1337285,
1944
- "n_chars": 1560987
1945
- },
1946
- "bert_base_cased.cc100-ar": {
1947
- "vocab_size": 28996,
1948
- "n_bytes": 2813283,
1949
- "n_tokens": 1232449,
1950
- "n_chars": 1560987
1951
- },
1952
- "bert_base_chinese.cc100-ar": {
1953
- "vocab_size": 21128,
1954
- "n_bytes": 2813283,
1955
- "n_tokens": 536389,
1956
- "n_chars": 1560987
1957
- },
1958
- "bert_base_uncased.cc100-ar": {
1959
- "vocab_size": 30522,
1960
- "n_bytes": 2813283,
1961
- "n_tokens": 1269370,
1962
- "n_chars": 1560987
1963
- },
1964
- "bloom.cc100-ar": {
1965
- "vocab_size": 250680,
1966
- "n_bytes": 2813283,
1967
- "n_tokens": 427489,
1968
- "n_chars": 1560987
1969
- },
1970
- "byt5_small.cc100-ar": {
1971
- "vocab_size": 384,
1972
- "n_bytes": 2813283,
1973
- "n_tokens": 2823283,
1974
- "n_chars": 1560987
1975
- },
1976
- "character_glm_6b.cc100-ar": {
1977
- "vocab_size": 64789,
1978
- "n_bytes": 2813283,
1979
- "n_tokens": 1441847,
1980
- "n_chars": 1560987
1981
- },
1982
- "chatglm2_6b.cc100-ar": {
1983
- "vocab_size": 64787,
1984
- "n_bytes": 2813283,
1985
- "n_tokens": 1441847,
1986
- "n_chars": 1560987
1987
- },
1988
- "chatglm3_6b.cc100-ar": {
1989
- "vocab_size": 64796,
1990
- "n_bytes": 2813283,
1991
- "n_tokens": 1441847,
1992
- "n_chars": 1560987
1993
- },
1994
- "chatglm_6b.cc100-ar": {
1995
- "vocab_size": 150344,
1996
- "n_bytes": 2813283,
1997
- "n_tokens": 1097200,
1998
- "n_chars": 1560987
1999
- },
2000
- "chatyuan_large_v2.cc100-ar": {
2001
- "vocab_size": 32128,
2002
- "n_bytes": 2813283,
2003
- "n_tokens": 1006313,
2004
- "n_chars": 1560987
2005
- },
2006
- "chinese_llama.cc100-ar": {
2007
- "vocab_size": 49953,
2008
- "n_bytes": 2813283,
2009
- "n_tokens": 1421625,
2010
- "n_chars": 1560987
2011
- },
2012
- "chinese_llama2.cc100-ar": {
2013
- "vocab_size": 55296,
2014
- "n_bytes": 2813283,
2015
- "n_tokens": 1432081,
2016
- "n_chars": 1560987
2017
- },
2018
- "code_davinci_002.cc100-ar": {
2019
- "vocab_size": 50281,
2020
- "n_bytes": 2813283,
2021
- "n_tokens": 1558111,
2022
- "n_chars": 1560987
2023
- },
2024
- "crystal_coder.cc100-ar": {
2025
- "vocab_size": 32022,
2026
- "n_bytes": 2813283,
2027
- "n_tokens": 1422081,
2028
- "n_chars": 1560987
2029
- },
2030
- "dbrx_instruct.cc100-ar": {
2031
- "vocab_size": 100280,
2032
- "n_bytes": 2813283,
2033
- "n_tokens": 1105640,
2034
- "n_chars": 1560987
2035
- },
2036
- "deepseek_coder_33b_instruct.cc100-ar": {
2037
- "vocab_size": 32022,
2038
- "n_bytes": 2813283,
2039
- "n_tokens": 1958863,
2040
- "n_chars": 1560987
2041
- },
2042
- "deepseek_llm_7b_base.cc100-ar": {
2043
- "vocab_size": 100015,
2044
- "n_bytes": 2813283,
2045
- "n_tokens": 1426103,
2046
- "n_chars": 1560987
2047
- },
2048
- "falcon_180b.cc100-ar": {
2049
- "vocab_size": 65024,
2050
- "n_bytes": 2813283,
2051
- "n_tokens": 1597443,
2052
- "n_chars": 1560987
2053
- },
2054
- "falcon_7b.cc100-ar": {
2055
- "vocab_size": 65024,
2056
- "n_bytes": 2813283,
2057
- "n_tokens": 1597443,
2058
- "n_chars": 1560987
2059
- },
2060
- "fastchat_t5_3b.cc100-ar": {
2061
- "vocab_size": 32110,
2062
- "n_bytes": 2813283,
2063
- "n_tokens": 832267,
2064
- "n_chars": 1560987
2065
- },
2066
- "flan_t5_base.cc100-ar": {
2067
- "vocab_size": 32100,
2068
- "n_bytes": 2813283,
2069
- "n_tokens": 568957,
2070
- "n_chars": 1560987
2071
- },
2072
- "gemma_7b.cc100-ar": {
2073
- "vocab_size": 256000,
2074
- "n_bytes": 2813283,
2075
- "n_tokens": 573788,
2076
- "n_chars": 1560987
2077
- },
2078
- "gpt2.cc100-ar": {
2079
- "vocab_size": 50257,
2080
- "n_bytes": 2813283,
2081
- "n_tokens": 1558111,
2082
- "n_chars": 1560987
2083
- },
2084
- "gpt2_chinese.cc100-ar": {
2085
- "vocab_size": 21128,
2086
- "n_bytes": 2813283,
2087
- "n_tokens": 617677,
2088
- "n_chars": 1560987
2089
- },
2090
- "gpt_35_turbo.cc100-ar": {
2091
- "vocab_size": 100277,
2092
- "n_bytes": 2813283,
2093
- "n_tokens": 1105640,
2094
- "n_chars": 1560987
2095
- },
2096
- "gpt_4.cc100-ar": {
2097
- "vocab_size": 100277,
2098
- "n_bytes": 2813283,
2099
- "n_tokens": 1105640,
2100
- "n_chars": 1560987
2101
- },
2102
- "gpt_neox_japanese_2_7b.cc100-ar": {
2103
- "vocab_size": 32000,
2104
- "n_bytes": 2813283,
2105
- "n_tokens": 2809195,
2106
- "n_chars": 1560987
2107
- },
2108
- "gpt_nexo_20b.cc100-ar": {
2109
- "vocab_size": 50277,
2110
- "n_bytes": 2813283,
2111
- "n_tokens": 1106277,
2112
- "n_chars": 1560987
2113
- },
2114
- "grok_1.cc100-ar": {
2115
- "vocab_size": 131072,
2116
- "n_bytes": 2813283,
2117
- "n_tokens": 1392088,
2118
- "n_chars": 1560987
2119
- },
2120
- "internlm2_chat_7b.cc100-ar": {
2121
- "vocab_size": 92544,
2122
- "n_bytes": 2813283,
2123
- "n_tokens": 1635378,
2124
- "n_chars": 1560987
2125
- },
2126
- "internlm2_math_7b.cc100-ar": {
2127
- "vocab_size": 92544,
2128
- "n_bytes": 2813283,
2129
- "n_tokens": 1635378,
2130
- "n_chars": 1560987
2131
- },
2132
- "internlm_chat_7b.cc100-ar": {
2133
- "vocab_size": 103168,
2134
- "n_bytes": 2813283,
2135
- "n_tokens": 532046,
2136
- "n_chars": 1560987
2137
- },
2138
- "internlm_xcomposer_7b.cc100-ar": {
2139
- "vocab_size": 103168,
2140
- "n_bytes": 2813283,
2141
- "n_tokens": 532046,
2142
- "n_chars": 1560987
2143
- },
2144
- "jamba_v0_1.cc100-ar": {
2145
- "vocab_size": 65536,
2146
- "n_bytes": 2813283,
2147
- "n_tokens": 727886,
2148
- "n_chars": 1560987
2149
- },
2150
- "kplug.cc100-ar": {
2151
- "vocab_size": 10261,
2152
- "n_bytes": 2813283,
2153
- "n_tokens": 331987,
2154
- "n_chars": 1560987
2155
- },
2156
- "llama.cc100-ar": {
2157
- "vocab_size": 32000,
2158
- "n_bytes": 2813283,
2159
- "n_tokens": 1432081,
2160
- "n_chars": 1560987
2161
- },
2162
- "llama2.cc100-ar": {
2163
- "vocab_size": 32001,
2164
- "n_bytes": 2813283,
2165
- "n_tokens": 1432081,
2166
- "n_chars": 1560987
2167
- },
2168
- "llama3.cc100-ar": {
2169
- "vocab_size": 128256,
2170
- "n_bytes": 2813283,
2171
- "n_tokens": 615514,
2172
- "n_chars": 1560987
2173
- },
2174
- "mistral_7b.cc100-ar": {
2175
- "vocab_size": 32000,
2176
- "n_bytes": 2813283,
2177
- "n_tokens": 1406319,
2178
- "n_chars": 1560987
2179
- },
2180
- "mixtral_8_7b.cc100-ar": {
2181
- "vocab_size": 32000,
2182
- "n_bytes": 2813283,
2183
- "n_tokens": 1406319,
2184
- "n_chars": 1560987
2185
- },
2186
- "mobilebert_uncased.cc100-ar": {
2187
- "vocab_size": 30522,
2188
- "n_bytes": 2813283,
2189
- "n_tokens": 1269370,
2190
- "n_chars": 1560987
2191
- },
2192
- "moss.cc100-ar": {
2193
- "vocab_size": 106072,
2194
- "n_bytes": 2813283,
2195
- "n_tokens": 1557671,
2196
- "n_chars": 1560987
2197
- },
2198
- "mt5_large.cc100-ar": {
2199
- "vocab_size": 250100,
2200
- "n_bytes": 2813283,
2201
- "n_tokens": 631736,
2202
- "n_chars": 1560987
2203
- },
2204
- "olmo_7b.cc100-ar": {
2205
- "vocab_size": 50280,
2206
- "n_bytes": 2813283,
2207
- "n_tokens": 1106277,
2208
- "n_chars": 1560987
2209
- },
2210
- "orion_14b_chat.cc100-ar": {
2211
- "vocab_size": 84608,
2212
- "n_bytes": 2813283,
2213
- "n_tokens": 1531053,
2214
- "n_chars": 1560987
2215
- },
2216
- "phi_1.cc100-ar": {
2217
- "vocab_size": 50295,
2218
- "n_bytes": 2813283,
2219
- "n_tokens": 1558111,
2220
- "n_chars": 1560987
2221
- },
2222
- "phi_2.cc100-ar": {
2223
- "vocab_size": 50295,
2224
- "n_bytes": 2813283,
2225
- "n_tokens": 1558111,
2226
- "n_chars": 1560987
2227
- },
2228
- "phi_3_mini.cc100-ar": {
2229
- "vocab_size": 32011,
2230
- "n_bytes": 2813283,
2231
- "n_tokens": 1432081,
2232
- "n_chars": 1560987
2233
- },
2234
- "pko_t5_large.cc100-ar": {
2235
- "vocab_size": 50358,
2236
- "n_bytes": 2813283,
2237
- "n_tokens": 2815586,
2238
- "n_chars": 1560987
2239
- },
2240
- "prompt_clue.cc100-ar": {
2241
- "vocab_size": 32128,
2242
- "n_bytes": 2813283,
2243
- "n_tokens": 1006313,
2244
- "n_chars": 1560987
2245
- },
2246
- "qwen1_5_14b_chat.cc100-ar": {
2247
- "vocab_size": 151646,
2248
- "n_bytes": 2813283,
2249
- "n_tokens": 614959,
2250
- "n_chars": 1560987
2251
- },
2252
- "qwen_1_8b_chat.cc100-ar": {
2253
- "vocab_size": 151851,
2254
- "n_bytes": 2813283,
2255
- "n_tokens": 614959,
2256
- "n_chars": 1560987
2257
- },
2258
- "qwen_72b_chat.cc100-ar": {
2259
- "vocab_size": 151851,
2260
- "n_bytes": 2813283,
2261
- "n_tokens": 614959,
2262
- "n_chars": 1560987
2263
- },
2264
- "qwen_7b_chat.cc100-ar": {
2265
- "vocab_size": 151851,
2266
- "n_bytes": 2813283,
2267
- "n_tokens": 614959,
2268
- "n_chars": 1560987
2269
- },
2270
- "roberta_chinese_clue.cc100-ar": {
2271
- "vocab_size": 8021,
2272
- "n_bytes": 2813283,
2273
- "n_tokens": 621762,
2274
- "n_chars": 1560987
2275
- },
2276
- "skywork_13b_base.cc100-ar": {
2277
- "vocab_size": 65519,
2278
- "n_bytes": 2813283,
2279
- "n_tokens": 1432065,
2280
- "n_chars": 1560987
2281
- },
2282
- "skywork_13b_math.cc100-ar": {
2283
- "vocab_size": 65519,
2284
- "n_bytes": 2813283,
2285
- "n_tokens": 1432065,
2286
- "n_chars": 1560987
2287
- },
2288
- "solar_10_7b.cc100-ar": {
2289
- "vocab_size": 32000,
2290
- "n_bytes": 2813283,
2291
- "n_tokens": 1406319,
2292
- "n_chars": 1560987
2293
- },
2294
- "starchat_alpha.cc100-ar": {
2295
- "vocab_size": 49156,
2296
- "n_bytes": 2813283,
2297
- "n_tokens": 1195640,
2298
- "n_chars": 1560987
2299
- },
2300
- "switch_c_2048.cc100-ar": {
2301
- "vocab_size": 32100,
2302
- "n_bytes": 2813283,
2303
- "n_tokens": 568855,
2304
- "n_chars": 1560987
2305
- },
2306
- "t5_base.cc100-ar": {
2307
- "vocab_size": 32100,
2308
- "n_bytes": 2813283,
2309
- "n_tokens": 568855,
2310
- "n_chars": 1560987
2311
- },
2312
- "t5_large.cc100-ar": {
2313
- "vocab_size": 32100,
2314
- "n_bytes": 2813283,
2315
- "n_tokens": 568855,
2316
- "n_chars": 1560987
2317
- },
2318
- "t5_small.cc100-ar": {
2319
- "vocab_size": 32100,
2320
- "n_bytes": 2813283,
2321
- "n_tokens": 568855,
2322
- "n_chars": 1560987
2323
- },
2324
- "text_davinci_003.cc100-ar": {
2325
- "vocab_size": 50281,
2326
- "n_bytes": 2813283,
2327
- "n_tokens": 1558111,
2328
- "n_chars": 1560987
2329
- },
2330
- "tigerbot_13b_chat_v2.cc100-ar": {
2331
- "vocab_size": 60515,
2332
- "n_bytes": 2813283,
2333
- "n_tokens": 1422070,
2334
- "n_chars": 1560987
2335
- },
2336
- "tigerbot_70b_chat_v4_4k.cc100-ar": {
2337
- "vocab_size": 65110,
2338
- "n_bytes": 2813283,
2339
- "n_tokens": 1422073,
2340
- "n_chars": 1560987
2341
- },
2342
- "wizardcoder_15b_v1.cc100-ar": {
2343
- "vocab_size": 49153,
2344
- "n_bytes": 2813283,
2345
- "n_tokens": 1195640,
2346
- "n_chars": 1560987
2347
- },
2348
- "wizardcoder_python_7b_v1.cc100-ar": {
2349
- "vocab_size": 32001,
2350
- "n_bytes": 2813283,
2351
- "n_tokens": 1432081,
2352
- "n_chars": 1560987
2353
- },
2354
- "wizardlm_7b_v1.cc100-ar": {
2355
- "vocab_size": 32001,
2356
- "n_bytes": 2813283,
2357
- "n_tokens": 1432081,
2358
- "n_chars": 1560987
2359
- },
2360
- "wizardmath_70b_v1.cc100-ar": {
2361
- "vocab_size": 32002,
2362
- "n_bytes": 2813283,
2363
- "n_tokens": 1432081,
2364
- "n_chars": 1560987
2365
- },
2366
- "xlm_roberta.cc100-ar": {
2367
- "vocab_size": 250002,
2368
- "n_bytes": 2813283,
2369
- "n_tokens": 518287,
2370
- "n_chars": 1560987
2371
- },
2372
- "yi_34b.cc100-ar": {
2373
- "vocab_size": 64000,
2374
- "n_bytes": 2813283,
2375
- "n_tokens": 1795801,
2376
- "n_chars": 1560987
2377
- },
2378
- "yi_6b.cc100-ar": {
2379
- "vocab_size": 64000,
2380
- "n_bytes": 2813283,
2381
- "n_tokens": 1795801,
2382
- "n_chars": 1560987
2383
- },
2384
- "yi_vl34b.cc100-ar": {
2385
- "vocab_size": 64000,
2386
- "n_bytes": 2813283,
2387
- "n_tokens": 1803957,
2388
- "n_chars": 1560987
2389
- },
2390
- "zephyr_7b_beta.cc100-ar": {
2391
- "vocab_size": 32000,
2392
- "n_bytes": 2813283,
2393
- "n_tokens": 1406319,
2394
- "n_chars": 1560987
2395
- },
2396
- "aya_101.cc100-de": {
2397
- "vocab_size": 250100,
2398
- "n_bytes": 1814876,
2399
- "n_tokens": 480418,
2400
- "n_chars": 1784021
2401
- },
2402
- "baichuan.cc100-de": {
2403
- "vocab_size": 64000,
2404
- "n_bytes": 1814876,
2405
- "n_tokens": 680512,
2406
- "n_chars": 1784021
2407
- },
2408
- "baichuan2.cc100-de": {
2409
- "vocab_size": 125696,
2410
- "n_bytes": 1814876,
2411
- "n_tokens": 628063,
2412
- "n_chars": 1784021
2413
- },
2414
- "bert_base_cased.cc100-de": {
2415
- "vocab_size": 28996,
2416
- "n_bytes": 1814876,
2417
- "n_tokens": 731093,
2418
- "n_chars": 1784021
2419
- },
2420
- "bert_base_chinese.cc100-de": {
2421
- "vocab_size": 21128,
2422
- "n_bytes": 1814876,
2423
- "n_tokens": 561246,
2424
- "n_chars": 1784021
2425
- },
2426
- "bert_base_uncased.cc100-de": {
2427
- "vocab_size": 30522,
2428
- "n_bytes": 1814876,
2429
- "n_tokens": 646485,
2430
- "n_chars": 1784021
2431
- },
2432
- "bloom.cc100-de": {
2433
- "vocab_size": 250680,
2434
- "n_bytes": 1814876,
2435
- "n_tokens": 541170,
2436
- "n_chars": 1784021
2437
- },
2438
- "byt5_small.cc100-de": {
2439
- "vocab_size": 384,
2440
- "n_bytes": 1814876,
2441
- "n_tokens": 1824876,
2442
- "n_chars": 1784021
2443
- },
2444
- "character_glm_6b.cc100-de": {
2445
- "vocab_size": 64789,
2446
- "n_bytes": 1814876,
2447
- "n_tokens": 639822,
2448
- "n_chars": 1784021
2449
- },
2450
- "chatglm2_6b.cc100-de": {
2451
- "vocab_size": 64787,
2452
- "n_bytes": 1814876,
2453
- "n_tokens": 639757,
2454
- "n_chars": 1784021
2455
- },
2456
- "chatglm3_6b.cc100-de": {
2457
- "vocab_size": 64796,
2458
- "n_bytes": 1814876,
2459
- "n_tokens": 639822,
2460
- "n_chars": 1784021
2461
- },
2462
- "chatglm_6b.cc100-de": {
2463
- "vocab_size": 150344,
2464
- "n_bytes": 1814876,
2465
- "n_tokens": 589464,
2466
- "n_chars": 1784021
2467
- },
2468
- "chatyuan_large_v2.cc100-de": {
2469
- "vocab_size": 32128,
2470
- "n_bytes": 1814876,
2471
- "n_tokens": 970463,
2472
- "n_chars": 1784021
2473
- },
2474
- "chinese_llama.cc100-de": {
2475
- "vocab_size": 49953,
2476
- "n_bytes": 1814876,
2477
- "n_tokens": 523859,
2478
- "n_chars": 1784021
2479
- },
2480
- "chinese_llama2.cc100-de": {
2481
- "vocab_size": 55296,
2482
- "n_bytes": 1814876,
2483
- "n_tokens": 537318,
2484
- "n_chars": 1784021
2485
- },
2486
- "code_davinci_002.cc100-de": {
2487
- "vocab_size": 50281,
2488
- "n_bytes": 1814876,
2489
- "n_tokens": 684666,
2490
- "n_chars": 1784021
2491
- },
2492
- "crystal_coder.cc100-de": {
2493
- "vocab_size": 32022,
2494
- "n_bytes": 1814876,
2495
- "n_tokens": 527320,
2496
- "n_chars": 1784021
2497
- },
2498
- "dbrx_instruct.cc100-de": {
2499
- "vocab_size": 100280,
2500
- "n_bytes": 1814876,
2501
- "n_tokens": 500870,
2502
- "n_chars": 1784021
2503
- },
2504
- "deepseek_coder_33b_instruct.cc100-de": {
2505
- "vocab_size": 32022,
2506
- "n_bytes": 1814876,
2507
- "n_tokens": 745618,
2508
- "n_chars": 1784021
2509
- },
2510
- "deepseek_llm_7b_base.cc100-de": {
2511
- "vocab_size": 100015,
2512
- "n_bytes": 1814876,
2513
- "n_tokens": 642573,
2514
- "n_chars": 1784021
2515
- },
2516
- "falcon_180b.cc100-de": {
2517
- "vocab_size": 65024,
2518
- "n_bytes": 1814876,
2519
- "n_tokens": 497054,
2520
- "n_chars": 1784021
2521
- },
2522
- "falcon_7b.cc100-de": {
2523
- "vocab_size": 65024,
2524
- "n_bytes": 1814876,
2525
- "n_tokens": 497054,
2526
- "n_chars": 1784021
2527
- },
2528
- "fastchat_t5_3b.cc100-de": {
2529
- "vocab_size": 32110,
2530
- "n_bytes": 1814876,
2531
- "n_tokens": 736989,
2532
- "n_chars": 1784021
2533
- },
2534
- "flan_t5_base.cc100-de": {
2535
- "vocab_size": 32100,
2536
- "n_bytes": 1814876,
2537
- "n_tokens": 480254,
2538
- "n_chars": 1784021
2539
- },
2540
- "gemma_7b.cc100-de": {
2541
- "vocab_size": 256000,
2542
- "n_bytes": 1814876,
2543
- "n_tokens": 416876,
2544
- "n_chars": 1784021
2545
- },
2546
- "gpt2.cc100-de": {
2547
- "vocab_size": 50257,
2548
- "n_bytes": 1814876,
2549
- "n_tokens": 684669,
2550
- "n_chars": 1784021
2551
- },
2552
- "gpt2_chinese.cc100-de": {
2553
- "vocab_size": 21128,
2554
- "n_bytes": 1814876,
2555
- "n_tokens": 786497,
2556
- "n_chars": 1784021
2557
- },
2558
- "gpt_35_turbo.cc100-de": {
2559
- "vocab_size": 100277,
2560
- "n_bytes": 1814876,
2561
- "n_tokens": 500870,
2562
- "n_chars": 1784021
2563
- },
2564
- "gpt_4.cc100-de": {
2565
- "vocab_size": 100277,
2566
- "n_bytes": 1814876,
2567
- "n_tokens": 500870,
2568
- "n_chars": 1784021
2569
- },
2570
- "gpt_neox_japanese_2_7b.cc100-de": {
2571
- "vocab_size": 32000,
2572
- "n_bytes": 1814876,
2573
- "n_tokens": 1807780,
2574
- "n_chars": 1784021
2575
- },
2576
- "gpt_nexo_20b.cc100-de": {
2577
- "vocab_size": 50277,
2578
- "n_bytes": 1814876,
2579
- "n_tokens": 583628,
2580
- "n_chars": 1784021
2581
- },
2582
- "grok_1.cc100-de": {
2583
- "vocab_size": 131072,
2584
- "n_bytes": 1814876,
2585
- "n_tokens": 505220,
2586
- "n_chars": 1784021
2587
- },
2588
- "internlm2_chat_7b.cc100-de": {
2589
- "vocab_size": 92544,
2590
- "n_bytes": 1814876,
2591
- "n_tokens": 583917,
2592
- "n_chars": 1784021
2593
- },
2594
- "internlm2_math_7b.cc100-de": {
2595
- "vocab_size": 92544,
2596
- "n_bytes": 1814876,
2597
- "n_tokens": 583917,
2598
- "n_chars": 1784021
2599
- },
2600
- "internlm_chat_7b.cc100-de": {
2601
- "vocab_size": 103168,
2602
- "n_bytes": 1814876,
2603
- "n_tokens": 580489,
2604
- "n_chars": 1784021
2605
- },
2606
- "internlm_xcomposer_7b.cc100-de": {
2607
- "vocab_size": 103168,
2608
- "n_bytes": 1814876,
2609
- "n_tokens": 580489,
2610
- "n_chars": 1784021
2611
- },
2612
- "jamba_v0_1.cc100-de": {
2613
- "vocab_size": 65536,
2614
- "n_bytes": 1814876,
2615
- "n_tokens": 535856,
2616
- "n_chars": 1784021
2617
- },
2618
- "kplug.cc100-de": {
2619
- "vocab_size": 10261,
2620
- "n_bytes": 1814876,
2621
- "n_tokens": 789053,
2622
- "n_chars": 1784021
2623
- },
2624
- "llama.cc100-de": {
2625
- "vocab_size": 32000,
2626
- "n_bytes": 1814876,
2627
- "n_tokens": 537320,
2628
- "n_chars": 1784021
2629
- },
2630
- "llama2.cc100-de": {
2631
- "vocab_size": 32001,
2632
- "n_bytes": 1814876,
2633
- "n_tokens": 537320,
2634
- "n_chars": 1784021
2635
- },
2636
- "llama3.cc100-de": {
2637
- "vocab_size": 128256,
2638
- "n_bytes": 1814876,
2639
- "n_tokens": 499766,
2640
- "n_chars": 1784021
2641
- },
2642
- "mistral_7b.cc100-de": {
2643
- "vocab_size": 32000,
2644
- "n_bytes": 1814876,
2645
- "n_tokens": 577526,
2646
- "n_chars": 1784021
2647
- },
2648
- "mixtral_8_7b.cc100-de": {
2649
- "vocab_size": 32000,
2650
- "n_bytes": 1814876,
2651
- "n_tokens": 577526,
2652
- "n_chars": 1784021
2653
- },
2654
- "mobilebert_uncased.cc100-de": {
2655
- "vocab_size": 30522,
2656
- "n_bytes": 1814876,
2657
- "n_tokens": 646485,
2658
- "n_chars": 1784021
2659
- },
2660
- "moss.cc100-de": {
2661
- "vocab_size": 106072,
2662
- "n_bytes": 1814876,
2663
- "n_tokens": 683401,
2664
- "n_chars": 1784021
2665
- },
2666
- "mt5_large.cc100-de": {
2667
- "vocab_size": 250100,
2668
- "n_bytes": 1814876,
2669
- "n_tokens": 480418,
2670
- "n_chars": 1784021
2671
- },
2672
- "olmo_7b.cc100-de": {
2673
- "vocab_size": 50280,
2674
- "n_bytes": 1814876,
2675
- "n_tokens": 583628,
2676
- "n_chars": 1784021
2677
- },
2678
- "orion_14b_chat.cc100-de": {
2679
- "vocab_size": 84608,
2680
- "n_bytes": 1814876,
2681
- "n_tokens": 744404,
2682
- "n_chars": 1784021
2683
- },
2684
- "phi_1.cc100-de": {
2685
- "vocab_size": 50295,
2686
- "n_bytes": 1814876,
2687
- "n_tokens": 684665,
2688
- "n_chars": 1784021
2689
- },
2690
- "phi_2.cc100-de": {
2691
- "vocab_size": 50295,
2692
- "n_bytes": 1814876,
2693
- "n_tokens": 684665,
2694
- "n_chars": 1784021
2695
- },
2696
- "phi_3_mini.cc100-de": {
2697
- "vocab_size": 32011,
2698
- "n_bytes": 1814876,
2699
- "n_tokens": 537320,
2700
- "n_chars": 1784021
2701
- },
2702
- "pko_t5_large.cc100-de": {
2703
- "vocab_size": 50358,
2704
- "n_bytes": 1814876,
2705
- "n_tokens": 1254350,
2706
- "n_chars": 1784021
2707
- },
2708
- "prompt_clue.cc100-de": {
2709
- "vocab_size": 32128,
2710
- "n_bytes": 1814876,
2711
- "n_tokens": 970463,
2712
- "n_chars": 1784021
2713
- },
2714
- "qwen1_5_14b_chat.cc100-de": {
2715
- "vocab_size": 151646,
2716
- "n_bytes": 1814876,
2717
- "n_tokens": 503561,
2718
- "n_chars": 1784021
2719
- },
2720
- "qwen_1_8b_chat.cc100-de": {
2721
- "vocab_size": 151851,
2722
- "n_bytes": 1814876,
2723
- "n_tokens": 503561,
2724
- "n_chars": 1784021
2725
- },
2726
- "qwen_72b_chat.cc100-de": {
2727
- "vocab_size": 151851,
2728
- "n_bytes": 1814876,
2729
- "n_tokens": 503561,
2730
- "n_chars": 1784021
2731
- },
2732
- "qwen_7b_chat.cc100-de": {
2733
- "vocab_size": 151851,
2734
- "n_bytes": 1814876,
2735
- "n_tokens": 503561,
2736
- "n_chars": 1784021
2737
- },
2738
- "roberta_chinese_clue.cc100-de": {
2739
- "vocab_size": 8021,
2740
- "n_bytes": 1814876,
2741
- "n_tokens": 915612,
2742
- "n_chars": 1784021
2743
- },
2744
- "skywork_13b_base.cc100-de": {
2745
- "vocab_size": 65519,
2746
- "n_bytes": 1814876,
2747
- "n_tokens": 537308,
2748
- "n_chars": 1784021
2749
- },
2750
- "skywork_13b_math.cc100-de": {
2751
- "vocab_size": 65519,
2752
- "n_bytes": 1814876,
2753
- "n_tokens": 537308,
2754
- "n_chars": 1784021
2755
- },
2756
- "solar_10_7b.cc100-de": {
2757
- "vocab_size": 32000,
2758
- "n_bytes": 1814876,
2759
- "n_tokens": 577526,
2760
- "n_chars": 1784021
2761
- },
2762
- "starchat_alpha.cc100-de": {
2763
- "vocab_size": 49156,
2764
- "n_bytes": 1814876,
2765
- "n_tokens": 620541,
2766
- "n_chars": 1784021
2767
- },
2768
- "switch_c_2048.cc100-de": {
2769
- "vocab_size": 32100,
2770
- "n_bytes": 1814876,
2771
- "n_tokens": 480254,
2772
- "n_chars": 1784021
2773
- },
2774
- "t5_base.cc100-de": {
2775
- "vocab_size": 32100,
2776
- "n_bytes": 1814876,
2777
- "n_tokens": 480254,
2778
- "n_chars": 1784021
2779
- },
2780
- "t5_large.cc100-de": {
2781
- "vocab_size": 32100,
2782
- "n_bytes": 1814876,
2783
- "n_tokens": 480254,
2784
- "n_chars": 1784021
2785
- },
2786
- "t5_small.cc100-de": {
2787
- "vocab_size": 32100,
2788
- "n_bytes": 1814876,
2789
- "n_tokens": 480254,
2790
- "n_chars": 1784021
2791
- },
2792
- "text_davinci_003.cc100-de": {
2793
- "vocab_size": 50281,
2794
- "n_bytes": 1814876,
2795
- "n_tokens": 684666,
2796
- "n_chars": 1784021
2797
- },
2798
- "tigerbot_13b_chat_v2.cc100-de": {
2799
- "vocab_size": 60515,
2800
- "n_bytes": 1814876,
2801
- "n_tokens": 528918,
2802
- "n_chars": 1784021
2803
- },
2804
- "tigerbot_70b_chat_v4_4k.cc100-de": {
2805
- "vocab_size": 65110,
2806
- "n_bytes": 1814876,
2807
- "n_tokens": 529170,
2808
- "n_chars": 1784021
2809
- },
2810
- "wizardcoder_15b_v1.cc100-de": {
2811
- "vocab_size": 49153,
2812
- "n_bytes": 1814876,
2813
- "n_tokens": 620541,
2814
- "n_chars": 1784021
2815
- },
2816
- "wizardcoder_python_7b_v1.cc100-de": {
2817
- "vocab_size": 32001,
2818
- "n_bytes": 1814876,
2819
- "n_tokens": 537320,
2820
- "n_chars": 1784021
2821
- },
2822
- "wizardlm_7b_v1.cc100-de": {
2823
- "vocab_size": 32001,
2824
- "n_bytes": 1814876,
2825
- "n_tokens": 537320,
2826
- "n_chars": 1784021
2827
- },
2828
- "wizardmath_70b_v1.cc100-de": {
2829
- "vocab_size": 32002,
2830
- "n_bytes": 1814876,
2831
- "n_tokens": 537320,
2832
- "n_chars": 1784021
2833
- },
2834
- "xlm_roberta.cc100-de": {
2835
- "vocab_size": 250002,
2836
- "n_bytes": 1814876,
2837
- "n_tokens": 432571,
2838
- "n_chars": 1784021
2839
- },
2840
- "yi_34b.cc100-de": {
2841
- "vocab_size": 64000,
2842
- "n_bytes": 1814876,
2843
- "n_tokens": 698366,
2844
- "n_chars": 1784021
2845
- },
2846
- "yi_6b.cc100-de": {
2847
- "vocab_size": 64000,
2848
- "n_bytes": 1814876,
2849
- "n_tokens": 698366,
2850
- "n_chars": 1784021
2851
- },
2852
- "yi_vl34b.cc100-de": {
2853
- "vocab_size": 64000,
2854
- "n_bytes": 1814876,
2855
- "n_tokens": 697065,
2856
- "n_chars": 1784021
2857
- },
2858
- "zephyr_7b_beta.cc100-de": {
2859
- "vocab_size": 32000,
2860
- "n_bytes": 1814876,
2861
- "n_tokens": 577526,
2862
- "n_chars": 1784021
2863
- },
2864
- "gpt_neox_japanese_2_7b.cc100-es": {
2865
- "vocab_size": 32000,
2866
- "n_bytes": 1664455,
2867
- "n_tokens": 1658946,
2868
- "n_chars": 1630297
2869
- },
2870
- "gpt_neox_japanese_2_7b.cc100-fr": {
2871
- "vocab_size": 32000,
2872
- "n_bytes": 1540504,
2873
- "n_tokens": 1524129,
2874
- "n_chars": 1484970
2875
- },
2876
- "character_glm_6b.cc100-ja": {
2877
- "vocab_size": 64789,
2878
- "n_bytes": 1774770,
2879
- "n_tokens": 601380,
2880
- "n_chars": 603065
2881
- },
2882
- "chatglm2_6b.cc100-ja": {
2883
- "vocab_size": 64787,
2884
- "n_bytes": 1774770,
2885
- "n_tokens": 601380,
2886
- "n_chars": 603065
2887
- },
2888
- "chatglm3_6b.cc100-ja": {
2889
- "vocab_size": 64796,
2890
- "n_bytes": 1774770,
2891
- "n_tokens": 601380,
2892
- "n_chars": 603065
2893
- },
2894
- "chatglm_6b.cc100-ja": {
2895
- "vocab_size": 150344,
2896
- "n_bytes": 1774770,
2897
- "n_tokens": 489930,
2898
- "n_chars": 603065
2899
- },
2900
- "chatyuan_large_v2.cc100-ja": {
2901
- "vocab_size": 32128,
2902
- "n_bytes": 1774770,
2903
- "n_tokens": 575118,
2904
- "n_chars": 603065
2905
- },
2906
- "chinese_llama.cc100-ja": {
2907
- "vocab_size": 49953,
2908
- "n_bytes": 1774770,
2909
- "n_tokens": 614177,
2910
- "n_chars": 603065
2911
- },
2912
- "chinese_llama2.cc100-ja": {
2913
- "vocab_size": 55296,
2914
- "n_bytes": 1774770,
2915
- "n_tokens": 624362,
2916
- "n_chars": 603065
2917
- },
2918
- "code_davinci_002.cc100-ja": {
2919
- "vocab_size": 50281,
2920
- "n_bytes": 1774770,
2921
- "n_tokens": 844362,
2922
- "n_chars": 603065
2923
- },
2924
- "crystal_coder.cc100-ja": {
2925
- "vocab_size": 32022,
2926
- "n_bytes": 1774770,
2927
- "n_tokens": 718461,
2928
- "n_chars": 603065
2929
- },
2930
- "dbrx_instruct.cc100-ja": {
2931
- "vocab_size": 100280,
2932
- "n_bytes": 1774770,
2933
- "n_tokens": 630348,
2934
- "n_chars": 603065
2935
- },
2936
- "deepseek_coder_33b_instruct.cc100-ja": {
2937
- "vocab_size": 32022,
2938
- "n_bytes": 1774770,
2939
- "n_tokens": 1018060,
2940
- "n_chars": 603065
2941
- },
2942
- "deepseek_llm_7b_base.cc100-ja": {
2943
- "vocab_size": 100015,
2944
- "n_bytes": 1774770,
2945
- "n_tokens": 761467,
2946
- "n_chars": 603065
2947
- },
2948
- "falcon_180b.cc100-ja": {
2949
- "vocab_size": 65024,
2950
- "n_bytes": 1774770,
2951
- "n_tokens": 842458,
2952
- "n_chars": 603065
2953
- },
2954
- "falcon_7b.cc100-ja": {
2955
- "vocab_size": 65024,
2956
- "n_bytes": 1774770,
2957
- "n_tokens": 842458,
2958
- "n_chars": 603065
2959
- },
2960
- "fastchat_t5_3b.cc100-ja": {
2961
- "vocab_size": 32110,
2962
- "n_bytes": 1774770,
2963
- "n_tokens": 53915,
2964
- "n_chars": 603065
2965
- },
2966
- "flan_t5_base.cc100-ja": {
2967
- "vocab_size": 32100,
2968
- "n_bytes": 1774770,
2969
- "n_tokens": 51999,
2970
- "n_chars": 603065
2971
- },
2972
- "gemma_7b.cc100-ja": {
2973
- "vocab_size": 256000,
2974
- "n_bytes": 1774770,
2975
- "n_tokens": 317873,
2976
- "n_chars": 603065
2977
- },
2978
- "gpt2.cc100-ja": {
2979
- "vocab_size": 50257,
2980
- "n_bytes": 1774770,
2981
- "n_tokens": 844362,
2982
- "n_chars": 603065
2983
- },
2984
- "gpt2_chinese.cc100-ja": {
2985
- "vocab_size": 21128,
2986
- "n_bytes": 1774770,
2987
- "n_tokens": 503085,
2988
- "n_chars": 603065
2989
- },
2990
- "gpt_35_turbo.cc100-ja": {
2991
- "vocab_size": 100277,
2992
- "n_bytes": 1774770,
2993
- "n_tokens": 630348,
2994
- "n_chars": 603065
2995
- },
2996
- "gpt_4.cc100-ja": {
2997
- "vocab_size": 100277,
2998
- "n_bytes": 1774770,
2999
- "n_tokens": 630348,
3000
- "n_chars": 603065
3001
- },
3002
- "gpt_neox_japanese_2_7b.cc100-ja": {
3003
- "vocab_size": 32000,
3004
- "n_bytes": 1774770,
3005
- "n_tokens": 410803,
3006
- "n_chars": 603065
3007
- },
3008
- "gpt_nexo_20b.cc100-ja": {
3009
- "vocab_size": 50277,
3010
- "n_bytes": 1774770,
3011
- "n_tokens": 605168,
3012
- "n_chars": 603065
3013
- },
3014
- "grok_1.cc100-ja": {
3015
- "vocab_size": 131072,
3016
- "n_bytes": 1774770,
3017
- "n_tokens": 497590,
3018
- "n_chars": 603065
3019
- },
3020
- "internlm2_chat_7b.cc100-ja": {
3021
- "vocab_size": 92544,
3022
- "n_bytes": 1774770,
3023
- "n_tokens": 595803,
3024
- "n_chars": 603065
3025
- },
3026
- "internlm2_math_7b.cc100-ja": {
3027
- "vocab_size": 92544,
3028
- "n_bytes": 1774770,
3029
- "n_tokens": 595803,
3030
- "n_chars": 603065
3031
- },
3032
- "internlm_chat_7b.cc100-ja": {
3033
- "vocab_size": 103168,
3034
- "n_bytes": 1774770,
3035
- "n_tokens": 448212,
3036
- "n_chars": 603065
3037
- },
3038
- "internlm_xcomposer_7b.cc100-ja": {
3039
- "vocab_size": 103168,
3040
- "n_bytes": 1774770,
3041
- "n_tokens": 448212,
3042
- "n_chars": 603065
3043
- },
3044
- "jamba_v0_1.cc100-ja": {
3045
- "vocab_size": 65536,
3046
- "n_bytes": 1774770,
3047
- "n_tokens": 683256,
3048
- "n_chars": 603065
3049
- },
3050
- "kplug.cc100-ja": {
3051
- "vocab_size": 10261,
3052
- "n_bytes": 1774770,
3053
- "n_tokens": 338023,
3054
- "n_chars": 603065
3055
- },
3056
- "llama.cc100-ja": {
3057
- "vocab_size": 32000,
3058
- "n_bytes": 1774770,
3059
- "n_tokens": 728461,
3060
- "n_chars": 603065
3061
- },
3062
- "llama2.cc100-ja": {
3063
- "vocab_size": 32001,
3064
- "n_bytes": 1774770,
3065
- "n_tokens": 728461,
3066
- "n_chars": 603065
3067
- },
3068
- "llama3.cc100-ja": {
3069
- "vocab_size": 128256,
3070
- "n_bytes": 1774770,
3071
- "n_tokens": 414715,
3072
- "n_chars": 603065
3073
- },
3074
- "mistral_7b.cc100-ja": {
3075
- "vocab_size": 32000,
3076
- "n_bytes": 1774770,
3077
- "n_tokens": 685134,
3078
- "n_chars": 603065
3079
- },
3080
- "mixtral_8_7b.cc100-ja": {
3081
- "vocab_size": 32000,
3082
- "n_bytes": 1774770,
3083
- "n_tokens": 685134,
3084
- "n_chars": 603065
3085
- },
3086
- "mobilebert_uncased.cc100-ja": {
3087
- "vocab_size": 30522,
3088
- "n_bytes": 1774770,
3089
- "n_tokens": 580634,
3090
- "n_chars": 603065
3091
- },
3092
- "moss.cc100-ja": {
3093
- "vocab_size": 106072,
3094
- "n_bytes": 1774770,
3095
- "n_tokens": 600011,
3096
- "n_chars": 603065
3097
- },
3098
- "mt5_large.cc100-ja": {
3099
- "vocab_size": 250100,
3100
- "n_bytes": 1774770,
3101
- "n_tokens": 300542,
3102
- "n_chars": 603065
3103
- },
3104
- "olmo_7b.cc100-ja": {
3105
- "vocab_size": 50280,
3106
- "n_bytes": 1774770,
3107
- "n_tokens": 605168,
3108
- "n_chars": 603065
3109
- },
3110
- "orion_14b_chat.cc100-ja": {
3111
- "vocab_size": 84608,
3112
- "n_bytes": 1774770,
3113
- "n_tokens": 324956,
3114
- "n_chars": 603065
3115
- },
3116
- "phi_1.cc100-ja": {
3117
- "vocab_size": 50295,
3118
- "n_bytes": 1774770,
3119
- "n_tokens": 844362,
3120
- "n_chars": 603065
3121
- },
3122
- "phi_2.cc100-ja": {
3123
- "vocab_size": 50295,
3124
- "n_bytes": 1774770,
3125
- "n_tokens": 844362,
3126
- "n_chars": 603065
3127
- },
3128
- "phi_3_mini.cc100-ja": {
3129
- "vocab_size": 32011,
3130
- "n_bytes": 1774770,
3131
- "n_tokens": 728461,
3132
- "n_chars": 603065
3133
- },
3134
- "pko_t5_large.cc100-ja": {
3135
- "vocab_size": 50358,
3136
- "n_bytes": 1774770,
3137
- "n_tokens": 1766950,
3138
- "n_chars": 603065
3139
- },
3140
- "prompt_clue.cc100-ja": {
3141
- "vocab_size": 32128,
3142
- "n_bytes": 1774770,
3143
- "n_tokens": 575118,
3144
- "n_chars": 603065
3145
- },
3146
- "qwen1_5_14b_chat.cc100-ja": {
3147
- "vocab_size": 151646,
3148
- "n_bytes": 1774770,
3149
- "n_tokens": 377144,
3150
- "n_chars": 603065
3151
- },
3152
- "qwen_1_8b_chat.cc100-ja": {
3153
- "vocab_size": 151851,
3154
- "n_bytes": 1774770,
3155
- "n_tokens": 377144,
3156
- "n_chars": 603065
3157
- },
3158
- "qwen_72b_chat.cc100-ja": {
3159
- "vocab_size": 151851,
3160
- "n_bytes": 1774770,
3161
- "n_tokens": 377144,
3162
- "n_chars": 603065
3163
- },
3164
- "qwen_7b_chat.cc100-ja": {
3165
- "vocab_size": 151851,
3166
- "n_bytes": 1774770,
3167
- "n_tokens": 377144,
3168
- "n_chars": 603065
3169
- },
3170
- "roberta_chinese_clue.cc100-ja": {
3171
- "vocab_size": 8021,
3172
- "n_bytes": 1774770,
3173
- "n_tokens": 339411,
3174
- "n_chars": 603065
3175
- },
3176
- "skywork_13b_base.cc100-ja": {
3177
- "vocab_size": 65519,
3178
- "n_bytes": 1774770,
3179
- "n_tokens": 603613,
3180
- "n_chars": 603065
3181
- },
3182
- "skywork_13b_math.cc100-ja": {
3183
- "vocab_size": 65519,
3184
- "n_bytes": 1774770,
3185
- "n_tokens": 603613,
3186
- "n_chars": 603065
3187
- },
3188
- "solar_10_7b.cc100-ja": {
3189
- "vocab_size": 32000,
3190
- "n_bytes": 1774770,
3191
- "n_tokens": 685134,
3192
- "n_chars": 603065
3193
- },
3194
- "starchat_alpha.cc100-ja": {
3195
- "vocab_size": 49156,
3196
- "n_bytes": 1774770,
3197
- "n_tokens": 546876,
3198
- "n_chars": 603065
3199
- },
3200
- "switch_c_2048.cc100-ja": {
3201
- "vocab_size": 32100,
3202
- "n_bytes": 1774770,
3203
- "n_tokens": 51947,
3204
- "n_chars": 603065
3205
- },
3206
- "t5_base.cc100-ja": {
3207
- "vocab_size": 32100,
3208
- "n_bytes": 1774770,
3209
- "n_tokens": 51947,
3210
- "n_chars": 603065
3211
- },
3212
- "t5_large.cc100-ja": {
3213
- "vocab_size": 32100,
3214
- "n_bytes": 1774770,
3215
- "n_tokens": 51947,
3216
- "n_chars": 603065
3217
- },
3218
- "t5_small.cc100-ja": {
3219
- "vocab_size": 32100,
3220
- "n_bytes": 1774770,
3221
- "n_tokens": 51947,
3222
- "n_chars": 603065
3223
- },
3224
- "text_davinci_003.cc100-ja": {
3225
- "vocab_size": 50281,
3226
- "n_bytes": 1774770,
3227
- "n_tokens": 844362,
3228
- "n_chars": 603065
3229
- },
3230
- "tigerbot_13b_chat_v2.cc100-ja": {
3231
- "vocab_size": 60515,
3232
- "n_bytes": 1774770,
3233
- "n_tokens": 567792,
3234
- "n_chars": 603065
3235
- },
3236
- "tigerbot_70b_chat_v4_4k.cc100-ja": {
3237
- "vocab_size": 65110,
3238
- "n_bytes": 1774770,
3239
- "n_tokens": 406571,
3240
- "n_chars": 603065
3241
- },
3242
- "wizardcoder_15b_v1.cc100-ja": {
3243
- "vocab_size": 49153,
3244
- "n_bytes": 1774770,
3245
- "n_tokens": 546876,
3246
- "n_chars": 603065
3247
- },
3248
- "wizardcoder_python_7b_v1.cc100-ja": {
3249
- "vocab_size": 32001,
3250
- "n_bytes": 1774770,
3251
- "n_tokens": 728461,
3252
- "n_chars": 603065
3253
- },
3254
- "wizardlm_7b_v1.cc100-ja": {
3255
- "vocab_size": 32001,
3256
- "n_bytes": 1774770,
3257
- "n_tokens": 728461,
3258
- "n_chars": 603065
3259
- },
3260
- "wizardmath_70b_v1.cc100-ja": {
3261
- "vocab_size": 32002,
3262
- "n_bytes": 1774770,
3263
- "n_tokens": 728461,
3264
- "n_chars": 603065
3265
- },
3266
- "xlm_roberta.cc100-ja": {
3267
- "vocab_size": 250002,
3268
- "n_bytes": 1774770,
3269
- "n_tokens": 344820,
3270
- "n_chars": 603065
3271
- },
3272
- "yi_34b.cc100-ja": {
3273
- "vocab_size": 64000,
3274
- "n_bytes": 1774770,
3275
- "n_tokens": 740791,
3276
- "n_chars": 603065
3277
- },
3278
- "yi_6b.cc100-ja": {
3279
- "vocab_size": 64000,
3280
- "n_bytes": 1774770,
3281
- "n_tokens": 740791,
3282
- "n_chars": 603065
3283
- },
3284
- "yi_vl34b.cc100-ja": {
3285
- "vocab_size": 64000,
3286
- "n_bytes": 1774770,
3287
- "n_tokens": 749927,
3288
- "n_chars": 603065
3289
- },
3290
- "zephyr_7b_beta.cc100-ja": {
3291
- "vocab_size": 32000,
3292
- "n_bytes": 1774770,
3293
- "n_tokens": 685134,
3294
- "n_chars": 603065
3295
- },
3296
- "llama_3_chinese_8b.cc100-ar": {
3297
- "vocab_size": 128256,
3298
- "n_bytes": 2813283,
3299
- "n_tokens": 625514,
3300
- "n_chars": 1560987
3301
- },
3302
- "llama_3_chinese_8b.cc100-de": {
3303
- "vocab_size": 128256,
3304
- "n_bytes": 1814876,
3305
- "n_tokens": 509766,
3306
- "n_chars": 1784021
3307
- },
3308
- "llama_3_chinese_8b.cc100-en": {
3309
- "vocab_size": 128256,
3310
- "n_bytes": 1124813,
3311
- "n_tokens": 264944,
3312
- "n_chars": 1121360
3313
- },
3314
- "llama_3_chinese_8b.cc100-es": {
3315
- "vocab_size": 128256,
3316
- "n_bytes": 1664455,
3317
- "n_tokens": 443289,
3318
- "n_chars": 1630297
3319
- },
3320
- "aya_101.cc100-fa": {
3321
- "vocab_size": 250100,
3322
- "n_bytes": 2054052,
3323
- "n_tokens": 429922,
3324
- "n_chars": 1145876
3325
- },
3326
- "baichuan.cc100-fa": {
3327
- "vocab_size": 64000,
3328
- "n_bytes": 2054052,
3329
- "n_tokens": 1142057,
3330
- "n_chars": 1145876
3331
- },
3332
- "baichuan2.cc100-fa": {
3333
- "vocab_size": 125696,
3334
- "n_bytes": 2054052,
3335
- "n_tokens": 1052077,
3336
- "n_chars": 1145876
3337
- },
3338
- "bert_base_cased.cc100-fa": {
3339
- "vocab_size": 28996,
3340
- "n_bytes": 2054052,
3341
- "n_tokens": 903078,
3342
- "n_chars": 1145876
3343
- },
3344
- "bert_base_chinese.cc100-fa": {
3345
- "vocab_size": 21128,
3346
- "n_bytes": 2054052,
3347
- "n_tokens": 396414,
3348
- "n_chars": 1145876
3349
- },
3350
- "bert_base_uncased.cc100-fa": {
3351
- "vocab_size": 30522,
3352
- "n_bytes": 2054052,
3353
- "n_tokens": 910783,
3354
- "n_chars": 1145876
3355
- },
3356
- "bloom.cc100-fa": {
3357
- "vocab_size": 250680,
3358
- "n_bytes": 2054052,
3359
- "n_tokens": 434406,
3360
- "n_chars": 1145876
3361
- },
3362
- "byt5_small.cc100-fa": {
3363
- "vocab_size": 384,
3364
- "n_bytes": 2054052,
3365
- "n_tokens": 2064052,
3366
- "n_chars": 1145876
3367
- },
3368
- "character_glm_6b.cc100-fa": {
3369
- "vocab_size": 64789,
3370
- "n_bytes": 2054052,
3371
- "n_tokens": 1165051,
3372
- "n_chars": 1145876
3373
- },
3374
- "chatglm2_6b.cc100-fa": {
3375
- "vocab_size": 64787,
3376
- "n_bytes": 2054052,
3377
- "n_tokens": 1165051,
3378
- "n_chars": 1145876
3379
- },
3380
- "chatglm3_6b.cc100-fa": {
3381
- "vocab_size": 64796,
3382
- "n_bytes": 2054052,
3383
- "n_tokens": 1165051,
3384
- "n_chars": 1145876
3385
- },
3386
- "chatglm_6b.cc100-fa": {
3387
- "vocab_size": 150344,
3388
- "n_bytes": 2054052,
3389
- "n_tokens": 910808,
3390
- "n_chars": 1145876
3391
- },
3392
- "chatyuan_large_v2.cc100-fa": {
3393
- "vocab_size": 32128,
3394
- "n_bytes": 2054052,
3395
- "n_tokens": 740377,
3396
- "n_chars": 1145876
3397
- },
3398
- "chinese_llama.cc100-fa": {
3399
- "vocab_size": 49953,
3400
- "n_bytes": 2054052,
3401
- "n_tokens": 1150750,
3402
- "n_chars": 1145876
3403
- },
3404
- "chinese_llama2.cc100-fa": {
3405
- "vocab_size": 55296,
3406
- "n_bytes": 2054052,
3407
- "n_tokens": 1155078,
3408
- "n_chars": 1145876
3409
- },
3410
- "code_davinci_002.cc100-fa": {
3411
- "vocab_size": 50281,
3412
- "n_bytes": 2054052,
3413
- "n_tokens": 1292300,
3414
- "n_chars": 1145876
3415
- },
3416
- "crystal_coder.cc100-fa": {
3417
- "vocab_size": 32022,
3418
- "n_bytes": 2054052,
3419
- "n_tokens": 1145076,
3420
- "n_chars": 1145876
3421
- },
3422
- "dbrx_instruct.cc100-fa": {
3423
- "vocab_size": 100280,
3424
- "n_bytes": 2054052,
3425
- "n_tokens": 818067,
3426
- "n_chars": 1145876
3427
- },
3428
- "deepseek_coder_33b_instruct.cc100-fa": {
3429
- "vocab_size": 32022,
3430
- "n_bytes": 2054052,
3431
- "n_tokens": 1326109,
3432
- "n_chars": 1145876
3433
- },
3434
- "deepseek_llm_7b_base.cc100-fa": {
3435
- "vocab_size": 100015,
3436
- "n_bytes": 2054052,
3437
- "n_tokens": 973451,
3438
- "n_chars": 1145876
3439
- },
3440
- "falcon_180b.cc100-fa": {
3441
- "vocab_size": 65024,
3442
- "n_bytes": 2054052,
3443
- "n_tokens": 1246580,
3444
- "n_chars": 1145876
3445
- },
3446
- "falcon_7b.cc100-fa": {
3447
- "vocab_size": 65024,
3448
- "n_bytes": 2054052,
3449
- "n_tokens": 1246580,
3450
- "n_chars": 1145876
3451
- },
3452
- "fastchat_t5_3b.cc100-fa": {
3453
- "vocab_size": 32110,
3454
- "n_bytes": 2054052,
3455
- "n_tokens": 712443,
3456
- "n_chars": 1145876
3457
- },
3458
- "flan_t5_base.cc100-fa": {
3459
- "vocab_size": 32100,
3460
- "n_bytes": 2054052,
3461
- "n_tokens": 493779,
3462
- "n_chars": 1145876
3463
- },
3464
- "gemma_7b.cc100-fa": {
3465
- "vocab_size": 256000,
3466
- "n_bytes": 2054052,
3467
- "n_tokens": 373762,
3468
- "n_chars": 1145876
3469
- },
3470
- "gpt2.cc100-fa": {
3471
- "vocab_size": 50257,
3472
- "n_bytes": 2054052,
3473
- "n_tokens": 1292300,
3474
- "n_chars": 1145876
3475
- },
3476
- "gpt2_chinese.cc100-fa": {
3477
- "vocab_size": 21128,
3478
- "n_bytes": 2054052,
3479
- "n_tokens": 406174,
3480
- "n_chars": 1145876
3481
- },
3482
- "gpt_35_turbo.cc100-fa": {
3483
- "vocab_size": 100277,
3484
- "n_bytes": 2054052,
3485
- "n_tokens": 818067,
3486
- "n_chars": 1145876
3487
- },
3488
- "gpt_4.cc100-fa": {
3489
- "vocab_size": 100277,
3490
- "n_bytes": 2054052,
3491
- "n_tokens": 818067,
3492
- "n_chars": 1145876
3493
- },
3494
- "gpt_neox_japanese_2_7b.cc100-fa": {
3495
- "vocab_size": 32000,
3496
- "n_bytes": 2054052,
3497
- "n_tokens": 2036715,
3498
- "n_chars": 1145876
3499
- },
3500
- "gpt_nexo_20b.cc100-fa": {
3501
- "vocab_size": 50277,
3502
- "n_bytes": 2054052,
3503
- "n_tokens": 866434,
3504
- "n_chars": 1145876
3505
- },
3506
- "grok_1.cc100-fa": {
3507
- "vocab_size": 131072,
3508
- "n_bytes": 2054052,
3509
- "n_tokens": 1073281,
3510
- "n_chars": 1145876
3511
- },
3512
- "internlm2_chat_7b.cc100-fa": {
3513
- "vocab_size": 92544,
3514
- "n_bytes": 2054052,
3515
- "n_tokens": 1195032,
3516
- "n_chars": 1145876
3517
- },
3518
- "internlm2_math_7b.cc100-fa": {
3519
- "vocab_size": 92544,
3520
- "n_bytes": 2054052,
3521
- "n_tokens": 1195032,
3522
- "n_chars": 1145876
3523
- },
3524
- "internlm_chat_7b.cc100-fa": {
3525
- "vocab_size": 103168,
3526
- "n_bytes": 2054052,
3527
- "n_tokens": 640945,
3528
- "n_chars": 1145876
3529
- },
3530
- "internlm_xcomposer_7b.cc100-fa": {
3531
- "vocab_size": 103168,
3532
- "n_bytes": 2054052,
3533
- "n_tokens": 640945,
3534
- "n_chars": 1145876
3535
- },
3536
- "jamba_v0_1.cc100-fa": {
3537
- "vocab_size": 65536,
3538
- "n_bytes": 2054052,
3539
- "n_tokens": 732550,
3540
- "n_chars": 1145876
3541
- },
3542
- "kplug.cc100-fa": {
3543
- "vocab_size": 10261,
3544
- "n_bytes": 2054052,
3545
- "n_tokens": 274671,
3546
- "n_chars": 1145876
3547
- },
3548
- "llama.cc100-fa": {
3549
- "vocab_size": 32000,
3550
- "n_bytes": 2054052,
3551
- "n_tokens": 1155076,
3552
- "n_chars": 1145876
3553
- },
3554
- "llama2.cc100-fa": {
3555
- "vocab_size": 32001,
3556
- "n_bytes": 2054052,
3557
- "n_tokens": 1155076,
3558
- "n_chars": 1145876
3559
- },
3560
- "llama3.cc100-fa": {
3561
- "vocab_size": 128256,
3562
- "n_bytes": 2054052,
3563
- "n_tokens": 387448,
3564
- "n_chars": 1145876
3565
- },
3566
- "llama_3_chinese_8b.cc100-fa": {
3567
- "vocab_size": 128256,
3568
- "n_bytes": 2054052,
3569
- "n_tokens": 397448,
3570
- "n_chars": 1145876
3571
- },
3572
- "mistral_7b.cc100-fa": {
3573
- "vocab_size": 32000,
3574
- "n_bytes": 2054052,
3575
- "n_tokens": 1133278,
3576
- "n_chars": 1145876
3577
- },
3578
- "mixtral_8_7b.cc100-fa": {
3579
- "vocab_size": 32000,
3580
- "n_bytes": 2054052,
3581
- "n_tokens": 1133278,
3582
- "n_chars": 1145876
3583
- },
3584
- "mobilebert_uncased.cc100-fa": {
3585
- "vocab_size": 30522,
3586
- "n_bytes": 2054052,
3587
- "n_tokens": 910783,
3588
- "n_chars": 1145876
3589
- },
3590
- "moss.cc100-fa": {
3591
- "vocab_size": 106072,
3592
- "n_bytes": 2054052,
3593
- "n_tokens": 1285426,
3594
- "n_chars": 1145876
3595
- },
3596
- "mt5_large.cc100-fa": {
3597
- "vocab_size": 250100,
3598
- "n_bytes": 2054052,
3599
- "n_tokens": 429922,
3600
- "n_chars": 1145876
3601
- },
3602
- "olmo_7b.cc100-fa": {
3603
- "vocab_size": 50280,
3604
- "n_bytes": 2054052,
3605
- "n_tokens": 866434,
3606
- "n_chars": 1145876
3607
- },
3608
- "orion_14b_chat.cc100-fa": {
3609
- "vocab_size": 84608,
3610
- "n_bytes": 2054052,
3611
- "n_tokens": 1131108,
3612
- "n_chars": 1145876
3613
- },
3614
- "phi_1.cc100-fa": {
3615
- "vocab_size": 50295,
3616
- "n_bytes": 2054052,
3617
- "n_tokens": 1292300,
3618
- "n_chars": 1145876
3619
- },
3620
- "phi_2.cc100-fa": {
3621
- "vocab_size": 50295,
3622
- "n_bytes": 2054052,
3623
- "n_tokens": 1292300,
3624
- "n_chars": 1145876
3625
- },
3626
- "phi_3_mini.cc100-fa": {
3627
- "vocab_size": 32011,
3628
- "n_bytes": 2054052,
3629
- "n_tokens": 1155076,
3630
- "n_chars": 1145876
3631
- },
3632
- "pko_t5_large.cc100-fa": {
3633
- "vocab_size": 50358,
3634
- "n_bytes": 2054052,
3635
- "n_tokens": 2061040,
3636
- "n_chars": 1145876
3637
- },
3638
- "prompt_clue.cc100-fa": {
3639
- "vocab_size": 32128,
3640
- "n_bytes": 2054052,
3641
- "n_tokens": 740377,
3642
- "n_chars": 1145876
3643
- },
3644
- "qwen1_5_14b_chat.cc100-fa": {
3645
- "vocab_size": 151646,
3646
- "n_bytes": 2054052,
3647
- "n_tokens": 643421,
3648
- "n_chars": 1145876
3649
- },
3650
- "qwen_1_8b_chat.cc100-fa": {
3651
- "vocab_size": 151851,
3652
- "n_bytes": 2054052,
3653
- "n_tokens": 643421,
3654
- "n_chars": 1145876
3655
- },
3656
- "qwen_72b_chat.cc100-fa": {
3657
- "vocab_size": 151851,
3658
- "n_bytes": 2054052,
3659
- "n_tokens": 643421,
3660
- "n_chars": 1145876
3661
- },
3662
- "qwen_7b_chat.cc100-fa": {
3663
- "vocab_size": 151851,
3664
- "n_bytes": 2054052,
3665
- "n_tokens": 643421,
3666
- "n_chars": 1145876
3667
- },
3668
- "roberta_chinese_clue.cc100-fa": {
3669
- "vocab_size": 8021,
3670
- "n_bytes": 2054052,
3671
- "n_tokens": 407763,
3672
- "n_chars": 1145876
3673
- },
3674
- "skywork_13b_base.cc100-fa": {
3675
- "vocab_size": 65519,
3676
- "n_bytes": 2054052,
3677
- "n_tokens": 1155072,
3678
- "n_chars": 1145876
3679
- },
3680
- "skywork_13b_math.cc100-fa": {
3681
- "vocab_size": 65519,
3682
- "n_bytes": 2054052,
3683
- "n_tokens": 1155072,
3684
- "n_chars": 1145876
3685
- },
3686
- "solar_10_7b.cc100-fa": {
3687
- "vocab_size": 32000,
3688
- "n_bytes": 2054052,
3689
- "n_tokens": 1133278,
3690
- "n_chars": 1145876
3691
- },
3692
- "starchat_alpha.cc100-fa": {
3693
- "vocab_size": 49156,
3694
- "n_bytes": 2054052,
3695
- "n_tokens": 851630,
3696
- "n_chars": 1145876
3697
- },
3698
- "switch_c_2048.cc100-fa": {
3699
- "vocab_size": 32100,
3700
- "n_bytes": 2054052,
3701
- "n_tokens": 493767,
3702
- "n_chars": 1145876
3703
- },
3704
- "t5_base.cc100-fa": {
3705
- "vocab_size": 32100,
3706
- "n_bytes": 2054052,
3707
- "n_tokens": 493767,
3708
- "n_chars": 1145876
3709
- },
3710
- "t5_large.cc100-fa": {
3711
- "vocab_size": 32100,
3712
- "n_bytes": 2054052,
3713
- "n_tokens": 493767,
3714
- "n_chars": 1145876
3715
- },
3716
- "t5_small.cc100-fa": {
3717
- "vocab_size": 32100,
3718
- "n_bytes": 2054052,
3719
- "n_tokens": 493767,
3720
- "n_chars": 1145876
3721
- },
3722
- "text_davinci_003.cc100-fa": {
3723
- "vocab_size": 50281,
3724
- "n_bytes": 2054052,
3725
- "n_tokens": 1292300,
3726
- "n_chars": 1145876
3727
- },
3728
- "tigerbot_13b_chat_v2.cc100-fa": {
3729
- "vocab_size": 60515,
3730
- "n_bytes": 2054052,
3731
- "n_tokens": 1145046,
3732
- "n_chars": 1145876
3733
- },
3734
- "tigerbot_70b_chat_v4_4k.cc100-fa": {
3735
- "vocab_size": 65110,
3736
- "n_bytes": 2054052,
3737
- "n_tokens": 1145048,
3738
- "n_chars": 1145876
3739
- },
3740
- "wizardcoder_15b_v1.cc100-fa": {
3741
- "vocab_size": 49153,
3742
- "n_bytes": 2054052,
3743
- "n_tokens": 851630,
3744
- "n_chars": 1145876
3745
- },
3746
- "wizardcoder_python_7b_v1.cc100-fa": {
3747
- "vocab_size": 32001,
3748
- "n_bytes": 2054052,
3749
- "n_tokens": 1155076,
3750
- "n_chars": 1145876
3751
- },
3752
- "wizardlm_7b_v1.cc100-fa": {
3753
- "vocab_size": 32001,
3754
- "n_bytes": 2054052,
3755
- "n_tokens": 1155076,
3756
- "n_chars": 1145876
3757
- },
3758
- "wizardmath_70b_v1.cc100-fa": {
3759
- "vocab_size": 32002,
3760
- "n_bytes": 2054052,
3761
- "n_tokens": 1155076,
3762
- "n_chars": 1145876
3763
- },
3764
- "xlm_roberta.cc100-fa": {
3765
- "vocab_size": 250002,
3766
- "n_bytes": 2054052,
3767
- "n_tokens": 330926,
3768
- "n_chars": 1145876
3769
- },
3770
- "yi_34b.cc100-fa": {
3771
- "vocab_size": 64000,
3772
- "n_bytes": 2054052,
3773
- "n_tokens": 1337264,
3774
- "n_chars": 1145876
3775
- },
3776
- "yi_6b.cc100-fa": {
3777
- "vocab_size": 64000,
3778
- "n_bytes": 2054052,
3779
- "n_tokens": 1337264,
3780
- "n_chars": 1145876
3781
- },
3782
- "yi_vl34b.cc100-fa": {
3783
- "vocab_size": 64000,
3784
- "n_bytes": 2054052,
3785
- "n_tokens": 1346819,
3786
- "n_chars": 1145876
3787
- },
3788
- "zephyr_7b_beta.cc100-fa": {
3789
- "vocab_size": 32000,
3790
- "n_bytes": 2054052,
3791
- "n_tokens": 1133278,
3792
- "n_chars": 1145876
3793
- },
3794
- "llama_3_chinese_8b.cc100-fr": {
3795
- "vocab_size": 128256,
3796
- "n_bytes": 1540504,
3797
- "n_tokens": 422146,
3798
- "n_chars": 1484970
3799
- },
3800
- "llama_3_chinese_8b.cc100-ja": {
3801
- "vocab_size": 128256,
3802
- "n_bytes": 1774770,
3803
- "n_tokens": 424715,
3804
- "n_chars": 603065
3805
- },
3806
- "aya_101.cc100-ko": {
3807
- "vocab_size": 250100,
3808
- "n_bytes": 1524839,
3809
- "n_tokens": 434586,
3810
- "n_chars": 655190
3811
- },
3812
- "baichuan.cc100-ko": {
3813
- "vocab_size": 64000,
3814
- "n_bytes": 1524839,
3815
- "n_tokens": 639258,
3816
- "n_chars": 655190
3817
- },
3818
- "baichuan2.cc100-ko": {
3819
- "vocab_size": 125696,
3820
- "n_bytes": 1524839,
3821
- "n_tokens": 623358,
3822
- "n_chars": 655190
3823
- },
3824
- "bert_base_cased.cc100-ko": {
3825
- "vocab_size": 28996,
3826
- "n_bytes": 1524839,
3827
- "n_tokens": 222828,
3828
- "n_chars": 655190
3829
- },
3830
- "bert_base_chinese.cc100-ko": {
3831
- "vocab_size": 21128,
3832
- "n_bytes": 1524839,
3833
- "n_tokens": 219752,
3834
- "n_chars": 655190
3835
- },
3836
- "bert_base_uncased.cc100-ko": {
3837
- "vocab_size": 30522,
3838
- "n_bytes": 1524839,
3839
- "n_tokens": 904756,
3840
- "n_chars": 655190
3841
- },
3842
- "bloom.cc100-ko": {
3843
- "vocab_size": 250680,
3844
- "n_bytes": 1524839,
3845
- "n_tokens": 742111,
3846
- "n_chars": 655190
3847
- },
3848
- "byt5_small.cc100-ko": {
3849
- "vocab_size": 384,
3850
- "n_bytes": 1524839,
3851
- "n_tokens": 1534839,
3852
- "n_chars": 655190
3853
- },
3854
- "character_glm_6b.cc100-ko": {
3855
- "vocab_size": 64789,
3856
- "n_bytes": 1524839,
3857
- "n_tokens": 672160,
3858
- "n_chars": 655190
3859
- },
3860
- "chatglm2_6b.cc100-ko": {
3861
- "vocab_size": 64787,
3862
- "n_bytes": 1524839,
3863
- "n_tokens": 672156,
3864
- "n_chars": 655190
3865
- },
3866
- "chatglm3_6b.cc100-ko": {
3867
- "vocab_size": 64796,
3868
- "n_bytes": 1524839,
3869
- "n_tokens": 672160,
3870
- "n_chars": 655190
3871
- },
3872
- "chatglm_6b.cc100-ko": {
3873
- "vocab_size": 150344,
3874
- "n_bytes": 1524839,
3875
- "n_tokens": 939630,
3876
- "n_chars": 655190
3877
- },
3878
- "chatyuan_large_v2.cc100-ko": {
3879
- "vocab_size": 32128,
3880
- "n_bytes": 1524839,
3881
- "n_tokens": 354411,
3882
- "n_chars": 655190
3883
- },
3884
- "chinese_llama.cc100-ko": {
3885
- "vocab_size": 49953,
3886
- "n_bytes": 1524839,
3887
- "n_tokens": 913553,
3888
- "n_chars": 655190
3889
- },
3890
- "chinese_llama2.cc100-ko": {
3891
- "vocab_size": 55296,
3892
- "n_bytes": 1524839,
3893
- "n_tokens": 963427,
3894
- "n_chars": 655190
3895
- },
3896
- "code_davinci_002.cc100-ko": {
3897
- "vocab_size": 50281,
3898
- "n_bytes": 1524839,
3899
- "n_tokens": 1308993,
3900
- "n_chars": 655190
3901
- },
3902
- "crystal_coder.cc100-ko": {
3903
- "vocab_size": 32022,
3904
- "n_bytes": 1524839,
3905
- "n_tokens": 954428,
3906
- "n_chars": 655190
3907
- },
3908
- "dbrx_instruct.cc100-ko": {
3909
- "vocab_size": 100280,
3910
- "n_bytes": 1524839,
3911
- "n_tokens": 652277,
3912
- "n_chars": 655190
3913
- },
3914
- "deepseek_coder_33b_instruct.cc100-ko": {
3915
- "vocab_size": 32022,
3916
- "n_bytes": 1524839,
3917
- "n_tokens": 1454805,
3918
- "n_chars": 655190
3919
- },
3920
- "deepseek_llm_7b_base.cc100-ko": {
3921
- "vocab_size": 100015,
3922
- "n_bytes": 1524839,
3923
- "n_tokens": 1081983,
3924
- "n_chars": 655190
3925
- },
3926
- "falcon_180b.cc100-ko": {
3927
- "vocab_size": 65024,
3928
- "n_bytes": 1524839,
3929
- "n_tokens": 1330568,
3930
- "n_chars": 655190
3931
- },
3932
- "falcon_7b.cc100-ko": {
3933
- "vocab_size": 65024,
3934
- "n_bytes": 1524839,
3935
- "n_tokens": 1330568,
3936
- "n_chars": 655190
3937
- },
3938
- "fastchat_t5_3b.cc100-ko": {
3939
- "vocab_size": 32110,
3940
- "n_bytes": 1524839,
3941
- "n_tokens": 484953,
3942
- "n_chars": 655190
3943
- },
3944
- "flan_t5_base.cc100-ko": {
3945
- "vocab_size": 32100,
3946
- "n_bytes": 1524839,
3947
- "n_tokens": 344457,
3948
- "n_chars": 655190
3949
- },
3950
- "gemma_7b.cc100-ko": {
3951
- "vocab_size": 256000,
3952
- "n_bytes": 1524839,
3953
- "n_tokens": 464410,
3954
- "n_chars": 655190
3955
- },
3956
- "gpt2.cc100-ko": {
3957
- "vocab_size": 50257,
3958
- "n_bytes": 1524839,
3959
- "n_tokens": 1309029,
3960
- "n_chars": 655190
3961
- },
3962
- "gpt2_chinese.cc100-ko": {
3963
- "vocab_size": 21128,
3964
- "n_bytes": 1524839,
3965
- "n_tokens": 1055974,
3966
- "n_chars": 655190
3967
- },
3968
- "gpt_35_turbo.cc100-ko": {
3969
- "vocab_size": 100277,
3970
- "n_bytes": 1524839,
3971
- "n_tokens": 652277,
3972
- "n_chars": 655190
3973
- },
3974
- "gpt_4.cc100-ko": {
3975
- "vocab_size": 100277,
3976
- "n_bytes": 1524839,
3977
- "n_tokens": 652277,
3978
- "n_chars": 655190
3979
- },
3980
- "gpt_neox_japanese_2_7b.cc100-ko": {
3981
- "vocab_size": 32000,
3982
- "n_bytes": 1524839,
3983
- "n_tokens": 1512832,
3984
- "n_chars": 655190
3985
- },
3986
- "gpt_nexo_20b.cc100-ko": {
3987
- "vocab_size": 50277,
3988
- "n_bytes": 1524839,
3989
- "n_tokens": 973288,
3990
- "n_chars": 655190
3991
- },
3992
- "grok_1.cc100-ko": {
3993
- "vocab_size": 131072,
3994
- "n_bytes": 1524839,
3995
- "n_tokens": 1152005,
3996
- "n_chars": 655190
3997
- },
3998
- "internlm2_chat_7b.cc100-ko": {
3999
- "vocab_size": 92544,
4000
- "n_bytes": 1524839,
4001
- "n_tokens": 1008524,
4002
- "n_chars": 655190
4003
- },
4004
- "internlm2_math_7b.cc100-ko": {
4005
- "vocab_size": 92544,
4006
- "n_bytes": 1524839,
4007
- "n_tokens": 1008524,
4008
- "n_chars": 655190
4009
- },
4010
- "internlm_chat_7b.cc100-ko": {
4011
- "vocab_size": 103168,
4012
- "n_bytes": 1524839,
4013
- "n_tokens": 839609,
4014
- "n_chars": 655190
4015
- },
4016
- "internlm_xcomposer_7b.cc100-ko": {
4017
- "vocab_size": 103168,
4018
- "n_bytes": 1524839,
4019
- "n_tokens": 839609,
4020
- "n_chars": 655190
4021
- },
4022
- "jamba_v0_1.cc100-ko": {
4023
- "vocab_size": 65536,
4024
- "n_bytes": 1524839,
4025
- "n_tokens": 715688,
4026
- "n_chars": 655190
4027
- },
4028
- "kplug.cc100-ko": {
4029
- "vocab_size": 10261,
4030
- "n_bytes": 1524839,
4031
- "n_tokens": 222771,
4032
- "n_chars": 655190
4033
- },
4034
- "llama.cc100-ko": {
4035
- "vocab_size": 32000,
4036
- "n_bytes": 1524839,
4037
- "n_tokens": 964428,
4038
- "n_chars": 655190
4039
- },
4040
- "llama2.cc100-ko": {
4041
- "vocab_size": 32001,
4042
- "n_bytes": 1524839,
4043
- "n_tokens": 964428,
4044
- "n_chars": 655190
4045
- },
4046
- "llama3.cc100-ko": {
4047
- "vocab_size": 128256,
4048
- "n_bytes": 1524839,
4049
- "n_tokens": 412595,
4050
- "n_chars": 655190
4051
- },
4052
- "llama_3_chinese_8b.cc100-ko": {
4053
- "vocab_size": 128256,
4054
- "n_bytes": 1524839,
4055
- "n_tokens": 422595,
4056
- "n_chars": 655190
4057
- },
4058
- "mistral_7b.cc100-ko": {
4059
- "vocab_size": 32000,
4060
- "n_bytes": 1524839,
4061
- "n_tokens": 728766,
4062
- "n_chars": 655190
4063
- },
4064
- "mixtral_8_7b.cc100-ko": {
4065
- "vocab_size": 32000,
4066
- "n_bytes": 1524839,
4067
- "n_tokens": 728766,
4068
- "n_chars": 655190
4069
- },
4070
- "mobilebert_uncased.cc100-ko": {
4071
- "vocab_size": 30522,
4072
- "n_bytes": 1524839,
4073
- "n_tokens": 904756,
4074
- "n_chars": 655190
4075
- },
4076
- "moss.cc100-ko": {
4077
- "vocab_size": 106072,
4078
- "n_bytes": 1524839,
4079
- "n_tokens": 1305249,
4080
- "n_chars": 655190
4081
- },
4082
- "mt5_large.cc100-ko": {
4083
- "vocab_size": 250100,
4084
- "n_bytes": 1524839,
4085
- "n_tokens": 434586,
4086
- "n_chars": 655190
4087
- },
4088
- "olmo_7b.cc100-ko": {
4089
- "vocab_size": 50280,
4090
- "n_bytes": 1524839,
4091
- "n_tokens": 973288,
4092
- "n_chars": 655190
4093
- },
4094
- "orion_14b_chat.cc100-ko": {
4095
- "vocab_size": 84608,
4096
- "n_bytes": 1524839,
4097
- "n_tokens": 351149,
4098
- "n_chars": 655190
4099
- },
4100
- "phi_1.cc100-ko": {
4101
- "vocab_size": 50295,
4102
- "n_bytes": 1524839,
4103
- "n_tokens": 1308988,
4104
- "n_chars": 655190
4105
- },
4106
- "phi_2.cc100-ko": {
4107
- "vocab_size": 50295,
4108
- "n_bytes": 1524839,
4109
- "n_tokens": 1308988,
4110
- "n_chars": 655190
4111
- },
4112
- "phi_3_mini.cc100-ko": {
4113
- "vocab_size": 32011,
4114
- "n_bytes": 1524839,
4115
- "n_tokens": 964428,
4116
- "n_chars": 655190
4117
- },
4118
- "pko_t5_large.cc100-ko": {
4119
- "vocab_size": 50358,
4120
- "n_bytes": 1524839,
4121
- "n_tokens": 471643,
4122
- "n_chars": 655190
4123
- },
4124
- "prompt_clue.cc100-ko": {
4125
- "vocab_size": 32128,
4126
- "n_bytes": 1524839,
4127
- "n_tokens": 354411,
4128
- "n_chars": 655190
4129
- },
4130
- "qwen1_5_14b_chat.cc100-ko": {
4131
- "vocab_size": 151646,
4132
- "n_bytes": 1524839,
4133
- "n_tokens": 457492,
4134
- "n_chars": 655190
4135
- },
4136
- "qwen_1_8b_chat.cc100-ko": {
4137
- "vocab_size": 151851,
4138
- "n_bytes": 1524839,
4139
- "n_tokens": 457492,
4140
- "n_chars": 655190
4141
- },
4142
- "qwen_72b_chat.cc100-ko": {
4143
- "vocab_size": 151851,
4144
- "n_bytes": 1524839,
4145
- "n_tokens": 457492,
4146
- "n_chars": 655190
4147
- },
4148
- "qwen_7b_chat.cc100-ko": {
4149
- "vocab_size": 151851,
4150
- "n_bytes": 1524839,
4151
- "n_tokens": 457492,
4152
- "n_chars": 655190
4153
- },
4154
- "roberta_chinese_clue.cc100-ko": {
4155
- "vocab_size": 8021,
4156
- "n_bytes": 1524839,
4157
- "n_tokens": 226812,
4158
- "n_chars": 655190
4159
- },
4160
- "skywork_13b_base.cc100-ko": {
4161
- "vocab_size": 65519,
4162
- "n_bytes": 1524839,
4163
- "n_tokens": 962744,
4164
- "n_chars": 655190
4165
- },
4166
- "skywork_13b_math.cc100-ko": {
4167
- "vocab_size": 65519,
4168
- "n_bytes": 1524839,
4169
- "n_tokens": 962744,
4170
- "n_chars": 655190
4171
- },
4172
- "solar_10_7b.cc100-ko": {
4173
- "vocab_size": 32000,
4174
- "n_bytes": 1524839,
4175
- "n_tokens": 728766,
4176
- "n_chars": 655190
4177
- },
4178
- "starchat_alpha.cc100-ko": {
4179
- "vocab_size": 49156,
4180
- "n_bytes": 1524839,
4181
- "n_tokens": 580873,
4182
- "n_chars": 655190
4183
- },
4184
- "switch_c_2048.cc100-ko": {
4185
- "vocab_size": 32100,
4186
- "n_bytes": 1524839,
4187
- "n_tokens": 344457,
4188
- "n_chars": 655190
4189
- },
4190
- "t5_base.cc100-ko": {
4191
- "vocab_size": 32100,
4192
- "n_bytes": 1524839,
4193
- "n_tokens": 344457,
4194
- "n_chars": 655190
4195
- },
4196
- "t5_large.cc100-ko": {
4197
- "vocab_size": 32100,
4198
- "n_bytes": 1524839,
4199
- "n_tokens": 344457,
4200
- "n_chars": 655190
4201
- },
4202
- "t5_small.cc100-ko": {
4203
- "vocab_size": 32100,
4204
- "n_bytes": 1524839,
4205
- "n_tokens": 344457,
4206
- "n_chars": 655190
4207
- },
4208
- "text_davinci_003.cc100-ko": {
4209
- "vocab_size": 50281,
4210
- "n_bytes": 1524839,
4211
- "n_tokens": 1308993,
4212
- "n_chars": 655190
4213
- },
4214
- "tigerbot_13b_chat_v2.cc100-ko": {
4215
- "vocab_size": 60515,
4216
- "n_bytes": 1524839,
4217
- "n_tokens": 793053,
4218
- "n_chars": 655190
4219
- },
4220
- "tigerbot_70b_chat_v4_4k.cc100-ko": {
4221
- "vocab_size": 65110,
4222
- "n_bytes": 1524839,
4223
- "n_tokens": 484082,
4224
- "n_chars": 655190
4225
- },
4226
- "wizardcoder_15b_v1.cc100-ko": {
4227
- "vocab_size": 49153,
4228
- "n_bytes": 1524839,
4229
- "n_tokens": 580873,
4230
- "n_chars": 655190
4231
- },
4232
- "wizardcoder_python_7b_v1.cc100-ko": {
4233
- "vocab_size": 32001,
4234
- "n_bytes": 1524839,
4235
- "n_tokens": 964428,
4236
- "n_chars": 655190
4237
- },
4238
- "wizardlm_7b_v1.cc100-ko": {
4239
- "vocab_size": 32001,
4240
- "n_bytes": 1524839,
4241
- "n_tokens": 964428,
4242
- "n_chars": 655190
4243
- },
4244
- "wizardmath_70b_v1.cc100-ko": {
4245
- "vocab_size": 32002,
4246
- "n_bytes": 1524839,
4247
- "n_tokens": 964428,
4248
- "n_chars": 655190
4249
- },
4250
- "xlm_roberta.cc100-ko": {
4251
- "vocab_size": 250002,
4252
- "n_bytes": 1524839,
4253
- "n_tokens": 374571,
4254
- "n_chars": 655190
4255
- },
4256
- "yi_34b.cc100-ko": {
4257
- "vocab_size": 64000,
4258
- "n_bytes": 1524839,
4259
- "n_tokens": 1203134,
4260
- "n_chars": 655190
4261
- },
4262
- "yi_6b.cc100-ko": {
4263
- "vocab_size": 64000,
4264
- "n_bytes": 1524839,
4265
- "n_tokens": 1203134,
4266
- "n_chars": 655190
4267
- },
4268
- "yi_vl34b.cc100-ko": {
4269
- "vocab_size": 64000,
4270
- "n_bytes": 1524839,
4271
- "n_tokens": 1210021,
4272
- "n_chars": 655190
4273
- },
4274
- "zephyr_7b_beta.cc100-ko": {
4275
- "vocab_size": 32000,
4276
- "n_bytes": 1524839,
4277
- "n_tokens": 728766,
4278
- "n_chars": 655190
4279
- },
4280
- "llama_3_chinese_8b.cc100-zh-Hans": {
4281
- "vocab_size": 128256,
4282
- "n_bytes": 2633047,
4283
- "n_tokens": 757405,
4284
- "n_chars": 927311
4285
- }
4286
- }
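Every record in the deleted `compress_rate.json` above follows one schema per tokenizer/corpus pair: `vocab_size`, `n_bytes`, `n_tokens`, `n_chars`. As a hedged illustration of how such records turn into compression numbers, here is a minimal Python sketch; the function name and derived metric names are illustrative, not the repo's actual API:

```python
def compression_metrics(entry: dict) -> dict:
    """Derive compression ratios from one stats record of the shape shown above."""
    return {
        # more bytes/chars per token = stronger compression on this corpus
        "bytes_per_token": entry["n_bytes"] / entry["n_tokens"],
        "chars_per_token": entry["n_chars"] / entry["n_tokens"],
    }

# e.g. the "baichuan.cc100-ko" record above:
entry = {"vocab_size": 64000, "n_bytes": 1524839, "n_tokens": 639258, "n_chars": 655190}
print(compression_metrics(entry))  # ~{'bytes_per_token': 2.39, 'chars_per_token': 1.02}
```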
stats/compression_rate.json ADDED
The diff for this file is too large to render. See raw diff
 
utils/byte_util.py DELETED
File without changes
utils/character_util.py DELETED
@@ -1,231 +0,0 @@
1
- """
2
- TODO: traditional Chinese, simplified Chinese, language detection, ...
3
- """
4
- import os
5
- import json
6
- from collections import Counter
7
- from vocab import load_tokener
8
- from utils.log_util import logger
9
- from utils.text_util import is_all_digit, has_digit, get_digit_count, get_space_count
10
- from utils.lang_util import detect_language
11
- from utils.lang_util_2 import is_zh_char, is_all_zh, get_zh_count
12
-
13
- CURRENT_DIR = os.path.dirname(os.path.abspath(__file__))
14
-
15
- zh_tokens = [line.strip() for line in open(os.path.join(CURRENT_DIR, "vocab.jd.txt.v2"), "r", encoding="utf-8") if
16
- is_zh_char(line.strip())]
17
-
18
-
19
- def digit_():
20
- """
21
- qwen segments numbers by single digits.
22
- """
23
- pass
24
-
25
-
26
- def to_unicode(text):
27
- return ''.join(r'\u{:04X}'.format(ord(chr)) for chr in text)
28
-
29
- def zh_iterator():
30
- for idx in range(ord(u'\u4e00'), ord(u'\u9fa5')):
31
- yield (chr(idx))
32
-
33
-
34
- def get_coding_length(tokenizer, vocab, filter=None):
35
- """
36
- Compute the encoding length. (Some Chinese characters are encoded into multiple tokens.)
37
- """
38
- all_length = []
39
- for word in vocab:
40
- if len(word) > 1:
41
- continue
42
- if filter is not None and filter(word):
43
- continue
44
- try:
45
- tokens = tokenizer.encode(word)
46
- except Exception as e:
47
- print(e)
48
-
49
- all_length.append(len(tokens))
50
- # if len(tokens.ids) > 1:
51
- # if len(tokens) > 3:
52
- # print(word, tokens)
53
-
54
- dist_length = Counter(all_length)
55
- mean_length = round(sum(all_length) / len(all_length), 2)
56
- return dist_length, mean_length
57
-
58
-
59
-
60
- def remove_special_char():
61
- """
62
- :return:
63
- """
64
- # BERT vocabularies have tokens starting with ##
65
- # byte-BPE vocabularies have tokens containing spaces
66
- # decode_str = decode_str.strip().replace("#", "") # TODO: handle by tokenizer type
67
- pass
68
-
69
-
70
- cache = {}
71
-
72
- def _mean(datas):
73
- return sum(datas) / len(datas)
74
-
75
- def iter_vocab(tokenizer_name, from_cache=True, cache_dir="stats/iter_vocab"):
76
- """
77
- This is fast enough that file caching is not recommended.
78
- :param tokenizer:
79
- :param from_cache:
80
- :return:
81
- """
82
- cache_dir = os.path.join(CURRENT_DIR, f"../{cache_dir}")
83
- os.makedirs(cache_dir, exist_ok=True)
84
-
85
- tokenizer = load_tokener(tokenizer_name)
86
-
87
-
88
- # load from cache
89
- if from_cache and tokenizer_name in cache:
90
- logger.info(f"load {tokenizer_name} from cache")
91
- return cache[tokenizer_name]
92
-
93
- has_zh_tokens = []
94
- all_zh_tokens = []
95
- has_digit_tokens = []
96
- all_digit_tokens = []
97
- has_space_tokens = []
98
- all_space_tokens = []
99
-
100
- # zh_tags = ["all_zh", "has_zh"]
101
- # digit_tags = ["all_digit", "has_digit"]
102
-
103
- # zh_token_count = {"total": 0, "包含1个中文单字": 0, "中文多字": 0}
104
-
105
- # symbol_count = 0
106
-
107
- all_single_zh_tokens = set()
108
- zh_symbol_count = 0
109
- buffer = []
110
- for token_id in range(tokenizer.vocab_size):
111
- decode_str = tokenizer.decode([token_id], skip_special_tokens=False)
112
- token = tokenizer.convert_ids_to_tokens([token_id], skip_special_tokens=False)[0]
113
- # tokenizer.convert_tokens_to_string(tokens)
114
-
115
- tags = []
116
-
117
- if token is None: # some vocabularies have empty (non-contiguous) ids
118
- continue
119
- if isinstance(token, bytes):
120
- token = token.decode("utf-8", errors="ignore")
121
-
122
- digit_count = get_digit_count(decode_str)
123
- language_tags = detect_language(decode_str)
124
-
125
- if "Chinese" in language_tags:
126
- has_zh_tokens.append(decode_str)
127
-
128
- if is_all_zh(decode_str):
129
- tags.append("all_zh")
130
- all_zh_tokens.append(decode_str)
131
-
132
-
133
- if is_all_digit(decode_str):
134
- tags.append("all_digit")
135
- all_digit_tokens.append(decode_str)
136
- if has_digit(decode_str):
137
- tags.append("has_digit")
138
- has_digit_tokens.append(decode_str)
139
-
140
-
141
- space_count = get_space_count(decode_str)
142
- if space_count > 0:
143
- has_space_tokens.append(decode_str)
144
- if space_count == len(decode_str):
145
- all_space_tokens.append(decode_str)
146
-
147
- zh_count = get_zh_count(decode_str)
148
-
149
- buffer.append(json.dumps(
150
- {"id": token_id,
151
- "token": token,
152
- "token_decode": decode_str,
153
- "token_dumps": json.dumps(token),
154
- "token_unicode": to_unicode(token),
155
- "token_len": len(decode_str),
156
- "zh_count": zh_count, # number of Chinese characters contained
157
- # "zh-smpli": zh_hans_count, # simplified Chinese zh-Hans
158
- "tags": tags,
159
- "zh_symbol_count": zh_symbol_count,
160
- },
161
- ensure_ascii=False) + "\n")
162
-
163
- # if zh_count >= 1:
164
- # zh_token_count["total"] += 1
165
- # if zh_count > 1:
166
- # zh_token_count["中文多字"] += 1
167
- # else:
168
- # zh_token_count["中文单字"] += 1
169
- # all_single_zh_tokens.add(decode_str.strip().replace("#", ""))
170
- #
171
- # zh_token_count["中文单字-去重后"] = len(all_single_zh_tokens)
172
-
173
- dist_length, mean_length = get_coding_length(tokenizer, zh_tokens, filter=lambda k: not is_zh_char(k))
174
-
175
- # TODO: traditional and simplified Chinese characters
176
-
177
- result = {
178
- "name": tokenizer_name,
179
- "impl": str(tokenizer.__class__),
180
- "vocab_size": len(tokenizer),
181
- "中文token数": len(has_zh_tokens),
182
- "中文token的平均长度": None,
183
- "纯中文token的平均长度": None,
184
- "中文标点数": zh_symbol_count,
185
- "中文汉字编码长度均值": mean_length,
186
- "中文汉字编码长度分布": json.dumps(dist_length),
187
- "纯数字token数": len(all_digit_tokens),
188
- "包含数字token数": len(has_digit_tokens),
189
- "纯数字token的平均长度": round(_mean([len(item) for item in all_digit_tokens]), 2),
190
- "纯中文token数": None, # all_zh
191
- "纯space的token数": len(all_space_tokens),
192
- "纯space的token数": len(all_space_tokens), # "#"
193
- "纯space的token的平均长度": None, # avg_len( tokens_contains_space)
194
- "contains_korea": None,
195
- }
196
- out_path = os.path.join(cache_dir, f"{tokenizer_name}.vocab.jsonl")
197
- logger.info(f"saving vocab to {out_path}")
198
- with open(out_path, "w", encoding="utf-8") as f_out:
199
- f_out.write(json.dumps(result, ensure_ascii=False) + "\n")
200
- for line in buffer:
201
- f_out.write(line)
202
- cache[tokenizer_name] = result
203
- return result
204
-
205
-
206
-
207
-
208
-
209
-
210
- if __name__ == "__main__":
211
- # test_coding_length(jd_vocab_tokens, filter=lambda k: not is_chinese(k))
212
- # test_coding_length(zh_punc)
213
- # test_coding_length(zh_iterator())
214
-
215
- # from vocab.chatglm2_6b import tokenizer; name = "chatglm2_6b"
216
- # from vocab.chatglm_6b import tokenizer; name="chatglm_6b"
217
- # from vocab.baichuan2 import tokenizer; name="baichuan2"
218
- name="gpt_4"
219
- # name="gpt2"
220
- # name="qwen1_5_14b_chat"
221
- # name="gpt_nexo_20b"
222
- # name="fastchat_t5_3b"
223
-
224
-
225
- print(iter_vocab(name))
226
-
227
-
228
-
229
-
230
-
231
-
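The deleted `get_coding_length` above measured how many tokens a tokenizer spends per single Chinese character. For reference, a self-contained sketch of the same idea with a Hugging Face tokenizer (the `gpt2` checkpoint is only an example, not the repo's default):

```python
from collections import Counter
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("gpt2")  # example checkpoint

def coding_length(chars):
    """Distribution and mean of tokens-per-character over single characters."""
    lengths = [len(tokenizer.encode(ch, add_special_tokens=False)) for ch in chars]
    return Counter(lengths), round(sum(lengths) / len(lengths), 2)

# CJK Unified Ideographs U+4E00..U+9FA5, as in the deleted zh_iterator()
zh_chars = [chr(cp) for cp in range(0x4E00, 0x9FA6)]
dist, mean = coding_length(zh_chars)
print(dist, mean)  # byte-level BPE typically spends 2-3 tokens per hanzi
```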
utils/convert_sp_to_json.py DELETED
@@ -1,4 +0,0 @@
1
-
2
- from vocab.baichuan_7b import tokenizer
3
-
4
- tokenizer.sp
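The deleted stub above stops at accessing `tokenizer.sp`; for reference, a hedged sketch of what dumping a sentencepiece model to JSON usually looks like (file paths are placeholders):

```python
import json
import sentencepiece as spm

sp = spm.SentencePieceProcessor()
sp.Load("tokenizer.model")  # placeholder path

# piece -> {id, score} mapping for the whole vocabulary
vocab = {sp.id_to_piece(i): {"id": i, "score": sp.get_score(i)}
         for i in range(sp.piece_size())}

with open("tokenizer.vocab.json", "w", encoding="utf-8") as f:
    json.dump(vocab, f, ensure_ascii=False, indent=2)
```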
utils/fn_util.py DELETED
File without changes
utils/lang_util.py CHANGED
@@ -18,43 +18,39 @@ import re
18
  # Most tokens are 'latin', so Latin is not counted here.
19
  common = ['Chinese', 'Japanese-Kana', 'Korean', 'Arabic', 'number']
20
 
21
  def detect_language(s):
22
  # Unicode character ranges for each language
23
- language_ranges = {
24
- 'Arabic': r'[\u0600-\u06FF\u0750-\u077F\u08A0-\u08FF\uFB50-\uFDFF\uFE70-\uFEFF]',
25
- # 'CJK' https://en.wikipedia.org/wiki/CJK_Unified_Ideographs
26
- 'Chinese': r'[\u4e00-\u9fff]',
27
- 'Japanese': r'[\u3040-\u309F\u30A0-\u30FF\u4E00-\u9FFF\u3400-\u4DBF]', # https://stackoverflow.com/questions/19899554/unicode-range-for-japanese
28
- 'Japanese-Kana': r'[\u3040-\u309F\u30A0-\u30FF]', # Hiragana & Katakana
29
- # 'Korean': r'[\uac00-\ud7a3]',
30
- 'Hangul': r'[\uac00-\ud7a3]',
31
-
32
-
33
- # 拉丁字母系列
34
- 'Latin': r'[\u0000-\u007F\u0080-\u00FF]',
35
- 'English': r'[A-Za-z]', # may overlap with other languages that use basic Latin letters
36
- 'French': r'[\u00C0-\u00FF]',
37
- 'German': r'[\u00C4\u00D6\u00DC\u00E4\u00F6\u00FC\u00DF]',
38
- 'Spanish-': r'[\u00C1\u00E1\u00C9\u00E9\u00CD\u00ED\u00D3\u00F3\u00DA\u00FA\u00D1\u00F1\u00FC]', # characters specific to Spanish
39
-
40
-
41
- # Slavic family
42
- 'Cyrillic': r'[\u0400-\u04FF\u0500-\u052F\u2DE0-\u2DFF\uA640-\uA69F]',
43
-
44
- #
45
- 'Greek': r'[\u0370-\u03FF\u1F00-\u1FFF]', # Greek alphabet
46
- 'Hebrew': r'[\u0590-\u05FF\uFB1D-\uFB4F]', # Hebrew
47
-
48
-
49
- }
50
-
51
  detected_languages = []
52
-
53
  for language, pattern in language_ranges.items():
54
  if re.search(pattern, s):
55
  detected_languages.append(language)
56
 
57
- return detected_languages if detected_languages else ['Unknown']
58
 
59
 
60
  if __name__ == "__main__":
 
18
  # Most tokens are 'latin', so Latin is not counted here.
19
  common = ['Chinese', 'Japanese-Kana', 'Korean', 'Arabic', 'number']
20
 
21
+ language_ranges = {
22
+ ('Arabic', 'ar'): r'[\u0600-\u06FF\u0750-\u077F\u08A0-\u08FF\uFB50-\uFDFF\uFE70-\uFEFF]',
23
+ # 'CJK' https://en.wikipedia.org/wiki/CJK_Unified_Ideographs
24
+ ('Chinese', 'zh'): r'[\u4e00-\u9fff]',
25
+ ('Japanese', 'ja'): r'[\u3040-\u309F\u30A0-\u30FF\u4E00-\u9FFF\u3400-\u4DBF]',
26
+ # https://stackoverflow.com/questions/19899554/unicode-range-for-japanese
27
+ # Kana type refers to Japanese hiragana and katakana characters that represent phonetic sounds in the Japanese language.
28
+ ('Japanese-Kana', 'ja-kana'): r'[\u3040-\u309F\u30A0-\u30FF]', # Hiragana & Katakana
29
+ ('Korean', 'ko'): r'[\uac00-\ud7a3]',
30
+
31
+ # Latin-script family
32
+ # ('Latin', 'la'): r'[\u0000-\u007F\u0080-\u00FF]',
33
+ # ('English', 'en'): r'[A-Za-z]', # may overlap with other languages that use basic Latin letters
34
+ # ('French', 'fr'): r'[\u00C0-\u00FF]',
35
+ # ('German', 'de'): r'[\u00C4\u00D6\u00DC\u00E4\u00F6\u00FC\u00DF]',
36
+ # ('Spanish-specific'): r'[\u00C1\u00E1\u00C9\u00E9\u00CD\u00ED\u00D3\u00F3\u00DA\u00FA\u00D1\u00F1\u00FC]', # characters specific to Spanish
37
+
38
+ # Slavic family
39
+ # ('Cyrillic', ''): r'[\u0400-\u04FF\u0500-\u052F\u2DE0-\u2DFF\uA640-\uA69F]',
40
+
41
+ #
42
+ # 'Greek': r'[\u0370-\u03FF\u1F00-\u1FFF]', # Greek alphabet
43
+ # 'Hebrew': r'[\u0590-\u05FF\uFB1D-\uFB4F]', # Hebrew
44
+ }
45
+
46
  def detect_language(s):
47
  # Unicode character ranges for each language
48
  detected_languages = []
 
49
  for language, pattern in language_ranges.items():
50
  if re.search(pattern, s):
51
  detected_languages.append(language)
52
 
53
+ return detected_languages
54
 
55
 
56
  if __name__ == "__main__":
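The refactor above hoists `language_ranges` to module level (so the regex table is not rebuilt on every call), keys it by `(name, ISO code)` tuples, comments out the overlapping Latin-script entries, and returns an empty list instead of `['Unknown']`. A quick usage check against the new ranges:

```python
from utils.lang_util import detect_language

print(detect_language("確実に春が近づいてること"))
# [('Chinese', 'zh'), ('Japanese', 'ja'), ('Japanese-Kana', 'ja-kana')]
print(detect_language("15~17번 홀에선"))  # [('Korean', 'ko')]
print(detect_language("hello"))          # [] -- Latin ranges are commented out
```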
utils/lang_util_2.py DELETED
@@ -1,115 +0,0 @@
1
- """
2
- Japanese, Korean, etc.
3
- https://www.cnblogs.com/luoganttcc/p/16605150.html
4
- https://zhuanlan.zhihu.com/p/618684374
5
- - https://zhuanlan.zhihu.com/p/84625185 (recommended)
6
-
7
-
8
- ## Related packages
9
-
10
- import opencc
11
- import langid
12
- import langdetect
13
- https://github.com/pemistahl/lingua-py
14
- - How it works:
15
-
16
-
17
- """
18
-
19
-
20
-
21
- from zhon.hanzi import punctuation as zh_punc
22
-
23
- def is_zh_char(uchar):
24
- """
25
- https://github.com/fxsjy/jieba/blob/master/jieba/__init__.py#L48
26
- re.compile("([\u4E00-\u9FD5]+)", re.U)
27
- """
28
- return u'\u4e00' <= uchar <= u'\u9fa5'
29
-
30
- def has_zh_punc(text):
31
- """
32
- Whether the text contains Chinese punctuation
33
- """
34
- return any(ch in zh_punc for ch in text)
35
-
36
-
37
- def has_zh(text):
38
- """ contains Chinese characters """
39
- return any(is_zh_char(ch) for ch in text)
40
-
41
-
42
- def get_zh_count(text):
43
- return sum([is_zh_char(uchar) for uchar in text])
44
-
45
-
46
- def is_all_zh(text):
47
- return all(is_zh_char(char) for char in text)
48
-
49
-
50
- def is_all_en(text):
51
- return text.encode('utf-8').isalpha()
52
-
53
-
54
-
55
-
56
- ranges = [
57
- {"from": ord(u"\u3300"), "to": ord(u"\u33ff")}, # compatibility ideographs
58
- {"from": ord(u"\ufe30"), "to": ord(u"\ufe4f")}, # compatibility ideographs
59
- {"from": ord(u"\uf900"), "to": ord(u"\ufaff")}, # compatibility ideographs
60
- {"from": ord(u"\U0002F800"), "to": ord(u"\U0002fa1f")}, # compatibility ideographs
61
- {'from': ord(u'\u3040'), 'to': ord(u'\u309f')}, # Japanese Hiragana, 96 code points
62
- {"from": ord(u"\u30a0"), "to": ord(u"\u30ff")}, # Japanese Katakana, 96 code points
63
- {"from": ord(u"\u2e80"), "to": ord(u"\u2eff")}, # cjk radicals supplement
64
- {"from": ord(u"\u4e00"), "to": ord(u"\u9fff")}, # Chinese u"\u4e00"-'\u9fa5',
65
- {"from": ord(u"\u3400"), "to": ord(u"\u4dbf")}, #
66
- {"from": ord(u"\U00020000"), "to": ord(u"\U0002a6df")},
67
- {"from": ord(u"\U0002a700"), "to": ord(u"\U0002b73f")},
68
- {"from": ord(u"\U0002b740"), "to": ord(u"\U0002b81f")},
69
- {"from": ord(u"\U0002b820"), "to": ord(u"\U0002ceaf")} # included as of Unicode 8.0
70
- ]
71
-
72
- # Korean [\uac00-\ud7ff]
73
-
74
-
75
- def is_cjk(char):
76
- """
77
- CJK (Chinese, Japanese, Korean)
78
- Japanese uses many Chinese characters: there are over 20,000 Japanese kanji.
79
- Korean has Hangul (50+ jamo) and over 20,000 Korean hanja.
80
- """
81
- return any([range["from"] <= ord(char) <= range["to"] for range in ranges])
82
-
83
-
84
- def cjk_substrings(string):
85
- i = 0
86
- while i < len(string):
87
- if is_cjk(string[i]):
88
- start = i
89
- while is_cjk(string[i]): i += 1
90
- yield string[start:i]
91
- i += 1
92
-
93
-
94
- def aa():
95
- # string = "sdf344asfasf天地方益3権sdfsdf".decode("utf-8")
96
- for idx, item in enumerate(ranges):
97
- print(idx, end=": ")
98
- for j in range(10):
99
- print(chr(item["from"] + j), end=", ")
100
- print("")
101
- # for sub in cjk_substrings(string):
102
- # string = string.replace(sub, "(" + sub + ")")
103
- # print(string)
104
-
105
-
106
- def is_traditional_chinese(text):
107
- cc = opencc.OpenCC('t2s')
108
- converted_text = cc.convert(text)
109
- if converted_text != text:
110
- return True
111
- return False
112
-
113
-
114
-
115
- # aa()
 
utils/oov.md ADDED
@@ -0,0 +1,202 @@
1
+
2
+ ```sh
3
+ ###################################
4
+ ClueAI/ChatYuan-large-v2, <class 'tokenizers.models.Unigram'>
5
+ reversible: false; unk_token: <unk>, 2, unk_ratio: 0.2000; oov: []
6
+ text[7] = "Замглавы управления развития; 특히 주소 15~17번 홀에선 3연속; 確実に春が近づいてること; a közoktatással? _ Belföld; pumë, i vjetër, vjeç; ئەردوغان ۋە قىرغىزىستان ; निम्न में से कौन सा हारडवेयर; ተለዋዋጭ የግድግዳ ; Дзейныя асобы:; « અમરેલીનાં મહિલા વિકાસ; \t\n\r🦙❤❥웃유♋☮✊;װיקיװערטערבוך ",
7
+ decoding[7] = "<unk>амглав<unk> у<unk>равления развития; <unk> <unk> 15~17<unk> <unk> 3<unk>; 確実に春が近づいてること; a közoktatással? _ Belföld; pum<unk>, i vjet<unk>r, vjeç; <unk>ا<unk> <unk> <unk>ا<unk> ; <unk> <unk> <unk> <unk> <unk> <unk>; <unk> <unk> ; <unk>зейн<unk>я асо<unk>:; <unk> <unk> <unk> <unk>; <unk>;<unk>"
8
+
9
+
10
+ ###################################
11
+ ClueAI/PromptCLUE-base, <class 'tokenizers.models.Unigram'>
12
+ reversible: false; unk_token: <unk>, 2, unk_ratio: 0.2000; oov: []
13
+ text[7] = "Замглавы управления развития; 특히 주소 15~17번 홀에선 3연속; 確実に春が近づいてること; a közoktatással? _ Belföld; pumë, i vjetër, vjeç; ئەردوغان ۋە قىرغىزىستان ; निम्न में से कौन सा हारडवेयर; ተለዋዋጭ የግድግዳ ; Дзейныя асобы:; « અમરેલીનાં મહિલા વિકાસ; \t\n\r🦙❤❥웃유♋☮✊;װיקיװערטערבוך ",
14
+ decoding[7] = "<unk>амглав<unk> у<unk>равления развития; <unk> <unk> 15~17<unk> <unk> 3<unk>; 確実に春が近づいてること; a közoktatással? _ Belföld; pum<unk>, i vjet<unk>r, vjeç; <unk>ا<unk> <unk> <unk>ا<unk> ; <unk> <unk> <unk> <unk> <unk> <unk>; <unk> <unk> ; <unk>зейн<unk>я асо<unk>:; <unk> <unk> <unk> <unk>; <unk>;<unk>"
15
+ ###################################
16
+ CohereForAI/aya-101, <class 'tokenizers.models.Unigram'>
17
+ reversible: false; unk_token: <unk>, 2, unk_ratio: 0.0079; oov: []
18
+ text[73] = " a közoktatással? _ Belföld; pumë, i vjetër, vjeç; ئەردوغان ۋە قىرغىزىستان ; निम्न में से कौन सा हारडवेयर; ተለዋዋጭ የግድግዳ ; Дзейныя асобы:; « અમરેલીનાં મહિલા વિકાસ; \t\n\r🦙❤❥웃유♋☮✊;װיקיװערטערבוך ",
19
+ decoding[73] = "a közoktatással? _ Belföld; pumë, i vjetër, vjeç; ئەردوغان ۋە قىرغىزىستان ; निम्न में से कौन सा हारडवेयर; ተለዋዋጭ የግድግዳ ; Дзейныя асобы:; « અમરેલીનાં મહિલા વિકાસ; <unk>❤❥웃유♋☮✊;װיקיװערטערבוך"
20
+ ###################################
21
+ FacebookAI/xlm-roberta-base, <class 'tokenizers.models.Unigram'>
22
+ reversible: false; unk_token: <unk>, 3, unk_ratio: 0.0096; oov: []
23
+ text[73] = " a közoktatással? _ Belföld; pumë, i vjetër, vjeç; ئەردوغان ۋە قىرغىزىستان ; निम्न में से कौन सा हारडवेयर; ተለዋዋጭ የግድግዳ ; Дзейныя асобы:; « અમરેલીનાં મહિલા વિકાસ; \t\n\r🦙❤❥웃유♋☮✊;װיקיװערטערבוך ",
24
+ decoding[73] = "a közoktatással? _ Belföld; pumë, i vjetër, vjeç; ئەردوغان ۋە قىرغىزىستان ; निम्न में से कौन सा हारडवेयर; ተለዋዋጭ የግድግዳ ; Дзейныя асобы:; « અમરેલીનાં મહિલા વિકાસ; <unk>❤❥웃유♋☮✊;װיקיװערטערבוך"
25
+ ###################################
26
+ OrionStarAI/Orion-14B-Chat, sp_model, byte_num: 0
27
+ reversible: false; unk_token: <unk>, 0, unk_ratio: 0.0495; oov: []
28
+ text[71] = "; a közoktatással? _ Belföld; pumë, i vjetër, vjeç; ئەردوغان ۋە قىرغىزىستان ; निम्न में से कौन सा हारडवेयर; ተለዋዋጭ የግድግዳ ; Дзейныя асобы:; « અમરેલીનાં મહિલા વિકાસ; \t\n\r🦙❤❥웃유♋☮✊;װיקיװערטערבוך ",
29
+ decoding[71] = "; a közoktatással? _ Belföld; pumë, i vjetër, vjeç; ئ<unk> ⁇ ردوغان <unk> ⁇ قىرغىزىستان ; निम्न में से कौन सा हारडवेयर; ተለ<unk> ⁇ ጭ የግድግ<unk> ⁇ ; Дзейныя асобы:; « <unk> ⁇ <unk> ⁇ <unk> ⁇ ; \t\n <unk> ⁇ ❤❥웃유♋☮✊; <unk> ⁇ יקי<unk> ⁇ ערטערבוך "
30
+ ###################################
31
+ THUDM/chatglm-6b, byte_num: 256
32
+ reversible: false; unk_token: <unk>, 0, unk_ratio: 0.0000; oov: []
33
+ text[237] = "\r🦙❤❥웃유♋☮✊;װיקיװערטערבוך ",
34
+ decoding[237] = " 🦙❤❥웃유♋☮✊;װיקיװערטערבוך"
35
+ ###################################
36
+ abeja/gpt-neox-japanese-2.7b, japanese-bpe: https://github.com/tanreinama/Japanese-BPEEncoder_V2
37
+ reversible: false; unk_token: <|endoftext|>, 31999, unk_ratio: 0.0000; oov: []
38
+ text[7] = "Замглавы управления развития; 특히 주소 15~17번 홀에선 3연속; 確実に春が近づいてること; a közoktatással? _ Belföld; pumë, i vjetër, vjeç; ئەردوغان ۋە قىرغىزىستان ; निम्न में से कौन सा हारडवेयर; ተለዋዋጭ የግድግዳ ; Дзейныя асобы:; « અમરેલીનાં મહિલા વિકાસ; \t\n\r🦙❤❥웃유♋☮✊;װיקיװערטערבוך ",
39
+ decoding[7] = "���������������� �������������������� ����������������; ������ ������ 15~17��� ��������� 3������; 確実に春が近づいてること; a k��zoktat��ssal? _ Belf��ld; pum��, i vjet��r, vje��; ���������������� ���� ���������������������� ; ��������������� ��������� ������ ��������� ������ ������������������������; ��������������� ��������������� ; �������������� ����������:; ǀ ��������������������������� ��������������� ���������������; \t\n\n🐯❤‖������🟥🟥🤚;��������������������������"
40
+
41
+
42
+ ###################################
43
+ baichuan-inc/Baichuan-7B, sp_model, byte_num: 256
44
+ reversible: false; unk_token: <unk>, 0, unk_ratio: 0.0000; oov: []
45
+ text[237] = "\r🦙❤❥웃유♋☮✊;װיקיװערטערבוך ",
46
+ decoding[237] = " 🦙❤❥웃유♋☮✊;װיקיװערטערבוך "
47
+ ###################################
48
+ ckiplab/gpt2-base-chinese, <class 'tokenizers.models.WordPiece'>
49
+ reversible: false; unk_token: [UNK], 100, unk_ratio: 0.1185; oov: []
50
+ text[5] = "; Замглавы управления развития; 특히 주소 15~17번 홀에선 3연속; 確実に春が近づいてること; a közoktatással? _ Belföld; pumë, i vjetër, vjeç; ئەردوغان ۋە قىرغىزىستان ; निम्न में से कौन सा हारडवेयर; ተለዋዋጭ የግድግዳ ; Дзейныя асобы:; « અમરેલીનાં મહિલા વિકાસ; \t\n\r🦙❤❥웃유♋☮✊;װיקיװערטערבוך ",
51
+ decoding[5] = " ; замглавы управления развития ; 특히 주소 15 ~ 17번 홀에선 3연속 ; 確 実 に 春 か 近 ついてること ; a kozoktatassal? _ belfold ; pume, i vjeter, vjec ; [UNK] [UNK] [UNK] ; [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] ; [UNK] [UNK] ; дзеиныя асобы : ; « [UNK] [UNK] [UNK] ; [UNK] ; [UNK]"
52
+
53
+
54
+ ###################################
55
+ cl-tohoku/bert-base-japanese, wordpiece.MecabTokenizer, supports byte-level https://github.com/polm/fugashi
56
+ reversible: false; unk_token: [UNK], 1, unk_ratio: 0.3951; oov: []
57
+ text[5] = "; Замглавы управления развития; 특히 주소 15~17번 홀에선 3연속; 確実に春が近づいてること; a közoktatással? _ Belföld; pumë, i vjetër, vjeç; ئەردوغان ۋە قىرغىزىستان ; निम्न में से कौन सा हारडवेयर; ተለዋዋጭ የግድግዳ ; Дзейныя асобы:; « અમરેલીનાં મહિલા વિકાસ; \t\n\r🦙❤❥웃유♋☮✊;װיקיװערטערבוך ",
58
+ decoding[5] = " ; [UNK] [UNK] [UNK] ; [UNK] [UNK] 15 ~ 17 [UNK] [UNK] 3 [UNK] ; 確実 に 春 が 近づい てる こと ; a közoktatással? _ Belföld ; [UNK], i [UNK], vjeç ; [UNK] [UNK] [UNK] ; [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] ; [UNK] [UNK] ; [UNK] [UNK] :; [UNK] [UNK] [UNK] [UNK] ; [UNK] [UNK] [UNK] [UNK] [UNK]"
59
+
60
+
61
+ ###################################
62
+ clue/roberta_chinese_clue_tiny, <class 'tokenizers.models.WordPiece'>
63
+ reversible: false; unk_token: [UNK], 100, unk_ratio: 0.3580; oov: []
64
+ text[5] = "; Замглавы управления развития; 특히 주소 15~17번 홀에선 3연속; 確実に春が近づいてること; a közoktatással? _ Belföld; pumë, i vjetër, vjeç; ئەردوغان ۋە قىرغىزىستان ; निम्न में से कौन सा हारडवेयर; ተለዋዋጭ የግድግዳ ; Дзейныя асобы:; « અમરેલીનાં મહિલા વિકાસ; \t\n\r🦙❤❥웃유♋☮✊;װיקיװערטערבוך ",
65
+ decoding[5] = " ; [UNK] [UNK] [UNK] ; [UNK] [UNK] 15 ~ [UNK] [UNK] [UNK] ; [UNK] 実 [UNK] 春 [UNK] 近 [UNK] ; a kozoktatassal? _ belfold ; pume, i vjeter, vjec ; [UNK] [UNK] [UNK] ; [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] ; [UNK] [UNK] ; [UNK] асобы : ; « [UNK] [UNK] [UNK] ; [UNK] ; [UNK]"
66
+
67
+
68
+ ###################################
69
+ dbmdz/bert-base-german-uncased, <class 'tokenizers.models.WordPiece'>
70
+ reversible: false; unk_token: [UNK], 101, unk_ratio: 0.4459; oov: []
71
+ text[5] = "; Замглавы управления развития; 특히 주소 15~17번 홀에선 3연속; 確実に春が近づいてること; a közoktatással? _ Belföld; pumë, i vjetër, vjeç; ئەردوغان ۋە قىرغىزىستان ; निम्न में से कौन सा हारडवेयर; ተለዋዋጭ የግድግዳ ; Дзейныя асобы:; « અમરેલીનાં મહિલા વિકાસ; \t\n\r🦙❤❥웃유♋☮✊;װיקיװערטערבוך ",
72
+ decoding[5] = " ; [UNK] [UNK] [UNK] ; [UNK] [UNK] 15 ~ [UNK] [UNK] [UNK] ; [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] ; a kozoktatassal? _ belfold ; pume, i vjeter, vjec ; [UNK] [UNK] [UNK] ; [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] ; [UNK] [UNK] ; [UNK] [UNK] : ; « [UNK] [UNK] [UNK] ; [UNK] ; [UNK]"
73
+ ###################################
74
+ deepseek-ai/deepseek-coder-33b-instruct, <class 'tokenizers.models.BPE'>
75
+ reversible: false; unk_token: None, None, unk_ratio: 0.0000; oov: []
76
+ text[77] = "özoktatással? _ Belföld; pumë, i vjetër, vjeç; ئەردوغان ۋە قىرغىزىستان ; निम्न में से कौन सा हारडवेयर; ተለዋዋጭ የግድግዳ ; Дзейныя асобы:; « અમરેલીનાં મહિલા વિકાસ; \t\n\r🦙❤❥웃유♋☮✊;װיקיװערטערבוך ",
77
+ decoding[77] = "�zoktatással? _ Belf�ld; pumë, i vjetër, vjeç; ئەردوغان ۋە قىرغىزىستان ; निम्न में से कौन सा हारडवेयर; ተለዋዋጭ የግድግዳ ; Дзейныя асобы:; « અમરેલીનાં મહિલા વિકાસ; \t\n\r🦙❤❥웃유♋☮✊;װיקיװערטערבוך "
78
+ Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
79
+ [2024-05-12 00:30:36] [INFO] [34044:7360] [__init__.py:343:load_tokenizer] loading tokenizer eson/kplug-base-encoder
80
+ ###################################
81
+ deepseek-ai/deepseek-llm-7b-base, <class 'tokenizers.models.BPE'>
82
+ reversible: false; unk_token: None, None, unk_ratio: 0.0000; oov: []
83
+ text[77] = "özoktatással? _ Belföld; pumë, i vjetër, vjeç; ئەردوغان ۋە قىرغىزىستان ; निम्न में से कौन सा हारडवेयर; ተለዋዋጭ የግድግዳ ; Дзейныя асобы:; « અમરેલીનાં મહિલા વિકાસ; \t\n\r🦙❤❥웃유♋☮✊;װיקיװערטערבוך ",
84
+ decoding[77] = "�zoktatással? _ Belf�ld; pumë, i vjetër, vjeç; ئەردوغان ۋە قىرغىزىستان ; निम्न में से कौन सा हारडवेयर; ተለዋዋጭ የግድግዳ ; Дзейныя асобы:; « અમરેલીનાં મહિલા વિકાસ; \t\n\r🦙❤❥웃유♋☮✊;װיקיװערטערבוך "
85
+ [2024-05-12 00:30:56] [INFO] [34044:7360] [__init__.py:343:load_tokenizer] loading tokenizer fnlp/moss-moon-003-sft
86
+ ###################################
87
+ eson/kplug-base-encoder, <class 'tokenizers.models.WordPiece'>
88
+ reversible: false; unk_token: [UNK], 100, unk_ratio: 0.3625; oov: []
89
+ text[5] = "; Замглавы управления развития; 특히 주소 15~17번 홀에선 3연속; 確実に春が近づいてること; a közoktatással? _ Belföld; pumë, i vjetër, vjeç; ئەردوغان ۋە قىرغىزىستان ; निम्न में से कौन सा हारडवेयर; ተለዋዋጭ የግድግዳ ; Дзейныя асобы:; « અમરેલીનાં મહિલા વિકાસ; \t\n\r🦙❤❥웃유♋☮✊;װיקיװערטערבוך ",
90
+ decoding[5] = " ; [UNK] [UNK] [UNK] ; [UNK] [UNK] 15 ~ [UNK] [UNK] [UNK] ; [UNK] 実 [UNK] 春 [UNK] 近 [UNK] ; a kozoktatassal? _ belfold ; pume, i vjeter, vjec ; [UNK] [UNK] [UNK] ; [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] ; [UNK] [UNK] ; [UNK] асобы : ; « [UNK] [UNK] [UNK] ; [UNK] ; [UNK]"
91
+ Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
92
+ [2024-05-12 00:31:36] [INFO] [34044:7360] [__init__.py:343:load_tokenizer] loading tokenizer google-bert/bert-base-cased
93
+ ###################################
94
+ fnlp/moss-moon-003-sft, presumably sentencepiece.byte_bpe, to be confirmed
95
+ reversible: false; unk_token: <|endoftext|>, 106028, unk_ratio: 0.0000; oov: []
96
+ text[74] = "a közoktatással? _ Belföld; pumë, i vjetër, vjeç; ئەردوغان ۋە قىرغىزىستان ; निम्न में से कौन सा हारडवेयर; ተለዋዋጭ የግድግዳ ; Дзейныя асобы:; « અમરેલીનાં મહિલા વિકાસ; \t\n\r🦙❤❥웃유♋☮✊;װיקיװערטערבוך ",
97
+ decoding[74] = " a közoktatással? _ Belföld; pumë, i vjetër, vjeç; ئەردوغان ۋە قىرغىزىستان ; निम्न में से कौन सा हारडवेयर; ተለዋዋጭ የግድግዳ ; Дзейныя асобы:; « અમરેલીનાં મહિલા વિકાસ; \t\n\r🦙❤❥웃유♋☮✊;װיקיװערטערבוך "
98
+ ###################################
99
+ google-bert/bert-base-cased, <class 'tokenizers.models.WordPiece'>
100
+ reversible: false; unk_token: [UNK], 100, unk_ratio: 0.1732; oov: []
101
+ text[5] = "; Замглавы управления развития; 특히 주소 15~17번 홀에선 3연속; 確実に春が近づいてること; a közoktatással? _ Belföld; pumë, i vjetër, vjeç; ئەردوغان ۋە قىرغىزىستان ; निम्न में से कौन सा हारडवेयर; ተለዋዋጭ የግድግዳ ; Дзейныя асобы:; « અમરેલીનાં મહિલા વિકાસ; \t\n\r🦙❤❥웃유♋☮✊;װיקיװערטערבוך ",
102
+ decoding[5] = " ; Замглавы управления развития ; [UNK] [UNK] 15 ~ [UNK] [UNK] [UNK] ; [UNK] [UNK] に [UNK] [UNK] [UNK] [UNK] ; a közoktatással? _ Belföld ; pumë, i vjetër, vjeç ; [UNK] [UNK] قىرغىزىستان ; निम्न में से [UNK] सा [UNK] ; [UNK] [UNK] ; Дзейныя асобы : ; « [UNK] [UNK] [UNK] ; [UNK] ; [UNK]"
103
+ [2024-05-12 00:31:56] [INFO] [34044:7360] [__init__.py:343:load_tokenizer] loading tokenizer google-bert/bert-base-chinese
104
+ [2024-05-12 00:32:16] [INFO] [34044:7360] [__init__.py:343:load_tokenizer] loading tokenizer google-bert/bert-base-german-cased
105
+ ###################################
106
+ google-bert/bert-base-chinese, <class 'tokenizers.models.WordPiece'>
107
+ reversible: false; unk_token: [UNK], 100, unk_ratio: 0.3704; oov: []
108
+ text[5] = "; Замглавы управления развития; 특히 주소 15~17번 홀에선 3연속; 確実に春が近づいてること; a közoktatással? _ Belföld; pumë, i vjetër, vjeç; ئەردوغان ۋە قىرغىزىستان ; निम्न में से कौन सा हारडवेयर; ተለዋዋጭ የግድግዳ ; Дзейныя асобы:; « અમરેલીનાં મહિલા વિકાસ; \t\n\r🦙❤❥웃유♋☮✊;װיקיװערטערבוך ",
109
+ decoding[5] = " ; [UNK] управления развития ; [UNK] [UNK] 15 ~ [UNK] [UNK] [UNK] ; 確 実 に 春 [UNK] 近 [UNK] ; a [UNK]? _ [UNK] ; [UNK], i [UNK], [UNK] ; [UNK] [UNK] [UNK] ; [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] ; [UNK] [UNK] ; [UNK] асобы : ; « [UNK] [UNK] [UNK] ; [UNK] ; [UNK]"
110
+ ###################################
111
+ google-bert/bert-base-german-cased, <class 'tokenizers.models.WordPiece'>
112
+ reversible: false; unk_token: [UNK], 2, unk_ratio: 0.5938; oov: []
113
+ text[5] = "; Замглавы управления развития; 특히 주소 15~17번 홀에선 3연속; 確実に春が近づいてること; a közoktatással? _ Belföld; pumë, i vjetër, vjeç; ئەردوغان ۋە قىرغىزىستان ; निम्न में से कौन सा हारडवेयर; ተለዋዋጭ የግድግዳ ; Дзейныя асобы:; « અમરેલીનાં મહિલા વિકાસ; \t\n\r🦙❤❥웃유♋☮✊;װיקיװערטערבוך ",
114
+ decoding[5] = " ; [UNK] [UNK] [UNK] ; [UNK] [UNK] 15 ~ [UNK] [UNK] [UNK] ; [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] ; a [UNK]? _ Belföld ; [UNK], i [UNK], [UNK] ; [UNK] [UNK] [UNK] ; [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] ; [UNK] [UNK] ; [UNK] [UNK] : ; [UNK] [UNK] [UNK] [UNK] ; [UNK] ; [UNK]"
115
+ [2024-05-12 00:32:36] [INFO] [34044:7360] [__init__.py:343:load_tokenizer] loading tokenizer google-bert/bert-base-multilingual-cased
116
+ [2024-05-12 00:32:57] [INFO] [34044:7360] [__init__.py:343:load_tokenizer] loading tokenizer google-bert/bert-base-multilingual-uncased
117
+ ###################################
118
+ google-bert/bert-base-multilingual-cased, <class 'tokenizers.models.WordPiece'>
119
+ reversible: false; unk_token: [UNK], 100, unk_ratio: 0.0531; oov: []
120
+ text[5] = "; Замглавы управления развития; 특히 주소 15~17번 홀에선 3연속; 確実に春が近づいてること; a közoktatással? _ Belföld; pumë, i vjetër, vjeç; ئەردوغان ۋە قىرغىزىستان ; निम्न में से कौन सा हारडवेयर; ተለዋዋጭ የግድግዳ ; Дзейныя асобы:; « અમરેલીનાં મહિલા વિકાસ; \t\n\r🦙❤❥웃유♋☮✊;װיקיװערטערבוך ",
121
+ decoding[5] = " ; Замглавы управления развития ; 특히 주소 15 ~ 17번 홀에선 3연속 ; 確 実 に 春 が 近 づいてること ; a közoktatással? _ Belföld ; pumë, i vjetër, vjeç ; [UNK] [UNK] قىرغىزىستان ; निम्न में से कौन सा हारडवेयर ; [UNK] [UNK] ; Дзейныя асобы : ; « અમરેલીનાં મહિલા વિકાસ ; [UNK] ; [UNK]"
122
+ [2024-05-12 00:33:17] [INFO] [34044:7360] [__init__.py:343:load_tokenizer] loading tokenizer google-bert/bert-base-uncased
123
+ ###################################
124
+ google-bert/bert-base-multilingual-uncased, <class 'tokenizers.models.WordPiece'>
125
+ reversible: false; unk_token: [UNK], 100, unk_ratio: 0.0360; oov: []
126
+ text[5] = "; Замглавы управления развития; 특히 주소 15~17번 홀에선 3연속; 確実に春が近づいてること; a közoktatással? _ Belföld; pumë, i vjetër, vjeç; ئەردوغان ۋە قىرغىزىستان ; निम्न में से कौन सा हारडवेयर; ተለዋዋጭ የግድግዳ ; Дзейныя асобы:; « અમરેલીનાં મહિલા વિકાસ; \t\n\r🦙❤❥웃유♋☮✊;װיקיװערטערבוך ",
127
+ decoding[5] = " ; замглавы управления развития ; 특히 주소 15 ~ 17번 홀에선 3연속 ; 確 実 に 春 か 近 ついてること ; a kozoktatassal? _ belfold ; pume, i vjeter, vjec ; يەردوغان ۋە قىرغىزىستان ; निमन म स कौन सा हारडवयर ; [UNK] [UNK] ; дзеиныя асобы : ; « અમરલીના મહિલા વિકાસ ; [UNK] ; [UNK]"
128
+ [2024-05-12 00:33:37] [INFO] [34044:7360] [__init__.py:343:load_tokenizer] loading tokenizer google-t5/t5-large
129
+ ###################################
130
+ google-bert/bert-base-uncased, <class 'tokenizers.models.WordPiece'>
131
+ reversible: false; unk_token: [UNK], 100, unk_ratio: 0.0867; oov: []
132
+ text[5] = "; Замглавы управления развития; 특히 주소 15~17번 홀에선 3연속; 確実に春が近づいてること; a közoktatással? _ Belföld; pumë, i vjetër, vjeç; ئەردوغان ۋە قىرغىزىستان ; निम्न में से कौन सा हारडवेयर; ተለዋዋጭ የግድግዳ ; Дзейныя асобы:; « અમરેલીનાં મહિલા વિકાસ; \t\n\r🦙❤❥웃유♋☮✊;װיקיװערטערבוך ",
133
+ decoding[5] = " ; замглавы управления развития ; 특히 주소 15 ~ 17번 홀에선 3연속 ; [UNK] [UNK] に 春 か [UNK] ついてること ; a kozoktatassal? _ belfold ; pume, i vjeter, vjec ; [UNK] [UNK] قىرغىزىستان ; निमन म स [UNK] सा हारडवयर ; [UNK] [UNK] ; дзеиныя асобы : ; « [UNK] [UNK] [UNK] ; [UNK] ; [UNK]"
134
+ ###################################
135
+ google-t5/t5-large, <class 'tokenizers.models.Unigram'>
136
+ reversible: false; unk_token: <unk>, 2, unk_ratio: 0.2769; oov: []
137
+ text[7] = "Замглавы управления развития; 특히 주소 15~17번 홀에선 3연속; 確実に春が近づいてること; a közoktatással? _ Belföld; pumë, i vjetër, vjeç; ئەردوغان ۋە قىرغىزىستان ; निम्न में से कौन सा हारडवेयर; ተለዋዋጭ የግድግዳ ; Дзейныя асобы:; « અમરેલીનાં મહિલા વિકાસ; \t\n\r🦙❤❥웃유♋☮✊;װיקיװערטערבוך ",
138
+ decoding[7] = "<unk>ам<unk>лав<unk> у<unk>равлени<unk> ра<unk>вити<unk>; <unk> <unk> 15<unk>17<unk> <unk> 3<unk>; <unk>; a közoktatással? _ Belföld; pum<unk>, i vjet<unk>r, vjeç; <unk> <unk> <unk> ; <unk> <unk> <unk> <unk> <unk> <unk>; <unk> <unk> ; <unk>е<unk>н<unk> асо<unk>:; « <unk> <unk> <unk>; <unk>;<unk>"
139
+ [2024-05-12 00:34:57] [INFO] [34044:7360] [__init__.py:343:load_tokenizer] loading tokenizer google/byt5-small
140
+ [2024-05-12 00:35:18] [INFO] [34044:7360] [__init__.py:343:load_tokenizer] loading tokenizer google/gemma-7b
141
+ [2024-05-12 00:35:39] [INFO] [34044:7360] [__init__.py:343:load_tokenizer] loading tokenizer google/mobilebert-uncased
142
+ [2024-05-12 00:36:59] [INFO] [34044:7360] [__init__.py:343:load_tokenizer] loading tokenizer google/mt5-large
143
+ ###################################
144
+ google/mobilebert-uncased, <class 'tokenizers.models.WordPiece'>
145
+ reversible: false; unk_token: [UNK], 100, unk_ratio: 0.0867; oov: []
146
+ text[5] = "; Замглавы управления развития; 특히 주소 15~17번 홀에선 3연속; 確実に春が近づいてること; a közoktatással? _ Belföld; pumë, i vjetër, vjeç; ئەردوغان ۋە قىرغىزىستان ; निम्न में से कौन सा हारडवेयर; ተለዋዋጭ የግድግዳ ; Дзейныя асобы:; « અમરેલીનાં મહિલા વિકાસ; \t\n\r🦙❤❥웃유♋☮✊;װיקיװערטערבוך ",
147
+ decoding[5] = " ; замглавы управления развития ; 특히 주소 15 ~ 17번 홀에선 3연속 ; [UNK] [UNK] に 春 か [UNK] ついてること ; a kozoktatassal? _ belfold ; pume, i vjeter, vjec ; [UNK] [UNK] قىرغىزىستان ; निमन म स [UNK] सा हारडवयर ; [UNK] [UNK] ; дзеиныя асобы : ; « [UNK] [UNK] [UNK] ; [UNK] ; [UNK]"
148
+ C:\Users\xusong28\Miniconda3\envs\py3.10-torch1.13-hf.latest\lib\site-packages\transformers\convert_slow_tokenizer.py:560: UserWarning: The sentencepiece tokenizer that you are converting to a fast tokenizer uses the byte fallback option which is not implemented in the fast tokenizers. In practice this means that the fast version of the tokenizer can produce unknown tokens whereas the sentencepiece version would have converted these unknown tokens into a sequence of byte tokens matching the original piece of text.
149
+ warnings.warn(
150
+ [2024-05-12 00:37:23] [INFO] [34044:7360] [__init__.py:343:load_tokenizer] loading tokenizer google/switch-c-2048
151
+ ###################################
152
+ google/mt5-large, <class 'tokenizers.models.Unigram'>
153
+ reversible: false; unk_token: <unk>, 2, unk_ratio: 0.0079; oov: []
154
+ text[73] = " a közoktatással? _ Belföld; pumë, i vjetër, vjeç; ئەردوغان ۋە قىرغىزىستان ; निम्न में से कौन सा हारडवेयर; ተለዋዋጭ የግድግዳ ; Дзейныя асобы:; « અમરેલીનાં મહિલા વિકાસ; \t\n\r🦙❤❥웃유♋☮✊;װיקיװערטערבוך ",
155
+ decoding[73] = "a közoktatással? _ Belföld; pumë, i vjetër, vjeç; ئەردوغان ۋە قىرغىزىستان ; निम्न में से कौन सा हारडवेयर; ተለዋዋጭ የግድግዳ ; Дзейныя асобы:; « અમરેલીનાં મહિલા વિકાસ; <unk>❤❥웃유♋☮✊;װיקיװערטערבוך"
156
+ [2024-05-12 00:37:43] [INFO] [34044:7360] [__init__.py:343:load_tokenizer] loading tokenizer hfl/chinese-alpaca-lora-7b
157
+ ###################################
158
+ google/switch-c-2048, <class 'tokenizers.models.Unigram'>
159
+ reversible: false; unk_token: <unk>, 2, unk_ratio: 0.2769; oov: []
160
+ text[7] = "Замглавы управления развития; 특히 주소 15~17번 홀에선 3연속; 確実に春が近づいてること; a közoktatással? _ Belföld; pumë, i vjetër, vjeç; ئەردوغان ۋە قىرغىزىستان ; निम्न में से कौन सा हारडवेयर; ተለዋዋጭ የግድግዳ ; Дзейныя асобы:; « અમરેલીનાં મહિલા વિકાસ; \t\n\r🦙❤❥웃유♋☮✊;װיקיװערטערבוך ",
161
+ decoding[7] = "<unk>ам<unk>лав<unk> у<unk>равлени<unk> ра<unk>вити<unk>; <unk> <unk> 15<unk>17<unk> <unk> 3<unk>; <unk>; a közoktatással? _ Belföld; pum<unk>, i vjet<unk>r, vjeç; <unk> <unk> <unk> ; <unk> <unk> <unk> <unk> <unk> <unk>; <unk> <unk> ; <unk>е<unk>н<unk> асо<unk>:; « <unk> <unk> <unk>; <unk>;<unk>"
162
+ You are using the default legacy behaviour of the <class 'transformers.models.llama.tokenization_llama.LlamaTokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
163
+ [2024-05-12 00:38:04] [INFO] [34044:7360] [__init__.py:343:load_tokenizer] loading tokenizer hfl/chinese-llama-2-7b
164
+ [2024-05-12 00:38:25] [INFO] [34044:7360] [__init__.py:343:load_tokenizer] loading tokenizer hfl/chinese-llama-lora-7b
165
+ [2024-05-12 00:38:46] [INFO] [34044:7360] [__init__.py:343:load_tokenizer] loading tokenizer hfl/llama-3-chinese-8b
166
+ Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
167
+ [2024-05-12 00:39:07] [INFO] [34044:7360] [__init__.py:343:load_tokenizer] loading tokenizer hpcai-tech/grok-1
168
+ [2024-05-12 00:39:28] [INFO] [34044:7360] [__init__.py:343:load_tokenizer] loading tokenizer internlm/internlm-chat-7b
169
+ [2024-05-12 00:40:09] [INFO] [34044:7360] [__init__.py:343:load_tokenizer] loading tokenizer internlm/internlm-xcomposer-7b
170
+ [2024-05-12 00:40:31] [INFO] [34044:7360] [__init__.py:343:load_tokenizer] loading tokenizer internlm/internlm2-chat-7b
171
+ [2024-05-12 00:41:13] [INFO] [34044:7360] [__init__.py:343:load_tokenizer] loading tokenizer internlm/internlm2-math-7b
172
+ [2024-05-12 00:41:35] [INFO] [34044:7360] [__init__.py:343:load_tokenizer] loading tokenizer lmsys/fastchat-t5-3b-v1.0
173
+ Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
174
+ ###################################
175
+ [2024-05-12 00:41:55] [INFO] [34044:7360] [__init__.py:343:load_tokenizer] loading tokenizer meta-llama/Llama-2-7b-chat
176
+ lmsys/fastchat-t5-3b-v1.0, sp_model, byte_num: 0
177
+ reversible: false; unk_token: <unk>, 2, unk_ratio: 0.2105; oov: []
178
+ text[7] = "Замглавы управления развития; 특히 주소 15~17번 홀에선 3연속; 確実に春が近づいてること; a közoktatással? _ Belföld; pumë, i vjetër, vjeç; ئەردوغان ۋە قىرغىزىستان ; निम्न में से कौन सा हारडवेयर; ተለዋዋጭ የግድግዳ ; Дзейныя асобы:; « અમરેલીનાં મહિલા વિકાસ; \t\n\r🦙❤❥웃유♋☮✊;װיקיװערטערבוך ",
179
+ decoding[7] = " <unk> ам<unk> лав<unk> у<unk> равлени<unk> ра<unk> вити<unk>; <unk> <unk> 15<unk> 17<unk> <unk> 3<unk>; <unk>; a közoktatással? _ Belföld; pum<unk>, i vjet<unk>r, vjeç; <unk> <unk> <unk> ; <unk> <unk> <unk> <unk> <unk> <unk>; <unk> <unk> ; <unk> е<unk> н<unk> асо<unk>:; « <unk> <unk> <unk>; \t \n <unk> ;<unk> "
180
+ [2024-05-12 00:41:55] [INFO] [34044:7360] [__init__.py:343:load_tokenizer] loading tokenizer meta-llama/Meta-Llama-3-8B
181
+ [2024-05-12 00:41:55] [INFO] [34044:7360] [__init__.py:343:load_tokenizer] loading tokenizer microsoft/Phi-3-mini-4k-instruct
182
+ Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
183
+ [2024-05-12 00:42:16] [INFO] [34044:7360] [__init__.py:343:load_tokenizer] loading tokenizer microsoft/phi-1
184
+ [2024-05-12 00:42:36] [INFO] [34044:7360] [__init__.py:343:load_tokenizer] loading tokenizer microsoft/phi-2
185
+ Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
186
+ [2024-05-12 00:42:56] [INFO] [34044:7360] [__init__.py:343:load_tokenizer] loading tokenizer mistralai/Mistral-7B-v0.1
187
+ [2024-05-12 00:43:16] [INFO] [34044:7360] [__init__.py:343:load_tokenizer] loading tokenizer mistralai/Mixtral-8x7B-v0.1
188
+ [2024-05-12 00:43:37] [INFO] [34044:7360] [__init__.py:343:load_tokenizer] loading tokenizer openai-community/gpt2
189
+ [2024-05-12 00:43:57] [INFO] [34044:7360] [__init__.py:343:load_tokenizer] loading tokenizer openai/code-davinci-002
190
+ [2024-05-12 00:43:57] [INFO] [34044:7360] [__init__.py:343:load_tokenizer] loading tokenizer openai/gpt-3.5-turbo
191
+ [2024-05-12 00:43:57] [INFO] [34044:7360] [__init__.py:343:load_tokenizer] loading tokenizer openai/gpt-4
192
+ [2024-05-12 00:43:57] [INFO] [34044:7360] [__init__.py:343:load_tokenizer] loading tokenizer openai/text-davinci-003
193
+ [2024-05-12 00:43:57] [INFO] [34044:7360] [__init__.py:343:load_tokenizer] loading tokenizer paust/pko-t5-large
194
+ Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
195
+ [2024-05-12 00:44:18] [INFO] [34044:7360] [__init__.py:343:load_tokenizer] loading tokenizer thu-coai/CharacterGLM-6B
196
+ [2024-05-12 00:44:58] [INFO] [34044:7360] [__init__.py:343:load_tokenizer] loading tokenizer tiiuae/falcon-180b
197
+ [2024-05-12 00:45:19] [INFO] [34044:7360] [__init__.py:343:load_tokenizer] loading tokenizer tiiuae/falcon-7b
198
+
199
+ Process finished with exit code 0
200
+
201
+
202
+ ```
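The log above is produced by a round-trip check: encode a multilingual probe string, decode it back, and flag the tokenizer as non-reversible when the decoded text no longer contains the original. A minimal standalone sketch of that check, assuming only `transformers` and one of the model names from the log:

```python
# Minimal round-trip OOV check (sketch); the model name is taken from the log above.
from transformers import AutoTokenizer

text = "Замглавы управления развития; 確実に春が近づいてること; ئەردوغان ۋە قىرغىزىستان"
tokenizer = AutoTokenizer.from_pretrained("lmsys/fastchat-t5-3b-v1.0")

ids = tokenizer.encode(text, add_special_tokens=False)
decoded = tokenizer.decode(ids)
unk_ratio = sum(i == tokenizer.unk_token_id for i in ids) / len(ids)

# "reversible" here means decode(encode(text)) still contains the original text.
print(f"reversible: {text in decoded}; unk_ratio: {unk_ratio:.4f}")
```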
utils/oov_util.py CHANGED
@@ -2,11 +2,117 @@
2
 
3
 
4
  import os
 
 
5
 
6
- CURRENT_DIR = os.path.dirname(os.path.abspath(__file__))
7
 
8
- space_tokens = ["空格 ,两个空格 ,三个空格 ,制表符\t,换行符\n"]
 
9
 
10
 
11
 
12
- docs = [line.strip() for line in open(os.path.join(CURRENT_DIR, "test.txt"), "r", encoding="utf-8")]
 
2
 
3
 
4
  import os
5
+ import json
6
+ from vocab import all_tokenizer_config, load_tokenizer, TokenizerImpl
7
 
 
8
 
9
+ text = "hello; Замглавы управления развития; 특히 주소 15~17번 홀에선 3연속;" \
10
+ " 確実に春が近づいてること; a közoktatással? _ Belföld;" \
11
+ " pumë, i vjetër, vjeç; ئەردوغان ۋە قىرغىزىستان ;" \
12
+ " निम्न में से कौन सा हारडवेयर; ተለዋዋጭ የግድግዳ ; Дзейныя асобы:;" \
13
+ " « અમરેલીનાં મહિલા વિકાસ; 🦙❤❥웃유♋☮✊;" \
14
+ "װיקיװערטערבוך "
15
+ whitespace = "\t \n\n\r "
16
+ control_bytes = b"\x00\x01\x02\x03\x04".decode('utf-8')  # renamed so the builtin `bytes` is not shadowed
17
+
18
+ text += whitespace
19
+
20
+
21
+ def get_unk(tokenizer_config):
22
+ tokenizer = load_tokenizer(tokenizer_config)
23
+ if hasattr(tokenizer, "unk_token"):
24
+ return f"{tokenizer.unk_token}, {tokenizer.unk_token_id}"
25
+ else:
26
+ return "unk_token not found"
27
+
28
+
29
+ # def infer_tokenizer_impl(tokenizer_config):
30
+ def infer_tokenizer_type(tokenizer_config):
31
+ tokenizer = load_tokenizer(tokenizer_config)
32
+ if tokenizer_config.impl == TokenizerImpl.TikToken:
33
+ return "tiktoken"
34
+ if hasattr(tokenizer, "backend_tokenizer"):
35
+ return str(type(tokenizer.backend_tokenizer.model))  # same object as type(tokenizer._tokenizer.model)
36
+ # orion: sp_model.Load(vocab_file), inherits from PreTrainedTokenizer
37
+ elif hasattr(tokenizer, "sp_model"): # 基于 sentencepiece 包
38
+ # for i in range(tokenizer.sp_model.piece_size()):
39
+ # if tokenizer.sp_model.is_byte(i):
40
+ # print("")
41
+ return f"sp_model, byte_num: {sum([tokenizer.sp_model.is_byte(i) for i in range(tokenizer.sp_model.piece_size())])}"
42
+
43
+ # sp.Load(model_path), and it also wraps an image_tokenizer
44
+ elif "glm-" in tokenizer_config.name_or_path:
45
+ return f"byte_num: {sum([tokenizer.sp_tokenizer.text_tokenizer.sp.is_byte(i) for i in range(tokenizer.sp_tokenizer.text_tokenizer.sp.piece_size())])}"
46
+ # sp.Load(model_path), without an image_tokenizer
47
+ elif "glm2-" in tokenizer_config.name_or_path \
48
+ or "glm3-" in tokenizer_config.name_or_path \
49
+ or "CharacterGLM-6B" in tokenizer_config.name_or_path:
50
+ return f"byte_num: {sum([tokenizer.tokenizer.sp_model.is_byte(i) for i in range(tokenizer.tokenizer.sp_model.piece_size())])}"
51
+ elif "abeja/gpt-neox-japanese-2.7b" == tokenizer_config.name_or_path: # 支持 byte-level,解决oov问题
52
+ return f"japanese-bpe: https://github.com/tanreinama/Japanese-BPEEncoder_V2"
53
+ # bert-base-japanese is special in that it sets "word_tokenizer_type": "mecab"; see https://huggingface.co/tohoku-nlp/bert-base-japanese/blob/main/tokenizer_config.json
54
+ elif "bert-base-japanese" in tokenizer_config.name_or_path:
55
+ return "wordpiece.MecabTokenizer, 支持byte-level https://taku910.github.io/mecab/"
56
+ elif "moss" in tokenizer_config.name_or_path:
57
+ return "应该是 sentencepiece.byte_bpe,待确认"
58
+ elif "byt5" in tokenizer_config.name_or_path:
59
+ return "未知,待定"
60
+ else:
61
+ print("catch", tokenizer_config.name_or_path)
62
+ raise "error"
63
+
64
+
65
+
66
+
67
+
68
+ def test_reversible(tokenizer_config):
69
+ """
70
+ Why does xlm-roberta-base produce so few OOVs? Is it because the vocab contains byte pieces?
71
+ :param tokenizer_config:
72
+ :return:
73
+ """
74
+ tokenizer = load_tokenizer(tokenizer_config)
75
+ encoding = tokenizer.encode(text, add_special_tokens=False)
76
+ decoding = tokenizer.decode(encoding)
77
+
78
+ if text in decoding:
79
+ # print(tokenizer_config.name, tokenizer_config.impl, "reversible: true")
80
+ pass
81
+ else:
82
+ unk_count = sum([1 for token_id in encoding if token_id == tokenizer.unk_token_id])
83
+ oov_tokens = []
84
+ # if tokenizer_config.impl == TokenizerImpl.SentencePiece:
85
+ # print(sum([tokenizer.is_byte(i) for i in range(tokenizer.piece_size())]))
86
+
87
+ print("#######"*5)
88
+ print(f"{tokenizer_config.name_or_path}, {infer_tokenizer_type(tokenizer_config)}\n"
89
+ f"reversible: false; unk_token: {get_unk(tokenizer_config)},"
90
+ f" unk_ratio: {unk_count/len(encoding):.4f}; oov: []")
91
+ for i in range(len(text)):
92
+ if text[i] != decoding[i]:
93
+ # print(f"text[{i}] = {str(bytes(text[i:], 'utf-8'))}\n"
94
+ # f"decoding[{i}] = {str(bytes(decoding[i:], 'utf-8'))}")
95
+ print(f"text[{i}] = {json.dumps(text[i:], ensure_ascii=False)}, \n"
96
+ f"decoding[{i}] = {json.dumps(decoding[i:], ensure_ascii=False)}")
97
+
98
+ break
99
+
100
+
101
+
102
+ for config in all_tokenizer_config:
103
+ # if "xlm-roberta-base" in config.name:
105
+ # if "chatglm3-6b" in config.name:
106
+ # if "bert-base-japanese" in config.name:
107
+ # if "moss" in config.name:
108
+ # if "byt5" in config.name:
109
+ if "baichuan" in config.name_or_path:
110
+ # if "CharacterGLM-6B" in config.name:
111
+ # if "fastchat-t5" in config.name: # 报错 pyo3_runtime.PanicException: AddedVocabulary bad split
112
+ # if True:
113
+ # test_unk(config)
114
+ test_reversible(config)
115
+
116
 
117
 
118
 
 
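`infer_tokenizer_type` above distinguishes sentencepiece-based tokenizers partly by counting byte pieces: a model whose vocab contains the 256 byte-fallback pieces can degrade unknown characters to bytes instead of `<unk>`. The same count can be taken directly from a raw `tokenizer.model` file; a minimal sketch, assuming the `sentencepiece` package and a hypothetical local model path:

```python
# Count byte-fallback pieces in a sentencepiece model (sketch; the path is a placeholder).
import sentencepiece as spm

sp = spm.SentencePieceProcessor(model_file="tokenizer.model")
byte_pieces = sum(sp.is_byte(i) for i in range(sp.piece_size()))

# 256 byte pieces => full byte fallback (no <unk> needed);
# 0 byte pieces  => out-of-vocab characters map to <unk>.
print(f"pieces: {sp.piece_size()}, byte pieces: {byte_pieces}")
```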
utils/speed_util.py DELETED
@@ -1,9 +0,0 @@
1
- """
2
- encode speed
3
- decode speed
4
-
5
- ## examples
6
-
7
- qwen's encode is a bit slow
8
-
9
- """
utils/symbol.py DELETED
@@ -1,35 +0,0 @@
1
- """
2
- special_symbols: https://github.com/google/sentencepiece/blob/master/doc/special_symbols.md
3
- emoji:
4
- """
5
-
6
- import sys
7
-
8
-
9
- # from https://github.com/NVIDIA/Megatron-LM/blob/main/megatron/tokenizer/gpt2_tokenization.py#L55
10
- # What does this do?
11
- def bytes_to_unicode():
12
- """
13
- Returns list of utf-8 byte and a corresponding list of unicode strings.
14
- The reversible bpe codes work on unicode strings.
15
- This means you need a large # of unicode characters in your vocab if you want to avoid UNKs.
16
- When you're at something like a 10B token dataset you end up needing around 5K for decent coverage.
17
- This is a significant percentage of your normal, say, 32K bpe vocab.
18
- To avoid that, we want lookup tables between utf-8 bytes and unicode strings.
19
- And avoids mapping to whitespace/control characters the bpe code barfs on.
20
- """
21
- _chr = unichr if sys.version_info[0] == 2 else chr
22
- bs = list(range(ord("!"), ord("~") + 1)) + list(range(ord("¡"), ord("¬") + 1)) + \
23
- list(range(ord("®"), ord("ÿ") + 1))
24
- cs = bs[:]
25
- n = 0
26
- for b in range(2**8):
27
- if b not in bs:
28
- bs.append(b)
29
- cs.append(2**8 + n)
30
- n += 1
31
- cs = [_chr(n) for n in cs]
32
- return dict(zip(bs, cs))
33
-
34
- aa = bytes_to_unicode()
35
- print(aa)
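The deleted helper above is the GPT-2 byte-to-unicode table: it maps each of the 256 byte values to a printable Unicode character, so a byte-level BPE can treat arbitrary bytes as "text" without ever emitting whitespace or control characters as tokens. The mapping is a bijection, so it round-trips; a small sketch, reusing `bytes_to_unicode` as defined above:

```python
# Round trip through the GPT-2 byte<->unicode table (sketch; reuses bytes_to_unicode above).
byte_to_unicode = bytes_to_unicode()
unicode_to_byte = {u: b for b, u in byte_to_unicode.items()}

raw = "héllo 世界".encode("utf-8")                     # arbitrary bytes
visible = "".join(byte_to_unicode[b] for b in raw)     # every byte becomes a printable char
restored = bytes(unicode_to_byte[u] for u in visible).decode("utf-8")
assert restored == "héllo 世界"
```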
utils/text_util.py CHANGED
@@ -1,12 +1,23 @@
 
 
 
 
 
 
 
 
1
 
2
  def is_digit_char(uchar):
3
  return uchar in "0123456789"
4
 
5
 
6
- def has_digit(text):
7
  return any(is_digit_char(ch) for ch in text)
8
 
9
 
 
 
 
10
  def is_all_digit(text):
11
  return all(is_digit_char(char) for char in text)
12
 
 
1
+ """
2
+ character-level text utilities
3
+ """
4
+
5
+
6
+ def detect_lang_from_unicode():
7
+ pass  # TODO: detect language from the Unicode ranges of the characters
8
+
9
 
10
  def is_digit_char(uchar):
11
  return uchar in "0123456789"
12
 
13
 
14
+ def contains_digit(text):
15
  return any(is_digit_char(ch) for ch in text)
16
 
17
 
18
+ def get_digit_count(text):
19
+ return sum(1 for ch in text if is_digit_char(ch))
20
+
21
  def is_all_digit(text):
22
  return all(is_digit_char(char) for char in text)
23
 
utils/vocab.jd.txt.v2 DELETED
@@ -1,10268 +0,0 @@
1
- [PAD]
2
- [unused1]
3
- [unused2]
4
- [unused3]
5
- [unused4]
6
- [unused5]
7
- [unused6]
8
- [unused7]
9
- [unused8]
10
- [unused9]
11
- [unused10]
12
- [unused11]
13
- [unused12]
14
- [unused13]
15
- [unused14]
16
- [unused15]
17
- [unused16]
18
- [unused17]
19
- [unused18]
20
- [unused19]
21
- [unused20]
22
- [unused21]
23
- [unused22]
24
- [unused23]
25
- [unused24]
26
- [unused25]
27
- [unused26]
28
- [unused27]
29
- [unused28]
30
- [unused29]
31
- [unused30]
32
- [unused31]
33
- [unused32]
34
- [unused33]
35
- [unused34]
36
- [unused35]
37
- [unused36]
38
- [unused37]
39
- [unused38]
40
- [unused39]
41
- [unused40]
42
- [unused41]
43
- [unused42]
44
- [unused43]
45
- [unused44]
46
- [unused45]
47
- [unused46]
48
- [unused47]
49
- [unused48]
50
- [unused49]
51
- [unused50]
52
- [unused51]
53
- [unused52]
54
- [unused53]
55
- [unused54]
56
- [unused55]
57
- [unused56]
58
- [unused57]
59
- [unused58]
60
- [unused59]
61
- [unused60]
62
- [unused61]
63
- [unused62]
64
- [unused63]
65
- [unused64]
66
- [unused65]
67
- [unused66]
68
- [unused67]
69
- [unused68]
70
- [unused69]
71
- [unused70]
72
- [unused71]
73
- [unused72]
74
- [unused73]
75
- [unused74]
76
- [unused75]
77
- [unused76]
78
- [unused77]
79
- [unused78]
80
- [unused79]
81
- [unused80]
82
- [unused81]
83
- [unused82]
84
- [unused83]
85
- [unused84]
86
- [unused85]
87
- [unused86]
88
- [unused87]
89
- [unused88]
90
- [unused89]
91
- [unused90]
92
- [unused91]
93
- [unused92]
94
- [unused93]
95
- [unused94]
96
- [unused95]
97
- [unused96]
98
- [unused97]
99
- [unused98]
100
- [unused99]
101
- [UNK]
102
- [CLS]
103
- [SEP]
104
- [MASK]
105
- <S>
106
- <T>
107
- !
108
- "
109
-
110
-
111
-
112
-
113
-
114
-
115
-
116
- #
117
- $
118
- %
119
- &
120
- '
121
- (
122
- )
123
- *
124
- +
125
- ,
126
- -
127
- .
128
- /
129
- 0
130
- 1
131
- 2
132
- 3
133
- 4
134
- 5
135
- 6
136
- 7
137
- 8
138
- 9
139
- 10
140
- 11
141
- 12
142
- 13
143
- 14
144
- 15
145
- 16
146
- 17
147
- 18
148
- 19
149
- 20
150
- 21
151
- 22
152
- 23
153
- 24
154
- 25
155
- 26
156
- 27
157
- 28
158
- 29
159
- 30
160
- 31
161
- 32
162
- 33
163
- 34
164
- 35
165
- 36
166
- 37
167
- 38
168
- 39
169
- 40
170
- 41
171
- 42
172
- 43
173
- 44
174
- 45
175
- 46
176
- 47
177
- 48
178
- 49
179
- 50
180
- 51
181
- 52
182
- 53
183
- 54
184
- 55
185
- 56
186
- 57
187
- 58
188
- 59
189
- 60
190
- 61
191
- 62
192
- 63
193
- 64
194
- 65
195
- 66
196
- 67
197
- 68
198
- 69
199
- 70
200
- 71
201
- 72
202
- 73
203
- 74
204
- 75
205
- 76
206
- 77
207
- 78
208
- 79
209
- 80
210
- 81
211
- 82
212
- 83
213
- 84
214
- 85
215
- 86
216
- 87
217
- 88
218
- 89
219
- 90
220
- 91
221
- 92
222
- 93
223
- 94
224
- 95
225
- 96
226
- 97
227
- 98
228
- 99
229
- 100
230
- 120
231
- 128
232
- 180
233
- 200
234
- 256
235
- 304
236
- 360
237
- 500
238
- 512
239
- 1000
240
- 1080
241
- 2000
242
- 2014
243
- 2015
244
- 2016
245
- 2017
246
- 2018
247
- 2019
248
- 2020
249
- 2021
250
- 2022
251
- :
252
- ;
253
- <
254
- =
255
- >
256
- ?
257
- @
258
- [
259
- \
260
- ]
261
- ^
262
- _
263
- a
264
- b
265
- c
266
- d
267
- e
268
- f
269
- g
270
- h
271
- i
272
- j
273
- k
274
- l
275
- m
276
- n
277
- o
278
- p
279
- q
280
- r
281
- s
282
- t
283
- u
284
- v
285
- w
286
- x
287
- y
288
- z
289
- {
290
- |
291
- }
292
- ~
293
- £
294
- ¤
295
- ¥
296
- §
297
- «
298
- °
299
- ±
300
- ²
301
- ³
302
- µ
303
- ·
304
- ¹
305
- º
306
- »
307
- ¼
308
- ×
309
- ß
310
- æ
311
- ÷
312
- ø
313
- đ
314
- ŋ
315
- ɔ
316
- ə
317
- ɡ
318
- ʰ
319
- ˇ
320
- ˈ
321
- ˊ
322
- ˋ
323
- ˍ
324
- ː
325
- ˙
326
- ˚
327
- ˢ
328
- α
329
- β
330
- γ
331
- δ
332
- ε
333
- η
334
- θ
335
- ι
336
- κ
337
- λ
338
- μ
339
- ν
340
- ο
341
- π
342
- ρ
343
- ς
344
- σ
345
- τ
346
- υ
347
- φ
348
- χ
349
- ψ
350
- ω
351
- а
352
- б
353
- в
354
- г
355
- д
356
- е
357
- ж
358
- з
359
- и
360
- к
361
- л
362
- м
363
- н
364
- о
365
- п
366
- р
367
- с
368
- т
369
- у
370
- ф
371
- х
372
- ц
373
- ч
374
- ш
375
- ы
376
- ь
377
- я
378
- і
379
-
380
-
381
-
382
-
383
-
384
-
385
-
386
-
387
-
388
-
389
-
390
-
391
-
392
-
393
-
394
-
395
-
396
-
397
-
398
-
399
-
400
-
401
-
402
-
403
-
404
-
405
-
406
-
407
-
408
-
409
-
410
-
411
-
412
-
413
-
414
-
415
-
416
-
417
-
418
-
419
-
420
-
421
-
422
-
423
-
424
-
425
-
426
-
427
-
428
-
429
-
430
-
431
-
432
-
433
-
434
-
435
-
436
-
437
-
438
-
439
-
440
-
441
-
442
-
443
-
444
-
445
-
446
-
447
-
448
-
449
-
450
-
451
-
452
-
453
-
454
-
455
-
456
-
457
-
458
-
459
-
460
-
461
-
462
-
463
-
464
-
465
-
466
-
467
-
468
-
469
-
470
-
471
-
472
-
473
-
474
-
475
-
476
-
477
-
478
-
479
-
480
-
481
-
482
-
483
-
484
-
485
-
486
-
487
-
488
-
489
-
490
-
491
-
492
-
493
-
494
-
495
-
496
-
497
-
498
-
499
-
500
-
501
-
502
-
503
-
504
-
505
-
506
-
507
-
508
-
509
-
510
-
511
-
512
-
513
-
514
-
515
-
516
-
517
-
518
-
519
-
520
-
521
-
522
-
523
-
524
-
525
-
526
-
527
-
528
-
529
-
530
-
531
-
532
-
533
-
534
-
535
-
536
-
537
-
538
-
539
-
540
-
541
-
542
-
543
-
544
-
545
-
546
-
547
-
548
- ⦿
549
-
550
-
551
-
552
-
553
-
554
-
555
-
556
-
557
-
558
-
559
-
560
-
561
-
562
-
563
-
564
-
565
-
566
-
567
-
568
-
569
-
570
-
571
-
572
-
573
-
574
-
575
-
576
-
577
-
578
-
579
-
580
-
581
-
582
-
583
-
584
-
585
-
586
-
587
-
588
-
589
-
590
-
591
-
592
-
593
-
594
-
595
-
596
-
597
-
598
-
599
-
600
-
601
-
602
-
603
-
604
-
605
-
606
-
607
-
608
-
609
-
610
-
611
-
612
-
613
-
614
-
615
-
616
-
617
-
618
-
619
-
620
-
621
- 丿
622
-
623
-
624
-
625
-
626
-
627
-
628
-
629
-
630
-
631
-
632
-
633
-
634
-
635
-
636
-
637
-
638
-
639
-
640
-
641
-
642
-
643
-
644
-
645
-
646
-
647
-
648
-
649
-
650
-
651
-
652
-
653
-
654
-
655
-
656
-
657
-
658
-
659
-
660
-
661
-
662
-
663
-
664
-
665
-
666
-
667
-
668
-
669
-
670
-
671
-
672
-
673
-
674
-
675
-
676
-
677
-
678
-
679
-
680
-
681
-
682
-
683
-
684
- 亿
685
-
686
-
687
-
688
-
689
-
690
-
691
-
692
-
693
-
694
-
695
-
696
-
697
-
698
-
699
-
700
-
701
-
702
-
703
-
704
-
705
-
706
-
707
-
708
-
709
-
710
-
711
-
712
-
713
-
714
-
715
-
716
-
717
-
718
-
719
-
720
-
721
- 仿
722
-
723
-
724
-
725
-
726
-
727
-
728
-
729
-
730
-
731
-
732
-
733
-
734
-
735
-
736
-
737
-
738
-
739
-
740
-
741
-
742
-
743
-
744
-
745
-
746
-
747
-
748
-
749
-
750
-
751
-
752
-
753
-
754
-
755
-
756
-
757
-
758
-
759
-
760
-
761
-
762
-
763
-
764
-
765
-
766
-
767
-
768
-
769
-
770
-
771
-
772
-
773
-
774
-
775
-
776
-
777
-
778
-
779
-
780
-
781
-
782
-
783
-
784
- 使
785
-
786
-
787
-
788
-
789
-
790
-
791
-
792
-
793
-
794
-
795
-
796
-
797
-
798
-
799
-
800
-
801
-
802
-
803
-
804
-
805
-
806
-
807
- 便
808
-
809
-
810
-
811
-
812
-
813
-
814
-
815
-
816
-
817
-
818
-
819
-
820
-
821
-
822
-
823
-
824
-
825
-
826
-
827
-
828
-
829
-
830
-
831
-
832
-
833
-
834
-
835
-
836
-
837
-
838
-
839
-
840
-
841
-
842
-
843
-
844
-
845
-
846
-
847
-
848
-
849
-
850
-
851
-
852
-
853
-
854
-
855
-
856
-
857
-
858
-
859
-
860
-
861
-
862
-
863
-
864
-
865
-
866
-
867
-
868
-
869
-
870
-
871
-
872
-
873
-
874
-
875
-
876
-
877
-
878
-
879
-
880
-
881
-
882
-
883
-
884
-
885
-
886
-
887
-
888
-
889
-
890
-
891
-
892
-
893
-
894
-
895
-
896
-
897
-
898
-
899
-
900
-
901
-
902
-
903
-
904
-
905
-
906
-
907
-
908
-
909
-
910
-
911
-
912
-
913
-
914
-
915
-
916
-
917
-
918
-
919
-
920
-
921
-
922
-
923
-
924
-
925
-
926
-
927
-
928
-
929
-
930
-
931
-
932
-
933
-
934
-
935
-
936
-
937
-
938
-
939
-
940
-
941
- ��
942
-
943
-
944
-
945
-
946
-
947
-
948
-
949
-
950
-
951
-
952
-
953
-
954
-
955
-
956
-
957
-
958
-
959
-
960
-
961
-
962
-
963
-
964
-
965
-
966
-
967
-
968
-
969
-
970
-
971
-
972
-
973
-
974
-
975
-
976
-
977
-
978
-
979
-
980
-
981
-
982
-
983
-
984
-
985
-
986
-
987
-
988
-
989
-
990
-
991
-
992
-
993
-
994
-
995
-
996
-
997
-
998
-
999
-
1000
-
1001
-
1002
-
1003
-
1004
-
1005
-
1006
-
1007
-
1008
-
1009
-
1010
-
1011
-
1012
-
1013
-
1014
-
1015
-
1016
-
1017
-
1018
-
1019
-
1020
-
1021
-
1022
-
1023
-
1024
-
1025
-
1026
-
1027
-
1028
-
1029
-
1030
-
1031
-
1032
-
1033
-
1034
-
1035
-
1036
-
1037
-
1038
-
1039
-
1040
-
1041
-
1042
-
1043
-
1044
-
1045
-
1046
-
1047
-
1048
-
1049
-
1050
-
1051
-
1052
-
1053
-
1054
-
1055
-
1056
-
1057
-
1058
-
1059
-
1060
-
1061
-
1062
-
1063
-
1064
-
1065
-
1066
-
1067
-
1068
-
1069
-
1070
-
1071
-
1072
-
1073
-
1074
-
1075
-
1076
-
1077
-
1078
-
1079
-
1080
-
1081
-
1082
-
1083
-
1084
-
1085
-
1086
-
1087
-
1088
-
1089
-
1090
-
1091
-
1092
-
1093
-
1094
-
1095
-
1096
-
1097
-
1098
-
1099
-
1100
-
1101
-
1102
-
1103
-
1104
-
1105
-
1106
-
1107
-
1108
-
1109
-
1110
-
1111
-
1112
-
1113
-
1114
-
1115
-
1116
-
1117
-
1118
-
1119
-
1120
-
1121
-
1122
-
1123
-
1124
-
1125
-
1126
-
1127
-
1128
-
1129
-
1130
-
1131
-
1132
-
1133
-
1134
-
1135
-
1136
-
1137
-
1138
-
1139
-
1140
-
1141
-
1142
-
1143
-
1144
-
1145
-
1146
-
1147
-
1148
-
1149
-
1150
-
1151
-
1152
-
1153
-
1154
-
1155
-
1156
-
1157
-
1158
-
1159
-
1160
-
1161
-
1162
-
1163
-
1164
-
1165
-
1166
-
1167
-
1168
-
1169
-
1170
-
1171
-
1172
-
1173
-
1174
-
1175
-
1176
-
1177
-
1178
-
1179
-
1180
-
1181
-
1182
-
1183
-
1184
-
1185
-
1186
-
1187
-
1188
-
1189
-
1190
-
1191
-
1192
-
1193
-
1194
-
1195
-
1196
-
1197
-
1198
-
1199
-
1200
-
1201
-
1202
-
1203
-
1204
-
1205
-
1206
-
1207
-
1208
-
1209
-
1210
-
1211
-
1212
-
1213
-
1214
-
1215
-
1216
-
1217
-
1218
-
1219
-
1220
-
1221
-
1222
-
1223
-
1224
-
1225
-
1226
-
1227
-
1228
-
1229
-
1230
-
1231
-
1232
-
1233
-
1234
-
1235
-
1236
-
1237
-
1238
-
1239
-
1240
-
1241
-
1242
-
1243
-
1244
-
1245
-
1246
-
1247
-
1248
-
1249
-
1250
-
1251
-
1252
-
1253
-
1254
-
1255
-
1256
-
1257
-
1258
-
1259
-
1260
-
1261
-
1262
-
1263
-
1264
-
1265
-
1266
-
1267
-
1268
-
1269
-
1270
-
1271
-
1272
-
1273
-
1274
-
1275
-
1276
-
1277
-
1278
-
1279
-
1280
-
1281
-
1282
-
1283
-
1284
-
1285
-
1286
-
1287
-
1288
-
1289
-
1290
-
1291
-
1292
-
1293
-
1294
-
1295
-
1296
-
1297
-
1298
-
1299
-
1300
-
1301
-
1302
-
1303
-
1304
-
1305
-
1306
-
1307
-
1308
-
1309
-
1310
-
1311
-
1312
-
1313
-
1314
-
1315
-
1316
-
1317
-
1318
-
1319
-
1320
-
1321
-
1322
-
1323
-
1324
-
1325
-
1326
-
1327
-
1328
-
1329
-
1330
-
1331
-
1332
-
1333
-
1334
-
1335
-
1336
-
1337
-
1338
-
1339
-
1340
-
1341
-
1342
-
1343
-
1344
-
1345
-
1346
-
1347
-
1348
-
1349
-
1350
-
1351
-
1352
-
1353
-
1354
-
1355
-
1356
-
1357
-
1358
-
1359
-
1360
-
1361
-
1362
-
1363
-
1364
-
1365
-
1366
-
1367
-
1368
-
1369
-
1370
-
1371
-
1372
-
1373
-
1374
-
1375
-
1376
-
1377
-
1378
-
1379
-
1380
-
1381
-
1382
-
1383
-
1384
-
1385
-
1386
-
1387
-
1388
-
1389
-
1390
-
1391
-
1392
-
1393
-
1394
-
1395
-
1396
-
1397
-
1398
-
1399
-
1400
-
1401
-
1402
-
1403
-
1404
-
1405
-
1406
-
1407
-
1408
-
1409
-
1410
-
1411
-
1412
-
1413
-
1414
-
1415
-
1416
-
1417
-
1418
-
1419
-
1420
-
1421
-
1422
-
1423
-
1424
-
1425
-
1426
-
1427
-
1428
-
1429
-
1430
-
1431
-
1432
-
1433
-
1434
-
1435
-
1436
-
1437
-
1438
-
1439
-
1440
-
1441
-
1442
-
1443
-
1444
-
1445
-
1446
-
1447
-
1448
-
1449
-
1450
-
1451
-
1452
-
1453
-
1454
-
1455
-
1456
-
1457
-
1458
-
1459
-
1460
-
1461
-
1462
-
1463
-
1464
-
1465
-
1466
-
1467
-
1468
-
1469
-
1470
-
1471
-
1472
-
1473
-
1474
-
1475
-
1476
-
1477
-
1478
-
1479
-
1480
-
1481
-
1482
-
1483
-
1484
-
1485
-
1486
-
1487
-
1488
-
1489
-
1490
-
1491
-
1492
-
1493
-
1494
-
1495
-
1496
-
1497
-
1498
-
1499
-
1500
-
1501
-
1502
-
1503
-
1504
-
1505
-
1506
-
1507
-
1508
-
1509
-
1510
-
1511
-
1512
-
1513
-
1514
-
1515
-
1516
-
1517
-
1518
-
1519
-
1520
-
1521
-
1522
-
1523
-
1524
-
1525
-
1526
-
1527
-
1528
-
1529
-
1530
-
1531
-
1532
-
1533
-
1534
-
1535
-
1536
-
1537
-
1538
-
1539
-
1540
-
1541
-
1542
-
1543
-
1544
-
1545
-
1546
-
1547
-
1548
-
1549
-
1550
-
1551
-
1552
-
1553
-
1554
-
1555
-
1556
-
1557
-
1558
-
1559
-
1560
-
1561
-
1562
-
1563
-
1564
-
1565
-
1566
-
1567
-
1568
-
1569
-
1570
-
1571
-
1572
-
1573
-
1574
-
1575
-
1576
-
1577
-
1578
-
1579
-
1580
-
1581
-
1582
-
1583
-
1584
-
1585
-
1586
-
1587
-
1588
-
1589
-
1590
-
1591
-
1592
-
1593
-
1594
-
1595
-
1596
-
1597
-
1598
-
1599
-
1600
-
1601
-
1602
-
1603
-
1604
-
1605
-
1606
-
1607
-
1608
-
1609
-
1610
-
1611
-
1612
-
1613
-
1614
-
1615
-
1616
-
1617
-
1618
-
1619
-
1620
-
1621
-
1622
-
1623
-
1624
-
1625
-
1626
-
1627
-
1628
-
1629
-
1630
-
1631
-
1632
-
1633
-
1634
-
1635
-
1636
-
1637
-
1638
-
1639
-
1640
-
1641
-
1642
-
1643
-
1644
-
1645
-
1646
-
1647
-
1648
-
1649
-
1650
-
1651
-
1652
-
1653
-
1654
-
1655
-
1656
-
1657
-
1658
-
1659
-
1660
-
1661
-
1662
-
1663
-
1664
-
1665
-
1666
-
1667
-
1668
-
1669
-
1670
-
1671
-
1672
-
1673
-
1674
-
1675
-
1676
-
1677
-
1678
-
1679
-
1680
-
1681
-
1682
-
1683
-
1684
-
1685
-
1686
-
1687
-
1688
-
1689
-
1690
-
1691
-
1692
-
1693
-
1694
-
1695
-
1696
-
1697
-
1698
-
1699
-
1700
-
1701
-
1702
-
1703
-
1704
-
1705
-
1706
-
1707
-
1708
-
1709
-
1710
-
1711
-
1712
-
1713
-
1714
-
1715
-
1716
-
1717
-
1718
-
1719
-
1720
-
1721
-
1722
-
1723
-
1724
-
1725
-
1726
-
1727
-
1728
-
1729
-
1730
-
1731
-
1732
-
1733
-
1734
-
1735
-
1736
- 姿
1737
-
1738
-
1739
-
1740
-
1741
-
1742
-
1743
-
1744
-
1745
-
1746
-
1747
-
1748
-
1749
-
1750
-
1751
-
1752
-
1753
-
1754
-
1755
-
1756
-
1757
-
1758
-
1759
-
1760
-
1761
-
1762
-
1763
-
1764
-
1765
-
1766
-
1767
-
1768
-
1769
-
1770
-
1771
-
1772
- 婿
1773
-
1774
-
1775
-
1776
-
1777
-
1778
-
1779
-
1780
-
1781
-
1782
-
1783
-
1784
-
1785
-
1786
-
1787
-
1788
-
1789
-
1790
-
1791
-
1792
-
1793
-
1794
-
1795
-
1796
-
1797
-
1798
-
1799
-
1800
-
1801
- 嬿
1802
-
1803
-
1804
-
1805
-
1806
-
1807
-
1808
-
1809
-
1810
-
1811
-
1812
-
1813
-
1814
-
1815
-
1816
-
1817
-
1818
-
1819
-
1820
-
1821
-
1822
-
1823
-
1824
-
1825
-
1826
-
1827
-
1828
-
1829
-
1830
-
1831
-
1832
-
1833
-
1834
-
1835
-
1836
-
1837
-
1838
-
1839
-
1840
-
1841
-
1842
-
1843
-
1844
-
1845
-
1846
-
1847
-
1848
-
1849
-
1850
-
1851
-
1852
-
1853
-
1854
-
1855
-
1856
-
1857
-
1858
-
1859
-
1860
-
1861
-
1862
-
1863
-
1864
-
1865
-
1866
-
1867
-
1868
- 宿
1869
-
1870
-
1871
-
1872
-
1873
-
1874
-
1875
-
1876
-
1877
-
1878
-
1879
-
1880
-
1881
-
1882
-
1883
-
1884
-
1885
-
1886
-
1887
-
1888
-
1889
-
1890
-
1891
-
1892
-
1893
- 寿
1894
-
1895
-
1896
-
1897
-
1898
-
1899
-
1900
-
1901
-
1902
-
1903
-
1904
-
1905
-
1906
-
1907
-
1908
-
1909
-
1910
-
1911
-
1912
-
1913
-
1914
-
1915
-
1916
-
1917
-
1918
-
1919
-
1920
- 尿
1921
-
1922
-
1923
-
1924
-
1925
-
1926
-
1927
-
1928
-
1929
-
1930
-
1931
-
1932
-
1933
-
1934
-
1935
-
1936
-
1937
-
1938
-
1939
-
1940
-
1941
-
1942
-
1943
- 屿
1944
-
1945
-
1946
-
1947
-
1948
-
1949
-
1950
-
1951
-
1952
-
1953
-
1954
-
1955
-
1956
-
1957
-
1958
-
1959
-
1960
-
1961
-
1962
-
1963
-
1964
-
1965
- ��
1966
-
1967
-
1968
-
1969
-
1970
-
1971
-
1972
-
1973
-
1974
-
1975
-
1976
-
1977
-
1978
-
1979
-
1980
-
1981
-
1982
-
1983
-
1984
-
1985
-
1986
-
1987
-
1988
-
1989
-
1990
-
1991
-
1992
-
1993
-
1994
-
1995
-
1996
-
1997
-
1998
-
1999
-
2000
-
2001
-
2002
-
2003
-
2004
-
2005
-
2006
-
2007
-
2008
-
2009
-
2010
-
2011
-
2012
-
2013
-
2014
-
2015
-
2016
-
2017
-
2018
-
2019
-
2020
-
2021
-
2022
-
2023
-
2024
-
2025
-
2026
-
2027
- 巿
2028
-
2029
-
2030
-
2031
-
2032
-
2033
-
2034
-
2035
-
2036
-
2037
-
2038
-
2039
-
2040
-
2041
-
2042
-
2043
-
2044
-
2045
-
2046
-
2047
-
2048
-
2049
-
2050
-
2051
-
2052
-
2053
-
2054
-
2055
-
2056
-
2057
-
2058
-
2059
-
2060
-
2061
-
2062
-
2063
-
2064
-
2065
-
2066
-
2067
-
2068
-
2069
-
2070
- 广
2071
-
2072
-
2073
-
2074
-
2075
-
2076
-
2077
-
2078
-
2079
-
2080
-
2081
-
2082
-
2083
-
2084
-
2085
-
2086
-
2087
-
2088
-
2089
-
2090
-
2091
-
2092
-
2093
-
2094
-
2095
-
2096
-
2097
-
2098
-
2099
-
2100
-
2101
-
2102
-
2103
-
2104
-
2105
-
2106
-
2107
- 廿
2108
-
2109
-
2110
-
2111
-
2112
-
2113
-
2114
-
2115
-
2116
-
2117
-
2118
-
2119
-
2120
-
2121
-
2122
-
2123
-
2124
-
2125
-
2126
-
2127
-
2128
-
2129
-
2130
-
2131
-
2132
-
2133
-
2134
-
2135
-
2136
-
2137
-
2138
-
2139
-
2140
-
2141
-
2142
-
2143
-
2144
-
2145
-
2146
-
2147
-
2148
-
2149
-
2150
-
2151
-
2152
-
2153
-
2154
-
2155
-
2156
-
2157
-
2158
- 彿
2159
-
2160
-
2161
-
2162
-
2163
-
2164
-
2165
-
2166
-
2167
-
2168
-
2169
-
2170
-
2171
-
2172
-
2173
-
2174
-
2175
-
2176
-
2177
-
2178
-
2179
-
2180
-
2181
-
2182
-
2183
-
2184
-
2185
-
2186
-
2187
-
2188
-
2189
-
2190
-
2191
-
2192
-
2193
-
2194
-
2195
-
2196
-
2197
-
2198
-
2199
-
2200
-
2201
-
2202
-
2203
-
2204
-
2205
-
2206
-
2207
-
2208
-
2209
-
2210
- 忿
2211
- 怀
2212
-
2213
-
2214
-
2215
-
2216
-
2217
-
2218
-
2219
-
2220
-
2221
-
2222
-
2223
-
2224
-
2225
-
2226
-
2227
-
2228
-
2229
-
2230
-
2231
-
2232
-
2233
-
2234
-
2235
-
2236
-
2237
-
2238
-
2239
-
2240
-
2241
-
2242
-
2243
-
2244
-
2245
-
2246
-
2247
-
2248
-
2249
-
2250
-
2251
-
2252
-
2253
-
2254
-
2255
-
2256
-
2257
-
2258
-
2259
-
2260
-
2261
-
2262
-
2263
-
2264
-
2265
-
2266
-
2267
-
2268
-
2269
-
2270
-
2271
-
2272
-
2273
-
2274
-
2275
-
2276
-
2277
-
2278
-
2279
-
2280
-
2281
-
2282
-
2283
-
2284
-
2285
-
2286
-
2287
-
2288
-
2289
-
2290
-
2291
-
2292
-
2293
-
2294
-
2295
-
2296
-
2297
-
2298
-
2299
-
2300
-
2301
-
2302
-
2303
-
2304
-
2305
-
2306
-
2307
-
2308
-
2309
-
2310
-
2311
-
2312
-
2313
-
2314
-
2315
-
2316
-
2317
-
2318
-
2319
-
2320
-
2321
-
2322
-
2323
-
2324
-
2325
-
2326
-
2327
-
2328
-
2329
-
2330
-
2331
-
2332
-
2333
-
2334
-
2335
-
2336
-
2337
-
2338
-
2339
-
2340
-
2341
-
2342
-
2343
-
2344
-
2345
-
2346
-
2347
-
2348
-
2349
-
2350
-
2351
-
2352
-
2353
-
2354
-
2355
-
2356
-
2357
-
2358
-
2359
-
2360
-
2361
-
2362
-
2363
-
2364
-
2365
-
2366
-
2367
-
2368
-
2369
-
2370
-
2371
-
2372
-
2373
-
2374
-
2375
-
2376
-
2377
-
2378
-
2379
-
2380
-
2381
-
2382
-
2383
-
2384
-
2385
-
2386
-
2387
-
2388
-
2389
-
2390
-
2391
-
2392
-
2393
-
2394
-
2395
-
2396
-
2397
-
2398
-
2399
-
2400
-
2401
-
2402
-
2403
-
2404
-
2405
-
2406
-
2407
-
2408
-
2409
-
2410
-
2411
-
2412
-
2413
-
2414
-
2415
-
2416
-
2417
-
2418
-
2419
-
2420
-
2421
-
2422
-
2423
-
2424
-
2425
-
2426
-
2427
-
2428
-
2429
-
2430
-
2431
-
2432
-
2433
-
2434
-
2435
-
2436
-
2437
-
2438
-
2439
-
2440
-
2441
-
2442
-
2443
-
2444
-
2445
-
2446
-
2447
-
2448
-
2449
-
2450
-
2451
-
2452
-
2453
-
2454
-
2455
-
2456
-
2457
-
2458
-
2459
-
2460
-
2461
-
2462
-
2463
-
2464
-
2465
-
2466
-
2467
-
2468
-
2469
-
2470
-
2471
-
2472
-
2473
-
2474
-
2475
-
2476
-
2477
-
2478
-
2479
-
2480
-
2481
-
2482
-
2483
-
2484
-
2485
-
2486
-
2487
-
2488
-
2489
-
2490
-
2491
-
2492
-
2493
-
2494
-
2495
-
2496
-
2497
-
2498
-
2499
-
2500
-
2501
-
2502
-
2503
-
2504
-
2505
-
2506
-
2507
-
2508
-
2509
-
2510
-
2511
-
2512
-
2513
-
2514
-
2515
-
2516
-
2517
-
2518
-
2519
-
2520
-
2521
-
2522
-
2523
-
2524
-
2525
-
2526
-
2527
-
2528
-
2529
-
2530
-
2531
-
2532
-
2533
-
2534
-
2535
-
2536
-
2537
-
2538
-
2539
-
2540
-
2541
-
2542
-
2543
-
2544
-
2545
-
2546
-
2547
-
2548
-
2549
-
2550
-
2551
-
2552
-
2553
-
2554
-
2555
-
2556
-
2557
-
2558
-
2559
-
2560
-
2561
-
2562
-
2563
-
2564
-
2565
-
2566
-
2567
-
2568
-
2569
-
2570
-
2571
-
2572
-
2573
-
2574
-
2575
-
2576
-
2577
-
2578
-
2579
-
2580
-
2581
-
2582
-
2583
-
2584
-
2585
-
2586
-
2587
-
2588
-
2589
-
2590
-
2591
-
2592
-
2593
-
2594
-
2595
-
2596
-
2597
-
2598
-
2599
-
2600
-
2601
-
2602
-
2603
-
2604
-
2605
-
2606
-
2607
-
2608
-
2609
-
2610
-
2611
-
2612
-
2613
-
2614
-
2615
-
2616
-
2617
-
2618
-
2619
-
2620
-
2621
-
2622
-
2623
-
2624
-
2625
-
2626
-
2627
-
2628
-
2629
-
2630
-
2631
-
2632
-
2633
-
2634
-
2635
-
2636
-
2637
-
2638
-
2639
-
2640
-
2641
-
2642
-
2643
-
2644
-
2645
-
2646
-
2647
-
2648
-
2649
-
2650
-
2651
-
2652
-
2653
-
2654
-
2655
-
2656
-
2657
-
2658
-
2659
-
2660
-
2661
-
2662
-
2663
-
2664
-
2665
-
2666
-
2667
-
2668
-
2669
-
2670
-
2671
-
2672
-
2673
-
2674
-
2675
-
2676
-
2677
-
2678
-
2679
-
2680
-
2681
-
2682
-
2683
-
2684
-
2685
-
2686
-
2687
-
2688
-
2689
-
2690
-
2691
-
2692
-
2693
-
2694
-
2695
-
2696
-
2697
-
2698
-
2699
-
2700
-
2701
-
2702
-
2703
-
2704
-
2705
-
2706
-
2707
-
2708
-
2709
-
2710
-
2711
-
2712
-
2713
-
2714
-
2715
-
2716
-
2717
-
2718
-
2719
-
2720
-
2721
-
2722
-
2723
-
2724
-
2725
-
2726
-
2727
-
2728
-
2729
-
2730
-
2731
-
2732
-
2733
-
2734
-
2735
-
2736
-
2737
-
2738
-
2739
-
2740
-
2741
-
2742
-
2743
-
2744
-
2745
-
2746
-
2747
-
2748
-
2749
-
2750
-
2751
-
2752
-
2753
-
2754
-
2755
-
2756
-
2757
-
2758
-
2759
-
2760
-
2761
-
2762
-
2763
-
2764
-
2765
-
2766
-
2767
-
2768
-
2769
-
2770
-
2771
-
2772
-
2773
-
2774
-
2775
-
2776
-
2777
-
2778
-
2779
-
2780
-
2781
-
2782
-
2783
-
2784
-
2785
-
2786
-
2787
-
2788
-
2789
-
2790
-
2791
-
2792
-
2793
-
2794
-
2795
-
2796
-
2797
-
2798
-
2799
-
2800
-
2801
-
2802
-
2803
-
2804
-
2805
-
2806
-
2807
-
2808
-
2809
-
2810
-
2811
-
2812
-
2813
-
2814
-
2815
-
2816
-
2817
-
2818
-
2819
-
2820
-
2821
-
2822
-
2823
-
2824
-
2825
-
2826
-
2827
-
2828
-
2829
-
2830
-
2831
-
2832
-
2833
-
2834
-
2835
-
2836
-
2837
-
2838
-
2839
-
2840
-
2841
-
2842
-
2843
-
2844
-
2845
-
2846
-
2847
-
2848
-
2849
-
2850
-
2851
-
2852
-
2853
-
2854
-
2855
-
2856
-
2857
-
2858
-
2859
-
2860
-
2861
-
2862
-
2863
-
2864
-
2865
-
2866
-
2867
-
2868
-
2869
-
2870
-
2871
-
2872
-
2873
-
2874
-
2875
-
2876
-
2877
-
2878
-
2879
-
2880
-
2881
-
2882
-
2883
-
2884
-
2885
-
2886
-
2887
-
2888
-
2889
-
2890
-
2891
-
2892
-
2893
-
2894
-
2895
-
2896
-
2897
-
2898
-
2899
-
2900
-
2901
-
2902
-
2903
-
2904
-
2905
-
2906
-
2907
-
2908
-
2909
-
2910
-
2911
-
2912
-
2913
-
2914
-
2915
-
2916
-
2917
-
2918
-
2919
-
2920
-
2921
-
2922
-
2923
-
2924
-
2925
-
2926
-
2927
-
2928
-
2929
-
2930
-
2931
-
2932
-
2933
-
2934
-
2935
-
2936
-
2937
-
2938
-
2939
-
2940
-
2941
-
2942
-
2943
-
2944
-
2945
-
2946
-
2947
-
2948
-
2949
-
2950
-
2951
-
2952
-
2953
-
2954
-
2955
-
2956
-
2957
-
2958
-
2959
-
2960
-
2961
-
2962
-
2963
-
2964
-
2965
-
2966
-
2967
-
2968
-
2969
-
2970
-
2971
-
2972
-
2973
-
2974
-
2975
-
2976
-
2977
-
2978
-
2979
-
2980
-
2981
-
2982
-
2983
-
2984
-
2985
-
2986
-
2987
-
2988
-
2989
- ��
2990
-
2991
-
2992
-
2993
-
2994
-
2995
-
2996
-
2997
-
2998
- 椿
2999
-
3000
-
3001
-
3002
-
3003
-
3004
-
3005
-
3006
-
3007
-
3008
-
3009
-
3010
-
3011
-
3012
-
3013
-
3014
-
3015
-
3016
-
3017
-
3018
-
3019
-
3020
-
3021
-
3022
-
3023
-
3024
-
3025
-
3026
-
3027
-
3028
-
3029
-
3030
-
3031
-
3032
-
3033
-
3034
-
3035
-
3036
-
3037
-
3038
-
3039
-
3040
-
3041
-
3042
-
3043
- 槿
3044
-
3045
-
3046
-
3047
-
3048
-
3049
-
3050
-
3051
-
3052
-
3053
-
3054
-
3055
-
3056
-
3057
-
3058
-
3059
-
3060
-
3061
-
3062
-
3063
-
3064
-
3065
-
3066
-
3067
-
3068
-
3069
-
3070
-
3071
-
3072
-
3073
-
3074
-
3075
-
3076
-
3077
-
3078
-
3079
-
3080
-
3081
-
3082
-
3083
-
3084
-
3085
-
3086
-
3087
-
3088
-
3089
-
3090
-
3091
-
3092
-
3093
-
3094
-
3095
-
3096
-
3097
-
3098
-
3099
-
3100
-
3101
-
3102
-
3103
-
3104
-
3105
-
3106
-
3107
-
3108
-
3109
-
3110
-
3111
-
3112
-
3113
-
3114
-
3115
-
3116
-
3117
- 殿
3118
-
3119
-
3120
-
3121
-
3122
-
3123
-
3124
-
3125
-
3126
-
3127
-
3128
-
3129
-
3130
-
3131
-
3132
-
3133
-
3134
-
3135
-
3136
-
3137
-
3138
-
3139
-
3140
-
3141
-
3142
-
3143
-
3144
-
3145
-
3146
-
3147
-
3148
-
3149
-
3150
-
3151
-
3152
-
3153
-
3154
-
3155
-
3156
-
3157
-
3158
-
3159
-
3160
-
3161
-
3162
-
3163
-
3164
-
3165
-
3166
-
3167
-
3168
-
3169
-
3170
-
3171
-
3172
-
3173
-
3174
-
3175
-
3176
-
3177
-
3178
-
3179
-
3180
-
3181
-
3182
-
3183
-
3184
-
3185
-
3186
-
3187
-
3188
-
3189
-
3190
-
3191
-
3192
-
3193
-
3194
-
3195
-
3196
-
3197
-
3198
-
3199
-
3200
-
3201
-
3202
-
3203
-
3204
-
3205
-
3206
-
3207
-
3208
-
3209
-
3210
-
3211
-
3212
-
3213
-
3214
-
3215
-
3216
-
3217
-
3218
-
3219
-
3220
-
3221
- 沿
3222
-
3223
-
3224
-
3225
-
3226
-
3227
-
3228
-
3229
-
3230
-
3231
-
3232
-
3233
-
3234
-
3235
-
3236
-
3237
-
3238
-
3239
-
3240
-
3241
-
3242
-
3243
-
3244
-
3245
-
3246
-
3247
-
3248
-
3249
-
3250
-
3251
-
3252
-
3253
-
3254
-
3255
-
3256
-
3257
-
3258
-
3259
-
3260
-
3261
-
3262
-
3263
-
3264
-
3265
-
3266
-
3267
-
3268
-
3269
-
3270
-
3271
-
3272
-
3273
-
3274
-
3275
-
3276
-
3277
-
3278
-
3279
-
3280
-
3281
-
3282
-
3283
-
3284
-
3285
-
3286
-
3287
-
3288
-
3289
-
3290
-
3291
-
3292
-
3293
-
3294
-
3295
-
3296
-
3297
-
3298
-
3299
-
3300
-
3301
-
3302
-
3303
-
3304
-
3305
-
3306
-
3307
-
3308
-
3309
-
3310
-
3311
-
3312
-
3313
-
3314
-
3315
-
3316
-
3317
-
3318
-
3319
-
3320
-
3321
-
3322
-
3323
-
3324
-
3325
-
3326
-
3327
- 涿
3328
-
3329
-
3330
-
3331
-
3332
-
3333
-
3334
-
3335
-
3336
-
3337
-
3338
-
3339
-
3340
-
3341
-
3342
-
3343
-
3344
-
3345
-
3346
-
3347
-
3348
-
3349
-
3350
-
3351
-
3352
-
3353
-
3354
-
3355
-
3356
-
3357
-
3358
-
3359
-
3360
-
3361
-
3362
-
3363
-
3364
-
3365
-
3366
-
3367
-
3368
-
3369
-
3370
-
3371
-
3372
-
3373
-
3374
-
3375
-
3376
-
3377
-
3378
-
3379
-
3380
-
3381
-
3382
-
3383
-
3384
-
3385
-
3386
-
3387
-
3388
- 湿
3389
-
3390
-
3391
-
3392
-
3393
-
3394
-
3395
-
3396
-
3397
-
3398
-
3399
-
3400
-
3401
-
3402
-
3403
-
3404
-
3405
-
3406
-
3407
-
3408
-
3409
-
3410
-
3411
-
3412
-
3413
-
3414
-
3415
-
3416
-
3417
-
3418
-
3419
-
3420
-
3421
-
3422
-
3423
-
3424
-
3425
-
3426
-
3427
-
3428
-
3429
-
3430
-
3431
-
3432
-
3433
-
3434
-
3435
-
3436
-
3437
-
3438
-
3439
-
3440
-
3441
-
3442
-
3443
-
3444
-
3445
-
3446
-
3447
-
3448
-
3449
-
3450
-
3451
-
3452
-
3453
-
3454
-
3455
-
3456
-
3457
-
3458
-
3459
-
3460
-
3461
-
3462
-
3463
-
3464
-
3465
-
3466
-
3467
-
3468
-
3469
-
3470
-
3471
-
3472
-
3473
-
3474
-
3475
-
3476
-
3477
-
3478
-
3479
-
3480
-
3481
-
3482
-
3483
-
3484
-
3485
-
3486
-
3487
-
3488
-
3489
-
3490
-
3491
-
3492
-
3493
-
3494
-
3495
-
3496
-
3497
-
3498
-
3499
-
3500
-
3501
-
3502
-
3503
-
3504
-
3505
-
3506
-
3507
-
3508
-
3509
-
3510
-
3511
-
3512
-
3513
-
3514
-
3515
-
3516
-
3517
-
3518
-
3519
-
3520
-
3521
-
3522
-
3523
-
3524
-
3525
-
3526
-
3527
-
3528
-
3529
-
3530
-
3531
-
3532
-
3533
-
3534
-
3535
-
3536
-
3537
-
3538
-
3539
-
3540
-
3541
-
3542
-
3543
-
3544
-
3545
-
3546
-
3547
-
3548
-
3549
-
3550
-
3551
-
3552
-
3553
-
3554
-
3555
-
3556
-
3557
-
3558
-
3559
-
3560
-
3561
-
3562
-
3563
-
3564
-
3565
-
3566
-
3567
-
3568
-
3569
-
3570
-
3571
-
3572
-
3573
-
3574
-
3575
-
3576
-
3577
-
3578
-
3579
-
3580
-
3581
-
3582
-
3583
-
3584
-
3585
-
3586
-
3587
-
3588
-
3589
-
3590
-
3591
-
3592
-
3593
-
3594
-
3595
-
3596
-
3597
-
3598
-
3599
-
3600
-
3601
-
3602
-
3603
-
3604
-
3605
-
3606
-
3607
-
3608
-
3609
-
3610
-
3611
-
3612
-
3613
-
3614
-
3615
-
3616
-
3617
-
3618
-
3619
-
3620
-
3621
-
3622
-
3623
-
3624
-
3625
-
3626
-
3627
-
3628
-
3629
-
3630
-
3631
-
3632
-
3633
-
3634
-
3635
-
3636
-
3637
-
3638
-
3639
-
3640
-
3641
-
3642
-
3643
-
3644
-
3645
-
3646
-
3647
-
3648
-
3649
-
3650
-
3651
-
3652
-
3653
-
3654
-
3655
-
3656
-
3657
-
3658
-
3659
-
3660
-
3661
-
3662
-
3663
-
3664
-
3665
-
3666
-
3667
-
3668
-
3669
-
3670
-
3671
-
3672
-
3673
-
3674
-
3675
-
3676
-
3677
-
3678
-
3679
-
3680
-
3681
-
3682
-
3683
-
3684
-
3685
-
3686
-
3687
-
3688
-
3689
-
3690
-
3691
-
3692
-
3693
-
3694
-
3695
-
3696
-
3697
-
3698
-
3699
-
3700
-
3701
-
3702
-
3703
-
3704
-
3705
-
3706
-
3707
-
3708
-
3709
-
3710
-
3711
-
3712
-
3713
-
3714
-
3715
-
3716
-
3717
-
3718
-
3719
-
3720
-
3721
-
3722
-
3723
-
3724
-
3725
-
3726
-
3727
-
3728
-
3729
-
3730
-
3731
-
3732
-
3733
-
3734
-
3735
-
3736
-
3737
-
3738
-
3739
-
3740
-
3741
-
3742
-
3743
-
3744
-
3745
-
3746
-
3747
-
3748
-
3749
-
3750
-
3751
-
3752
-
3753
-
3754
-
3755
-
3756
-
3757
-
3758
-
3759
-
3760
-
3761
-
3762
-
3763
-
3764
-
3765
-
3766
-
3767
-
3768
-
3769
-
3770
-
3771
-
3772
-
3773
-
3774
-
3775
-
3776
-
3777
-
3778
-
3779
-
3780
-
3781
-
3782
-
3783
-
3784
-
3785
-
3786
-
3787
-
3788
-
3789
-
3790
-
3791
-
3792
-
3793
-
3794
-
3795
-
3796
-
3797
-
3798
-
3799
-
3800
-
3801
-
3802
-
3803
-
3804
-
3805
-
3806
-
3807
-
3808
-
3809
-
3810
-
3811
-
3812
-
3813
-
3814
-
3815
-
3816
-
3817
-
3818
-
3819
-
3820
-
3821
-
3822
-
3823
-
3824
-
3825
-
3826
-
3827
-
3828
-
3829
-
3830
-
3831
-
3832
-
3833
-
3834
-
3835
-
3836
-
3837
-
3838
-
3839
-
3840
-
3841
-
3842
-
3843
-
3844
-
3845
-
3846
-
3847
-
3848
-
3849
-
3850
-
3851
-
3852
-
3853
-
3854
-
3855
-
3856
-
3857
-
3858
-
3859
-
3860
-
3861
-
3862
-
3863
-
3864
-
3865
-
3866
-
3867
-
3868
-
3869
-
3870
-
3871
-
3872
-
3873
-
3874
-
3875
-
3876
-
3877
-
3878
-
3879
-
3880
-
3881
-
3882
-
3883
-
3884
-
3885
-
3886
-
3887
-
3888
-
3889
-
3890
-
3891
-
3892
-
3893
-
3894
-
3895
-
3896
-
3897
-
3898
-
3899
-
3900
-
3901
-
3902
-
3903
-
3904
-
3905
-
3906
-
3907
-
3908
-
3909
-
3910
-
3911
-
3912
-
3913
-
3914
-
3915
-
3916
-
3917
-
3918
-
3919
-
3920
-
3921
-
3922
-
3923
-
3924
-
3925
-
3926
-
3927
-
3928
-
3929
-
3930
-
3931
-
3932
-
3933
-
3934
-
3935
-
3936
-
3937
-
3938
-
3939
-
3940
-
3941
-
3942
-
3943
-
3944
-
3945
-
3946
-
3947
-
3948
-
3949
-
3950
-
3951
-
3952
-
3953
-
3954
-
3955
-
3956
-
3957
-
3958
-
3959
-
3960
-
3961
-
3962
-
3963
-
3964
-
3965
-
3966
-
3967
-
3968
-
3969
-
3970
-
3971
-
3972
-
3973
-
3974
-
3975
-
3976
-
3977
-
3978
-
3979
-
3980
-
3981
-
3982
-
3983
-
3984
-
3985
-
3986
-
3987
-
3988
-
3989
-
3990
-
3991
-
3992
-
3993
-
3994
-
3995
-
3996
-
3997
-
3998
-
3999
-
4000
-
4001
-
4002
-
4003
-
4004
-
4005
-
4006
-
4007
-
4008
-
4009
-
4010
-
4011
-
4012
-
4013
- ��
4014
-
4015
-
4016
-
4017
-
4018
-
4019
-
4020
-
4021
-
4022
-
4023
-
4024
-
4025
-
4026
-
4027
-
4028
-
4029
-
4030
-
4031
-
4032
-
4033
-
4034
-
4035
-
4036
-
4037
-
4038
-
4039
-
4040
-
4041
-
4042
-
4043
-
4044
-
4045
-
4046
-
4047
-
4048
-
4049
-
4050
-
4051
-
4052
-
4053
-
4054
-
4055
-
4056
-
4057
-
4058
-
4059
-
4060
-
4061
-
4062
-
4063
-
4064
-
4065
-
4066
-
4067
-
4068
-
4069
-
4070
-
4071
-
4072
-
4073
-
4074
-
4075
-
4076
-
4077
-
4078
-
4079
-
4080
-
4081
-
4082
-
4083
-
4084
-
4085
-
4086
-
4087
-
4088
-
4089
-
4090
-
4091
-
4092
-
4093
-
4094
-
4095
-
4096
-
4097
-
4098
-
4099
-
4100
-
4101
-
4102
-
4103
-
4104
-
4105
-
4106
-
4107
-
4108
-
4109
-
4110
-
4111
-
4112
-
4113
-
4114
-
4115
-
4116
-
4117
-
4118
-
4119
-
4120
-
4121
-
4122
-
4123
-
4124
-
4125
-
4126
-
4127
-
4128
-
4129
-
4130
-
4131
-
4132
-
4133
-
4134
-
4135
-
4136
-
4137
-
4138
-
4139
-
4140
-
4141
-
4142
-
4143
-
4144
-
4145
-
4146
-
4147
-
4148
-
4149
-
4150
-
4151
-
4152
-
4153
-
4154
-
4155
-
4156
-
4157
-
4158
-
4159
-
4160
-
4161
-
4162
-
4163
-
4164
-
4165
-
4166
-
4167
-
4168
-
4169
-
4170
-
4171
-
4172
- 稿
4173
-
4174
-
4175
-
4176
-
4177
-
4178
-
4179
-
4180
-
4181
- 穿
4182
-
4183
-
4184
-
4185
-
4186
-
4187
-
4188
-
4189
-
4190
-
4191
-
4192
-
4193
-
4194
-
4195
-
4196
-
4197
-
4198
-
4199
-
4200
-
4201
- 窿
4202
-
4203
-
4204
-
4205
-
4206
-
4207
-
4208
-
4209
-
4210
-
4211
-
4212
-
4213
-
4214
-
4215
-
4216
- 竿
4217
-
4218
-
4219
-
4220
-
4221
-
4222
-
4223
-
4224
-
4225
-
4226
-
4227
-
4228
-
4229
-
4230
-
4231
-
4232
-
4233
-
4234
-
4235
-
4236
-
4237
-
4238
-
4239
-
4240
-
4241
-
4242
-
4243
-
4244
-
4245
-
4246
-
4247
-
4248
-
4249
-
4250
-
4251
-
4252
-
4253
-
4254
-
4255
-
4256
-
4257
-
4258
-
4259
-
4260
-
4261
-
4262
-
4263
-
4264
-
4265
-
4266
-
4267
-
4268
-
4269
-
4270
-
4271
-
4272
-
4273
-
4274
-
4275
-
4276
-
4277
-
4278
-
4279
-
4280
-
4281
-
4282
-
4283
-
4284
-
4285
-
4286
-
4287
- 簿
4288
-
4289
-
4290
-
4291
-
4292
-
4293
-
4294
-
4295
-
4296
-
4297
-
4298
-
4299
-
4300
-
4301
-
4302
-
4303
-
4304
-
4305
-
4306
-
4307
-
4308
-
4309
-
4310
-
4311
-
4312
-
4313
-
4314
-
4315
-
4316
- 粿
4317
-
4318
-
4319
-
4320
-
4321
-
4322
-
4323
-
4324
-
4325
-
4326
-
4327
-
4328
-
4329
-
4330
-
4331
-
4332
-
4333
-
4334
-
4335
-
4336
-
4337
-
4338
-
4339
-
4340
-
4341
-
4342
-
4343
-
4344
-
4345
-
4346
-
4347
-
4348
-
4349
-
4350
-
4351
-
4352
-
4353
-
4354
-
4355
-
4356
-
4357
-
4358
-
4359
-
4360
-
4361
-
4362
-
4363
-
4364
-
4365
-
4366
-
4367
-
4368
-
4369
-
4370
-
4371
-
4372
-
4373
-
4374
-
4375
-
4376
-
4377
- 线
4378
-
4379
-
4380
-
4381
-
4382
-
4383
-
4384
-
4385
-
4386
-
4387
-
4388
-
4389
-
4390
-
4391
-
4392
-
4393
-
4394
-
4395
-
4396
-
4397
-
4398
-
4399
-
4400
-
4401
-
4402
-
4403
-
4404
-
4405
-
4406
-
4407
-
4408
-
4409
-
4410
-
4411
-
4412
-
4413
-
4414
-
4415
-
4416
-
4417
-
4418
-
4419
-
4420
-
4421
-
4422
-
4423
-
4424
-
4425
- 绿
4426
-
4427
-
4428
-
4429
-
4430
-
4431
-
4432
-
4433
-
4434
-
4435
-
4436
-
4437
-
4438
-
4439
-
4440
-
4441
-
4442
-
4443
-
4444
-
4445
-
4446
-
4447
-
4448
-
4449
-
4450
-
4451
-
4452
-
4453
-
4454
-
4455
-
4456
-
4457
-
4458
-
4459
-
4460
-
4461
-
4462
-
4463
-
4464
-
4465
-
4466
-
4467
-
4468
-
4469
-
4470
-
4471
-
4472
-
4473
-
4474
-
4475
-
4476
-
4477
-
4478
-
4479
-
4480
-
4481
-
4482
-
4483
-
4484
-
4485
-
4486
-
4487
-
4488
-
4489
-
4490
- 羿
4491
-
4492
-
4493
-
4494
-
4495
-
4496
-
4497
-
4498
-
4499
-
4500
-
4501
-
4502
-
4503
-
4504
-
4505
-
4506
-
4507
-
4508
- 耀
4509
-
4510
-
4511
-
4512
-
4513
-
4514
-
4515
-
4516
-
4517
-
4518
-
4519
-
4520
-
4521
-
4522
-
4523
-
4524
-
4525
-
4526
-
4527
-
4528
-
4529
-
4530
-
4531
-
4532
-
4533
-
4534
-
4535
-
4536
-
4537
-
4538
-
4539
-
4540
-
4541
-
4542
-
4543
-
4544
-
4545
-
4546
-
4547
-
4548
-
4549
-
4550
-
4551
-
4552
-
4553
-
4554
-
4555
-
4556
-
4557
-
4558
-
4559
-
4560
-
4561
-
4562
-
4563
-
4564
-
4565
-
4566
-
4567
-
4568
-
4569
-
4570
-
4571
-
4572
-
4573
-
4574
-
4575
-
4576
-
4577
-
4578
-
4579
-
4580
-
4581
-
4582
-
4583
-
4584
-
4585
-
4586
-
4587
-
4588
-
4589
-
4590
-
4591
-
4592
-
4593
-
4594
-
4595
-
4596
-
4597
-
4598
-
4599
-
4600
-
4601
-
4602
-
4603
-
4604
-
4605
-
4606
-
4607
-
4608
-
4609
-
4610
-
4611
-
4612
-
4613
-
4614
-
4615
-
4616
-
4617
-
4618
-
4619
-
4620
-
4621
-
4622
-
4623
-
4624
-
4625
-
4626
-
4627
-
4628
-
4629
-
4630
-
4631
-
4632
-
4633
-
4634
-
4635
-
4636
-
4637
-
4638
-
4639
-
4640
-
4641
-
4642
-
4643
-
4644
-
4645
-
4646
-
4647
-
4648
-
4649
-
4650
-
4651
-
4652
-
4653
-
4654
-
4655
-
4656
-
4657
-
4658
-
4659
-
4660
-
4661
-
4662
-
4663
-
4664
-
4665
-
4666
-
4667
-
4668
-
4669
-
4670
-
4671
-
4672
-
4673
-
4674
-
4675
-
4676
-
4677
-
4678
-
4679
-
4680
-
4681
-
4682
-
4683
-
4684
-
4685
-
4686
-
4687
-
4688
-
4689
-
4690
-
4691
-
4692
-
4693
-
4694
-
4695
-
4696
-
4697
-
4698
-
4699
-
4700
-
4701
-
4702
-
4703
-
4704
-
4705
-
4706
-
4707
-
4708
-
4709
-
4710
-
4711
-
4712
-
4713
-
4714
-
4715
-
4716
-
4717
-
4718
-
4719
-
4720
-
4721
-
4722
-
4723
-
4724
-
4725
-
4726
-
4727
-
4728
-
4729
-
4730
-
4731
-
4732
-
4733
-
4734
-
4735
-
4736
-
4737
-
4738
-
4739
-
4740
-
4741
-
4742
-
4743
-
4744
-
4745
-
4746
-
4747
-
4748
-
4749
-
4750
-
4751
-
4752
-
4753
-
4754
-
4755
-
4756
-
4757
-
4758
-
4759
-
4760
-
4761
-
4762
-
4763
-
4764
-
4765
-
4766
-
4767
-
4768
-
4769
-
4770
-
4771
-
4772
-
4773
-
4774
-
4775
-
4776
-
4777
-
4778
-
4779
-
4780
-
4781
-
4782
-
4783
-
4784
-
4785
-
4786
-
4787
-
4788
-
4789
-
4790
-
4791
-
4792
-
4793
-
4794
-
4795
-
4796
-
4797
-
4798
-
4799
-
4800
-
4801
-
4802
-
4803
-
4804
-
4805
-
4806
-
4807
-
4808
-
4809
-
4810
-
4811
-
4812
-
4813
-
4814
-
4815
-
4816
-
4817
-
4818
-
4819
-
4820
-
4821
-
4822
-
4823
-
4824
-
4825
-
4826
-
4827
-
4828
-
4829
-
4830
-
4831
-
4832
-
4833
-
4834
-
4835
-
4836
-
4837
-
4838
-
4839
-
4840
-
4841
-
4842
-
4843
-
4844
-
4845
-
4846
-
4847
-
4848
-
4849
-
4850
-
4851
-
4852
-
4853
-
4854
-
4855
-
4856
-
4857
-
4858
-
4859
-
4860
-
4861
-
4862
-
4863
-
4864
-
4865
-
4866
-
4867
-
4868
-
4869
-
4870
-
4871
-
4872
-
4873
-
4874
-
4875
-
4876
-
4877
-
4878
-
4879
-
4880
-
4881
-
4882
-
4883
-
4884
-
4885
-
4886
-
4887
-
4888
-
4889
-
4890
-
4891
-
4892
-
4893
-
4894
-
4895
-
4896
-
4897
-
4898
-
4899
-
4900
-
4901
-
4902
-
4903
-
4904
-
4905
-
4906
-
4907
-
4908
-
4909
-
4910
-
4911
-
4912
-
4913
-
4914
-
4915
-
4916
-
4917
-
4918
-
4919
-
4920
-
4921
-
4922
-
4923
-
4924
-
4925
-
4926
-
4927
-
4928
-
4929
-
4930
-
4931
-
4932
-
4933
-
4934
-
4935
-
4936
-
4937
-
4938
-
4939
-
4940
-
4941
-
4942
-
4943
-
4944
-
4945
-
4946
-
4947
-
4948
-
4949
-
4950
-
4951
-
4952
-
4953
-
4954
-
4955
-
4956
-
4957
-
4958
-
4959
-
4960
-
4961
-
4962
-
4963
-
4964
-
4965
-
4966
-
4967
-
4968
-
4969
-
4970
-
4971
-
4972
-
4973
-
4974
-
4975
-
4976
-
4977
-
4978
-
4979
-
4980
-
4981
-
4982
-
4983
-
4984
-
4985
-
4986
-
4987
-
4988
-
4989
-
4990
-
4991
-
4992
-
4993
-
4994
-
4995
-
4996
-
4997
-
4998
-
4999
-
5000
-
5001
-
5002
-
5003
-
5004
-
5005
-
5006
-
5007
-
5008
-
5009
-
5010
-
5011
-
5012
-
5013
-
5014
-
5015
-
5016
-
5017
-
5018
-
5019
-
5020
-
5021
-
5022
-
5023
-
5024
-
5025
-
5026
-
5027
-
5028
-
5029
-
5030
-
5031
-
5032
-
5033
-
5034
-
5035
-
5036
-
5037
- ��
5038
-
5039
-
5040
-
5041
-
5042
-
5043
-
5044
-
5045
-
5046
-
5047
-
5048
-
5049
-
5050
-
5051
-
5052
-
5053
-
5054
-
5055
-
5056
-
5057
-
5058
-
5059
-
5060
-
5061
-
5062
-
5063
-
5064
-
5065
-
5066
-
5067
-
5068
-
5069
-
5070
-
5071
-
5072
-
5073
-
5074
-
5075
-
5076
-
5077
-
5078
-
5079
-
5080
-
5081
-
5082
-
5083
-
5084
-
5085
-
5086
-
5087
-
5088
-
5089
-
5090
-
5091
-
5092
-
5093
-
5094
-
5095
-
5096
-
5097
-
5098
-
5099
-
5100
-
5101
-
5102
-
5103
-
5104
-
5105
-
5106
-
5107
-
5108
-
5109
-
5110
-
5111
-
5112
-
5113
-
5114
-
5115
-
5116
-
5117
-
5118
-
5119
-
5120
-
5121
-
5122
-
5123
-
5124
-
5125
-
5126
-
5127
-
5128
-
5129
-
5130
-
5131
-
5132
-
5133
-
5134
-
5135
-
5136
-
5137
-
5138
-
5139
-
5140
-
5141
-
5142
-
5143
- 西
5144
-
5145
-
5146
-
5147
-
5148
-
5149
-
5150
-
5151
-
5152
-
5153
-
5154
-
5155
-
5156
-
5157
-
5158
-
5159
-
5160
-
5161
-
5162
-
5163
-
5164
-
5165
-
5166
-
5167
-
5168
-
5169
-
5170
-
5171
-
5172
-
5173
-
5174
-
5175
-
5176
-
5177
-
5178
-
5179
-
5180
-
5181
-
5182
-
5183
-
5184
-
5185
-
5186
-
5187
-
5188
-
5189
-
5190
-
5191
-
5192
-
5193
-
5194
-
5195
-
5196
-
5197
-
5198
-
5199
-
5200
-
5201
-
5202
-
5203
-
5204
- 访
5205
-
5206
-
5207
-
5208
-
5209
-
5210
-
5211
-
5212
-
5213
-
5214
-
5215
-
5216
-
5217
-
5218
-
5219
-
5220
-
5221
-
5222
-
5223
-
5224
-
5225
-
5226
-
5227
-
5228
-
5229
-
5230
-
5231
-
5232
-
5233
-
5234
-
5235
-
5236
-
5237
-
5238
-
5239
-
5240
-
5241
-
5242
-
5243
-
5244
-
5245
-
5246
-
5247
-
5248
-
5249
-
5250
-
5251
-
5252
- 诿
5253
-
5254
-
5255
-
5256
-
5257
-
5258
-
5259
-
5260
-
5261
-
5262
-
5263
-
5264
-
5265
-
5266
-
5267
-
5268
-
5269
-
5270
-
5271
-
5272
-
5273
-
5274
-
5275
-
5276
-
5277
-
5278
-
5279
-
5280
-
5281
-
5282
-
5283
-
5284
-
5285
-
5286
-
5287
-
5288
-
5289
-
5290
-
5291
-
5292
-
5293
-
5294
-
5295
-
5296
-
5297
-
5298
-
5299
-
5300
-
5301
-
5302
-
5303
-
5304
-
5305
-
5306
-
5307
-
5308
-
5309
-
5310
-
5311
-
5312
-
5313
-
5314
-
5315
-
5316
-
5317
-
5318
-
5319
-
5320
-
5321
-
5322
-
5323
-
5324
-
5325
-
5326
-
5327
-
5328
-
5329
-
5330
-
5331
-
5332
-
5333
-
5334
-
5335
-
5336
-
5337
-
5338
-
5339
-
5340
-
5341
-
5342
-
5343
-
5344
-
5345
- 贿
5346
-
5347
-
5348
-
5349
-
5350
-
5351
-
5352
-
5353
-
5354
-
5355
-
5356
-
5357
-
5358
-
5359
-
5360
-
5361
-
5362
-
5363
-
5364
-
5365
-
5366
-
5367
-
5368
-
5369
-
5370
-
5371
-
5372
-
5373
-
5374
-
5375
-
5376
-
5377
-
5378
-
5379
-
5380
-
5381
-
5382
-
5383
-
5384
-
5385
-
5386
-
5387
-
5388
-
5389
-
5390
-
5391
-
5392
-
5393
-
5394
-
5395
-
5396
-
5397
-
5398
-
5399
-
5400
-
5401
-
5402
-
5403
-
5404
-
5405
-
5406
-
5407
-
5408
-
5409
-
5410
-
5411
-
5412
-
5413
-
5414
-
5415
-
5416
-
5417
-
5418
-
5419
-
5420
-
5421
-
5422
-
5423
-
5424
-
5425
-
5426
-
5427
-
5428
-
5429
-
5430
-
5431
-
5432
-
5433
-
5434
-
5435
-
5436
-
5437
-
5438
-
5439
-
5440
-
5441
-
5442
-
5443
-
5444
-
5445
-
5446
-
5447
-
5448
-
5449
- 蹿
5450
-
5451
-
5452
-
5453
-
5454
-
5455
-
5456
-
5457
-
5458
-
5459
-
5460
-
5461
-
5462
-
5463
-
5464
-
5465
-
5466
-
5467
-
5468
-
5469
-
5470
-
5471
-
5472
-
5473
-
5474
-
5475
- 轿
5476
-
5477
-
5478
-
5479
-
5480
-
5481
-
5482
-
5483
-
5484
-
5485
-
5486
-
5487
-
5488
-
5489
-
5490
-
5491
-
5492
-
5493
-
5494
-
5495
-
5496
-
5497
-
5498
-
5499
-
5500
-
5501
-
5502
-
5503
-
5504
-
5505
-
5506
-
5507
-
5508
-
5509
-
5510
-
5511
-
5512
-
5513
-
5514
-
5515
-
5516
-
5517
-
5518
-
5519
-
5520
-
5521
-
5522
-
5523
-
5524
-
5525
-
5526
-
5527
-
5528
-
5529
-
5530
-
5531
-
5532
-
5533
-
5534
-
5535
-
5536
-
5537
-
5538
-
5539
-
5540
- 退
5541
-
5542
-
5543
-
5544
-
5545
-
5546
-
5547
-
5548
-
5549
-
5550
-
5551
-
5552
-
5553
-
5554
-
5555
-
5556
-
5557
-
5558
-
5559
-
5560
-
5561
-
5562
-
5563
-
5564
-
5565
-
5566
-
5567
-
5568
-
5569
-
5570
-
5571
-
5572
-
5573
-
5574
-
5575
-
5576
-
5577
-
5578
-
5579
-
5580
-
5581
-
5582
-
5583
-
5584
-
5585
-
5586
-
5587
-
5588
-
5589
-
5590
-
5591
-
5592
-
5593
-
5594
-
5595
-
5596
-
5597
-
5598
-
5599
-
5600
-
5601
-
5602
-
5603
-
5604
-
5605
-
5606
-
5607
-
5608
-
5609
-
5610
-
5611
-
5612
-
5613
-
5614
-
5615
-
5616
-
5617
-
5618
-
5619
-
5620
-
5621
-
5622
-
5623
-
5624
-
5625
-
5626
-
5627
-
5628
-
5629
-
5630
-
5631
-
5632
-
5633
-
5634
-
5635
-
5636
-
5637
-
5638
-
5639
-
5640
-
5641
-
5642
-
5643
-
5644
-
5645
-
5646
-
5647
-
5648
-
5649
-
5650
-
5651
-
5652
-
5653
-
5654
-
5655
-
5656
-
5657
-
5658
-
5659
-
5660
-
5661
-
5662
-
5663
-
5664
-
5665
-
5666
-
5667
-
5668
-
5669
-
5670
-
5671
-
5672
-
5673
-
5674
-
5675
-
5676
-
5677
-
5678
-
5679
-
5680
-
5681
-
5682
-
5683
-
5684
-
5685
-
5686
-
5687
-
5688
-
5689
-
5690
-
5691
-
5692
-
5693
-
5694
-
5695
-
5696
-
5697
-
5698
-
5699
-
5700
-
5701
-
5702
-
5703
-
5704
-
5705
-
5706
-
5707
-
5708
-
5709
-
5710
-
5711
-
5712
-
5713
-
5714
-
5715
-
5716
-
5717
-
5718
-
5719
-
5720
-
5721
-
5722
-
5723
-
5724
-
5725
-
5726
-
5727
-
5728
-
5729
-
5730
-
5731
-
5732
-
5733
-
5734
-
5735
-
5736
-
5737
-
5738
-
5739
-
5740
-
5741
-
5742
-
5743
-
5744
-
5745
-
5746
-
5747
-
5748
-
5749
-
5750
-
5751
-
5752
-
5753
-
5754
-
5755
-
5756
-
5757
-
5758
-
5759
-
5760
-
5761
-
5762
-
5763
-
5764
-
5765
-
5766
-
5767
-
5768
-
5769
-
5770
-
5771
-
5772
-
5773
-
5774
-
5775
-
5776
-
5777
-
5778
-
5779
-
5780
-
5781
-
5782
-
5783
-
5784
-
5785
-
5786
-
5787
-
5788
-
5789
-
5790
-
5791
-
5792
-
5793
-
5794
-
5795
-
5796
-
5797
-
5798
-
5799
-
5800
-
5801
-
5802
-
5803
-
5804
-
5805
-
5806
-
5807
-
5808
-
5809
-
5810
-
5811
-
5812
-
5813
-
5814
-
5815
-
5816
-
5817
-
5818
-
5819
-
5820
-
5821
-
5822
-
5823
-
5824
-
5825
-
5826
-
5827
-
5828
-
5829
-
5830
-
5831
-
5832
-
5833
-
5834
-
5835
-
5836
-
5837
-
5838
-
5839
-
5840
-
5841
-
5842
-
5843
-
5844
-
5845
-
5846
-
5847
-
5848
-
5849
-
5850
-
5851
-
5852
-
5853
-
5854
-
5855
-
5856
-
5857
-
5858
-
5859
-
5860
-
5861
-
5862
-
5863
-
5864
-
5865
-
5866
-
5867
-
5868
-
5869
-
5870
-
5871
-
5872
-
5873
-
5874
-
5875
-
5876
-
5877
-
5878
-
5879
-
5880
-
5881
-
5882
-
5883
-
5884
-
5885
-
5886
-
5887
-
5888
-
5889
-
5890
-
5891
-
5892
-
5893
-
5894
-
5895
-
5896
-
5897
-
5898
-
5899
-
5900
-
5901
-
5902
-
5903
-
5904
-
5905
-
5906
-
5907
-
5908
-
5909
-
5910
-
5911
-
5912
-
5913
-
5914
-
5915
-
5916
-
5917
-
5918
-
5919
-
5920
-
5921
-
5922
-
5923
-
5924
-
5925
-
5926
-
5927
-
5928
-
5929
-
5930
-
5931
-
5932
-
5933
-
5934
-
5935
-
5936
-
5937
-
5938
-
5939
-
5940
-
5941
-
5942
-
5943
-
5944
-
5945
-
5946
-
5947
-
5948
-
5949
-
5950
-
5951
-
5952
-
5953
-
5954
-
5955
-
5956
-
5957
-
5958
-
5959
-
5960
-
5961
-
5962
-
5963
-
5964
-
5965
-
5966
-
5967
-
5968
-
5969
-
5970
-
5971
-
5972
-
5973
-
5974
-
5975
-
5976
-
5977
-
5978
-
5979
-
5980
-
5981
-
5982
-
5983
-
5984
-
5985
-
5986
-
5987
-
5988
-
5989
-
5990
-
5991
-
5992
-
5993
-
5994
-
5995
-
5996
-
5997
-
5998
-
5999
-
6000
-
6001
-
6002
-
6003
-
6004
-
6005
-
6006
-
6007
-
6008
-
6009
-
6010
-
6011
-
6012
-
6013
-
6014
-
6015
-
6016
-
6017
-
6018
-
6019
-
6020
-
6021
-
6022
-
6023
-
6024
-
6025
-
6026
-
6027
-
6028
-
6029
-
6030
-
6031
-
6032
-
6033
-
6034
-
6035
-
6036
-
6037
-
6038
-
6039
-
6040
-
6041
-
6042
-
6043
-
6044
-
6045
-
6046
-
6047
-
6048
-
6049
-
6050
-
6051
-
6052
-
6053
-
6054
-
6055
-
6056
-
6057
-
6058
-
6059
-
6060
-
6061
- ��
6062
-
6063
-
6064
-
6065
- 饿
6066
-
6067
-
6068
-
6069
-
6070
-
6071
-
6072
-
6073
-
6074
-
6075
-
6076
-
6077
-
6078
-
6079
-
6080
-
6081
-
6082
-
6083
-
6084
-
6085
-
6086
-
6087
-
6088
-
6089
-
6090
-
6091
-
6092
-
6093
-
6094
-
6095
-
6096
-
6097
-
6098
-
6099
-
6100
-
6101
-
6102
-
6103
- 驿
6104
-
6105
-
6106
-
6107
-
6108
-
6109
-
6110
-
6111
-
6112
-
6113
-
6114
-
6115
-
6116
-
6117
-
6118
-
6119
-
6120
-
6121
-
6122
-
6123
-
6124
-
6125
-
6126
-
6127
-
6128
-
6129
-
6130
-
6131
-
6132
-
6133
-
6134
-
6135
-
6136
-
6137
-
6138
-
6139
-
6140
-
6141
-
6142
-
6143
-
6144
-
6145
-
6146
-
6147
-
6148
-
6149
-
6150
-
6151
-
6152
-
6153
-
6154
-
6155
-
6156
-
6157
- 鱿
6158
-
6159
-
6160
-
6161
-
6162
-
6163
-
6164
-
6165
-
6166
-
6167
-
6168
-
6169
-
6170
-
6171
-
6172
-
6173
-
6174
-
6175
-
6176
-
6177
-
6178
-
6179
-
6180
-
6181
-
6182
-
6183
-
6184
-
6185
-
6186
-
6187
-
6188
-
6189
-
6190
-
6191
-
6192
-
6193
-
6194
-
6195
-
6196
-
6197
-
6198
-
6199
-
6200
- 鸿
6201
-
6202
-
6203
-
6204
-
6205
-
6206
-
6207
-
6208
-
6209
-
6210
-
6211
-
6212
-
6213
-
6214
-
6215
-
6216
-
6217
-
6218
-
6219
-
6220
-
6221
-
6222
- 鹿
6223
-
6224
-
6225
-
6226
-
6227
-
6228
-
6229
-
6230
-
6231
-
6232
-
6233
-
6234
-
6235
-
6236
-
6237
-
6238
-
6239
-
6240
-
6241
-
6242
-
6243
-
6244
-
6245
-
6246
-
6247
-
6248
-
6249
-
6250
-
6251
-
6252
-
6253
-
6254
-
6255
-
6256
-
6257
-
6258
-
6259
-
6260
- 齿
6261
-
6262
-
6263
-
6264
-
6265
-
6266
-
6267
-
6268
-
6269
-
6270
-
6271
-
6272
-
6273
-
6274
-
6275
-
6276
-
6277
-
6278
-
6279
-
6280
-
6281
-
6282
-
6283
-
6284
-
6285
-
6286
-
6287
-
6288
-
6289
-
6290
-
6291
-
6292
-
6293
-
6294
-
6295
-
6296
-
6297
-
6298
-
6299
-
6300
-
6301
-
6302
-
6303
-
6304
-
6305
-
6306
-
6307
-
6308
-
6309
-
6310
-
6311
-
6312
-
6313
-
6314
-
6315
-
6316
-
6317
-
6318
-
6319
-
6320
-
6321
-
6322
-
6323
-
6324
-
6325
-
6326
-
6327
-
6328
-
6329
-
6330
-
6331
-
6332
-
6333
-
6334
-
6335
-
6336
-
6337
-
6338
-
6339
-
6340
-
6341
-
6342
-
6343
-
6344
-
6345
-
6346
-
6347
-
6348
-
6349
-
6350
-
6351
-
6352
- ︿
6353
-
6354
-
6355
-
6356
-
6357
-
6358
-
6359
-
6360
-
6361
-
6362
-
6363
-
6364
-
6365
-
6366
-
6367
-
6368
-
6369
-
6370
-
6371
-
6372
-
6373
-
6374
-
6375
-
6376
-
6377
-
6378
-
6379
-
6380
-
6381
-
6382
-
6383
-
6384
-
6385
-
6386
-
6387
-
6388
-
6389
-
6390
- ...
6391
- yam
6392
- lofter
6393
- ##s
6394
- by
6395
- ##0
6396
- com
6397
- ##a
6398
- ##2
6399
- ##1
6400
- ##3
6401
- ##e
6402
- ##8
6403
- ##5
6404
- ##6
6405
- ##4
6406
- ##9
6407
- ##7
6408
- ##t
6409
- ##o
6410
- ##d
6411
- ##i
6412
- ##n
6413
- app
6414
- www
6415
- the
6416
- ##m
6417
- ##c
6418
- ##l
6419
- ##y
6420
- ##r
6421
- ##g
6422
- http
6423
- qq
6424
- ##p
6425
- ##f
6426
- google
6427
- pixnet
6428
- cookies
6429
- tripadvisor
6430
- ##er
6431
- ##k
6432
- ##h
6433
- facebook
6434
- ##b
6435
- of
6436
- ##x
6437
- ##u
6438
- iphone
6439
- ip
6440
- in
6441
- ##w
6442
- ##ing
6443
- ctrip
6444
- ##on
6445
- ##v
6446
- to
6447
- id
6448
- it
6449
- windows
6450
- llc
6451
- top
6452
- led
6453
- at
6454
- ##an
6455
- ##z
6456
- android
6457
- and
6458
- vr
6459
- blogthis
6460
- twitter
6461
- ##le
6462
- ok
6463
- cn
6464
- no
6465
- ios
6466
- ##in
6467
- ##mm
6468
- on
6469
- te
6470
- ig
6471
- lv
6472
- ##ng
6473
- ##us
6474
- pc
6475
- ──
6476
- ##te
6477
- ##ed
6478
- html
6479
- ncc
6480
- wifi
6481
- email
6482
- blog
6483
- is
6484
- mail
6485
- online
6486
- ##al
6487
- dvd
6488
- ##ic
6489
- studio
6490
- ##℃
6491
- ##ia
6492
- line
6493
- vip
6494
- ##q
6495
- ##ce
6496
- ##en
6497
- for
6498
- ##is
6499
- ##ra
6500
- ##es
6501
- ##j
6502
- usb
6503
- net
6504
- cp
6505
- asia
6506
- ##cm
6507
- diy
6508
- new
6509
- ta
6510
- language
6511
- vs
6512
- apple
6513
- tw
6514
- web
6515
- ##ne
6516
- ipad
6517
- you
6518
- ##re
6519
- ##tion
6520
- ps
6521
- de
6522
- bt
6523
- pony
6524
- atm
6525
- ##ch
6526
- ceo
6527
- ##or
6528
- go
6529
- ##na
6530
- av
6531
- pro
6532
- cafe
6533
- pinterest
6534
- pixstyleme3c
6535
- ##ta
6536
- more
6537
- said
6538
- mp3
6539
- ##ll
6540
- nba
6541
- jun
6542
- tv
6543
- pm
6544
- nbsp
6545
- ##ie
6546
- linux
6547
- ##ma
6548
- cd
6549
- hd
6550
- ##ion
6551
- am
6552
- ##th
6553
- ##st
6554
- ##se
6555
- ##et
6556
- gdp
6557
- my
6558
- abc
6559
- flash
6560
- one
6561
- ##ck
6562
- gps
6563
- ##ly
6564
- web885
6565
- ##ge
6566
- xd
6567
- boss
6568
- isbn
6569
- org
6570
- ##ry
6571
- me
6572
- love
6573
- ##ter
6574
- ##ar
6575
- ##la
6576
- hotel
6577
- pk
6578
- ie
6579
- ##os
6580
- ##el
6581
- seo
6582
- cpu
6583
- ##ml
6584
- p2p
6585
- may
6586
- sun
6587
- tue
6588
- internet
6589
- cc
6590
- posted
6591
- youtube
6592
- ##at
6593
- ##man
6594
- ii
6595
- abs
6596
- nt
6597
- pdf
6598
- yahoo
6599
- ago
6600
- ##it
6601
- news
6602
- mac
6603
- ##me
6604
- java
6605
- spa
6606
- ##de
6607
- ##nt
6608
- hk
6609
- all
6610
- plus
6611
- la
6612
- ##mb
6613
- ##ve
6614
- west
6615
- ##da
6616
- air
6617
- ##ps
6618
- ##to
6619
- logo
6620
- htc
6621
- php
6622
- https
6623
- fi
6624
- momo
6625
- ##son
6626
- sat
6627
- ##ke
6628
- ebd
6629
- suv
6630
- wi
6631
- day
6632
- apk
6633
- ##um
6634
- mv
6635
- galaxy
6636
- wiki
6637
- or
6638
- brake
6639
- this
6640
- mon
6641
- po
6642
- javascript
6643
- life
6644
- home
6645
- june
6646
- ##ss
6647
- system
6648
- pp
6649
- world
6650
- fb
6651
- br
6652
- ##as
6653
- ic
6654
- ai
6655
- leonardo
6656
- safari
6657
- live
6658
- free
6659
- xx
6660
- wed
6661
- win7
6662
- kiehl
6663
- ##co
6664
- lg
6665
- o2o
6666
- ##go
6667
- us
6668
- mm
6669
- vfm
6670
- kanye
6671
- ##id
6672
- jr
6673
- ##ey
6674
- rss
6675
- ##sa
6676
- ##ro
6677
- ##am
6678
- ##no
6679
- thu
6680
- fri
6681
- ##sh
6682
- ##ki
6683
- comments
6684
- name
6685
- ##pe
6686
- ##ine
6687
- max
6688
- uber
6689
- ##mi
6690
- ##ton
6691
- wordpress
6692
- office
6693
- ##ment
6694
- bd
6695
- win10
6696
- ##ld
6697
- ##li
6698
- gmail
6699
- bb
6700
- dior
6701
- ##rs
6702
- ##ri
6703
- ##rd
6704
- up
6705
- cad
6706
- dr
6707
- read
6708
- ##io
6709
- url
6710
- pvc
6711
- paypal
6712
- show
6713
- policy
6714
- ##ty
6715
- with
6716
- txt
6717
- ##ba
6718
- dna
6719
- from
6720
- post
6721
- mini
6722
- ar
6723
- taiwan
6724
- john
6725
- ##ga
6726
- privacy
6727
- agoda
6728
- ##ny
6729
- word
6730
- ##by
6731
- ##ur
6732
- ##hz
6733
- ##ang
6734
- cookie
6735
- netscape
6736
- ##ka
6737
- ##~
6738
- ##ad
6739
- house
6740
- share
6741
- note
6742
- ibm
6743
- code
6744
- hello
6745
- nike
6746
- sim
6747
- survey
6748
- wikia
6749
- cbc
6750
- ##tor
6751
- ##kg
6752
- ##rt
6753
- campaign
6754
- store
6755
- os
6756
- ##ct
6757
- ##ts
6758
- ##°
6759
- api
6760
- ##ns
6761
- excel
6762
- ##ao
6763
- ##nd
6764
- university
6765
- ##ya
6766
- ##il
6767
- pierre
6768
- ipo
6769
- hotels
6770
- ##ian
6771
- years
6772
- ##ers
6773
- high
6774
- ##day
6775
- time
6776
- ##ay
6777
- bug
6778
- ##line
6779
- ##be
6780
- xp
6781
- talk2yam
6782
- yamservice
6783
- coco
6784
- ##dy
6785
- sony
6786
- ##ies
6787
- microsoft
6788
- david
6789
- people
6790
- ##ha
6791
- instagram
6792
- intel
6793
- ##ot
6794
- iso
6795
- ##va
6796
- ##mo
6797
- ##land
6798
- xxx
6799
- man
6800
- co
6801
- ltxsw
6802
- ##ation
6803
- baby
6804
- ##pa
6805
- ##ol
6806
- tag
6807
- ##ue
6808
- msn
6809
- oppo
6810
- ##ca
6811
- control
6812
- ##om
6813
- st
6814
- chrome
6815
- ##ure
6816
- be
6817
- lol
6818
- ##bo
6819
- lady
6820
- ##way
6821
- ##ko
6822
- ##do
6823
- ##un
6824
- corporation
6825
- ##ni
6826
- herme
6827
- ##up
6828
- ui
6829
- ##ds
6830
- ppt
6831
- admin
6832
- three
6833
- bbc
6834
- re
6835
- ca
6836
- hp
6837
- ##ee
6838
- tpp
6839
- ##ive
6840
- root
6841
- ##cc
6842
- ##ble
6843
- ##ity
6844
- adobe
6845
- park
6846
- et
6847
- oled
6848
- city
6849
- ##ex
6850
- ##ler
6851
- ##ap
6852
- china
6853
- ##book
6854
- view
6855
- ##ice
6856
- global
6857
- ##km
6858
- your
6859
- hong
6860
- ##mg
6861
- out
6862
- ##ms
6863
- ng
6864
- ebay
6865
- menu
6866
- ubuntu
6867
- ##cy
6868
- rom
6869
- ##view
6870
- open
6871
- ktv
6872
- do
6873
- server
6874
- ##lo
6875
- if
6876
- english
6877
- ##oo
6878
- step1
6879
- kong
6880
- club
6881
- july
6882
- inc
6883
- mr
6884
- hi
6885
- ##net
6886
- touch
6887
- ##ls
6888
- ##ii
6889
- michael
6890
- lcd
6891
- phone
6892
- james
6893
- step2
6894
- ios9
6895
- ##box
6896
- dc
6897
- ##ley
6898
- samsung
6899
- pokemon
6900
- css
6901
- ##ent
6902
- ##les
6903
- s8
6904
- atom
6905
- play
6906
- bmw
6907
- ##said
6908
- sa
6909
- etf
6910
- ctrl
6911
- adidas
6912
- amazon
6913
- ##ber
6914
- ##ner
6915
- visa
6916
- ##der
6917
- connectivity
6918
- ##hi
6919
- firefox
6920
- hr
6921
- so
6922
- style
6923
- mark
6924
- pop
6925
- ol
6926
- skip
6927
- as
6928
- ##ir
6929
- mba
6930
- ##ai
6931
- le
6932
- ##ver
6933
- cafe2017
6934
- lte
6935
- super
6936
- ##ron
6937
- amd
6938
- like
6939
- are
6940
- ##ster
6941
- we
6942
- ##sk
6943
- paul
6944
- data
6945
- international
6946
- ##ft
6947
- longchamp
6948
- ssd
6949
- good
6950
- ##ti
6951
- reply
6952
- ##my
6953
- apr
6954
- star
6955
- ##ker
6956
- source
6957
- js
6958
- get
6959
- force
6960
- photo
6961
- ##one
6962
- ##ow
6963
- link
6964
- bbs
6965
- goods
6966
- ##lin
6967
- python
6968
- ##ip
6969
- game
6970
- ##ics
6971
- blue
6972
- page
6973
- itunes
6974
- gt
6975
- gif
6976
- ##ff
6977
- group
6978
- about
6979
- bar
6980
- ganji
6981
- ##nce
6982
- music
6983
- lee
6984
- not
6985
- ##per
6986
- an
6987
- faq
6988
- comment
6989
- days
6990
- ##ock
6991
- ##bs
6992
- v1
6993
- player
6994
- xbox
6995
- sql
6996
- fm
6997
- f1
6998
- ##ah
6999
- ##lv
7000
- ##mp
7001
- melody
7002
- xml
7003
- market
7004
- ##au
7005
- what
7006
- gl
7007
- ##age
7008
- tips
7009
- book
7010
- ##ting
7011
- mysql
7012
- can
7013
- ##ung
7014
- wonderland
7015
- watch
7016
- ##ction
7017
- mar
7018
- mobile
7019
- article
7020
- ##db
7021
- part
7022
- party
7023
- ##ore
7024
- ##op
7025
- dj
7026
- main
7027
- ##ong
7028
- art
7029
- ad
7030
- pm2
7031
- japan
7032
- ts
7033
- ##ica
7034
- der
7035
- sm
7036
- ##wa
7037
- ct
7038
- homemesh
7039
- search
7040
- ##tv
7041
- ##di
7042
- macbook
7043
- service
7044
- type
7045
- ##ier
7046
- ##si
7047
- ##ok
7048
- best
7049
- goris
7050
- lock
7051
- cf
7052
- big
7053
- ##ut
7054
- ftp
7055
- carol
7056
- ##vi
7057
- happy
7058
- sd
7059
- ##ac
7060
- anti
7061
- pe
7062
- cnn
7063
- iii
7064
- esp
7065
- jan
7066
- tags
7067
- august
7068
- vol
7069
- ##fs
7070
- ##sion
7071
- design
7072
- ac
7073
- press
7074
- jordan
7075
- ppp
7076
- that
7077
- key
7078
- check
7079
- ##tt
7080
- ##㎡
7081
- ##lt
7082
- power
7083
- ##bc
7084
- vivi
7085
- he
7086
- jpg
7087
- ##rry
7088
- nb
7089
- ##ted
7090
- ##rn
7091
- usd
7092
- ##t00
7093
- master
7094
- model
7095
- al
7096
- ram
7097
- goo
7098
- ##ui
7099
- red
7100
- ##ary
7101
- rpg
7102
- item
7103
- ##pm
7104
- ##za
7105
- project
7106
- hot
7107
- td
7108
- blogabstract
7109
- ##ger
7110
- gr2
7111
- black
7112
- electronic
7113
- nfc
7114
- year
7115
- asus
7116
- html5
7117
- cindy
7118
- ##hd
7119
- m3
7120
- esc
7121
- ##od
7122
- booking
7123
- fed
7124
- tvb
7125
- ##ina
7126
- mit
7127
- chan
7128
- distribution
7129
- next
7130
- peter
7131
- bios
7132
- steam
7133
- cm
7134
- pk10
7135
- ##ix
7136
- dec
7137
- nasa
7138
- ##ana
7139
- icecat
7140
- b1
7141
- will
7142
- li
7143
- se
7144
- ##ji
7145
- ##ard
7146
- oct
7147
- ##ain
7148
- jp
7149
- ##ze
7150
- ##bi
7151
- cio
7152
- smart
7153
- h5
7154
- ##port
7155
- curve
7156
- vpn
7157
- ##nm
7158
- ##dia
7159
- utc
7160
- rmvb
7161
- chanel
7162
- a4
7163
- miss
7164
- ##and
7165
- ##im
7166
- media
7167
- who
7168
- she
7169
- girl
7170
- vera
7171
- class
7172
- vivo
7173
- king
7174
- ##ei
7175
- national
7176
- ab
7177
- ipod
7178
- ap
7179
- ms
7180
- mp4
7181
- msci
7182
- ##po
7183
- mg
7184
- index
7185
- ##bit
7186
- ##out
7187
- ##zz
7188
- apec
7189
- photoshop
7190
- opec
7191
- ##tes
7192
- ##ast
7193
- ○○
7194
- ##ling
7195
- ##ory
7196
- ##ical
7197
- kitty
7198
- content
7199
- step3
7200
- ##cn
7201
- win8
7202
- vc
7203
- iphone7
7204
- robert
7205
- tcl
7206
- beauty
7207
- en
7208
- dollars
7209
- ##ys
7210
- ##oc
7211
- step
7212
- pay
7213
- yy
7214
- a1
7215
- ##lly
7216
- ##ks
7217
- download
7218
- sep
7219
- exe
7220
- ph
7221
- school
7222
- gb
7223
- center
7224
- pr
7225
- street
7226
- ##board
7227
- uv
7228
- ##lan
7229
- winrar
7230
- ##que
7231
- ##ua
7232
- ##com
7233
- gpu
7234
- ettoday
7235
- fu
7236
- tom
7237
- ##ren
7238
- ##via
7239
- b2b
7240
- ##tch
7241
- rose
7242
- arm
7243
- mb
7244
- ##ial
7245
- ##nn
7246
- nvidia
7247
- step4
7248
- mvp
7249
- york
7250
- how
7251
- cpi
7252
- gov
7253
- kg
7254
- joe
7255
- ##xx
7256
- mandy
7257
- pa
7258
- ##ser
7259
- copyright
7260
- fashion
7261
- don
7262
- ecu
7263
- ##ist
7264
- ##art
7265
- erp
7266
- wap
7267
- have
7268
- ##lm
7269
- talk
7270
- ##ek
7271
- ##ning
7272
- ##if
7273
- ch
7274
- ##ite
7275
- video
7276
- cs
7277
- san
7278
- iot
7279
- look
7280
- ##ku
7281
- october
7282
- ##ux
7283
- trump
7284
- ##hs
7285
- ##ide
7286
- box
7287
- first
7288
- ##ins
7289
- april
7290
- ##ight
7291
- angel
7292
- protected
7293
- aa
7294
- x1
7295
- m2
7296
- ##fe
7297
- ##×
7298
- ##ho
7299
- size
7300
- min
7301
- ofo
7302
- fun
7303
- gomaji
7304
- ex
7305
- hdmi
7306
- food
7307
- dns
7308
- march
7309
- chris
7310
- kevin
7311
- ##lla
7312
- ##pp
7313
- ##ec
7314
- ag
7315
- ems
7316
- ##rm
7317
- ##ham
7318
- off
7319
- asp
7320
- team
7321
- fandom
7322
- ed
7323
- ##ell
7324
- info
7325
- sina
7326
- ##able
7327
- ##ctor
7328
- dll
7329
- rights
7330
- ltd
7331
- idc
7332
- jul
7333
- ma
7334
- surface
7335
- mall
7336
- eps
7337
- green
7338
- map
7339
- space
7340
- donald
7341
- v2
7342
- sodu
7343
- ##light
7344
- reserved
7345
- htm
7346
- ##han
7347
- mod
7348
- ##ise
7349
- ##tions
7350
- ti
7351
- ##shi
7352
- doc
7353
- icp
7354
- wang
7355
- ##ram
7356
- shopping
7357
- aug
7358
- ##pi
7359
- ##well
7360
- now
7361
- wam
7362
- b2
7363
- ##hu
7364
- ##gb
7365
- f2
7366
- mix
7367
- ##ef
7368
- ##uan
7369
- bwl
7370
- ##plus
7371
- ##res
7372
- core
7373
- ##ess
7374
- tea
7375
- hktvmall
7376
- nhk
7377
- ##ate
7378
- list
7379
- ##ese
7380
- feb
7381
- inn
7382
- nov
7383
- daniel
7384
- ##ci
7385
- pass
7386
- ##bet
7387
- ##nk
7388
- coffee
7389
- ssl
7390
- airbnb
7391
- ##ute
7392
- fbi
7393
- woshipm
7394
- skype
7395
- ea
7396
- cg
7397
- sp
7398
- ##fc
7399
- ##www
7400
- yes
7401
- edge
7402
- alt
7403
- fpga
7404
- ##ght
7405
- ##gs
7406
- iso9001
7407
- ##ile
7408
- ##wood
7409
- ##uo
7410
- image
7411
- lin
7412
- icon
7413
- american
7414
- ##em
7415
- set
7416
- says
7417
- ##king
7418
- ##tive
7419
- blogger
7420
- ##ox
7421
- ##zy
7422
- ##red
7423
- ##ium
7424
- ##lf
7425
- nokia
7426
- claire
7427
- ##ding
7428
- november
7429
- lohas
7430
- ##tic
7431
- ##cs
7432
- ##che
7433
- ##ire
7434
- ##gy
7435
- ##ult
7436
- db
7437
- january
7438
- win
7439
- road
7440
- ptt
7441
- ##fa
7442
- ##mer
7443
- anna
7444
- pchome
7445
- udn
7446
- ef
7447
- ##time
7448
- ##tte
7449
- g20
7450
- white
7451
- garden
7452
- eleven
7453
- di
7454
- chen
7455
- young
7456
- cosplay
7457
- bat
7458
- ##tra
7459
- kindle
7460
- npc
7461
- steve
7462
- etc
7463
- ##ern
7464
- call
7465
- xperia
7466
- ces
7467
- travel
7468
- sk
7469
- s7
7470
- ##ous
7471
- ##int
7472
- edu
7473
- file
7474
- cho
7475
- qr
7476
- ##car
7477
- ##our
7478
- ##ant
7479
- eric
7480
- rends
7481
- ##jo
7482
- mastercard
7483
- kb
7484
- ##min
7485
- ##ino
7486
- vista
7487
- ##ris
7488
- ##ud
7489
- jack
7490
- ##set
7491
- pos
7492
- ##her
7493
- ##ou
7494
- taipei
7495
- beta
7496
- ##fi
7497
- express
7498
- body
7499
- ##ill
7500
- aphojoy
7501
- user
7502
- december
7503
- meiki
7504
- ##ick
7505
- tweet
7506
- richard
7507
- ##av
7508
- iphone6
7509
- ##dd
7510
- views
7511
- ##mark
7512
- pd
7513
- times
7514
- level
7515
- ##ash
7516
- point
7517
- ##ome
7518
- koreanmall
7519
- ##ak
7520
- george
7521
- q2
7522
- wma
7523
- tcp
7524
- full
7525
- mlb
7526
- ##lle
7527
- ##watch
7528
- tm
7529
- run
7530
- smith
7531
- business
7532
- ##und
7533
- color
7534
- ##tal
7535
- ##less
7536
- moon
7537
- ##rl
7538
- update
7539
- pcb
7540
- shop
7541
- little
7542
- end
7543
- ##mhz
7544
- van
7545
- dsp
7546
- easy
7547
- ##house
7548
- ##key
7549
- history
7550
- oh
7551
- ##hy
7552
- ##web
7553
- oem
7554
- let
7555
- was
7556
- ##gg
7557
- review
7558
- ##wan
7559
- ##°c
7560
- uc
7561
- title
7562
- ##val
7563
- united
7564
- ##ons
7565
- doi
7566
- trivago
7567
- overdope
7568
- sbs
7569
- ##ance
7570
- grand
7571
- special
7572
- imf
7573
- wx17house
7574
- ##so
7575
- audi
7576
- ##he
7577
- london
7578
- william
7579
- ##rp
7580
- ##ake
7581
- science
7582
- beach
7583
- cfa
7584
- amp
7585
- ps4
7586
- ##link
7587
- ##hp
7588
- crm
7589
- ferragamo
7590
- bell
7591
- make
7592
- ##eng
7593
- under
7594
- zh
7595
- photos
7596
- ##style
7597
- via
7598
- da
7599
- ##gi
7600
- company
7601
- i7
7602
- ##ray
7603
- thomas
7604
- ufo
7605
- i5
7606
- ##max
7607
- plc
7608
- ben
7609
- back
7610
- research
7611
- mike
7612
- ##pc
7613
- september
7614
- ##ace
7615
- vps
7616
- february
7617
- pantos
7618
- wp
7619
- lisa
7620
- jquery
7621
- night
7622
- long
7623
- offer
7624
- ##berg
7625
- ##news
7626
- ray
7627
- fks
7628
- wto
7629
- over
7630
- ##all
7631
- ##rus
7632
- ##works
7633
- blogtitle
7634
- loftpermalink
7635
- martin
7636
- test
7637
- ling
7638
- km
7639
- fda
7640
- v3
7641
- ##ja
7642
- outlet
7643
- family
7644
- ##ea
7645
- ##top
7646
- story
7647
- ##ness
7648
- salvatore
7649
- ##lu
7650
- swift
7651
- room
7652
- oracle
7653
- ##ul
7654
- sam
7655
- b2c
7656
- week
7657
- pi
7658
- rock
7659
- ##ean
7660
- ##gle
7661
- cctv
7662
- after
7663
- chinese
7664
- ##back
7665
- powered
7666
- x2
7667
- ##tan
7668
- ##nes
7669
- canon
7670
- only
7671
- ##zi
7672
- ##las
7673
- say
7674
- ##oe
7675
- ##sd
7676
- ##bot
7677
- ##world
7678
- ##zo
7679
- sky
7680
- made
7681
- top100
7682
- just
7683
- pmi
7684
- gap
7685
- ##vr
7686
- les
7687
- ball
7688
- vogue
7689
- vi
7690
- ing
7691
- ofweek
7692
- cos
7693
- ##list
7694
- ##ort
7695
- ##lon
7696
- last
7697
- ##tc
7698
- ##of
7699
- ##bus
7700
- ##gen
7701
- real
7702
- eva
7703
- a3
7704
- nas
7705
- ##lie
7706
- ##ria
7707
- ##coin
7708
- ##bt
7709
- his
7710
- cat
7711
- nata
7712
- vive
7713
- health
7714
- drive
7715
- sir
7716
- du
7717
- cup
7718
- ##ook
7719
- ##sy
7720
- alex
7721
- msg
7722
- tour
7723
- ##word
7724
- ebooks
7725
- r8
7726
- block
7727
- nice
7728
- pvp
7729
- months
7730
- rewards
7731
- ##ther
7732
- ##xi
7733
- ##sc
7734
- micro
7735
- gg
7736
- blogfp
7737
- op
7738
- daily
7739
- m1
7740
- true
7741
- ##bb
7742
- ml
7743
- ##tar
7744
- ##ky
7745
- anthony
7746
- ##yo
7747
- state
7748
- ##ara
7749
- ##aa
7750
- ##rc
7751
- ##tz
7752
- ##ston
7753
- gear
7754
- ##eo
7755
- ##ade
7756
- ge
7757
- see
7758
- ##win
7759
- ##ura
7760
- ss
7761
- heart
7762
- ##den
7763
- ##ita
7764
- down
7765
- ##sm
7766
- el
7767
- png
7768
- rakuten
7769
- whatsapp
7770
- bay
7771
- dream
7772
- add
7773
- ##use
7774
- pad
7775
- gucci
7776
- mpv
7777
- ##ode
7778
- ##fo
7779
- island
7780
- jason
7781
- chicago
7782
- ##hone
7783
- io
7784
- sogo
7785
- be2
7786
- ##ology
7787
- cloud
7788
- vcd
7789
- ##con
7790
- ##ford
7791
- ##joy
7792
- ##kb
7793
- ##rade
7794
- but
7795
- ##ach
7796
- docker
7797
- ##ful
7798
- rfid
7799
- ul
7800
- ##ase
7801
- hit
7802
- ford
7803
- ##star
7804
- a2
7805
- sdk
7806
- reading
7807
- edited
7808
- ##are
7809
- cmos
7810
- ##mc
7811
- siri
7812
- light
7813
- ##ella
7814
- bloomberg
7815
- ##read
7816
- pizza
7817
- ##ison
7818
- jimmy
7819
- ##vm
7820
- college
7821
- node
7822
- journal
7823
- ba
7824
- ##play
7825
- ##cer
7826
- magic
7827
- ##yu
7828
- jump
7829
- tt
7830
- ##ings
7831
- asr
7832
- ##lia
7833
- step5
7834
- network
7835
- ##cd
7836
- mc
7837
- pixstyleme
7838
- money
7839
- bl
7840
- act
7841
- ##tus
7842
- tokyo
7843
- ##rial
7844
- ##life
7845
- emba
7846
- ##ae
7847
- saas
7848
- tcs
7849
- ##rk
7850
- ##wang
7851
- summer
7852
- ##sp
7853
- ko
7854
- ##ving
7855
- premium
7856
- netflix
7857
- uk
7858
- mt
7859
- ##lton
7860
- right
7861
- frank
7862
- two
7863
- ##ple
7864
- ##cal
7865
- ##sen
7866
- ##ville
7867
- hold
7868
- nexus
7869
- dd
7870
- ##ius
7871
- ##mah
7872
- tila
7873
- zero
7874
- ce
7875
- ##tin
7876
- resort
7877
- ##ws
7878
- charles
7879
- old
7880
- p10
7881
- report
7882
- ##ru
7883
- bus
7884
- vans
7885
- lt
7886
- ##est
7887
- pv
7888
- links
7889
- rebecca
7890
- ##dm
7891
- azure
7892
- limited
7893
- bit
7894
- ##mon
7895
- moto
7896
- ##eam
7897
- var
7898
- eos
7899
- blogspot
7900
- e3
7901
- dos
7902
- dm
7903
- fc
7904
- ##ments
7905
- ##ik
7906
- ##kw
7907
- boy
7908
- ##bin
7909
- ##ata
7910
- er
7911
- ##vin
7912
- ##tu
7913
- ##ula
7914
- station
7915
- ##ature
7916
- files
7917
- zara
7918
- hdr
7919
- top10
7920
- nature
7921
- magazine
7922
- s6
7923
- marriott
7924
- avira
7925
- case
7926
- tab
7927
- ##ran
7928
- tony
7929
- ##home
7930
- oculus
7931
- im
7932
- ##ral
7933
- jean
7934
- saint
7935
- cry
7936
- rosie
7937
- ##force
7938
- ##ini
7939
- ice
7940
- ##bert
7941
- ##nder
7942
- ##mber
7943
- pet
7944
- plurk
7945
- ##sis
7946
- ##ence
7947
- tim
7948
- ##nc
7949
- ##name
7950
- log
7951
- ips
7952
- great
7953
- ikea
7954
- malaysia
7955
- unix
7956
- ##ncy
7957
- ##nie
7958
- akb48
7959
- ##ye
7960
- ##oid
7961
- ##chi
7962
- oa
7963
- xuehai
7964
- ##orm
7965
- ##rf
7966
- ##ware
7967
- ho
7968
- ##pro
7969
- text
7970
- ##era
7971
- bob
7972
- ##ub
7973
- scp
7974
- avi
7975
- ##zen
7976
- mi
7977
- wu
7978
- museum
7979
- qvod
7980
- apache
7981
- lake
7982
- jcb
7983
- ni
7984
- ##hr
7985
- hill
7986
- ne
7987
- weibo
7988
- ruby
7989
- ##row
7990
- iv
7991
- ##ish
7992
- github
7993
- mate
7994
- ##lot
7995
- ##ane
7996
- andrew
7997
- ##tina
7998
- t1
7999
- rf
8000
- ed2k
8001
- ##vel
8002
- way
8003
- final
8004
- ns
8005
- sweet
8006
- bytes
8007
- ##ene
8008
- ##cker
8009
- ##px
8010
- topapp
8011
- helpapp
8012
- rs
8013
- low
8014
- g4g
8015
- care
8016
- ldquo
8017
- ##fork
8018
- leave
8019
- rm
8020
- edition
8021
- ##gan
8022
- ##zon
8023
- ##qq
8024
- ##google
8025
- ##ism
8026
- gold
8027
- explorer
8028
- ##zer
8029
- toyota
8030
- category
8031
- select
8032
- visual
8033
- ##labels
8034
- restaurant
8035
- ##md
8036
- posts
8037
- s1
8038
- ##ico
8039
- angelababy
8040
- sports
8041
- s3
8042
- mbc
8043
- shell
8044
- x86
8045
- candy
8046
- ##new
8047
- kbs
8048
- face
8049
- xl
8050
- ##here
8051
- swissinfo
8052
- v8
8053
- dram
8054
- ##ual
8055
- ##vice
8056
- ##wer
8057
- sport
8058
- q1
8059
- ios10
8060
- public
8061
- int
8062
- card
8063
- ep
8064
- au
8065
- rt
8066
- bill
8067
- ##mll
8068
- kim
8069
- wan
8070
- ##uk
8071
- x3
8072
- scott
8073
- ##ming
8074
- e5
8075
- h7n9
8076
- worldcat
8077
- brown
8078
- ##vo
8079
- ##led
8080
- ##ax
8081
- ##ert
8082
- paris
8083
- polo
8084
- ##lr
8085
- capital
8086
- ##hing
8087
- bank
8088
- cv
8089
- ##chat
8090
- adc
8091
- ##ule
8092
- digital
8093
- hotmail
8094
- ##pad
8095
- bbq
8096
- quot
8097
- ##ring
8098
- before
8099
- wali
8100
- mcu
8101
- costco
8102
- north
8103
- switch
8104
- ##city
8105
- philips
8106
- ##mann
8107
- management
8108
- panasonic
8109
- ##cl
8110
- ##vd
8111
- ##ping
8112
- ##rge
8113
- alice
8114
- ##lk
8115
- css3
8116
- ##ney
8117
- vision
8118
- alpha
8119
- ##ular
8120
- ##tter
8121
- lz
8122
- mode
8123
- gre
8124
- pci
8125
- ##tm
8126
- ##yan
8127
- ##let
8128
- work
8129
- war
8130
- coach
8131
- ah
8132
- mary
8133
- huang
8134
- ##pt
8135
- a8
8136
- pt
8137
- follow
8138
- ##berry
8139
- ##ew
8140
- a5
8141
- ghost
8142
- ##wn
8143
- ##og
8144
- south
8145
- ##code
8146
- girls
8147
- ##rid
8148
- action
8149
- villa
8150
- git
8151
- r11
8152
- table
8153
- games
8154
- ##cket
8155
- error
8156
- ##anonymoussaid
8157
- ##ag
8158
- here
8159
- ##ame
8160
- ##gc
8161
- qa
8162
- ##lis
8163
- gmp
8164
- ##gin
8165
- vmalife
8166
- ##cher
8167
- yu
8168
- wedding
8169
- ##tis
8170
- demo
8171
- dragon
8172
- soho
8173
- social
8174
- bye
8175
- ##rant
8176
- river
8177
- orz
8178
- acer
8179
- ##ats
8180
- del
8181
- ##ven
8182
- ups
8183
- value
8184
- macd
8185
- yougou
8186
- ##dn
8187
- ##ano
8188
- ll
8189
- ##urt
8190
- ##rent
8191
- continue
8192
- script
8193
- ##wen
8194
- ##ect
8195
- paper
8196
- shift
8197
- ##chel
8198
- ##cat
8199
- x5
8200
- fox
8201
- car
8202
- aaa
8203
- ##blog
8204
- loading
8205
- ##yn
8206
- ##tp
8207
- kuso
8208
- si
8209
- sns
8210
- rmb
8211
- vdc
8212
- forest
8213
- central
8214
- prime
8215
- help
8216
- ultra
8217
- ##rmb
8218
- square
8219
- ##field
8220
- ##reen
8221
- ##ors
8222
- ##ju
8223
- c1
8224
- start
8225
- ##air
8226
- ##map
8227
- cdn
8228
- ##wo
8229
- cba
8230
- stephen
8231
- m8
8232
- ##get
8233
- opera
8234
- ##base
8235
- ##ood
8236
- vsa
8237
- com™
8238
- ##aw
8239
- ##ail
8240
- count
8241
- t2
8242
- ##een
8243
- hop
8244
- ##gp
8245
- vsc
8246
- tree
8247
- ##eg
8248
- ##ose
8249
- ##ories
8250
- ##shop
8251
- alphago
8252
- v4
8253
- simon
8254
- fluke62max
8255
- zip
8256
- ##sta
8257
- louis
8258
- cr
8259
- bas
8260
- bc
8261
- ##yer
8262
- hadoop
8263
- ##ube
8264
- ##wi
8265
- hola
8266
- ##low
8267
- place
8268
- centre
8269
- d3
8270
- ##fer
8271
- ##media
8272
- exchange
8273
- series
8274
- ##san
8275
- eb
8276
- ##bank
8277
- q3
8278
- ##nge
8279
- ##mail
8280
- take
8281
- ##lp
8282
- client
8283
- east
8284
- cache
8285
- event
8286
- vincent
8287
- ##nse
8288
- sui
8289
- adchoice
8290
- ##stry
8291
- ##zone
8292
- ga
8293
- apps
8294
- sea
8295
- ##ab
8296
- cisco
8297
- ##rner
8298
- kymco
8299
- ##care
8300
- dha
8301
- ##pu
8302
- ##yi
8303
- minkoff
8304
- royal
8305
- p1
8306
- annie
8307
- collection
8308
- kpi
8309
- playstation
8310
- bh
8311
- ##bar
8312
- queen
8313
- radio
8314
- andy
8315
- armani
8316
- ##xy
8317
- manager
8318
- iherb
8319
- ##ery
8320
- ##share
8321
- spring
8322
- raid
8323
- johnson
8324
- ##ob
8325
- volvo
8326
- hall
8327
- ##ball
8328
- v6
8329
- our
8330
- taylor
8331
- ##hk
8332
- bi
8333
- ##cp
8334
- kate
8335
- bo
8336
- water
8337
- technology
8338
- ##rie
8339
- ##ona
8340
- ##sl
8341
- hpv
8342
- gtx
8343
- hip
8344
- rdquo
8345
- jayz
8346
- stone
8347
- ##lex
8348
- ##rum
8349
- namespace
8350
- ##ale
8351
- ##atic
8352
- des
8353
- ##erson
8354
- ##ql
8355
- ##ves
8356
- ##type
8357
- enter
8358
- d2
8359
- ##mix
8360
- ##bian
8361
- a9
8362
- jj
8363
- ky
8364
- ##lc
8365
- access
8366
- movie
8367
- ##hc
8368
- tower
8369
- ##ration
8370
- ##mit
8371
- ##nch
8372
- ua
8373
- tel
8374
- prefix
8375
- ##o2
8376
- ##point
8377
- ott
8378
- ##http
8379
- ##ury
8380
- baidu
8381
- ##ink
8382
- member
8383
- ##logy
8384
- bigbang
8385
- nownews
8386
- ##js
8387
- ##shot
8388
- ##tb
8389
- eba
8390
- ##tics
8391
- ##lus
8392
- v5
8393
- spark
8394
- ##ama
8395
- there
8396
- ##ions
8397
- god
8398
- ##lls
8399
- ##down
8400
- hiv
8401
- ##ress
8402
- burberry
8403
- day2
8404
- ##kv
8405
- jeff
8406
- related
8407
- film
8408
- edit
8409
- joseph
8410
- ##ark
8411
- cx
8412
- order
8413
- g9
8414
- ##ans
8415
- ##tty
8416
- s5
8417
- ##bee
8418
- thread
8419
- xr
8420
- buy
8421
- sh
8422
- land
8423
- spotify
8424
- mx
8425
- ##ari
8426
- ##verse
8427
- sf
8428
- why
8429
- nego
8430
- sunny
8431
- dom
8432
- exo
8433
- positioning
8434
- fit
8435
- rgb
8436
- ##tton
8437
- kiss
8438
- alexa
8439
- adam
8440
- lp
8441
- mp
8442
- ##ties
8443
- ##llow
8444
- amy
8445
- ##du
8446
- np
8447
- institute
8448
- ##rth
8449
- ##lar
8450
- ##des
8451
- sidebar
8452
- imax
8453
- site
8454
- ##cky
8455
- ##kit
8456
- ##ime
8457
- season
8458
- ##fun
8459
- gogoro
8460
- a7
8461
- pu
8462
- lily
8463
- fire
8464
- twd600
8465
- ##vis
8466
- ##cture
8467
- information
8468
- close
8469
- friday
8470
- yi
8471
- nick
8472
- ##tta
8473
- ##tel
8474
- ##lock
8475
- cbd
8476
- economy
8477
- tinker
8478
- double
8479
- voice
8480
- ##app
8481
- oops
8482
- channel
8483
- today
8484
- ##right
8485
- raw
8486
- xyz
8487
- jim
8488
- edm
8489
- ##cent
8490
- supreme
8491
- ds
8492
- ##its
8493
- ##asia
8494
- dropbox
8495
- ##tti
8496
- books
8497
- ##tle
8498
- ##ller
8499
- ##ken
8500
- ##more
8501
- ##boy
8502
- sex
8503
- ##dom
8504
- t3
8505
- ##ider
8506
- ##unch
8507
- feel
8508
- ##put
8509
- s2
8510
- mo
8511
- ##gh
8512
- men
8513
- ka
8514
- amoled
8515
- div
8516
- ##tr
8517
- ##n1
8518
- port
8519
- howard
8520
- ##tags
8521
- ken
8522
- dnf
8523
- ##nus
8524
- adsense
8525
- ide
8526
- buff
8527
- thunder
8528
- ##town
8529
- ##ique
8530
- has
8531
- ##body
8532
- auto
8533
- pin
8534
- ##erry
8535
- tee
8536
- number
8537
- ##the
8538
- object
8539
- psp
8540
- cool
8541
- udnbkk
8542
- ##mic
8543
- miui
8544
- ##tro
8545
- most
8546
- r2
8547
- ##alk
8548
- ##nity
8549
- s4
8550
- law
8551
- version
8552
- ##oa
8553
- n1
8554
- sgs
8555
- docomo
8556
- ##tf
8557
- ##ack
8558
- henry
8559
- fc2
8560
- ##ded
8561
- ##sco
8562
- ##rite
8563
- linkedin
8564
- ##ada
8565
- ##now
8566
- wii
8567
- ##ndy
8568
- ucbug
8569
- sputniknews
8570
- legalminer
8571
- ##ika
8572
- ##xp
8573
- ##bu
8574
- q10
8575
- oo
8576
- b6
8577
- come
8578
- ##rman
8579
- cheese
8580
- ming
8581
- maker
8582
- ##gm
8583
- nikon
8584
- ##fig
8585
- ppi
8586
- kelly
8587
- jchere
8588
- ted
8589
- md
8590
- fgo
8591
- tech
8592
- ##tto
8593
- dan
8594
- soc
8595
- ##gl
8596
- ##len
8597
- hair
8598
- earth
8599
- img
8600
- ##pper
8601
- ##a1
8602
- acca
8603
- ##ition
8604
- ##ference
8605
- suite
8606
- ##ig
8607
- outlook
8608
- ##mond
8609
- ##cation
8610
- ##pr
8611
- airport
8612
- ##over
8613
- jones
8614
- ##ith
8615
- lab
8616
- ##su
8617
- co2
8618
- town
8619
- piece
8620
- ##llo
8621
- no1
8622
- vmware
8623
- ##qi
8624
- focus
8625
- reader
8626
- ##admin
8627
- ##ora
8628
- tb
8629
- false
8630
- ##log
8631
- know
8632
- lan
8633
- ##ces
8634
- f4
8635
- ##ume
8636
- motel
8637
- stop
8638
- ##oper
8639
- na
8640
- flickr
8641
- netcomponents
8642
- ##af
8643
- pose
8644
- williams
8645
- local
8646
- ##ound
8647
- ##cg
8648
- ##site
8649
- ##iko
8650
- gsm
8651
- con
8652
- ##ath
8653
- friends
8654
- ##hip
8655
- cell
8656
- ##rey
8657
- cream
8658
- ##cks
8659
- ##dp
8660
- facebooktwitterpinterestgoogle
8661
- sso
8662
- shtml
8663
- song
8664
- swiss
8665
- ##mw
8666
- lumia
8667
- xdd
8668
- string
8669
- tiffany
8670
- marc
8671
- insee
8672
- russell
8673
- sc
8674
- dell
8675
- ##ations
8676
- camera
8677
- ##vs
8678
- ##flow
8679
- ##late
8680
- classic
8681
- ##nter
8682
- stay
8683
- g1
8684
- mtv
8685
- ##ever
8686
- ##lab
8687
- ##nger
8688
- qe
8689
- sata
8690
- ryan
8691
- d1
8692
- cms
8693
- ##cing
8694
- su
8695
- editor
8696
- ##nap
8697
- security
8698
- sunday
8699
- association
8700
- ##ens
8701
- ##bra
8702
- acg
8703
- sofascore
8704
- mkv
8705
- ##ign
8706
- jonathan
8707
- gary
8708
- build
8709
- labels
8710
- ##oto
8711
- tesla
8712
- moba
8713
- qi
8714
- gohappy
8715
- general
8716
- ajax
8717
- society
8718
- ##test
8719
- ##urs
8720
- wps
8721
- fedora
8722
- ##ich
8723
- mozilla
8724
- ##dr
8725
- usa
8726
- urn
8727
- ##lina
8728
- grace
8729
- ##die
8730
- ##try
8731
- ##ader
8732
- elle
8733
- ##chen
8734
- price
8735
- ##ten
8736
- uhz
8737
- ##ough
8738
- eq
8739
- ##hen
8740
- states
8741
- push
8742
- session
8743
- balance
8744
- wow
8745
- ##cus
8746
- ##py
8747
- when
8748
- ##ward
8749
- ##ep
8750
- wong
8751
- library
8752
- prada
8753
- ##cle
8754
- running
8755
- ##ree
8756
- ck
8757
- date
8758
- q4
8759
- ##ctive
8760
- ##ool
8761
- mk
8762
- ##ira
8763
- die
8764
- secret
8765
- rq
8766
- dota
8767
- buffet
8768
- e6
8769
- ##ez
8770
- pan
8771
- ha
8772
- ##card
8773
- ##cha
8774
- alan
8775
- day3
8776
- eye
8777
- f3
8778
- ##end
8779
- france
8780
- keep
8781
- adi
8782
- rna
8783
- tvbs
8784
- ##ala
8785
- solo
8786
- nova
8787
- ##tail
8788
- support
8789
- ##ries
8790
- ##ved
8791
- base
8792
- copy
8793
- iis
8794
- fps
8795
- ##ways
8796
- hero
8797
- hgih
8798
- profile
8799
- fish
8800
- mu
8801
- ssh
8802
- entertainment
8803
- chang
8804
- ##wd
8805
- click
8806
- cake
8807
- ##ond
8808
- pre
8809
- ##tom
8810
- kic
8811
- pixel
8812
- ##ov
8813
- ##fl
8814
- product
8815
- ##pd
8816
- dear
8817
- ##gate
8818
- es
8819
- yumi
8820
- audio
8821
- ##²
8822
- ##sky
8823
- echo
8824
- bin
8825
- where
8826
- ##ture
8827
- ##ape
8828
- find
8829
- sap
8830
- isis
8831
- nand
8832
- ##load
8833
- ##ream
8834
- band
8835
- a6
8836
- never
8837
- ##post
8838
- festival
8839
- ##we
8840
- guide
8841
- zenfone
8842
- ##ike
8843
- gd
8844
- forum
8845
- jessica
8846
- strong
8847
- alexander
8848
- ##ould
8849
- software
8850
- allen
8851
- ##ious
8852
- program
8853
- else
8854
- lohasthree
8855
- ##gar
8856
- please
8857
- rc
8858
- ##ggle
8859
- ##ric
8860
- bim
8861
- ##own
8862
- eclipse
8863
- brian
8864
- ##side
8865
- ##other
8866
- ##tech
8867
- ##ator
8868
- engine
8869
- ##ged
8870
- plaza
8871
- ##fit
8872
- cia
8873
- ngo
8874
- westbrook
8875
- shi
8876
- tbs
8877
- sci
8878
- reuters
8879
- ##ily
8880
- contextlink
8881
- ##hn
8882
- af
8883
- ##cil
8884
- bridge
8885
- very
8886
- ##cel
8887
- cambridge
8888
- ##ize
8889
- ##aid
8890
- ##data
8891
- frm
8892
- ##head
8893
- award
8894
- butler
8895
- ##sun
8896
- meta
8897
- ##mar
8898
- america
8899
- ps3
8900
- puma
8901
- pmid
8902
- lc
8903
- kitchen
8904
- ##lic
8905
- day1
8906
- future
8907
- ##text
8908
- ##page
8909
- ##rris
8910
- pm1
8911
- ##ket
8912
- fans
8913
- christian
8914
- bot
8915
- kids
8916
- trackback
8917
- ##hai
8918
- c3
8919
- display
8920
- ##hl
8921
- n2
8922
- idea
8923
- ##sent
8924
- airmail
8925
- ##ug
8926
- ##men
8927
- pwm
8928
- ##lution
8929
- awards
8930
- schemas
8931
- asics
8932
- wikipedia
8933
- font
8934
- ##tional
8935
- ##vy
8936
- c2
8937
- ##dget
8938
- ##ein
8939
- contact
8940
- pepper
8941
- ##uel
8942
- ##ument
8943
- ##hang
8944
- q5
8945
- ##sue
8946
- rain
8947
- ##ndi
8948
- wei
8949
- swatch
8950
- ##cept
8951
- popular
8952
- ##ste
8953
- ##tag
8954
- p2
8955
- trc
8956
- ##west
8957
- ##live
8958
- justin
8959
- honda
8960
- ping
8961
- messenger
8962
- ##rap
8963
- v9
8964
- unity
8965
- appqq
8966
- leo
8967
- ##tone
8968
- ##ass
8969
- uniqlo
8970
- her
8971
- jane
8972
- memory
8973
- moneydj
8974
- ##tical
8975
- human
8976
- ##m2
8977
- coc
8978
- miacare
8979
- ##mn
8980
- tmt
8981
- ##core
8982
- vim
8983
- kk
8984
- ##may
8985
- fan
8986
- target
8987
- use
8988
- too
8989
- fast
8990
- services
8991
- ##ope
8992
- omega
8993
- energy
8994
- pinkoi
8995
- ##rain
8996
- jackson
8997
- ##ement
8998
- p9
8999
- rd
9000
- ##tier
9001
- ##vic
9002
- zone
9003
- dl
9004
- isofix
9005
- cpa
9006
- m4
9007
- kimi
9008
- davis
9009
- ##lay
9010
- lulu
9011
- ##uck
9012
- weeks
9013
- qs
9014
- ##hop
9015
- ae
9016
- ##ear
9017
- eia
9018
- ##fly
9019
- korea
9020
- jpeg
9021
- boost
9022
- ##ship
9023
- small
9024
- eur
9025
- valley
9026
- ##iel
9027
- simple
9028
- ##ude
9029
- rn
9030
- k2
9031
- ##ena
9032
- non
9033
- patrick
9034
- feed
9035
- process
9036
- well
9037
- qqmei
9038
- ##thing
9039
- they
9040
- aws
9041
- lu
9042
- pink
9043
- ##ters
9044
- ##kin
9045
- board
9046
- ##vertisement
9047
- wine
9048
- ##ien
9049
- unicode
9050
- ##dge
9051
- r1
9052
- ##tant
9053
- ##twitter
9054
- cool1
9055
- isp
9056
- standard
9057
- matt
9058
- ##fu
9059
- ##iner
9060
- googlemsn
9061
- pixnetfacebookyahoo
9062
- x7
9063
- ##uce
9064
- sao
9065
- ##ev
9066
- ##file
9067
- xddd
9068
- shirt
9069
- ##rio
9070
- ##hat
9071
- givenchy
9072
- ya
9073
- bang
9074
- ##lio
9075
- monday
9076
- crystal
9077
- ##abc
9078
- head
9079
- ubuntuforumwikilinuxpastechat
9080
- ##vc
9081
- ##rity
9082
- cnc
9083
- ipv6
9084
- null
9085
- ##ost
9086
- yang
9087
- imsean
9088
- tiger
9089
- ##fet
9090
- dji
9091
- ji
9092
- maria
9093
- ##come
9094
- foundation
9095
- ##beth
9096
- active
9097
- ##aft
9098
- ##don
9099
- sr
9100
- emma
9101
- ##khz
9102
- living
9103
- sas
9104
- x6
9105
- ##face
9106
- pptv
9107
- x4
9108
- ##mate
9109
- han
9110
- sophie
9111
- ##jing
9112
- fifa
9113
- ##mand
9114
- other
9115
- sale
9116
- inwedding
9117
- ##gn
9118
- ##mmy
9119
- ##pmlast
9120
- bad
9121
- nana
9122
- nbc
9123
- ##wu
9124
- note7
9125
- single
9126
- ##bel
9127
- window
9128
- ##dio
9129
- ##ht
9130
- union
9131
- age
9132
- ##ivity
9133
- domain
9134
- neo
9135
- ##isa
9136
- ##lter
9137
- f5
9138
- steven
9139
- ##cts
9140
- powerpoint
9141
- tft
9142
- self
9143
- g2
9144
- ft
9145
- zol
9146
- ##act
9147
- mwc
9148
- nbapop
9149
- eds
9150
- ace
9151
- ##room
9152
- previous
9153
- author
9154
- tomtom
9155
- il
9156
- ##ets
9157
- hu
9158
- financial
9159
- bp
9160
- chi
9161
- ##hg
9162
- fairmont
9163
- cross
9164
- gay
9165
- h2
9166
- function
9167
- also
9168
- ##raph
9169
- ##ils
9170
- i3
9171
- avenue
9172
- ##host
9173
- ##bon
9174
- ##tsu
9175
- message
9176
- navigation
9177
- fintech
9178
- h6
9179
- ##ject
9180
- ##vas
9181
- ##firm
9182
- credit
9183
- ##wf
9184
- xxxx
9185
- form
9186
- ##nor
9187
- ##space
9188
- huawei
9189
- plan
9190
- json
9191
- sbl
9192
- ##dc
9193
- machine
9194
- wish
9195
- ##sol
9196
- windows7
9197
- edward
9198
- development
9199
- washington
9200
- ##nsis
9201
- lo
9202
- ##sio
9203
- ##ym
9204
- ##bor
9205
- planet
9206
- ##wt
9207
- ieee
9208
- gpa
9209
- camp
9210
- ann
9211
- gm
9212
- ##tw
9213
- ##oka
9214
- connect
9215
- ##rss
9216
- ##work
9217
- ##atus
9218
- wall
9219
- chicken
9220
- soul
9221
- ##times
9222
- fa
9223
- ##ather
9224
- ##cord
9225
- ##eep
9226
- hitachi
9227
- gui
9228
- harry
9229
- ##pan
9230
- e1
9231
- disney
9232
- ##press
9233
- wind
9234
- frigidaire
9235
- ##tl
9236
- liu
9237
- hsu
9238
- basic
9239
- von
9240
- ev
9241
- learning
9242
- ##ull
9243
- expedia
9244
- archives
9245
- change
9246
- ##wei
9247
- santa
9248
- cut
9249
- ins
9250
- turbo
9251
- brand
9252
- cf1
9253
- return
9254
- ##rip
9255
- h1
9256
- ##nis
9257
- application
9258
- emc
9259
- rx
9260
- ##oon
9261
- quick
9262
- wilson
9263
- wing
9264
- chapter
9265
- ##bug
9266
- beyond
9267
- ##cms
9268
- ##dar
9269
- ##oh
9270
- zoom
9271
- e2
9272
- trip
9273
- sb
9274
- ##nba
9275
- rcep
9276
- aspx
9277
- ci
9278
- gc
9279
- gnu
9280
- ##count
9281
- advanced
9282
- dance
9283
- dv
9284
- ##url
9285
- ##ging
9286
- am09
9287
- shadow
9288
- battle
9289
- ##cia
9290
- emily
9291
- ##tation
9292
- host
9293
- ff
9294
- techorz
9295
- sars
9296
- ##mini
9297
- ##mporary
9298
- ##ering
9299
- nc
9300
- ##next
9301
- cma
9302
- ##mbps
9303
- ##gas
9304
- ##ift
9305
- ##dot
9306
- amana
9307
- ##ros
9308
- ir
9309
- ##eet
9310
- ##ible
9311
- ##aka
9312
- dcs
9313
- iq
9314
- l1
9315
- ##lor
9316
- maggie
9317
- ##iu
9318
- ##gt
9319
- articles
9320
- create
9321
- ##burg
9322
- ##iki
9323
- database
9324
- fantasy
9325
- ##rex
9326
- ##cam
9327
- dlc
9328
- dean
9329
- ##you
9330
- hard
9331
- path
9332
- gaming
9333
- victoria
9334
- maps
9335
- cb
9336
- ##lee
9337
- ##itor
9338
- overchicstoretvhome
9339
- systems
9340
- ##xt
9341
- p3
9342
- sarah
9343
- ##nan
9344
- x9
9345
- install
9346
- second
9347
- ##ann
9348
- ##ph
9349
- ##rcle
9350
- ##nic
9351
- ##nar
9352
- ec
9353
- metro
9354
- chocolate
9355
- ##rian
9356
- ##table
9357
- skin
9358
- ##sn
9359
- mountain
9360
- inparadise
9361
- ib
9362
- ##jia
9363
- eeworld
9364
- creative
9365
- g5
9366
- g3
9367
- parker
9368
- ecfa
9369
- village
9370
- sylvia
9371
- hbl
9372
- ##ques
9373
- ##onsored
9374
- ##x2
9375
- ##v4
9376
- ##tein
9377
- ie6
9378
- ##stack
9379
- ver
9380
- ##ads
9381
- ##baby
9382
- sound
9383
- bbe
9384
- ##lone
9385
- ##uid
9386
- ads
9387
- gundam
9388
- thinkpad
9389
- scrum
9390
- match
9391
- ##ave
9392
- mems
9393
- ##oy
9394
- ##talk
9395
- glass
9396
- lamigo
9397
- span
9398
- ##eme
9399
- job
9400
- ##a5
9401
- jay
9402
- wade
9403
- kde
9404
- ##lace
9405
- ocean
9406
- tvg
9407
- ##covery
9408
- ##r3
9409
- ##ners
9410
- ##rea
9411
- junior
9412
- think
9413
- ##aine
9414
- cover
9415
- ##ision
9416
- ##sia
9417
- ##bow
9418
- msi
9419
- ##love
9420
- soft
9421
- z2
9422
- ##pl
9423
- mobil
9424
- mind
9425
- ##uy
9426
- nginx
9427
- ##oi
9428
- ##rr
9429
- ##mple
9430
- ##sson
9431
- ##nts
9432
- comhd
9433
- crv3000
9434
- ##uard
9435
- deep
9436
- lost
9437
- field
9438
- gallery
9439
- ##bia
9440
- rate
9441
- spf
9442
- redis
9443
- traction
9444
- icloud
9445
- fe
9446
- jose
9447
- ##tory
9448
- into
9449
- sohu
9450
- fx
9451
- kicstart2
9452
- ##hia
9453
- ##sit
9454
- ra
9455
- ##walk
9456
- ##xure
9457
- ##pact
9458
- pacific
9459
- xa
9460
- natural
9461
- carlo
9462
- ##walker
9463
- ##can
9464
- cto
9465
- gigi
9466
- pen
9467
- ##hoo
9468
- ob
9469
- matlab
9470
- ##yy
9471
- ##iti
9472
- mango
9473
- ##bbs
9474
- sense
9475
- c5
9476
- oxford
9477
- walker
9478
- jennifer
9479
- ##ola
9480
- course
9481
- ##bre
9482
- ##pus
9483
- ##rder
9484
- lucky
9485
- ivy
9486
- ##nia
9487
- sotheby
9488
- side
9489
- ##ugh
9490
- joy
9491
- ##orage
9492
- ##ush
9493
- ##bat
9494
- ##dt
9495
- r9
9496
- ##gio
9497
- country
9498
- wear
9499
- ##lax
9500
- ##moon
9501
- seven
9502
- study
9503
- lonzo
9504
- evolution
9505
- ##kk
9506
- gs
9507
- kd
9508
- arduino
9509
- b12
9510
- ##lux
9511
- arpg
9512
- ##rdon
9513
- cook
9514
- ##x5
9515
- dark
9516
- five
9517
- ##als
9518
- ##ida
9519
- sign
9520
- something
9521
- ##nda
9522
- ##posted
9523
- fresh
9524
- tf
9525
- cam
9526
- ##mine
9527
- ##skip
9528
- ##form
9529
- ##ssion
9530
- education
9531
- ##tee
9532
- dyson
9533
- stage
9534
- ##jie
9535
- want
9536
- ##night
9537
- epson
9538
- pack
9539
- ##ppy
9540
- ##█
9541
- wd
9542
- ##eh
9543
- ##rence
9544
- left
9545
- ##lvin
9546
- golden
9547
- mhz
9548
- discovery
9549
- ##trix
9550
- ##n2
9551
- loft
9552
- ##uch
9553
- ##dra
9554
- ##sse
9555
- speed
9556
- sorry
9557
- welcome
9558
- ##urn
9559
- wave
9560
- gaga
9561
- ##lmer
9562
- teddy
9563
- rp
9564
- ##sha
9565
- rar
9566
- holiday
9567
- ##vg
9568
- ##nos
9569
- ##rail
9570
- gartner
9571
- gi
9572
- ##dium
9573
- kit
9574
- b3
9575
- eco
9576
- sean
9577
- ##stone
9578
- autocad
9579
- nu
9580
- ##np
9581
- f16
9582
- write
9583
- m5
9584
- ##ias
9585
- images
9586
- atp
9587
- ##dk
9588
- fsm
9589
- ve
9590
- ##xxx
9591
- ##cake
9592
- unit
9593
- lim
9594
- ru
9595
- ##ification
9596
- published
9597
- angela
9598
- analytics
9599
- ak
9600
- ##nel
9601
- gmt
9602
- ##icon
9603
- again
9604
- ##₂
9605
- ##bby
9606
- ios11
9607
- waze
9608
- ##ust
9609
- framework
9610
- iptv
9611
- delete
9612
- cl
9613
- wwdc
9614
- ##fw
9615
- ##xon
9616
- brandt
9617
- ##ses
9618
- ##dragon
9619
- tc
9620
- vetements
9621
- anne
9622
- monte
9623
- modern
9624
- official
9625
- ##ere
9626
- ##nne
9627
- ##oud
9628
- etnews
9629
- ##a2
9630
- ##graphy
9631
- ##rtex
9632
- l2
9633
- ##gma
9634
- mount
9635
- ccd
9636
- archive
9637
- morning
9638
- tan
9639
- ddos
9640
- e7
9641
- day4
9642
- gis
9643
- its
9644
- factory
9645
- bruce
9646
- pg
9647
- ##ito
9648
- guest
9649
- cdma
9650
- ##lling
9651
- n3
9652
- mega
9653
- eyes
9654
- ro
9655
- women
9656
- dac
9657
- church
9658
- ##jun
9659
- singapore
9660
- ##facebook
9661
- starbucks
9662
- ##tos
9663
- ##stin
9664
- ##shine
9665
- zen
9666
- ##mu
9667
- tina
9668
- request
9669
- ##gence
9670
- qt
9671
- q7
9672
- ##zzi
9673
- diary
9674
- ##tore
9675
- ##ead
9676
- cst
9677
- ##osa
9678
- canada
9679
- agent
9680
- va
9681
- ##jiang
9682
- ##lam
9683
- sg
9684
- ##nix
9685
- ##sday
9686
- g6
9687
- ##master
9688
- bing
9689
- ##zl
9690
- charlie
9691
- nb40
9692
- thai
9693
- ln284ct
9694
- ##itz
9695
- bonnie
9696
- ##food
9697
- ##lent
9698
- originals
9699
- ##stro
9700
- ##lts
9701
- ##bscribe
9702
- children
9703
- ntd
9704
- yesstyle
9705
- hmv
9706
- ##tment
9707
- d5
9708
- arts
9709
- sms
9710
- ##pn
9711
- topios9
9712
- lifestyle
9713
- virtual
9714
- ##ague
9715
- xz
9716
- ##deo
9717
- muji
9718
- unt
9719
- ##nnis
9720
- faq1
9721
- ##ette
9722
- fly
9723
- curry
9724
- ##pop
9725
- release
9726
- ##cast
9727
- ##ews
9728
- ##stle
9729
- ios7
9730
- ##ima
9731
- dog
9732
- lenovo
9733
- ##r4
9734
- roger
9735
- cbs
9736
- vornado
9737
- ##desk
9738
- ##ald
9739
- ##van
9740
- oil
9741
- some
9742
- break
9743
- common
9744
- ##jy
9745
- ##lines
9746
- g7
9747
- twice
9748
- ella
9749
- nano
9750
- belle
9751
- ##mes
9752
- ##self
9753
- ##note
9754
- jb
9755
- benz
9756
- ##ova
9757
- save
9758
- ##wing
9759
- kai
9760
- ##hua
9761
- ##rect
9762
- rainer
9763
- ##unge
9764
- adsl
9765
- guestname
9766
- ##uma
9767
- ##kins
9768
- ##zu
9769
- tokichoi
9770
- ##price
9771
- county
9772
- ##med
9773
- ##mus
9774
- rmk
9775
- address
9776
- vm
9777
- openload
9778
- ##group
9779
- ##hin
9780
- ##iginal
9781
- amg
9782
- urban
9783
- ##oz
9784
- jobs
9785
- emi
9786
- ##public
9787
- beautiful
9788
- ##sch
9789
- album
9790
- ##dden
9791
- ##bell
9792
- jerry
9793
- works
9794
- hostel
9795
- miller
9796
- ##drive
9797
- ##rmin
9798
- boot
9799
- ##fx
9800
- ##nome
9801
- ##ctionary
9802
- ##oman
9803
- ##lish
9804
- ##cr
9805
- ##hm
9806
- ##how
9807
- francis
9808
- xi
9809
- c919
9810
- b5
9811
- evernote
9812
- ##uc
9813
- vga
9814
- coupe
9815
- ##urg
9816
- ##cca
9817
- ##uality
9818
- multi
9819
- ##ett
9820
- em
9821
- hey
9822
- ##ani
9823
- ##tax
9824
- ##rma
9825
- inside
9826
- than
9827
- leonnhurt
9828
- ##jin
9829
- ict
9830
- bird
9831
- notes
9832
- ##dical
9833
- ##lli
9834
- result
9835
- iu
9836
- ee
9837
- smap
9838
- gopro
9839
- ##last
9840
- yin
9841
- pure
9842
- ##dan
9843
- ##rame
9844
- mama
9845
- ##oot
9846
- bean
9847
- marketing
9848
- ##hur
9849
- bella
9850
- sync
9851
- xuite
9852
- ##ground
9853
- discuz
9854
- ##getrelax
9855
- ##ince
9856
- ##bay
9857
- cj
9858
- gmat
9859
- apt
9860
- ##pass
9861
- jing
9862
- ##rix
9863
- c4
9864
- rich
9865
- niusnews
9866
- ##ello
9867
- bag
9868
- ##eting
9869
- ##mobile
9870
- culture
9871
- area
9872
- ##ience
9873
- details
9874
- gp
9875
- universal
9876
- silver
9877
- dit
9878
- private
9879
- ddd
9880
- u11
9881
- kanshu
9882
- ##ified
9883
- fung
9884
- ##nny
9885
- dx
9886
- tai
9887
- ##fr
9888
- ##lean
9889
- ##pin
9890
- ##rin
9891
- ly
9892
- rick
9893
- ##bility
9894
- usb3
9895
- banner
9896
- ##baru
9897
- ##gion
9898
- metal
9899
- dt
9900
- vdf
9901
- karl
9902
- qualcomm
9903
- bear
9904
- oldid
9905
- ian
9906
- jo
9907
- ##tors
9908
- population
9909
- ##ernel
9910
- mmorpg
9911
- ##mv
9912
- ##bike
9913
- ww
9914
- friend
9915
- ##ager
9916
- exhibition
9917
- ##del
9918
- ##pods
9919
- fpx
9920
- structure
9921
- ##free
9922
- ##tings
9923
- kl
9924
- ##rley
9925
- ##copyright
9926
- ##mma
9927
- california
9928
- orange
9929
- yoga
9930
- canmake
9931
- honey
9932
- ##anda
9933
- nikkie
9934
- dhl
9935
- publishing
9936
- ##mall
9937
- ##gnet
9938
- ##┅
9939
- e88
9940
- ##dog
9941
- fishbase
9942
- ##!
9943
- ##"
9944
- ###
9945
- ##$
9946
- ##%
9947
- ##&
9948
- ##'
9949
- ##(
9950
- ##)
9951
- ##*
9952
- ##+
9953
- ##,
9954
- ##-
9955
- ##.
9956
- ##/
9957
- ##:
9958
- ##;
9959
- ##<
9960
- ##=
9961
- ##>
9962
- ##?
9963
- ##@
9964
- ##[
9965
- ##\
9966
- ##]
9967
- ##^
9968
- ##_
9969
- ##{
9970
- ##|
9971
- ##}
9972
- ##~
9973
- ##£
9974
- ##¤
9975
- ##¥
9976
- ##§
9977
- ##«
9978
- ##±
9979
- ##³
9980
- ##µ
9981
- ##·
9982
- ##¹
9983
- ##º
9984
- ##»
9985
- ##¼
9986
- ##ß
9987
- ##æ
9988
- ##÷
9989
- ##ø
9990
- ##đ
9991
- ##ŋ
9992
- ##ɔ
9993
- ##ə
9994
- ##ɡ
9995
- ##ʰ
9996
- ##ˇ
9997
- ##ˈ
9998
- ##ˊ
9999
- ##ˋ
10000
- ##ˍ
10001
- ##ː
10002
- ##˙
10003
- ##˚
10004
- ##ˢ
10005
- ##α
10006
- ##β
10007
- ##γ
10008
- ##δ
10009
- ##ε
10010
- ##η
10011
- ##θ
10012
- ##ι
10013
- ##κ
10014
- ##λ
10015
- ##μ
10016
- ##ν
10017
- ##ο
10018
- ##π
10019
- ##ρ
10020
- ##ς
10021
- ##σ
10022
- ##τ
10023
- ##υ
10024
- ##φ
10025
- ##χ
10026
- ##ψ
10027
- ##б
10028
- ##в
10029
- ##г
10030
- ##д
10031
- ##е
10032
- ##ж
10033
- ##з
10034
- ##к
10035
- ##л
10036
- ##м
10037
- ##н
10038
- ##о
10039
- ##п
10040
- ##р
10041
- ##с
10042
- ##т
10043
- ##у
10044
- ##ф
10045
- ##х
10046
- ##ц
10047
- ##ч
10048
- ##ш
10049
- ##ы
10050
- ##ь
10051
- ##і
10052
- ##ก
10053
- ##ง
10054
- ##น
10055
- ##ม
10056
- ##ย
10057
- ##ร
10058
- ##อ
10059
- ##า
10060
- ##เ
10061
- ##๑
10062
- ##་
10063
- ##ღ
10064
- ##ᵃ
10065
- ##ᵉ
10066
- ##ᵍ
10067
- ##ᵏ
10068
- ##ᵐ
10069
- ##ᵒ
10070
- ##ᵘ
10071
- ##‖
10072
- ##„
10073
- ##†
10074
- ##•
10075
- ##‥
10076
- ##‧
10077
- ##
10078
- ##‰
10079
- ##′
10080
- ##″
10081
- ##‹
10082
- ##›
10083
- ##※
10084
- ##‿
10085
- ##⁄
10086
- ##ⁱ
10087
- ##⁺
10088
- ##ⁿ
10089
- ##₁
10090
- ##₃
10091
- ##₄
10092
- ##€
10093
- ##№
10094
- ##ⅰ
10095
- ##ⅱ
10096
- ##ⅲ
10097
- ##ⅳ
10098
- ##ⅴ
10099
- ##⇒
10100
- ##∀
10101
- ##−
10102
- ##∕
10103
- ##∙
10104
- ##√
10105
- ##∞
10106
- ##∟
10107
- ##∠
10108
- ##∣
10109
- ##∩
10110
- ##∮
10111
- ##∶
10112
- ##∼
10113
- ##∽
10114
- ##≈
10115
- ##≒
10116
- ##≡
10117
- ##≤
10118
- ##≥
10119
- ##≦
10120
- ##≧
10121
- ##≪
10122
- ##≫
10123
- ##⊙
10124
- ##⋅
10125
- ##⋈
10126
- ##⋯
10127
- ##⌒
10128
- ##①
10129
- ##②
10130
- ##③
10131
- ##④
10132
- ##⑤
10133
- ##⑥
10134
- ##⑦
10135
- ##⑧
10136
- ##⑨
10137
- ##⑩
10138
- ##⑴
10139
- ##⑵
10140
- ##⑶
10141
- ##⑷
10142
- ##⑸
10143
- ##⒈
10144
- ##⒉
10145
- ##⒊
10146
- ##⒋
10147
- ##ⓒ
10148
- ##ⓔ
10149
- ##ⓘ
10150
- ##━
10151
- ##┃
10152
- ##┆
10153
- ##┊
10154
- ##┌
10155
- ##└
10156
- ##├
10157
- ##┣
10158
- ##═
10159
- ##║
10160
- ##╚
10161
- ##╞
10162
- ##╠
10163
- ##╭
10164
- ##╮
10165
- ##╯
10166
- ##╰
10167
- ##╱
10168
- ##╳
10169
- ##▂
10170
- ##▃
10171
- ##▅
10172
- ##▇
10173
- ##▉
10174
- ##▋
10175
- ##▌
10176
- ##▍
10177
- ##▎
10178
- ##□
10179
- ##▬
10180
- ##△
10181
- ##►
10182
- ##▽
10183
- ##◇
10184
- ##◕
10185
- ##◠
10186
- ##◢
10187
- ##◤
10188
- ##☞
10189
- ##☼
10190
- ##♡
10191
- ##♫
10192
- ##♬
10193
- ##✕
10194
- ##✦
10195
- ##✪
10196
- ##✰
10197
- ##✿
10198
- ##❀
10199
- ##➜
10200
- ##➤
10201
- ##⦿
10202
- ##、
10203
- ##。
10204
- ##〃
10205
- ##々
10206
- ##〇
10207
- ##〈
10208
- ##〉
10209
- ##《
10210
- ##》
10211
- ##「
10212
- ##」
10213
- ##『
10214
- ##』
10215
- ##【
10216
- ##】
10217
- ##〓
10218
- ##〔
10219
- ##〕
10220
- ##〖
10221
- ##〗
10222
- ##〜
10223
- ##〝
10224
- ##〞
10225
- ##ㄧ
10226
- ##ㆍ
10227
- ##㈦
10228
- ##㊣
10229
- ##㗎
10230
- ##︰
10231
- ##︱
10232
- ##︶
10233
- ##︿
10234
- ##﹁
10235
- ##﹂
10236
- ##﹍
10237
- ##﹏
10238
- ##﹐
10239
- ##﹑
10240
- ##﹒
10241
- ##﹔
10242
- ##﹕
10243
- ##﹖
10244
- ##﹗
10245
- ##﹙
10246
- ##﹚
10247
- ##﹝
10248
- ##﹞
10249
- ##﹡
10250
- ##﹣
10251
- ##!
10252
- ##(
10253
- ##)
10254
- ##,
10255
- ##:
10256
- ##;
10257
- ##?
10258
- ##。
10259
- ##「
10260
- ##」
10261
- ##、
10262
- ##・
10263
- ##ッ
10264
- ##ー
10265
- ##゙
10266
- ##゚
10267
- ## ̄
10268
- ##¥
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
vocab.py ADDED
@@ -0,0 +1,453 @@
1
+ from patcher import tiktoken_patch
2
+ import tiktoken
3
+ from transformers import AutoTokenizer
4
+ from enum import Enum, auto
5
+ from dataclasses import dataclass, field
6
+
7
+ from utils.log_util import logger
8
+ from typing import Dict, Any, Union
9
+
10
+ """Interface:
11
+ tokenizer.encode
12
+ tokenizer.decode
13
+ tokenizer.convert_tokens_to_string # gpt4 does not have this method
14
+ tokenizer.convert_ids_to_tokens
15
+
16
+
17
+ tokenizer.parent = ""
18
+ tokenizer.vocab_size
19
+ tokenizer.get_vocab() # gpt-neox-20b, llama
20
+ tokenizer.type = TokenizerType.ByteBPE.name
21
+ tokenizer.implementation = TokenizerImpl.SentencePiece.name # https://github.com/facebookresearch/llama/blob/main/llama/tokenizer.py
22
+ "HFGPT2Tokenizer", "HFTokenizer", "GPT2BPETokenizer", "CharLevelTokenizer", "TiktokenTokenizer", "SPMTokenizer", https://github.com/EleutherAI/gpt-neox/blob/main/tools/preprocess_data.py
23
+
24
+
25
+ tokenizer.comments = "split all numbers into individual digits, " \
26
+ "and fallback to bytes to decompose unknown UTF-8 characters"
27
+
28
+ tokenizer.all_special_tokens # baichuan
29
+ tokenizer.special_tokens_set # gpt3.5_turbo
30
+ tokenizer.special_tokens_map
31
+ """
32
+
33
+
34
+ class TokenizerImpl(Enum):
35
+ """
36
+ - https://github.com/huggingface/tokenizers/blob/main/bindings/python/py_src/tokenizers/implementations/__init__.py
37
+ - https://huggingface.co/docs/transformers/tokenizer_summary
38
+ - https://github.com/EleutherAI/gpt-neox/blob/main/megatron/tokenizer/tokenizer.py
39
+
40
+ ## google/BertTokenizer
41
+ - https://github.com/huggingface/tokenizers/blob/main/bindings/python/py_src/tokenizers/implementations/bert_wordpiece.py
42
+ - Characteristics
43
+ - Algorithm: BERT's encoder uses BPE-WordPiece, splitting words into minimal subword units marked with prefixes (e.g. ## in BERT)
44
+ - Vocabulary: tokens starting with ## denote subwords
45
+ - Chinese is tokenized at char granularity
46
+ - English uses WordPiece
47
+
48
+
49
+
50
+
51
+ ## google/sentencepiece
52
+ - https://github.com/google/sentencepiece/blob/3863f7648e5d8edb571ac592f3ac4f5f0695275a/src/sentencepiece_model.proto#L48
53
+ - supports sentencepiece and wordpiece
54
+ - does sentencepiece have byte-bpe?
55
+ - UNIGRAM = 1; // Unigram language model with dynamic algorithm
56
+ - BPE = 2; // Byte Pair Encoding
57
+ - WORD = 3; // Delimited by whitespace.
58
+ - CHAR = 4; // tokenizes into character sequence
59
+ - wordpiece
60
+ - Characteristics:
61
+ - Training: spm_train --model_type unigram/bpe/char/word
62
+ - Special symbol: Ġ
63
+ - Files: *.sp_model or *.model (optional file: .vocab); "spm" for short (other formats such as tokenizer.json are for hf_tokenizer compatibility)
64
+ - Implementation:
65
+ - Dependency: protobuf
66
+ - Training: `import sentencepiece as spm; spm.SentencePieceTrainer.train` or `spm_train`
67
+ - Loading: `import sentencepiece as spm; spm.SentencePieceProcessor().Load(vocab_file)`
68
+ - Methods: a SentencePieceProcessor, e.g. sp_model.id_to_piece; ships tokenizer.json, tokenizer.model
69
+ - Tokenization:
70
+ - pre_tokenizers.ByteLevel(add_prefix_space=True, use_regex=False)
71
+ - Vocabulary: contains the character ▁ (U+2581), which marks a space or the start of a sentence.
72
+ - Examples: google-t5, llama, baichuan, orion
73
+ - llama: tokenizer.json (contains model.vocab and model.merges), tokenizer.model
74
+ - grok: originally a .model file, later converted to tokenizer.json
75
+ - google-t5: tokenizer.json, spiece.model
76
+ - Skywork-13B-Math: tokenizer.model
77
+ - xlm_roberta: sentencepiece.bpe.model
78
+ - GPT2Tokenizer
79
+ - tokenizer.json, vocab.json, merges.txt (https://huggingface.co/openai-community/gpt2)
80
+ - vocab.bpe, encoder.json, dict.txt (fairseq variant; uncommon, can be ignored)
81
+
82
+
83
+
84
+ ## thu/icetk
85
+ - icetk: a fork of sentencepiece that adds an image_tokenizer.
86
+ - glm, chatglm1, chatglm2
87
+
88
+ ## huggingface/tokenizers
89
+ - https://github.com/huggingface/tokenizers
90
+ - VS sentencepiece
91
+ - 支持sentencepiece
92
+ - .model转化为 (merges.txt + vocab.json) 或者 tokenizer.json
93
+ - https://github.com/huggingface/tokenizers/blob/main/bindings/python/scripts/sentencepiece_extractor.py
94
+ - 加载 merges.txt, vocab.json
95
+ - SentencePieceBPETokenizer https://github.com/huggingface/tokenizers/blob/v0.19.1/bindings/python/py_src/tokenizers/implementations/sentencepiece_bpe.py#L10
96
+ - 在 sentencepiece基础上,hf_tokenizer支持pre-tokenization的正则表达式,对tab和换行支持更好,支持special token
97
+ - 类型: 支持 BBPE, WordPiece or Unigram
98
+ - 特征:
99
+ - 文件: tokenizer.json(包含后两个文件的内容), merges.txt, vocab.json
100
+ - added_tokens 在vocab中不一定存在。
101
+ - 实现:
102
+ - 训练: `from tokenizers.trainers import BpeTrainer, UnigramTrainer, WordLevelTrainer, WordPieceTrainer`
103
+ - 加载:
104
+ - 方法: .model.from_file .model.save .model.token_to_id .model.tokenize
105
+ - .model 是 tokenizer.models.BPE 类型
106
+ - 词典有 Ġ "\u0120" 开头
107
+ - 优势
108
+ -
109
+ - 示例:gpt2, gpt_neox_20b, moss, bloom, qwen2
110
+ - 优势:相对sentence piece,
111
+ - ss
112
+
113
+ ## openai/tiktoken
114
+ - Characteristics: spaces are preserved as-is (a space is just a space)
115
+ - Examples: gpt3.5, gpt4, qwen
116
+ """
117
+ """ 算法体系 https://www.huaxiaozhuan.com/%E5%B7%A5%E5%85%B7/huggingface_transformer/chapters/1_tokenizer.html
118
+ - word-base tokenizer:
119
+ - char-base tokenizer:
120
+ - subword-based Tokenizer
121
+ - BPE
122
+ - byte-bpe: base vocabulary大小是256
123
+ - WordPiece:
124
+ - 相比BPE,WordPiece 仅保存最终词表,而不保存学到的 merge rule
125
+ - Unigram
126
+ - SentencePiece
127
+
128
+ """
129
+
130
+ # Taxonomy: https://github.com/huggingface/tokenizers/blob/main/bindings/python/py_src/tokenizers/implementations/
131
+ BertTokenizer = "wordpiece.BertTokenizer"
132
+ JapaneseTokenizer = ("wordpiece.MecabTokenizer", "https://github.com/polm/fugashi") # common Japanese packages: ipadic, fugashi
133
+ ByteLevelBPETokenizer = "byte_level_bpe" # BBPE
134
+ SentencePieceBPETokenizer = "sentencepiece_bpe"
135
+
136
+ # Taxonomy
137
+
138
+ # SentencePiece (BPE)
139
+ SentencePiece = auto() # sentencepiece.bpe, sentencepiece.unigram, sentencepiece.char, sentencepiece.word,
140
+ byte_level_bpe = auto()
141
+ # HFTokenizer = auto()
142
+ TikToken = auto()
143
+ # subword-nmt
144
+ # WordPiece
145
+
146
+
147
+ # load_vocab_with_SPECIAL_TOKEN = True # excluding special tokens leads to wrong vocab-size counts and inconsistent overlap_token computation.
148
+
149
+
150
+ @dataclass
151
+ class TokenizerConfig:
152
+ """
153
+ https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/blob/main/src/leaderboard/read_evals.py
154
+ """
155
+ name_or_path: str # org/model (path on hub), as unique id
156
+ name_display: str = None
157
+ impl: TokenizerImpl = None # implementation, tokenizer_class/type
158
+ org: str = None
159
+ link: str = None # http://**
160
+ desc: str = None # description
161
+ meta: str = None
162
+ level: str = None # char-level, word-level, byte-level
163
+ init_kwargs: Dict[str, Any] = field(default_factory=dict, )
164
+
165
+ def __post_init__(self):
166
+ if self.link is None:
167
+ self.link = "https://huggingface.co/" + self.name_or_path # TODO + revision
168
+ if self.name_display is None:
169
+ self.name_display = self.name_or_path
170
+
171
+ @classmethod
172
+ def init_from_json_file(cls, json_filepath: str) -> 'TokenizerConfig':
173
+ pass
174
+
175
+ def __eq__(self, other):
176
+ if isinstance(other, self.__class__):
177
+ return self.__dict__ == other.__dict__
178
+ else:
179
+ return False
180
+
181
+ def __hash__(self):
182
+ return hash(self.name_or_path)
183
+
184
+
185
+ # format: description, hf_path, tokenizer_class/type, comments, Organization
186
+ # TODO: append link and description to the end of dropdown button.
187
+ _all_tokenizer_config = [
188
+ ##### bert family
189
+ TokenizerConfig("google-bert/bert-base-cased", impl=TokenizerImpl.BertTokenizer, org="Google",
190
+ desc="first add whitespace around any CJK character, then perform wordpiece tokenization."),
191
+ TokenizerConfig("google-bert/bert-base-uncased", impl=TokenizerImpl.BertTokenizer, org="Google",
192
+ desc="first add whitespace around any CJK character, then perform wordpiece tokenization."),
193
+ TokenizerConfig("google-bert/bert-base-chinese", impl=TokenizerImpl.BertTokenizer, org="Google",
194
+ desc="first add whitespace around any CJK character, then perform wordpiece tokenization."),
195
+ TokenizerConfig("google-bert/bert-base-german-cased", impl=TokenizerImpl.BertTokenizer, org="Google"),
196
+ TokenizerConfig("dbmdz/bert-base-german-uncased", impl=TokenizerImpl.BertTokenizer, org="dbmdz"),
197
+ TokenizerConfig("google-bert/bert-base-multilingual-uncased", impl=TokenizerImpl.BertTokenizer, org="Google"),
198
+ TokenizerConfig("google-bert/bert-base-multilingual-cased", impl=TokenizerImpl.BertTokenizer, org="Google"),
199
+ TokenizerConfig("tohoku-nlp/bert-base-japanese", impl=TokenizerImpl.BertTokenizer, org="Tohoku",
200
+ desc="The texts are first tokenized by MeCab morphological parser with the IPA dictionary, "
201
+ "then split into subwords by the WordPiece algorithm."),
202
+ TokenizerConfig("clue/roberta_chinese_clue_tiny", name_display="clue/roberta-chinese-clue",
203
+ impl=TokenizerImpl.BertTokenizer, org="CLUE",
204
+ init_kwargs={"revision": "refs/pr/1"},
205
+ desc="",
206
+ meta="去掉了繁体字, https://github.com/CLUEbenchmark/CLUEPretrainedModels/blob/master/README.md"),
207
+ TokenizerConfig("eson/kplug-base-encoder", name_display="eson/kplug", impl=TokenizerImpl.BertTokenizer, org="JD"),
208
+ TokenizerConfig("ckiplab/gpt2-base-chinese", impl=TokenizerImpl.BertTokenizer, org="SINICA"), # 台湾中央研究院
209
+ # WoBERT
210
+ # WoBERT Plus https://github.com/ZhuiyiTechnology/WoBERT
211
+
212
+
213
+ ##### GPT2Tokenizer
214
+ TokenizerConfig("openai-community/gpt2", impl=TokenizerImpl.SentencePiece, org="OpenAI"),
215
+ # byte-level BPE, yet no bytes in the vocab; is it unicode-level?
216
+ TokenizerConfig("ClassCat/gpt2-base-french", impl=TokenizerImpl.SentencePiece, org="ClassCat"),
217
+ TokenizerConfig("ClassCat/gpt2-base-spanish", impl=TokenizerImpl.SentencePiece, org="ClassCat"),
218
+ TokenizerConfig("fnlp/moss-moon-003-sft", impl=TokenizerImpl.SentencePiece, init_kwargs={"revision": "refs/pr/6"},
219
+ org="Fudan",
220
+ desc="This tokenizer has been trained to treat spaces like parts of the tokens "
221
+ "(a bit like sentencepiece) so a word will be encoded differently whether "
222
+ "it is at the beginning of the sentence (without space) or not",
223
+ meta="在gpt2词典基础上,扩充了5万中文"),
224
+ TokenizerConfig("bigscience/bloom", impl=TokenizerImpl.SentencePiece, org="BigScience",
225
+ meta="比gpt_neox的词典 对中文支持更好。"),
226
+ # ("bloomz_6b4_zh",
227
+ # ("BelleGroup/BELLE-7B-2M", # 模型和词典都基于bloom
228
+ #
229
+ TokenizerConfig("EleutherAI/gpt-neox-20b", impl=TokenizerImpl.SentencePiece, org="EleutherAI"), # 5万
230
+ TokenizerConfig("cyberagent/open-calm-7b", impl=TokenizerImpl.SentencePiece, org="CyberAgent"), # GPTNeoXTokenizer
231
+ TokenizerConfig("abeja/gpt-neox-japanese-2.7b", impl=TokenizerImpl.SentencePiece, org="ABEJA"),
232
+ TokenizerConfig("Qwen/Qwen1.5-14B-Chat", name_display="Qwen/Qwen1.5", impl=TokenizerImpl.SentencePiece, org="Alibaba"), # 15万,速度有点慢
233
+ TokenizerConfig("HuggingFaceH4/starchat-alpha", impl=TokenizerImpl.SentencePiece, org="-"),
234
+
235
+ ####### google/sentencepiece tokenizer:
236
+ # T5 llama internlm
237
+ TokenizerConfig("google-t5/t5-large", name_display="google-t5/t5", impl=TokenizerImpl.SentencePiece, org="Google"),
238
+ # t5_small, t5_base, t5_large, flan_t5_base,
239
+ # ("t5_base", "", "sentencepiece"),
240
+ # TokenizerConfig("google/flan-t5-base", impl=TokenizerImpl.SentencePiece, ),
241
+ TokenizerConfig("lmsys/fastchat-t5-3b-v1.0", impl=TokenizerImpl.SentencePiece,
242
+ org="LMSYS",
243
+ init_kwargs={"use_fast": False} # 解决 pyo3_runtime.PanicException: AddedVocabulary bad split
244
+ ),
245
+ TokenizerConfig("CohereForAI/aya-101", org="Cohere For AI"), # "tokenizer_class": "T5Tokenizer",
246
+
247
+ TokenizerConfig("ClueAI/ChatYuan-large-v2", impl=TokenizerImpl.SentencePiece, org="CLUE"),
248
+ TokenizerConfig("ClueAI/PromptCLUE-base", impl=TokenizerImpl.SentencePiece, org="CLUE"),
249
+ TokenizerConfig("gradientai/Llama-3-8B-Instruct-Gradient-1048k", name_display="Meta/llama3",
250
+ impl=TokenizerImpl.SentencePiece, org="Meta",
251
+ desc="llama split all numbers into individual digits, and fallback to bytes to decompose unknown UTF-8 characters"),
252
+ # byte-level BPE
253
+ # Chinese single-char tokens: 700, Chinese multi-char tokens: 0
254
+ TokenizerConfig("NousResearch/Llama-2-7b-chat-hf", name_display="Meta/llama2", impl=TokenizerImpl.SentencePiece,
255
+ org="Meta"),
256
+ TokenizerConfig("huggyllama/llama-7b", name_display="Meta/llama", impl=TokenizerImpl.SentencePiece, org="Meta"),
257
+ TokenizerConfig("hpcai-tech/grok-1", name_display="xai-org/grok-1", impl=TokenizerImpl.SentencePiece, org="xAI"),
258
+ # converted from the original .model file
259
+ TokenizerConfig("hfl/chinese-llama-lora-7b", impl=TokenizerImpl.SentencePiece, org="-",
260
+ meta="向原始LLaMA的词汇表中添加2w个中文词汇,针对原版LLaMA模型扩充了中文词表, 提升了中文编解码效率"),
261
+ #
262
+ TokenizerConfig("hfl/chinese-llama-2-7b", impl=TokenizerImpl.SentencePiece, org="-",
263
+ meta="重新设计了新词表(大小:55296),进一步提升了中文字词的覆盖程度"), #
264
+ TokenizerConfig("hfl/llama-3-chinese-8b", impl=TokenizerImpl.SentencePiece, org="-"),
265
+ TokenizerConfig("hfl/chinese-alpaca-lora-7b", impl=TokenizerImpl.SentencePiece, org="-"),
266
+ # The Chinese Alpaca models further fine-tune the Chinese LLaMA models above on instruction data. "Has one more `[PAD]` than the chinese_llama vocab; do not mix them"
267
+ #
268
+ # ("belle_llama_ext_7b",
269
+ # ("alpaca_7b",
270
+ TokenizerConfig("baichuan-inc/Baichuan-7B", name_display="baichuan-inc/baichuan",
271
+ impl=TokenizerImpl.SentencePiece,
272
+ level="byte-level", org="Baichuan"),
273
+ TokenizerConfig("baichuan-inc/Baichuan2-7B-Chat", name_display="baichuan-inc/baichuan2",
274
+ impl=TokenizerImpl.SentencePiece, org="Baichuan",
275
+ desc="expand the vocabulary size from 64000 in Baichuan1 to 125696"),
276
+ TokenizerConfig("internlm/internlm-chat-7b", impl=TokenizerImpl.SentencePiece, org="Shanghai AI Lab"),
277
+ # Shanghai AI Lab + SenseTime
278
+ TokenizerConfig("internlm/internlm2-chat-7b", impl=TokenizerImpl.SentencePiece, org="Shanghai AI Lab"),
279
+ TokenizerConfig("internlm/internlm2-math-7b", impl=TokenizerImpl.SentencePiece, org="Shanghai AI Lab"),
280
+ TokenizerConfig("internlm/internlm-xcomposer-7b", impl=TokenizerImpl.SentencePiece, org="Shanghai AI Lab"),
281
+ TokenizerConfig("tiiuae/falcon-7b", impl=TokenizerImpl.SentencePiece, org="TII"),
282
+ TokenizerConfig("tiiuae/falcon-180b", impl=TokenizerImpl.SentencePiece, org="TII"),
283
+ TokenizerConfig("Skywork/Skywork-13B-base", impl=TokenizerImpl.SentencePiece, org="Kunlun"),
284
+ TokenizerConfig("Skywork/Skywork-13B-Math", impl=TokenizerImpl.SentencePiece, org="Kunlun"), # 文件:tokenizer.model
285
+ TokenizerConfig("FacebookAI/xlm-roberta-base", impl=TokenizerImpl.SentencePiece, org="Facebook"),
286
+ # why does this tokenizer.json have no merges? and why does the vocab contain probability values?
287
+ # "goat",
288
+
289
+ # ##### glm family
290
+ # "glm_chinese",),
291
+ TokenizerConfig("THUDM/chatglm-6b", impl=TokenizerImpl.SentencePiece, org="Tsinghua",
292
+ meta=f"num_image_tokens: {12}; num_image_tokens: {34} ",
293
+ init_kwargs={"revision": "refs/pr/100"}),
294
+ TokenizerConfig("THUDM/chatglm2-6b", impl=TokenizerImpl.SentencePiece, org="Tsinghua", ),
295
+ TokenizerConfig("THUDM/chatglm3-6b", impl=TokenizerImpl.SentencePiece, org="Tsinghua", ),
296
+ TokenizerConfig("thu-coai/CharacterGLM-6B", impl=TokenizerImpl.SentencePiece, org="Tsinghua", ),
297
+
298
+ # tiktoken family
299
+ TokenizerConfig("openai/text-davinci-003", impl=TokenizerImpl.TikToken, org="OpenAI",
300
+ link="https://github.com/openai/tiktoken"),
301
+ #
302
+ TokenizerConfig("openai/code-davinci-002", impl=TokenizerImpl.TikToken, org="OpenAI",
303
+ link="https://github.com/openai/tiktoken"),
304
+ TokenizerConfig("openai/gpt-3.5-turbo", impl=TokenizerImpl.TikToken, org="OpenAI",
305
+ link="https://github.com/openai/tiktoken",
306
+ desc="tiktoken is a fast BPE tokeniser for use with OpenAI's models. There are 16 tokens KeyError"),
307
+ TokenizerConfig("openai/gpt-4", impl=TokenizerImpl.TikToken, org="OpenAI",
308
+ link="https://github.com/openai/tiktoken", ),
309
+ TokenizerConfig("openai/gpt-4o", impl=TokenizerImpl.TikToken, org="OpenAI",
310
+ link="https://github.com/openai/tiktoken", ),
311
+ TokenizerConfig("Qwen/Qwen-7B-Chat", name_display="Qwen/Qwen", impl=TokenizerImpl.TikToken, org="Alibaba",
312
+ init_kwargs={"revision": "refs/pr/56"},
313
+ meta="在gpt4词典基础上,删除了100个多数字token,增加10000中文词token;并优化了special_token的分词"),
314
+ # https://huggingface.co/Qwen/Qwen-7B-Chat#%E6%A8%A1%E5%9E%8B%E7%BB%86%E8%8A%82%EF%BC%88model%EF%BC%89
315
+ # Built on cl100k_base, the BPE vocab used by GPT-4, with optimizations for Chinese and other languages: on top of efficient encoding of Chinese, English and code,
316
+ # it is friendlier to several more languages, letting users strengthen those languages without extending the vocab. Numbers are split into single digits.
317
+
318
+ # TokenizerConfig("Qwen/Qwen-72B-Chat", impl=TokenizerImpl.TikToken),
319
+
320
+ # uncategorized
321
+ # ("amber", ""),
322
+ TokenizerConfig("LLM360/CrystalCoder", org="MBZUAI"),
323
+ TokenizerConfig("mistralai/Mistral-7B-v0.1", org="Mistral"),
324
+ TokenizerConfig("mistralai/Mixtral-8x7B-v0.1", org="Mistral"),
325
+
326
+ TokenizerConfig("paust/pko-t5-large", org="PAUST"),
327
+
328
+ TokenizerConfig("01-ai/Yi-6B", org="Yi"),
329
+ TokenizerConfig("01-ai/Yi-34B", org="Yi"),
330
+ TokenizerConfig("01-ai/Yi-VL-34B", org="Yi"),
331
+ TokenizerConfig("OrionStarAI/Orion-14B-Chat", org="OrionStar"),
332
+ TokenizerConfig("microsoft/phi-1", org="Microsoft"),
333
+ TokenizerConfig("microsoft/phi-2", org="Microsoft"),
334
+ TokenizerConfig("microsoft/Phi-3-mini-4k-instruct", org="Microsoft", meta="即llama vocab"),
335
+ TokenizerConfig("Upstage/SOLAR-10.7B-v1.0", org="-"),
336
+ TokenizerConfig("google/mobilebert-uncased", org="Google"),
337
+ # ("google/mobilenet_v2_1.0_224",), # error
338
+ TokenizerConfig("google/switch-c-2048", org="Google"),
339
+ TokenizerConfig("google/byt5-small", org="Google"),
340
+ TokenizerConfig("google/mt5-large", org="Google"),
341
+ TokenizerConfig("WizardLM/WizardCoder-Python-7B-V1.0", org="Microsoft"),
342
+ TokenizerConfig("WizardLM/WizardCoder-15B-V1.0", org="Microsoft"),
343
+ TokenizerConfig("WizardLM/WizardLM-7B-V1.0", org="Microsoft"),
344
+ TokenizerConfig("WizardLM/WizardMath-70B-V1.0", org="Microsoft"),
345
+ TokenizerConfig("TigerResearch/tigerbot-70b-chat-v4-4k", org="Tigerobo"),
346
+ TokenizerConfig("TigerResearch/tigerbot-13b-chat-v2", org="Tigerobo"),
347
+ TokenizerConfig("deepseek-ai/deepseek-coder-33b-instruct", org="DeepSeek"),
348
+ TokenizerConfig("deepseek-ai/deepseek-llm-7b-base", org="DeepSeek"),
349
+ TokenizerConfig("deepseek-ai/DeepSeek-V2", org="DeepSeek"),
350
+ TokenizerConfig("google/gemma-7b", org="Google"),
351
+ TokenizerConfig("allenai/OLMo-7B", org="Allen AI"),
352
+ TokenizerConfig("HuggingFaceH4/zephyr-7b-beta", org="HuggingFace"),
353
+ TokenizerConfig("ai21labs/Jamba-v0.1", org="AI21"),
354
+ TokenizerConfig("databricks/dbrx-instruct", org="Databricks"),
355
+
356
+ # ("claude",),
357
+ # https://github.com/Duxiaoman-DI/XuanYuan
358
+
359
+ # https://huggingface.co/apple/OpenELM-3B-Instruct https://huggingface.co/apple/OpenELM-3B
360
+
361
+ ]
362
+
363
+ assert len(set([config.name_display for config in _all_tokenizer_config])) == len(_all_tokenizer_config)
364
+ assert len(set([config.name_or_path for config in _all_tokenizer_config])) == len(_all_tokenizer_config)
365
+ assert len(set([config.name_or_path.split("/")[-1] for config in _all_tokenizer_config])) == len(_all_tokenizer_config)
366
+
367
+
368
+ class TokenizerFactory:
369
+
370
+ def __init__(self):
371
+ self.all_tokenizer_configs = sorted(_all_tokenizer_config, key=lambda k: k.name_or_path)
372
+ self.all_tokenizer_names = [config.name_or_path for config in self.all_tokenizer_configs]
373
+ self.name_to_config_list = [
374
+ {config.name_or_path: config for config in self.all_tokenizer_configs},
375
+ {config.name_display: config for config in self.all_tokenizer_configs},
376
+ {config.name_display.split("/")[-1]: config for config in self.all_tokenizer_configs},
377
+ ]
378
+ self.tokenizer_cache = {}
379
+
380
+ def get_tokenizer_config(self, tokenizer_name: str) -> TokenizerConfig:
381
+ for name_to_config in self.name_to_config_list:
382
+ if tokenizer_name in name_to_config:
383
+ return name_to_config[tokenizer_name]
384
+ return None
385
+
386
+ def get_tokenizer(self, tokenizer_name: str):
387
+ """
388
+ :param tokenizer_name:
389
+ :return:
390
+ """
391
+ tokenizer_config = self.get_tokenizer_config(tokenizer_name)
392
+
393
+ # 1. load from cache
394
+ if tokenizer_config in self.tokenizer_cache:
395
+ return self.tokenizer_cache[tokenizer_config]
396
+
397
+ # 2. load tokenizer
398
+ logger.info(f"loading tokenizer {tokenizer_config.name_or_path}")
399
+ if tokenizer_config.impl == TokenizerImpl.TikToken and "openai" in tokenizer_config.name_or_path:
400
+ tokenizer = tiktoken.encoding_for_model(tokenizer_config.name_or_path.replace("openai/", ""))
401
+ else:
402
+ tokenizer = AutoTokenizer.from_pretrained(
403
+ tokenizer_config.name_or_path,
404
+ trust_remote_code=True,
405
+ **tokenizer_config.init_kwargs
406
+ )
407
+ self.tokenizer_cache[tokenizer_config] = tokenizer
408
+ return tokenizer
409
+
410
+ def get_name_with_hyperlink(self, tokenizer_name):
411
+ def model_hyperlink(link, model_name):
412
+
413
+ return f'<a target="_blank" href="{link}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{model_name}</a>'
414
+
415
+ tokenizer_config = self.get_tokenizer_config(tokenizer_name)
416
+ return model_hyperlink(tokenizer_config.link, tokenizer_config.name_display.split("/")[-1])
417
+
418
+
419
+ tokenizer_factory = TokenizerFactory()
420
+
421
+ # class TokenizerType(Enum):
422
+ #
423
+ # # BERTTokenizer
424
+ # # depends on a txt file
425
+ #
426
+ #
427
+ # # https://github.com/EleutherAI/gpt-neox/blob/v2.0/megatron/tokenizer/tokenizer.py#L231
428
+ # # depends on a json file, Tokenizer.from_file(vocab_file)
429
+ # # example: gpt-neox-20B
430
+ # HFTokenizer = auto()
431
+ #
432
+ # # dependency: model_file, sentencepiece.SentencePieceProcessor(model_file)
433
+ # # example:
434
+ # SentencePieceTokenizer = auto()
435
+ #
436
+ #
437
+ # # dependencies: 3 files: vocab.json, merges.txt, special_tokens.txt
438
+ # # source:
439
+ # # - https://github.com/NVIDIA/Megatron-LM/blob/main/megatron/tokenizer/gpt2_tokenization.py#L92
440
+ # # Byte-level BPE
441
+ # GPT2BPETokenizer = auto()
442
+
443
+
444
+ if __name__ == "__main__":
445
+
446
+ for tokenizer_config in tokenizer_factory.all_tokenizer_configs:
447
+ if True:
448
+ # if "t5" in tokenizer_config.name_or_path:
449
+ tokenizer1 = tokenizer_factory.get_tokenizer(tokenizer_config.name_or_path)
450
+ tokenizer2 = tokenizer_factory.get_tokenizer(tokenizer_config.name_display)
451
+ tokenizer3 = tokenizer_factory.get_tokenizer(tokenizer_config.name_display.split("/")[-1])
452
+ assert tokenizer1 == tokenizer2 == tokenizer3
453
+ print(tokenizer_config.name_or_path, len(tokenizer1))
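For reference, a minimal usage sketch of the `TokenizerFactory` added above (a hedged example, not part of the commit; it assumes `vocab.py` is importable as `vocab` and that the listed hub repos are reachable):

```python
from vocab import tokenizer_factory

# Any of the three registered aliases resolves to the same cached tokenizer:
# the full hub path ("openai-community/gpt2"), the display name, or its last
# component ("gpt2"); get_tokenizer() caches by TokenizerConfig.
tokenizer = tokenizer_factory.get_tokenizer("gpt2")
ids = tokenizer.encode("hello world")
print(ids, tokenizer.decode(ids))

# "openai/*" entries with impl=TikToken are routed to tiktoken instead of
# AutoTokenizer, so the returned object is a tiktoken.Encoding.
gpt4_encoding = tokenizer_factory.get_tokenizer("openai/gpt-4")
print(gpt4_encoding.encode("hello world"))
```

The tiktoken path returns an object whose interface differs from HF tokenizers (e.g. no `convert_ids_to_tokens`); the `patcher.tiktoken_patch` import at the top of the file is presumably what papers over that difference.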
vocab/Intern_gpt/README.md DELETED
File without changes
vocab/__init__.py DELETED
@@ -1,260 +0,0 @@
1
- import importlib
2
- from enum import Enum, auto
3
-
4
- """Interface:
5
- tokenizer.encode
6
- tokenizer.decode
7
- tokenizer.convert_tokens_to_string # gpt4 没有这个方法
8
- tokenizer.convert_ids_to_tokens
9
-
10
-
11
- tokenizer.parent = ""
12
- tokenizer.vocab_size
13
- tokenizer.get_vocab() # gpt-neox-20b, llama
14
- tokenizer.type = TokenizerType.ByteBPE.name
15
- tokenizer.implementation = TokenizerImpl.SentencePiece.name # https://github.com/facebookresearch/llama/blob/main/llama/tokenizer.py
16
- "HFGPT2Tokenizer", "HFTokenizer", "GPT2BPETokenizer", "CharLevelTokenizer", "TiktokenTokenizer", "SPMTokenizer", https://github.com/EleutherAI/gpt-neox/blob/main/tools/preprocess_data.py
17
-
18
- - google/bert
19
- - 特征
20
- - 词典:有##开头的token,表示subword
21
- - 示例:
22
- - bpe-google/sentencepiece:
23
- - 特征:
24
- - 训练:
25
- - 文件: *.sp_model 或 *.model (可选文件 .vocab,) spm简称
26
- - 实现:
27
- - 依赖: protobuf
28
- - 训练: `import sentencepiece as spm; spm.SentencePieceTrainer.train` 或 `spm_train`
29
- - 加载: `import sentencepiece as spm; spm.SentencePieceProcessor().Load(vocab_file)`
30
- - 方法: 是SentencePieceProcessor类型,sp_model.id_to_piece,有tokenizer.json tokenizer.model,
31
- - 分词:
32
- - pre_tokenizers.ByteLevel(add_prefix_space=True, use_regex=False)
33
- - 词典: 词典字符有 ▁ (U+2581) ,表示空格或句首。
34
- - 示例:google-t5, llama,baichuan, orion,
35
- - icetk: sentencepiece的分支,支持image_tokenizer
36
- - glm, chatglm1, chatglm2
37
- - openai/tiktoken
38
- - bpe-hf_tokenizer
39
- - ss
40
- - 特征:
41
- - 文件: tokenizer.json(包含后两个文件的内容), merges.txt, vocab.json
42
- - added_tokens 在vocab中不一定存在。
43
- - 实现:
44
- - 训练: `from tokenizers.trainers import BpeTrainer, UnigramTrainer, WordLevelTrainer, WordPieceTrainer`
45
- - 加载:
46
- - 方法: .model.from_file .model.save .model.token_to_id .model.tokenize
47
- - .model 是 tokenizer.models.BPE 类型
48
- - 词典有 Ġ "\u0120" 开头
49
- - 优势
50
- -
51
- - 示例:gpt2, gpt_neox_20b, moss, bloom, qwen2
52
- - 优势:相对sentence piece,hf_tokenizer支持pre-tokenization的正则表达式,对tab和换行支持更好 ()
53
- - ss
54
- - tiktoken
55
- - 特征:空格就是空格,
56
- - 示例:gpt3.5 gpt4, qwen,
57
- tokenizer.comments = "split all numbers into individual digits, " \
58
- "and fallback to bytes to decompose unknown UTF-8 characters"
59
-
60
- tokenizer.all_special_tokens # baichuan
61
- tokenizer.special_tokens_set # gpt3.5_turbo
62
- tokenizer.special_tokens_map
63
-
64
- tokenizer.dependency [sentencepiece, tiktoken, icetk]
65
- """
66
-
67
- from utils.log_util import logger
68
-
69
- # Animal = Enum('Animal', 'ANT BEE CAT DOG')
70
-
71
- uniq_tokenizers = [
72
- ""
73
- ]
74
-
75
- # format: alias/abbr, description, hf_path, tokenizer_class/type, comments, Organization
76
- # TODO: append link and description to the end of dropdown button.
77
- all_tokenizers = [
78
- ##### bert 系列
79
- ("bert_base_cased", "", "bert"),
80
- ("bert_base_uncased", "", "bert"),
81
- ("bert_base_chinese", "", "bert"),
82
- ("roberta_chinese_clue", "", "bert"),
83
- ("kplug",),
84
- ("gpt2_chinese",),
85
-
86
- ##### GPT2Tokenizer
87
- ("gpt2", "", "GPT2Tokenizer",), #
88
- ("moss", "", "GPT2Tokenizer",),
89
- ("bloom", "", "GPT2Tokenizer",),
90
- # ("bloomz_6b4_zh",
91
- # ("belle_7b_2m", # 模型和词典都基于bloom
92
- #
93
- ("gpt_nexo_20b", "", "GPT2Tokenizer",), # 5万
94
- ("qwen1_5_14b_chat", "", "GPT2Tokenizer",), # 15万,速度有点慢
95
- ("starchat_alpha", "", "GPT2Tokenizer",),
96
-
97
- ####### google/sentencepiece tokenizer:
98
- # T5 llama internlm
99
- ("t5_small", "", "sentencepiece"),
100
- ("t5_base", "", "sentencepiece"),
101
- ("t5_large", "", "sentencepiece"),
102
- ("chatyuan_large_v2", "", "sentencepiece"),
103
- ("prompt_clue", "", "sentencepiece"),
104
-
105
- ("llama", "", "sentencepiece", "llama use single digits and thus uses 4 tokens to encode the number 1000"), # '中文单字': 700, '中文多字': 0
106
- ("llama2", "", "sentencepiece"),
107
- ("llama3", "", "sentencepiece"),
108
- ("chinese_llama", "", "sentencepiece"), #
109
- ("chinese_llama2", "", "sentencepiece"), #
110
- ("llama_3_chinese_8b", "sentencepiece"),
111
- # ("chinese_alpaca_lora_7b", # 中文Alpaca模型在上述中文LLaMA模型的基础上进一步使用了指令数据进行精调。
112
- # ("belle_llama_ext_7b",
113
- # ("alpaca_7b",
114
- ("baichuan", "", "sentencepiece"),
115
- ("baichuan2", "", "sentencepiece"),
116
- ("internlm_chat_7b", "", "sentencepiece"),
117
- ("internlm2_chat_7b", "", "sentencepiece"),
118
- ("internlm2_math_7b", "", "sentencepiece"),
119
- ("internlm_xcomposer_7b", "", "sentencepiece"),
120
- ("falcon_7b", "", "sentencepiece"),
121
- ("falcon_180b", "", "sentencepiece"),
122
- ("skywork_13b_base",),
123
- ("skywork_13b_math",),
124
- ("xlm_roberta", ),
125
- # "goat",
126
-
127
- # ##### glm系列
128
- # "glm_chinese",),
129
- ("chatglm_6b", "", "sentencepiece"),
130
- ("chatglm2_6b", "", "sentencepiece"),
131
- ("chatglm3_6b", "", "sentencepiece"),
132
- ("character_glm_6b", "", "sentencepiece"),
133
-
134
- # tiktoken 系列
135
- ("qwen_1_8b_chat", "", "tiktoken"),
136
- ("qwen_7b_chat", "", "tiktoken"),
137
- ("qwen_72b_chat", "", "tiktoken"),
138
- ("text_davinci_003", "", "tiktoken"),
139
- ("code_davinci_002", "", "tiktoken"),
140
- ("gpt_35_turbo", "", "tiktoken"),
141
- ("gpt_4", "", "tiktoken"),
142
-
143
- # 未分类
144
- # ("amber", ""),
145
- ("crystal_coder", ""),
146
- ("mistral_7b",),
147
- ("mixtral_8_7b",),
148
-
149
-
150
- ("flan_t5_base",),
151
- ("fastchat_t5_3b",),
152
- ("pko_t5_large",),
153
- ("wizardcoder_15b_v1",),
154
- ("yi_6b",),
155
- ("yi_34b",),
156
- ("yi_vl34b",),
157
- ("orion_14b_chat",),
158
- ("phi_1",),
159
- ("phi_2",),
160
- ("phi_3_mini", "即llama vocab"),
161
- ("solar_10_7b",),
162
- ("mobilebert_uncased",),
163
- # ("mobilenet_v2",), # error
164
- ("switch_c_2048",),
165
- ("byt5_small",),
166
- ("mt5_large",),
167
- ("wizardcoder_python_7b_v1",),
168
- ("wizardlm_7b_v1",),
169
- ("wizardmath_70b_v1",),
170
- ("tigerbot_70b_chat_v4_4k",),
171
- ("tigerbot_13b_chat_v2",),
172
- ("deepseek_coder_33b_instruct",),
173
- ("deepseek_llm_7b_base",),
174
- ("gemma_7b",),
175
- ("olmo_7b",),
176
- ("aya_101",),
177
- ("zephyr_7b_beta",),
178
- ("jamba_v0_1", ),
179
- ("dbrx_instruct", ),
180
- ("grok_1",),
181
- # ("claude",),
182
- ("gpt_nexo_20b", ),
183
- ("gpt_neox_japanese_2_7b", ),
184
-
185
- ]
186
-
187
- all_tokenizers = [tokenizer[0] for tokenizer in all_tokenizers]
188
- all_tokenizers = sorted(all_tokenizers)
189
-
190
-
191
- class TokenizerType(Enum):
192
- """
193
- - https://huggingface.co/docs/transformers/tokenizer_summary
194
- - https://github.com/EleutherAI/gpt-neox/blob/main/megatron/tokenizer/tokenizer.py
195
- - https://github.com/google/sentencepiece/blob/3863f7648e5d8edb571ac592f3ac4f5f0695275a/src/sentencepiece_model.proto#L48
196
- - UNIGRAM = 1; // Unigram language model with dynamic algorithm
197
- - BPE = 2; // Byte Pair Encoding
198
- - WORD = 3; // Delimitered by whitespace.
199
- - CHAR = 4; // tokenizes into character sequence
200
- """
201
- BPE = auto()
202
- ByteBPE = auto() # BBPE Byte-Level BPE
203
- GPT2BPETokenizer = auto() #
204
- BERTTokenizer = auto()
205
-
206
-
207
- # class TokenizerType(Enum):
208
- #
209
- # # BERTTokenizer
210
- # # 依赖一个txt文件
211
- #
212
- #
213
- # # https://github.com/EleutherAI/gpt-neox/blob/v2.0/megatron/tokenizer/tokenizer.py#L231
214
- # # 依赖一个json文件,Tokenizer.from_file(vocab_file)
215
- # # 案例:gpt-neox-20B
216
- # HFTokenizer = auto()
217
- #
218
- # # 依赖: model_file, sentencepiece.SentencePieceProcessor(model_file)
219
- # # 案例:
220
- # SentencePieceTokenizer = auto()
221
- #
222
- #
223
- # # 依赖: 3个json文件:vocab.json, merges.txt, special_tokens.txt
224
- # # 源码:
225
- # # - https://github.com/NVIDIA/Megatron-LM/blob/main/megatron/tokenizer/gpt2_tokenization.py#L92
226
- # # Byte-level BPE
227
- # GPT2BPETokenizer = auto()
228
-
229
-
230
- class TokenizerImpl(Enum):
231
-
232
- """
233
- https://github.com/google/sentencepiece,支持 sentencepiece(BPE,unigram,char,word), wordpiece,
234
- spm_train --model_type unigram/bpe/char/word
235
- """
236
- SentencePiece = auto()
237
-
238
- # https://github.com/huggingface/transformers/blob/v4.30.2/src/transformers/models/gpt2/tokenization_gpt2.py#L104
239
- # 构造词典:
240
- # GPT2Tokenizer = auto()
241
- # BertTokenizer = auto() #
242
-
243
- """
244
- """
245
- HFTokenizer = auto() # https://github.com/huggingface/tokenizers, 支持
246
-
247
-
248
- cache = {}
249
-
250
- def load_tokener(model_name):
251
- if model_name in cache:
252
- return cache[model_name]
253
- logger.info(f"loading tokenizer {model_name}")
254
- tokenizer = importlib.import_module("." + model_name, 'vocab').tokenizer
255
- tokenizer.alias = model_name
256
- return tokenizer
257
-
258
-
259
- if __name__ == "__main__":
260
- pass
vocab/_alpaca_7b/README.md DELETED
File without changes
vocab/_goat/README.md DELETED
File without changes
vocab/_goat/__init__.py DELETED
File without changes
vocab/albert/__init__.py DELETED
@@ -1,6 +0,0 @@
1
- """
2
-
3
- SentencePiece(unigram)
4
-
5
- https://huggingface.co/docs/transformers/tokenizer_summary#sentencepiece
6
- """
vocab/aya_101/__init__.py DELETED
@@ -1,5 +0,0 @@
1
-
2
-
3
- from transformers import AutoTokenizer
4
-
5
- tokenizer = AutoTokenizer.from_pretrained("CohereForAI/aya-101")
vocab/baichuan/Baichuan-7B/config.json DELETED
@@ -1,26 +0,0 @@
1
- {
2
- "architectures": [
3
- "BaiChuanForCausalLM"
4
- ],
5
- "auto_map": {
6
- "AutoConfig": "configuration_baichuan.BaiChuanConfig",
7
- "AutoModelForCausalLM": "modeling_baichuan.BaiChuanForCausalLM"
8
- },
9
- "bos_token_id": 1,
10
- "eos_token_id": 2,
11
- "hidden_act": "silu",
12
- "hidden_size": 4096,
13
- "initializer_range": 0.02,
14
- "intermediate_size": 11008,
15
- "max_position_embeddings": 4096,
16
- "model_type": "baichuan",
17
- "num_attention_heads": 32,
18
- "num_hidden_layers": 32,
19
- "pad_token_id": 0,
20
- "rms_norm_eps": 1e-06,
21
- "tie_word_embeddings": false,
22
- "torch_dtype": "float32",
23
- "transformers_version": "4.29.1",
24
- "use_cache": true,
25
- "vocab_size": 64000
26
- }
vocab/baichuan/Baichuan-7B/configuration_baichuan.py DELETED
@@ -1,66 +0,0 @@
1
- # coding=utf-8
2
- # Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved.
3
- #
4
- # This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
5
- # and OPT implementations in this library. It has been modified from its
6
- # original forms to accommodate minor architectural differences compared
7
- # to GPT-NeoX and OPT used by the Meta AI team that trained the model.
8
- #
9
- # Licensed under the Apache License, Version 2.0 (the "License");
10
- # you may not use this file except in compliance with the License.
11
- # You may obtain a copy of the License at
12
- #
13
- # http://www.apache.org/licenses/LICENSE-2.0
14
- #
15
- # Unless required by applicable law or agreed to in writing, software
16
- # distributed under the License is distributed on an "AS IS" BASIS,
17
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
18
- # See the License for the specific language governing permissions and
19
- # limitations under the License.
20
-
21
- from transformers.configuration_utils import PretrainedConfig
22
- from transformers.utils import logging
23
-
24
-
25
- logger = logging.get_logger(__name__)
26
-
27
-
28
- class BaiChuanConfig(PretrainedConfig):
29
- model_type = "baichuan"
30
- keys_to_ignore_at_inference = ["past_key_values"]
31
-
32
- def __init__(
33
- self,
34
- vocab_size=64000,
35
- hidden_size=4096,
36
- intermediate_size=11008,
37
- num_hidden_layers=32,
38
- num_attention_heads=32,
39
- hidden_act="silu",
40
- max_position_embeddings=4096,
41
- initializer_range=0.02,
42
- rms_norm_eps=1e-6,
43
- use_cache=True,
44
- pad_token_id=0,
45
- bos_token_id=1,
46
- eos_token_id=2,
47
- tie_word_embeddings=False,
48
- **kwargs,
49
- ):
50
- self.vocab_size = vocab_size
51
- self.max_position_embeddings = max_position_embeddings
52
- self.hidden_size = hidden_size
53
- self.intermediate_size = intermediate_size
54
- self.num_hidden_layers = num_hidden_layers
55
- self.num_attention_heads = num_attention_heads
56
- self.hidden_act = hidden_act
57
- self.initializer_range = initializer_range
58
- self.rms_norm_eps = rms_norm_eps
59
- self.use_cache = use_cache
60
- super().__init__(
61
- pad_token_id=pad_token_id,
62
- bos_token_id=bos_token_id,
63
- eos_token_id=eos_token_id,
64
- tie_word_embeddings=tie_word_embeddings,
65
- **kwargs,
66
- )
vocab/baichuan/Baichuan-7B/special_tokens_map.json DELETED
@@ -1,23 +0,0 @@
1
- {
2
- "bos_token": {
3
- "content": "<s>",
4
- "lstrip": false,
5
- "normalized": true,
6
- "rstrip": false,
7
- "single_word": false
8
- },
9
- "eos_token": {
10
- "content": "</s>",
11
- "lstrip": false,
12
- "normalized": true,
13
- "rstrip": false,
14
- "single_word": false
15
- },
16
- "unk_token": {
17
- "content": "<unk>",
18
- "lstrip": false,
19
- "normalized": true,
20
- "rstrip": false,
21
- "single_word": false
22
- }
23
- }
vocab/baichuan/Baichuan-7B/tokenization_baichuan.py DELETED
@@ -1,250 +0,0 @@
1
- # coding=utf-8
2
- # Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved.
3
- #
4
- # This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
5
- # and OPT implementations in this library. It has been modified from its
6
- # original forms to accommodate minor architectural differences compared
7
- # to GPT-NeoX and OPT used by the Meta AI team that trained the model.
8
- #
9
- # Licensed under the Apache License, Version 2.0 (the "License");
10
- # you may not use this file except in compliance with the License.
11
- # You may obtain a copy of the License at
12
- #
13
- # http://www.apache.org/licenses/LICENSE-2.0
14
- #
15
- # Unless required by applicable law or agreed to in writing, software
16
- # distributed under the License is distributed on an "AS IS" BASIS,
17
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
18
- # See the License for the specific language governing permissions and
19
- # limitations under the License.
20
-
21
- import os
22
- from shutil import copyfile
23
- from typing import Any, Dict, List, Optional, Tuple
24
-
25
- import sentencepiece as spm
26
-
27
- from transformers.tokenization_utils import AddedToken, PreTrainedTokenizer
28
- from transformers.utils import logging
29
-
30
-
31
- logger = logging.get_logger(__name__)
32
-
33
- VOCAB_FILES_NAMES = {"vocab_file": "tokenizer.model"}
34
-
35
- PRETRAINED_VOCAB_FILES_MAP = {
36
- "vocab_file": {},
37
- "tokenizer_file": {},
38
- }
39
- PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {}
40
-
41
-
42
- class BaiChuanTokenizer(PreTrainedTokenizer):
43
- """
44
- Construct a BaiChuan tokenizer. Based on byte-level Byte-Pair-Encoding.
45
-
46
- Args:
47
- vocab_file (`str`):
48
- Path to the vocabulary file.
49
- """
50
-
51
- vocab_files_names = VOCAB_FILES_NAMES
52
- pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
53
- max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
54
- model_input_names = ["input_ids", "attention_mask"]
55
-
56
- def __init__(
57
- self,
58
- vocab_file,
59
- unk_token="<unk>",
60
- bos_token="<s>",
61
- eos_token="</s>",
62
- pad_token=None,
63
- sp_model_kwargs: Optional[Dict[str, Any]] = None,
64
- add_bos_token=True,
65
- add_eos_token=False,
66
- clean_up_tokenization_spaces=False,
67
- **kwargs,
68
- ):
69
- self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs
70
- bos_token = AddedToken(bos_token, lstrip=False, rstrip=False) if isinstance(bos_token, str) else bos_token
71
- eos_token = AddedToken(eos_token, lstrip=False, rstrip=False) if isinstance(eos_token, str) else eos_token
72
- unk_token = AddedToken(unk_token, lstrip=False, rstrip=False) if isinstance(unk_token, str) else unk_token
73
- pad_token = AddedToken(pad_token, lstrip=False, rstrip=False) if isinstance(pad_token, str) else pad_token
74
- self.vocab_file = vocab_file
75
- self.add_bos_token = add_bos_token
76
- self.add_eos_token = add_eos_token
77
- self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
78
- self.sp_model.Load(vocab_file)
79
- super().__init__(
80
- bos_token=bos_token,
81
- eos_token=eos_token,
82
- unk_token=unk_token,
83
- pad_token=pad_token,
84
- add_bos_token=add_bos_token,
85
- add_eos_token=add_eos_token,
86
- sp_model_kwargs=self.sp_model_kwargs,
87
- clean_up_tokenization_spaces=clean_up_tokenization_spaces,
88
- **kwargs,
89
- )
90
-
91
- def __getstate__(self):
92
- state = self.__dict__.copy()
93
- state["sp_model"] = None
94
- return state
95
-
96
- def __setstate__(self, d):
97
- self.__dict__ = d
98
- self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
99
- self.sp_model.Load(self.vocab_file)
100
-
101
- @property
102
- def vocab_size(self):
103
- """Returns vocab size"""
104
- return self.sp_model.get_piece_size()
105
-
106
- def get_vocab(self):
107
- """Returns vocab as a dict"""
108
- vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)}
109
- vocab.update(self.added_tokens_encoder)
110
- return vocab
111
-
112
- def _tokenize(self, text):
113
- """Returns a tokenized string."""
114
- return self.sp_model.encode(text, out_type=str)
115
-
116
- def _convert_token_to_id(self, token):
117
- """Converts a token (str) in an id using the vocab."""
118
- return self.sp_model.piece_to_id(token)
119
-
120
- def _convert_id_to_token(self, index):
121
- """Converts an index (integer) in a token (str) using the vocab."""
122
- token = self.sp_model.IdToPiece(index)
123
- return token
124
-
125
- def convert_tokens_to_string(self, tokens):
126
- """Converts a sequence of tokens (string) in a single string."""
127
- current_sub_tokens = []
128
- out_string = ""
129
- prev_is_special = False
130
- for i, token in enumerate(tokens):
131
- # make sure that special tokens are not decoded using sentencepiece model
132
- if token in self.all_special_tokens:
133
- if not prev_is_special and i != 0:
134
- out_string += " "
135
- out_string += self.sp_model.decode(current_sub_tokens) + token
136
- prev_is_special = True
137
- current_sub_tokens = []
138
- else:
139
- current_sub_tokens.append(token)
140
- prev_is_special = False
141
- out_string += self.sp_model.decode(current_sub_tokens)
142
- return out_string
143
-
144
- def save_vocabulary(self, save_directory, filename_prefix: Optional[str] = None) -> Tuple[str]:
145
- """
146
- Save the vocabulary and special tokens file to a directory.
147
-
148
- Args:
149
- save_directory (`str`):
150
- The directory in which to save the vocabulary.
151
-
152
- Returns:
153
- `Tuple(str)`: Paths to the files saved.
154
- """
155
- if not os.path.isdir(save_directory):
156
- logger.error(f"Vocabulary path ({save_directory}) should be a directory")
157
- return
158
- out_vocab_file = os.path.join(
159
- save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
160
- )
161
-
162
- if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file) and os.path.isfile(self.vocab_file):
163
- copyfile(self.vocab_file, out_vocab_file)
164
- elif not os.path.isfile(self.vocab_file):
165
- with open(out_vocab_file, "wb") as fi:
166
- content_spiece_model = self.sp_model.serialized_model_proto()
167
- fi.write(content_spiece_model)
168
-
169
- return (out_vocab_file,)
170
-
171
- def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
172
- bos_token_id = [self.bos_token_id] if self.add_bos_token else []
173
- eos_token_id = [self.eos_token_id] if self.add_eos_token else []
174
-
175
- output = bos_token_id + token_ids_0 + eos_token_id
176
-
177
- if token_ids_1 is not None:
178
- output = output + bos_token_id + token_ids_1 + eos_token_id
179
-
180
- return output
181
-
182
- def get_special_tokens_mask(
183
- self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
184
- ) -> List[int]:
185
- """
186
- Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
187
- special tokens using the tokenizer `prepare_for_model` method.
188
-
189
- Args:
190
- token_ids_0 (`List[int]`):
191
- List of IDs.
192
- token_ids_1 (`List[int]`, *optional*):
193
- Optional second list of IDs for sequence pairs.
194
- already_has_special_tokens (`bool`, *optional*, defaults to `False`):
195
- Whether or not the token list is already formatted with special tokens for the model.
196
-
197
- Returns:
198
- `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
199
- """
200
- if already_has_special_tokens:
201
- return super().get_special_tokens_mask(
202
- token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
203
- )
204
-
205
- bos_token_id = [1] if self.add_bos_token else []
206
- eos_token_id = [1] if self.add_eos_token else []
207
-
208
- if token_ids_1 is None:
209
- return bos_token_id + ([0] * len(token_ids_0)) + eos_token_id
210
- return (
211
- bos_token_id
212
- + ([0] * len(token_ids_0))
213
- + eos_token_id
214
- + bos_token_id
215
- + ([0] * len(token_ids_1))
216
- + eos_token_id
217
- )
218
-
219
- def create_token_type_ids_from_sequences(
220
- self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
221
- ) -> List[int]:
222
- """
223
- Creates a mask from the two sequences passed to be used in a sequence-pair classification task. An ALBERT
224
- sequence pair mask has the following format:
225
-
226
- ```
227
- 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
228
- | first sequence | second sequence |
229
- ```
230
-
231
- if token_ids_1 is None, only returns the first portion of the mask (0s).
232
-
233
- Args:
234
- token_ids_0 (`List[int]`):
235
- List of ids.
236
- token_ids_1 (`List[int]`, *optional*):
237
- Optional second list of IDs for sequence pairs.
238
-
239
- Returns:
240
- `List[int]`: List of [token type IDs](../glossary#token-type-ids) according to the given sequence(s).
241
- """
242
- bos_token_id = [self.bos_token_id] if self.add_bos_token else []
243
- eos_token_id = [self.eos_token_id] if self.add_eos_token else []
244
-
245
- output = [0] * len(bos_token_id + token_ids_0 + eos_token_id)
246
-
247
- if token_ids_1 is not None:
248
- output += [1] * len(bos_token_id + token_ids_1 + eos_token_id)
249
-
250
- return output
vocab/baichuan/Baichuan-7B/tokenizer.model DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:4be54af290d93c113bcbf421115ae9eed9d6340408f564898f1e966dc738ef01
3
- size 1136699
vocab/baichuan/Baichuan-7B/tokenizer_config.json DELETED
@@ -1,35 +0,0 @@
1
- {
2
- "auto_map": {
3
- "AutoTokenizer": ["tokenization_baichuan.BaiChuanTokenizer", null]
4
- },
5
- "add_bos_token": false,
6
- "add_eos_token": false,
7
- "bos_token": {
8
- "__type": "AddedToken",
9
- "content": "<s>",
10
- "lstrip": false,
11
- "normalized": true,
12
- "rstrip": false,
13
- "single_word": false
14
- },
15
- "clean_up_tokenization_spaces": false,
16
- "eos_token": {
17
- "__type": "AddedToken",
18
- "content": "</s>",
19
- "lstrip": false,
20
- "normalized": true,
21
- "rstrip": false,
22
- "single_word": false
23
- },
24
- "model_max_length": 1000000000000000019884624838656,
25
- "sp_model_kwargs": {},
26
- "tokenizer_class": "BaiChuanTokenizer",
27
- "unk_token": {
28
- "__type": "AddedToken",
29
- "content": "<unk>",
30
- "lstrip": false,
31
- "normalized": true,
32
- "rstrip": false,
33
- "single_word": false
34
- }
35
- }
vocab/baichuan/__init__.py DELETED
@@ -1,19 +0,0 @@
1
- import os
2
- import config
3
- from transformers import AutoTokenizer
4
- from vocab import TokenizerType
5
-
6
-
7
- if config.USE_REMOTE:
8
- tokenizer = AutoTokenizer.from_pretrained("baichuan-inc/Baichuan-7B", trust_remote_code=True)
9
- else:
10
- CURRENT_DIR = os.path.dirname(os.path.abspath(__file__))
11
- TOKENIZER_DIR = os.path.join(CURRENT_DIR, "Baichuan-7B")
12
- tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_DIR, trust_remote_code=True)
13
-
14
-
15
- # byte-bpe sentencepiece
16
- tokenizer.type = TokenizerType.ByteBPE
17
-
18
- tokenizer.comments = "使用 SentencePiece 中的 Byte-Pair Encoding (BPE) 作为分词算法"
19
vocab/baichuan/demo.py DELETED
@@ -1,6 +0,0 @@
1
-
2
- from vocab.baichuan import tokenizer
3
-
4
- id1 = tokenizer.encode("<pad>")
5
- token1 = tokenizer.decode(125696)
6
vocab/baichuan/error.md DELETED
@@ -1,8 +0,0 @@
1
-
2
-
3
- ## AttributeError: 'BaichuanTokenizer' object has no attribute 'sp_model'
4
-
5
- https://huggingface.co/baichuan-inc/Baichuan2-13B-Chat/discussions/18
6
-
7
-
8
- transfomers 4.34 doesn't work for me either. Degrading to 4.33.1 works in my case
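The deleted note above matches a known regression: starting with transformers 4.34, tokenizer initialization calls vocab-related methods before `BaichuanTokenizer` has set `sp_model`. A minimal, illustrative version guard (a sketch under that assumption; `check_baichuan_compat` is a hypothetical helper, not part of this repo):

```python
import transformers
from packaging import version  # packaging ships as a transformers dependency


def check_baichuan_compat() -> None:
    """Warn before loading a Baichuan tokenizer on a transformers version
    known to raise "'BaichuanTokenizer' object has no attribute 'sp_model'"."""
    v = version.parse(transformers.__version__)
    if v >= version.parse("4.34.0"):
        print(f"warning: transformers {v} may hit the sp_model AttributeError; "
              "the note above reports that downgrading to 4.33.1 works")


check_baichuan_compat()
```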