Spaces:
Running
Running
remove vocabs; update compression_app; add character_app;
Browse filesThis view is limited to 50 files because it contains too many changes.
See raw diff
- .gitattributes +0 -10
- .gitignore +5 -1
- vocab/README.md → README.2.md +3 -1
- README.md +16 -1
- app.py +7 -5
- character_app.py +80 -0
- character_util.py +213 -0
- app_compression.py → compression_app.py +42 -40
- utils/compression_util.py → compression_util.py +151 -61
- config.py +0 -20
- patcher/README.md +15 -0
- patcher/sptokenizer_patch_deprecated.py +0 -105
- patcher/sptokenizer_wrapper.py +0 -61
- patcher/tiktoken_patch.py +2 -2
- app_playground.py → playground_app.py +34 -19
- examples.py → playground_examples.py +9 -9
- util.py → playground_util.py +39 -35
- requirements.txt +3 -1
- stats/character_stats.json +1712 -0
- stats/compress_rate.json +0 -4286
- stats/compression_rate.json +0 -0
- utils/byte_util.py +0 -0
- utils/character_util.py +0 -231
- utils/convert_sp_to_json.py +0 -4
- utils/fn_util.py +0 -0
- utils/lang_util.py +26 -30
- utils/lang_util_2.py +0 -115
- utils/oov.md +202 -0
- utils/oov_util.py +109 -3
- utils/speed_util.py +0 -9
- utils/symbol.py +0 -35
- utils/text_util.py +12 -1
- utils/vocab.jd.txt.v2 +0 -10268
- vocab.py +453 -0
- vocab/Intern_gpt/README.md +0 -0
- vocab/__init__.py +0 -260
- vocab/_alpaca_7b/README.md +0 -0
- vocab/_goat/README.md +0 -0
- vocab/_goat/__init__.py +0 -0
- vocab/albert/__init__.py +0 -6
- vocab/aya_101/__init__.py +0 -5
- vocab/baichuan/Baichuan-7B/config.json +0 -26
- vocab/baichuan/Baichuan-7B/configuration_baichuan.py +0 -66
- vocab/baichuan/Baichuan-7B/special_tokens_map.json +0 -23
- vocab/baichuan/Baichuan-7B/tokenization_baichuan.py +0 -250
- vocab/baichuan/Baichuan-7B/tokenizer.model +0 -3
- vocab/baichuan/Baichuan-7B/tokenizer_config.json +0 -35
- vocab/baichuan/__init__.py +0 -19
- vocab/baichuan/demo.py +0 -6
- vocab/baichuan/error.md +0 -8
.gitattributes
CHANGED
@@ -33,13 +33,3 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
36 |
-
vocab/belle_7b_2m/belle-7b-2m/tokenizer.json filter=lfs diff=lfs merge=lfs -text
|
37 |
-
vocab/bloom/tokenizer/tokenizer.json filter=lfs diff=lfs merge=lfs -text
|
38 |
-
vocab/gemma_7b/gemma-7b/tokenizer.model filter=lfs diff=lfs merge=lfs -text
|
39 |
-
vocab/gemma_7b/gemma-7b/tokenizer.json filter=lfs diff=lfs merge=lfs -text
|
40 |
-
vocab/grok_1/tokenizer.model filter=lfs diff=lfs merge=lfs -text
|
41 |
-
vocab/llama3/Meta-Llama-3-70B/tokenizer.json filter=lfs diff=lfs merge=lfs -text
|
42 |
-
vocab/mistral_7b/Mistral-7B-v0.1/tokenizer.json filter=lfs diff=lfs merge=lfs -text
|
43 |
-
vocab/mistral_7b/Mistral-7B-v0.1/tokenizer.model filter=lfs diff=lfs merge=lfs -text
|
44 |
-
vocab/mixtral_8_7b/Mixtral-8x7B-v0.1/tokenizer.json filter=lfs diff=lfs merge=lfs -text
|
45 |
-
vocab/mixtral_8_7b/Mixtral-8x7B-v0.1/tokenizer.model filter=lfs diff=lfs merge=lfs -text
|
|
|
33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
.gitignore
CHANGED
@@ -14,4 +14,8 @@ downloads/
|
|
14 |
eggs/
|
15 |
.eggs/
|
16 |
.idea/
|
17 |
-
gradio_cached_examples
|
|
|
|
|
|
|
|
|
|
14 |
eggs/
|
15 |
.eggs/
|
16 |
.idea/
|
17 |
+
gradio_cached_examples
|
18 |
+
stats/
|
19 |
+
test/
|
20 |
+
wip/
|
21 |
+
tools/
|
vocab/README.md → README.2.md
RENAMED
@@ -67,7 +67,7 @@ carol
|
|
67 |
```
|
68 |
|
69 |
|
70 |
-
##
|
71 |
|
72 |
https://github.com/pytorch/fairseq/blob/master/tests/test_noising.py#L37
|
73 |
|
@@ -77,6 +77,8 @@ https://github.com/pytorch/fairseq/blob/master/tests/test_noising.py#L37
|
|
77 |
|
78 |
跟BERT类似,只不过BERT是词后缀,这里是词前缀。
|
79 |
|
|
|
|
|
80 |
|
81 |
## GPT2
|
82 |
|
|
|
67 |
```
|
68 |
|
69 |
|
70 |
+
## @@
|
71 |
|
72 |
https://github.com/pytorch/fairseq/blob/master/tests/test_noising.py#L37
|
73 |
|
|
|
77 |
|
78 |
跟BERT类似,只不过BERT是词后缀,这里是词前缀。
|
79 |
|
80 |
+
这种应该是 https://github.com/rsennrich/subword-nmt
|
81 |
+
|
82 |
|
83 |
## GPT2
|
84 |
|
README.md
CHANGED
@@ -7,6 +7,8 @@ sdk: gradio
|
|
7 |
sdk_version: 4.28.3
|
8 |
app_file: app.py
|
9 |
pinned: false
|
|
|
|
|
10 |
---
|
11 |
|
12 |
|
@@ -210,4 +212,17 @@ python utils/compress_rate_util.py
|
|
210 |
|
211 |
- Getting the most out of your tokenizer for pre-training and domain adaptation
|
212 |
- Efficient and Effective Text Encoding for Chinese LLaMA and Alpaca
|
213 |
-
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
7 |
sdk_version: 4.28.3
|
8 |
app_file: app.py
|
9 |
pinned: false
|
10 |
+
datasets:
|
11 |
+
- cc100
|
12 |
---
|
13 |
|
14 |
|
|
|
212 |
|
213 |
- Getting the most out of your tokenizer for pre-training and domain adaptation
|
214 |
- Efficient and Effective Text Encoding for Chinese LLaMA and Alpaca
|
215 |
+
- blog
|
216 |
+
- https://help.openai.com/en/articles/4936856-what-are-tokens-and-how-to-count-them
|
217 |
+
- https://huggingface.co/docs/transformers/tokenizer_summary#sentencepiece
|
218 |
+
- https://www.huaxiaozhuan.com/%E5%B7%A5%E5%85%B7/huggingface_transformer/chapters/1_tokenizer.html
|
219 |
+
- https://zhuanlan.zhihu.com/p/652520262
|
220 |
+
- https://github.com/QwenLM/Qwen/blob/main/tokenization_note_zh.md
|
221 |
+
- demo
|
222 |
+
- https://huggingface.co/spaces/Xenova/the-tokenizer-playground
|
223 |
+
- https://github.com/dqbd/tiktokenizer
|
224 |
+
- https://chat.lmsys.org/?leaderboard
|
225 |
+
- https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard
|
226 |
+
- paper
|
227 |
+
- ss
|
228 |
+
-
|
app.py
CHANGED
@@ -1,16 +1,18 @@
|
|
1 |
|
2 |
-
import
|
3 |
-
from
|
4 |
-
from
|
5 |
from patcher.gr_interface import TabbedInterface
|
6 |
|
7 |
|
8 |
demo = TabbedInterface(
|
9 |
-
[
|
10 |
-
[" ⚔️ Playground", "🏆 Compression Leaderboard",], # 编码速度,解码速度,字符分类(zh、num等,支持正则),支持的语言,机构,。
|
11 |
title='<div align="center">Tokenizer Arena ⚔️</div>',
|
12 |
css="css/style.css"
|
13 |
)
|
14 |
|
|
|
|
|
15 |
if __name__ == "__main__":
|
16 |
demo.launch()
|
|
|
1 |
|
2 |
+
from playground_app import demo as playground_tab
|
3 |
+
from compression_app import demo as compression_tab
|
4 |
+
from character_app import demo as character_tab
|
5 |
from patcher.gr_interface import TabbedInterface
|
6 |
|
7 |
|
8 |
demo = TabbedInterface(
|
9 |
+
[playground_tab, compression_tab, character_tab],
|
10 |
+
[" ⚔️ Playground", "🏆 Compression Leaderboard", "📊 Character Statistics"], # 编码速度,解码速度,字符分类(zh、num等,支持正则),支持的语言,机构,。
|
11 |
title='<div align="center">Tokenizer Arena ⚔️</div>',
|
12 |
css="css/style.css"
|
13 |
)
|
14 |
|
15 |
+
demo.load(js=open("js/onload.js", "r", encoding="utf-8").read())
|
16 |
+
|
17 |
if __name__ == "__main__":
|
18 |
demo.launch()
|
character_app.py
ADDED
@@ -0,0 +1,80 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import gradio as gr
|
2 |
+
from character_util import get_character_table
|
3 |
+
|
4 |
+
all_columns = [
|
5 |
+
("digit", "digit"),
|
6 |
+
("space", "space"),
|
7 |
+
("lang-chinese", 'zh'),
|
8 |
+
("lang-korea", 'ko'),
|
9 |
+
("lang-japanese", 'ja'),
|
10 |
+
# ("byte", "byte"),
|
11 |
+
# ("oov", "oov")
|
12 |
+
]
|
13 |
+
default_columns = ["digit", "zh"]
|
14 |
+
|
15 |
+
# columns = ["lang-zh", "lang-korea", "lang-japanese", "number", "space", "bytes", "oov"]
|
16 |
+
|
17 |
+
abbr2name = {column[1]: column[0].split('-')[-1] for column in all_columns}
|
18 |
+
|
19 |
+
|
20 |
+
def get_column_info(columns):
|
21 |
+
print(columns)
|
22 |
+
markdown = ""
|
23 |
+
for column in columns:
|
24 |
+
markdown += f"- `num({column})`: num of tokens containing {abbr2name[column]} characters\n" \
|
25 |
+
f"- `len({column})`: `min,median,max` length of tokens containing {abbr2name[column]} characters\n"
|
26 |
+
return markdown
|
27 |
+
|
28 |
+
|
29 |
+
with gr.Blocks() as demo:
|
30 |
+
gr.Markdown("## 🛠️ Setting") # ⚙
|
31 |
+
with gr.Accordion("Please select the type of character you want to count.", open=True):
|
32 |
+
# file size 💽 🖴, tokens 🧮
|
33 |
+
with gr.Row():
|
34 |
+
with gr.Column():
|
35 |
+
columns = gr.Checkboxgroup(
|
36 |
+
all_columns,
|
37 |
+
value=default_columns,
|
38 |
+
label="character type",
|
39 |
+
# info=""
|
40 |
+
)
|
41 |
+
gr.Markdown(
|
42 |
+
"To count other types of characters, you can modify [character_util.py]"
|
43 |
+
"(https://huggingface.co/spaces/eson/tokenizer-arena/blob/main/character_util.py). "
|
44 |
+
)
|
45 |
+
column_info = gr.Markdown(
|
46 |
+
get_column_info(default_columns)
|
47 |
+
)
|
48 |
+
|
49 |
+
gr.Markdown("## 📊 Character Statistics")
|
50 |
+
search_bar = gr.Textbox(
|
51 |
+
placeholder="🔍 Search by tokenizer or organization (e.g., 'llama', 'openai') and press ENTER...",
|
52 |
+
show_label=False,
|
53 |
+
elem_id="search-bar",
|
54 |
+
)
|
55 |
+
compress_rate_table = gr.Dataframe(datatype="html", wrap=True)
|
56 |
+
|
57 |
+
search_bar.submit(
|
58 |
+
get_character_table,
|
59 |
+
inputs=[search_bar, columns],
|
60 |
+
outputs=compress_rate_table
|
61 |
+
)
|
62 |
+
columns.change(
|
63 |
+
get_character_table,
|
64 |
+
inputs=[search_bar, columns],
|
65 |
+
outputs=compress_rate_table
|
66 |
+
)
|
67 |
+
columns.change(
|
68 |
+
get_column_info,
|
69 |
+
inputs=[columns],
|
70 |
+
outputs=column_info
|
71 |
+
)
|
72 |
+
|
73 |
+
demo.load(
|
74 |
+
get_character_table,
|
75 |
+
inputs=[search_bar, columns],
|
76 |
+
outputs=compress_rate_table
|
77 |
+
)
|
78 |
+
|
79 |
+
if __name__ == "__main__":
|
80 |
+
demo.launch()
|
character_util.py
ADDED
@@ -0,0 +1,213 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""
|
2 |
+
TODO:
|
3 |
+
1. 繁体、简体、语种、
|
4 |
+
2. 确认 bert的space token数目
|
5 |
+
3. add token_impl
|
6 |
+
4.
|
7 |
+
"""
|
8 |
+
import os
|
9 |
+
import json
|
10 |
+
import numpy as np
|
11 |
+
import pandas as pd
|
12 |
+
from collections import Counter, defaultdict
|
13 |
+
from vocab import tokenizer_factory
|
14 |
+
from typing import Optional, Union, Literal
|
15 |
+
from utils.log_util import logger
|
16 |
+
from utils.text_util import contains_digit, get_space_count
|
17 |
+
from utils.lang_util import detect_language, language_ranges
|
18 |
+
|
19 |
+
CURRENT_DIR = os.path.dirname(os.path.abspath(__file__))
|
20 |
+
|
21 |
+
|
22 |
+
def _to_unicode(text):
|
23 |
+
return ''.join(r'\u{:04X}'.format(ord(chr)) for chr in text)
|
24 |
+
|
25 |
+
|
26 |
+
def _get_coding_length(tokenizer, vocab, filter=None):
|
27 |
+
"""
|
28 |
+
oov character may be tokenized into more than one token.
|
29 |
+
"""
|
30 |
+
all_length = []
|
31 |
+
for word in vocab:
|
32 |
+
if len(word) > 1:
|
33 |
+
continue
|
34 |
+
if filter is not None and filter(word):
|
35 |
+
continue
|
36 |
+
try:
|
37 |
+
tokens = tokenizer.encode(word)
|
38 |
+
except Exception as e:
|
39 |
+
print(e)
|
40 |
+
|
41 |
+
all_length.append(len(tokens))
|
42 |
+
# if len(tokens.ids) > 1:
|
43 |
+
# if len(tokens) > 3:
|
44 |
+
# print(word, tokens)
|
45 |
+
|
46 |
+
dist_length = Counter(all_length)
|
47 |
+
mean_length = round(sum(all_length) / len(all_length), 2)
|
48 |
+
return dist_length, mean_length
|
49 |
+
|
50 |
+
|
51 |
+
cache = {}
|
52 |
+
|
53 |
+
|
54 |
+
def _dist(token_lens):
|
55 |
+
"""
|
56 |
+
:param token_lens:
|
57 |
+
:return: min,median,max of token_lens
|
58 |
+
"""
|
59 |
+
if not token_lens:
|
60 |
+
return "-"
|
61 |
+
return f"{min(token_lens)},{round(np.median(token_lens))},{max(token_lens)}"
|
62 |
+
|
63 |
+
|
64 |
+
def iter_vocab(
|
65 |
+
tokenizer_name: str,
|
66 |
+
from_cache: bool = True,
|
67 |
+
cache_dir: str = "stats",
|
68 |
+
) -> Union[pd.DataFrame, dict]:
|
69 |
+
"""
|
70 |
+
:param tokenizer_name:
|
71 |
+
:param from_cache:
|
72 |
+
:param cache_dir:
|
73 |
+
:return:
|
74 |
+
"""
|
75 |
+
tokenizer_config = tokenizer_factory.get_tokenizer_config(tokenizer_name)
|
76 |
+
|
77 |
+
cache_dir = os.path.join(CURRENT_DIR, cache_dir)
|
78 |
+
os.makedirs(cache_dir, exist_ok=True)
|
79 |
+
|
80 |
+
# load from cache
|
81 |
+
cache_path = os.path.join(cache_dir, "character_stats.json")
|
82 |
+
if not cache and os.path.exists(cache_path):
|
83 |
+
with open(cache_path, "r", encoding="utf-8") as f_tmp:
|
84 |
+
cache.update(json.load(f_tmp))
|
85 |
+
if from_cache and tokenizer_name in cache:
|
86 |
+
logger.info(f"load {tokenizer_config.name_or_path} from cache")
|
87 |
+
return cache[tokenizer_name]
|
88 |
+
|
89 |
+
tokenizer = tokenizer_factory.get_tokenizer(tokenizer_name)
|
90 |
+
|
91 |
+
tokens_by_lang = {lang[1]: [] for lang in language_ranges.keys()}
|
92 |
+
digit_tokens = []
|
93 |
+
space_tokens = []
|
94 |
+
byte_tokens = []
|
95 |
+
|
96 |
+
buffer = []
|
97 |
+
for token_id in range(tokenizer.vocab_size):
|
98 |
+
# for token_id in tokenizer.get_vocab():
|
99 |
+
# for token_id in range(len(tokenizer)):
|
100 |
+
decode_str = tokenizer.decode([token_id], skip_special_tokens=False)
|
101 |
+
token = tokenizer.convert_ids_to_tokens([token_id], skip_special_tokens=False)[0]
|
102 |
+
tags = []
|
103 |
+
if token is None: # 有些词典有空的id(不连续)
|
104 |
+
continue
|
105 |
+
if isinstance(token, bytes):
|
106 |
+
token = token.decode("utf-8", errors="ignore")
|
107 |
+
|
108 |
+
if hasattr(tokenizer, "sp_model"): # 基于 sentencepiece 包
|
109 |
+
if tokenizer.sp_model.is_byte(token_id):
|
110 |
+
tags.append("is_byte")
|
111 |
+
byte_tokens.append(token)
|
112 |
+
|
113 |
+
language_tags = detect_language(decode_str)
|
114 |
+
for language in language_tags:
|
115 |
+
tokens_by_lang[language[1]].append(decode_str)
|
116 |
+
|
117 |
+
if contains_digit(decode_str):
|
118 |
+
tags.append("digit")
|
119 |
+
digit_tokens.append(decode_str)
|
120 |
+
|
121 |
+
space_count = get_space_count(decode_str)
|
122 |
+
if space_count > 0:
|
123 |
+
space_tokens.append(decode_str)
|
124 |
+
|
125 |
+
buffer.append(json.dumps(
|
126 |
+
{
|
127 |
+
"id": token_id,
|
128 |
+
"token": token,
|
129 |
+
"token_decode": decode_str,
|
130 |
+
"token_dumps": json.dumps(token),
|
131 |
+
"token_unicode": _to_unicode(token),
|
132 |
+
"token_len": len(decode_str),
|
133 |
+
},
|
134 |
+
ensure_ascii=False) + "\n")
|
135 |
+
|
136 |
+
result = {
|
137 |
+
"tokenizer": tokenizer_factory.get_name_with_hyperlink(tokenizer_name),
|
138 |
+
"organization": tokenizer_config.org,
|
139 |
+
# "impl": str(tokenizer.__class__),
|
140 |
+
# "vocab_size-": tokenizer.vocab_size, # vocab_size_without_added_token
|
141 |
+
"vocab_size": len(tokenizer),
|
142 |
+
|
143 |
+
# "中文汉字编码长度均值": mean_length, # 不用统计,因为字典包含中文字符多,一般就意味着 中文汉字编码长度短。
|
144 |
+
# "中文汉字编码长度分布": json.dumps(dist_length),
|
145 |
+
|
146 |
+
"num(digit)": len(digit_tokens),
|
147 |
+
"len(digit)": _dist([len(token) for token in digit_tokens]),
|
148 |
+
"num(space)": len(space_tokens),
|
149 |
+
"len(space)": _dist([len(token) for token in space_tokens]),
|
150 |
+
|
151 |
+
# "num(byte)": len(byte_tokens)
|
152 |
+
}
|
153 |
+
|
154 |
+
for lang, tokens in tokens_by_lang.items():
|
155 |
+
result[f"num({lang})"] = len(tokens)
|
156 |
+
result["len(" + lang + ")"] = _dist([len(token) for token in tokens])
|
157 |
+
|
158 |
+
out_path = os.path.join(cache_dir, f"iter_vocab/{tokenizer_name.replace('/', '_')}.vocab.jsonl")
|
159 |
+
with open(out_path, "w", encoding="utf-8") as f_out:
|
160 |
+
for line in buffer:
|
161 |
+
f_out.write(line)
|
162 |
+
len_before = len(cache)
|
163 |
+
cache[tokenizer_name] = result
|
164 |
+
len_after = len(cache)
|
165 |
+
logger.info(f"saving {tokenizer_name} to memory and file cache: {len_before}->{len_after}")
|
166 |
+
with open(cache_path, "w", encoding="utf-8") as f_out:
|
167 |
+
f_out.write(json.dumps(cache, ensure_ascii=False, indent=2))
|
168 |
+
return result
|
169 |
+
|
170 |
+
|
171 |
+
def to_dataframe(stats, columns):
|
172 |
+
table = []
|
173 |
+
for stat in stats.values():
|
174 |
+
filtered_stat = {}
|
175 |
+
for k, v in stat.items():
|
176 |
+
if not k.startswith("num") and not k.startswith("len"):
|
177 |
+
filtered_stat[k] = v
|
178 |
+
if any(column in k for column in columns):
|
179 |
+
k = k.replace("ja-kana", "kana")
|
180 |
+
filtered_stat[k] = v
|
181 |
+
table.append(filtered_stat)
|
182 |
+
df = pd.DataFrame(table)
|
183 |
+
return df
|
184 |
+
|
185 |
+
|
186 |
+
def get_character_table(
|
187 |
+
tokenizer_filter: Optional[str] = None,
|
188 |
+
columns: Optional[str] = None,
|
189 |
+
return_type: Optional[Literal["dict", "dataframe"]] = "dataframe"
|
190 |
+
) -> Union[pd.DataFrame, dict]:
|
191 |
+
"""
|
192 |
+
"""
|
193 |
+
logger.info(f"columns: {columns}, tokenizer_filter: {tokenizer_filter}")
|
194 |
+
stats = {}
|
195 |
+
if tokenizer_filter is not None:
|
196 |
+
tokenizer_names = [tokenizer_config.name_or_path for tokenizer_config in tokenizer_factory.all_tokenizer_configs
|
197 |
+
if tokenizer_filter.lower() in tokenizer_config.name_or_path.lower()]
|
198 |
+
else:
|
199 |
+
tokenizer_names = tokenizer_factory.all_tokenizer_names
|
200 |
+
|
201 |
+
for tokenizer_name in tokenizer_names:
|
202 |
+
stat = iter_vocab(tokenizer_name)
|
203 |
+
stats[tokenizer_name] = stat
|
204 |
+
|
205 |
+
if return_type == "dataframe":
|
206 |
+
stats = to_dataframe(stats, columns)
|
207 |
+
return stats
|
208 |
+
|
209 |
+
|
210 |
+
if __name__ == "__main__":
|
211 |
+
# aa = get_character_table(tokenizer_filter="baichuan")
|
212 |
+
df = get_character_table()
|
213 |
+
logger.info(f"\n{df.to_markdown(index=False)}")
|
app_compression.py → compression_app.py
RENAMED
@@ -1,6 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
import gradio as gr
|
2 |
-
from
|
3 |
-
|
4 |
|
5 |
with gr.Blocks() as demo:
|
6 |
# gr.Markdown("## Convertor")
|
@@ -44,63 +52,56 @@ with gr.Blocks() as demo:
|
|
44 |
# )
|
45 |
|
46 |
gr.Markdown("## 🛠️ Setting") # ⚙
|
47 |
-
with gr.Accordion("Please select corpus and measure of compression rate
|
48 |
# file size 💽 🖴, tokens 🧮
|
49 |
-
#
|
50 |
-
# "Please select corpus and measure of compression rate.\n"
|
51 |
-
#"`num_of_trillion_tokens` `num_of_billion_tokens`\n"
|
52 |
-
# "- `b_tokens/g_bytes` measures how many billion tokens per gigabytes corpus. \n"
|
53 |
-
# "- `t_tokens/t_bytes` measures how many trillion tokens per terabytes corpus. \n"
|
54 |
-
# "- `n_chars/n_tokens` measures how many chars per token in the current corpus. \n\n"
|
55 |
-
# "All the above measures are depend on corpus. You can reproduce this "
|
56 |
-
# "procedure at [github](https://github.com/xu-song/tokenizer-arena/)."
|
57 |
-
# )
|
58 |
-
|
59 |
with gr.Row():
|
60 |
-
|
61 |
-
|
62 |
-
|
63 |
-
|
64 |
-
|
65 |
-
|
66 |
-
|
|
|
67 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
68 |
|
69 |
-
|
70 |
-
|
71 |
-
|
72 |
-
|
73 |
-
|
74 |
-
|
75 |
-
|
|
|
|
|
76 |
)
|
77 |
|
78 |
-
gr.Markdown(
|
79 |
-
# "`num_of_trillion_tokens` `num_of_billion_tokens`\n"
|
80 |
-
"- `b_tokens/g_bytes` measures how many billion tokens per gigabytes corpus. \n"
|
81 |
-
"- `t_tokens/t_bytes` measures how many trillion tokens per terabytes corpus. \n"
|
82 |
-
"- `n_chars/n_tokens` measures how many chars per token in the tokenized corpus. \n"
|
83 |
-
# "\nAll the above measures are depend on corpus. You can reproduce this "
|
84 |
-
# "procedure at [github](https://github.com/xu-song/tokenizer-arena/)."
|
85 |
-
)
|
86 |
-
|
87 |
gr.Markdown("## 🏆 Compression Rate Leaderboard")
|
88 |
search_bar = gr.Textbox(
|
89 |
-
placeholder="🔍 Search
|
90 |
show_label=False,
|
91 |
elem_id="search-bar",
|
92 |
)
|
93 |
-
compress_rate_table = gr.Dataframe()
|
94 |
|
95 |
# func call
|
96 |
compress_rate_corpus.change(
|
97 |
get_compression_leaderboard,
|
98 |
-
inputs=[compress_rate_corpus, compress_rate_unit],
|
99 |
outputs=compress_rate_table
|
100 |
)
|
101 |
compress_rate_unit.change(
|
102 |
get_compression_leaderboard,
|
103 |
-
inputs=[compress_rate_corpus, compress_rate_unit],
|
104 |
outputs=compress_rate_table
|
105 |
)
|
106 |
# file_size.change(
|
@@ -123,5 +124,6 @@ with gr.Blocks() as demo:
|
|
123 |
inputs=[compress_rate_corpus, compress_rate_unit],
|
124 |
outputs=compress_rate_table
|
125 |
)
|
|
|
126 |
if __name__ == "__main__":
|
127 |
demo.launch()
|
|
|
1 |
+
"""
|
2 |
+
TODO:
|
3 |
+
- 统计 tokenizer_impl
|
4 |
+
- 统计 OOV
|
5 |
+
- 统计 reversal
|
6 |
+
- 增加 math,code
|
7 |
+
"""
|
8 |
+
|
9 |
import gradio as gr
|
10 |
+
from compression_util import get_compression_leaderboard, common_corpuses
|
11 |
+
|
12 |
|
13 |
with gr.Blocks() as demo:
|
14 |
# gr.Markdown("## Convertor")
|
|
|
52 |
# )
|
53 |
|
54 |
gr.Markdown("## 🛠️ Setting") # ⚙
|
55 |
+
with gr.Accordion("Please select the corpus and measure of compression rate.", open=True):
|
56 |
# file size 💽 🖴, tokens 🧮
|
57 |
+
# Total amount of disk used
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
58 |
with gr.Row():
|
59 |
+
with gr.Column():
|
60 |
+
compress_rate_corpus = gr.Dropdown(
|
61 |
+
common_corpuses, # , "code"
|
62 |
+
value=["cc100/en", "cc100/zh-Hans", "cc100/fr", "cc100/es"],
|
63 |
+
label="corpus",
|
64 |
+
multiselect=True
|
65 |
+
# info=""
|
66 |
+
)
|
67 |
|
68 |
+
# unit of file_size: gigabyte terabyte
|
69 |
+
# unit of token_num: million billion trillion
|
70 |
+
# The most common units of measurement include length (meter, inch, foot), weight (gram, kilogram, pound), volume (liter, gallon, milliliter), time (second, minute, hour)
|
71 |
+
compress_rate_unit = gr.Radio(
|
72 |
+
["b_tokens/g_bytes", "t_tokens/t_bytes"],
|
73 |
+
value="b_tokens/g_bytes",
|
74 |
+
label="measure", # evaluation metric
|
75 |
+
)
|
76 |
|
77 |
+
gr.Markdown(
|
78 |
+
"- `corpus`: tokenization is performed on the selected subsets of [cc100](https://huggingface.co/datasets/cc100) corpus.\n"
|
79 |
+
"- `b_tokens/g_bytes` measures how many billion tokens per gigabytes corpus.\n"
|
80 |
+
"- `t_tokens/t_bytes` measures how many trillion tokens per terabytes corpus.\n"
|
81 |
+
# "- `g_bytes/b_tokens` measures how many gigabytes corpus per billion tokens.\n"
|
82 |
+
# "- `t_bytes/t_tokens` measures how many terabytes corpus per trillion tokens.\n"
|
83 |
+
"- `char/token` measures how many chars per token on the tokenized corpus.\n"
|
84 |
+
"- `oov_ratio`: out-of-vocabulary ratio on the selected corpus. 👉 get [oov charset](https://huggingface.co/spaces/eson/tokenizer-arena/blob/main/stats/compression_rate.json)\n\n"
|
85 |
+
"You can reproduce this procedure with [compression_util.py](https://huggingface.co/spaces/eson/tokenizer-arena/blob/main/compression_util.py)."
|
86 |
)
|
87 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
88 |
gr.Markdown("## 🏆 Compression Rate Leaderboard")
|
89 |
search_bar = gr.Textbox(
|
90 |
+
placeholder="🔍 Search by tokenizer or organization (e.g., 'llama', 'openai') and press ENTER...",
|
91 |
show_label=False,
|
92 |
elem_id="search-bar",
|
93 |
)
|
94 |
+
compress_rate_table = gr.Dataframe(datatype="html")
|
95 |
|
96 |
# func call
|
97 |
compress_rate_corpus.change(
|
98 |
get_compression_leaderboard,
|
99 |
+
inputs=[compress_rate_corpus, compress_rate_unit, search_bar],
|
100 |
outputs=compress_rate_table
|
101 |
)
|
102 |
compress_rate_unit.change(
|
103 |
get_compression_leaderboard,
|
104 |
+
inputs=[compress_rate_corpus, compress_rate_unit, search_bar],
|
105 |
outputs=compress_rate_table
|
106 |
)
|
107 |
# file_size.change(
|
|
|
124 |
inputs=[compress_rate_corpus, compress_rate_unit],
|
125 |
outputs=compress_rate_table
|
126 |
)
|
127 |
+
|
128 |
if __name__ == "__main__":
|
129 |
demo.launch()
|
utils/compression_util.py → compression_util.py
RENAMED
@@ -2,8 +2,8 @@
|
|
2 |
|
3 |
中文数据:clue superclue
|
4 |
英文数据:glue cnn_dailymail gigaword
|
5 |
-
|
6 |
-
|
7 |
|
8 |
"""
|
9 |
|
@@ -13,15 +13,15 @@ import sys
|
|
13 |
import pandas as pd
|
14 |
from datasets import load_dataset
|
15 |
from utils.log_util import logger
|
16 |
-
from vocab import
|
17 |
-
from vocab import all_tokenizers
|
18 |
from typing import List, Optional, Union, Literal
|
19 |
|
20 |
CURRENT_DIR = os.path.dirname(os.path.abspath(__file__))
|
21 |
|
22 |
common_units = ["g_bytes/b_tokens", "b_tokens/g_bytes", "t_bytes/t_tokens", "t_tokens/t_bytes", "n_chars/n_tokens", ]
|
23 |
-
|
24 |
-
|
|
|
25 |
|
26 |
VALID_CODES_CC100 = [
|
27 |
"am", "ar", "as", "az", "be", "bg", "bn", "bn_rom", "br", "bs", "ca", "cs", "cy", "da", "de",
|
@@ -44,9 +44,12 @@ def get_n_bytes_of_string(string_text):
|
|
44 |
|
45 |
|
46 |
def unit_convertor(stat, unit):
|
47 |
-
n_tokens = stat["
|
48 |
-
n_chars = stat["
|
49 |
-
n_bytes = stat["
|
|
|
|
|
|
|
50 |
|
51 |
n_tokens_in_billion = n_tokens / (1000 * 1000 * 1000)
|
52 |
n_tokens_in_trillion = n_tokens / (1000 * 1000 * 1000 * 1000)
|
@@ -57,11 +60,9 @@ def unit_convertor(stat, unit):
|
|
57 |
|
58 |
if unit == "n_tokens/n_bytes":
|
59 |
value = n_tokens / n_bytes
|
60 |
-
|
61 |
-
# the average number of characters per token
|
62 |
-
elif unit in ["n_chars/n_tokens", "chars_per_token"]: # 重要:平均一个token包含多少个字符。
|
63 |
value = n_chars / n_tokens
|
64 |
-
elif unit
|
65 |
value = n_tokens / n_chars
|
66 |
elif unit == "g_bytes/b_tokens":
|
67 |
value = n_bytes_in_gb / n_tokens_in_billion
|
@@ -76,14 +77,48 @@ def unit_convertor(stat, unit):
|
|
76 |
return round(value, 3)
|
77 |
|
78 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
79 |
def to_dataframe(stats, units=None):
|
80 |
if units is None:
|
81 |
units = common_units
|
82 |
elif not isinstance(units, list):
|
83 |
units = [units]
|
84 |
table = []
|
85 |
-
|
86 |
-
|
|
|
87 |
for unit in units:
|
88 |
if unit not in stat:
|
89 |
columns[unit] = unit_convertor(stat, unit)
|
@@ -98,105 +133,159 @@ cache = {}
|
|
98 |
|
99 |
|
100 |
def tokenize_corpus(
|
101 |
-
tokenizer_name: str,
|
102 |
corpuses: List[str],
|
103 |
-
|
104 |
) -> dict:
|
105 |
"""
|
106 |
这个要独立的cache,因为速度慢。
|
107 |
-
:param
|
108 |
:param corpuses:
|
109 |
:param cache_path:
|
110 |
:return:
|
111 |
"""
|
112 |
|
113 |
-
def
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
114 |
n_tokens = 0
|
115 |
n_chars = 0
|
116 |
-
|
|
|
|
|
|
|
|
|
|
|
117 |
for dataset in datasets:
|
118 |
for item in dataset:
|
119 |
text = item["text"]
|
120 |
n_bytes += get_n_bytes_of_string(text)
|
121 |
n_chars += len(text)
|
122 |
-
|
123 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
124 |
stat = {
|
125 |
-
|
126 |
-
"
|
127 |
-
"
|
128 |
-
"
|
129 |
-
"
|
|
|
|
|
130 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
131 |
return stat
|
132 |
|
133 |
# load from cache
|
134 |
-
cache_id = f"{tokenizer_name}
|
|
|
135 |
if not cache and os.path.exists(cache_path):
|
136 |
with open(cache_path, "r", encoding="utf-8") as f_tmp:
|
137 |
cache.update(json.load(f_tmp))
|
138 |
if cache_id in cache:
|
139 |
-
logger.info(f"loading {cache_id} from in-memory cache")
|
140 |
return cache[cache_id]
|
141 |
|
142 |
# tokenize corpus
|
143 |
-
tokenizer =
|
144 |
-
datasets = [load_dataset("eson/cc100-samples", corpus.replace("cc100
|
145 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
146 |
|
147 |
# save to cache
|
148 |
len_before = len(cache)
|
149 |
cache[cache_id] = stat
|
150 |
len_after = len(cache)
|
151 |
-
logger.info(f"saving {cache_id} to
|
152 |
with open(cache_path, "w", encoding="utf-8") as f_tmp:
|
153 |
-
json.dump(cache, f_tmp, indent=2)
|
154 |
return stat
|
155 |
|
156 |
|
157 |
def get_compression_leaderboard(
|
158 |
-
corpuses: List[str] = ['cc100
|
159 |
unit: str = "b_tokens/g_bytes",
|
160 |
tokenizer_filter: Optional[str] = None,
|
161 |
return_type: Optional[Literal["dict", "dataframe"]] = "dataframe"
|
162 |
) -> Union[pd.DataFrame, dict]:
|
163 |
"""
|
164 |
-
## TODO
|
165 |
-
- search by organization,
|
166 |
"""
|
167 |
logger.info(f"corpuses: {corpuses}; unit: {unit}; tokenizer_filter: {tokenizer_filter}")
|
168 |
stats = {}
|
169 |
if tokenizer_filter is not None:
|
170 |
-
|
|
|
171 |
else:
|
172 |
-
|
173 |
-
for
|
174 |
-
|
175 |
-
|
176 |
-
|
|
|
177 |
|
178 |
if return_type == "dataframe":
|
179 |
token_number_unit, file_size_unit = unit.split("/")
|
180 |
reverse_unit = f"{file_size_unit}/{token_number_unit}"
|
181 |
-
stats = to_dataframe(stats, [unit, reverse_unit, "
|
182 |
-
stats = stats.sort_values(unit)
|
183 |
-
stats = stats.rename(columns={unit: f' ⬆️{unit}'})
|
184 |
return stats
|
185 |
|
186 |
|
187 |
-
def update_compress_rate():
|
188 |
-
pass
|
189 |
-
|
190 |
-
|
191 |
-
def test():
|
192 |
-
tokenizer_name = "gpt_4"
|
193 |
-
tokenizer = load_tokener(tokenizer_name)
|
194 |
-
stats = {tokenizer_name: tokenize_corpus(tokenizer, ["cc100-en", "cc100-zh-Hans"])}
|
195 |
-
df = to_dataframe(stats)
|
196 |
-
# print(df.to_markdown(index=False, tablefmt='fancy_grid'))
|
197 |
-
logger.info(f"\n{df.to_markdown(index=False)}")
|
198 |
-
|
199 |
-
|
200 |
def main():
|
201 |
if len(sys.argv) == 3:
|
202 |
tokenizer_filter = [sys.argv[1]]
|
@@ -204,11 +293,12 @@ def main():
|
|
204 |
else:
|
205 |
tokenizer_filter = None
|
206 |
corpuses = common_corpuses
|
207 |
-
|
|
|
|
|
208 |
# print(df.to_markdown(index=False, tablefmt='fancy_grid'))
|
209 |
logger.info(f"\n{df.to_markdown(index=False)}")
|
210 |
|
211 |
|
212 |
if __name__ == "__main__":
|
213 |
main()
|
214 |
-
# test()
|
|
|
2 |
|
3 |
中文数据:clue superclue
|
4 |
英文数据:glue cnn_dailymail gigaword
|
5 |
+
code:
|
6 |
+
math:
|
7 |
|
8 |
"""
|
9 |
|
|
|
13 |
import pandas as pd
|
14 |
from datasets import load_dataset
|
15 |
from utils.log_util import logger
|
16 |
+
from vocab import tokenizer_factory, TokenizerConfig
|
|
|
17 |
from typing import List, Optional, Union, Literal
|
18 |
|
19 |
CURRENT_DIR = os.path.dirname(os.path.abspath(__file__))
|
20 |
|
21 |
common_units = ["g_bytes/b_tokens", "b_tokens/g_bytes", "t_bytes/t_tokens", "t_tokens/t_bytes", "n_chars/n_tokens", ]
|
22 |
+
|
23 |
+
common_corpuses = sorted(["cc100/en", "cc100/zh-Hans", "cc100/es", "cc100/fr", "cc100/de", "cc100/ko",
|
24 |
+
"cc100/fa", "cc100/ar", "cc100/ja"])
|
25 |
|
26 |
VALID_CODES_CC100 = [
|
27 |
"am", "ar", "as", "az", "be", "bg", "bn", "bn_rom", "br", "bs", "ca", "cs", "cy", "da", "de",
|
|
|
44 |
|
45 |
|
46 |
def unit_convertor(stat, unit):
|
47 |
+
n_tokens = stat["_n_tokens"]
|
48 |
+
n_chars = stat["_n_chars"]
|
49 |
+
n_bytes = stat["_n_bytes"]
|
50 |
+
|
51 |
+
if n_tokens is None:
|
52 |
+
return None
|
53 |
|
54 |
n_tokens_in_billion = n_tokens / (1000 * 1000 * 1000)
|
55 |
n_tokens_in_trillion = n_tokens / (1000 * 1000 * 1000 * 1000)
|
|
|
60 |
|
61 |
if unit == "n_tokens/n_bytes":
|
62 |
value = n_tokens / n_bytes
|
63 |
+
elif unit in ["char/token", "chars_per_token"]: # 重要:平均一个token包含多少个字符。
|
|
|
|
|
64 |
value = n_chars / n_tokens
|
65 |
+
elif unit in ["token/char", "tokens_per_char"]: # 一个中文汉字需要几个token?
|
66 |
value = n_tokens / n_chars
|
67 |
elif unit == "g_bytes/b_tokens":
|
68 |
value = n_bytes_in_gb / n_tokens_in_billion
|
|
|
77 |
return round(value, 3)
|
78 |
|
79 |
|
80 |
+
def _merge_stats_by_corpus(stats_by_corpus, oov_threshold=0.3):
|
81 |
+
"""
|
82 |
+
"""
|
83 |
+
all_stats = list(stats_by_corpus.values())
|
84 |
+
assert len(set([stats["tokenizer"] for stats in all_stats])) == 1
|
85 |
+
reversible = all(stat['reversible'] for stat in all_stats)
|
86 |
+
is_support = all(stat['oov_ratio'] < oov_threshold for stat in all_stats)
|
87 |
+
|
88 |
+
merged_stats = {
|
89 |
+
"tokenizer": all_stats[0]["tokenizer"],
|
90 |
+
"organization": all_stats[0]["organization"],
|
91 |
+
"vocab_size": all_stats[0]["vocab_size"],
|
92 |
+
"_n_bytes": 0,
|
93 |
+
"_n_tokens": 0 if is_support else None,
|
94 |
+
"_n_chars": 0,
|
95 |
+
"_n_oov_chars": 0,
|
96 |
+
"reversible": True,
|
97 |
+
}
|
98 |
+
for stats in all_stats:
|
99 |
+
merged_stats["_n_bytes"] += stats["_n_bytes"]
|
100 |
+
merged_stats["_n_chars"] += stats["_n_chars"]
|
101 |
+
if is_support: # The number of tokens cannot be accurately counted, when there are too many UNKs.
|
102 |
+
merged_stats["_n_tokens"] += stats["_n_tokens"]
|
103 |
+
merged_stats["_n_oov_chars"] += stats["_n_oov_chars"]
|
104 |
+
merged_stats["reversible"] &= stats['reversible']
|
105 |
+
|
106 |
+
merged_stats.update({
|
107 |
+
"oov_ratio": float("%.4g" % (stats["_n_oov_chars"] / stats["_n_chars"])),
|
108 |
+
"reversible": reversible
|
109 |
+
})
|
110 |
+
return merged_stats
|
111 |
+
|
112 |
+
|
113 |
def to_dataframe(stats, units=None):
|
114 |
if units is None:
|
115 |
units = common_units
|
116 |
elif not isinstance(units, list):
|
117 |
units = [units]
|
118 |
table = []
|
119 |
+
|
120 |
+
for stat in stats.values():
|
121 |
+
columns = {k: v for k, v in stat.items() if not k.startswith("_")}
|
122 |
for unit in units:
|
123 |
if unit not in stat:
|
124 |
columns[unit] = unit_convertor(stat, unit)
|
|
|
133 |
|
134 |
|
135 |
def tokenize_corpus(
|
136 |
+
tokenizer_name: str, # 可以免加载tokenizer直接出结果
|
137 |
corpuses: List[str],
|
138 |
+
cache_dir: str = "stats"
|
139 |
) -> dict:
|
140 |
"""
|
141 |
这个要独立的cache,因为速度慢。
|
142 |
+
:param tokenizer_config: 可以不加载就
|
143 |
:param corpuses:
|
144 |
:param cache_path:
|
145 |
:return:
|
146 |
"""
|
147 |
|
148 |
+
def _char_based_oov(src_text, decode_text):
|
149 |
+
oov_chars = []
|
150 |
+
for char in src_text:
|
151 |
+
if char not in decode_text:
|
152 |
+
oov_chars.append(char)
|
153 |
+
|
154 |
+
n_oov_chars = len(oov_chars)
|
155 |
+
oov_charset = list(dict.fromkeys(oov_chars))
|
156 |
+
return n_oov_chars, oov_charset
|
157 |
+
|
158 |
+
def _tokenize(tokenizer, datasets, detail_path=None):
|
159 |
+
"""
|
160 |
+
export_diff: true | false
|
161 |
+
:param tokenizer:
|
162 |
+
:param datasets:
|
163 |
+
:param detail_path:
|
164 |
+
:return:
|
165 |
+
"""
|
166 |
+
n_bytes = 0
|
167 |
n_tokens = 0
|
168 |
n_chars = 0
|
169 |
+
n_oov_chars = 0
|
170 |
+
diff_details = []
|
171 |
+
oov_charset = set()
|
172 |
+
unk_token_id = None
|
173 |
+
if hasattr(tokenizer, "unk_token"):
|
174 |
+
unk_token_id = tokenizer.unk_token_id
|
175 |
for dataset in datasets:
|
176 |
for item in dataset:
|
177 |
text = item["text"]
|
178 |
n_bytes += get_n_bytes_of_string(text)
|
179 |
n_chars += len(text)
|
180 |
+
ids = tokenizer.encode(text, add_special_tokens=False)
|
181 |
+
|
182 |
+
# detect oov
|
183 |
+
decode_text = tokenizer.decode(ids)
|
184 |
+
decode_text_without_unk = tokenizer.decode([token_id for token_id in ids if token_id != unk_token_id])
|
185 |
+
if decode_text != text:
|
186 |
+
_n_oov_chars, _oov_charset = _char_based_oov(text, decode_text_without_unk)
|
187 |
+
diff_details.append(
|
188 |
+
{
|
189 |
+
"text": text,
|
190 |
+
"decode_text": decode_text,
|
191 |
+
"decode_text_without_unk": decode_text_without_unk,
|
192 |
+
"n_oov_chars": _n_oov_chars,
|
193 |
+
'oov_ratio': _n_oov_chars / len(text),
|
194 |
+
'oov_charset': json.dumps(_oov_charset, ensure_ascii=False),
|
195 |
+
}
|
196 |
+
)
|
197 |
+
n_oov_chars += _n_oov_chars
|
198 |
+
oov_charset.update(_oov_charset)
|
199 |
+
n_tokens += len(ids)
|
200 |
stat = {
|
201 |
+
"_n_bytes": n_bytes,
|
202 |
+
"_n_tokens": n_tokens,
|
203 |
+
"_n_chars": n_chars,
|
204 |
+
"_n_oov_chars": n_oov_chars,
|
205 |
+
"oov_ratio": n_oov_chars / n_chars,
|
206 |
+
'_oov_charset': json.dumps(list(oov_charset), ensure_ascii=False),
|
207 |
+
"reversible": len(diff_details) == 0
|
208 |
}
|
209 |
+
|
210 |
+
if detail_path and diff_details:
|
211 |
+
logger.info(f"saving tokenization detail to '{detail_path}'")
|
212 |
+
with open(detail_path, "w", encoding="utf-8") as f:
|
213 |
+
f.write(json.dumps(diff_details, ensure_ascii=False, indent=2))
|
214 |
+
# print(f"{tokenizer_config.name_or_path}, {infer_tokenizer_type(tokenizer_config)}\n"
|
215 |
+
# f"reversible: false; unk_token: {get_unk(tokenizer_config)},"
|
216 |
+
# f" unk_ratio: {unk_count / len(encoding):.4f}; oov: []")
|
217 |
+
# for diff_detail in diff_details:
|
218 |
+
# # print(f"text[{i}] = {str(bytes(text[i:], 'utf-8'))}\n"
|
219 |
+
# # f"decoding[{i}] = {str(bytes(decoding[i:], 'utf-8'))}")
|
220 |
+
# f.write(f"text= {json.dumps(text[i:], ensure_ascii=False)}, \n"
|
221 |
+
# f"decoding[{i}] = {json.dumps(decoding[i:], ensure_ascii=False)}")
|
222 |
return stat
|
223 |
|
224 |
# load from cache
|
225 |
+
cache_id = f"{tokenizer_name} @ {'.'.join(corpuses)}"
|
226 |
+
cache_path = os.path.join(cache_dir, "compression_rate.json")
|
227 |
if not cache and os.path.exists(cache_path):
|
228 |
with open(cache_path, "r", encoding="utf-8") as f_tmp:
|
229 |
cache.update(json.load(f_tmp))
|
230 |
if cache_id in cache:
|
231 |
+
# logger.info(f"loading {cache_id} from in-memory cache")
|
232 |
return cache[cache_id]
|
233 |
|
234 |
# tokenize corpus
|
235 |
+
tokenizer = tokenizer_factory.get_tokenizer(tokenizer_name)
|
236 |
+
datasets = [load_dataset("eson/cc100-samples", corpus.replace("cc100/", ""), split="train") for corpus in corpuses]
|
237 |
+
|
238 |
+
stat = {
|
239 |
+
"tokenizer": tokenizer_factory.get_name_with_hyperlink(tokenizer_name),
|
240 |
+
"organization": tokenizer_factory.get_tokenizer_config(tokenizer_name).org,
|
241 |
+
"vocab_size": len(tokenizer),
|
242 |
+
}
|
243 |
+
tokenize_detail_dir = os.path.join(cache_dir, "compression_rate")
|
244 |
+
os.makedirs(tokenize_detail_dir, exist_ok=True)
|
245 |
+
tokenize_detail_path = os.path.join(tokenize_detail_dir, cache_id.replace("/", ".") + ".diff.json")
|
246 |
+
stat.update(_tokenize(tokenizer, datasets, detail_path=tokenize_detail_path))
|
247 |
+
# add basic info
|
248 |
|
249 |
# save to cache
|
250 |
len_before = len(cache)
|
251 |
cache[cache_id] = stat
|
252 |
len_after = len(cache)
|
253 |
+
logger.info(f"saving '{cache_id}' to memory and file cache '{cache_path}': {len_before}->{len_after}")
|
254 |
with open(cache_path, "w", encoding="utf-8") as f_tmp:
|
255 |
+
json.dump(cache, f_tmp, ensure_ascii=False, indent=2)
|
256 |
return stat
|
257 |
|
258 |
|
259 |
def get_compression_leaderboard(
|
260 |
+
corpuses: List[str] = ['cc100/en'],
|
261 |
unit: str = "b_tokens/g_bytes",
|
262 |
tokenizer_filter: Optional[str] = None,
|
263 |
return_type: Optional[Literal["dict", "dataframe"]] = "dataframe"
|
264 |
) -> Union[pd.DataFrame, dict]:
|
265 |
"""
|
|
|
|
|
266 |
"""
|
267 |
logger.info(f"corpuses: {corpuses}; unit: {unit}; tokenizer_filter: {tokenizer_filter}")
|
268 |
stats = {}
|
269 |
if tokenizer_filter is not None:
|
270 |
+
tokenizer_names = [tokenizer_name for tokenizer_name in tokenizer_factory.all_tokenizer_names
|
271 |
+
if tokenizer_filter.lower() in tokenizer_name.lower()]
|
272 |
else:
|
273 |
+
tokenizer_names = tokenizer_factory.all_tokenizer_names
|
274 |
+
for tokenizer_name in tokenizer_names:
|
275 |
+
stats_by_corpus = {}
|
276 |
+
for corpus in corpuses:
|
277 |
+
stats_by_corpus[corpus] = tokenize_corpus(tokenizer_name, [corpus])
|
278 |
+
stats[tokenizer_name] = _merge_stats_by_corpus(stats_by_corpus)
|
279 |
|
280 |
if return_type == "dataframe":
|
281 |
token_number_unit, file_size_unit = unit.split("/")
|
282 |
reverse_unit = f"{file_size_unit}/{token_number_unit}"
|
283 |
+
stats = to_dataframe(stats, [unit, reverse_unit, "char/token"])
|
284 |
+
stats = stats.sort_values(["oov_ratio", unit], ascending=[True, True])
|
285 |
+
stats = stats.rename(columns={"oov_ratio": f' ⬆️oov_ratio'}).rename(columns={unit: f' ⬆️{unit}'}) # ⬇
|
286 |
return stats
|
287 |
|
288 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
289 |
def main():
|
290 |
if len(sys.argv) == 3:
|
291 |
tokenizer_filter = [sys.argv[1]]
|
|
|
293 |
else:
|
294 |
tokenizer_filter = None
|
295 |
corpuses = common_corpuses
|
296 |
+
# tokenizer_filter = "openai"
|
297 |
+
# corpuses = ["cc100/en", "cc100/zh-Hans"]
|
298 |
+
df = get_compression_leaderboard(corpuses, tokenizer_filter=tokenizer_filter)
|
299 |
# print(df.to_markdown(index=False, tablefmt='fancy_grid'))
|
300 |
logger.info(f"\n{df.to_markdown(index=False)}")
|
301 |
|
302 |
|
303 |
if __name__ == "__main__":
|
304 |
main()
|
|
config.py
DELETED
@@ -1,20 +0,0 @@
|
|
1 |
-
USE_REMOTE = False # use remote tokenizer or local tokenizer
|
2 |
-
|
3 |
-
# load_vocab_with_SPECIAL_TOKEN = True # 如果不包含会导致计算词典大小错误、overlap_token计算不一致。
|
4 |
-
|
5 |
-
# encoding config
|
6 |
-
ADD_SPECIAL_TOKEN = False
|
7 |
-
|
8 |
-
#
|
9 |
-
LAZY_IMPORT = True
|
10 |
-
|
11 |
-
# DEBUG: 设置环境变量 RUST_BACKTRACE=full
|
12 |
-
#
|
13 |
-
|
14 |
-
default_user_input = """\
|
15 |
-
Replace this text in the input field to see how tokenization works.
|
16 |
-
Buenos días!
|
17 |
-
华为发布Mate60手机。
|
18 |
-
ラグビーワールドカップ2023フランス"""
|
19 |
-
default_tokenizer_type_1 = "llama3"
|
20 |
-
default_tokenizer_type_2 = "gpt_4"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
patcher/README.md
ADDED
@@ -0,0 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
|
2 |
+
|
3 |
+
## vocabsize不一致问题
|
4 |
+
|
5 |
+
|
6 |
+
- .vcab_size
|
7 |
+
- Size of the base vocabulary (without the added tokens)
|
8 |
+
- 来自 https://huggingface.co/transformers/v2.11.0/main_classes/tokenizer.html
|
9 |
+
- len(tokenizer)
|
10 |
+
- Size of the full vocabulary with the added tokens.
|
11 |
+
- https://github.com/huggingface/transformers/issues/12632
|
12 |
+
- max(tokenizer.get_vocab().values())
|
13 |
+
- 包括不连续的 token_id
|
14 |
+
- https://github.com/huggingface/transformers/issues/4875
|
15 |
+
|
patcher/sptokenizer_patch_deprecated.py
DELETED
@@ -1,105 +0,0 @@
|
|
1 |
-
"""
|
2 |
-
|
3 |
-
## adapt to transformer tokenizer
|
4 |
-
|
5 |
-
https://github.com/huggingface/transformers/blob/v4.40.1/src/transformers/tokenization_utils.py#L379
|
6 |
-
|
7 |
-
## usage
|
8 |
-
|
9 |
-
- grok
|
10 |
-
|
11 |
-
## 风险评估
|
12 |
-
|
13 |
-
- 可能会干扰 sentencepiece.SentencePieceProcessor的正常使用,比如 .vocab_size 原来是个方法,patch后是个property
|
14 |
-
|
15 |
-
|
16 |
-
## TODO
|
17 |
-
|
18 |
-
不用patch,改用wrapper。常见的 tokenizer通常是封装的 sentencepiece,
|
19 |
-
"""
|
20 |
-
|
21 |
-
import sentencepiece
|
22 |
-
|
23 |
-
|
24 |
-
@property
|
25 |
-
def vocab_size(self):
|
26 |
-
"""Returns vocab size"""
|
27 |
-
return self.get_piece_size()
|
28 |
-
|
29 |
-
|
30 |
-
def get_vocab(self):
|
31 |
-
"""Returns vocab as a dict"""
|
32 |
-
vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)}
|
33 |
-
# vocab.update(self.added_tokens_encoder)
|
34 |
-
return vocab
|
35 |
-
|
36 |
-
|
37 |
-
def _tokenize(self, text):
|
38 |
-
"""Returns a tokenized string."""
|
39 |
-
return self.encode(text, out_type=str)
|
40 |
-
|
41 |
-
|
42 |
-
def _convert_token_to_id(self, token):
|
43 |
-
"""Converts a token (str) in an id using the vocab."""
|
44 |
-
return self.piece_to_id(token)
|
45 |
-
|
46 |
-
|
47 |
-
def _convert_id_to_token(self, index):
|
48 |
-
"""Converts an index (integer) in a token (str) using the vocab."""
|
49 |
-
token = self.IdToPiece(index)
|
50 |
-
return token
|
51 |
-
|
52 |
-
|
53 |
-
def convert_ids_to_tokens(self, ids, skip_special_tokens=False):
|
54 |
-
""" copy from transformers.PreTrainedTokenizer
|
55 |
-
Converts a single index or a sequence of indices in a token or a sequence of tokens, using the vocabulary and
|
56 |
-
added tokens.
|
57 |
-
|
58 |
-
Args:
|
59 |
-
ids (`int` or `List[int]`):
|
60 |
-
The token id (or token ids) to convert to tokens.
|
61 |
-
skip_special_tokens (`bool`, *optional*, defaults to `False`):
|
62 |
-
Whether or not to remove special tokens in the decoding.
|
63 |
-
|
64 |
-
Returns:
|
65 |
-
`str` or `List[str]`: The decoded token(s).
|
66 |
-
"""
|
67 |
-
self._added_tokens_decoder = {} # add by xs
|
68 |
-
if isinstance(ids, int):
|
69 |
-
if ids in self._added_tokens_decoder:
|
70 |
-
return self._added_tokens_decoder[ids].content
|
71 |
-
else:
|
72 |
-
return self._convert_id_to_token(ids)
|
73 |
-
tokens = []
|
74 |
-
for index in ids:
|
75 |
-
index = int(index)
|
76 |
-
if skip_special_tokens and index in self.all_special_ids:
|
77 |
-
continue
|
78 |
-
if index in self._added_tokens_decoder:
|
79 |
-
tokens.append(self._added_tokens_decoder[index].content)
|
80 |
-
else:
|
81 |
-
tokens.append(self._convert_id_to_token(index))
|
82 |
-
return tokens
|
83 |
-
|
84 |
-
|
85 |
-
def encode(self, *args, **kwargs):
|
86 |
-
"""
|
87 |
-
add_special_token 是为了兼容 hf_tokenizer
|
88 |
-
"""
|
89 |
-
kwargs.pop("add_special_tokens", None)
|
90 |
-
kwargs.pop("allowed_special", None)
|
91 |
-
return self.Encode(*args, **kwargs)
|
92 |
-
|
93 |
-
|
94 |
-
def decode(self, *args, **kwargs):
|
95 |
-
kwargs.pop("skip_special_tokens", None)
|
96 |
-
return self.Decode(*args, **kwargs)
|
97 |
-
|
98 |
-
|
99 |
-
sentencepiece.SentencePieceProcessor.vocab_size = vocab_size #
|
100 |
-
sentencepiece.SentencePieceProcessor.get_vocab = get_vocab
|
101 |
-
sentencepiece.SentencePieceProcessor._convert_id_to_token = _convert_id_to_token
|
102 |
-
sentencepiece.SentencePieceProcessor.convert_ids_to_tokens = convert_ids_to_tokens
|
103 |
-
# sentencepiece.SentencePieceProcessor.tokenize = _tokenize
|
104 |
-
sentencepiece.SentencePieceProcessor.encode = encode
|
105 |
-
sentencepiece.SentencePieceProcessor.decode = decode
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
patcher/sptokenizer_wrapper.py
DELETED
@@ -1,61 +0,0 @@
|
|
1 |
-
""" 封装 sentencepiece.SentencePieceProcessor,以便符合transformers中的tokenizer标准
|
2 |
-
|
3 |
-
## reference
|
4 |
-
|
5 |
-
|
6 |
-
## usage
|
7 |
-
|
8 |
-
- grok
|
9 |
-
|
10 |
-
"""
|
11 |
-
|
12 |
-
import sentencepiece as spm
|
13 |
-
from transformers import PreTrainedTokenizer
|
14 |
-
|
15 |
-
|
16 |
-
class SPTokenizerWrapper(PreTrainedTokenizer):
|
17 |
-
"""
|
18 |
-
|
19 |
-
## impl in PreTrainedTokenizer
|
20 |
-
- convert_ids_to_tokens
|
21 |
-
"""
|
22 |
-
|
23 |
-
def __init__(self, vocab_file):
|
24 |
-
self.vocab_file = vocab_file
|
25 |
-
self.sp_model = spm.SentencePieceProcessor(self.vocab_file)
|
26 |
-
super().__init__()
|
27 |
-
|
28 |
-
@property
|
29 |
-
def vocab_size(self):
|
30 |
-
"""Returns vocab size"""
|
31 |
-
return self.sp_model.get_piece_size()
|
32 |
-
|
33 |
-
def get_vocab(self):
|
34 |
-
"""Returns vocab as a dict"""
|
35 |
-
vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)}
|
36 |
-
return vocab
|
37 |
-
|
38 |
-
def _convert_token_to_id(self, token):
|
39 |
-
"""Converts a token (str) in an id using the vocab."""
|
40 |
-
return self.sp_model.piece_to_id(token)
|
41 |
-
|
42 |
-
def _convert_id_to_token(self, index):
|
43 |
-
"""Converts an index (integer) in a token (str) using the vocab."""
|
44 |
-
token = self.sp_model.IdToPiece(index)
|
45 |
-
return token
|
46 |
-
|
47 |
-
# def (self, ids, skip_special_tokens=False): # impl in PreTrainedTokenizer
|
48 |
-
|
49 |
-
|
50 |
-
def encode(self, *args, **kwargs):
|
51 |
-
kwargs.pop("add_special_tokens", None)
|
52 |
-
kwargs.pop("allowed_special", None)
|
53 |
-
return self.sp_model.Encode(*args, **kwargs)
|
54 |
-
|
55 |
-
def decode(self, *args, **kwargs):
|
56 |
-
kwargs.pop("skip_special_tokens", None)
|
57 |
-
return self.sp_model.Decode(*args, **kwargs)
|
58 |
-
|
59 |
-
|
60 |
-
|
61 |
-
# PreTrainedTokenizer.convert_ids_to_tokens
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
patcher/tiktoken_patch.py
CHANGED
@@ -70,8 +70,8 @@ def get_vocab(self, token_type="str"):
|
|
70 |
|
71 |
@property
|
72 |
def vocab_size(self):
|
73 |
-
"""Returns vocab size"""
|
74 |
-
return self.
|
75 |
|
76 |
|
77 |
def encode(self, *args, **kwargs):
|
|
|
70 |
|
71 |
@property
|
72 |
def vocab_size(self):
|
73 |
+
"""Returns vocab size without special tokens"""
|
74 |
+
return len(self._mergeable_ranks)
|
75 |
|
76 |
|
77 |
def encode(self, *args, **kwargs):
|
app_playground.py → playground_app.py
RENAMED
@@ -36,9 +36,12 @@ table
|
|
36 |
"""
|
37 |
|
38 |
import gradio as gr
|
39 |
-
from vocab import
|
40 |
-
from
|
41 |
-
from
|
|
|
|
|
|
|
42 |
|
43 |
get_window_url_params = """
|
44 |
function(url_params) {
|
@@ -48,6 +51,8 @@ get_window_url_params = """
|
|
48 |
}
|
49 |
"""
|
50 |
|
|
|
|
|
51 |
with gr.Blocks() as demo:
|
52 |
# links: https://www.coderstool.com/utf8-encoding-decoding
|
53 |
# 功能:输入文本,进行分词
|
@@ -60,6 +65,7 @@ with gr.Blocks() as demo:
|
|
60 |
example_types,
|
61 |
value="Examples",
|
62 |
type="index",
|
|
|
63 |
show_label=False,
|
64 |
container=False,
|
65 |
scale=0,
|
@@ -102,21 +108,26 @@ with gr.Blocks() as demo:
|
|
102 |
with gr.Column(scale=6):
|
103 |
with gr.Group():
|
104 |
tokenizer_name_1 = gr.Dropdown(
|
105 |
-
|
106 |
label="Tokenizer 1",
|
107 |
)
|
108 |
with gr.Group():
|
109 |
with gr.Row():
|
|
|
|
|
|
|
|
|
|
|
110 |
stats_vocab_size_1 = gr.TextArea(
|
111 |
label="Vocab Size",
|
112 |
lines=1,
|
113 |
elem_classes="statistics"
|
114 |
)
|
115 |
-
stats_zh_token_size_1 = gr.TextArea(
|
116 |
-
|
117 |
-
|
118 |
-
|
119 |
-
)
|
120 |
# stats_compress_rate_1 = gr.TextArea(
|
121 |
# label="Compress Rate",
|
122 |
# lines=1,
|
@@ -140,21 +151,26 @@ with gr.Blocks() as demo:
|
|
140 |
with gr.Column(scale=6):
|
141 |
with gr.Group():
|
142 |
tokenizer_name_2 = gr.Dropdown(
|
143 |
-
|
144 |
label="Tokenizer 2",
|
145 |
)
|
146 |
with gr.Group():
|
147 |
with gr.Row():
|
148 |
-
|
149 |
-
label="
|
150 |
lines=1,
|
151 |
-
elem_classes="statistics"
|
152 |
)
|
153 |
-
|
154 |
-
label="
|
155 |
lines=1,
|
156 |
-
elem_classes="statistics"
|
157 |
)
|
|
|
|
|
|
|
|
|
|
|
158 |
# stats_compress_rate_2 = gr.TextArea(
|
159 |
# label="Compress Rate",
|
160 |
# lines=1,
|
@@ -196,7 +212,7 @@ with gr.Blocks() as demo:
|
|
196 |
|
197 |
tokenizer_name_1.change(tokenize, [user_input, tokenizer_name_1],
|
198 |
[output_text_1, output_table_1])
|
199 |
-
tokenizer_name_1.change(basic_count, [tokenizer_name_1], [stats_vocab_size_1,
|
200 |
tokenizer_name_1.change(get_overlap_token_size, [tokenizer_name_1, tokenizer_name_2],
|
201 |
[stats_overlap_token_size_1, stats_overlap_token_size_2])
|
202 |
# tokenizer_type_1.change(get_compress_rate, [tokenizer_type_1, compress_rate_corpus, compress_rate_unit],
|
@@ -209,7 +225,7 @@ with gr.Blocks() as demo:
|
|
209 |
|
210 |
tokenizer_name_2.change(tokenize, [user_input, tokenizer_name_2],
|
211 |
[output_text_2, output_table_2])
|
212 |
-
tokenizer_name_2.change(basic_count, [tokenizer_name_2], [stats_vocab_size_2,
|
213 |
tokenizer_name_2.change(get_overlap_token_size, [tokenizer_name_1, tokenizer_name_2],
|
214 |
[stats_overlap_token_size_1, stats_overlap_token_size_2])
|
215 |
# tokenizer_type_2.change(get_compress_rate,
|
@@ -235,7 +251,6 @@ with gr.Blocks() as demo:
|
|
235 |
[user_input, tokenizer_name_1, tokenizer_name_2]
|
236 |
)
|
237 |
|
238 |
-
demo.load(js=open("js/onload.js", "r", encoding="utf-8").read())
|
239 |
demo.load(
|
240 |
fn=on_load,
|
241 |
inputs=[user_input], # 这里只需要传个空object即可。
|
|
|
36 |
"""
|
37 |
|
38 |
import gradio as gr
|
39 |
+
from vocab import tokenizer_factory
|
40 |
+
from playground_examples import example_types, example_fn
|
41 |
+
from playground_util import tokenize, tokenize_pair, basic_count, get_overlap_token_size, on_load
|
42 |
+
|
43 |
+
|
44 |
+
|
45 |
|
46 |
get_window_url_params = """
|
47 |
function(url_params) {
|
|
|
51 |
}
|
52 |
"""
|
53 |
|
54 |
+
all_tokenizer_name = [(config.name_display, config.name_or_path) for config in tokenizer_factory.all_tokenizer_configs]
|
55 |
+
|
56 |
with gr.Blocks() as demo:
|
57 |
# links: https://www.coderstool.com/utf8-encoding-decoding
|
58 |
# 功能:输入文本,进行分词
|
|
|
65 |
example_types,
|
66 |
value="Examples",
|
67 |
type="index",
|
68 |
+
allow_custom_value=True,
|
69 |
show_label=False,
|
70 |
container=False,
|
71 |
scale=0,
|
|
|
108 |
with gr.Column(scale=6):
|
109 |
with gr.Group():
|
110 |
tokenizer_name_1 = gr.Dropdown(
|
111 |
+
all_tokenizer_name,
|
112 |
label="Tokenizer 1",
|
113 |
)
|
114 |
with gr.Group():
|
115 |
with gr.Row():
|
116 |
+
organization_1 = gr.TextArea(
|
117 |
+
label="Organization",
|
118 |
+
lines=1,
|
119 |
+
elem_classes="statistics",
|
120 |
+
)
|
121 |
stats_vocab_size_1 = gr.TextArea(
|
122 |
label="Vocab Size",
|
123 |
lines=1,
|
124 |
elem_classes="statistics"
|
125 |
)
|
126 |
+
# stats_zh_token_size_1 = gr.TextArea(
|
127 |
+
# label="ZH char/word",
|
128 |
+
# lines=1,
|
129 |
+
# elem_classes="statistics",
|
130 |
+
# )
|
131 |
# stats_compress_rate_1 = gr.TextArea(
|
132 |
# label="Compress Rate",
|
133 |
# lines=1,
|
|
|
151 |
with gr.Column(scale=6):
|
152 |
with gr.Group():
|
153 |
tokenizer_name_2 = gr.Dropdown(
|
154 |
+
all_tokenizer_name,
|
155 |
label="Tokenizer 2",
|
156 |
)
|
157 |
with gr.Group():
|
158 |
with gr.Row():
|
159 |
+
organization_2 = gr.TextArea(
|
160 |
+
label="Organization",
|
161 |
lines=1,
|
162 |
+
elem_classes="statistics",
|
163 |
)
|
164 |
+
stats_vocab_size_2 = gr.TextArea(
|
165 |
+
label="Vocab Size",
|
166 |
lines=1,
|
167 |
+
elem_classes="statistics"
|
168 |
)
|
169 |
+
# stats_zh_token_size_2 = gr.TextArea(
|
170 |
+
# label="ZH char/word", # 中文字/词
|
171 |
+
# lines=1,
|
172 |
+
# elem_classes="statistics",
|
173 |
+
# )
|
174 |
# stats_compress_rate_2 = gr.TextArea(
|
175 |
# label="Compress Rate",
|
176 |
# lines=1,
|
|
|
212 |
|
213 |
tokenizer_name_1.change(tokenize, [user_input, tokenizer_name_1],
|
214 |
[output_text_1, output_table_1])
|
215 |
+
tokenizer_name_1.change(basic_count, [tokenizer_name_1], [stats_vocab_size_1, organization_1])
|
216 |
tokenizer_name_1.change(get_overlap_token_size, [tokenizer_name_1, tokenizer_name_2],
|
217 |
[stats_overlap_token_size_1, stats_overlap_token_size_2])
|
218 |
# tokenizer_type_1.change(get_compress_rate, [tokenizer_type_1, compress_rate_corpus, compress_rate_unit],
|
|
|
225 |
|
226 |
tokenizer_name_2.change(tokenize, [user_input, tokenizer_name_2],
|
227 |
[output_text_2, output_table_2])
|
228 |
+
tokenizer_name_2.change(basic_count, [tokenizer_name_2], [stats_vocab_size_2, organization_2])
|
229 |
tokenizer_name_2.change(get_overlap_token_size, [tokenizer_name_1, tokenizer_name_2],
|
230 |
[stats_overlap_token_size_1, stats_overlap_token_size_2])
|
231 |
# tokenizer_type_2.change(get_compress_rate,
|
|
|
251 |
[user_input, tokenizer_name_1, tokenizer_name_2]
|
252 |
)
|
253 |
|
|
|
254 |
demo.load(
|
255 |
fn=on_load,
|
256 |
inputs=[user_input], # 这里只需要传个空object即可。
|
examples.py → playground_examples.py
RENAMED
@@ -19,11 +19,11 @@ https://www.computerhope.com/jargon/s/specchar.htm
|
|
19 |
|
20 |
examples = {
|
21 |
"en": [
|
22 |
-
["number: (10086 + 98) = 100184", "llama", "bloom"], #
|
23 |
-
["whitespace: 2spaces 8spaces\t1tab\t\t2tab\n1newline", "llama", "
|
24 |
# !?。"#$%&'()*+,-/:;<=>@[\]^_`{|}~⦅⦆「」、、〃》「」『』【】〔〕〖〗〘〙〚〛〜〝〞〟〰〾〿–—‘’‛“”„‟…‧﹏.
|
25 |
-
["punctuation: ,.:/?+=\",。!?;【】〔〕〖〗", "
|
26 |
-
["symbol: 🦙❤❥웃유♋☮✊☏☢☚✔☑♚▢♪✈✞÷↑↓▤▥⊙■□▣▽¿─│♥❣▬▫☿Ⓐ ✋✉☣☤", "baichuan", "llama"],
|
27 |
# ["special: [PAD] [UNK] [CLS] [SEP] [MASK] <|system|> <|user|> <|assistant|> <|endoftext|>", "", ""],
|
28 |
],
|
29 |
"zh": [
|
@@ -37,16 +37,16 @@ examples = {
|
|
37 |
|
38 |
more_examples = [
|
39 |
# bert系列
|
40 |
-
("
|
41 |
-
("
|
42 |
-
("clue", "kplug", "", ""),
|
43 |
|
44 |
# llama系列 (基于sentencepiece)
|
45 |
("baichuan", "baichuan2", "baichuan2支持多空格 ,多个换行\n\n\n,do not add dummy prefix as Baichuan1"),
|
46 |
("llama", "baichuan2", "baichuan2支持多空格 ,多个换行\n\n"),
|
47 |
-
("llama", "
|
48 |
("llama", "llama3", "扩充词典"),
|
49 |
-
("
|
50 |
|
51 |
# glm系列 (基于sentencepiece)
|
52 |
("glm", "chatglm1", ""),
|
|
|
19 |
|
20 |
examples = {
|
21 |
"en": [
|
22 |
+
["number: (10086 + 98) = 100184", "huggyllama/llama-7b", "bigscience/bloom"], #
|
23 |
+
["whitespace: 2spaces 8spaces\t1tab\t\t2tab\n1newline", "huggyllama/llama-7b", "google-bert/bert-base-cased"], # chatglm 有blank_n, bert丢掉了空格,
|
24 |
# !?。"#$%&'()*+,-/:;<=>@[\]^_`{|}~⦅⦆「」、、〃》「」『』【】〔〕〖〗〘〙〚〛〜〝〞〟〰〾〿–—‘’‛“”„‟…‧﹏.
|
25 |
+
["punctuation: ,.:/?+=\",。!?;【】〔〕〖〗", "google/gemma-7b", "huggyllama/llama-7b"], # llama词典有点小
|
26 |
+
["symbol: 🦙❤❥웃유♋☮✊☏☢☚✔☑♚▢♪✈✞÷↑↓▤▥⊙■□▣▽¿─│♥❣▬▫☿Ⓐ ✋✉☣☤", "baichuan-inc/Baichuan-7B", "huggyllama/llama-7b"],
|
27 |
# ["special: [PAD] [UNK] [CLS] [SEP] [MASK] <|system|> <|user|> <|assistant|> <|endoftext|>", "", ""],
|
28 |
],
|
29 |
"zh": [
|
|
|
37 |
|
38 |
more_examples = [
|
39 |
# bert系列
|
40 |
+
("bert-base-cased", "bert-base-uncased", "", ""), # # clue VS kplug, bert VS clue
|
41 |
+
("bert-base-cased", "clue", "", "增加了[]()"),
|
42 |
+
("roberta-chinese-clue", "kplug", "", ""),
|
43 |
|
44 |
# llama系列 (基于sentencepiece)
|
45 |
("baichuan", "baichuan2", "baichuan2支持多空格 ,多个换行\n\n\n,do not add dummy prefix as Baichuan1"),
|
46 |
("llama", "baichuan2", "baichuan2支持多空格 ,多个换行\n\n"),
|
47 |
+
("llama", "chinese-llama-2-7b", ""),
|
48 |
("llama", "llama3", "扩充词典"),
|
49 |
+
("chinese-llama-lora-7b", "chinese-llama-2-7b", ""),
|
50 |
|
51 |
# glm系列 (基于sentencepiece)
|
52 |
("glm", "chatglm1", ""),
|
util.py → playground_util.py
RENAMED
@@ -1,22 +1,33 @@
|
|
1 |
import gradio as gr
|
2 |
import json
|
3 |
import pandas as pd
|
4 |
-
import
|
5 |
-
from
|
6 |
-
from utils.character_util import iter_vocab
|
7 |
from utils.log_util import logger
|
8 |
-
from utils.compression_util import tokenize_corpus, unit_convertor
|
9 |
from functools import lru_cache
|
10 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
11 |
|
12 |
@lru_cache
|
13 |
-
def tokenize(
|
|
|
|
|
|
|
|
|
|
|
14 |
"""
|
15 |
"""
|
16 |
logger.info("param=" + json.dumps({"text": text, "tokenizer_type": tokenizer_name}, ensure_ascii=False))
|
17 |
pos_tokens = []
|
18 |
-
tokenizer =
|
19 |
-
if
|
20 |
encoding = tokenizer.encode(text, add_special_tokens=True)
|
21 |
else:
|
22 |
encoding = tokenizer.encode(text, add_special_tokens=False)
|
@@ -34,7 +45,7 @@ def tokenize(text, tokenizer_name, color_num=5):
|
|
34 |
token_str = token.decode("utf-8")
|
35 |
except:
|
36 |
token_str = token.decode("utf-8", errors="ignore")
|
37 |
-
logger.error(f"{idx}: decode_error: " + json.dumps(
|
38 |
{"tokenizer_type": tokenizer_name, "token": str(token), "token_str": token_str},
|
39 |
ensure_ascii=False))
|
40 |
|
@@ -45,7 +56,8 @@ def tokenize(text, tokenizer_name, color_num=5):
|
|
45 |
token_bytes = bytes(token_str, "utf-8")
|
46 |
# json_dumps = json.dumps(token_str)
|
47 |
else:
|
48 |
-
logger.error(f"{idx}: wrong type for token {token_id} {type(token)} " + json.dumps(
|
|
|
49 |
token_str = token
|
50 |
token_bytes = token
|
51 |
# continue
|
@@ -82,30 +94,22 @@ def tokenize_pair(text, tokenizer_type_1, tokenizer_type_2):
|
|
82 |
@lru_cache
|
83 |
def basic_count(tokenizer_name):
|
84 |
stats = iter_vocab(tokenizer_name)
|
85 |
-
return stats['vocab_size'], f'{stats["
|
86 |
# return tokenizer.vocab_size, f'{stats["中文汉字数"]["中文单字"]}/{stats["中文汉字数"]["中文多字"]}'
|
87 |
|
88 |
-
def get_compress_rate(tokenizer_type, all_corpus, unit):
|
89 |
-
tokenizer = load_tokener(tokenizer_type)
|
90 |
-
compress_rate_stats = tokenize_corpus(tokenizer, all_corpus)
|
91 |
-
compress_rate = unit_convertor(compress_rate_stats, unit)
|
92 |
-
return compress_rate
|
93 |
|
94 |
-
# def
|
95 |
-
#
|
96 |
-
#
|
97 |
-
#
|
98 |
-
#
|
99 |
-
|
100 |
-
# stat = tokenize_corpus(tokenizer, [lang])
|
101 |
-
# stats[tokenizer_name] = stat
|
102 |
-
# pprint(stats)
|
103 |
|
104 |
|
105 |
@lru_cache
|
106 |
-
def get_overlap_token_size(
|
107 |
-
tokenizer1 =
|
108 |
-
tokenizer2 =
|
109 |
|
110 |
vocab_set_1 = tokenizer1.get_vocab().keys()
|
111 |
vocab_set_2 = tokenizer2.get_vocab().keys()
|
@@ -121,11 +125,10 @@ def get_overlap_token_size(tokenizer_type_1, tokenizer_type_2):
|
|
121 |
overlap_tokens = vocab_set_1 & vocab_set_2
|
122 |
overlap_token_size = len(overlap_tokens)
|
123 |
logger.info(
|
124 |
-
f"{overlap_token_size} OverlapTokens of {
|
125 |
return overlap_token_size, overlap_token_size
|
126 |
|
127 |
|
128 |
-
|
129 |
def on_load(url_params, request: gr.Request):
|
130 |
"""
|
131 |
onLoad
|
@@ -148,15 +151,16 @@ def on_load(url_params, request: gr.Request):
|
|
148 |
# if "referer" in request.headers: # not work for huggingface-space
|
149 |
# url_params = parse_qs(urlparse(request.headers["referer"]).query)
|
150 |
# url_params = {k: v[0] for k, v in url_params.items() if len(v) > 0}
|
151 |
-
tokenizer_type_1 = url_params.get("tokenizer1",
|
152 |
-
tokenizer_type_2 = url_params.get("tokenizer2",
|
153 |
-
text = url_params.get("text",
|
154 |
logger.info(f"client_ip: {client_ip}; params: {url_params}")
|
155 |
return text, tokenizer_type_1, tokenizer_type_2
|
156 |
|
157 |
|
158 |
-
def compress_rate_unit_change(unit):
|
159 |
-
|
|
|
160 |
|
161 |
def test_coding():
|
162 |
bytes1 = b'\xe4\xb8\xad'
|
@@ -164,5 +168,5 @@ def test_coding():
|
|
164 |
|
165 |
|
166 |
if __name__ == "__main__":
|
167 |
-
print(get_overlap_token_size("
|
168 |
# print(basic_count("internlm_chat_7b"))
|
|
|
1 |
import gradio as gr
|
2 |
import json
|
3 |
import pandas as pd
|
4 |
+
from vocab import tokenizer_factory
|
5 |
+
from character_util import iter_vocab
|
|
|
6 |
from utils.log_util import logger
|
|
|
7 |
from functools import lru_cache
|
8 |
|
9 |
+
default_user_input = """\
|
10 |
+
Replace this text in the input field to see how tokenization works.
|
11 |
+
Buenos días!
|
12 |
+
华为发布Mate60手机。
|
13 |
+
ラグビーワールドカップ2023フランス"""
|
14 |
+
# default_tokenizer_name_1 = "Meta/llama3"
|
15 |
+
default_tokenizer_name_1 = "gradientai/Llama-3-8B-Instruct-Gradient-1048k"
|
16 |
+
default_tokenizer_name_2 = "openai/gpt-4"
|
17 |
|
18 |
@lru_cache
|
19 |
+
def tokenize(
|
20 |
+
text: str,
|
21 |
+
tokenizer_name: str,
|
22 |
+
color_num: int = 5,
|
23 |
+
add_special_token: bool = False
|
24 |
+
):
|
25 |
"""
|
26 |
"""
|
27 |
logger.info("param=" + json.dumps({"text": text, "tokenizer_type": tokenizer_name}, ensure_ascii=False))
|
28 |
pos_tokens = []
|
29 |
+
tokenizer = tokenizer_factory.get_tokenizer(tokenizer_name)
|
30 |
+
if add_special_token:
|
31 |
encoding = tokenizer.encode(text, add_special_tokens=True)
|
32 |
else:
|
33 |
encoding = tokenizer.encode(text, add_special_tokens=False)
|
|
|
45 |
token_str = token.decode("utf-8")
|
46 |
except:
|
47 |
token_str = token.decode("utf-8", errors="ignore")
|
48 |
+
logger.error(f"{idx}: decode_error: " + json.dumps( # gpt_35_turbo 经常有token会decode error,这里用来记录一下
|
49 |
{"tokenizer_type": tokenizer_name, "token": str(token), "token_str": token_str},
|
50 |
ensure_ascii=False))
|
51 |
|
|
|
56 |
token_bytes = bytes(token_str, "utf-8")
|
57 |
# json_dumps = json.dumps(token_str)
|
58 |
else:
|
59 |
+
logger.error(f"{idx}: wrong type for token {token_id} {type(token)} " + json.dumps(
|
60 |
+
{"text": text, "tokenizer_type": tokenizer_name}, ensure_ascii=False))
|
61 |
token_str = token
|
62 |
token_bytes = token
|
63 |
# continue
|
|
|
94 |
@lru_cache
|
95 |
def basic_count(tokenizer_name):
|
96 |
stats = iter_vocab(tokenizer_name)
|
97 |
+
return stats['vocab_size'], f'{stats["organization"]}'
|
98 |
# return tokenizer.vocab_size, f'{stats["中文汉字数"]["中文单字"]}/{stats["中文汉字数"]["中文多字"]}'
|
99 |
|
|
|
|
|
|
|
|
|
|
|
100 |
|
101 |
+
# def get_compress_rate(tokenizer_name, all_corpus, unit):
|
102 |
+
# tokenizer = tokenizer_factory.get_tokenizer(tokenizer_name)
|
103 |
+
# compress_rate_stats = tokenize_corpus(tokenizer, all_corpus)
|
104 |
+
# compress_rate = unit_convertor(compress_rate_stats, unit)
|
105 |
+
# return compress_rate
|
106 |
+
|
|
|
|
|
|
|
107 |
|
108 |
|
109 |
@lru_cache
|
110 |
+
def get_overlap_token_size(tokenizer_name_1, tokenizer_name_2):
|
111 |
+
tokenizer1 = tokenizer_factory.get_tokenizer(tokenizer_name_1)
|
112 |
+
tokenizer2 = tokenizer_factory.get_tokenizer(tokenizer_name_2)
|
113 |
|
114 |
vocab_set_1 = tokenizer1.get_vocab().keys()
|
115 |
vocab_set_2 = tokenizer2.get_vocab().keys()
|
|
|
125 |
overlap_tokens = vocab_set_1 & vocab_set_2
|
126 |
overlap_token_size = len(overlap_tokens)
|
127 |
logger.info(
|
128 |
+
f"{overlap_token_size} OverlapTokens of {tokenizer_name_1} {tokenizer_name_2}: {list(overlap_tokens)[:10]}")
|
129 |
return overlap_token_size, overlap_token_size
|
130 |
|
131 |
|
|
|
132 |
def on_load(url_params, request: gr.Request):
|
133 |
"""
|
134 |
onLoad
|
|
|
151 |
# if "referer" in request.headers: # not work for huggingface-space
|
152 |
# url_params = parse_qs(urlparse(request.headers["referer"]).query)
|
153 |
# url_params = {k: v[0] for k, v in url_params.items() if len(v) > 0}
|
154 |
+
tokenizer_type_1 = url_params.get("tokenizer1", default_tokenizer_name_1)
|
155 |
+
tokenizer_type_2 = url_params.get("tokenizer2", default_tokenizer_name_2)
|
156 |
+
text = url_params.get("text", default_user_input)
|
157 |
logger.info(f"client_ip: {client_ip}; params: {url_params}")
|
158 |
return text, tokenizer_type_1, tokenizer_type_2
|
159 |
|
160 |
|
161 |
+
# def compress_rate_unit_change(unit):
|
162 |
+
# return gr.update(label=f"Compress Rate: {unit}"), gr.update(label=f"Compress Rate: {unit}"),
|
163 |
+
|
164 |
|
165 |
def test_coding():
|
166 |
bytes1 = b'\xe4\xb8\xad'
|
|
|
168 |
|
169 |
|
170 |
if __name__ == "__main__":
|
171 |
+
print(get_overlap_token_size("gpt-35-turbo", "gpt-4"))
|
172 |
# print(basic_count("internlm_chat_7b"))
|
requirements.txt
CHANGED
@@ -6,4 +6,6 @@ torch
|
|
6 |
zhon
|
7 |
nltk
|
8 |
boto3
|
9 |
-
ai2-olmo
|
|
|
|
|
|
6 |
zhon
|
7 |
nltk
|
8 |
boto3
|
9 |
+
ai2-olmo
|
10 |
+
ipadic
|
11 |
+
fugashi
|
stats/character_stats.json
ADDED
@@ -0,0 +1,1712 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"FacebookAI/xlm-roberta-base": {
|
3 |
+
"tokenizer": "<a target=\"_blank\" href=\"https://huggingface.co/FacebookAI/xlm-roberta-base\" style=\"color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;\">xlm-roberta-base</a>",
|
4 |
+
"organization": "Facebook",
|
5 |
+
"vocab_size": 250002,
|
6 |
+
"num(digit)": 2728,
|
7 |
+
"len(digit)": "1,3,9",
|
8 |
+
"num(space)": 1,
|
9 |
+
"len(space)": "1,1,1",
|
10 |
+
"num(ar)": 14644,
|
11 |
+
"len(ar)": "1,4,16",
|
12 |
+
"num(zh)": 18457,
|
13 |
+
"len(zh)": "1,2,16",
|
14 |
+
"num(ja)": 20572,
|
15 |
+
"len(ja)": "1,2,16",
|
16 |
+
"num(ja-kana)": 3434,
|
17 |
+
"len(ja-kana)": "1,3,12",
|
18 |
+
"num(ko)": 5373,
|
19 |
+
"len(ko)": "1,2,8"
|
20 |
+
},
|
21 |
+
"clue/roberta_chinese_clue_tiny": {
|
22 |
+
"tokenizer": "<a target=\"_blank\" href=\"https://huggingface.co/clue/roberta_chinese_clue_tiny\" style=\"color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;\">roberta-chinese-clue</a>",
|
23 |
+
"organization": "CLUE",
|
24 |
+
"vocab_size": 8021,
|
25 |
+
"num(digit)": 230,
|
26 |
+
"len(digit)": "1,4,10",
|
27 |
+
"num(space)": 0,
|
28 |
+
"len(space)": "-",
|
29 |
+
"num(ar)": 30,
|
30 |
+
"len(ar)": "1,2,3",
|
31 |
+
"num(zh)": 5689,
|
32 |
+
"len(zh)": "1,1,1",
|
33 |
+
"num(ja)": 5691,
|
34 |
+
"len(ja)": "1,1,3",
|
35 |
+
"num(ja-kana)": 0,
|
36 |
+
"len(ja-kana)": "-",
|
37 |
+
"num(ko)": 0,
|
38 |
+
"len(ko)": "-"
|
39 |
+
},
|
40 |
+
"dbmdz/bert-base-german-uncased": {
|
41 |
+
"tokenizer": "<a target=\"_blank\" href=\"https://huggingface.co/dbmdz/bert-base-german-uncased\" style=\"color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;\">bert-base-german-uncased</a>",
|
42 |
+
"organization": "dbmdz",
|
43 |
+
"vocab_size": 31102,
|
44 |
+
"num(digit)": 1733,
|
45 |
+
"len(digit)": "1,4,12",
|
46 |
+
"num(space)": 0,
|
47 |
+
"len(space)": "-",
|
48 |
+
"num(ar)": 0,
|
49 |
+
"len(ar)": "-",
|
50 |
+
"num(zh)": 0,
|
51 |
+
"len(zh)": "-",
|
52 |
+
"num(ja)": 0,
|
53 |
+
"len(ja)": "-",
|
54 |
+
"num(ja-kana)": 0,
|
55 |
+
"len(ja-kana)": "-",
|
56 |
+
"num(ko)": 0,
|
57 |
+
"len(ko)": "-"
|
58 |
+
},
|
59 |
+
"google-bert/bert-base-cased": {
|
60 |
+
"tokenizer": "<a target=\"_blank\" href=\"https://huggingface.co/google-bert/bert-base-cased\" style=\"color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;\">bert-base-cased</a>",
|
61 |
+
"organization": "Google",
|
62 |
+
"vocab_size": 28996,
|
63 |
+
"num(digit)": 926,
|
64 |
+
"len(digit)": "1,4,11",
|
65 |
+
"num(space)": 0,
|
66 |
+
"len(space)": "-",
|
67 |
+
"num(ar)": 94,
|
68 |
+
"len(ar)": "1,3,4",
|
69 |
+
"num(zh)": 226,
|
70 |
+
"len(zh)": "1,2,3",
|
71 |
+
"num(ja)": 390,
|
72 |
+
"len(ja)": "1,2,3",
|
73 |
+
"num(ja-kana)": 164,
|
74 |
+
"len(ja-kana)": "1,2,3",
|
75 |
+
"num(ko)": 10,
|
76 |
+
"len(ko)": "1,2,3"
|
77 |
+
},
|
78 |
+
"google-bert/bert-base-chinese": {
|
79 |
+
"tokenizer": "<a target=\"_blank\" href=\"https://huggingface.co/google-bert/bert-base-chinese\" style=\"color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;\">bert-base-chinese</a>",
|
80 |
+
"organization": "Google",
|
81 |
+
"vocab_size": 21128,
|
82 |
+
"num(digit)": 1451,
|
83 |
+
"len(digit)": "1,3,12",
|
84 |
+
"num(space)": 2,
|
85 |
+
"len(space)": "1,2,3",
|
86 |
+
"num(ar)": 30,
|
87 |
+
"len(ar)": "1,2,3",
|
88 |
+
"num(zh)": 14642,
|
89 |
+
"len(zh)": "1,2,3",
|
90 |
+
"num(ja)": 15197,
|
91 |
+
"len(ja)": "1,3,15",
|
92 |
+
"num(ja-kana)": 553,
|
93 |
+
"len(ja-kana)": "1,3,15",
|
94 |
+
"num(ko)": 0,
|
95 |
+
"len(ko)": "-"
|
96 |
+
},
|
97 |
+
"google-bert/bert-base-german-cased": {
|
98 |
+
"tokenizer": "<a target=\"_blank\" href=\"https://huggingface.co/google-bert/bert-base-german-cased\" style=\"color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;\">bert-base-german-cased</a>",
|
99 |
+
"organization": "Google",
|
100 |
+
"vocab_size": 30000,
|
101 |
+
"num(digit)": 4065,
|
102 |
+
"len(digit)": "1,11,22",
|
103 |
+
"num(space)": 0,
|
104 |
+
"len(space)": "-",
|
105 |
+
"num(ar)": 0,
|
106 |
+
"len(ar)": "-",
|
107 |
+
"num(zh)": 0,
|
108 |
+
"len(zh)": "-",
|
109 |
+
"num(ja)": 0,
|
110 |
+
"len(ja)": "-",
|
111 |
+
"num(ja-kana)": 0,
|
112 |
+
"len(ja-kana)": "-",
|
113 |
+
"num(ko)": 0,
|
114 |
+
"len(ko)": "-"
|
115 |
+
},
|
116 |
+
"google-bert/bert-base-multilingual-cased": {
|
117 |
+
"tokenizer": "<a target=\"_blank\" href=\"https://huggingface.co/google-bert/bert-base-multilingual-cased\" style=\"color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;\">bert-base-multilingual-cased</a>",
|
118 |
+
"organization": "Google",
|
119 |
+
"vocab_size": 119547,
|
120 |
+
"num(digit)": 2583,
|
121 |
+
"len(digit)": "1,3,13",
|
122 |
+
"num(space)": 0,
|
123 |
+
"len(space)": "-",
|
124 |
+
"num(ar)": 4873,
|
125 |
+
"len(ar)": "1,5,14",
|
126 |
+
"num(zh)": 13542,
|
127 |
+
"len(zh)": "1,2,3",
|
128 |
+
"num(ja)": 14880,
|
129 |
+
"len(ja)": "1,3,10",
|
130 |
+
"num(ja-kana)": 1336,
|
131 |
+
"len(ja-kana)": "1,4,10",
|
132 |
+
"num(ko)": 3271,
|
133 |
+
"len(ko)": "1,3,6"
|
134 |
+
},
|
135 |
+
"google-bert/bert-base-multilingual-uncased": {
|
136 |
+
"tokenizer": "<a target=\"_blank\" href=\"https://huggingface.co/google-bert/bert-base-multilingual-uncased\" style=\"color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;\">bert-base-multilingual-uncased</a>",
|
137 |
+
"organization": "Google",
|
138 |
+
"vocab_size": 105879,
|
139 |
+
"num(digit)": 2510,
|
140 |
+
"len(digit)": "1,3,13",
|
141 |
+
"num(space)": 2,
|
142 |
+
"len(space)": "1,2,3",
|
143 |
+
"num(ar)": 4530,
|
144 |
+
"len(ar)": "1,5,13",
|
145 |
+
"num(zh)": 16658,
|
146 |
+
"len(zh)": "1,2,3",
|
147 |
+
"num(ja)": 17858,
|
148 |
+
"len(ja)": "1,3,10",
|
149 |
+
"num(ja-kana)": 1188,
|
150 |
+
"len(ja-kana)": "1,4,10",
|
151 |
+
"num(ko)": 0,
|
152 |
+
"len(ko)": "-"
|
153 |
+
},
|
154 |
+
"google-bert/bert-base-uncased": {
|
155 |
+
"tokenizer": "<a target=\"_blank\" href=\"https://huggingface.co/google-bert/bert-base-uncased\" style=\"color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;\">bert-base-uncased</a>",
|
156 |
+
"organization": "Google",
|
157 |
+
"vocab_size": 30522,
|
158 |
+
"num(digit)": 2056,
|
159 |
+
"len(digit)": "1,4,11",
|
160 |
+
"num(space)": 0,
|
161 |
+
"len(space)": "-",
|
162 |
+
"num(ar)": 88,
|
163 |
+
"len(ar)": "1,3,5",
|
164 |
+
"num(zh)": 488,
|
165 |
+
"len(zh)": "1,2,3",
|
166 |
+
"num(ja)": 676,
|
167 |
+
"len(ja)": "1,2,3",
|
168 |
+
"num(ja-kana)": 188,
|
169 |
+
"len(ja-kana)": "1,2,3",
|
170 |
+
"num(ko)": 0,
|
171 |
+
"len(ko)": "-"
|
172 |
+
},
|
173 |
+
"google/mobilebert-uncased": {
|
174 |
+
"tokenizer": "<a target=\"_blank\" href=\"https://huggingface.co/google/mobilebert-uncased\" style=\"color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;\">mobilebert-uncased</a>",
|
175 |
+
"organization": "Google",
|
176 |
+
"vocab_size": 30522,
|
177 |
+
"num(digit)": 2056,
|
178 |
+
"len(digit)": "1,4,11",
|
179 |
+
"num(space)": 0,
|
180 |
+
"len(space)": "-",
|
181 |
+
"num(ar)": 88,
|
182 |
+
"len(ar)": "1,3,5",
|
183 |
+
"num(zh)": 488,
|
184 |
+
"len(zh)": "1,2,3",
|
185 |
+
"num(ja)": 676,
|
186 |
+
"len(ja)": "1,2,3",
|
187 |
+
"num(ja-kana)": 188,
|
188 |
+
"len(ja-kana)": "1,2,3",
|
189 |
+
"num(ko)": 0,
|
190 |
+
"len(ko)": "-"
|
191 |
+
},
|
192 |
+
"tohoku-nlp/bert-base-japanese": {
|
193 |
+
"tokenizer": "<a target=\"_blank\" href=\"https://huggingface.co/tohoku-nlp/bert-base-japanese\" style=\"color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;\">bert-base-japanese</a>",
|
194 |
+
"organization": "Tohoku",
|
195 |
+
"vocab_size": 32000,
|
196 |
+
"num(digit)": 669,
|
197 |
+
"len(digit)": "1,3,5",
|
198 |
+
"num(space)": 0,
|
199 |
+
"len(space)": "-",
|
200 |
+
"num(ar)": 10,
|
201 |
+
"len(ar)": "1,3,3",
|
202 |
+
"num(zh)": 18792,
|
203 |
+
"len(zh)": "1,2,11",
|
204 |
+
"num(ja)": 28367,
|
205 |
+
"len(ja)": "1,2,13",
|
206 |
+
"num(ja-kana)": 12359,
|
207 |
+
"len(ja-kana)": "1,4,13",
|
208 |
+
"num(ko)": 0,
|
209 |
+
"len(ko)": "-"
|
210 |
+
},
|
211 |
+
"gpt-4": {
|
212 |
+
"tokenizer": "<a target=\"_blank\" href=\"https://github.com/openai/tiktoken\" style=\"color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;\">gpt-4</a>",
|
213 |
+
"organization": "OpenAI",
|
214 |
+
"vocab_size": 100277,
|
215 |
+
"num(digit)": 1110,
|
216 |
+
"len(digit)": "1,3,3",
|
217 |
+
"num(space)": 47472,
|
218 |
+
"len(space)": "1,7,128",
|
219 |
+
"num(ar)": 113,
|
220 |
+
"len(ar)": "1,2,10",
|
221 |
+
"num(zh)": 868,
|
222 |
+
"len(zh)": "1,1,7",
|
223 |
+
"num(ja)": 1035,
|
224 |
+
"len(ja)": "1,1,7",
|
225 |
+
"num(ja-kana)": 169,
|
226 |
+
"len(ja-kana)": "1,1,7",
|
227 |
+
"num(ko)": 299,
|
228 |
+
"len(ko)": "1,2,4"
|
229 |
+
},
|
230 |
+
"llama3": {
|
231 |
+
"tokenizer": "<a target=\"_blank\" href=\"https://huggingface.co/gradientai/Llama-3-8B-Instruct-Gradient-1048k\" style=\"color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;\">llama3</a>",
|
232 |
+
"organization": "Meta",
|
233 |
+
"vocab_size": 128256,
|
234 |
+
"num(digit)": 1110,
|
235 |
+
"len(digit)": "1,3,3",
|
236 |
+
"num(space)": 60860,
|
237 |
+
"len(space)": "1,6,128",
|
238 |
+
"num(ar)": 3810,
|
239 |
+
"len(ar)": "1,4,11",
|
240 |
+
"num(zh)": 4424,
|
241 |
+
"len(zh)": "1,1,7",
|
242 |
+
"num(ja)": 5387,
|
243 |
+
"len(ja)": "1,2,8",
|
244 |
+
"num(ja-kana)": 1086,
|
245 |
+
"len(ja-kana)": "1,2,8",
|
246 |
+
"num(ko)": 2281,
|
247 |
+
"len(ko)": "1,2,6"
|
248 |
+
},
|
249 |
+
"google-t5/t5-large": {
|
250 |
+
"tokenizer": "<a target=\"_blank\" href=\"https://huggingface.co/google-t5/t5-large\" style=\"color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;\">t5</a>",
|
251 |
+
"organization": "Google",
|
252 |
+
"vocab_size": 32100,
|
253 |
+
"num(digit)": 1133,
|
254 |
+
"len(digit)": "1,3,13",
|
255 |
+
"num(space)": 0,
|
256 |
+
"len(space)": "-",
|
257 |
+
"num(ar)": 0,
|
258 |
+
"len(ar)": "-",
|
259 |
+
"num(zh)": 0,
|
260 |
+
"len(zh)": "-",
|
261 |
+
"num(ja)": 0,
|
262 |
+
"len(ja)": "-",
|
263 |
+
"num(ja-kana)": 0,
|
264 |
+
"len(ja-kana)": "-",
|
265 |
+
"num(ko)": 0,
|
266 |
+
"len(ko)": "-"
|
267 |
+
},
|
268 |
+
"google/byt5-small": {
|
269 |
+
"tokenizer": "<a target=\"_blank\" href=\"https://huggingface.co/google/byt5-small\" style=\"color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;\">byt5-small</a>",
|
270 |
+
"organization": "Google",
|
271 |
+
"vocab_size": 384,
|
272 |
+
"num(digit)": 10,
|
273 |
+
"len(digit)": "1,1,1",
|
274 |
+
"num(space)": 10,
|
275 |
+
"len(space)": "1,1,1",
|
276 |
+
"num(ar)": 0,
|
277 |
+
"len(ar)": "-",
|
278 |
+
"num(zh)": 0,
|
279 |
+
"len(zh)": "-",
|
280 |
+
"num(ja)": 0,
|
281 |
+
"len(ja)": "-",
|
282 |
+
"num(ja-kana)": 0,
|
283 |
+
"len(ja-kana)": "-",
|
284 |
+
"num(ko)": 0,
|
285 |
+
"len(ko)": "-"
|
286 |
+
},
|
287 |
+
"google/mt5-large": {
|
288 |
+
"tokenizer": "<a target=\"_blank\" href=\"https://huggingface.co/google/mt5-large\" style=\"color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;\">mt5-large</a>",
|
289 |
+
"organization": "Google",
|
290 |
+
"vocab_size": 250100,
|
291 |
+
"num(digit)": 16829,
|
292 |
+
"len(digit)": "1,4,16",
|
293 |
+
"num(space)": 1,
|
294 |
+
"len(space)": "1,1,1",
|
295 |
+
"num(ar)": 7459,
|
296 |
+
"len(ar)": "1,3,16",
|
297 |
+
"num(zh)": 21489,
|
298 |
+
"len(zh)": "1,2,16",
|
299 |
+
"num(ja)": 27078,
|
300 |
+
"len(ja)": "1,2,16",
|
301 |
+
"num(ja-kana)": 9160,
|
302 |
+
"len(ja-kana)": "1,3,14",
|
303 |
+
"num(ko)": 4041,
|
304 |
+
"len(ko)": "1,1,10"
|
305 |
+
},
|
306 |
+
"lmsys/fastchat-t5-3b-v1.0": {
|
307 |
+
"tokenizer": "<a target=\"_blank\" href=\"https://huggingface.co/lmsys/fastchat-t5-3b-v1.0\" style=\"color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;\">fastchat-t5-3b-v1.0</a>",
|
308 |
+
"organization": "LMSYS",
|
309 |
+
"vocab_size": 32110,
|
310 |
+
"num(digit)": 1033,
|
311 |
+
"len(digit)": "1,3,8",
|
312 |
+
"num(space)": 0,
|
313 |
+
"len(space)": "-",
|
314 |
+
"num(ar)": 0,
|
315 |
+
"len(ar)": "-",
|
316 |
+
"num(zh)": 0,
|
317 |
+
"len(zh)": "-",
|
318 |
+
"num(ja)": 0,
|
319 |
+
"len(ja)": "-",
|
320 |
+
"num(ja-kana)": 0,
|
321 |
+
"len(ja-kana)": "-",
|
322 |
+
"num(ko)": 0,
|
323 |
+
"len(ko)": "-"
|
324 |
+
},
|
325 |
+
"paust/pko-t5-large": {
|
326 |
+
"tokenizer": "<a target=\"_blank\" href=\"https://huggingface.co/paust/pko-t5-large\" style=\"color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;\">pko-t5-large</a>",
|
327 |
+
"organization": "PAUST",
|
328 |
+
"vocab_size": 50358,
|
329 |
+
"num(digit)": 51,
|
330 |
+
"len(digit)": "1,2,3",
|
331 |
+
"num(space)": 10,
|
332 |
+
"len(space)": "1,1,1",
|
333 |
+
"num(ar)": 0,
|
334 |
+
"len(ar)": "-",
|
335 |
+
"num(zh)": 0,
|
336 |
+
"len(zh)": "-",
|
337 |
+
"num(ja)": 0,
|
338 |
+
"len(ja)": "-",
|
339 |
+
"num(ja-kana)": 0,
|
340 |
+
"len(ja-kana)": "-",
|
341 |
+
"num(ko)": 49050,
|
342 |
+
"len(ko)": "1,2,16"
|
343 |
+
},
|
344 |
+
"bloom": {
|
345 |
+
"tokenizer": "<a target=\"_blank\" href=\"https://huggingface.co/bigscience/bloom\" style=\"color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;\">bloom</a>",
|
346 |
+
"organization": "BigScience",
|
347 |
+
"vocab_size": 250680,
|
348 |
+
"num(digit)": 6629,
|
349 |
+
"len(digit)": "1,4,50",
|
350 |
+
"num(space)": 140180,
|
351 |
+
"len(space)": "1,6,600",
|
352 |
+
"num(ar)": 20854,
|
353 |
+
"len(ar)": "1,5,16",
|
354 |
+
"num(zh)": 30603,
|
355 |
+
"len(zh)": "1,2,23",
|
356 |
+
"num(ja)": 30816,
|
357 |
+
"len(ja)": "1,2,23",
|
358 |
+
"num(ja-kana)": 214,
|
359 |
+
"len(ja-kana)": "1,1,3",
|
360 |
+
"num(ko)": 338,
|
361 |
+
"len(ko)": "1,1,3"
|
362 |
+
},
|
363 |
+
"llama": {
|
364 |
+
"tokenizer": "<a target=\"_blank\" href=\"https://huggingface.co/huggyllama/llama-7b\" style=\"color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;\">llama</a>",
|
365 |
+
"organization": "Meta",
|
366 |
+
"vocab_size": 32000,
|
367 |
+
"num(digit)": 20,
|
368 |
+
"len(digit)": "1,1,1",
|
369 |
+
"num(space)": 61,
|
370 |
+
"len(space)": "1,2,15",
|
371 |
+
"num(ar)": 55,
|
372 |
+
"len(ar)": "1,1,2",
|
373 |
+
"num(zh)": 700,
|
374 |
+
"len(zh)": "1,1,1",
|
375 |
+
"num(ja)": 837,
|
376 |
+
"len(ja)": "1,1,1",
|
377 |
+
"num(ja-kana)": 137,
|
378 |
+
"len(ja-kana)": "1,1,1",
|
379 |
+
"num(ko)": 111,
|
380 |
+
"len(ko)": "1,1,1"
|
381 |
+
},
|
382 |
+
"ClueAI/ChatYuan-large-v2": {
|
383 |
+
"tokenizer": "<a target=\"_blank\" href=\"https://huggingface.co/ClueAI/ChatYuan-large-v2\" style=\"color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;\">ChatYuan-large-v2</a>",
|
384 |
+
"organization": "CLUE",
|
385 |
+
"vocab_size": 32128,
|
386 |
+
"num(digit)": 740,
|
387 |
+
"len(digit)": "1,3,9",
|
388 |
+
"num(space)": 0,
|
389 |
+
"len(space)": "-",
|
390 |
+
"num(ar)": 2,
|
391 |
+
"len(ar)": "1,1,1",
|
392 |
+
"num(zh)": 29591,
|
393 |
+
"len(zh)": "1,2,16",
|
394 |
+
"num(ja)": 29736,
|
395 |
+
"len(ja)": "1,2,16",
|
396 |
+
"num(ja-kana)": 145,
|
397 |
+
"len(ja-kana)": "1,1,2",
|
398 |
+
"num(ko)": 0,
|
399 |
+
"len(ko)": "-"
|
400 |
+
},
|
401 |
+
"Meta/llama3": {
|
402 |
+
"tokenizer": "<a target=\"_blank\" href=\"https://huggingface.co/gradientai/Llama-3-8B-Instruct-Gradient-1048k\" style=\"color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;\">llama3</a>",
|
403 |
+
"organization": "Meta",
|
404 |
+
"vocab_size": 128256,
|
405 |
+
"num(digit)": 1110,
|
406 |
+
"len(digit)": "1,3,3",
|
407 |
+
"num(space)": 60860,
|
408 |
+
"len(space)": "1,6,128",
|
409 |
+
"num(ar)": 3810,
|
410 |
+
"len(ar)": "1,4,11",
|
411 |
+
"num(zh)": 4424,
|
412 |
+
"len(zh)": "1,1,7",
|
413 |
+
"num(ja)": 5387,
|
414 |
+
"len(ja)": "1,2,8",
|
415 |
+
"num(ja-kana)": 1086,
|
416 |
+
"len(ja-kana)": "1,2,8",
|
417 |
+
"num(ko)": 2281,
|
418 |
+
"len(ko)": "1,2,6"
|
419 |
+
},
|
420 |
+
"openai/gpt-4": {
|
421 |
+
"tokenizer": "<a target=\"_blank\" href=\"https://github.com/openai/tiktoken\" style=\"color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;\">gpt-4</a>",
|
422 |
+
"organization": "OpenAI",
|
423 |
+
"vocab_size": 100277,
|
424 |
+
"num(digit)": 1110,
|
425 |
+
"len(digit)": "1,3,3",
|
426 |
+
"num(space)": 47472,
|
427 |
+
"len(space)": "1,7,128",
|
428 |
+
"num(ar)": 113,
|
429 |
+
"len(ar)": "1,2,10",
|
430 |
+
"num(zh)": 868,
|
431 |
+
"len(zh)": "1,1,7",
|
432 |
+
"num(ja)": 1035,
|
433 |
+
"len(ja)": "1,1,7",
|
434 |
+
"num(ja-kana)": 169,
|
435 |
+
"len(ja-kana)": "1,1,7",
|
436 |
+
"num(ko)": 299,
|
437 |
+
"len(ko)": "1,2,4"
|
438 |
+
},
|
439 |
+
"gradientai/Llama-3-8B-Instruct-Gradient-1048k": {
|
440 |
+
"tokenizer": "<a target=\"_blank\" href=\"https://huggingface.co/gradientai/Llama-3-8B-Instruct-Gradient-1048k\" style=\"color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;\">llama3</a>",
|
441 |
+
"organization": "Meta",
|
442 |
+
"vocab_size": 128256,
|
443 |
+
"num(digit)": 1110,
|
444 |
+
"len(digit)": "1,3,3",
|
445 |
+
"num(space)": 60860,
|
446 |
+
"len(space)": "1,6,128",
|
447 |
+
"num(ar)": 3810,
|
448 |
+
"len(ar)": "1,4,11",
|
449 |
+
"num(zh)": 4424,
|
450 |
+
"len(zh)": "1,1,7",
|
451 |
+
"num(ja)": 5387,
|
452 |
+
"len(ja)": "1,2,8",
|
453 |
+
"num(ja-kana)": 1086,
|
454 |
+
"len(ja-kana)": "1,2,8",
|
455 |
+
"num(ko)": 2281,
|
456 |
+
"len(ko)": "1,2,6"
|
457 |
+
},
|
458 |
+
"bigscience/bloom": {
|
459 |
+
"tokenizer": "<a target=\"_blank\" href=\"https://huggingface.co/bigscience/bloom\" style=\"color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;\">bloom</a>",
|
460 |
+
"organization": "BigScience",
|
461 |
+
"vocab_size": 250680,
|
462 |
+
"num(digit)": 6629,
|
463 |
+
"len(digit)": "1,4,50",
|
464 |
+
"num(space)": 140180,
|
465 |
+
"len(space)": "1,6,600",
|
466 |
+
"num(ar)": 20854,
|
467 |
+
"len(ar)": "1,5,16",
|
468 |
+
"num(zh)": 30603,
|
469 |
+
"len(zh)": "1,2,23",
|
470 |
+
"num(ja)": 30816,
|
471 |
+
"len(ja)": "1,2,23",
|
472 |
+
"num(ja-kana)": 214,
|
473 |
+
"len(ja-kana)": "1,1,3",
|
474 |
+
"num(ko)": 338,
|
475 |
+
"len(ko)": "1,1,3"
|
476 |
+
},
|
477 |
+
"huggyllama/llama-7b": {
|
478 |
+
"tokenizer": "<a target=\"_blank\" href=\"https://huggingface.co/huggyllama/llama-7b\" style=\"color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;\">llama</a>",
|
479 |
+
"organization": "Meta",
|
480 |
+
"vocab_size": 32000,
|
481 |
+
"num(digit)": 20,
|
482 |
+
"len(digit)": "1,1,1",
|
483 |
+
"num(space)": 61,
|
484 |
+
"len(space)": "1,2,15",
|
485 |
+
"num(ar)": 55,
|
486 |
+
"len(ar)": "1,1,2",
|
487 |
+
"num(zh)": 700,
|
488 |
+
"len(zh)": "1,1,1",
|
489 |
+
"num(ja)": 837,
|
490 |
+
"len(ja)": "1,1,1",
|
491 |
+
"num(ja-kana)": 137,
|
492 |
+
"len(ja-kana)": "1,1,1",
|
493 |
+
"num(ko)": 111,
|
494 |
+
"len(ko)": "1,1,1"
|
495 |
+
},
|
496 |
+
"baichuan-inc/Baichuan-7B": {
|
497 |
+
"tokenizer": "<a target=\"_blank\" href=\"https://huggingface.co/baichuan-inc/Baichuan-7B\" style=\"color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;\">baichuan</a>",
|
498 |
+
"organization": "Baichuan",
|
499 |
+
"vocab_size": 64000,
|
500 |
+
"num(digit)": 335,
|
501 |
+
"len(digit)": "1,14,14",
|
502 |
+
"num(space)": 13,
|
503 |
+
"len(space)": "1,1,1",
|
504 |
+
"num(ar)": 299,
|
505 |
+
"len(ar)": "1,1,2",
|
506 |
+
"num(zh)": 27676,
|
507 |
+
"len(zh)": "1,1,9",
|
508 |
+
"num(ja)": 28522,
|
509 |
+
"len(ja)": "1,1,9",
|
510 |
+
"num(ja-kana)": 178,
|
511 |
+
"len(ja-kana)": "1,1,1",
|
512 |
+
"num(ko)": 1591,
|
513 |
+
"len(ko)": "1,1,1"
|
514 |
+
},
|
515 |
+
"01-ai/Yi-34B": {
|
516 |
+
"tokenizer": "<a target=\"_blank\" href=\"https://huggingface.co/01-ai/Yi-34B\" style=\"color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;\">Yi-34B</a>",
|
517 |
+
"organization": "Yi",
|
518 |
+
"vocab_size": 64000,
|
519 |
+
"num(digit)": 200,
|
520 |
+
"len(digit)": "1,13,15",
|
521 |
+
"num(space)": 24274,
|
522 |
+
"len(space)": "1,7,16",
|
523 |
+
"num(ar)": 18,
|
524 |
+
"len(ar)": "1,1,4",
|
525 |
+
"num(zh)": 21356,
|
526 |
+
"len(zh)": "1,2,12",
|
527 |
+
"num(ja)": 21407,
|
528 |
+
"len(ja)": "1,2,12",
|
529 |
+
"num(ja-kana)": 51,
|
530 |
+
"len(ja-kana)": "1,1,2",
|
531 |
+
"num(ko)": 28,
|
532 |
+
"len(ko)": "1,1,2"
|
533 |
+
},
|
534 |
+
"01-ai/Yi-6B": {
|
535 |
+
"tokenizer": "<a target=\"_blank\" href=\"https://huggingface.co/01-ai/Yi-6B\" style=\"color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;\">Yi-6B</a>",
|
536 |
+
"organization": "Yi",
|
537 |
+
"vocab_size": 64000,
|
538 |
+
"num(digit)": 200,
|
539 |
+
"len(digit)": "1,13,15",
|
540 |
+
"num(space)": 24274,
|
541 |
+
"len(space)": "1,7,16",
|
542 |
+
"num(ar)": 18,
|
543 |
+
"len(ar)": "1,1,4",
|
544 |
+
"num(zh)": 21356,
|
545 |
+
"len(zh)": "1,2,12",
|
546 |
+
"num(ja)": 21407,
|
547 |
+
"len(ja)": "1,2,12",
|
548 |
+
"num(ja-kana)": 51,
|
549 |
+
"len(ja-kana)": "1,1,2",
|
550 |
+
"num(ko)": 28,
|
551 |
+
"len(ko)": "1,1,2"
|
552 |
+
},
|
553 |
+
"01-ai/Yi-VL-34B": {
|
554 |
+
"tokenizer": "<a target=\"_blank\" href=\"https://huggingface.co/01-ai/Yi-VL-34B\" style=\"color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;\">Yi-VL-34B</a>",
|
555 |
+
"organization": "Yi",
|
556 |
+
"vocab_size": 64000,
|
557 |
+
"num(digit)": 200,
|
558 |
+
"len(digit)": "1,13,15",
|
559 |
+
"num(space)": 43,
|
560 |
+
"len(space)": "1,2,15",
|
561 |
+
"num(ar)": 18,
|
562 |
+
"len(ar)": "1,1,4",
|
563 |
+
"num(zh)": 21356,
|
564 |
+
"len(zh)": "1,2,12",
|
565 |
+
"num(ja)": 21407,
|
566 |
+
"len(ja)": "1,2,12",
|
567 |
+
"num(ja-kana)": 51,
|
568 |
+
"len(ja-kana)": "1,1,2",
|
569 |
+
"num(ko)": 28,
|
570 |
+
"len(ko)": "1,1,2"
|
571 |
+
},
|
572 |
+
"ClassCat/gpt2-base-french": {
|
573 |
+
"tokenizer": "<a target=\"_blank\" href=\"https://huggingface.co/ClassCat/gpt2-base-french\" style=\"color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;\">gpt2-base-french</a>",
|
574 |
+
"organization": "ClassCat",
|
575 |
+
"vocab_size": 50000,
|
576 |
+
"num(digit)": 1833,
|
577 |
+
"len(digit)": "1,4,5",
|
578 |
+
"num(space)": 31889,
|
579 |
+
"len(space)": "1,7,32",
|
580 |
+
"num(ar)": 41,
|
581 |
+
"len(ar)": "1,1,4",
|
582 |
+
"num(zh)": 27,
|
583 |
+
"len(zh)": "1,1,1",
|
584 |
+
"num(ja)": 46,
|
585 |
+
"len(ja)": "1,1,2",
|
586 |
+
"num(ja-kana)": 19,
|
587 |
+
"len(ja-kana)": "1,1,2",
|
588 |
+
"num(ko)": 0,
|
589 |
+
"len(ko)": "-"
|
590 |
+
},
|
591 |
+
"ClassCat/gpt2-base-spanish": {
|
592 |
+
"tokenizer": "<a target=\"_blank\" href=\"https://huggingface.co/ClassCat/gpt2-base-spanish\" style=\"color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;\">gpt2-base-spanish</a>",
|
593 |
+
"organization": "ClassCat",
|
594 |
+
"vocab_size": 50000,
|
595 |
+
"num(digit)": 1492,
|
596 |
+
"len(digit)": "1,4,9",
|
597 |
+
"num(space)": 34496,
|
598 |
+
"len(space)": "1,8,32",
|
599 |
+
"num(ar)": 36,
|
600 |
+
"len(ar)": "1,1,4",
|
601 |
+
"num(zh)": 13,
|
602 |
+
"len(zh)": "1,1,1",
|
603 |
+
"num(ja)": 36,
|
604 |
+
"len(ja)": "1,1,2",
|
605 |
+
"num(ja-kana)": 23,
|
606 |
+
"len(ja-kana)": "1,1,2",
|
607 |
+
"num(ko)": 0,
|
608 |
+
"len(ko)": "-"
|
609 |
+
},
|
610 |
+
"ClueAI/PromptCLUE-base": {
|
611 |
+
"tokenizer": "<a target=\"_blank\" href=\"https://huggingface.co/ClueAI/PromptCLUE-base\" style=\"color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;\">PromptCLUE-base</a>",
|
612 |
+
"organization": "CLUE",
|
613 |
+
"vocab_size": 32128,
|
614 |
+
"num(digit)": 740,
|
615 |
+
"len(digit)": "1,3,9",
|
616 |
+
"num(space)": 0,
|
617 |
+
"len(space)": "-",
|
618 |
+
"num(ar)": 2,
|
619 |
+
"len(ar)": "1,1,1",
|
620 |
+
"num(zh)": 29591,
|
621 |
+
"len(zh)": "1,2,16",
|
622 |
+
"num(ja)": 29736,
|
623 |
+
"len(ja)": "1,2,16",
|
624 |
+
"num(ja-kana)": 145,
|
625 |
+
"len(ja-kana)": "1,1,2",
|
626 |
+
"num(ko)": 0,
|
627 |
+
"len(ko)": "-"
|
628 |
+
},
|
629 |
+
"CohereForAI/aya-101": {
|
630 |
+
"tokenizer": "<a target=\"_blank\" href=\"https://huggingface.co/CohereForAI/aya-101\" style=\"color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;\">aya-101</a>",
|
631 |
+
"organization": "Cohere For AI",
|
632 |
+
"vocab_size": 250100,
|
633 |
+
"num(digit)": 16829,
|
634 |
+
"len(digit)": "1,4,16",
|
635 |
+
"num(space)": 1,
|
636 |
+
"len(space)": "1,1,1",
|
637 |
+
"num(ar)": 7459,
|
638 |
+
"len(ar)": "1,3,16",
|
639 |
+
"num(zh)": 21489,
|
640 |
+
"len(zh)": "1,2,16",
|
641 |
+
"num(ja)": 27078,
|
642 |
+
"len(ja)": "1,2,16",
|
643 |
+
"num(ja-kana)": 9160,
|
644 |
+
"len(ja-kana)": "1,3,14",
|
645 |
+
"num(ko)": 4041,
|
646 |
+
"len(ko)": "1,1,10"
|
647 |
+
},
|
648 |
+
"EleutherAI/gpt-neox-20b": {
|
649 |
+
"tokenizer": "<a target=\"_blank\" href=\"https://huggingface.co/EleutherAI/gpt-neox-20b\" style=\"color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;\">gpt-neox-20b</a>",
|
650 |
+
"organization": "EleutherAI",
|
651 |
+
"vocab_size": 50277,
|
652 |
+
"num(digit)": 2036,
|
653 |
+
"len(digit)": "1,3,35",
|
654 |
+
"num(space)": 28996,
|
655 |
+
"len(space)": "1,7,512",
|
656 |
+
"num(ar)": 94,
|
657 |
+
"len(ar)": "1,2,4",
|
658 |
+
"num(zh)": 313,
|
659 |
+
"len(zh)": "1,1,2",
|
660 |
+
"num(ja)": 480,
|
661 |
+
"len(ja)": "1,1,4",
|
662 |
+
"num(ja-kana)": 167,
|
663 |
+
"len(ja-kana)": "1,1,4",
|
664 |
+
"num(ko)": 25,
|
665 |
+
"len(ko)": "1,1,2"
|
666 |
+
},
|
667 |
+
"HuggingFaceH4/starchat-alpha": {
|
668 |
+
"tokenizer": "<a target=\"_blank\" href=\"https://huggingface.co/HuggingFaceH4/starchat-alpha\" style=\"color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;\">starchat-alpha</a>",
|
669 |
+
"organization": "-",
|
670 |
+
"vocab_size": 49156,
|
671 |
+
"num(digit)": 10,
|
672 |
+
"len(digit)": "1,1,1",
|
673 |
+
"num(space)": 16515,
|
674 |
+
"len(space)": "1,6,256",
|
675 |
+
"num(ar)": 84,
|
676 |
+
"len(ar)": "1,2,4",
|
677 |
+
"num(zh)": 2030,
|
678 |
+
"len(zh)": "1,1,7",
|
679 |
+
"num(ja)": 2368,
|
680 |
+
"len(ja)": "1,1,8",
|
681 |
+
"num(ja-kana)": 360,
|
682 |
+
"len(ja-kana)": "1,2,8",
|
683 |
+
"num(ko)": 491,
|
684 |
+
"len(ko)": "1,2,5"
|
685 |
+
},
|
686 |
+
"HuggingFaceH4/zephyr-7b-beta": {
|
687 |
+
"tokenizer": "<a target=\"_blank\" href=\"https://huggingface.co/HuggingFaceH4/zephyr-7b-beta\" style=\"color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;\">zephyr-7b-beta</a>",
|
688 |
+
"organization": "HuggingFace",
|
689 |
+
"vocab_size": 32000,
|
690 |
+
"num(digit)": 20,
|
691 |
+
"len(digit)": "1,1,1",
|
692 |
+
"num(space)": 85,
|
693 |
+
"len(space)": "1,3,15",
|
694 |
+
"num(ar)": 71,
|
695 |
+
"len(ar)": "1,1,2",
|
696 |
+
"num(zh)": 1459,
|
697 |
+
"len(zh)": "1,1,2",
|
698 |
+
"num(ja)": 1593,
|
699 |
+
"len(ja)": "1,1,2",
|
700 |
+
"num(ja-kana)": 134,
|
701 |
+
"len(ja-kana)": "1,1,1",
|
702 |
+
"num(ko)": 346,
|
703 |
+
"len(ko)": "1,1,1"
|
704 |
+
},
|
705 |
+
"LLM360/CrystalCoder": {
|
706 |
+
"tokenizer": "<a target=\"_blank\" href=\"https://huggingface.co/LLM360/CrystalCoder\" style=\"color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;\">CrystalCoder</a>",
|
707 |
+
"organization": "MBZUAI",
|
708 |
+
"vocab_size": 32022,
|
709 |
+
"num(digit)": 20,
|
710 |
+
"len(digit)": "1,1,1",
|
711 |
+
"num(space)": 61,
|
712 |
+
"len(space)": "1,2,15",
|
713 |
+
"num(ar)": 55,
|
714 |
+
"len(ar)": "1,1,2",
|
715 |
+
"num(zh)": 700,
|
716 |
+
"len(zh)": "1,1,1",
|
717 |
+
"num(ja)": 837,
|
718 |
+
"len(ja)": "1,1,1",
|
719 |
+
"num(ja-kana)": 137,
|
720 |
+
"len(ja-kana)": "1,1,1",
|
721 |
+
"num(ko)": 111,
|
722 |
+
"len(ko)": "1,1,1"
|
723 |
+
},
|
724 |
+
"NousResearch/Llama-2-7b-chat-hf": {
|
725 |
+
"tokenizer": "<a target=\"_blank\" href=\"https://huggingface.co/NousResearch/Llama-2-7b-chat-hf\" style=\"color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;\">llama2</a>",
|
726 |
+
"organization": "Meta",
|
727 |
+
"vocab_size": 32001,
|
728 |
+
"num(digit)": 20,
|
729 |
+
"len(digit)": "1,1,1",
|
730 |
+
"num(space)": 61,
|
731 |
+
"len(space)": "1,2,15",
|
732 |
+
"num(ar)": 55,
|
733 |
+
"len(ar)": "1,1,2",
|
734 |
+
"num(zh)": 700,
|
735 |
+
"len(zh)": "1,1,1",
|
736 |
+
"num(ja)": 837,
|
737 |
+
"len(ja)": "1,1,1",
|
738 |
+
"num(ja-kana)": 137,
|
739 |
+
"len(ja-kana)": "1,1,1",
|
740 |
+
"num(ko)": 111,
|
741 |
+
"len(ko)": "1,1,1"
|
742 |
+
},
|
743 |
+
"OrionStarAI/Orion-14B-Chat": {
|
744 |
+
"tokenizer": "<a target=\"_blank\" href=\"https://huggingface.co/OrionStarAI/Orion-14B-Chat\" style=\"color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;\">Orion-14B-Chat</a>",
|
745 |
+
"organization": "OrionStar",
|
746 |
+
"vocab_size": 84608,
|
747 |
+
"num(digit)": 1559,
|
748 |
+
"len(digit)": "1,4,14",
|
749 |
+
"num(space)": 18383,
|
750 |
+
"len(space)": "1,6,16",
|
751 |
+
"num(ar)": 102,
|
752 |
+
"len(ar)": "1,1,1",
|
753 |
+
"num(zh)": 46998,
|
754 |
+
"len(zh)": "1,2,16",
|
755 |
+
"num(ja)": 49644,
|
756 |
+
"len(ja)": "1,2,16",
|
757 |
+
"num(ja-kana)": 2987,
|
758 |
+
"len(ja-kana)": "1,3,11",
|
759 |
+
"num(ko)": 5110,
|
760 |
+
"len(ko)": "1,2,7"
|
761 |
+
},
|
762 |
+
"Qwen/Qwen-7B-Chat": {
|
763 |
+
"tokenizer": "<a target=\"_blank\" href=\"https://huggingface.co/Qwen/Qwen-7B-Chat\" style=\"color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;\">Qwen</a>",
|
764 |
+
"organization": "Alibaba",
|
765 |
+
"vocab_size": 151851,
|
766 |
+
"num(digit)": 10,
|
767 |
+
"len(digit)": "1,1,1",
|
768 |
+
"num(space)": 55883,
|
769 |
+
"len(space)": "1,6,128",
|
770 |
+
"num(ar)": 4018,
|
771 |
+
"len(ar)": "1,3,12",
|
772 |
+
"num(zh)": 25557,
|
773 |
+
"len(zh)": "1,2,7",
|
774 |
+
"num(ja)": 27206,
|
775 |
+
"len(ja)": "1,2,11",
|
776 |
+
"num(ja-kana)": 2089,
|
777 |
+
"len(ja-kana)": "1,3,11",
|
778 |
+
"num(ko)": 3495,
|
779 |
+
"len(ko)": "1,1,5"
|
780 |
+
},
|
781 |
+
"Qwen/Qwen1.5-14B-Chat": {
|
782 |
+
"tokenizer": "<a target=\"_blank\" href=\"https://huggingface.co/Qwen/Qwen1.5-14B-Chat\" style=\"color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;\">Qwen1.5</a>",
|
783 |
+
"organization": "Alibaba",
|
784 |
+
"vocab_size": 151646,
|
785 |
+
"num(digit)": 10,
|
786 |
+
"len(digit)": "1,1,1",
|
787 |
+
"num(space)": 55883,
|
788 |
+
"len(space)": "1,6,128",
|
789 |
+
"num(ar)": 4018,
|
790 |
+
"len(ar)": "1,3,12",
|
791 |
+
"num(zh)": 25557,
|
792 |
+
"len(zh)": "1,2,7",
|
793 |
+
"num(ja)": 27206,
|
794 |
+
"len(ja)": "1,2,11",
|
795 |
+
"num(ja-kana)": 2089,
|
796 |
+
"len(ja-kana)": "1,3,11",
|
797 |
+
"num(ko)": 3495,
|
798 |
+
"len(ko)": "1,1,5"
|
799 |
+
},
|
800 |
+
"Skywork/Skywork-13B-Math": {
|
801 |
+
"tokenizer": "<a target=\"_blank\" href=\"https://huggingface.co/Skywork/Skywork-13B-Math\" style=\"color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;\">Skywork-13B-Math</a>",
|
802 |
+
"organization": "Kunlun",
|
803 |
+
"vocab_size": 65519,
|
804 |
+
"num(digit)": 20,
|
805 |
+
"len(digit)": "1,1,1",
|
806 |
+
"num(space)": 62,
|
807 |
+
"len(space)": "1,2,15",
|
808 |
+
"num(ar)": 56,
|
809 |
+
"len(ar)": "1,1,2",
|
810 |
+
"num(zh)": 33913,
|
811 |
+
"len(zh)": "1,2,5",
|
812 |
+
"num(ja)": 34064,
|
813 |
+
"len(ja)": "1,2,5",
|
814 |
+
"num(ja-kana)": 150,
|
815 |
+
"len(ja-kana)": "1,1,1",
|
816 |
+
"num(ko)": 111,
|
817 |
+
"len(ko)": "1,1,1"
|
818 |
+
},
|
819 |
+
"Skywork/Skywork-13B-base": {
|
820 |
+
"tokenizer": "<a target=\"_blank\" href=\"https://huggingface.co/Skywork/Skywork-13B-base\" style=\"color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;\">Skywork-13B-base</a>",
|
821 |
+
"organization": "Kunlun",
|
822 |
+
"vocab_size": 65519,
|
823 |
+
"num(digit)": 20,
|
824 |
+
"len(digit)": "1,1,1",
|
825 |
+
"num(space)": 62,
|
826 |
+
"len(space)": "1,2,15",
|
827 |
+
"num(ar)": 56,
|
828 |
+
"len(ar)": "1,1,2",
|
829 |
+
"num(zh)": 33913,
|
830 |
+
"len(zh)": "1,2,5",
|
831 |
+
"num(ja)": 34064,
|
832 |
+
"len(ja)": "1,2,5",
|
833 |
+
"num(ja-kana)": 150,
|
834 |
+
"len(ja-kana)": "1,1,1",
|
835 |
+
"num(ko)": 111,
|
836 |
+
"len(ko)": "1,1,1"
|
837 |
+
},
|
838 |
+
"THUDM/chatglm-6b": {
|
839 |
+
"tokenizer": "<a target=\"_blank\" href=\"https://huggingface.co/THUDM/chatglm-6b\" style=\"color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;\">chatglm-6b</a>",
|
840 |
+
"organization": "Tsinghua",
|
841 |
+
"vocab_size": 130344,
|
842 |
+
"num(digit)": 20,
|
843 |
+
"len(digit)": "1,1,1",
|
844 |
+
"num(space)": 93,
|
845 |
+
"len(space)": "1,34,80",
|
846 |
+
"num(ar)": 137,
|
847 |
+
"len(ar)": "1,2,4",
|
848 |
+
"num(zh)": 61358,
|
849 |
+
"len(zh)": "1,2,16",
|
850 |
+
"num(ja)": 61784,
|
851 |
+
"len(ja)": "1,2,16",
|
852 |
+
"num(ja-kana)": 439,
|
853 |
+
"len(ja-kana)": "1,2,5",
|
854 |
+
"num(ko)": 114,
|
855 |
+
"len(ko)": "1,1,3"
|
856 |
+
},
|
857 |
+
"THUDM/chatglm2-6b": {
|
858 |
+
"tokenizer": "<a target=\"_blank\" href=\"https://huggingface.co/THUDM/chatglm2-6b\" style=\"color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;\">chatglm2-6b</a>",
|
859 |
+
"organization": "Tsinghua",
|
860 |
+
"vocab_size": 64787,
|
861 |
+
"num(digit)": 20,
|
862 |
+
"len(digit)": "1,1,1",
|
863 |
+
"num(space)": 67,
|
864 |
+
"len(space)": "1,2,15",
|
865 |
+
"num(ar)": 57,
|
866 |
+
"len(ar)": "1,1,2",
|
867 |
+
"num(zh)": 30922,
|
868 |
+
"len(zh)": "1,2,16",
|
869 |
+
"num(ja)": 31065,
|
870 |
+
"len(ja)": "1,2,16",
|
871 |
+
"num(ja-kana)": 143,
|
872 |
+
"len(ja-kana)": "1,1,1",
|
873 |
+
"num(ko)": 604,
|
874 |
+
"len(ko)": "1,1,1"
|
875 |
+
},
|
876 |
+
"THUDM/chatglm3-6b": {
|
877 |
+
"tokenizer": "<a target=\"_blank\" href=\"https://huggingface.co/THUDM/chatglm3-6b\" style=\"color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;\">chatglm3-6b</a>",
|
878 |
+
"organization": "Tsinghua",
|
879 |
+
"vocab_size": 64796,
|
880 |
+
"num(digit)": 20,
|
881 |
+
"len(digit)": "1,1,1",
|
882 |
+
"num(space)": 67,
|
883 |
+
"len(space)": "1,2,15",
|
884 |
+
"num(ar)": 57,
|
885 |
+
"len(ar)": "1,1,2",
|
886 |
+
"num(zh)": 30922,
|
887 |
+
"len(zh)": "1,2,16",
|
888 |
+
"num(ja)": 31065,
|
889 |
+
"len(ja)": "1,2,16",
|
890 |
+
"num(ja-kana)": 143,
|
891 |
+
"len(ja-kana)": "1,1,1",
|
892 |
+
"num(ko)": 604,
|
893 |
+
"len(ko)": "1,1,1"
|
894 |
+
},
|
895 |
+
"TigerResearch/tigerbot-13b-chat-v2": {
|
896 |
+
"tokenizer": "<a target=\"_blank\" href=\"https://huggingface.co/TigerResearch/tigerbot-13b-chat-v2\" style=\"color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;\">tigerbot-13b-chat-v2</a>",
|
897 |
+
"organization": "Tigerobo",
|
898 |
+
"vocab_size": 60515,
|
899 |
+
"num(digit)": 20,
|
900 |
+
"len(digit)": "1,1,1",
|
901 |
+
"num(space)": 61,
|
902 |
+
"len(space)": "1,2,15",
|
903 |
+
"num(ar)": 55,
|
904 |
+
"len(ar)": "1,1,2",
|
905 |
+
"num(zh)": 28603,
|
906 |
+
"len(zh)": "1,2,16",
|
907 |
+
"num(ja)": 28770,
|
908 |
+
"len(ja)": "1,2,16",
|
909 |
+
"num(ja-kana)": 167,
|
910 |
+
"len(ja-kana)": "1,1,2",
|
911 |
+
"num(ko)": 261,
|
912 |
+
"len(ko)": "1,1,1"
|
913 |
+
},
|
914 |
+
"TigerResearch/tigerbot-70b-chat-v4-4k": {
|
915 |
+
"tokenizer": "<a target=\"_blank\" href=\"https://huggingface.co/TigerResearch/tigerbot-70b-chat-v4-4k\" style=\"color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;\">tigerbot-70b-chat-v4-4k</a>",
|
916 |
+
"organization": "Tigerobo",
|
917 |
+
"vocab_size": 65110,
|
918 |
+
"num(digit)": 20,
|
919 |
+
"len(digit)": "1,1,1",
|
920 |
+
"num(space)": 61,
|
921 |
+
"len(space)": "1,2,15",
|
922 |
+
"num(ar)": 55,
|
923 |
+
"len(ar)": "1,1,2",
|
924 |
+
"num(zh)": 30509,
|
925 |
+
"len(zh)": "1,2,16",
|
926 |
+
"num(ja)": 32061,
|
927 |
+
"len(ja)": "1,2,16",
|
928 |
+
"num(ja-kana)": 2071,
|
929 |
+
"len(ja-kana)": "1,2,8",
|
930 |
+
"num(ko)": 1504,
|
931 |
+
"len(ko)": "1,1,5"
|
932 |
+
},
|
933 |
+
"Upstage/SOLAR-10.7B-v1.0": {
|
934 |
+
"tokenizer": "<a target=\"_blank\" href=\"https://huggingface.co/Upstage/SOLAR-10.7B-v1.0\" style=\"color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;\">SOLAR-10.7B-v1.0</a>",
|
935 |
+
"organization": "-",
|
936 |
+
"vocab_size": 32000,
|
937 |
+
"num(digit)": 20,
|
938 |
+
"len(digit)": "1,1,1",
|
939 |
+
"num(space)": 85,
|
940 |
+
"len(space)": "1,3,15",
|
941 |
+
"num(ar)": 71,
|
942 |
+
"len(ar)": "1,1,2",
|
943 |
+
"num(zh)": 1459,
|
944 |
+
"len(zh)": "1,1,2",
|
945 |
+
"num(ja)": 1593,
|
946 |
+
"len(ja)": "1,1,2",
|
947 |
+
"num(ja-kana)": 134,
|
948 |
+
"len(ja-kana)": "1,1,1",
|
949 |
+
"num(ko)": 346,
|
950 |
+
"len(ko)": "1,1,1"
|
951 |
+
},
|
952 |
+
"WizardLM/WizardCoder-15B-V1.0": {
|
953 |
+
"tokenizer": "<a target=\"_blank\" href=\"https://huggingface.co/WizardLM/WizardCoder-15B-V1.0\" style=\"color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;\">WizardCoder-15B-V1.0</a>",
|
954 |
+
"organization": "Microsoft",
|
955 |
+
"vocab_size": 49153,
|
956 |
+
"num(digit)": 10,
|
957 |
+
"len(digit)": "1,1,1",
|
958 |
+
"num(space)": 16515,
|
959 |
+
"len(space)": "1,6,256",
|
960 |
+
"num(ar)": 84,
|
961 |
+
"len(ar)": "1,2,4",
|
962 |
+
"num(zh)": 2030,
|
963 |
+
"len(zh)": "1,1,7",
|
964 |
+
"num(ja)": 2368,
|
965 |
+
"len(ja)": "1,1,8",
|
966 |
+
"num(ja-kana)": 360,
|
967 |
+
"len(ja-kana)": "1,2,8",
|
968 |
+
"num(ko)": 491,
|
969 |
+
"len(ko)": "1,2,5"
|
970 |
+
},
|
971 |
+
"WizardLM/WizardCoder-Python-7B-V1.0": {
|
972 |
+
"tokenizer": "<a target=\"_blank\" href=\"https://huggingface.co/WizardLM/WizardCoder-Python-7B-V1.0\" style=\"color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;\">WizardCoder-Python-7B-V1.0</a>",
|
973 |
+
"organization": "Microsoft",
|
974 |
+
"vocab_size": 32001,
|
975 |
+
"num(digit)": 20,
|
976 |
+
"len(digit)": "1,1,1",
|
977 |
+
"num(space)": 61,
|
978 |
+
"len(space)": "1,2,15",
|
979 |
+
"num(ar)": 55,
|
980 |
+
"len(ar)": "1,1,2",
|
981 |
+
"num(zh)": 700,
|
982 |
+
"len(zh)": "1,1,1",
|
983 |
+
"num(ja)": 837,
|
984 |
+
"len(ja)": "1,1,1",
|
985 |
+
"num(ja-kana)": 137,
|
986 |
+
"len(ja-kana)": "1,1,1",
|
987 |
+
"num(ko)": 111,
|
988 |
+
"len(ko)": "1,1,1"
|
989 |
+
},
|
990 |
+
"WizardLM/WizardLM-7B-V1.0": {
|
991 |
+
"tokenizer": "<a target=\"_blank\" href=\"https://huggingface.co/WizardLM/WizardLM-7B-V1.0\" style=\"color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;\">WizardLM-7B-V1.0</a>",
|
992 |
+
"organization": "Microsoft",
|
993 |
+
"vocab_size": 32001,
|
994 |
+
"num(digit)": 20,
|
995 |
+
"len(digit)": "1,1,1",
|
996 |
+
"num(space)": 61,
|
997 |
+
"len(space)": "1,2,15",
|
998 |
+
"num(ar)": 55,
|
999 |
+
"len(ar)": "1,1,2",
|
1000 |
+
"num(zh)": 700,
|
1001 |
+
"len(zh)": "1,1,1",
|
1002 |
+
"num(ja)": 837,
|
1003 |
+
"len(ja)": "1,1,1",
|
1004 |
+
"num(ja-kana)": 137,
|
1005 |
+
"len(ja-kana)": "1,1,1",
|
1006 |
+
"num(ko)": 111,
|
1007 |
+
"len(ko)": "1,1,1"
|
1008 |
+
},
|
1009 |
+
"WizardLM/WizardMath-70B-V1.0": {
|
1010 |
+
"tokenizer": "<a target=\"_blank\" href=\"https://huggingface.co/WizardLM/WizardMath-70B-V1.0\" style=\"color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;\">WizardMath-70B-V1.0</a>",
|
1011 |
+
"organization": "Microsoft",
|
1012 |
+
"vocab_size": 32002,
|
1013 |
+
"num(digit)": 20,
|
1014 |
+
"len(digit)": "1,1,1",
|
1015 |
+
"num(space)": 61,
|
1016 |
+
"len(space)": "1,2,15",
|
1017 |
+
"num(ar)": 55,
|
1018 |
+
"len(ar)": "1,1,2",
|
1019 |
+
"num(zh)": 700,
|
1020 |
+
"len(zh)": "1,1,1",
|
1021 |
+
"num(ja)": 837,
|
1022 |
+
"len(ja)": "1,1,1",
|
1023 |
+
"num(ja-kana)": 137,
|
1024 |
+
"len(ja-kana)": "1,1,1",
|
1025 |
+
"num(ko)": 111,
|
1026 |
+
"len(ko)": "1,1,1"
|
1027 |
+
},
|
1028 |
+
"abeja/gpt-neox-japanese-2.7b": {
|
1029 |
+
"tokenizer": "<a target=\"_blank\" href=\"https://huggingface.co/abeja/gpt-neox-japanese-2.7b\" style=\"color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;\">gpt-neox-japanese-2.7b</a>",
|
1030 |
+
"organization": "ABEJA",
|
1031 |
+
"vocab_size": 32000,
|
1032 |
+
"num(digit)": 20,
|
1033 |
+
"len(digit)": "1,1,1",
|
1034 |
+
"num(space)": 0,
|
1035 |
+
"len(space)": "-",
|
1036 |
+
"num(ar)": 0,
|
1037 |
+
"len(ar)": "-",
|
1038 |
+
"num(zh)": 15176,
|
1039 |
+
"len(zh)": "1,2,2",
|
1040 |
+
"num(ja)": 31482,
|
1041 |
+
"len(ja)": "1,2,3",
|
1042 |
+
"num(ja-kana)": 16306,
|
1043 |
+
"len(ja-kana)": "1,3,3",
|
1044 |
+
"num(ko)": 0,
|
1045 |
+
"len(ko)": "-"
|
1046 |
+
},
|
1047 |
+
"ai21labs/Jamba-v0.1": {
|
1048 |
+
"tokenizer": "<a target=\"_blank\" href=\"https://huggingface.co/ai21labs/Jamba-v0.1\" style=\"color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;\">Jamba-v0.1</a>",
|
1049 |
+
"organization": "AI21",
|
1050 |
+
"vocab_size": 65536,
|
1051 |
+
"num(digit)": 1556,
|
1052 |
+
"len(digit)": "1,16,17",
|
1053 |
+
"num(space)": 39501,
|
1054 |
+
"len(space)": "1,7,32",
|
1055 |
+
"num(ar)": 867,
|
1056 |
+
"len(ar)": "1,3,8",
|
1057 |
+
"num(zh)": 1157,
|
1058 |
+
"len(zh)": "1,1,2",
|
1059 |
+
"num(ja)": 1287,
|
1060 |
+
"len(ja)": "1,1,2",
|
1061 |
+
"num(ja-kana)": 130,
|
1062 |
+
"len(ja-kana)": "1,1,2",
|
1063 |
+
"num(ko)": 312,
|
1064 |
+
"len(ko)": "1,1,2"
|
1065 |
+
},
|
1066 |
+
"allenai/OLMo-7B": {
|
1067 |
+
"tokenizer": "<a target=\"_blank\" href=\"https://huggingface.co/allenai/OLMo-7B\" style=\"color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;\">OLMo-7B</a>",
|
1068 |
+
"organization": "Allen AI",
|
1069 |
+
"vocab_size": 50280,
|
1070 |
+
"num(digit)": 2036,
|
1071 |
+
"len(digit)": "1,3,35",
|
1072 |
+
"num(space)": 29019,
|
1073 |
+
"len(space)": "1,7,512",
|
1074 |
+
"num(ar)": 94,
|
1075 |
+
"len(ar)": "1,2,4",
|
1076 |
+
"num(zh)": 313,
|
1077 |
+
"len(zh)": "1,1,2",
|
1078 |
+
"num(ja)": 480,
|
1079 |
+
"len(ja)": "1,1,4",
|
1080 |
+
"num(ja-kana)": 167,
|
1081 |
+
"len(ja-kana)": "1,1,4",
|
1082 |
+
"num(ko)": 25,
|
1083 |
+
"len(ko)": "1,1,2"
|
1084 |
+
},
|
1085 |
+
"baichuan-inc/Baichuan2-7B-Chat": {
|
1086 |
+
"tokenizer": "<a target=\"_blank\" href=\"https://huggingface.co/baichuan-inc/Baichuan2-7B-Chat\" style=\"color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;\">baichuan2</a>",
|
1087 |
+
"organization": "Baichuan",
|
1088 |
+
"vocab_size": 125696,
|
1089 |
+
"num(digit)": 1023,
|
1090 |
+
"len(digit)": "1,14,14",
|
1091 |
+
"num(space)": 26013,
|
1092 |
+
"len(space)": "1,7,32",
|
1093 |
+
"num(ar)": 335,
|
1094 |
+
"len(ar)": "1,1,27",
|
1095 |
+
"num(zh)": 70398,
|
1096 |
+
"len(zh)": "1,2,32",
|
1097 |
+
"num(ja)": 71269,
|
1098 |
+
"len(ja)": "1,2,32",
|
1099 |
+
"num(ja-kana)": 206,
|
1100 |
+
"len(ja-kana)": "1,1,9",
|
1101 |
+
"num(ko)": 1595,
|
1102 |
+
"len(ko)": "1,1,2"
|
1103 |
+
},
|
1104 |
+
"ckiplab/gpt2-base-chinese": {
|
1105 |
+
"tokenizer": "<a target=\"_blank\" href=\"https://huggingface.co/ckiplab/gpt2-base-chinese\" style=\"color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;\">gpt2-base-chinese</a>",
|
1106 |
+
"organization": "SINICA",
|
1107 |
+
"vocab_size": 21128,
|
1108 |
+
"num(digit)": 1451,
|
1109 |
+
"len(digit)": "1,3,12",
|
1110 |
+
"num(space)": 2,
|
1111 |
+
"len(space)": "1,2,3",
|
1112 |
+
"num(ar)": 30,
|
1113 |
+
"len(ar)": "1,2,3",
|
1114 |
+
"num(zh)": 14642,
|
1115 |
+
"len(zh)": "1,2,3",
|
1116 |
+
"num(ja)": 15197,
|
1117 |
+
"len(ja)": "1,3,15",
|
1118 |
+
"num(ja-kana)": 553,
|
1119 |
+
"len(ja-kana)": "1,3,15",
|
1120 |
+
"num(ko)": 0,
|
1121 |
+
"len(ko)": "-"
|
1122 |
+
},
|
1123 |
+
"cyberagent/open-calm-7b": {
|
1124 |
+
"tokenizer": "<a target=\"_blank\" href=\"https://huggingface.co/cyberagent/open-calm-7b\" style=\"color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;\">open-calm-7b</a>",
|
1125 |
+
"organization": "CyberAgent",
|
1126 |
+
"vocab_size": 52000,
|
1127 |
+
"num(digit)": 690,
|
1128 |
+
"len(digit)": "1,3,5",
|
1129 |
+
"num(space)": 1698,
|
1130 |
+
"len(space)": "1,4,33",
|
1131 |
+
"num(ar)": 10,
|
1132 |
+
"len(ar)": "1,1,4",
|
1133 |
+
"num(zh)": 30775,
|
1134 |
+
"len(zh)": "1,3,31",
|
1135 |
+
"num(ja)": 45790,
|
1136 |
+
"len(ja)": "1,3,31",
|
1137 |
+
"num(ja-kana)": 32535,
|
1138 |
+
"len(ja-kana)": "1,3,31",
|
1139 |
+
"num(ko)": 0,
|
1140 |
+
"len(ko)": "-"
|
1141 |
+
},
|
1142 |
+
"databricks/dbrx-instruct": {
|
1143 |
+
"tokenizer": "<a target=\"_blank\" href=\"https://huggingface.co/databricks/dbrx-instruct\" style=\"color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;\">dbrx-instruct</a>",
|
1144 |
+
"organization": "Databricks",
|
1145 |
+
"vocab_size": 100280,
|
1146 |
+
"num(digit)": 1126,
|
1147 |
+
"len(digit)": "1,3,17",
|
1148 |
+
"num(space)": 47400,
|
1149 |
+
"len(space)": "1,7,128",
|
1150 |
+
"num(ar)": 113,
|
1151 |
+
"len(ar)": "1,2,10",
|
1152 |
+
"num(zh)": 868,
|
1153 |
+
"len(zh)": "1,1,7",
|
1154 |
+
"num(ja)": 1035,
|
1155 |
+
"len(ja)": "1,1,7",
|
1156 |
+
"num(ja-kana)": 169,
|
1157 |
+
"len(ja-kana)": "1,1,7",
|
1158 |
+
"num(ko)": 299,
|
1159 |
+
"len(ko)": "1,2,4"
|
1160 |
+
},
|
1161 |
+
"deepseek-ai/DeepSeek-V2": {
|
1162 |
+
"tokenizer": "<a target=\"_blank\" href=\"https://huggingface.co/deepseek-ai/DeepSeek-V2\" style=\"color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;\">DeepSeek-V2</a>",
|
1163 |
+
"organization": "DeepSeek",
|
1164 |
+
"vocab_size": 100002,
|
1165 |
+
"num(digit)": 10,
|
1166 |
+
"len(digit)": "1,1,1",
|
1167 |
+
"num(space)": 48073,
|
1168 |
+
"len(space)": "1,7,128",
|
1169 |
+
"num(ar)": 48,
|
1170 |
+
"len(ar)": "1,1,4",
|
1171 |
+
"num(zh)": 18052,
|
1172 |
+
"len(zh)": "1,2,16",
|
1173 |
+
"num(ja)": 18090,
|
1174 |
+
"len(ja)": "1,2,16",
|
1175 |
+
"num(ja-kana)": 38,
|
1176 |
+
"len(ja-kana)": "1,1,2",
|
1177 |
+
"num(ko)": 16,
|
1178 |
+
"len(ko)": "1,1,2"
|
1179 |
+
},
|
1180 |
+
"deepseek-ai/deepseek-coder-33b-instruct": {
|
1181 |
+
"tokenizer": "<a target=\"_blank\" href=\"https://huggingface.co/deepseek-ai/deepseek-coder-33b-instruct\" style=\"color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;\">deepseek-coder-33b-instruct</a>",
|
1182 |
+
"organization": "DeepSeek",
|
1183 |
+
"vocab_size": 32022,
|
1184 |
+
"num(digit)": 10,
|
1185 |
+
"len(digit)": "1,1,1",
|
1186 |
+
"num(space)": 15254,
|
1187 |
+
"len(space)": "1,6,65",
|
1188 |
+
"num(ar)": 12,
|
1189 |
+
"len(ar)": "1,1,2",
|
1190 |
+
"num(zh)": 4803,
|
1191 |
+
"len(zh)": "1,2,4",
|
1192 |
+
"num(ja)": 4804,
|
1193 |
+
"len(ja)": "1,2,4",
|
1194 |
+
"num(ja-kana)": 1,
|
1195 |
+
"len(ja-kana)": "1,1,1",
|
1196 |
+
"num(ko)": 0,
|
1197 |
+
"len(ko)": "-"
|
1198 |
+
},
|
1199 |
+
"deepseek-ai/deepseek-llm-7b-base": {
|
1200 |
+
"tokenizer": "<a target=\"_blank\" href=\"https://huggingface.co/deepseek-ai/deepseek-llm-7b-base\" style=\"color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;\">deepseek-llm-7b-base</a>",
|
1201 |
+
"organization": "DeepSeek",
|
1202 |
+
"vocab_size": 100015,
|
1203 |
+
"num(digit)": 10,
|
1204 |
+
"len(digit)": "1,1,1",
|
1205 |
+
"num(space)": 48073,
|
1206 |
+
"len(space)": "1,7,128",
|
1207 |
+
"num(ar)": 48,
|
1208 |
+
"len(ar)": "1,1,4",
|
1209 |
+
"num(zh)": 18052,
|
1210 |
+
"len(zh)": "1,2,16",
|
1211 |
+
"num(ja)": 18090,
|
1212 |
+
"len(ja)": "1,2,16",
|
1213 |
+
"num(ja-kana)": 38,
|
1214 |
+
"len(ja-kana)": "1,1,2",
|
1215 |
+
"num(ko)": 16,
|
1216 |
+
"len(ko)": "1,1,2"
|
1217 |
+
},
|
1218 |
+
"eson/kplug-base-encoder": {
|
1219 |
+
"tokenizer": "<a target=\"_blank\" href=\"https://huggingface.co/eson/kplug-base-encoder\" style=\"color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;\">kplug</a>",
|
1220 |
+
"organization": "JD",
|
1221 |
+
"vocab_size": 10261,
|
1222 |
+
"num(digit)": 420,
|
1223 |
+
"len(digit)": "1,3,12",
|
1224 |
+
"num(space)": 0,
|
1225 |
+
"len(space)": "-",
|
1226 |
+
"num(ar)": 0,
|
1227 |
+
"len(ar)": "-",
|
1228 |
+
"num(zh)": 5764,
|
1229 |
+
"len(zh)": "1,1,1",
|
1230 |
+
"num(ja)": 5766,
|
1231 |
+
"len(ja)": "1,1,3",
|
1232 |
+
"num(ja-kana)": 0,
|
1233 |
+
"len(ja-kana)": "-",
|
1234 |
+
"num(ko)": 0,
|
1235 |
+
"len(ko)": "-"
|
1236 |
+
},
|
1237 |
+
"fnlp/moss-moon-003-sft": {
|
1238 |
+
"tokenizer": "<a target=\"_blank\" href=\"https://huggingface.co/fnlp/moss-moon-003-sft\" style=\"color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;\">moss-moon-003-sft</a>",
|
1239 |
+
"organization": "Fudan",
|
1240 |
+
"vocab_size": 106072,
|
1241 |
+
"num(digit)": 1848,
|
1242 |
+
"len(digit)": "1,3,16",
|
1243 |
+
"num(space)": 33566,
|
1244 |
+
"len(space)": "1,7,102",
|
1245 |
+
"num(ar)": 25,
|
1246 |
+
"len(ar)": "1,1,4",
|
1247 |
+
"num(zh)": 54230,
|
1248 |
+
"len(zh)": "1,2,15",
|
1249 |
+
"num(ja)": 54381,
|
1250 |
+
"len(ja)": "1,2,15",
|
1251 |
+
"num(ja-kana)": 152,
|
1252 |
+
"len(ja-kana)": "1,1,7",
|
1253 |
+
"num(ko)": 0,
|
1254 |
+
"len(ko)": "-"
|
1255 |
+
},
|
1256 |
+
"google/gemma-7b": {
|
1257 |
+
"tokenizer": "<a target=\"_blank\" href=\"https://huggingface.co/google/gemma-7b\" style=\"color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;\">gemma-7b</a>",
|
1258 |
+
"organization": "Google",
|
1259 |
+
"vocab_size": 256000,
|
1260 |
+
"num(digit)": 134,
|
1261 |
+
"len(digit)": "1,10,12",
|
1262 |
+
"num(space)": 125662,
|
1263 |
+
"len(space)": "1,7,31",
|
1264 |
+
"num(ar)": 6274,
|
1265 |
+
"len(ar)": "1,4,15",
|
1266 |
+
"num(zh)": 23767,
|
1267 |
+
"len(zh)": "1,2,12",
|
1268 |
+
"num(ja)": 28852,
|
1269 |
+
"len(ja)": "1,2,12",
|
1270 |
+
"num(ja-kana)": 7061,
|
1271 |
+
"len(ja-kana)": "1,3,12",
|
1272 |
+
"num(ko)": 2295,
|
1273 |
+
"len(ko)": "1,1,5"
|
1274 |
+
},
|
1275 |
+
"google/switch-c-2048": {
|
1276 |
+
"tokenizer": "<a target=\"_blank\" href=\"https://huggingface.co/google/switch-c-2048\" style=\"color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;\">switch-c-2048</a>",
|
1277 |
+
"organization": "Google",
|
1278 |
+
"vocab_size": 32100,
|
1279 |
+
"num(digit)": 1133,
|
1280 |
+
"len(digit)": "1,3,13",
|
1281 |
+
"num(space)": 0,
|
1282 |
+
"len(space)": "-",
|
1283 |
+
"num(ar)": 0,
|
1284 |
+
"len(ar)": "-",
|
1285 |
+
"num(zh)": 0,
|
1286 |
+
"len(zh)": "-",
|
1287 |
+
"num(ja)": 0,
|
1288 |
+
"len(ja)": "-",
|
1289 |
+
"num(ja-kana)": 0,
|
1290 |
+
"len(ja-kana)": "-",
|
1291 |
+
"num(ko)": 0,
|
1292 |
+
"len(ko)": "-"
|
1293 |
+
},
|
1294 |
+
"hfl/chinese-alpaca-lora-7b": {
|
1295 |
+
"tokenizer": "<a target=\"_blank\" href=\"https://huggingface.co/hfl/chinese-alpaca-lora-7b\" style=\"color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;\">chinese-alpaca-lora-7b</a>",
|
1296 |
+
"organization": "-",
|
1297 |
+
"vocab_size": 49954,
|
1298 |
+
"num(digit)": 614,
|
1299 |
+
"len(digit)": "1,3,5",
|
1300 |
+
"num(space)": 61,
|
1301 |
+
"len(space)": "1,2,15",
|
1302 |
+
"num(ar)": 55,
|
1303 |
+
"len(ar)": "1,1,2",
|
1304 |
+
"num(zh)": 17839,
|
1305 |
+
"len(zh)": "1,2,13",
|
1306 |
+
"num(ja)": 17993,
|
1307 |
+
"len(ja)": "1,2,13",
|
1308 |
+
"num(ja-kana)": 154,
|
1309 |
+
"len(ja-kana)": "1,1,1",
|
1310 |
+
"num(ko)": 135,
|
1311 |
+
"len(ko)": "1,1,1"
|
1312 |
+
},
|
1313 |
+
"hfl/chinese-llama-2-7b": {
|
1314 |
+
"tokenizer": "<a target=\"_blank\" href=\"https://huggingface.co/hfl/chinese-llama-2-7b\" style=\"color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;\">chinese-llama-2-7b</a>",
|
1315 |
+
"organization": "-",
|
1316 |
+
"vocab_size": 55296,
|
1317 |
+
"num(digit)": 20,
|
1318 |
+
"len(digit)": "1,1,1",
|
1319 |
+
"num(space)": 61,
|
1320 |
+
"len(space)": "1,2,15",
|
1321 |
+
"num(ar)": 55,
|
1322 |
+
"len(ar)": "1,1,2",
|
1323 |
+
"num(zh)": 23974,
|
1324 |
+
"len(zh)": "1,2,16",
|
1325 |
+
"num(ja)": 24111,
|
1326 |
+
"len(ja)": "1,2,16",
|
1327 |
+
"num(ja-kana)": 137,
|
1328 |
+
"len(ja-kana)": "1,1,1",
|
1329 |
+
"num(ko)": 111,
|
1330 |
+
"len(ko)": "1,1,1"
|
1331 |
+
},
|
1332 |
+
"hfl/chinese-llama-lora-7b": {
|
1333 |
+
"tokenizer": "<a target=\"_blank\" href=\"https://huggingface.co/hfl/chinese-llama-lora-7b\" style=\"color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;\">chinese-llama-lora-7b</a>",
|
1334 |
+
"organization": "-",
|
1335 |
+
"vocab_size": 49953,
|
1336 |
+
"num(digit)": 614,
|
1337 |
+
"len(digit)": "1,3,5",
|
1338 |
+
"num(space)": 61,
|
1339 |
+
"len(space)": "1,2,15",
|
1340 |
+
"num(ar)": 55,
|
1341 |
+
"len(ar)": "1,1,2",
|
1342 |
+
"num(zh)": 17839,
|
1343 |
+
"len(zh)": "1,2,13",
|
1344 |
+
"num(ja)": 17993,
|
1345 |
+
"len(ja)": "1,2,13",
|
1346 |
+
"num(ja-kana)": 154,
|
1347 |
+
"len(ja-kana)": "1,1,1",
|
1348 |
+
"num(ko)": 135,
|
1349 |
+
"len(ko)": "1,1,1"
|
1350 |
+
},
|
1351 |
+
"hfl/llama-3-chinese-8b": {
|
1352 |
+
"tokenizer": "<a target=\"_blank\" href=\"https://huggingface.co/hfl/llama-3-chinese-8b\" style=\"color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;\">llama-3-chinese-8b</a>",
|
1353 |
+
"organization": "-",
|
1354 |
+
"vocab_size": 128256,
|
1355 |
+
"num(digit)": 1110,
|
1356 |
+
"len(digit)": "1,3,3",
|
1357 |
+
"num(space)": 60860,
|
1358 |
+
"len(space)": "1,6,128",
|
1359 |
+
"num(ar)": 3810,
|
1360 |
+
"len(ar)": "1,4,11",
|
1361 |
+
"num(zh)": 4424,
|
1362 |
+
"len(zh)": "1,1,7",
|
1363 |
+
"num(ja)": 5387,
|
1364 |
+
"len(ja)": "1,2,8",
|
1365 |
+
"num(ja-kana)": 1086,
|
1366 |
+
"len(ja-kana)": "1,2,8",
|
1367 |
+
"num(ko)": 2281,
|
1368 |
+
"len(ko)": "1,2,6"
|
1369 |
+
},
|
1370 |
+
"hpcai-tech/grok-1": {
|
1371 |
+
"tokenizer": "<a target=\"_blank\" href=\"https://huggingface.co/hpcai-tech/grok-1\" style=\"color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;\">grok-1</a>",
|
1372 |
+
"organization": "xAI",
|
1373 |
+
"vocab_size": 131072,
|
1374 |
+
"num(digit)": 40,
|
1375 |
+
"len(digit)": "1,6,13",
|
1376 |
+
"num(space)": 399,
|
1377 |
+
"len(space)": "1,3,16",
|
1378 |
+
"num(ar)": 69,
|
1379 |
+
"len(ar)": "1,2,4",
|
1380 |
+
"num(zh)": 1626,
|
1381 |
+
"len(zh)": "1,2,7",
|
1382 |
+
"num(ja)": 3118,
|
1383 |
+
"len(ja)": "1,2,8",
|
1384 |
+
"num(ja-kana)": 1908,
|
1385 |
+
"len(ja-kana)": "1,2,8",
|
1386 |
+
"num(ko)": 67,
|
1387 |
+
"len(ko)": "1,1,2"
|
1388 |
+
},
|
1389 |
+
"internlm/internlm-chat-7b": {
|
1390 |
+
"tokenizer": "<a target=\"_blank\" href=\"https://huggingface.co/internlm/internlm-chat-7b\" style=\"color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;\">internlm-chat-7b</a>",
|
1391 |
+
"organization": "Shanghai AI Lab",
|
1392 |
+
"vocab_size": 103168,
|
1393 |
+
"num(digit)": 1259,
|
1394 |
+
"len(digit)": "1,3,19",
|
1395 |
+
"num(space)": 33008,
|
1396 |
+
"len(space)": "1,6,128",
|
1397 |
+
"num(ar)": 6702,
|
1398 |
+
"len(ar)": "1,4,16",
|
1399 |
+
"num(zh)": 32000,
|
1400 |
+
"len(zh)": "1,2,15",
|
1401 |
+
"num(ja)": 32866,
|
1402 |
+
"len(ja)": "1,2,15",
|
1403 |
+
"num(ja-kana)": 864,
|
1404 |
+
"len(ja-kana)": "1,2,9",
|
1405 |
+
"num(ko)": 298,
|
1406 |
+
"len(ko)": "1,1,1"
|
1407 |
+
},
|
1408 |
+
"internlm/internlm-xcomposer-7b": {
|
1409 |
+
"tokenizer": "<a target=\"_blank\" href=\"https://huggingface.co/internlm/internlm-xcomposer-7b\" style=\"color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;\">internlm-xcomposer-7b</a>",
|
1410 |
+
"organization": "Shanghai AI Lab",
|
1411 |
+
"vocab_size": 103168,
|
1412 |
+
"num(digit)": 1261,
|
1413 |
+
"len(digit)": "1,3,19",
|
1414 |
+
"num(space)": 33008,
|
1415 |
+
"len(space)": "1,6,128",
|
1416 |
+
"num(ar)": 6702,
|
1417 |
+
"len(ar)": "1,4,16",
|
1418 |
+
"num(zh)": 32000,
|
1419 |
+
"len(zh)": "1,2,15",
|
1420 |
+
"num(ja)": 32866,
|
1421 |
+
"len(ja)": "1,2,15",
|
1422 |
+
"num(ja-kana)": 864,
|
1423 |
+
"len(ja-kana)": "1,2,9",
|
1424 |
+
"num(ko)": 298,
|
1425 |
+
"len(ko)": "1,1,1"
|
1426 |
+
},
|
1427 |
+
"internlm/internlm2-chat-7b": {
|
1428 |
+
"tokenizer": "<a target=\"_blank\" href=\"https://huggingface.co/internlm/internlm2-chat-7b\" style=\"color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;\">internlm2-chat-7b</a>",
|
1429 |
+
"organization": "Shanghai AI Lab",
|
1430 |
+
"vocab_size": 92544,
|
1431 |
+
"num(digit)": 1261,
|
1432 |
+
"len(digit)": "1,3,18",
|
1433 |
+
"num(space)": 28681,
|
1434 |
+
"len(space)": "1,7,128",
|
1435 |
+
"num(ar)": 30,
|
1436 |
+
"len(ar)": "1,1,1",
|
1437 |
+
"num(zh)": 31148,
|
1438 |
+
"len(zh)": "1,2,15",
|
1439 |
+
"num(ja)": 31296,
|
1440 |
+
"len(ja)": "1,2,15",
|
1441 |
+
"num(ja-kana)": 148,
|
1442 |
+
"len(ja-kana)": "1,1,1",
|
1443 |
+
"num(ko)": 83,
|
1444 |
+
"len(ko)": "1,1,1"
|
1445 |
+
},
|
1446 |
+
"internlm/internlm2-math-7b": {
|
1447 |
+
"tokenizer": "<a target=\"_blank\" href=\"https://huggingface.co/internlm/internlm2-math-7b\" style=\"color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;\">internlm2-math-7b</a>",
|
1448 |
+
"organization": "Shanghai AI Lab",
|
1449 |
+
"vocab_size": 92544,
|
1450 |
+
"num(digit)": 1261,
|
1451 |
+
"len(digit)": "1,3,18",
|
1452 |
+
"num(space)": 28681,
|
1453 |
+
"len(space)": "1,7,128",
|
1454 |
+
"num(ar)": 30,
|
1455 |
+
"len(ar)": "1,1,1",
|
1456 |
+
"num(zh)": 31148,
|
1457 |
+
"len(zh)": "1,2,15",
|
1458 |
+
"num(ja)": 31296,
|
1459 |
+
"len(ja)": "1,2,15",
|
1460 |
+
"num(ja-kana)": 148,
|
1461 |
+
"len(ja-kana)": "1,1,1",
|
1462 |
+
"num(ko)": 83,
|
1463 |
+
"len(ko)": "1,1,1"
|
1464 |
+
},
|
1465 |
+
"microsoft/Phi-3-mini-4k-instruct": {
|
1466 |
+
"tokenizer": "<a target=\"_blank\" href=\"https://huggingface.co/microsoft/Phi-3-mini-4k-instruct\" style=\"color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;\">Phi-3-mini-4k-instruct</a>",
|
1467 |
+
"organization": "Microsoft",
|
1468 |
+
"vocab_size": 32011,
|
1469 |
+
"num(digit)": 20,
|
1470 |
+
"len(digit)": "1,1,1",
|
1471 |
+
"num(space)": 61,
|
1472 |
+
"len(space)": "1,2,15",
|
1473 |
+
"num(ar)": 55,
|
1474 |
+
"len(ar)": "1,1,2",
|
1475 |
+
"num(zh)": 700,
|
1476 |
+
"len(zh)": "1,1,1",
|
1477 |
+
"num(ja)": 837,
|
1478 |
+
"len(ja)": "1,1,1",
|
1479 |
+
"num(ja-kana)": 137,
|
1480 |
+
"len(ja-kana)": "1,1,1",
|
1481 |
+
"num(ko)": 111,
|
1482 |
+
"len(ko)": "1,1,1"
|
1483 |
+
},
|
1484 |
+
"microsoft/phi-1": {
|
1485 |
+
"tokenizer": "<a target=\"_blank\" href=\"https://huggingface.co/microsoft/phi-1\" style=\"color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;\">phi-1</a>",
|
1486 |
+
"organization": "Microsoft",
|
1487 |
+
"vocab_size": 50295,
|
1488 |
+
"num(digit)": 1691,
|
1489 |
+
"len(digit)": "1,3,16",
|
1490 |
+
"num(space)": 33129,
|
1491 |
+
"len(space)": "1,7,66",
|
1492 |
+
"num(ar)": 22,
|
1493 |
+
"len(ar)": "1,1,3",
|
1494 |
+
"num(zh)": 51,
|
1495 |
+
"len(zh)": "1,1,4",
|
1496 |
+
"num(ja)": 183,
|
1497 |
+
"len(ja)": "1,1,7",
|
1498 |
+
"num(ja-kana)": 133,
|
1499 |
+
"len(ja-kana)": "1,1,7",
|
1500 |
+
"num(ko)": 0,
|
1501 |
+
"len(ko)": "-"
|
1502 |
+
},
|
1503 |
+
"microsoft/phi-2": {
|
1504 |
+
"tokenizer": "<a target=\"_blank\" href=\"https://huggingface.co/microsoft/phi-2\" style=\"color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;\">phi-2</a>",
|
1505 |
+
"organization": "Microsoft",
|
1506 |
+
"vocab_size": 50295,
|
1507 |
+
"num(digit)": 1691,
|
1508 |
+
"len(digit)": "1,3,16",
|
1509 |
+
"num(space)": 33129,
|
1510 |
+
"len(space)": "1,7,66",
|
1511 |
+
"num(ar)": 22,
|
1512 |
+
"len(ar)": "1,1,3",
|
1513 |
+
"num(zh)": 51,
|
1514 |
+
"len(zh)": "1,1,4",
|
1515 |
+
"num(ja)": 183,
|
1516 |
+
"len(ja)": "1,1,7",
|
1517 |
+
"num(ja-kana)": 133,
|
1518 |
+
"len(ja-kana)": "1,1,7",
|
1519 |
+
"num(ko)": 0,
|
1520 |
+
"len(ko)": "-"
|
1521 |
+
},
|
1522 |
+
"mistralai/Mistral-7B-v0.1": {
|
1523 |
+
"tokenizer": "<a target=\"_blank\" href=\"https://huggingface.co/mistralai/Mistral-7B-v0.1\" style=\"color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;\">Mistral-7B-v0.1</a>",
|
1524 |
+
"organization": "Mistral",
|
1525 |
+
"vocab_size": 32000,
|
1526 |
+
"num(digit)": 20,
|
1527 |
+
"len(digit)": "1,1,1",
|
1528 |
+
"num(space)": 85,
|
1529 |
+
"len(space)": "1,3,15",
|
1530 |
+
"num(ar)": 71,
|
1531 |
+
"len(ar)": "1,1,2",
|
1532 |
+
"num(zh)": 1459,
|
1533 |
+
"len(zh)": "1,1,2",
|
1534 |
+
"num(ja)": 1593,
|
1535 |
+
"len(ja)": "1,1,2",
|
1536 |
+
"num(ja-kana)": 134,
|
1537 |
+
"len(ja-kana)": "1,1,1",
|
1538 |
+
"num(ko)": 346,
|
1539 |
+
"len(ko)": "1,1,1"
|
1540 |
+
},
|
1541 |
+
"mistralai/Mixtral-8x7B-v0.1": {
|
1542 |
+
"tokenizer": "<a target=\"_blank\" href=\"https://huggingface.co/mistralai/Mixtral-8x7B-v0.1\" style=\"color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;\">Mixtral-8x7B-v0.1</a>",
|
1543 |
+
"organization": "Mistral",
|
1544 |
+
"vocab_size": 32000,
|
1545 |
+
"num(digit)": 20,
|
1546 |
+
"len(digit)": "1,1,1",
|
1547 |
+
"num(space)": 85,
|
1548 |
+
"len(space)": "1,3,15",
|
1549 |
+
"num(ar)": 71,
|
1550 |
+
"len(ar)": "1,1,2",
|
1551 |
+
"num(zh)": 1459,
|
1552 |
+
"len(zh)": "1,1,2",
|
1553 |
+
"num(ja)": 1593,
|
1554 |
+
"len(ja)": "1,1,2",
|
1555 |
+
"num(ja-kana)": 134,
|
1556 |
+
"len(ja-kana)": "1,1,1",
|
1557 |
+
"num(ko)": 346,
|
1558 |
+
"len(ko)": "1,1,1"
|
1559 |
+
},
|
1560 |
+
"openai-community/gpt2": {
|
1561 |
+
"tokenizer": "<a target=\"_blank\" href=\"https://huggingface.co/openai-community/gpt2\" style=\"color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;\">gpt2</a>",
|
1562 |
+
"organization": "OpenAI",
|
1563 |
+
"vocab_size": 50257,
|
1564 |
+
"num(digit)": 1691,
|
1565 |
+
"len(digit)": "1,3,16",
|
1566 |
+
"num(space)": 33129,
|
1567 |
+
"len(space)": "1,7,66",
|
1568 |
+
"num(ar)": 22,
|
1569 |
+
"len(ar)": "1,1,3",
|
1570 |
+
"num(zh)": 51,
|
1571 |
+
"len(zh)": "1,1,4",
|
1572 |
+
"num(ja)": 183,
|
1573 |
+
"len(ja)": "1,1,7",
|
1574 |
+
"num(ja-kana)": 133,
|
1575 |
+
"len(ja-kana)": "1,1,7",
|
1576 |
+
"num(ko)": 0,
|
1577 |
+
"len(ko)": "-"
|
1578 |
+
},
|
1579 |
+
"openai/code-davinci-002": {
|
1580 |
+
"tokenizer": "<a target=\"_blank\" href=\"https://github.com/openai/tiktoken\" style=\"color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;\">code-davinci-002</a>",
|
1581 |
+
"organization": "OpenAI",
|
1582 |
+
"vocab_size": 50281,
|
1583 |
+
"num(digit)": 1691,
|
1584 |
+
"len(digit)": "1,3,16",
|
1585 |
+
"num(space)": 33175,
|
1586 |
+
"len(space)": "1,7,66",
|
1587 |
+
"num(ar)": 22,
|
1588 |
+
"len(ar)": "1,1,3",
|
1589 |
+
"num(zh)": 51,
|
1590 |
+
"len(zh)": "1,1,4",
|
1591 |
+
"num(ja)": 183,
|
1592 |
+
"len(ja)": "1,1,7",
|
1593 |
+
"num(ja-kana)": 133,
|
1594 |
+
"len(ja-kana)": "1,1,7",
|
1595 |
+
"num(ko)": 0,
|
1596 |
+
"len(ko)": "-"
|
1597 |
+
},
|
1598 |
+
"openai/gpt-3.5-turbo": {
|
1599 |
+
"tokenizer": "<a target=\"_blank\" href=\"https://github.com/openai/tiktoken\" style=\"color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;\">gpt-3.5-turbo</a>",
|
1600 |
+
"organization": "OpenAI",
|
1601 |
+
"vocab_size": 100277,
|
1602 |
+
"num(digit)": 1110,
|
1603 |
+
"len(digit)": "1,3,3",
|
1604 |
+
"num(space)": 47472,
|
1605 |
+
"len(space)": "1,7,128",
|
1606 |
+
"num(ar)": 113,
|
1607 |
+
"len(ar)": "1,2,10",
|
1608 |
+
"num(zh)": 868,
|
1609 |
+
"len(zh)": "1,1,7",
|
1610 |
+
"num(ja)": 1035,
|
1611 |
+
"len(ja)": "1,1,7",
|
1612 |
+
"num(ja-kana)": 169,
|
1613 |
+
"len(ja-kana)": "1,1,7",
|
1614 |
+
"num(ko)": 299,
|
1615 |
+
"len(ko)": "1,2,4"
|
1616 |
+
},
|
1617 |
+
"openai/gpt-4o": {
|
1618 |
+
"tokenizer": "<a target=\"_blank\" href=\"https://github.com/openai/tiktoken\" style=\"color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;\">gpt-4o</a>",
|
1619 |
+
"organization": "OpenAI",
|
1620 |
+
"vocab_size": 200019,
|
1621 |
+
"num(digit)": 1110,
|
1622 |
+
"len(digit)": "1,3,3",
|
1623 |
+
"num(space)": 109316,
|
1624 |
+
"len(space)": "1,6,128",
|
1625 |
+
"num(ar)": 8055,
|
1626 |
+
"len(ar)": "1,4,12",
|
1627 |
+
"num(zh)": 7563,
|
1628 |
+
"len(zh)": "1,2,11",
|
1629 |
+
"num(ja)": 8292,
|
1630 |
+
"len(ja)": "1,2,11",
|
1631 |
+
"num(ja-kana)": 809,
|
1632 |
+
"len(ja-kana)": "1,2,11",
|
1633 |
+
"num(ko)": 2365,
|
1634 |
+
"len(ko)": "1,2,8"
|
1635 |
+
},
|
1636 |
+
"openai/text-davinci-003": {
|
1637 |
+
"tokenizer": "<a target=\"_blank\" href=\"https://github.com/openai/tiktoken\" style=\"color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;\">text-davinci-003</a>",
|
1638 |
+
"organization": "OpenAI",
|
1639 |
+
"vocab_size": 50281,
|
1640 |
+
"num(digit)": 1691,
|
1641 |
+
"len(digit)": "1,3,16",
|
1642 |
+
"num(space)": 33175,
|
1643 |
+
"len(space)": "1,7,66",
|
1644 |
+
"num(ar)": 22,
|
1645 |
+
"len(ar)": "1,1,3",
|
1646 |
+
"num(zh)": 51,
|
1647 |
+
"len(zh)": "1,1,4",
|
1648 |
+
"num(ja)": 183,
|
1649 |
+
"len(ja)": "1,1,7",
|
1650 |
+
"num(ja-kana)": 133,
|
1651 |
+
"len(ja-kana)": "1,1,7",
|
1652 |
+
"num(ko)": 0,
|
1653 |
+
"len(ko)": "-"
|
1654 |
+
},
|
1655 |
+
"thu-coai/CharacterGLM-6B": {
|
1656 |
+
"tokenizer": "<a target=\"_blank\" href=\"https://huggingface.co/thu-coai/CharacterGLM-6B\" style=\"color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;\">CharacterGLM-6B</a>",
|
1657 |
+
"organization": "Tsinghua",
|
1658 |
+
"vocab_size": 64789,
|
1659 |
+
"num(digit)": 20,
|
1660 |
+
"len(digit)": "1,1,1",
|
1661 |
+
"num(space)": 67,
|
1662 |
+
"len(space)": "1,2,15",
|
1663 |
+
"num(ar)": 57,
|
1664 |
+
"len(ar)": "1,1,2",
|
1665 |
+
"num(zh)": 30922,
|
1666 |
+
"len(zh)": "1,2,16",
|
1667 |
+
"num(ja)": 31065,
|
1668 |
+
"len(ja)": "1,2,16",
|
1669 |
+
"num(ja-kana)": 143,
|
1670 |
+
"len(ja-kana)": "1,1,1",
|
1671 |
+
"num(ko)": 604,
|
1672 |
+
"len(ko)": "1,1,1"
|
1673 |
+
},
|
1674 |
+
"tiiuae/falcon-180b": {
|
1675 |
+
"tokenizer": "<a target=\"_blank\" href=\"https://huggingface.co/tiiuae/falcon-180b\" style=\"color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;\">falcon-180b</a>",
|
1676 |
+
"organization": "TII",
|
1677 |
+
"vocab_size": 65024,
|
1678 |
+
"num(digit)": 1108,
|
1679 |
+
"len(digit)": "1,3,3",
|
1680 |
+
"num(space)": 40202,
|
1681 |
+
"len(space)": "1,7,65",
|
1682 |
+
"num(ar)": 21,
|
1683 |
+
"len(ar)": "1,1,4",
|
1684 |
+
"num(zh)": 1627,
|
1685 |
+
"len(zh)": "1,1,3",
|
1686 |
+
"num(ja)": 1652,
|
1687 |
+
"len(ja)": "1,1,3",
|
1688 |
+
"num(ja-kana)": 25,
|
1689 |
+
"len(ja-kana)": "1,1,1",
|
1690 |
+
"num(ko)": 1,
|
1691 |
+
"len(ko)": "1,1,1"
|
1692 |
+
},
|
1693 |
+
"tiiuae/falcon-7b": {
|
1694 |
+
"tokenizer": "<a target=\"_blank\" href=\"https://huggingface.co/tiiuae/falcon-7b\" style=\"color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;\">falcon-7b</a>",
|
1695 |
+
"organization": "TII",
|
1696 |
+
"vocab_size": 65024,
|
1697 |
+
"num(digit)": 1108,
|
1698 |
+
"len(digit)": "1,3,3",
|
1699 |
+
"num(space)": 40202,
|
1700 |
+
"len(space)": "1,7,65",
|
1701 |
+
"num(ar)": 21,
|
1702 |
+
"len(ar)": "1,1,4",
|
1703 |
+
"num(zh)": 1627,
|
1704 |
+
"len(zh)": "1,1,3",
|
1705 |
+
"num(ja)": 1652,
|
1706 |
+
"len(ja)": "1,1,3",
|
1707 |
+
"num(ja-kana)": 25,
|
1708 |
+
"len(ja-kana)": "1,1,1",
|
1709 |
+
"num(ko)": 1,
|
1710 |
+
"len(ko)": "1,1,1"
|
1711 |
+
}
|
1712 |
+
}
|
stats/compress_rate.json
DELETED
@@ -1,4286 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"amber.cc100-en": {
|
3 |
-
"vocab_size": 32000,
|
4 |
-
"n_bytes": 1124813,
|
5 |
-
"n_tokens": 294627,
|
6 |
-
"n_chars": 1121360
|
7 |
-
},
|
8 |
-
"aya_101.cc100-en": {
|
9 |
-
"vocab_size": 250100,
|
10 |
-
"n_bytes": 1124813,
|
11 |
-
"n_tokens": 317881,
|
12 |
-
"n_chars": 1121360
|
13 |
-
},
|
14 |
-
"baichuan.cc100-en": {
|
15 |
-
"vocab_size": 64000,
|
16 |
-
"n_bytes": 1124813,
|
17 |
-
"n_tokens": 280108,
|
18 |
-
"n_chars": 1121360
|
19 |
-
},
|
20 |
-
"baichuan2.cc100-en": {
|
21 |
-
"vocab_size": 125696,
|
22 |
-
"n_bytes": 1124813,
|
23 |
-
"n_tokens": 269011,
|
24 |
-
"n_chars": 1121360
|
25 |
-
},
|
26 |
-
"bert_base_cased.cc100-en": {
|
27 |
-
"vocab_size": 28996,
|
28 |
-
"n_bytes": 1124813,
|
29 |
-
"n_tokens": 288022,
|
30 |
-
"n_chars": 1121360
|
31 |
-
},
|
32 |
-
"bert_base_chinese.cc100-en": {
|
33 |
-
"vocab_size": 21128,
|
34 |
-
"n_bytes": 1124813,
|
35 |
-
"n_tokens": 377068,
|
36 |
-
"n_chars": 1121360
|
37 |
-
},
|
38 |
-
"bert_base_uncased.cc100-en": {
|
39 |
-
"vocab_size": 30522,
|
40 |
-
"n_bytes": 1124813,
|
41 |
-
"n_tokens": 280575,
|
42 |
-
"n_chars": 1121360
|
43 |
-
},
|
44 |
-
"bloom.cc100-en": {
|
45 |
-
"vocab_size": 250680,
|
46 |
-
"n_bytes": 1124813,
|
47 |
-
"n_tokens": 257405,
|
48 |
-
"n_chars": 1121360
|
49 |
-
},
|
50 |
-
"byt5_small.cc100-en": {
|
51 |
-
"vocab_size": 384,
|
52 |
-
"n_bytes": 1124813,
|
53 |
-
"n_tokens": 1134813,
|
54 |
-
"n_chars": 1121360
|
55 |
-
},
|
56 |
-
"character_glm_6b.cc100-en": {
|
57 |
-
"vocab_size": 64789,
|
58 |
-
"n_bytes": 1124813,
|
59 |
-
"n_tokens": 289347,
|
60 |
-
"n_chars": 1121360
|
61 |
-
},
|
62 |
-
"chatglm2_6b.cc100-en": {
|
63 |
-
"vocab_size": 64787,
|
64 |
-
"n_bytes": 1124813,
|
65 |
-
"n_tokens": 289329,
|
66 |
-
"n_chars": 1121360
|
67 |
-
},
|
68 |
-
"chatglm3_6b.cc100-en": {
|
69 |
-
"vocab_size": 64796,
|
70 |
-
"n_bytes": 1124813,
|
71 |
-
"n_tokens": 289347,
|
72 |
-
"n_chars": 1121360
|
73 |
-
},
|
74 |
-
"chatglm_6b.cc100-en": {
|
75 |
-
"vocab_size": 150344,
|
76 |
-
"n_bytes": 1124813,
|
77 |
-
"n_tokens": 284761,
|
78 |
-
"n_chars": 1121360
|
79 |
-
},
|
80 |
-
"chatyuan_large_v2.cc100-en": {
|
81 |
-
"vocab_size": 32128,
|
82 |
-
"n_bytes": 1124813,
|
83 |
-
"n_tokens": 536033,
|
84 |
-
"n_chars": 1121360
|
85 |
-
},
|
86 |
-
"chinese_llama.cc100-en": {
|
87 |
-
"vocab_size": 49953,
|
88 |
-
"n_bytes": 1124813,
|
89 |
-
"n_tokens": 291514,
|
90 |
-
"n_chars": 1121360
|
91 |
-
},
|
92 |
-
"chinese_llama2.cc100-en": {
|
93 |
-
"vocab_size": 55296,
|
94 |
-
"n_bytes": 1124813,
|
95 |
-
"n_tokens": 294627,
|
96 |
-
"n_chars": 1121360
|
97 |
-
},
|
98 |
-
"code_davinci_002.cc100-en": {
|
99 |
-
"vocab_size": 50281,
|
100 |
-
"n_bytes": 1124813,
|
101 |
-
"n_tokens": 258403,
|
102 |
-
"n_chars": 1121360
|
103 |
-
},
|
104 |
-
"crystal_coder.cc100-en": {
|
105 |
-
"vocab_size": 32022,
|
106 |
-
"n_bytes": 1124813,
|
107 |
-
"n_tokens": 284627,
|
108 |
-
"n_chars": 1121360
|
109 |
-
},
|
110 |
-
"dbrx_instruct.cc100-en": {
|
111 |
-
"vocab_size": 100280,
|
112 |
-
"n_bytes": 1124813,
|
113 |
-
"n_tokens": 254985,
|
114 |
-
"n_chars": 1121360
|
115 |
-
},
|
116 |
-
"deepseek_coder_33b_instruct.cc100-en": {
|
117 |
-
"vocab_size": 32022,
|
118 |
-
"n_bytes": 1124813,
|
119 |
-
"n_tokens": 287408,
|
120 |
-
"n_chars": 1121360
|
121 |
-
},
|
122 |
-
"deepseek_llm_7b_base.cc100-en": {
|
123 |
-
"vocab_size": 100015,
|
124 |
-
"n_bytes": 1124813,
|
125 |
-
"n_tokens": 272324,
|
126 |
-
"n_chars": 1121360
|
127 |
-
},
|
128 |
-
"falcon_180b.cc100-en": {
|
129 |
-
"vocab_size": 65024,
|
130 |
-
"n_bytes": 1124813,
|
131 |
-
"n_tokens": 262509,
|
132 |
-
"n_chars": 1121360
|
133 |
-
},
|
134 |
-
"falcon_7b.cc100-en": {
|
135 |
-
"vocab_size": 65024,
|
136 |
-
"n_bytes": 1124813,
|
137 |
-
"n_tokens": 262509,
|
138 |
-
"n_chars": 1121360
|
139 |
-
},
|
140 |
-
"fastchat_t5_3b.cc100-en": {
|
141 |
-
"vocab_size": 32110,
|
142 |
-
"n_bytes": 1124813,
|
143 |
-
"n_tokens": 484941,
|
144 |
-
"n_chars": 1121360
|
145 |
-
},
|
146 |
-
"flan_t5_base.cc100-en": {
|
147 |
-
"vocab_size": 32100,
|
148 |
-
"n_bytes": 1124813,
|
149 |
-
"n_tokens": 290104,
|
150 |
-
"n_chars": 1121360
|
151 |
-
},
|
152 |
-
"gemma_7b.cc100-en": {
|
153 |
-
"vocab_size": 256000,
|
154 |
-
"n_bytes": 1124813,
|
155 |
-
"n_tokens": 268010,
|
156 |
-
"n_chars": 1121360
|
157 |
-
},
|
158 |
-
"gpt2.cc100-en": {
|
159 |
-
"vocab_size": 50257,
|
160 |
-
"n_bytes": 1124813,
|
161 |
-
"n_tokens": 258428,
|
162 |
-
"n_chars": 1121360
|
163 |
-
},
|
164 |
-
"gpt2_chinese.cc100-en": {
|
165 |
-
"vocab_size": 21128,
|
166 |
-
"n_bytes": 1124813,
|
167 |
-
"n_tokens": 392641,
|
168 |
-
"n_chars": 1121360
|
169 |
-
},
|
170 |
-
"gpt_35_turbo.cc100-en": {
|
171 |
-
"vocab_size": 100277,
|
172 |
-
"n_bytes": 1124813,
|
173 |
-
"n_tokens": 254985,
|
174 |
-
"n_chars": 1121360
|
175 |
-
},
|
176 |
-
"gpt_4.cc100-en": {
|
177 |
-
"vocab_size": 100277,
|
178 |
-
"n_bytes": 1124813,
|
179 |
-
"n_tokens": 254985,
|
180 |
-
"n_chars": 1121360
|
181 |
-
},
|
182 |
-
"gpt_nexo_20b.cc100-en": {
|
183 |
-
"vocab_size": 50277,
|
184 |
-
"n_bytes": 1124813,
|
185 |
-
"n_tokens": 259357,
|
186 |
-
"n_chars": 1121360
|
187 |
-
},
|
188 |
-
"grok_1.cc100-en": {
|
189 |
-
"vocab_size": 131072,
|
190 |
-
"n_bytes": 1124813,
|
191 |
-
"n_tokens": 258048,
|
192 |
-
"n_chars": 1121360
|
193 |
-
},
|
194 |
-
"internlm2_chat_7b.cc100-en": {
|
195 |
-
"vocab_size": 92544,
|
196 |
-
"n_bytes": 1124813,
|
197 |
-
"n_tokens": 271583,
|
198 |
-
"n_chars": 1121360
|
199 |
-
},
|
200 |
-
"internlm2_math_7b.cc100-en": {
|
201 |
-
"vocab_size": 92544,
|
202 |
-
"n_bytes": 1124813,
|
203 |
-
"n_tokens": 271583,
|
204 |
-
"n_chars": 1121360
|
205 |
-
},
|
206 |
-
"internlm_chat_7b.cc100-en": {
|
207 |
-
"vocab_size": 103168,
|
208 |
-
"n_bytes": 1124813,
|
209 |
-
"n_tokens": 271293,
|
210 |
-
"n_chars": 1121360
|
211 |
-
},
|
212 |
-
"internlm_xcomposer_7b.cc100-en": {
|
213 |
-
"vocab_size": 103168,
|
214 |
-
"n_bytes": 1124813,
|
215 |
-
"n_tokens": 271293,
|
216 |
-
"n_chars": 1121360
|
217 |
-
},
|
218 |
-
"jamba_v0_1.cc100-en": {
|
219 |
-
"vocab_size": 65536,
|
220 |
-
"n_bytes": 1124813,
|
221 |
-
"n_tokens": 274242,
|
222 |
-
"n_chars": 1121360
|
223 |
-
},
|
224 |
-
"kplug.cc100-en": {
|
225 |
-
"vocab_size": 10261,
|
226 |
-
"n_bytes": 1124813,
|
227 |
-
"n_tokens": 393564,
|
228 |
-
"n_chars": 1121360
|
229 |
-
},
|
230 |
-
"llama.cc100-en": {
|
231 |
-
"vocab_size": 32000,
|
232 |
-
"n_bytes": 1124813,
|
233 |
-
"n_tokens": 294627,
|
234 |
-
"n_chars": 1121360
|
235 |
-
},
|
236 |
-
"llama2.cc100-en": {
|
237 |
-
"vocab_size": 32001,
|
238 |
-
"n_bytes": 1124813,
|
239 |
-
"n_tokens": 294627,
|
240 |
-
"n_chars": 1121360
|
241 |
-
},
|
242 |
-
"llama3.cc100-en": {
|
243 |
-
"vocab_size": 128256,
|
244 |
-
"n_bytes": 1124813,
|
245 |
-
"n_tokens": 254944,
|
246 |
-
"n_chars": 1121360
|
247 |
-
},
|
248 |
-
"mistral_7b.cc100-en": {
|
249 |
-
"vocab_size": 32000,
|
250 |
-
"n_bytes": 1124813,
|
251 |
-
"n_tokens": 285801,
|
252 |
-
"n_chars": 1121360
|
253 |
-
},
|
254 |
-
"mixtral_8_7b.cc100-en": {
|
255 |
-
"vocab_size": 32000,
|
256 |
-
"n_bytes": 1124813,
|
257 |
-
"n_tokens": 285801,
|
258 |
-
"n_chars": 1121360
|
259 |
-
},
|
260 |
-
"mobilebert_uncased.cc100-en": {
|
261 |
-
"vocab_size": 30522,
|
262 |
-
"n_bytes": 1124813,
|
263 |
-
"n_tokens": 280575,
|
264 |
-
"n_chars": 1121360
|
265 |
-
},
|
266 |
-
"moss.cc100-en": {
|
267 |
-
"vocab_size": 106072,
|
268 |
-
"n_bytes": 1124813,
|
269 |
-
"n_tokens": 257070,
|
270 |
-
"n_chars": 1121360
|
271 |
-
},
|
272 |
-
"mt5_large.cc100-en": {
|
273 |
-
"vocab_size": 250100,
|
274 |
-
"n_bytes": 1124813,
|
275 |
-
"n_tokens": 317881,
|
276 |
-
"n_chars": 1121360
|
277 |
-
},
|
278 |
-
"olmo_7b.cc100-en": {
|
279 |
-
"vocab_size": 50280,
|
280 |
-
"n_bytes": 1124813,
|
281 |
-
"n_tokens": 259357,
|
282 |
-
"n_chars": 1121360
|
283 |
-
},
|
284 |
-
"orion_14b_chat.cc100-en": {
|
285 |
-
"vocab_size": 84608,
|
286 |
-
"n_bytes": 1124813,
|
287 |
-
"n_tokens": 265948,
|
288 |
-
"n_chars": 1121360
|
289 |
-
},
|
290 |
-
"phi_1.cc100-en": {
|
291 |
-
"vocab_size": 50295,
|
292 |
-
"n_bytes": 1124813,
|
293 |
-
"n_tokens": 258409,
|
294 |
-
"n_chars": 1121360
|
295 |
-
},
|
296 |
-
"phi_2.cc100-en": {
|
297 |
-
"vocab_size": 50295,
|
298 |
-
"n_bytes": 1124813,
|
299 |
-
"n_tokens": 258409,
|
300 |
-
"n_chars": 1121360
|
301 |
-
},
|
302 |
-
"phi_3_mini.cc100-en": {
|
303 |
-
"vocab_size": 32011,
|
304 |
-
"n_bytes": 1124813,
|
305 |
-
"n_tokens": 294627,
|
306 |
-
"n_chars": 1121360
|
307 |
-
},
|
308 |
-
"pko_t5_large.cc100-en": {
|
309 |
-
"vocab_size": 50358,
|
310 |
-
"n_bytes": 1124813,
|
311 |
-
"n_tokens": 658985,
|
312 |
-
"n_chars": 1121360
|
313 |
-
},
|
314 |
-
"prompt_clue.cc100-en": {
|
315 |
-
"vocab_size": 32128,
|
316 |
-
"n_bytes": 1124813,
|
317 |
-
"n_tokens": 536033,
|
318 |
-
"n_chars": 1121360
|
319 |
-
},
|
320 |
-
"qwen1_5_14b_chat.cc100-en": {
|
321 |
-
"vocab_size": 151646,
|
322 |
-
"n_bytes": 1124813,
|
323 |
-
"n_tokens": 257983,
|
324 |
-
"n_chars": 1121360
|
325 |
-
},
|
326 |
-
"qwen_1_8b_chat.cc100-en": {
|
327 |
-
"vocab_size": 151851,
|
328 |
-
"n_bytes": 1124813,
|
329 |
-
"n_tokens": 257983,
|
330 |
-
"n_chars": 1121360
|
331 |
-
},
|
332 |
-
"qwen_72b_chat.cc100-en": {
|
333 |
-
"vocab_size": 151851,
|
334 |
-
"n_bytes": 1124813,
|
335 |
-
"n_tokens": 257983,
|
336 |
-
"n_chars": 1121360
|
337 |
-
},
|
338 |
-
"qwen_7b_chat.cc100-en": {
|
339 |
-
"vocab_size": 151851,
|
340 |
-
"n_bytes": 1124813,
|
341 |
-
"n_tokens": 257983,
|
342 |
-
"n_chars": 1121360
|
343 |
-
},
|
344 |
-
"roberta_chinese_clue.cc100-en": {
|
345 |
-
"vocab_size": 8021,
|
346 |
-
"n_bytes": 1124813,
|
347 |
-
"n_tokens": 583058,
|
348 |
-
"n_chars": 1121360
|
349 |
-
},
|
350 |
-
"skywork_13b_base.cc100-en": {
|
351 |
-
"vocab_size": 65519,
|
352 |
-
"n_bytes": 1124813,
|
353 |
-
"n_tokens": 294617,
|
354 |
-
"n_chars": 1121360
|
355 |
-
},
|
356 |
-
"skywork_13b_math.cc100-en": {
|
357 |
-
"vocab_size": 65519,
|
358 |
-
"n_bytes": 1124813,
|
359 |
-
"n_tokens": 294617,
|
360 |
-
"n_chars": 1121360
|
361 |
-
},
|
362 |
-
"solar_10_7b.cc100-en": {
|
363 |
-
"vocab_size": 32000,
|
364 |
-
"n_bytes": 1124813,
|
365 |
-
"n_tokens": 285801,
|
366 |
-
"n_chars": 1121360
|
367 |
-
},
|
368 |
-
"starchat_alpha.cc100-en": {
|
369 |
-
"vocab_size": 49156,
|
370 |
-
"n_bytes": 1124813,
|
371 |
-
"n_tokens": 288965,
|
372 |
-
"n_chars": 1121360
|
373 |
-
},
|
374 |
-
"switch_c_2048.cc100-en": {
|
375 |
-
"vocab_size": 32100,
|
376 |
-
"n_bytes": 1124813,
|
377 |
-
"n_tokens": 290104,
|
378 |
-
"n_chars": 1121360
|
379 |
-
},
|
380 |
-
"t5_base.cc100-en": {
|
381 |
-
"vocab_size": 32100,
|
382 |
-
"n_bytes": 1124813,
|
383 |
-
"n_tokens": 290104,
|
384 |
-
"n_chars": 1121360
|
385 |
-
},
|
386 |
-
"t5_large.cc100-en": {
|
387 |
-
"vocab_size": 32100,
|
388 |
-
"n_bytes": 1124813,
|
389 |
-
"n_tokens": 290104,
|
390 |
-
"n_chars": 1121360
|
391 |
-
},
|
392 |
-
"t5_small.cc100-en": {
|
393 |
-
"vocab_size": 32100,
|
394 |
-
"n_bytes": 1124813,
|
395 |
-
"n_tokens": 290104,
|
396 |
-
"n_chars": 1121360
|
397 |
-
},
|
398 |
-
"text_davinci_003.cc100-en": {
|
399 |
-
"vocab_size": 50281,
|
400 |
-
"n_bytes": 1124813,
|
401 |
-
"n_tokens": 258403,
|
402 |
-
"n_chars": 1121360
|
403 |
-
},
|
404 |
-
"tigerbot_13b_chat_v2.cc100-en": {
|
405 |
-
"vocab_size": 60515,
|
406 |
-
"n_bytes": 1124813,
|
407 |
-
"n_tokens": 285652,
|
408 |
-
"n_chars": 1121360
|
409 |
-
},
|
410 |
-
"tigerbot_70b_chat_v4_4k.cc100-en": {
|
411 |
-
"vocab_size": 65110,
|
412 |
-
"n_bytes": 1124813,
|
413 |
-
"n_tokens": 286946,
|
414 |
-
"n_chars": 1121360
|
415 |
-
},
|
416 |
-
"wizardcoder_15b_v1.cc100-en": {
|
417 |
-
"vocab_size": 49153,
|
418 |
-
"n_bytes": 1124813,
|
419 |
-
"n_tokens": 288965,
|
420 |
-
"n_chars": 1121360
|
421 |
-
},
|
422 |
-
"wizardcoder_python_7b_v1.cc100-en": {
|
423 |
-
"vocab_size": 32001,
|
424 |
-
"n_bytes": 1124813,
|
425 |
-
"n_tokens": 294627,
|
426 |
-
"n_chars": 1121360
|
427 |
-
},
|
428 |
-
"wizardlm_7b_v1.cc100-en": {
|
429 |
-
"vocab_size": 32001,
|
430 |
-
"n_bytes": 1124813,
|
431 |
-
"n_tokens": 294627,
|
432 |
-
"n_chars": 1121360
|
433 |
-
},
|
434 |
-
"wizardmath_70b_v1.cc100-en": {
|
435 |
-
"vocab_size": 32002,
|
436 |
-
"n_bytes": 1124813,
|
437 |
-
"n_tokens": 294627,
|
438 |
-
"n_chars": 1121360
|
439 |
-
},
|
440 |
-
"xlm_roberta.cc100-en": {
|
441 |
-
"vocab_size": 250002,
|
442 |
-
"n_bytes": 1124813,
|
443 |
-
"n_tokens": 300026,
|
444 |
-
"n_chars": 1121360
|
445 |
-
},
|
446 |
-
"yi_34b.cc100-en": {
|
447 |
-
"vocab_size": 64000,
|
448 |
-
"n_bytes": 1124813,
|
449 |
-
"n_tokens": 270400,
|
450 |
-
"n_chars": 1121360
|
451 |
-
},
|
452 |
-
"yi_6b.cc100-en": {
|
453 |
-
"vocab_size": 64000,
|
454 |
-
"n_bytes": 1124813,
|
455 |
-
"n_tokens": 270400,
|
456 |
-
"n_chars": 1121360
|
457 |
-
},
|
458 |
-
"yi_vl34b.cc100-en": {
|
459 |
-
"vocab_size": 64000,
|
460 |
-
"n_bytes": 1124813,
|
461 |
-
"n_tokens": 269738,
|
462 |
-
"n_chars": 1121360
|
463 |
-
},
|
464 |
-
"zephyr_7b_beta.cc100-en": {
|
465 |
-
"vocab_size": 32000,
|
466 |
-
"n_bytes": 1124813,
|
467 |
-
"n_tokens": 285801,
|
468 |
-
"n_chars": 1121360
|
469 |
-
},
|
470 |
-
"amber.cc100-zh-Hans": {
|
471 |
-
"vocab_size": 32000,
|
472 |
-
"n_bytes": 2633047,
|
473 |
-
"n_tokens": 1330093,
|
474 |
-
"n_chars": 927311
|
475 |
-
},
|
476 |
-
"aya_101.cc100-zh-Hans": {
|
477 |
-
"vocab_size": 250100,
|
478 |
-
"n_bytes": 2633047,
|
479 |
-
"n_tokens": 631182,
|
480 |
-
"n_chars": 927311
|
481 |
-
},
|
482 |
-
"baichuan.cc100-zh-Hans": {
|
483 |
-
"vocab_size": 64000,
|
484 |
-
"n_bytes": 2633047,
|
485 |
-
"n_tokens": 626117,
|
486 |
-
"n_chars": 927311
|
487 |
-
},
|
488 |
-
"baichuan2.cc100-zh-Hans": {
|
489 |
-
"vocab_size": 125696,
|
490 |
-
"n_bytes": 2633047,
|
491 |
-
"n_tokens": 541464,
|
492 |
-
"n_chars": 927311
|
493 |
-
},
|
494 |
-
"bert_base_cased.cc100-zh-Hans": {
|
495 |
-
"vocab_size": 28996,
|
496 |
-
"n_bytes": 2633047,
|
497 |
-
"n_tokens": 899709,
|
498 |
-
"n_chars": 927311
|
499 |
-
},
|
500 |
-
"bert_base_chinese.cc100-zh-Hans": {
|
501 |
-
"vocab_size": 21128,
|
502 |
-
"n_bytes": 2633047,
|
503 |
-
"n_tokens": 896599,
|
504 |
-
"n_chars": 927311
|
505 |
-
},
|
506 |
-
"bert_base_uncased.cc100-zh-Hans": {
|
507 |
-
"vocab_size": 30522,
|
508 |
-
"n_bytes": 2633047,
|
509 |
-
"n_tokens": 898554,
|
510 |
-
"n_chars": 927311
|
511 |
-
},
|
512 |
-
"bloom.cc100-zh-Hans": {
|
513 |
-
"vocab_size": 250680,
|
514 |
-
"n_bytes": 2633047,
|
515 |
-
"n_tokens": 573008,
|
516 |
-
"n_chars": 927311
|
517 |
-
},
|
518 |
-
"byt5_small.cc100-zh-Hans": {
|
519 |
-
"vocab_size": 384,
|
520 |
-
"n_bytes": 2633047,
|
521 |
-
"n_tokens": 2643047,
|
522 |
-
"n_chars": 927311
|
523 |
-
},
|
524 |
-
"character_glm_6b.cc100-zh-Hans": {
|
525 |
-
"vocab_size": 64789,
|
526 |
-
"n_bytes": 2633047,
|
527 |
-
"n_tokens": 583646,
|
528 |
-
"n_chars": 927311
|
529 |
-
},
|
530 |
-
"chatglm2_6b.cc100-zh-Hans": {
|
531 |
-
"vocab_size": 64787,
|
532 |
-
"n_bytes": 2633047,
|
533 |
-
"n_tokens": 583646,
|
534 |
-
"n_chars": 927311
|
535 |
-
},
|
536 |
-
"chatglm3_6b.cc100-zh-Hans": {
|
537 |
-
"vocab_size": 64796,
|
538 |
-
"n_bytes": 2633047,
|
539 |
-
"n_tokens": 583646,
|
540 |
-
"n_chars": 927311
|
541 |
-
},
|
542 |
-
"chatglm_6b.cc100-zh-Hans": {
|
543 |
-
"vocab_size": 150344,
|
544 |
-
"n_bytes": 2633047,
|
545 |
-
"n_tokens": 527384,
|
546 |
-
"n_chars": 927311
|
547 |
-
},
|
548 |
-
"chatyuan_large_v2.cc100-zh-Hans": {
|
549 |
-
"vocab_size": 32128,
|
550 |
-
"n_bytes": 2633047,
|
551 |
-
"n_tokens": 564905,
|
552 |
-
"n_chars": 927311
|
553 |
-
},
|
554 |
-
"chinese_llama.cc100-zh-Hans": {
|
555 |
-
"vocab_size": 49953,
|
556 |
-
"n_bytes": 2633047,
|
557 |
-
"n_tokens": 623219,
|
558 |
-
"n_chars": 927311
|
559 |
-
},
|
560 |
-
"chinese_llama2.cc100-zh-Hans": {
|
561 |
-
"vocab_size": 55296,
|
562 |
-
"n_bytes": 2633047,
|
563 |
-
"n_tokens": 625766,
|
564 |
-
"n_chars": 927311
|
565 |
-
},
|
566 |
-
"code_davinci_002.cc100-zh-Hans": {
|
567 |
-
"vocab_size": 50281,
|
568 |
-
"n_bytes": 2633047,
|
569 |
-
"n_tokens": 1876809,
|
570 |
-
"n_chars": 927311
|
571 |
-
},
|
572 |
-
"crystal_coder.cc100-zh-Hans": {
|
573 |
-
"vocab_size": 32022,
|
574 |
-
"n_bytes": 2633047,
|
575 |
-
"n_tokens": 1320093,
|
576 |
-
"n_chars": 927311
|
577 |
-
},
|
578 |
-
"dbrx_instruct.cc100-zh-Hans": {
|
579 |
-
"vocab_size": 100280,
|
580 |
-
"n_bytes": 2633047,
|
581 |
-
"n_tokens": 1084939,
|
582 |
-
"n_chars": 927311
|
583 |
-
},
|
584 |
-
"deepseek_coder_33b_instruct.cc100-zh-Hans": {
|
585 |
-
"vocab_size": 32022,
|
586 |
-
"n_bytes": 2633047,
|
587 |
-
"n_tokens": 720577,
|
588 |
-
"n_chars": 927311
|
589 |
-
},
|
590 |
-
"deepseek_llm_7b_base.cc100-zh-Hans": {
|
591 |
-
"vocab_size": 100015,
|
592 |
-
"n_bytes": 2633047,
|
593 |
-
"n_tokens": 605081,
|
594 |
-
"n_chars": 927311
|
595 |
-
},
|
596 |
-
"falcon_180b.cc100-zh-Hans": {
|
597 |
-
"vocab_size": 65024,
|
598 |
-
"n_bytes": 2633047,
|
599 |
-
"n_tokens": 1124681,
|
600 |
-
"n_chars": 927311
|
601 |
-
},
|
602 |
-
"falcon_7b.cc100-zh-Hans": {
|
603 |
-
"vocab_size": 65024,
|
604 |
-
"n_bytes": 2633047,
|
605 |
-
"n_tokens": 1124681,
|
606 |
-
"n_chars": 927311
|
607 |
-
},
|
608 |
-
"fastchat_t5_3b.cc100-zh-Hans": {
|
609 |
-
"vocab_size": 32110,
|
610 |
-
"n_bytes": 2633047,
|
611 |
-
"n_tokens": 178974,
|
612 |
-
"n_chars": 927311
|
613 |
-
},
|
614 |
-
"flan_t5_base.cc100-zh-Hans": {
|
615 |
-
"vocab_size": 32100,
|
616 |
-
"n_bytes": 2633047,
|
617 |
-
"n_tokens": 173520,
|
618 |
-
"n_chars": 927311
|
619 |
-
},
|
620 |
-
"gemma_7b.cc100-zh-Hans": {
|
621 |
-
"vocab_size": 256000,
|
622 |
-
"n_bytes": 2633047,
|
623 |
-
"n_tokens": 641795,
|
624 |
-
"n_chars": 927311
|
625 |
-
},
|
626 |
-
"gpt2.cc100-zh-Hans": {
|
627 |
-
"vocab_size": 50257,
|
628 |
-
"n_bytes": 2633047,
|
629 |
-
"n_tokens": 1876809,
|
630 |
-
"n_chars": 927311
|
631 |
-
},
|
632 |
-
"gpt2_chinese.cc100-zh-Hans": {
|
633 |
-
"vocab_size": 21128,
|
634 |
-
"n_bytes": 2633047,
|
635 |
-
"n_tokens": 899506,
|
636 |
-
"n_chars": 927311
|
637 |
-
},
|
638 |
-
"gpt_35_turbo.cc100-zh-Hans": {
|
639 |
-
"vocab_size": 100277,
|
640 |
-
"n_bytes": 2633047,
|
641 |
-
"n_tokens": 1084939,
|
642 |
-
"n_chars": 927311
|
643 |
-
},
|
644 |
-
"gpt_4.cc100-zh-Hans": {
|
645 |
-
"vocab_size": 100277,
|
646 |
-
"n_bytes": 2633047,
|
647 |
-
"n_tokens": 1084939,
|
648 |
-
"n_chars": 927311
|
649 |
-
},
|
650 |
-
"gpt_nexo_20b.cc100-zh-Hans": {
|
651 |
-
"vocab_size": 50277,
|
652 |
-
"n_bytes": 2633047,
|
653 |
-
"n_tokens": 1220529,
|
654 |
-
"n_chars": 927311
|
655 |
-
},
|
656 |
-
"grok_1.cc100-zh-Hans": {
|
657 |
-
"vocab_size": 131072,
|
658 |
-
"n_bytes": 2633047,
|
659 |
-
"n_tokens": 1414508,
|
660 |
-
"n_chars": 927311
|
661 |
-
},
|
662 |
-
"internlm2_chat_7b.cc100-zh-Hans": {
|
663 |
-
"vocab_size": 92544,
|
664 |
-
"n_bytes": 2633047,
|
665 |
-
"n_tokens": 579976,
|
666 |
-
"n_chars": 927311
|
667 |
-
},
|
668 |
-
"internlm2_math_7b.cc100-zh-Hans": {
|
669 |
-
"vocab_size": 92544,
|
670 |
-
"n_bytes": 2633047,
|
671 |
-
"n_tokens": 579976,
|
672 |
-
"n_chars": 927311
|
673 |
-
},
|
674 |
-
"internlm_chat_7b.cc100-zh-Hans": {
|
675 |
-
"vocab_size": 103168,
|
676 |
-
"n_bytes": 2633047,
|
677 |
-
"n_tokens": 579109,
|
678 |
-
"n_chars": 927311
|
679 |
-
},
|
680 |
-
"internlm_xcomposer_7b.cc100-zh-Hans": {
|
681 |
-
"vocab_size": 103168,
|
682 |
-
"n_bytes": 2633047,
|
683 |
-
"n_tokens": 579109,
|
684 |
-
"n_chars": 927311
|
685 |
-
},
|
686 |
-
"jamba_v0_1.cc100-zh-Hans": {
|
687 |
-
"vocab_size": 65536,
|
688 |
-
"n_bytes": 2633047,
|
689 |
-
"n_tokens": 1067054,
|
690 |
-
"n_chars": 927311
|
691 |
-
},
|
692 |
-
"kplug.cc100-zh-Hans": {
|
693 |
-
"vocab_size": 10261,
|
694 |
-
"n_bytes": 2633047,
|
695 |
-
"n_tokens": 902451,
|
696 |
-
"n_chars": 927311
|
697 |
-
},
|
698 |
-
"llama.cc100-zh-Hans": {
|
699 |
-
"vocab_size": 32000,
|
700 |
-
"n_bytes": 2633047,
|
701 |
-
"n_tokens": 1330093,
|
702 |
-
"n_chars": 927311
|
703 |
-
},
|
704 |
-
"llama2.cc100-zh-Hans": {
|
705 |
-
"vocab_size": 32001,
|
706 |
-
"n_bytes": 2633047,
|
707 |
-
"n_tokens": 1330093,
|
708 |
-
"n_chars": 927311
|
709 |
-
},
|
710 |
-
"llama3.cc100-zh-Hans": {
|
711 |
-
"vocab_size": 128256,
|
712 |
-
"n_bytes": 2633047,
|
713 |
-
"n_tokens": 747405,
|
714 |
-
"n_chars": 927311
|
715 |
-
},
|
716 |
-
"mistral_7b.cc100-zh-Hans": {
|
717 |
-
"vocab_size": 32000,
|
718 |
-
"n_bytes": 2633047,
|
719 |
-
"n_tokens": 1041023,
|
720 |
-
"n_chars": 927311
|
721 |
-
},
|
722 |
-
"mixtral_8_7b.cc100-zh-Hans": {
|
723 |
-
"vocab_size": 32000,
|
724 |
-
"n_bytes": 2633047,
|
725 |
-
"n_tokens": 1041023,
|
726 |
-
"n_chars": 927311
|
727 |
-
},
|
728 |
-
"mobilebert_uncased.cc100-zh-Hans": {
|
729 |
-
"vocab_size": 30522,
|
730 |
-
"n_bytes": 2633047,
|
731 |
-
"n_tokens": 898554,
|
732 |
-
"n_chars": 927311
|
733 |
-
},
|
734 |
-
"moss.cc100-zh-Hans": {
|
735 |
-
"vocab_size": 106072,
|
736 |
-
"n_bytes": 2633047,
|
737 |
-
"n_tokens": 557455,
|
738 |
-
"n_chars": 927311
|
739 |
-
},
|
740 |
-
"mt5_large.cc100-zh-Hans": {
|
741 |
-
"vocab_size": 250100,
|
742 |
-
"n_bytes": 2633047,
|
743 |
-
"n_tokens": 631182,
|
744 |
-
"n_chars": 927311
|
745 |
-
},
|
746 |
-
"olmo_7b.cc100-zh-Hans": {
|
747 |
-
"vocab_size": 50280,
|
748 |
-
"n_bytes": 2633047,
|
749 |
-
"n_tokens": 1220529,
|
750 |
-
"n_chars": 927311
|
751 |
-
},
|
752 |
-
"orion_14b_chat.cc100-zh-Hans": {
|
753 |
-
"vocab_size": 84608,
|
754 |
-
"n_bytes": 2633047,
|
755 |
-
"n_tokens": 529926,
|
756 |
-
"n_chars": 927311
|
757 |
-
},
|
758 |
-
"phi_1.cc100-zh-Hans": {
|
759 |
-
"vocab_size": 50295,
|
760 |
-
"n_bytes": 2633047,
|
761 |
-
"n_tokens": 1876809,
|
762 |
-
"n_chars": 927311
|
763 |
-
},
|
764 |
-
"phi_2.cc100-zh-Hans": {
|
765 |
-
"vocab_size": 50295,
|
766 |
-
"n_bytes": 2633047,
|
767 |
-
"n_tokens": 1876809,
|
768 |
-
"n_chars": 927311
|
769 |
-
},
|
770 |
-
"phi_3_mini.cc100-zh-Hans": {
|
771 |
-
"vocab_size": 32011,
|
772 |
-
"n_bytes": 2633047,
|
773 |
-
"n_tokens": 1330093,
|
774 |
-
"n_chars": 927311
|
775 |
-
},
|
776 |
-
"pko_t5_large.cc100-zh-Hans": {
|
777 |
-
"vocab_size": 50358,
|
778 |
-
"n_bytes": 2633047,
|
779 |
-
"n_tokens": 2533519,
|
780 |
-
"n_chars": 927311
|
781 |
-
},
|
782 |
-
"prompt_clue.cc100-zh-Hans": {
|
783 |
-
"vocab_size": 32128,
|
784 |
-
"n_bytes": 2633047,
|
785 |
-
"n_tokens": 564905,
|
786 |
-
"n_chars": 927311
|
787 |
-
},
|
788 |
-
"qwen1_5_14b_chat.cc100-zh-Hans": {
|
789 |
-
"vocab_size": 151646,
|
790 |
-
"n_bytes": 2633047,
|
791 |
-
"n_tokens": 589211,
|
792 |
-
"n_chars": 927311
|
793 |
-
},
|
794 |
-
"qwen_1_8b_chat.cc100-zh-Hans": {
|
795 |
-
"vocab_size": 151851,
|
796 |
-
"n_bytes": 2633047,
|
797 |
-
"n_tokens": 589211,
|
798 |
-
"n_chars": 927311
|
799 |
-
},
|
800 |
-
"qwen_72b_chat.cc100-zh-Hans": {
|
801 |
-
"vocab_size": 151851,
|
802 |
-
"n_bytes": 2633047,
|
803 |
-
"n_tokens": 589211,
|
804 |
-
"n_chars": 927311
|
805 |
-
},
|
806 |
-
"qwen_7b_chat.cc100-zh-Hans": {
|
807 |
-
"vocab_size": 151851,
|
808 |
-
"n_bytes": 2633047,
|
809 |
-
"n_tokens": 589211,
|
810 |
-
"n_chars": 927311
|
811 |
-
},
|
812 |
-
"roberta_chinese_clue.cc100-zh-Hans": {
|
813 |
-
"vocab_size": 8021,
|
814 |
-
"n_bytes": 2633047,
|
815 |
-
"n_tokens": 907144,
|
816 |
-
"n_chars": 927311
|
817 |
-
},
|
818 |
-
"skywork_13b_base.cc100-zh-Hans": {
|
819 |
-
"vocab_size": 65519,
|
820 |
-
"n_bytes": 2633047,
|
821 |
-
"n_tokens": 663923,
|
822 |
-
"n_chars": 927311
|
823 |
-
},
|
824 |
-
"skywork_13b_math.cc100-zh-Hans": {
|
825 |
-
"vocab_size": 65519,
|
826 |
-
"n_bytes": 2633047,
|
827 |
-
"n_tokens": 663923,
|
828 |
-
"n_chars": 927311
|
829 |
-
},
|
830 |
-
"solar_10_7b.cc100-zh-Hans": {
|
831 |
-
"vocab_size": 32000,
|
832 |
-
"n_bytes": 2633047,
|
833 |
-
"n_tokens": 1041023,
|
834 |
-
"n_chars": 927311
|
835 |
-
},
|
836 |
-
"starchat_alpha.cc100-zh-Hans": {
|
837 |
-
"vocab_size": 49156,
|
838 |
-
"n_bytes": 2633047,
|
839 |
-
"n_tokens": 882018,
|
840 |
-
"n_chars": 927311
|
841 |
-
},
|
842 |
-
"switch_c_2048.cc100-zh-Hans": {
|
843 |
-
"vocab_size": 32100,
|
844 |
-
"n_bytes": 2633047,
|
845 |
-
"n_tokens": 173519,
|
846 |
-
"n_chars": 927311
|
847 |
-
},
|
848 |
-
"t5_base.cc100-zh-Hans": {
|
849 |
-
"vocab_size": 32100,
|
850 |
-
"n_bytes": 2633047,
|
851 |
-
"n_tokens": 173519,
|
852 |
-
"n_chars": 927311
|
853 |
-
},
|
854 |
-
"t5_large.cc100-zh-Hans": {
|
855 |
-
"vocab_size": 32100,
|
856 |
-
"n_bytes": 2633047,
|
857 |
-
"n_tokens": 173519,
|
858 |
-
"n_chars": 927311
|
859 |
-
},
|
860 |
-
"t5_small.cc100-zh-Hans": {
|
861 |
-
"vocab_size": 32100,
|
862 |
-
"n_bytes": 2633047,
|
863 |
-
"n_tokens": 173519,
|
864 |
-
"n_chars": 927311
|
865 |
-
},
|
866 |
-
"text_davinci_003.cc100-zh-Hans": {
|
867 |
-
"vocab_size": 50281,
|
868 |
-
"n_bytes": 2633047,
|
869 |
-
"n_tokens": 1876809,
|
870 |
-
"n_chars": 927311
|
871 |
-
},
|
872 |
-
"tigerbot_13b_chat_v2.cc100-zh-Hans": {
|
873 |
-
"vocab_size": 60515,
|
874 |
-
"n_bytes": 2633047,
|
875 |
-
"n_tokens": 577385,
|
876 |
-
"n_chars": 927311
|
877 |
-
},
|
878 |
-
"tigerbot_70b_chat_v4_4k.cc100-zh-Hans": {
|
879 |
-
"vocab_size": 65110,
|
880 |
-
"n_bytes": 2633047,
|
881 |
-
"n_tokens": 577211,
|
882 |
-
"n_chars": 927311
|
883 |
-
},
|
884 |
-
"wizardcoder_15b_v1.cc100-zh-Hans": {
|
885 |
-
"vocab_size": 49153,
|
886 |
-
"n_bytes": 2633047,
|
887 |
-
"n_tokens": 882018,
|
888 |
-
"n_chars": 927311
|
889 |
-
},
|
890 |
-
"wizardcoder_python_7b_v1.cc100-zh-Hans": {
|
891 |
-
"vocab_size": 32001,
|
892 |
-
"n_bytes": 2633047,
|
893 |
-
"n_tokens": 1330093,
|
894 |
-
"n_chars": 927311
|
895 |
-
},
|
896 |
-
"wizardlm_7b_v1.cc100-zh-Hans": {
|
897 |
-
"vocab_size": 32001,
|
898 |
-
"n_bytes": 2633047,
|
899 |
-
"n_tokens": 1330093,
|
900 |
-
"n_chars": 927311
|
901 |
-
},
|
902 |
-
"wizardmath_70b_v1.cc100-zh-Hans": {
|
903 |
-
"vocab_size": 32002,
|
904 |
-
"n_bytes": 2633047,
|
905 |
-
"n_tokens": 1330093,
|
906 |
-
"n_chars": 927311
|
907 |
-
},
|
908 |
-
"xlm_roberta.cc100-zh-Hans": {
|
909 |
-
"vocab_size": 250002,
|
910 |
-
"n_bytes": 2633047,
|
911 |
-
"n_tokens": 619844,
|
912 |
-
"n_chars": 927311
|
913 |
-
},
|
914 |
-
"yi_34b.cc100-zh-Hans": {
|
915 |
-
"vocab_size": 64000,
|
916 |
-
"n_bytes": 2633047,
|
917 |
-
"n_tokens": 588729,
|
918 |
-
"n_chars": 927311
|
919 |
-
},
|
920 |
-
"yi_6b.cc100-zh-Hans": {
|
921 |
-
"vocab_size": 64000,
|
922 |
-
"n_bytes": 2633047,
|
923 |
-
"n_tokens": 588729,
|
924 |
-
"n_chars": 927311
|
925 |
-
},
|
926 |
-
"yi_vl34b.cc100-zh-Hans": {
|
927 |
-
"vocab_size": 64000,
|
928 |
-
"n_bytes": 2633047,
|
929 |
-
"n_tokens": 596166,
|
930 |
-
"n_chars": 927311
|
931 |
-
},
|
932 |
-
"zephyr_7b_beta.cc100-zh-Hans": {
|
933 |
-
"vocab_size": 32000,
|
934 |
-
"n_bytes": 2633047,
|
935 |
-
"n_tokens": 1041023,
|
936 |
-
"n_chars": 927311
|
937 |
-
},
|
938 |
-
"amber.cc100-es": {
|
939 |
-
"vocab_size": 32000,
|
940 |
-
"n_bytes": 1664455,
|
941 |
-
"n_tokens": 492235,
|
942 |
-
"n_chars": 1630297
|
943 |
-
},
|
944 |
-
"aya_101.cc100-es": {
|
945 |
-
"vocab_size": 250100,
|
946 |
-
"n_bytes": 1664455,
|
947 |
-
"n_tokens": 472231,
|
948 |
-
"n_chars": 1630297
|
949 |
-
},
|
950 |
-
"baichuan.cc100-es": {
|
951 |
-
"vocab_size": 64000,
|
952 |
-
"n_bytes": 1664455,
|
953 |
-
"n_tokens": 585804,
|
954 |
-
"n_chars": 1630297
|
955 |
-
},
|
956 |
-
"baichuan2.cc100-es": {
|
957 |
-
"vocab_size": 125696,
|
958 |
-
"n_bytes": 1664455,
|
959 |
-
"n_tokens": 551326,
|
960 |
-
"n_chars": 1630297
|
961 |
-
},
|
962 |
-
"bert_base_cased.cc100-es": {
|
963 |
-
"vocab_size": 28996,
|
964 |
-
"n_bytes": 1664455,
|
965 |
-
"n_tokens": 630231,
|
966 |
-
"n_chars": 1630297
|
967 |
-
},
|
968 |
-
"bert_base_chinese.cc100-es": {
|
969 |
-
"vocab_size": 21128,
|
970 |
-
"n_bytes": 1664455,
|
971 |
-
"n_tokens": 609419,
|
972 |
-
"n_chars": 1630297
|
973 |
-
},
|
974 |
-
"bert_base_uncased.cc100-es": {
|
975 |
-
"vocab_size": 30522,
|
976 |
-
"n_bytes": 1664455,
|
977 |
-
"n_tokens": 558042,
|
978 |
-
"n_chars": 1630297
|
979 |
-
},
|
980 |
-
"bloom.cc100-es": {
|
981 |
-
"vocab_size": 250680,
|
982 |
-
"n_bytes": 1664455,
|
983 |
-
"n_tokens": 350793,
|
984 |
-
"n_chars": 1630297
|
985 |
-
},
|
986 |
-
"byt5_small.cc100-es": {
|
987 |
-
"vocab_size": 384,
|
988 |
-
"n_bytes": 1664455,
|
989 |
-
"n_tokens": 1674455,
|
990 |
-
"n_chars": 1630297
|
991 |
-
},
|
992 |
-
"character_glm_6b.cc100-es": {
|
993 |
-
"vocab_size": 64789,
|
994 |
-
"n_bytes": 1664455,
|
995 |
-
"n_tokens": 566501,
|
996 |
-
"n_chars": 1630297
|
997 |
-
},
|
998 |
-
"chatglm2_6b.cc100-es": {
|
999 |
-
"vocab_size": 64787,
|
1000 |
-
"n_bytes": 1664455,
|
1001 |
-
"n_tokens": 566476,
|
1002 |
-
"n_chars": 1630297
|
1003 |
-
},
|
1004 |
-
"chatglm3_6b.cc100-es": {
|
1005 |
-
"vocab_size": 64796,
|
1006 |
-
"n_bytes": 1664455,
|
1007 |
-
"n_tokens": 566501,
|
1008 |
-
"n_chars": 1630297
|
1009 |
-
},
|
1010 |
-
"chatglm_6b.cc100-es": {
|
1011 |
-
"vocab_size": 150344,
|
1012 |
-
"n_bytes": 1664455,
|
1013 |
-
"n_tokens": 514848,
|
1014 |
-
"n_chars": 1630297
|
1015 |
-
},
|
1016 |
-
"chatyuan_large_v2.cc100-es": {
|
1017 |
-
"vocab_size": 32128,
|
1018 |
-
"n_bytes": 1664455,
|
1019 |
-
"n_tokens": 889530,
|
1020 |
-
"n_chars": 1630297
|
1021 |
-
},
|
1022 |
-
"chinese_llama.cc100-es": {
|
1023 |
-
"vocab_size": 49953,
|
1024 |
-
"n_bytes": 1664455,
|
1025 |
-
"n_tokens": 486672,
|
1026 |
-
"n_chars": 1630297
|
1027 |
-
},
|
1028 |
-
"chinese_llama2.cc100-es": {
|
1029 |
-
"vocab_size": 55296,
|
1030 |
-
"n_bytes": 1664455,
|
1031 |
-
"n_tokens": 492235,
|
1032 |
-
"n_chars": 1630297
|
1033 |
-
},
|
1034 |
-
"code_davinci_002.cc100-es": {
|
1035 |
-
"vocab_size": 50281,
|
1036 |
-
"n_bytes": 1664455,
|
1037 |
-
"n_tokens": 569853,
|
1038 |
-
"n_chars": 1630297
|
1039 |
-
},
|
1040 |
-
"crystal_coder.cc100-es": {
|
1041 |
-
"vocab_size": 32022,
|
1042 |
-
"n_bytes": 1664455,
|
1043 |
-
"n_tokens": 482235,
|
1044 |
-
"n_chars": 1630297
|
1045 |
-
},
|
1046 |
-
"dbrx_instruct.cc100-es": {
|
1047 |
-
"vocab_size": 100280,
|
1048 |
-
"n_bytes": 1664455,
|
1049 |
-
"n_tokens": 433875,
|
1050 |
-
"n_chars": 1630297
|
1051 |
-
},
|
1052 |
-
"deepseek_coder_33b_instruct.cc100-es": {
|
1053 |
-
"vocab_size": 32022,
|
1054 |
-
"n_bytes": 1664455,
|
1055 |
-
"n_tokens": 523884,
|
1056 |
-
"n_chars": 1630297
|
1057 |
-
},
|
1058 |
-
"deepseek_llm_7b_base.cc100-es": {
|
1059 |
-
"vocab_size": 100015,
|
1060 |
-
"n_bytes": 1664455,
|
1061 |
-
"n_tokens": 480877,
|
1062 |
-
"n_chars": 1630297
|
1063 |
-
},
|
1064 |
-
"falcon_180b.cc100-es": {
|
1065 |
-
"vocab_size": 65024,
|
1066 |
-
"n_bytes": 1664455,
|
1067 |
-
"n_tokens": 442138,
|
1068 |
-
"n_chars": 1630297
|
1069 |
-
},
|
1070 |
-
"falcon_7b.cc100-es": {
|
1071 |
-
"vocab_size": 65024,
|
1072 |
-
"n_bytes": 1664455,
|
1073 |
-
"n_tokens": 442138,
|
1074 |
-
"n_chars": 1630297
|
1075 |
-
},
|
1076 |
-
"fastchat_t5_3b.cc100-es": {
|
1077 |
-
"vocab_size": 32110,
|
1078 |
-
"n_bytes": 1664455,
|
1079 |
-
"n_tokens": 970105,
|
1080 |
-
"n_chars": 1630297
|
1081 |
-
},
|
1082 |
-
"flan_t5_base.cc100-es": {
|
1083 |
-
"vocab_size": 32100,
|
1084 |
-
"n_bytes": 1664455,
|
1085 |
-
"n_tokens": 706405,
|
1086 |
-
"n_chars": 1630297
|
1087 |
-
},
|
1088 |
-
"gemma_7b.cc100-es": {
|
1089 |
-
"vocab_size": 256000,
|
1090 |
-
"n_bytes": 1664455,
|
1091 |
-
"n_tokens": 371321,
|
1092 |
-
"n_chars": 1630297
|
1093 |
-
},
|
1094 |
-
"gpt2.cc100-es": {
|
1095 |
-
"vocab_size": 50257,
|
1096 |
-
"n_bytes": 1664455,
|
1097 |
-
"n_tokens": 569853,
|
1098 |
-
"n_chars": 1630297
|
1099 |
-
},
|
1100 |
-
"gpt2_chinese.cc100-es": {
|
1101 |
-
"vocab_size": 21128,
|
1102 |
-
"n_bytes": 1664455,
|
1103 |
-
"n_tokens": 703390,
|
1104 |
-
"n_chars": 1630297
|
1105 |
-
},
|
1106 |
-
"gpt_35_turbo.cc100-es": {
|
1107 |
-
"vocab_size": 100277,
|
1108 |
-
"n_bytes": 1664455,
|
1109 |
-
"n_tokens": 433875,
|
1110 |
-
"n_chars": 1630297
|
1111 |
-
},
|
1112 |
-
"gpt_4.cc100-es": {
|
1113 |
-
"vocab_size": 100277,
|
1114 |
-
"n_bytes": 1664455,
|
1115 |
-
"n_tokens": 433875,
|
1116 |
-
"n_chars": 1630297
|
1117 |
-
},
|
1118 |
-
"gpt_nexo_20b.cc100-es": {
|
1119 |
-
"vocab_size": 50277,
|
1120 |
-
"n_bytes": 1664455,
|
1121 |
-
"n_tokens": 494577,
|
1122 |
-
"n_chars": 1630297
|
1123 |
-
},
|
1124 |
-
"grok_1.cc100-es": {
|
1125 |
-
"vocab_size": 131072,
|
1126 |
-
"n_bytes": 1664455,
|
1127 |
-
"n_tokens": 449392,
|
1128 |
-
"n_chars": 1630297
|
1129 |
-
},
|
1130 |
-
"internlm2_chat_7b.cc100-es": {
|
1131 |
-
"vocab_size": 92544,
|
1132 |
-
"n_bytes": 1664455,
|
1133 |
-
"n_tokens": 518871,
|
1134 |
-
"n_chars": 1630297
|
1135 |
-
},
|
1136 |
-
"internlm2_math_7b.cc100-es": {
|
1137 |
-
"vocab_size": 92544,
|
1138 |
-
"n_bytes": 1664455,
|
1139 |
-
"n_tokens": 518871,
|
1140 |
-
"n_chars": 1630297
|
1141 |
-
},
|
1142 |
-
"internlm_chat_7b.cc100-es": {
|
1143 |
-
"vocab_size": 103168,
|
1144 |
-
"n_bytes": 1664455,
|
1145 |
-
"n_tokens": 516572,
|
1146 |
-
"n_chars": 1630297
|
1147 |
-
},
|
1148 |
-
"internlm_xcomposer_7b.cc100-es": {
|
1149 |
-
"vocab_size": 103168,
|
1150 |
-
"n_bytes": 1664455,
|
1151 |
-
"n_tokens": 516572,
|
1152 |
-
"n_chars": 1630297
|
1153 |
-
},
|
1154 |
-
"jamba_v0_1.cc100-es": {
|
1155 |
-
"vocab_size": 65536,
|
1156 |
-
"n_bytes": 1664455,
|
1157 |
-
"n_tokens": 420883,
|
1158 |
-
"n_chars": 1630297
|
1159 |
-
},
|
1160 |
-
"kplug.cc100-es": {
|
1161 |
-
"vocab_size": 10261,
|
1162 |
-
"n_bytes": 1664455,
|
1163 |
-
"n_tokens": 704804,
|
1164 |
-
"n_chars": 1630297
|
1165 |
-
},
|
1166 |
-
"llama.cc100-es": {
|
1167 |
-
"vocab_size": 32000,
|
1168 |
-
"n_bytes": 1664455,
|
1169 |
-
"n_tokens": 492235,
|
1170 |
-
"n_chars": 1630297
|
1171 |
-
},
|
1172 |
-
"llama2.cc100-es": {
|
1173 |
-
"vocab_size": 32001,
|
1174 |
-
"n_bytes": 1664455,
|
1175 |
-
"n_tokens": 492235,
|
1176 |
-
"n_chars": 1630297
|
1177 |
-
},
|
1178 |
-
"llama3.cc100-es": {
|
1179 |
-
"vocab_size": 128256,
|
1180 |
-
"n_bytes": 1664455,
|
1181 |
-
"n_tokens": 433289,
|
1182 |
-
"n_chars": 1630297
|
1183 |
-
},
|
1184 |
-
"mistral_7b.cc100-es": {
|
1185 |
-
"vocab_size": 32000,
|
1186 |
-
"n_bytes": 1664455,
|
1187 |
-
"n_tokens": 513915,
|
1188 |
-
"n_chars": 1630297
|
1189 |
-
},
|
1190 |
-
"mixtral_8_7b.cc100-es": {
|
1191 |
-
"vocab_size": 32000,
|
1192 |
-
"n_bytes": 1664455,
|
1193 |
-
"n_tokens": 513915,
|
1194 |
-
"n_chars": 1630297
|
1195 |
-
},
|
1196 |
-
"mobilebert_uncased.cc100-es": {
|
1197 |
-
"vocab_size": 30522,
|
1198 |
-
"n_bytes": 1664455,
|
1199 |
-
"n_tokens": 558042,
|
1200 |
-
"n_chars": 1630297
|
1201 |
-
},
|
1202 |
-
"moss.cc100-es": {
|
1203 |
-
"vocab_size": 106072,
|
1204 |
-
"n_bytes": 1664455,
|
1205 |
-
"n_tokens": 568539,
|
1206 |
-
"n_chars": 1630297
|
1207 |
-
},
|
1208 |
-
"mt5_large.cc100-es": {
|
1209 |
-
"vocab_size": 250100,
|
1210 |
-
"n_bytes": 1664455,
|
1211 |
-
"n_tokens": 472231,
|
1212 |
-
"n_chars": 1630297
|
1213 |
-
},
|
1214 |
-
"olmo_7b.cc100-es": {
|
1215 |
-
"vocab_size": 50280,
|
1216 |
-
"n_bytes": 1664455,
|
1217 |
-
"n_tokens": 494577,
|
1218 |
-
"n_chars": 1630297
|
1219 |
-
},
|
1220 |
-
"orion_14b_chat.cc100-es": {
|
1221 |
-
"vocab_size": 84608,
|
1222 |
-
"n_bytes": 1664455,
|
1223 |
-
"n_tokens": 628571,
|
1224 |
-
"n_chars": 1630297
|
1225 |
-
},
|
1226 |
-
"phi_1.cc100-es": {
|
1227 |
-
"vocab_size": 50295,
|
1228 |
-
"n_bytes": 1664455,
|
1229 |
-
"n_tokens": 569853,
|
1230 |
-
"n_chars": 1630297
|
1231 |
-
},
|
1232 |
-
"phi_2.cc100-es": {
|
1233 |
-
"vocab_size": 50295,
|
1234 |
-
"n_bytes": 1664455,
|
1235 |
-
"n_tokens": 569853,
|
1236 |
-
"n_chars": 1630297
|
1237 |
-
},
|
1238 |
-
"phi_3_mini.cc100-es": {
|
1239 |
-
"vocab_size": 32011,
|
1240 |
-
"n_bytes": 1664455,
|
1241 |
-
"n_tokens": 492235,
|
1242 |
-
"n_chars": 1630297
|
1243 |
-
},
|
1244 |
-
"pko_t5_large.cc100-es": {
|
1245 |
-
"vocab_size": 50358,
|
1246 |
-
"n_bytes": 1664455,
|
1247 |
-
"n_tokens": 1134056,
|
1248 |
-
"n_chars": 1630297
|
1249 |
-
},
|
1250 |
-
"prompt_clue.cc100-es": {
|
1251 |
-
"vocab_size": 32128,
|
1252 |
-
"n_bytes": 1664455,
|
1253 |
-
"n_tokens": 889530,
|
1254 |
-
"n_chars": 1630297
|
1255 |
-
},
|
1256 |
-
"qwen1_5_14b_chat.cc100-es": {
|
1257 |
-
"vocab_size": 151646,
|
1258 |
-
"n_bytes": 1664455,
|
1259 |
-
"n_tokens": 434264,
|
1260 |
-
"n_chars": 1630297
|
1261 |
-
},
|
1262 |
-
"qwen_1_8b_chat.cc100-es": {
|
1263 |
-
"vocab_size": 151851,
|
1264 |
-
"n_bytes": 1664455,
|
1265 |
-
"n_tokens": 434264,
|
1266 |
-
"n_chars": 1630297
|
1267 |
-
},
|
1268 |
-
"qwen_72b_chat.cc100-es": {
|
1269 |
-
"vocab_size": 151851,
|
1270 |
-
"n_bytes": 1664455,
|
1271 |
-
"n_tokens": 434264,
|
1272 |
-
"n_chars": 1630297
|
1273 |
-
},
|
1274 |
-
"qwen_7b_chat.cc100-es": {
|
1275 |
-
"vocab_size": 151851,
|
1276 |
-
"n_bytes": 1664455,
|
1277 |
-
"n_tokens": 434264,
|
1278 |
-
"n_chars": 1630297
|
1279 |
-
},
|
1280 |
-
"roberta_chinese_clue.cc100-es": {
|
1281 |
-
"vocab_size": 8021,
|
1282 |
-
"n_bytes": 1664455,
|
1283 |
-
"n_tokens": 866564,
|
1284 |
-
"n_chars": 1630297
|
1285 |
-
},
|
1286 |
-
"skywork_13b_base.cc100-es": {
|
1287 |
-
"vocab_size": 65519,
|
1288 |
-
"n_bytes": 1664455,
|
1289 |
-
"n_tokens": 492211,
|
1290 |
-
"n_chars": 1630297
|
1291 |
-
},
|
1292 |
-
"skywork_13b_math.cc100-es": {
|
1293 |
-
"vocab_size": 65519,
|
1294 |
-
"n_bytes": 1664455,
|
1295 |
-
"n_tokens": 492211,
|
1296 |
-
"n_chars": 1630297
|
1297 |
-
},
|
1298 |
-
"solar_10_7b.cc100-es": {
|
1299 |
-
"vocab_size": 32000,
|
1300 |
-
"n_bytes": 1664455,
|
1301 |
-
"n_tokens": 513915,
|
1302 |
-
"n_chars": 1630297
|
1303 |
-
},
|
1304 |
-
"starchat_alpha.cc100-es": {
|
1305 |
-
"vocab_size": 49156,
|
1306 |
-
"n_bytes": 1664455,
|
1307 |
-
"n_tokens": 530592,
|
1308 |
-
"n_chars": 1630297
|
1309 |
-
},
|
1310 |
-
"switch_c_2048.cc100-es": {
|
1311 |
-
"vocab_size": 32100,
|
1312 |
-
"n_bytes": 1664455,
|
1313 |
-
"n_tokens": 706400,
|
1314 |
-
"n_chars": 1630297
|
1315 |
-
},
|
1316 |
-
"t5_base.cc100-es": {
|
1317 |
-
"vocab_size": 32100,
|
1318 |
-
"n_bytes": 1664455,
|
1319 |
-
"n_tokens": 706400,
|
1320 |
-
"n_chars": 1630297
|
1321 |
-
},
|
1322 |
-
"t5_large.cc100-es": {
|
1323 |
-
"vocab_size": 32100,
|
1324 |
-
"n_bytes": 1664455,
|
1325 |
-
"n_tokens": 706400,
|
1326 |
-
"n_chars": 1630297
|
1327 |
-
},
|
1328 |
-
"t5_small.cc100-es": {
|
1329 |
-
"vocab_size": 32100,
|
1330 |
-
"n_bytes": 1664455,
|
1331 |
-
"n_tokens": 706400,
|
1332 |
-
"n_chars": 1630297
|
1333 |
-
},
|
1334 |
-
"text_davinci_003.cc100-es": {
|
1335 |
-
"vocab_size": 50281,
|
1336 |
-
"n_bytes": 1664455,
|
1337 |
-
"n_tokens": 569853,
|
1338 |
-
"n_chars": 1630297
|
1339 |
-
},
|
1340 |
-
"tigerbot_13b_chat_v2.cc100-es": {
|
1341 |
-
"vocab_size": 60515,
|
1342 |
-
"n_bytes": 1664455,
|
1343 |
-
"n_tokens": 482553,
|
1344 |
-
"n_chars": 1630297
|
1345 |
-
},
|
1346 |
-
"tigerbot_70b_chat_v4_4k.cc100-es": {
|
1347 |
-
"vocab_size": 65110,
|
1348 |
-
"n_bytes": 1664455,
|
1349 |
-
"n_tokens": 484099,
|
1350 |
-
"n_chars": 1630297
|
1351 |
-
},
|
1352 |
-
"wizardcoder_15b_v1.cc100-es": {
|
1353 |
-
"vocab_size": 49153,
|
1354 |
-
"n_bytes": 1664455,
|
1355 |
-
"n_tokens": 530592,
|
1356 |
-
"n_chars": 1630297
|
1357 |
-
},
|
1358 |
-
"wizardcoder_python_7b_v1.cc100-es": {
|
1359 |
-
"vocab_size": 32001,
|
1360 |
-
"n_bytes": 1664455,
|
1361 |
-
"n_tokens": 492235,
|
1362 |
-
"n_chars": 1630297
|
1363 |
-
},
|
1364 |
-
"wizardlm_7b_v1.cc100-es": {
|
1365 |
-
"vocab_size": 32001,
|
1366 |
-
"n_bytes": 1664455,
|
1367 |
-
"n_tokens": 492235,
|
1368 |
-
"n_chars": 1630297
|
1369 |
-
},
|
1370 |
-
"wizardmath_70b_v1.cc100-es": {
|
1371 |
-
"vocab_size": 32002,
|
1372 |
-
"n_bytes": 1664455,
|
1373 |
-
"n_tokens": 492235,
|
1374 |
-
"n_chars": 1630297
|
1375 |
-
},
|
1376 |
-
"xlm_roberta.cc100-es": {
|
1377 |
-
"vocab_size": 250002,
|
1378 |
-
"n_bytes": 1664455,
|
1379 |
-
"n_tokens": 399850,
|
1380 |
-
"n_chars": 1630297
|
1381 |
-
},
|
1382 |
-
"yi_34b.cc100-es": {
|
1383 |
-
"vocab_size": 64000,
|
1384 |
-
"n_bytes": 1664455,
|
1385 |
-
"n_tokens": 577018,
|
1386 |
-
"n_chars": 1630297
|
1387 |
-
},
|
1388 |
-
"yi_6b.cc100-es": {
|
1389 |
-
"vocab_size": 64000,
|
1390 |
-
"n_bytes": 1664455,
|
1391 |
-
"n_tokens": 577018,
|
1392 |
-
"n_chars": 1630297
|
1393 |
-
},
|
1394 |
-
"yi_vl34b.cc100-es": {
|
1395 |
-
"vocab_size": 64000,
|
1396 |
-
"n_bytes": 1664455,
|
1397 |
-
"n_tokens": 576794,
|
1398 |
-
"n_chars": 1630297
|
1399 |
-
},
|
1400 |
-
"zephyr_7b_beta.cc100-es": {
|
1401 |
-
"vocab_size": 32000,
|
1402 |
-
"n_bytes": 1664455,
|
1403 |
-
"n_tokens": 513915,
|
1404 |
-
"n_chars": 1630297
|
1405 |
-
},
|
1406 |
-
"aya_101.cc100-fr": {
|
1407 |
-
"vocab_size": 250100,
|
1408 |
-
"n_bytes": 1540504,
|
1409 |
-
"n_tokens": 470944,
|
1410 |
-
"n_chars": 1484970
|
1411 |
-
},
|
1412 |
-
"baichuan.cc100-fr": {
|
1413 |
-
"vocab_size": 64000,
|
1414 |
-
"n_bytes": 1540504,
|
1415 |
-
"n_tokens": 540430,
|
1416 |
-
"n_chars": 1484970
|
1417 |
-
},
|
1418 |
-
"baichuan2.cc100-fr": {
|
1419 |
-
"vocab_size": 125696,
|
1420 |
-
"n_bytes": 1540504,
|
1421 |
-
"n_tokens": 512313,
|
1422 |
-
"n_chars": 1484970
|
1423 |
-
},
|
1424 |
-
"bert_base_cased.cc100-fr": {
|
1425 |
-
"vocab_size": 28996,
|
1426 |
-
"n_bytes": 1540504,
|
1427 |
-
"n_tokens": 583210,
|
1428 |
-
"n_chars": 1484970
|
1429 |
-
},
|
1430 |
-
"bert_base_chinese.cc100-fr": {
|
1431 |
-
"vocab_size": 21128,
|
1432 |
-
"n_bytes": 1540504,
|
1433 |
-
"n_tokens": 553134,
|
1434 |
-
"n_chars": 1484970
|
1435 |
-
},
|
1436 |
-
"bert_base_uncased.cc100-fr": {
|
1437 |
-
"vocab_size": 30522,
|
1438 |
-
"n_bytes": 1540504,
|
1439 |
-
"n_tokens": 504075,
|
1440 |
-
"n_chars": 1484970
|
1441 |
-
},
|
1442 |
-
"bloom.cc100-fr": {
|
1443 |
-
"vocab_size": 250680,
|
1444 |
-
"n_bytes": 1540504,
|
1445 |
-
"n_tokens": 321639,
|
1446 |
-
"n_chars": 1484970
|
1447 |
-
},
|
1448 |
-
"byt5_small.cc100-fr": {
|
1449 |
-
"vocab_size": 384,
|
1450 |
-
"n_bytes": 1540504,
|
1451 |
-
"n_tokens": 1550504,
|
1452 |
-
"n_chars": 1484970
|
1453 |
-
},
|
1454 |
-
"character_glm_6b.cc100-fr": {
|
1455 |
-
"vocab_size": 64789,
|
1456 |
-
"n_bytes": 1540504,
|
1457 |
-
"n_tokens": 515052,
|
1458 |
-
"n_chars": 1484970
|
1459 |
-
},
|
1460 |
-
"chatglm2_6b.cc100-fr": {
|
1461 |
-
"vocab_size": 64787,
|
1462 |
-
"n_bytes": 1540504,
|
1463 |
-
"n_tokens": 515028,
|
1464 |
-
"n_chars": 1484970
|
1465 |
-
},
|
1466 |
-
"chatglm3_6b.cc100-fr": {
|
1467 |
-
"vocab_size": 64796,
|
1468 |
-
"n_bytes": 1540504,
|
1469 |
-
"n_tokens": 515052,
|
1470 |
-
"n_chars": 1484970
|
1471 |
-
},
|
1472 |
-
"chatglm_6b.cc100-fr": {
|
1473 |
-
"vocab_size": 150344,
|
1474 |
-
"n_bytes": 1540504,
|
1475 |
-
"n_tokens": 499261,
|
1476 |
-
"n_chars": 1484970
|
1477 |
-
},
|
1478 |
-
"chatyuan_large_v2.cc100-fr": {
|
1479 |
-
"vocab_size": 32128,
|
1480 |
-
"n_bytes": 1540504,
|
1481 |
-
"n_tokens": 822012,
|
1482 |
-
"n_chars": 1484970
|
1483 |
-
},
|
1484 |
-
"chinese_llama.cc100-fr": {
|
1485 |
-
"vocab_size": 49953,
|
1486 |
-
"n_bytes": 1540504,
|
1487 |
-
"n_tokens": 450352,
|
1488 |
-
"n_chars": 1484970
|
1489 |
-
},
|
1490 |
-
"chinese_llama2.cc100-fr": {
|
1491 |
-
"vocab_size": 55296,
|
1492 |
-
"n_bytes": 1540504,
|
1493 |
-
"n_tokens": 457243,
|
1494 |
-
"n_chars": 1484970
|
1495 |
-
},
|
1496 |
-
"code_davinci_002.cc100-fr": {
|
1497 |
-
"vocab_size": 50281,
|
1498 |
-
"n_bytes": 1540504,
|
1499 |
-
"n_tokens": 521776,
|
1500 |
-
"n_chars": 1484970
|
1501 |
-
},
|
1502 |
-
"crystal_coder.cc100-fr": {
|
1503 |
-
"vocab_size": 32022,
|
1504 |
-
"n_bytes": 1540504,
|
1505 |
-
"n_tokens": 447243,
|
1506 |
-
"n_chars": 1484970
|
1507 |
-
},
|
1508 |
-
"dbrx_instruct.cc100-fr": {
|
1509 |
-
"vocab_size": 100280,
|
1510 |
-
"n_bytes": 1540504,
|
1511 |
-
"n_tokens": 412685,
|
1512 |
-
"n_chars": 1484970
|
1513 |
-
},
|
1514 |
-
"deepseek_coder_33b_instruct.cc100-fr": {
|
1515 |
-
"vocab_size": 32022,
|
1516 |
-
"n_bytes": 1540504,
|
1517 |
-
"n_tokens": 537538,
|
1518 |
-
"n_chars": 1484970
|
1519 |
-
},
|
1520 |
-
"deepseek_llm_7b_base.cc100-fr": {
|
1521 |
-
"vocab_size": 100015,
|
1522 |
-
"n_bytes": 1540504,
|
1523 |
-
"n_tokens": 507693,
|
1524 |
-
"n_chars": 1484970
|
1525 |
-
},
|
1526 |
-
"falcon_180b.cc100-fr": {
|
1527 |
-
"vocab_size": 65024,
|
1528 |
-
"n_bytes": 1540504,
|
1529 |
-
"n_tokens": 407853,
|
1530 |
-
"n_chars": 1484970
|
1531 |
-
},
|
1532 |
-
"falcon_7b.cc100-fr": {
|
1533 |
-
"vocab_size": 65024,
|
1534 |
-
"n_bytes": 1540504,
|
1535 |
-
"n_tokens": 407853,
|
1536 |
-
"n_chars": 1484970
|
1537 |
-
},
|
1538 |
-
"fastchat_t5_3b.cc100-fr": {
|
1539 |
-
"vocab_size": 32110,
|
1540 |
-
"n_bytes": 1540504,
|
1541 |
-
"n_tokens": 717675,
|
1542 |
-
"n_chars": 1484970
|
1543 |
-
},
|
1544 |
-
"flan_t5_base.cc100-fr": {
|
1545 |
-
"vocab_size": 32100,
|
1546 |
-
"n_bytes": 1540504,
|
1547 |
-
"n_tokens": 476135,
|
1548 |
-
"n_chars": 1484970
|
1549 |
-
},
|
1550 |
-
"gemma_7b.cc100-fr": {
|
1551 |
-
"vocab_size": 256000,
|
1552 |
-
"n_bytes": 1540504,
|
1553 |
-
"n_tokens": 374551,
|
1554 |
-
"n_chars": 1484970
|
1555 |
-
},
|
1556 |
-
"gpt2.cc100-fr": {
|
1557 |
-
"vocab_size": 50257,
|
1558 |
-
"n_bytes": 1540504,
|
1559 |
-
"n_tokens": 521776,
|
1560 |
-
"n_chars": 1484970
|
1561 |
-
},
|
1562 |
-
"gpt2_chinese.cc100-fr": {
|
1563 |
-
"vocab_size": 21128,
|
1564 |
-
"n_bytes": 1540504,
|
1565 |
-
"n_tokens": 636442,
|
1566 |
-
"n_chars": 1484970
|
1567 |
-
},
|
1568 |
-
"gpt_35_turbo.cc100-fr": {
|
1569 |
-
"vocab_size": 100277,
|
1570 |
-
"n_bytes": 1540504,
|
1571 |
-
"n_tokens": 412685,
|
1572 |
-
"n_chars": 1484970
|
1573 |
-
},
|
1574 |
-
"gpt_4.cc100-fr": {
|
1575 |
-
"vocab_size": 100277,
|
1576 |
-
"n_bytes": 1540504,
|
1577 |
-
"n_tokens": 412685,
|
1578 |
-
"n_chars": 1484970
|
1579 |
-
},
|
1580 |
-
"gpt_nexo_20b.cc100-fr": {
|
1581 |
-
"vocab_size": 50277,
|
1582 |
-
"n_bytes": 1540504,
|
1583 |
-
"n_tokens": 458961,
|
1584 |
-
"n_chars": 1484970
|
1585 |
-
},
|
1586 |
-
"grok_1.cc100-fr": {
|
1587 |
-
"vocab_size": 131072,
|
1588 |
-
"n_bytes": 1540504,
|
1589 |
-
"n_tokens": 428298,
|
1590 |
-
"n_chars": 1484970
|
1591 |
-
},
|
1592 |
-
"internlm2_chat_7b.cc100-fr": {
|
1593 |
-
"vocab_size": 92544,
|
1594 |
-
"n_bytes": 1540504,
|
1595 |
-
"n_tokens": 496629,
|
1596 |
-
"n_chars": 1484970
|
1597 |
-
},
|
1598 |
-
"internlm2_math_7b.cc100-fr": {
|
1599 |
-
"vocab_size": 92544,
|
1600 |
-
"n_bytes": 1540504,
|
1601 |
-
"n_tokens": 496629,
|
1602 |
-
"n_chars": 1484970
|
1603 |
-
},
|
1604 |
-
"internlm_chat_7b.cc100-fr": {
|
1605 |
-
"vocab_size": 103168,
|
1606 |
-
"n_bytes": 1540504,
|
1607 |
-
"n_tokens": 495045,
|
1608 |
-
"n_chars": 1484970
|
1609 |
-
},
|
1610 |
-
"internlm_xcomposer_7b.cc100-fr": {
|
1611 |
-
"vocab_size": 103168,
|
1612 |
-
"n_bytes": 1540504,
|
1613 |
-
"n_tokens": 495045,
|
1614 |
-
"n_chars": 1484970
|
1615 |
-
},
|
1616 |
-
"jamba_v0_1.cc100-fr": {
|
1617 |
-
"vocab_size": 65536,
|
1618 |
-
"n_bytes": 1540504,
|
1619 |
-
"n_tokens": 412899,
|
1620 |
-
"n_chars": 1484970
|
1621 |
-
},
|
1622 |
-
"kplug.cc100-fr": {
|
1623 |
-
"vocab_size": 10261,
|
1624 |
-
"n_bytes": 1540504,
|
1625 |
-
"n_tokens": 638107,
|
1626 |
-
"n_chars": 1484970
|
1627 |
-
},
|
1628 |
-
"llama.cc100-fr": {
|
1629 |
-
"vocab_size": 32000,
|
1630 |
-
"n_bytes": 1540504,
|
1631 |
-
"n_tokens": 457243,
|
1632 |
-
"n_chars": 1484970
|
1633 |
-
},
|
1634 |
-
"llama2.cc100-fr": {
|
1635 |
-
"vocab_size": 32001,
|
1636 |
-
"n_bytes": 1540504,
|
1637 |
-
"n_tokens": 457243,
|
1638 |
-
"n_chars": 1484970
|
1639 |
-
},
|
1640 |
-
"llama3.cc100-fr": {
|
1641 |
-
"vocab_size": 128256,
|
1642 |
-
"n_bytes": 1540504,
|
1643 |
-
"n_tokens": 412146,
|
1644 |
-
"n_chars": 1484970
|
1645 |
-
},
|
1646 |
-
"mistral_7b.cc100-fr": {
|
1647 |
-
"vocab_size": 32000,
|
1648 |
-
"n_bytes": 1540504,
|
1649 |
-
"n_tokens": 476666,
|
1650 |
-
"n_chars": 1484970
|
1651 |
-
},
|
1652 |
-
"mixtral_8_7b.cc100-fr": {
|
1653 |
-
"vocab_size": 32000,
|
1654 |
-
"n_bytes": 1540504,
|
1655 |
-
"n_tokens": 476666,
|
1656 |
-
"n_chars": 1484970
|
1657 |
-
},
|
1658 |
-
"mobilebert_uncased.cc100-fr": {
|
1659 |
-
"vocab_size": 30522,
|
1660 |
-
"n_bytes": 1540504,
|
1661 |
-
"n_tokens": 504075,
|
1662 |
-
"n_chars": 1484970
|
1663 |
-
},
|
1664 |
-
"moss.cc100-fr": {
|
1665 |
-
"vocab_size": 106072,
|
1666 |
-
"n_bytes": 1540504,
|
1667 |
-
"n_tokens": 515669,
|
1668 |
-
"n_chars": 1484970
|
1669 |
-
},
|
1670 |
-
"mt5_large.cc100-fr": {
|
1671 |
-
"vocab_size": 250100,
|
1672 |
-
"n_bytes": 1540504,
|
1673 |
-
"n_tokens": 470944,
|
1674 |
-
"n_chars": 1484970
|
1675 |
-
},
|
1676 |
-
"olmo_7b.cc100-fr": {
|
1677 |
-
"vocab_size": 50280,
|
1678 |
-
"n_bytes": 1540504,
|
1679 |
-
"n_tokens": 458961,
|
1680 |
-
"n_chars": 1484970
|
1681 |
-
},
|
1682 |
-
"orion_14b_chat.cc100-fr": {
|
1683 |
-
"vocab_size": 84608,
|
1684 |
-
"n_bytes": 1540504,
|
1685 |
-
"n_tokens": 564107,
|
1686 |
-
"n_chars": 1484970
|
1687 |
-
},
|
1688 |
-
"phi_1.cc100-fr": {
|
1689 |
-
"vocab_size": 50295,
|
1690 |
-
"n_bytes": 1540504,
|
1691 |
-
"n_tokens": 521776,
|
1692 |
-
"n_chars": 1484970
|
1693 |
-
},
|
1694 |
-
"phi_2.cc100-fr": {
|
1695 |
-
"vocab_size": 50295,
|
1696 |
-
"n_bytes": 1540504,
|
1697 |
-
"n_tokens": 521776,
|
1698 |
-
"n_chars": 1484970
|
1699 |
-
},
|
1700 |
-
"phi_3_mini.cc100-fr": {
|
1701 |
-
"vocab_size": 32011,
|
1702 |
-
"n_bytes": 1540504,
|
1703 |
-
"n_tokens": 457243,
|
1704 |
-
"n_chars": 1484970
|
1705 |
-
},
|
1706 |
-
"pko_t5_large.cc100-fr": {
|
1707 |
-
"vocab_size": 50358,
|
1708 |
-
"n_bytes": 1540504,
|
1709 |
-
"n_tokens": 1044665,
|
1710 |
-
"n_chars": 1484970
|
1711 |
-
},
|
1712 |
-
"prompt_clue.cc100-fr": {
|
1713 |
-
"vocab_size": 32128,
|
1714 |
-
"n_bytes": 1540504,
|
1715 |
-
"n_tokens": 822012,
|
1716 |
-
"n_chars": 1484970
|
1717 |
-
},
|
1718 |
-
"qwen1_5_14b_chat.cc100-fr": {
|
1719 |
-
"vocab_size": 151646,
|
1720 |
-
"n_bytes": 1540504,
|
1721 |
-
"n_tokens": 413637,
|
1722 |
-
"n_chars": 1484970
|
1723 |
-
},
|
1724 |
-
"qwen_1_8b_chat.cc100-fr": {
|
1725 |
-
"vocab_size": 151851,
|
1726 |
-
"n_bytes": 1540504,
|
1727 |
-
"n_tokens": 413637,
|
1728 |
-
"n_chars": 1484970
|
1729 |
-
},
|
1730 |
-
"qwen_72b_chat.cc100-fr": {
|
1731 |
-
"vocab_size": 151851,
|
1732 |
-
"n_bytes": 1540504,
|
1733 |
-
"n_tokens": 413637,
|
1734 |
-
"n_chars": 1484970
|
1735 |
-
},
|
1736 |
-
"qwen_7b_chat.cc100-fr": {
|
1737 |
-
"vocab_size": 151851,
|
1738 |
-
"n_bytes": 1540504,
|
1739 |
-
"n_tokens": 413637,
|
1740 |
-
"n_chars": 1484970
|
1741 |
-
},
|
1742 |
-
"roberta_chinese_clue.cc100-fr": {
|
1743 |
-
"vocab_size": 8021,
|
1744 |
-
"n_bytes": 1540504,
|
1745 |
-
"n_tokens": 787363,
|
1746 |
-
"n_chars": 1484970
|
1747 |
-
},
|
1748 |
-
"skywork_13b_base.cc100-fr": {
|
1749 |
-
"vocab_size": 65519,
|
1750 |
-
"n_bytes": 1540504,
|
1751 |
-
"n_tokens": 457233,
|
1752 |
-
"n_chars": 1484970
|
1753 |
-
},
|
1754 |
-
"skywork_13b_math.cc100-fr": {
|
1755 |
-
"vocab_size": 65519,
|
1756 |
-
"n_bytes": 1540504,
|
1757 |
-
"n_tokens": 457233,
|
1758 |
-
"n_chars": 1484970
|
1759 |
-
},
|
1760 |
-
"solar_10_7b.cc100-fr": {
|
1761 |
-
"vocab_size": 32000,
|
1762 |
-
"n_bytes": 1540504,
|
1763 |
-
"n_tokens": 476666,
|
1764 |
-
"n_chars": 1484970
|
1765 |
-
},
|
1766 |
-
"starchat_alpha.cc100-fr": {
|
1767 |
-
"vocab_size": 49156,
|
1768 |
-
"n_bytes": 1540504,
|
1769 |
-
"n_tokens": 509958,
|
1770 |
-
"n_chars": 1484970
|
1771 |
-
},
|
1772 |
-
"switch_c_2048.cc100-fr": {
|
1773 |
-
"vocab_size": 32100,
|
1774 |
-
"n_bytes": 1540504,
|
1775 |
-
"n_tokens": 476133,
|
1776 |
-
"n_chars": 1484970
|
1777 |
-
},
|
1778 |
-
"t5_base.cc100-fr": {
|
1779 |
-
"vocab_size": 32100,
|
1780 |
-
"n_bytes": 1540504,
|
1781 |
-
"n_tokens": 476133,
|
1782 |
-
"n_chars": 1484970
|
1783 |
-
},
|
1784 |
-
"t5_large.cc100-fr": {
|
1785 |
-
"vocab_size": 32100,
|
1786 |
-
"n_bytes": 1540504,
|
1787 |
-
"n_tokens": 476133,
|
1788 |
-
"n_chars": 1484970
|
1789 |
-
},
|
1790 |
-
"t5_small.cc100-fr": {
|
1791 |
-
"vocab_size": 32100,
|
1792 |
-
"n_bytes": 1540504,
|
1793 |
-
"n_tokens": 476133,
|
1794 |
-
"n_chars": 1484970
|
1795 |
-
},
|
1796 |
-
"text_davinci_003.cc100-fr": {
|
1797 |
-
"vocab_size": 50281,
|
1798 |
-
"n_bytes": 1540504,
|
1799 |
-
"n_tokens": 521776,
|
1800 |
-
"n_chars": 1484970
|
1801 |
-
},
|
1802 |
-
"tigerbot_13b_chat_v2.cc100-fr": {
|
1803 |
-
"vocab_size": 60515,
|
1804 |
-
"n_bytes": 1540504,
|
1805 |
-
"n_tokens": 447372,
|
1806 |
-
"n_chars": 1484970
|
1807 |
-
},
|
1808 |
-
"tigerbot_70b_chat_v4_4k.cc100-fr": {
|
1809 |
-
"vocab_size": 65110,
|
1810 |
-
"n_bytes": 1540504,
|
1811 |
-
"n_tokens": 448567,
|
1812 |
-
"n_chars": 1484970
|
1813 |
-
},
|
1814 |
-
"wizardcoder_15b_v1.cc100-fr": {
|
1815 |
-
"vocab_size": 49153,
|
1816 |
-
"n_bytes": 1540504,
|
1817 |
-
"n_tokens": 509958,
|
1818 |
-
"n_chars": 1484970
|
1819 |
-
},
|
1820 |
-
"wizardcoder_python_7b_v1.cc100-fr": {
|
1821 |
-
"vocab_size": 32001,
|
1822 |
-
"n_bytes": 1540504,
|
1823 |
-
"n_tokens": 457243,
|
1824 |
-
"n_chars": 1484970
|
1825 |
-
},
|
1826 |
-
"wizardlm_7b_v1.cc100-fr": {
|
1827 |
-
"vocab_size": 32001,
|
1828 |
-
"n_bytes": 1540504,
|
1829 |
-
"n_tokens": 457243,
|
1830 |
-
"n_chars": 1484970
|
1831 |
-
},
|
1832 |
-
"wizardmath_70b_v1.cc100-fr": {
|
1833 |
-
"vocab_size": 32002,
|
1834 |
-
"n_bytes": 1540504,
|
1835 |
-
"n_tokens": 457243,
|
1836 |
-
"n_chars": 1484970
|
1837 |
-
},
|
1838 |
-
"xlm_roberta.cc100-fr": {
|
1839 |
-
"vocab_size": 250002,
|
1840 |
-
"n_bytes": 1540504,
|
1841 |
-
"n_tokens": 405041,
|
1842 |
-
"n_chars": 1484970
|
1843 |
-
},
|
1844 |
-
"yi_34b.cc100-fr": {
|
1845 |
-
"vocab_size": 64000,
|
1846 |
-
"n_bytes": 1540504,
|
1847 |
-
"n_tokens": 533106,
|
1848 |
-
"n_chars": 1484970
|
1849 |
-
},
|
1850 |
-
"yi_6b.cc100-fr": {
|
1851 |
-
"vocab_size": 64000,
|
1852 |
-
"n_bytes": 1540504,
|
1853 |
-
"n_tokens": 533106,
|
1854 |
-
"n_chars": 1484970
|
1855 |
-
},
|
1856 |
-
"yi_vl34b.cc100-fr": {
|
1857 |
-
"vocab_size": 64000,
|
1858 |
-
"n_bytes": 1540504,
|
1859 |
-
"n_tokens": 532288,
|
1860 |
-
"n_chars": 1484970
|
1861 |
-
},
|
1862 |
-
"zephyr_7b_beta.cc100-fr": {
|
1863 |
-
"vocab_size": 32000,
|
1864 |
-
"n_bytes": 1540504,
|
1865 |
-
"n_tokens": 476666,
|
1866 |
-
"n_chars": 1484970
|
1867 |
-
},
|
1868 |
-
"gpt_neox_japanese_2_7b.cc100-en": {
|
1869 |
-
"vocab_size": 32000,
|
1870 |
-
"n_bytes": 1124813,
|
1871 |
-
"n_tokens": 1121413,
|
1872 |
-
"n_chars": 1121360
|
1873 |
-
},
|
1874 |
-
"gpt_neox_japanese_2_7b.cc100-zh-Hans": {
|
1875 |
-
"vocab_size": 32000,
|
1876 |
-
"n_bytes": 2633047,
|
1877 |
-
"n_tokens": 1049033,
|
1878 |
-
"n_chars": 927311
|
1879 |
-
},
|
1880 |
-
"aya_101.cc100-ja": {
|
1881 |
-
"vocab_size": 250100,
|
1882 |
-
"n_bytes": 1774770,
|
1883 |
-
"n_tokens": 300542,
|
1884 |
-
"n_chars": 603065
|
1885 |
-
},
|
1886 |
-
"baichuan.cc100-ja": {
|
1887 |
-
"vocab_size": 64000,
|
1888 |
-
"n_bytes": 1774770,
|
1889 |
-
"n_tokens": 591656,
|
1890 |
-
"n_chars": 603065
|
1891 |
-
},
|
1892 |
-
"baichuan2.cc100-ja": {
|
1893 |
-
"vocab_size": 125696,
|
1894 |
-
"n_bytes": 1774770,
|
1895 |
-
"n_tokens": 554936,
|
1896 |
-
"n_chars": 603065
|
1897 |
-
},
|
1898 |
-
"bert_base_cased.cc100-ja": {
|
1899 |
-
"vocab_size": 28996,
|
1900 |
-
"n_bytes": 1774770,
|
1901 |
-
"n_tokens": 410492,
|
1902 |
-
"n_chars": 603065
|
1903 |
-
},
|
1904 |
-
"bert_base_chinese.cc100-ja": {
|
1905 |
-
"vocab_size": 21128,
|
1906 |
-
"n_bytes": 1774770,
|
1907 |
-
"n_tokens": 396831,
|
1908 |
-
"n_chars": 603065
|
1909 |
-
},
|
1910 |
-
"bert_base_uncased.cc100-ja": {
|
1911 |
-
"vocab_size": 30522,
|
1912 |
-
"n_bytes": 1774770,
|
1913 |
-
"n_tokens": 580634,
|
1914 |
-
"n_chars": 603065
|
1915 |
-
},
|
1916 |
-
"bloom.cc100-ja": {
|
1917 |
-
"vocab_size": 250680,
|
1918 |
-
"n_bytes": 1774770,
|
1919 |
-
"n_tokens": 523592,
|
1920 |
-
"n_chars": 603065
|
1921 |
-
},
|
1922 |
-
"byt5_small.cc100-ja": {
|
1923 |
-
"vocab_size": 384,
|
1924 |
-
"n_bytes": 1774770,
|
1925 |
-
"n_tokens": 1784770,
|
1926 |
-
"n_chars": 603065
|
1927 |
-
},
|
1928 |
-
"aya_101.cc100-ar": {
|
1929 |
-
"vocab_size": 250100,
|
1930 |
-
"n_bytes": 2813283,
|
1931 |
-
"n_tokens": 631736,
|
1932 |
-
"n_chars": 1560987
|
1933 |
-
},
|
1934 |
-
"baichuan.cc100-ar": {
|
1935 |
-
"vocab_size": 64000,
|
1936 |
-
"n_bytes": 2813283,
|
1937 |
-
"n_tokens": 1422976,
|
1938 |
-
"n_chars": 1560987
|
1939 |
-
},
|
1940 |
-
"baichuan2.cc100-ar": {
|
1941 |
-
"vocab_size": 125696,
|
1942 |
-
"n_bytes": 2813283,
|
1943 |
-
"n_tokens": 1337285,
|
1944 |
-
"n_chars": 1560987
|
1945 |
-
},
|
1946 |
-
"bert_base_cased.cc100-ar": {
|
1947 |
-
"vocab_size": 28996,
|
1948 |
-
"n_bytes": 2813283,
|
1949 |
-
"n_tokens": 1232449,
|
1950 |
-
"n_chars": 1560987
|
1951 |
-
},
|
1952 |
-
"bert_base_chinese.cc100-ar": {
|
1953 |
-
"vocab_size": 21128,
|
1954 |
-
"n_bytes": 2813283,
|
1955 |
-
"n_tokens": 536389,
|
1956 |
-
"n_chars": 1560987
|
1957 |
-
},
|
1958 |
-
"bert_base_uncased.cc100-ar": {
|
1959 |
-
"vocab_size": 30522,
|
1960 |
-
"n_bytes": 2813283,
|
1961 |
-
"n_tokens": 1269370,
|
1962 |
-
"n_chars": 1560987
|
1963 |
-
},
|
1964 |
-
"bloom.cc100-ar": {
|
1965 |
-
"vocab_size": 250680,
|
1966 |
-
"n_bytes": 2813283,
|
1967 |
-
"n_tokens": 427489,
|
1968 |
-
"n_chars": 1560987
|
1969 |
-
},
|
1970 |
-
"byt5_small.cc100-ar": {
|
1971 |
-
"vocab_size": 384,
|
1972 |
-
"n_bytes": 2813283,
|
1973 |
-
"n_tokens": 2823283,
|
1974 |
-
"n_chars": 1560987
|
1975 |
-
},
|
1976 |
-
"character_glm_6b.cc100-ar": {
|
1977 |
-
"vocab_size": 64789,
|
1978 |
-
"n_bytes": 2813283,
|
1979 |
-
"n_tokens": 1441847,
|
1980 |
-
"n_chars": 1560987
|
1981 |
-
},
|
1982 |
-
"chatglm2_6b.cc100-ar": {
|
1983 |
-
"vocab_size": 64787,
|
1984 |
-
"n_bytes": 2813283,
|
1985 |
-
"n_tokens": 1441847,
|
1986 |
-
"n_chars": 1560987
|
1987 |
-
},
|
1988 |
-
"chatglm3_6b.cc100-ar": {
|
1989 |
-
"vocab_size": 64796,
|
1990 |
-
"n_bytes": 2813283,
|
1991 |
-
"n_tokens": 1441847,
|
1992 |
-
"n_chars": 1560987
|
1993 |
-
},
|
1994 |
-
"chatglm_6b.cc100-ar": {
|
1995 |
-
"vocab_size": 150344,
|
1996 |
-
"n_bytes": 2813283,
|
1997 |
-
"n_tokens": 1097200,
|
1998 |
-
"n_chars": 1560987
|
1999 |
-
},
|
2000 |
-
"chatyuan_large_v2.cc100-ar": {
|
2001 |
-
"vocab_size": 32128,
|
2002 |
-
"n_bytes": 2813283,
|
2003 |
-
"n_tokens": 1006313,
|
2004 |
-
"n_chars": 1560987
|
2005 |
-
},
|
2006 |
-
"chinese_llama.cc100-ar": {
|
2007 |
-
"vocab_size": 49953,
|
2008 |
-
"n_bytes": 2813283,
|
2009 |
-
"n_tokens": 1421625,
|
2010 |
-
"n_chars": 1560987
|
2011 |
-
},
|
2012 |
-
"chinese_llama2.cc100-ar": {
|
2013 |
-
"vocab_size": 55296,
|
2014 |
-
"n_bytes": 2813283,
|
2015 |
-
"n_tokens": 1432081,
|
2016 |
-
"n_chars": 1560987
|
2017 |
-
},
|
2018 |
-
"code_davinci_002.cc100-ar": {
|
2019 |
-
"vocab_size": 50281,
|
2020 |
-
"n_bytes": 2813283,
|
2021 |
-
"n_tokens": 1558111,
|
2022 |
-
"n_chars": 1560987
|
2023 |
-
},
|
2024 |
-
"crystal_coder.cc100-ar": {
|
2025 |
-
"vocab_size": 32022,
|
2026 |
-
"n_bytes": 2813283,
|
2027 |
-
"n_tokens": 1422081,
|
2028 |
-
"n_chars": 1560987
|
2029 |
-
},
|
2030 |
-
"dbrx_instruct.cc100-ar": {
|
2031 |
-
"vocab_size": 100280,
|
2032 |
-
"n_bytes": 2813283,
|
2033 |
-
"n_tokens": 1105640,
|
2034 |
-
"n_chars": 1560987
|
2035 |
-
},
|
2036 |
-
"deepseek_coder_33b_instruct.cc100-ar": {
|
2037 |
-
"vocab_size": 32022,
|
2038 |
-
"n_bytes": 2813283,
|
2039 |
-
"n_tokens": 1958863,
|
2040 |
-
"n_chars": 1560987
|
2041 |
-
},
|
2042 |
-
"deepseek_llm_7b_base.cc100-ar": {
|
2043 |
-
"vocab_size": 100015,
|
2044 |
-
"n_bytes": 2813283,
|
2045 |
-
"n_tokens": 1426103,
|
2046 |
-
"n_chars": 1560987
|
2047 |
-
},
|
2048 |
-
"falcon_180b.cc100-ar": {
|
2049 |
-
"vocab_size": 65024,
|
2050 |
-
"n_bytes": 2813283,
|
2051 |
-
"n_tokens": 1597443,
|
2052 |
-
"n_chars": 1560987
|
2053 |
-
},
|
2054 |
-
"falcon_7b.cc100-ar": {
|
2055 |
-
"vocab_size": 65024,
|
2056 |
-
"n_bytes": 2813283,
|
2057 |
-
"n_tokens": 1597443,
|
2058 |
-
"n_chars": 1560987
|
2059 |
-
},
|
2060 |
-
"fastchat_t5_3b.cc100-ar": {
|
2061 |
-
"vocab_size": 32110,
|
2062 |
-
"n_bytes": 2813283,
|
2063 |
-
"n_tokens": 832267,
|
2064 |
-
"n_chars": 1560987
|
2065 |
-
},
|
2066 |
-
"flan_t5_base.cc100-ar": {
|
2067 |
-
"vocab_size": 32100,
|
2068 |
-
"n_bytes": 2813283,
|
2069 |
-
"n_tokens": 568957,
|
2070 |
-
"n_chars": 1560987
|
2071 |
-
},
|
2072 |
-
"gemma_7b.cc100-ar": {
|
2073 |
-
"vocab_size": 256000,
|
2074 |
-
"n_bytes": 2813283,
|
2075 |
-
"n_tokens": 573788,
|
2076 |
-
"n_chars": 1560987
|
2077 |
-
},
|
2078 |
-
"gpt2.cc100-ar": {
|
2079 |
-
"vocab_size": 50257,
|
2080 |
-
"n_bytes": 2813283,
|
2081 |
-
"n_tokens": 1558111,
|
2082 |
-
"n_chars": 1560987
|
2083 |
-
},
|
2084 |
-
"gpt2_chinese.cc100-ar": {
|
2085 |
-
"vocab_size": 21128,
|
2086 |
-
"n_bytes": 2813283,
|
2087 |
-
"n_tokens": 617677,
|
2088 |
-
"n_chars": 1560987
|
2089 |
-
},
|
2090 |
-
"gpt_35_turbo.cc100-ar": {
|
2091 |
-
"vocab_size": 100277,
|
2092 |
-
"n_bytes": 2813283,
|
2093 |
-
"n_tokens": 1105640,
|
2094 |
-
"n_chars": 1560987
|
2095 |
-
},
|
2096 |
-
"gpt_4.cc100-ar": {
|
2097 |
-
"vocab_size": 100277,
|
2098 |
-
"n_bytes": 2813283,
|
2099 |
-
"n_tokens": 1105640,
|
2100 |
-
"n_chars": 1560987
|
2101 |
-
},
|
2102 |
-
"gpt_neox_japanese_2_7b.cc100-ar": {
|
2103 |
-
"vocab_size": 32000,
|
2104 |
-
"n_bytes": 2813283,
|
2105 |
-
"n_tokens": 2809195,
|
2106 |
-
"n_chars": 1560987
|
2107 |
-
},
|
2108 |
-
"gpt_nexo_20b.cc100-ar": {
|
2109 |
-
"vocab_size": 50277,
|
2110 |
-
"n_bytes": 2813283,
|
2111 |
-
"n_tokens": 1106277,
|
2112 |
-
"n_chars": 1560987
|
2113 |
-
},
|
2114 |
-
"grok_1.cc100-ar": {
|
2115 |
-
"vocab_size": 131072,
|
2116 |
-
"n_bytes": 2813283,
|
2117 |
-
"n_tokens": 1392088,
|
2118 |
-
"n_chars": 1560987
|
2119 |
-
},
|
2120 |
-
"internlm2_chat_7b.cc100-ar": {
|
2121 |
-
"vocab_size": 92544,
|
2122 |
-
"n_bytes": 2813283,
|
2123 |
-
"n_tokens": 1635378,
|
2124 |
-
"n_chars": 1560987
|
2125 |
-
},
|
2126 |
-
"internlm2_math_7b.cc100-ar": {
|
2127 |
-
"vocab_size": 92544,
|
2128 |
-
"n_bytes": 2813283,
|
2129 |
-
"n_tokens": 1635378,
|
2130 |
-
"n_chars": 1560987
|
2131 |
-
},
|
2132 |
-
"internlm_chat_7b.cc100-ar": {
|
2133 |
-
"vocab_size": 103168,
|
2134 |
-
"n_bytes": 2813283,
|
2135 |
-
"n_tokens": 532046,
|
2136 |
-
"n_chars": 1560987
|
2137 |
-
},
|
2138 |
-
"internlm_xcomposer_7b.cc100-ar": {
|
2139 |
-
"vocab_size": 103168,
|
2140 |
-
"n_bytes": 2813283,
|
2141 |
-
"n_tokens": 532046,
|
2142 |
-
"n_chars": 1560987
|
2143 |
-
},
|
2144 |
-
"jamba_v0_1.cc100-ar": {
|
2145 |
-
"vocab_size": 65536,
|
2146 |
-
"n_bytes": 2813283,
|
2147 |
-
"n_tokens": 727886,
|
2148 |
-
"n_chars": 1560987
|
2149 |
-
},
|
2150 |
-
"kplug.cc100-ar": {
|
2151 |
-
"vocab_size": 10261,
|
2152 |
-
"n_bytes": 2813283,
|
2153 |
-
"n_tokens": 331987,
|
2154 |
-
"n_chars": 1560987
|
2155 |
-
},
|
2156 |
-
"llama.cc100-ar": {
|
2157 |
-
"vocab_size": 32000,
|
2158 |
-
"n_bytes": 2813283,
|
2159 |
-
"n_tokens": 1432081,
|
2160 |
-
"n_chars": 1560987
|
2161 |
-
},
|
2162 |
-
"llama2.cc100-ar": {
|
2163 |
-
"vocab_size": 32001,
|
2164 |
-
"n_bytes": 2813283,
|
2165 |
-
"n_tokens": 1432081,
|
2166 |
-
"n_chars": 1560987
|
2167 |
-
},
|
2168 |
-
"llama3.cc100-ar": {
|
2169 |
-
"vocab_size": 128256,
|
2170 |
-
"n_bytes": 2813283,
|
2171 |
-
"n_tokens": 615514,
|
2172 |
-
"n_chars": 1560987
|
2173 |
-
},
|
2174 |
-
"mistral_7b.cc100-ar": {
|
2175 |
-
"vocab_size": 32000,
|
2176 |
-
"n_bytes": 2813283,
|
2177 |
-
"n_tokens": 1406319,
|
2178 |
-
"n_chars": 1560987
|
2179 |
-
},
|
2180 |
-
"mixtral_8_7b.cc100-ar": {
|
2181 |
-
"vocab_size": 32000,
|
2182 |
-
"n_bytes": 2813283,
|
2183 |
-
"n_tokens": 1406319,
|
2184 |
-
"n_chars": 1560987
|
2185 |
-
},
|
2186 |
-
"mobilebert_uncased.cc100-ar": {
|
2187 |
-
"vocab_size": 30522,
|
2188 |
-
"n_bytes": 2813283,
|
2189 |
-
"n_tokens": 1269370,
|
2190 |
-
"n_chars": 1560987
|
2191 |
-
},
|
2192 |
-
"moss.cc100-ar": {
|
2193 |
-
"vocab_size": 106072,
|
2194 |
-
"n_bytes": 2813283,
|
2195 |
-
"n_tokens": 1557671,
|
2196 |
-
"n_chars": 1560987
|
2197 |
-
},
|
2198 |
-
"mt5_large.cc100-ar": {
|
2199 |
-
"vocab_size": 250100,
|
2200 |
-
"n_bytes": 2813283,
|
2201 |
-
"n_tokens": 631736,
|
2202 |
-
"n_chars": 1560987
|
2203 |
-
},
|
2204 |
-
"olmo_7b.cc100-ar": {
|
2205 |
-
"vocab_size": 50280,
|
2206 |
-
"n_bytes": 2813283,
|
2207 |
-
"n_tokens": 1106277,
|
2208 |
-
"n_chars": 1560987
|
2209 |
-
},
|
2210 |
-
"orion_14b_chat.cc100-ar": {
|
2211 |
-
"vocab_size": 84608,
|
2212 |
-
"n_bytes": 2813283,
|
2213 |
-
"n_tokens": 1531053,
|
2214 |
-
"n_chars": 1560987
|
2215 |
-
},
|
2216 |
-
"phi_1.cc100-ar": {
|
2217 |
-
"vocab_size": 50295,
|
2218 |
-
"n_bytes": 2813283,
|
2219 |
-
"n_tokens": 1558111,
|
2220 |
-
"n_chars": 1560987
|
2221 |
-
},
|
2222 |
-
"phi_2.cc100-ar": {
|
2223 |
-
"vocab_size": 50295,
|
2224 |
-
"n_bytes": 2813283,
|
2225 |
-
"n_tokens": 1558111,
|
2226 |
-
"n_chars": 1560987
|
2227 |
-
},
|
2228 |
-
"phi_3_mini.cc100-ar": {
|
2229 |
-
"vocab_size": 32011,
|
2230 |
-
"n_bytes": 2813283,
|
2231 |
-
"n_tokens": 1432081,
|
2232 |
-
"n_chars": 1560987
|
2233 |
-
},
|
2234 |
-
"pko_t5_large.cc100-ar": {
|
2235 |
-
"vocab_size": 50358,
|
2236 |
-
"n_bytes": 2813283,
|
2237 |
-
"n_tokens": 2815586,
|
2238 |
-
"n_chars": 1560987
|
2239 |
-
},
|
2240 |
-
"prompt_clue.cc100-ar": {
|
2241 |
-
"vocab_size": 32128,
|
2242 |
-
"n_bytes": 2813283,
|
2243 |
-
"n_tokens": 1006313,
|
2244 |
-
"n_chars": 1560987
|
2245 |
-
},
|
2246 |
-
"qwen1_5_14b_chat.cc100-ar": {
|
2247 |
-
"vocab_size": 151646,
|
2248 |
-
"n_bytes": 2813283,
|
2249 |
-
"n_tokens": 614959,
|
2250 |
-
"n_chars": 1560987
|
2251 |
-
},
|
2252 |
-
"qwen_1_8b_chat.cc100-ar": {
|
2253 |
-
"vocab_size": 151851,
|
2254 |
-
"n_bytes": 2813283,
|
2255 |
-
"n_tokens": 614959,
|
2256 |
-
"n_chars": 1560987
|
2257 |
-
},
|
2258 |
-
"qwen_72b_chat.cc100-ar": {
|
2259 |
-
"vocab_size": 151851,
|
2260 |
-
"n_bytes": 2813283,
|
2261 |
-
"n_tokens": 614959,
|
2262 |
-
"n_chars": 1560987
|
2263 |
-
},
|
2264 |
-
"qwen_7b_chat.cc100-ar": {
|
2265 |
-
"vocab_size": 151851,
|
2266 |
-
"n_bytes": 2813283,
|
2267 |
-
"n_tokens": 614959,
|
2268 |
-
"n_chars": 1560987
|
2269 |
-
},
|
2270 |
-
"roberta_chinese_clue.cc100-ar": {
|
2271 |
-
"vocab_size": 8021,
|
2272 |
-
"n_bytes": 2813283,
|
2273 |
-
"n_tokens": 621762,
|
2274 |
-
"n_chars": 1560987
|
2275 |
-
},
|
2276 |
-
"skywork_13b_base.cc100-ar": {
|
2277 |
-
"vocab_size": 65519,
|
2278 |
-
"n_bytes": 2813283,
|
2279 |
-
"n_tokens": 1432065,
|
2280 |
-
"n_chars": 1560987
|
2281 |
-
},
|
2282 |
-
"skywork_13b_math.cc100-ar": {
|
2283 |
-
"vocab_size": 65519,
|
2284 |
-
"n_bytes": 2813283,
|
2285 |
-
"n_tokens": 1432065,
|
2286 |
-
"n_chars": 1560987
|
2287 |
-
},
|
2288 |
-
"solar_10_7b.cc100-ar": {
|
2289 |
-
"vocab_size": 32000,
|
2290 |
-
"n_bytes": 2813283,
|
2291 |
-
"n_tokens": 1406319,
|
2292 |
-
"n_chars": 1560987
|
2293 |
-
},
|
2294 |
-
"starchat_alpha.cc100-ar": {
|
2295 |
-
"vocab_size": 49156,
|
2296 |
-
"n_bytes": 2813283,
|
2297 |
-
"n_tokens": 1195640,
|
2298 |
-
"n_chars": 1560987
|
2299 |
-
},
|
2300 |
-
"switch_c_2048.cc100-ar": {
|
2301 |
-
"vocab_size": 32100,
|
2302 |
-
"n_bytes": 2813283,
|
2303 |
-
"n_tokens": 568855,
|
2304 |
-
"n_chars": 1560987
|
2305 |
-
},
|
2306 |
-
"t5_base.cc100-ar": {
|
2307 |
-
"vocab_size": 32100,
|
2308 |
-
"n_bytes": 2813283,
|
2309 |
-
"n_tokens": 568855,
|
2310 |
-
"n_chars": 1560987
|
2311 |
-
},
|
2312 |
-
"t5_large.cc100-ar": {
|
2313 |
-
"vocab_size": 32100,
|
2314 |
-
"n_bytes": 2813283,
|
2315 |
-
"n_tokens": 568855,
|
2316 |
-
"n_chars": 1560987
|
2317 |
-
},
|
2318 |
-
"t5_small.cc100-ar": {
|
2319 |
-
"vocab_size": 32100,
|
2320 |
-
"n_bytes": 2813283,
|
2321 |
-
"n_tokens": 568855,
|
2322 |
-
"n_chars": 1560987
|
2323 |
-
},
|
2324 |
-
"text_davinci_003.cc100-ar": {
|
2325 |
-
"vocab_size": 50281,
|
2326 |
-
"n_bytes": 2813283,
|
2327 |
-
"n_tokens": 1558111,
|
2328 |
-
"n_chars": 1560987
|
2329 |
-
},
|
2330 |
-
"tigerbot_13b_chat_v2.cc100-ar": {
|
2331 |
-
"vocab_size": 60515,
|
2332 |
-
"n_bytes": 2813283,
|
2333 |
-
"n_tokens": 1422070,
|
2334 |
-
"n_chars": 1560987
|
2335 |
-
},
|
2336 |
-
"tigerbot_70b_chat_v4_4k.cc100-ar": {
|
2337 |
-
"vocab_size": 65110,
|
2338 |
-
"n_bytes": 2813283,
|
2339 |
-
"n_tokens": 1422073,
|
2340 |
-
"n_chars": 1560987
|
2341 |
-
},
|
2342 |
-
"wizardcoder_15b_v1.cc100-ar": {
|
2343 |
-
"vocab_size": 49153,
|
2344 |
-
"n_bytes": 2813283,
|
2345 |
-
"n_tokens": 1195640,
|
2346 |
-
"n_chars": 1560987
|
2347 |
-
},
|
2348 |
-
"wizardcoder_python_7b_v1.cc100-ar": {
|
2349 |
-
"vocab_size": 32001,
|
2350 |
-
"n_bytes": 2813283,
|
2351 |
-
"n_tokens": 1432081,
|
2352 |
-
"n_chars": 1560987
|
2353 |
-
},
|
2354 |
-
"wizardlm_7b_v1.cc100-ar": {
|
2355 |
-
"vocab_size": 32001,
|
2356 |
-
"n_bytes": 2813283,
|
2357 |
-
"n_tokens": 1432081,
|
2358 |
-
"n_chars": 1560987
|
2359 |
-
},
|
2360 |
-
"wizardmath_70b_v1.cc100-ar": {
|
2361 |
-
"vocab_size": 32002,
|
2362 |
-
"n_bytes": 2813283,
|
2363 |
-
"n_tokens": 1432081,
|
2364 |
-
"n_chars": 1560987
|
2365 |
-
},
|
2366 |
-
"xlm_roberta.cc100-ar": {
|
2367 |
-
"vocab_size": 250002,
|
2368 |
-
"n_bytes": 2813283,
|
2369 |
-
"n_tokens": 518287,
|
2370 |
-
"n_chars": 1560987
|
2371 |
-
},
|
2372 |
-
"yi_34b.cc100-ar": {
|
2373 |
-
"vocab_size": 64000,
|
2374 |
-
"n_bytes": 2813283,
|
2375 |
-
"n_tokens": 1795801,
|
2376 |
-
"n_chars": 1560987
|
2377 |
-
},
|
2378 |
-
"yi_6b.cc100-ar": {
|
2379 |
-
"vocab_size": 64000,
|
2380 |
-
"n_bytes": 2813283,
|
2381 |
-
"n_tokens": 1795801,
|
2382 |
-
"n_chars": 1560987
|
2383 |
-
},
|
2384 |
-
"yi_vl34b.cc100-ar": {
|
2385 |
-
"vocab_size": 64000,
|
2386 |
-
"n_bytes": 2813283,
|
2387 |
-
"n_tokens": 1803957,
|
2388 |
-
"n_chars": 1560987
|
2389 |
-
},
|
2390 |
-
"zephyr_7b_beta.cc100-ar": {
|
2391 |
-
"vocab_size": 32000,
|
2392 |
-
"n_bytes": 2813283,
|
2393 |
-
"n_tokens": 1406319,
|
2394 |
-
"n_chars": 1560987
|
2395 |
-
},
|
2396 |
-
"aya_101.cc100-de": {
|
2397 |
-
"vocab_size": 250100,
|
2398 |
-
"n_bytes": 1814876,
|
2399 |
-
"n_tokens": 480418,
|
2400 |
-
"n_chars": 1784021
|
2401 |
-
},
|
2402 |
-
"baichuan.cc100-de": {
|
2403 |
-
"vocab_size": 64000,
|
2404 |
-
"n_bytes": 1814876,
|
2405 |
-
"n_tokens": 680512,
|
2406 |
-
"n_chars": 1784021
|
2407 |
-
},
|
2408 |
-
"baichuan2.cc100-de": {
|
2409 |
-
"vocab_size": 125696,
|
2410 |
-
"n_bytes": 1814876,
|
2411 |
-
"n_tokens": 628063,
|
2412 |
-
"n_chars": 1784021
|
2413 |
-
},
|
2414 |
-
"bert_base_cased.cc100-de": {
|
2415 |
-
"vocab_size": 28996,
|
2416 |
-
"n_bytes": 1814876,
|
2417 |
-
"n_tokens": 731093,
|
2418 |
-
"n_chars": 1784021
|
2419 |
-
},
|
2420 |
-
"bert_base_chinese.cc100-de": {
|
2421 |
-
"vocab_size": 21128,
|
2422 |
-
"n_bytes": 1814876,
|
2423 |
-
"n_tokens": 561246,
|
2424 |
-
"n_chars": 1784021
|
2425 |
-
},
|
2426 |
-
"bert_base_uncased.cc100-de": {
|
2427 |
-
"vocab_size": 30522,
|
2428 |
-
"n_bytes": 1814876,
|
2429 |
-
"n_tokens": 646485,
|
2430 |
-
"n_chars": 1784021
|
2431 |
-
},
|
2432 |
-
"bloom.cc100-de": {
|
2433 |
-
"vocab_size": 250680,
|
2434 |
-
"n_bytes": 1814876,
|
2435 |
-
"n_tokens": 541170,
|
2436 |
-
"n_chars": 1784021
|
2437 |
-
},
|
2438 |
-
"byt5_small.cc100-de": {
|
2439 |
-
"vocab_size": 384,
|
2440 |
-
"n_bytes": 1814876,
|
2441 |
-
"n_tokens": 1824876,
|
2442 |
-
"n_chars": 1784021
|
2443 |
-
},
|
2444 |
-
"character_glm_6b.cc100-de": {
|
2445 |
-
"vocab_size": 64789,
|
2446 |
-
"n_bytes": 1814876,
|
2447 |
-
"n_tokens": 639822,
|
2448 |
-
"n_chars": 1784021
|
2449 |
-
},
|
2450 |
-
"chatglm2_6b.cc100-de": {
|
2451 |
-
"vocab_size": 64787,
|
2452 |
-
"n_bytes": 1814876,
|
2453 |
-
"n_tokens": 639757,
|
2454 |
-
"n_chars": 1784021
|
2455 |
-
},
|
2456 |
-
"chatglm3_6b.cc100-de": {
|
2457 |
-
"vocab_size": 64796,
|
2458 |
-
"n_bytes": 1814876,
|
2459 |
-
"n_tokens": 639822,
|
2460 |
-
"n_chars": 1784021
|
2461 |
-
},
|
2462 |
-
"chatglm_6b.cc100-de": {
|
2463 |
-
"vocab_size": 150344,
|
2464 |
-
"n_bytes": 1814876,
|
2465 |
-
"n_tokens": 589464,
|
2466 |
-
"n_chars": 1784021
|
2467 |
-
},
|
2468 |
-
"chatyuan_large_v2.cc100-de": {
|
2469 |
-
"vocab_size": 32128,
|
2470 |
-
"n_bytes": 1814876,
|
2471 |
-
"n_tokens": 970463,
|
2472 |
-
"n_chars": 1784021
|
2473 |
-
},
|
2474 |
-
"chinese_llama.cc100-de": {
|
2475 |
-
"vocab_size": 49953,
|
2476 |
-
"n_bytes": 1814876,
|
2477 |
-
"n_tokens": 523859,
|
2478 |
-
"n_chars": 1784021
|
2479 |
-
},
|
2480 |
-
"chinese_llama2.cc100-de": {
|
2481 |
-
"vocab_size": 55296,
|
2482 |
-
"n_bytes": 1814876,
|
2483 |
-
"n_tokens": 537318,
|
2484 |
-
"n_chars": 1784021
|
2485 |
-
},
|
2486 |
-
"code_davinci_002.cc100-de": {
|
2487 |
-
"vocab_size": 50281,
|
2488 |
-
"n_bytes": 1814876,
|
2489 |
-
"n_tokens": 684666,
|
2490 |
-
"n_chars": 1784021
|
2491 |
-
},
|
2492 |
-
"crystal_coder.cc100-de": {
|
2493 |
-
"vocab_size": 32022,
|
2494 |
-
"n_bytes": 1814876,
|
2495 |
-
"n_tokens": 527320,
|
2496 |
-
"n_chars": 1784021
|
2497 |
-
},
|
2498 |
-
"dbrx_instruct.cc100-de": {
|
2499 |
-
"vocab_size": 100280,
|
2500 |
-
"n_bytes": 1814876,
|
2501 |
-
"n_tokens": 500870,
|
2502 |
-
"n_chars": 1784021
|
2503 |
-
},
|
2504 |
-
"deepseek_coder_33b_instruct.cc100-de": {
|
2505 |
-
"vocab_size": 32022,
|
2506 |
-
"n_bytes": 1814876,
|
2507 |
-
"n_tokens": 745618,
|
2508 |
-
"n_chars": 1784021
|
2509 |
-
},
|
2510 |
-
"deepseek_llm_7b_base.cc100-de": {
|
2511 |
-
"vocab_size": 100015,
|
2512 |
-
"n_bytes": 1814876,
|
2513 |
-
"n_tokens": 642573,
|
2514 |
-
"n_chars": 1784021
|
2515 |
-
},
|
2516 |
-
"falcon_180b.cc100-de": {
|
2517 |
-
"vocab_size": 65024,
|
2518 |
-
"n_bytes": 1814876,
|
2519 |
-
"n_tokens": 497054,
|
2520 |
-
"n_chars": 1784021
|
2521 |
-
},
|
2522 |
-
"falcon_7b.cc100-de": {
|
2523 |
-
"vocab_size": 65024,
|
2524 |
-
"n_bytes": 1814876,
|
2525 |
-
"n_tokens": 497054,
|
2526 |
-
"n_chars": 1784021
|
2527 |
-
},
|
2528 |
-
"fastchat_t5_3b.cc100-de": {
|
2529 |
-
"vocab_size": 32110,
|
2530 |
-
"n_bytes": 1814876,
|
2531 |
-
"n_tokens": 736989,
|
2532 |
-
"n_chars": 1784021
|
2533 |
-
},
|
2534 |
-
"flan_t5_base.cc100-de": {
|
2535 |
-
"vocab_size": 32100,
|
2536 |
-
"n_bytes": 1814876,
|
2537 |
-
"n_tokens": 480254,
|
2538 |
-
"n_chars": 1784021
|
2539 |
-
},
|
2540 |
-
"gemma_7b.cc100-de": {
|
2541 |
-
"vocab_size": 256000,
|
2542 |
-
"n_bytes": 1814876,
|
2543 |
-
"n_tokens": 416876,
|
2544 |
-
"n_chars": 1784021
|
2545 |
-
},
|
2546 |
-
"gpt2.cc100-de": {
|
2547 |
-
"vocab_size": 50257,
|
2548 |
-
"n_bytes": 1814876,
|
2549 |
-
"n_tokens": 684669,
|
2550 |
-
"n_chars": 1784021
|
2551 |
-
},
|
2552 |
-
"gpt2_chinese.cc100-de": {
|
2553 |
-
"vocab_size": 21128,
|
2554 |
-
"n_bytes": 1814876,
|
2555 |
-
"n_tokens": 786497,
|
2556 |
-
"n_chars": 1784021
|
2557 |
-
},
|
2558 |
-
"gpt_35_turbo.cc100-de": {
|
2559 |
-
"vocab_size": 100277,
|
2560 |
-
"n_bytes": 1814876,
|
2561 |
-
"n_tokens": 500870,
|
2562 |
-
"n_chars": 1784021
|
2563 |
-
},
|
2564 |
-
"gpt_4.cc100-de": {
|
2565 |
-
"vocab_size": 100277,
|
2566 |
-
"n_bytes": 1814876,
|
2567 |
-
"n_tokens": 500870,
|
2568 |
-
"n_chars": 1784021
|
2569 |
-
},
|
2570 |
-
"gpt_neox_japanese_2_7b.cc100-de": {
|
2571 |
-
"vocab_size": 32000,
|
2572 |
-
"n_bytes": 1814876,
|
2573 |
-
"n_tokens": 1807780,
|
2574 |
-
"n_chars": 1784021
|
2575 |
-
},
|
2576 |
-
"gpt_nexo_20b.cc100-de": {
|
2577 |
-
"vocab_size": 50277,
|
2578 |
-
"n_bytes": 1814876,
|
2579 |
-
"n_tokens": 583628,
|
2580 |
-
"n_chars": 1784021
|
2581 |
-
},
|
2582 |
-
"grok_1.cc100-de": {
|
2583 |
-
"vocab_size": 131072,
|
2584 |
-
"n_bytes": 1814876,
|
2585 |
-
"n_tokens": 505220,
|
2586 |
-
"n_chars": 1784021
|
2587 |
-
},
|
2588 |
-
"internlm2_chat_7b.cc100-de": {
|
2589 |
-
"vocab_size": 92544,
|
2590 |
-
"n_bytes": 1814876,
|
2591 |
-
"n_tokens": 583917,
|
2592 |
-
"n_chars": 1784021
|
2593 |
-
},
|
2594 |
-
"internlm2_math_7b.cc100-de": {
|
2595 |
-
"vocab_size": 92544,
|
2596 |
-
"n_bytes": 1814876,
|
2597 |
-
"n_tokens": 583917,
|
2598 |
-
"n_chars": 1784021
|
2599 |
-
},
|
2600 |
-
"internlm_chat_7b.cc100-de": {
|
2601 |
-
"vocab_size": 103168,
|
2602 |
-
"n_bytes": 1814876,
|
2603 |
-
"n_tokens": 580489,
|
2604 |
-
"n_chars": 1784021
|
2605 |
-
},
|
2606 |
-
"internlm_xcomposer_7b.cc100-de": {
|
2607 |
-
"vocab_size": 103168,
|
2608 |
-
"n_bytes": 1814876,
|
2609 |
-
"n_tokens": 580489,
|
2610 |
-
"n_chars": 1784021
|
2611 |
-
},
|
2612 |
-
"jamba_v0_1.cc100-de": {
|
2613 |
-
"vocab_size": 65536,
|
2614 |
-
"n_bytes": 1814876,
|
2615 |
-
"n_tokens": 535856,
|
2616 |
-
"n_chars": 1784021
|
2617 |
-
},
|
2618 |
-
"kplug.cc100-de": {
|
2619 |
-
"vocab_size": 10261,
|
2620 |
-
"n_bytes": 1814876,
|
2621 |
-
"n_tokens": 789053,
|
2622 |
-
"n_chars": 1784021
|
2623 |
-
},
|
2624 |
-
"llama.cc100-de": {
|
2625 |
-
"vocab_size": 32000,
|
2626 |
-
"n_bytes": 1814876,
|
2627 |
-
"n_tokens": 537320,
|
2628 |
-
"n_chars": 1784021
|
2629 |
-
},
|
2630 |
-
"llama2.cc100-de": {
|
2631 |
-
"vocab_size": 32001,
|
2632 |
-
"n_bytes": 1814876,
|
2633 |
-
"n_tokens": 537320,
|
2634 |
-
"n_chars": 1784021
|
2635 |
-
},
|
2636 |
-
"llama3.cc100-de": {
|
2637 |
-
"vocab_size": 128256,
|
2638 |
-
"n_bytes": 1814876,
|
2639 |
-
"n_tokens": 499766,
|
2640 |
-
"n_chars": 1784021
|
2641 |
-
},
|
2642 |
-
"mistral_7b.cc100-de": {
|
2643 |
-
"vocab_size": 32000,
|
2644 |
-
"n_bytes": 1814876,
|
2645 |
-
"n_tokens": 577526,
|
2646 |
-
"n_chars": 1784021
|
2647 |
-
},
|
2648 |
-
"mixtral_8_7b.cc100-de": {
|
2649 |
-
"vocab_size": 32000,
|
2650 |
-
"n_bytes": 1814876,
|
2651 |
-
"n_tokens": 577526,
|
2652 |
-
"n_chars": 1784021
|
2653 |
-
},
|
2654 |
-
"mobilebert_uncased.cc100-de": {
|
2655 |
-
"vocab_size": 30522,
|
2656 |
-
"n_bytes": 1814876,
|
2657 |
-
"n_tokens": 646485,
|
2658 |
-
"n_chars": 1784021
|
2659 |
-
},
|
2660 |
-
"moss.cc100-de": {
|
2661 |
-
"vocab_size": 106072,
|
2662 |
-
"n_bytes": 1814876,
|
2663 |
-
"n_tokens": 683401,
|
2664 |
-
"n_chars": 1784021
|
2665 |
-
},
|
2666 |
-
"mt5_large.cc100-de": {
|
2667 |
-
"vocab_size": 250100,
|
2668 |
-
"n_bytes": 1814876,
|
2669 |
-
"n_tokens": 480418,
|
2670 |
-
"n_chars": 1784021
|
2671 |
-
},
|
2672 |
-
"olmo_7b.cc100-de": {
|
2673 |
-
"vocab_size": 50280,
|
2674 |
-
"n_bytes": 1814876,
|
2675 |
-
"n_tokens": 583628,
|
2676 |
-
"n_chars": 1784021
|
2677 |
-
},
|
2678 |
-
"orion_14b_chat.cc100-de": {
|
2679 |
-
"vocab_size": 84608,
|
2680 |
-
"n_bytes": 1814876,
|
2681 |
-
"n_tokens": 744404,
|
2682 |
-
"n_chars": 1784021
|
2683 |
-
},
|
2684 |
-
"phi_1.cc100-de": {
|
2685 |
-
"vocab_size": 50295,
|
2686 |
-
"n_bytes": 1814876,
|
2687 |
-
"n_tokens": 684665,
|
2688 |
-
"n_chars": 1784021
|
2689 |
-
},
|
2690 |
-
"phi_2.cc100-de": {
|
2691 |
-
"vocab_size": 50295,
|
2692 |
-
"n_bytes": 1814876,
|
2693 |
-
"n_tokens": 684665,
|
2694 |
-
"n_chars": 1784021
|
2695 |
-
},
|
2696 |
-
"phi_3_mini.cc100-de": {
|
2697 |
-
"vocab_size": 32011,
|
2698 |
-
"n_bytes": 1814876,
|
2699 |
-
"n_tokens": 537320,
|
2700 |
-
"n_chars": 1784021
|
2701 |
-
},
|
2702 |
-
"pko_t5_large.cc100-de": {
|
2703 |
-
"vocab_size": 50358,
|
2704 |
-
"n_bytes": 1814876,
|
2705 |
-
"n_tokens": 1254350,
|
2706 |
-
"n_chars": 1784021
|
2707 |
-
},
|
2708 |
-
"prompt_clue.cc100-de": {
|
2709 |
-
"vocab_size": 32128,
|
2710 |
-
"n_bytes": 1814876,
|
2711 |
-
"n_tokens": 970463,
|
2712 |
-
"n_chars": 1784021
|
2713 |
-
},
|
2714 |
-
"qwen1_5_14b_chat.cc100-de": {
|
2715 |
-
"vocab_size": 151646,
|
2716 |
-
"n_bytes": 1814876,
|
2717 |
-
"n_tokens": 503561,
|
2718 |
-
"n_chars": 1784021
|
2719 |
-
},
|
2720 |
-
"qwen_1_8b_chat.cc100-de": {
|
2721 |
-
"vocab_size": 151851,
|
2722 |
-
"n_bytes": 1814876,
|
2723 |
-
"n_tokens": 503561,
|
2724 |
-
"n_chars": 1784021
|
2725 |
-
},
|
2726 |
-
"qwen_72b_chat.cc100-de": {
|
2727 |
-
"vocab_size": 151851,
|
2728 |
-
"n_bytes": 1814876,
|
2729 |
-
"n_tokens": 503561,
|
2730 |
-
"n_chars": 1784021
|
2731 |
-
},
|
2732 |
-
"qwen_7b_chat.cc100-de": {
|
2733 |
-
"vocab_size": 151851,
|
2734 |
-
"n_bytes": 1814876,
|
2735 |
-
"n_tokens": 503561,
|
2736 |
-
"n_chars": 1784021
|
2737 |
-
},
|
2738 |
-
"roberta_chinese_clue.cc100-de": {
|
2739 |
-
"vocab_size": 8021,
|
2740 |
-
"n_bytes": 1814876,
|
2741 |
-
"n_tokens": 915612,
|
2742 |
-
"n_chars": 1784021
|
2743 |
-
},
|
2744 |
-
"skywork_13b_base.cc100-de": {
|
2745 |
-
"vocab_size": 65519,
|
2746 |
-
"n_bytes": 1814876,
|
2747 |
-
"n_tokens": 537308,
|
2748 |
-
"n_chars": 1784021
|
2749 |
-
},
|
2750 |
-
"skywork_13b_math.cc100-de": {
|
2751 |
-
"vocab_size": 65519,
|
2752 |
-
"n_bytes": 1814876,
|
2753 |
-
"n_tokens": 537308,
|
2754 |
-
"n_chars": 1784021
|
2755 |
-
},
|
2756 |
-
"solar_10_7b.cc100-de": {
|
2757 |
-
"vocab_size": 32000,
|
2758 |
-
"n_bytes": 1814876,
|
2759 |
-
"n_tokens": 577526,
|
2760 |
-
"n_chars": 1784021
|
2761 |
-
},
|
2762 |
-
"starchat_alpha.cc100-de": {
|
2763 |
-
"vocab_size": 49156,
|
2764 |
-
"n_bytes": 1814876,
|
2765 |
-
"n_tokens": 620541,
|
2766 |
-
"n_chars": 1784021
|
2767 |
-
},
|
2768 |
-
"switch_c_2048.cc100-de": {
|
2769 |
-
"vocab_size": 32100,
|
2770 |
-
"n_bytes": 1814876,
|
2771 |
-
"n_tokens": 480254,
|
2772 |
-
"n_chars": 1784021
|
2773 |
-
},
|
2774 |
-
"t5_base.cc100-de": {
|
2775 |
-
"vocab_size": 32100,
|
2776 |
-
"n_bytes": 1814876,
|
2777 |
-
"n_tokens": 480254,
|
2778 |
-
"n_chars": 1784021
|
2779 |
-
},
|
2780 |
-
"t5_large.cc100-de": {
|
2781 |
-
"vocab_size": 32100,
|
2782 |
-
"n_bytes": 1814876,
|
2783 |
-
"n_tokens": 480254,
|
2784 |
-
"n_chars": 1784021
|
2785 |
-
},
|
2786 |
-
"t5_small.cc100-de": {
|
2787 |
-
"vocab_size": 32100,
|
2788 |
-
"n_bytes": 1814876,
|
2789 |
-
"n_tokens": 480254,
|
2790 |
-
"n_chars": 1784021
|
2791 |
-
},
|
2792 |
-
"text_davinci_003.cc100-de": {
|
2793 |
-
"vocab_size": 50281,
|
2794 |
-
"n_bytes": 1814876,
|
2795 |
-
"n_tokens": 684666,
|
2796 |
-
"n_chars": 1784021
|
2797 |
-
},
|
2798 |
-
"tigerbot_13b_chat_v2.cc100-de": {
|
2799 |
-
"vocab_size": 60515,
|
2800 |
-
"n_bytes": 1814876,
|
2801 |
-
"n_tokens": 528918,
|
2802 |
-
"n_chars": 1784021
|
2803 |
-
},
|
2804 |
-
"tigerbot_70b_chat_v4_4k.cc100-de": {
|
2805 |
-
"vocab_size": 65110,
|
2806 |
-
"n_bytes": 1814876,
|
2807 |
-
"n_tokens": 529170,
|
2808 |
-
"n_chars": 1784021
|
2809 |
-
},
|
2810 |
-
"wizardcoder_15b_v1.cc100-de": {
|
2811 |
-
"vocab_size": 49153,
|
2812 |
-
"n_bytes": 1814876,
|
2813 |
-
"n_tokens": 620541,
|
2814 |
-
"n_chars": 1784021
|
2815 |
-
},
|
2816 |
-
"wizardcoder_python_7b_v1.cc100-de": {
|
2817 |
-
"vocab_size": 32001,
|
2818 |
-
"n_bytes": 1814876,
|
2819 |
-
"n_tokens": 537320,
|
2820 |
-
"n_chars": 1784021
|
2821 |
-
},
|
2822 |
-
"wizardlm_7b_v1.cc100-de": {
|
2823 |
-
"vocab_size": 32001,
|
2824 |
-
"n_bytes": 1814876,
|
2825 |
-
"n_tokens": 537320,
|
2826 |
-
"n_chars": 1784021
|
2827 |
-
},
|
2828 |
-
"wizardmath_70b_v1.cc100-de": {
|
2829 |
-
"vocab_size": 32002,
|
2830 |
-
"n_bytes": 1814876,
|
2831 |
-
"n_tokens": 537320,
|
2832 |
-
"n_chars": 1784021
|
2833 |
-
},
|
2834 |
-
"xlm_roberta.cc100-de": {
|
2835 |
-
"vocab_size": 250002,
|
2836 |
-
"n_bytes": 1814876,
|
2837 |
-
"n_tokens": 432571,
|
2838 |
-
"n_chars": 1784021
|
2839 |
-
},
|
2840 |
-
"yi_34b.cc100-de": {
|
2841 |
-
"vocab_size": 64000,
|
2842 |
-
"n_bytes": 1814876,
|
2843 |
-
"n_tokens": 698366,
|
2844 |
-
"n_chars": 1784021
|
2845 |
-
},
|
2846 |
-
"yi_6b.cc100-de": {
|
2847 |
-
"vocab_size": 64000,
|
2848 |
-
"n_bytes": 1814876,
|
2849 |
-
"n_tokens": 698366,
|
2850 |
-
"n_chars": 1784021
|
2851 |
-
},
|
2852 |
-
"yi_vl34b.cc100-de": {
|
2853 |
-
"vocab_size": 64000,
|
2854 |
-
"n_bytes": 1814876,
|
2855 |
-
"n_tokens": 697065,
|
2856 |
-
"n_chars": 1784021
|
2857 |
-
},
|
2858 |
-
"zephyr_7b_beta.cc100-de": {
|
2859 |
-
"vocab_size": 32000,
|
2860 |
-
"n_bytes": 1814876,
|
2861 |
-
"n_tokens": 577526,
|
2862 |
-
"n_chars": 1784021
|
2863 |
-
},
|
2864 |
-
"gpt_neox_japanese_2_7b.cc100-es": {
|
2865 |
-
"vocab_size": 32000,
|
2866 |
-
"n_bytes": 1664455,
|
2867 |
-
"n_tokens": 1658946,
|
2868 |
-
"n_chars": 1630297
|
2869 |
-
},
|
2870 |
-
"gpt_neox_japanese_2_7b.cc100-fr": {
|
2871 |
-
"vocab_size": 32000,
|
2872 |
-
"n_bytes": 1540504,
|
2873 |
-
"n_tokens": 1524129,
|
2874 |
-
"n_chars": 1484970
|
2875 |
-
},
|
2876 |
-
"character_glm_6b.cc100-ja": {
|
2877 |
-
"vocab_size": 64789,
|
2878 |
-
"n_bytes": 1774770,
|
2879 |
-
"n_tokens": 601380,
|
2880 |
-
"n_chars": 603065
|
2881 |
-
},
|
2882 |
-
"chatglm2_6b.cc100-ja": {
|
2883 |
-
"vocab_size": 64787,
|
2884 |
-
"n_bytes": 1774770,
|
2885 |
-
"n_tokens": 601380,
|
2886 |
-
"n_chars": 603065
|
2887 |
-
},
|
2888 |
-
"chatglm3_6b.cc100-ja": {
|
2889 |
-
"vocab_size": 64796,
|
2890 |
-
"n_bytes": 1774770,
|
2891 |
-
"n_tokens": 601380,
|
2892 |
-
"n_chars": 603065
|
2893 |
-
},
|
2894 |
-
"chatglm_6b.cc100-ja": {
|
2895 |
-
"vocab_size": 150344,
|
2896 |
-
"n_bytes": 1774770,
|
2897 |
-
"n_tokens": 489930,
|
2898 |
-
"n_chars": 603065
|
2899 |
-
},
|
2900 |
-
"chatyuan_large_v2.cc100-ja": {
|
2901 |
-
"vocab_size": 32128,
|
2902 |
-
"n_bytes": 1774770,
|
2903 |
-
"n_tokens": 575118,
|
2904 |
-
"n_chars": 603065
|
2905 |
-
},
|
2906 |
-
"chinese_llama.cc100-ja": {
|
2907 |
-
"vocab_size": 49953,
|
2908 |
-
"n_bytes": 1774770,
|
2909 |
-
"n_tokens": 614177,
|
2910 |
-
"n_chars": 603065
|
2911 |
-
},
|
2912 |
-
"chinese_llama2.cc100-ja": {
|
2913 |
-
"vocab_size": 55296,
|
2914 |
-
"n_bytes": 1774770,
|
2915 |
-
"n_tokens": 624362,
|
2916 |
-
"n_chars": 603065
|
2917 |
-
},
|
2918 |
-
"code_davinci_002.cc100-ja": {
|
2919 |
-
"vocab_size": 50281,
|
2920 |
-
"n_bytes": 1774770,
|
2921 |
-
"n_tokens": 844362,
|
2922 |
-
"n_chars": 603065
|
2923 |
-
},
|
2924 |
-
"crystal_coder.cc100-ja": {
|
2925 |
-
"vocab_size": 32022,
|
2926 |
-
"n_bytes": 1774770,
|
2927 |
-
"n_tokens": 718461,
|
2928 |
-
"n_chars": 603065
|
2929 |
-
},
|
2930 |
-
"dbrx_instruct.cc100-ja": {
|
2931 |
-
"vocab_size": 100280,
|
2932 |
-
"n_bytes": 1774770,
|
2933 |
-
"n_tokens": 630348,
|
2934 |
-
"n_chars": 603065
|
2935 |
-
},
|
2936 |
-
"deepseek_coder_33b_instruct.cc100-ja": {
|
2937 |
-
"vocab_size": 32022,
|
2938 |
-
"n_bytes": 1774770,
|
2939 |
-
"n_tokens": 1018060,
|
2940 |
-
"n_chars": 603065
|
2941 |
-
},
|
2942 |
-
"deepseek_llm_7b_base.cc100-ja": {
|
2943 |
-
"vocab_size": 100015,
|
2944 |
-
"n_bytes": 1774770,
|
2945 |
-
"n_tokens": 761467,
|
2946 |
-
"n_chars": 603065
|
2947 |
-
},
|
2948 |
-
"falcon_180b.cc100-ja": {
|
2949 |
-
"vocab_size": 65024,
|
2950 |
-
"n_bytes": 1774770,
|
2951 |
-
"n_tokens": 842458,
|
2952 |
-
"n_chars": 603065
|
2953 |
-
},
|
2954 |
-
"falcon_7b.cc100-ja": {
|
2955 |
-
"vocab_size": 65024,
|
2956 |
-
"n_bytes": 1774770,
|
2957 |
-
"n_tokens": 842458,
|
2958 |
-
"n_chars": 603065
|
2959 |
-
},
|
2960 |
-
"fastchat_t5_3b.cc100-ja": {
|
2961 |
-
"vocab_size": 32110,
|
2962 |
-
"n_bytes": 1774770,
|
2963 |
-
"n_tokens": 53915,
|
2964 |
-
"n_chars": 603065
|
2965 |
-
},
|
2966 |
-
"flan_t5_base.cc100-ja": {
|
2967 |
-
"vocab_size": 32100,
|
2968 |
-
"n_bytes": 1774770,
|
2969 |
-
"n_tokens": 51999,
|
2970 |
-
"n_chars": 603065
|
2971 |
-
},
|
2972 |
-
"gemma_7b.cc100-ja": {
|
2973 |
-
"vocab_size": 256000,
|
2974 |
-
"n_bytes": 1774770,
|
2975 |
-
"n_tokens": 317873,
|
2976 |
-
"n_chars": 603065
|
2977 |
-
},
|
2978 |
-
"gpt2.cc100-ja": {
|
2979 |
-
"vocab_size": 50257,
|
2980 |
-
"n_bytes": 1774770,
|
2981 |
-
"n_tokens": 844362,
|
2982 |
-
"n_chars": 603065
|
2983 |
-
},
|
2984 |
-
"gpt2_chinese.cc100-ja": {
|
2985 |
-
"vocab_size": 21128,
|
2986 |
-
"n_bytes": 1774770,
|
2987 |
-
"n_tokens": 503085,
|
2988 |
-
"n_chars": 603065
|
2989 |
-
},
|
2990 |
-
"gpt_35_turbo.cc100-ja": {
|
2991 |
-
"vocab_size": 100277,
|
2992 |
-
"n_bytes": 1774770,
|
2993 |
-
"n_tokens": 630348,
|
2994 |
-
"n_chars": 603065
|
2995 |
-
},
|
2996 |
-
"gpt_4.cc100-ja": {
|
2997 |
-
"vocab_size": 100277,
|
2998 |
-
"n_bytes": 1774770,
|
2999 |
-
"n_tokens": 630348,
|
3000 |
-
"n_chars": 603065
|
3001 |
-
},
|
3002 |
-
"gpt_neox_japanese_2_7b.cc100-ja": {
|
3003 |
-
"vocab_size": 32000,
|
3004 |
-
"n_bytes": 1774770,
|
3005 |
-
"n_tokens": 410803,
|
3006 |
-
"n_chars": 603065
|
3007 |
-
},
|
3008 |
-
"gpt_nexo_20b.cc100-ja": {
|
3009 |
-
"vocab_size": 50277,
|
3010 |
-
"n_bytes": 1774770,
|
3011 |
-
"n_tokens": 605168,
|
3012 |
-
"n_chars": 603065
|
3013 |
-
},
|
3014 |
-
"grok_1.cc100-ja": {
|
3015 |
-
"vocab_size": 131072,
|
3016 |
-
"n_bytes": 1774770,
|
3017 |
-
"n_tokens": 497590,
|
3018 |
-
"n_chars": 603065
|
3019 |
-
},
|
3020 |
-
"internlm2_chat_7b.cc100-ja": {
|
3021 |
-
"vocab_size": 92544,
|
3022 |
-
"n_bytes": 1774770,
|
3023 |
-
"n_tokens": 595803,
|
3024 |
-
"n_chars": 603065
|
3025 |
-
},
|
3026 |
-
"internlm2_math_7b.cc100-ja": {
|
3027 |
-
"vocab_size": 92544,
|
3028 |
-
"n_bytes": 1774770,
|
3029 |
-
"n_tokens": 595803,
|
3030 |
-
"n_chars": 603065
|
3031 |
-
},
|
3032 |
-
"internlm_chat_7b.cc100-ja": {
|
3033 |
-
"vocab_size": 103168,
|
3034 |
-
"n_bytes": 1774770,
|
3035 |
-
"n_tokens": 448212,
|
3036 |
-
"n_chars": 603065
|
3037 |
-
},
|
3038 |
-
"internlm_xcomposer_7b.cc100-ja": {
|
3039 |
-
"vocab_size": 103168,
|
3040 |
-
"n_bytes": 1774770,
|
3041 |
-
"n_tokens": 448212,
|
3042 |
-
"n_chars": 603065
|
3043 |
-
},
|
3044 |
-
"jamba_v0_1.cc100-ja": {
|
3045 |
-
"vocab_size": 65536,
|
3046 |
-
"n_bytes": 1774770,
|
3047 |
-
"n_tokens": 683256,
|
3048 |
-
"n_chars": 603065
|
3049 |
-
},
|
3050 |
-
"kplug.cc100-ja": {
|
3051 |
-
"vocab_size": 10261,
|
3052 |
-
"n_bytes": 1774770,
|
3053 |
-
"n_tokens": 338023,
|
3054 |
-
"n_chars": 603065
|
3055 |
-
},
|
3056 |
-
"llama.cc100-ja": {
|
3057 |
-
"vocab_size": 32000,
|
3058 |
-
"n_bytes": 1774770,
|
3059 |
-
"n_tokens": 728461,
|
3060 |
-
"n_chars": 603065
|
3061 |
-
},
|
3062 |
-
"llama2.cc100-ja": {
|
3063 |
-
"vocab_size": 32001,
|
3064 |
-
"n_bytes": 1774770,
|
3065 |
-
"n_tokens": 728461,
|
3066 |
-
"n_chars": 603065
|
3067 |
-
},
|
3068 |
-
"llama3.cc100-ja": {
|
3069 |
-
"vocab_size": 128256,
|
3070 |
-
"n_bytes": 1774770,
|
3071 |
-
"n_tokens": 414715,
|
3072 |
-
"n_chars": 603065
|
3073 |
-
},
|
3074 |
-
"mistral_7b.cc100-ja": {
|
3075 |
-
"vocab_size": 32000,
|
3076 |
-
"n_bytes": 1774770,
|
3077 |
-
"n_tokens": 685134,
|
3078 |
-
"n_chars": 603065
|
3079 |
-
},
|
3080 |
-
"mixtral_8_7b.cc100-ja": {
|
3081 |
-
"vocab_size": 32000,
|
3082 |
-
"n_bytes": 1774770,
|
3083 |
-
"n_tokens": 685134,
|
3084 |
-
"n_chars": 603065
|
3085 |
-
},
|
3086 |
-
"mobilebert_uncased.cc100-ja": {
|
3087 |
-
"vocab_size": 30522,
|
3088 |
-
"n_bytes": 1774770,
|
3089 |
-
"n_tokens": 580634,
|
3090 |
-
"n_chars": 603065
|
3091 |
-
},
|
3092 |
-
"moss.cc100-ja": {
|
3093 |
-
"vocab_size": 106072,
|
3094 |
-
"n_bytes": 1774770,
|
3095 |
-
"n_tokens": 600011,
|
3096 |
-
"n_chars": 603065
|
3097 |
-
},
|
3098 |
-
"mt5_large.cc100-ja": {
|
3099 |
-
"vocab_size": 250100,
|
3100 |
-
"n_bytes": 1774770,
|
3101 |
-
"n_tokens": 300542,
|
3102 |
-
"n_chars": 603065
|
3103 |
-
},
|
3104 |
-
"olmo_7b.cc100-ja": {
|
3105 |
-
"vocab_size": 50280,
|
3106 |
-
"n_bytes": 1774770,
|
3107 |
-
"n_tokens": 605168,
|
3108 |
-
"n_chars": 603065
|
3109 |
-
},
|
3110 |
-
"orion_14b_chat.cc100-ja": {
|
3111 |
-
"vocab_size": 84608,
|
3112 |
-
"n_bytes": 1774770,
|
3113 |
-
"n_tokens": 324956,
|
3114 |
-
"n_chars": 603065
|
3115 |
-
},
|
3116 |
-
"phi_1.cc100-ja": {
|
3117 |
-
"vocab_size": 50295,
|
3118 |
-
"n_bytes": 1774770,
|
3119 |
-
"n_tokens": 844362,
|
3120 |
-
"n_chars": 603065
|
3121 |
-
},
|
3122 |
-
"phi_2.cc100-ja": {
|
3123 |
-
"vocab_size": 50295,
|
3124 |
-
"n_bytes": 1774770,
|
3125 |
-
"n_tokens": 844362,
|
3126 |
-
"n_chars": 603065
|
3127 |
-
},
|
3128 |
-
"phi_3_mini.cc100-ja": {
|
3129 |
-
"vocab_size": 32011,
|
3130 |
-
"n_bytes": 1774770,
|
3131 |
-
"n_tokens": 728461,
|
3132 |
-
"n_chars": 603065
|
3133 |
-
},
|
3134 |
-
"pko_t5_large.cc100-ja": {
|
3135 |
-
"vocab_size": 50358,
|
3136 |
-
"n_bytes": 1774770,
|
3137 |
-
"n_tokens": 1766950,
|
3138 |
-
"n_chars": 603065
|
3139 |
-
},
|
3140 |
-
"prompt_clue.cc100-ja": {
|
3141 |
-
"vocab_size": 32128,
|
3142 |
-
"n_bytes": 1774770,
|
3143 |
-
"n_tokens": 575118,
|
3144 |
-
"n_chars": 603065
|
3145 |
-
},
|
3146 |
-
"qwen1_5_14b_chat.cc100-ja": {
|
3147 |
-
"vocab_size": 151646,
|
3148 |
-
"n_bytes": 1774770,
|
3149 |
-
"n_tokens": 377144,
|
3150 |
-
"n_chars": 603065
|
3151 |
-
},
|
3152 |
-
"qwen_1_8b_chat.cc100-ja": {
|
3153 |
-
"vocab_size": 151851,
|
3154 |
-
"n_bytes": 1774770,
|
3155 |
-
"n_tokens": 377144,
|
3156 |
-
"n_chars": 603065
|
3157 |
-
},
|
3158 |
-
"qwen_72b_chat.cc100-ja": {
|
3159 |
-
"vocab_size": 151851,
|
3160 |
-
"n_bytes": 1774770,
|
3161 |
-
"n_tokens": 377144,
|
3162 |
-
"n_chars": 603065
|
3163 |
-
},
|
3164 |
-
"qwen_7b_chat.cc100-ja": {
|
3165 |
-
"vocab_size": 151851,
|
3166 |
-
"n_bytes": 1774770,
|
3167 |
-
"n_tokens": 377144,
|
3168 |
-
"n_chars": 603065
|
3169 |
-
},
|
3170 |
-
"roberta_chinese_clue.cc100-ja": {
|
3171 |
-
"vocab_size": 8021,
|
3172 |
-
"n_bytes": 1774770,
|
3173 |
-
"n_tokens": 339411,
|
3174 |
-
"n_chars": 603065
|
3175 |
-
},
|
3176 |
-
"skywork_13b_base.cc100-ja": {
|
3177 |
-
"vocab_size": 65519,
|
3178 |
-
"n_bytes": 1774770,
|
3179 |
-
"n_tokens": 603613,
|
3180 |
-
"n_chars": 603065
|
3181 |
-
},
|
3182 |
-
"skywork_13b_math.cc100-ja": {
|
3183 |
-
"vocab_size": 65519,
|
3184 |
-
"n_bytes": 1774770,
|
3185 |
-
"n_tokens": 603613,
|
3186 |
-
"n_chars": 603065
|
3187 |
-
},
|
3188 |
-
"solar_10_7b.cc100-ja": {
|
3189 |
-
"vocab_size": 32000,
|
3190 |
-
"n_bytes": 1774770,
|
3191 |
-
"n_tokens": 685134,
|
3192 |
-
"n_chars": 603065
|
3193 |
-
},
|
3194 |
-
"starchat_alpha.cc100-ja": {
|
3195 |
-
"vocab_size": 49156,
|
3196 |
-
"n_bytes": 1774770,
|
3197 |
-
"n_tokens": 546876,
|
3198 |
-
"n_chars": 603065
|
3199 |
-
},
|
3200 |
-
"switch_c_2048.cc100-ja": {
|
3201 |
-
"vocab_size": 32100,
|
3202 |
-
"n_bytes": 1774770,
|
3203 |
-
"n_tokens": 51947,
|
3204 |
-
"n_chars": 603065
|
3205 |
-
},
|
3206 |
-
"t5_base.cc100-ja": {
|
3207 |
-
"vocab_size": 32100,
|
3208 |
-
"n_bytes": 1774770,
|
3209 |
-
"n_tokens": 51947,
|
3210 |
-
"n_chars": 603065
|
3211 |
-
},
|
3212 |
-
"t5_large.cc100-ja": {
|
3213 |
-
"vocab_size": 32100,
|
3214 |
-
"n_bytes": 1774770,
|
3215 |
-
"n_tokens": 51947,
|
3216 |
-
"n_chars": 603065
|
3217 |
-
},
|
3218 |
-
"t5_small.cc100-ja": {
|
3219 |
-
"vocab_size": 32100,
|
3220 |
-
"n_bytes": 1774770,
|
3221 |
-
"n_tokens": 51947,
|
3222 |
-
"n_chars": 603065
|
3223 |
-
},
|
3224 |
-
"text_davinci_003.cc100-ja": {
|
3225 |
-
"vocab_size": 50281,
|
3226 |
-
"n_bytes": 1774770,
|
3227 |
-
"n_tokens": 844362,
|
3228 |
-
"n_chars": 603065
|
3229 |
-
},
|
3230 |
-
"tigerbot_13b_chat_v2.cc100-ja": {
|
3231 |
-
"vocab_size": 60515,
|
3232 |
-
"n_bytes": 1774770,
|
3233 |
-
"n_tokens": 567792,
|
3234 |
-
"n_chars": 603065
|
3235 |
-
},
|
3236 |
-
"tigerbot_70b_chat_v4_4k.cc100-ja": {
|
3237 |
-
"vocab_size": 65110,
|
3238 |
-
"n_bytes": 1774770,
|
3239 |
-
"n_tokens": 406571,
|
3240 |
-
"n_chars": 603065
|
3241 |
-
},
|
3242 |
-
"wizardcoder_15b_v1.cc100-ja": {
|
3243 |
-
"vocab_size": 49153,
|
3244 |
-
"n_bytes": 1774770,
|
3245 |
-
"n_tokens": 546876,
|
3246 |
-
"n_chars": 603065
|
3247 |
-
},
|
3248 |
-
"wizardcoder_python_7b_v1.cc100-ja": {
|
3249 |
-
"vocab_size": 32001,
|
3250 |
-
"n_bytes": 1774770,
|
3251 |
-
"n_tokens": 728461,
|
3252 |
-
"n_chars": 603065
|
3253 |
-
},
|
3254 |
-
"wizardlm_7b_v1.cc100-ja": {
|
3255 |
-
"vocab_size": 32001,
|
3256 |
-
"n_bytes": 1774770,
|
3257 |
-
"n_tokens": 728461,
|
3258 |
-
"n_chars": 603065
|
3259 |
-
},
|
3260 |
-
"wizardmath_70b_v1.cc100-ja": {
|
3261 |
-
"vocab_size": 32002,
|
3262 |
-
"n_bytes": 1774770,
|
3263 |
-
"n_tokens": 728461,
|
3264 |
-
"n_chars": 603065
|
3265 |
-
},
|
3266 |
-
"xlm_roberta.cc100-ja": {
|
3267 |
-
"vocab_size": 250002,
|
3268 |
-
"n_bytes": 1774770,
|
3269 |
-
"n_tokens": 344820,
|
3270 |
-
"n_chars": 603065
|
3271 |
-
},
|
3272 |
-
"yi_34b.cc100-ja": {
|
3273 |
-
"vocab_size": 64000,
|
3274 |
-
"n_bytes": 1774770,
|
3275 |
-
"n_tokens": 740791,
|
3276 |
-
"n_chars": 603065
|
3277 |
-
},
|
3278 |
-
"yi_6b.cc100-ja": {
|
3279 |
-
"vocab_size": 64000,
|
3280 |
-
"n_bytes": 1774770,
|
3281 |
-
"n_tokens": 740791,
|
3282 |
-
"n_chars": 603065
|
3283 |
-
},
|
3284 |
-
"yi_vl34b.cc100-ja": {
|
3285 |
-
"vocab_size": 64000,
|
3286 |
-
"n_bytes": 1774770,
|
3287 |
-
"n_tokens": 749927,
|
3288 |
-
"n_chars": 603065
|
3289 |
-
},
|
3290 |
-
"zephyr_7b_beta.cc100-ja": {
|
3291 |
-
"vocab_size": 32000,
|
3292 |
-
"n_bytes": 1774770,
|
3293 |
-
"n_tokens": 685134,
|
3294 |
-
"n_chars": 603065
|
3295 |
-
},
|
3296 |
-
"llama_3_chinese_8b.cc100-ar": {
|
3297 |
-
"vocab_size": 128256,
|
3298 |
-
"n_bytes": 2813283,
|
3299 |
-
"n_tokens": 625514,
|
3300 |
-
"n_chars": 1560987
|
3301 |
-
},
|
3302 |
-
"llama_3_chinese_8b.cc100-de": {
|
3303 |
-
"vocab_size": 128256,
|
3304 |
-
"n_bytes": 1814876,
|
3305 |
-
"n_tokens": 509766,
|
3306 |
-
"n_chars": 1784021
|
3307 |
-
},
|
3308 |
-
"llama_3_chinese_8b.cc100-en": {
|
3309 |
-
"vocab_size": 128256,
|
3310 |
-
"n_bytes": 1124813,
|
3311 |
-
"n_tokens": 264944,
|
3312 |
-
"n_chars": 1121360
|
3313 |
-
},
|
3314 |
-
"llama_3_chinese_8b.cc100-es": {
|
3315 |
-
"vocab_size": 128256,
|
3316 |
-
"n_bytes": 1664455,
|
3317 |
-
"n_tokens": 443289,
|
3318 |
-
"n_chars": 1630297
|
3319 |
-
},
|
3320 |
-
"aya_101.cc100-fa": {
|
3321 |
-
"vocab_size": 250100,
|
3322 |
-
"n_bytes": 2054052,
|
3323 |
-
"n_tokens": 429922,
|
3324 |
-
"n_chars": 1145876
|
3325 |
-
},
|
3326 |
-
"baichuan.cc100-fa": {
|
3327 |
-
"vocab_size": 64000,
|
3328 |
-
"n_bytes": 2054052,
|
3329 |
-
"n_tokens": 1142057,
|
3330 |
-
"n_chars": 1145876
|
3331 |
-
},
|
3332 |
-
"baichuan2.cc100-fa": {
|
3333 |
-
"vocab_size": 125696,
|
3334 |
-
"n_bytes": 2054052,
|
3335 |
-
"n_tokens": 1052077,
|
3336 |
-
"n_chars": 1145876
|
3337 |
-
},
|
3338 |
-
"bert_base_cased.cc100-fa": {
|
3339 |
-
"vocab_size": 28996,
|
3340 |
-
"n_bytes": 2054052,
|
3341 |
-
"n_tokens": 903078,
|
3342 |
-
"n_chars": 1145876
|
3343 |
-
},
|
3344 |
-
"bert_base_chinese.cc100-fa": {
|
3345 |
-
"vocab_size": 21128,
|
3346 |
-
"n_bytes": 2054052,
|
3347 |
-
"n_tokens": 396414,
|
3348 |
-
"n_chars": 1145876
|
3349 |
-
},
|
3350 |
-
"bert_base_uncased.cc100-fa": {
|
3351 |
-
"vocab_size": 30522,
|
3352 |
-
"n_bytes": 2054052,
|
3353 |
-
"n_tokens": 910783,
|
3354 |
-
"n_chars": 1145876
|
3355 |
-
},
|
3356 |
-
"bloom.cc100-fa": {
|
3357 |
-
"vocab_size": 250680,
|
3358 |
-
"n_bytes": 2054052,
|
3359 |
-
"n_tokens": 434406,
|
3360 |
-
"n_chars": 1145876
|
3361 |
-
},
|
3362 |
-
"byt5_small.cc100-fa": {
|
3363 |
-
"vocab_size": 384,
|
3364 |
-
"n_bytes": 2054052,
|
3365 |
-
"n_tokens": 2064052,
|
3366 |
-
"n_chars": 1145876
|
3367 |
-
},
|
3368 |
-
"character_glm_6b.cc100-fa": {
|
3369 |
-
"vocab_size": 64789,
|
3370 |
-
"n_bytes": 2054052,
|
3371 |
-
"n_tokens": 1165051,
|
3372 |
-
"n_chars": 1145876
|
3373 |
-
},
|
3374 |
-
"chatglm2_6b.cc100-fa": {
|
3375 |
-
"vocab_size": 64787,
|
3376 |
-
"n_bytes": 2054052,
|
3377 |
-
"n_tokens": 1165051,
|
3378 |
-
"n_chars": 1145876
|
3379 |
-
},
|
3380 |
-
"chatglm3_6b.cc100-fa": {
|
3381 |
-
"vocab_size": 64796,
|
3382 |
-
"n_bytes": 2054052,
|
3383 |
-
"n_tokens": 1165051,
|
3384 |
-
"n_chars": 1145876
|
3385 |
-
},
|
3386 |
-
"chatglm_6b.cc100-fa": {
|
3387 |
-
"vocab_size": 150344,
|
3388 |
-
"n_bytes": 2054052,
|
3389 |
-
"n_tokens": 910808,
|
3390 |
-
"n_chars": 1145876
|
3391 |
-
},
|
3392 |
-
"chatyuan_large_v2.cc100-fa": {
|
3393 |
-
"vocab_size": 32128,
|
3394 |
-
"n_bytes": 2054052,
|
3395 |
-
"n_tokens": 740377,
|
3396 |
-
"n_chars": 1145876
|
3397 |
-
},
|
3398 |
-
"chinese_llama.cc100-fa": {
|
3399 |
-
"vocab_size": 49953,
|
3400 |
-
"n_bytes": 2054052,
|
3401 |
-
"n_tokens": 1150750,
|
3402 |
-
"n_chars": 1145876
|
3403 |
-
},
|
3404 |
-
"chinese_llama2.cc100-fa": {
|
3405 |
-
"vocab_size": 55296,
|
3406 |
-
"n_bytes": 2054052,
|
3407 |
-
"n_tokens": 1155078,
|
3408 |
-
"n_chars": 1145876
|
3409 |
-
},
|
3410 |
-
"code_davinci_002.cc100-fa": {
|
3411 |
-
"vocab_size": 50281,
|
3412 |
-
"n_bytes": 2054052,
|
3413 |
-
"n_tokens": 1292300,
|
3414 |
-
"n_chars": 1145876
|
3415 |
-
},
|
3416 |
-
"crystal_coder.cc100-fa": {
|
3417 |
-
"vocab_size": 32022,
|
3418 |
-
"n_bytes": 2054052,
|
3419 |
-
"n_tokens": 1145076,
|
3420 |
-
"n_chars": 1145876
|
3421 |
-
},
|
3422 |
-
"dbrx_instruct.cc100-fa": {
|
3423 |
-
"vocab_size": 100280,
|
3424 |
-
"n_bytes": 2054052,
|
3425 |
-
"n_tokens": 818067,
|
3426 |
-
"n_chars": 1145876
|
3427 |
-
},
|
3428 |
-
"deepseek_coder_33b_instruct.cc100-fa": {
|
3429 |
-
"vocab_size": 32022,
|
3430 |
-
"n_bytes": 2054052,
|
3431 |
-
"n_tokens": 1326109,
|
3432 |
-
"n_chars": 1145876
|
3433 |
-
},
|
3434 |
-
"deepseek_llm_7b_base.cc100-fa": {
|
3435 |
-
"vocab_size": 100015,
|
3436 |
-
"n_bytes": 2054052,
|
3437 |
-
"n_tokens": 973451,
|
3438 |
-
"n_chars": 1145876
|
3439 |
-
},
|
3440 |
-
"falcon_180b.cc100-fa": {
|
3441 |
-
"vocab_size": 65024,
|
3442 |
-
"n_bytes": 2054052,
|
3443 |
-
"n_tokens": 1246580,
|
3444 |
-
"n_chars": 1145876
|
3445 |
-
},
|
3446 |
-
"falcon_7b.cc100-fa": {
|
3447 |
-
"vocab_size": 65024,
|
3448 |
-
"n_bytes": 2054052,
|
3449 |
-
"n_tokens": 1246580,
|
3450 |
-
"n_chars": 1145876
|
3451 |
-
},
|
3452 |
-
"fastchat_t5_3b.cc100-fa": {
|
3453 |
-
"vocab_size": 32110,
|
3454 |
-
"n_bytes": 2054052,
|
3455 |
-
"n_tokens": 712443,
|
3456 |
-
"n_chars": 1145876
|
3457 |
-
},
|
3458 |
-
"flan_t5_base.cc100-fa": {
|
3459 |
-
"vocab_size": 32100,
|
3460 |
-
"n_bytes": 2054052,
|
3461 |
-
"n_tokens": 493779,
|
3462 |
-
"n_chars": 1145876
|
3463 |
-
},
|
3464 |
-
"gemma_7b.cc100-fa": {
|
3465 |
-
"vocab_size": 256000,
|
3466 |
-
"n_bytes": 2054052,
|
3467 |
-
"n_tokens": 373762,
|
3468 |
-
"n_chars": 1145876
|
3469 |
-
},
|
3470 |
-
"gpt2.cc100-fa": {
|
3471 |
-
"vocab_size": 50257,
|
3472 |
-
"n_bytes": 2054052,
|
3473 |
-
"n_tokens": 1292300,
|
3474 |
-
"n_chars": 1145876
|
3475 |
-
},
|
3476 |
-
"gpt2_chinese.cc100-fa": {
|
3477 |
-
"vocab_size": 21128,
|
3478 |
-
"n_bytes": 2054052,
|
3479 |
-
"n_tokens": 406174,
|
3480 |
-
"n_chars": 1145876
|
3481 |
-
},
|
3482 |
-
"gpt_35_turbo.cc100-fa": {
|
3483 |
-
"vocab_size": 100277,
|
3484 |
-
"n_bytes": 2054052,
|
3485 |
-
"n_tokens": 818067,
|
3486 |
-
"n_chars": 1145876
|
3487 |
-
},
|
3488 |
-
"gpt_4.cc100-fa": {
|
3489 |
-
"vocab_size": 100277,
|
3490 |
-
"n_bytes": 2054052,
|
3491 |
-
"n_tokens": 818067,
|
3492 |
-
"n_chars": 1145876
|
3493 |
-
},
|
3494 |
-
"gpt_neox_japanese_2_7b.cc100-fa": {
|
3495 |
-
"vocab_size": 32000,
|
3496 |
-
"n_bytes": 2054052,
|
3497 |
-
"n_tokens": 2036715,
|
3498 |
-
"n_chars": 1145876
|
3499 |
-
},
|
3500 |
-
"gpt_nexo_20b.cc100-fa": {
|
3501 |
-
"vocab_size": 50277,
|
3502 |
-
"n_bytes": 2054052,
|
3503 |
-
"n_tokens": 866434,
|
3504 |
-
"n_chars": 1145876
|
3505 |
-
},
|
3506 |
-
"grok_1.cc100-fa": {
|
3507 |
-
"vocab_size": 131072,
|
3508 |
-
"n_bytes": 2054052,
|
3509 |
-
"n_tokens": 1073281,
|
3510 |
-
"n_chars": 1145876
|
3511 |
-
},
|
3512 |
-
"internlm2_chat_7b.cc100-fa": {
|
3513 |
-
"vocab_size": 92544,
|
3514 |
-
"n_bytes": 2054052,
|
3515 |
-
"n_tokens": 1195032,
|
3516 |
-
"n_chars": 1145876
|
3517 |
-
},
|
3518 |
-
"internlm2_math_7b.cc100-fa": {
|
3519 |
-
"vocab_size": 92544,
|
3520 |
-
"n_bytes": 2054052,
|
3521 |
-
"n_tokens": 1195032,
|
3522 |
-
"n_chars": 1145876
|
3523 |
-
},
|
3524 |
-
"internlm_chat_7b.cc100-fa": {
|
3525 |
-
"vocab_size": 103168,
|
3526 |
-
"n_bytes": 2054052,
|
3527 |
-
"n_tokens": 640945,
|
3528 |
-
"n_chars": 1145876
|
3529 |
-
},
|
3530 |
-
"internlm_xcomposer_7b.cc100-fa": {
|
3531 |
-
"vocab_size": 103168,
|
3532 |
-
"n_bytes": 2054052,
|
3533 |
-
"n_tokens": 640945,
|
3534 |
-
"n_chars": 1145876
|
3535 |
-
},
|
3536 |
-
"jamba_v0_1.cc100-fa": {
|
3537 |
-
"vocab_size": 65536,
|
3538 |
-
"n_bytes": 2054052,
|
3539 |
-
"n_tokens": 732550,
|
3540 |
-
"n_chars": 1145876
|
3541 |
-
},
|
3542 |
-
"kplug.cc100-fa": {
|
3543 |
-
"vocab_size": 10261,
|
3544 |
-
"n_bytes": 2054052,
|
3545 |
-
"n_tokens": 274671,
|
3546 |
-
"n_chars": 1145876
|
3547 |
-
},
|
3548 |
-
"llama.cc100-fa": {
|
3549 |
-
"vocab_size": 32000,
|
3550 |
-
"n_bytes": 2054052,
|
3551 |
-
"n_tokens": 1155076,
|
3552 |
-
"n_chars": 1145876
|
3553 |
-
},
|
3554 |
-
"llama2.cc100-fa": {
|
3555 |
-
"vocab_size": 32001,
|
3556 |
-
"n_bytes": 2054052,
|
3557 |
-
"n_tokens": 1155076,
|
3558 |
-
"n_chars": 1145876
|
3559 |
-
},
|
3560 |
-
"llama3.cc100-fa": {
|
3561 |
-
"vocab_size": 128256,
|
3562 |
-
"n_bytes": 2054052,
|
3563 |
-
"n_tokens": 387448,
|
3564 |
-
"n_chars": 1145876
|
3565 |
-
},
|
3566 |
-
"llama_3_chinese_8b.cc100-fa": {
|
3567 |
-
"vocab_size": 128256,
|
3568 |
-
"n_bytes": 2054052,
|
3569 |
-
"n_tokens": 397448,
|
3570 |
-
"n_chars": 1145876
|
3571 |
-
},
|
3572 |
-
"mistral_7b.cc100-fa": {
|
3573 |
-
"vocab_size": 32000,
|
3574 |
-
"n_bytes": 2054052,
|
3575 |
-
"n_tokens": 1133278,
|
3576 |
-
"n_chars": 1145876
|
3577 |
-
},
|
3578 |
-
"mixtral_8_7b.cc100-fa": {
|
3579 |
-
"vocab_size": 32000,
|
3580 |
-
"n_bytes": 2054052,
|
3581 |
-
"n_tokens": 1133278,
|
3582 |
-
"n_chars": 1145876
|
3583 |
-
},
|
3584 |
-
"mobilebert_uncased.cc100-fa": {
|
3585 |
-
"vocab_size": 30522,
|
3586 |
-
"n_bytes": 2054052,
|
3587 |
-
"n_tokens": 910783,
|
3588 |
-
"n_chars": 1145876
|
3589 |
-
},
|
3590 |
-
"moss.cc100-fa": {
|
3591 |
-
"vocab_size": 106072,
|
3592 |
-
"n_bytes": 2054052,
|
3593 |
-
"n_tokens": 1285426,
|
3594 |
-
"n_chars": 1145876
|
3595 |
-
},
|
3596 |
-
"mt5_large.cc100-fa": {
|
3597 |
-
"vocab_size": 250100,
|
3598 |
-
"n_bytes": 2054052,
|
3599 |
-
"n_tokens": 429922,
|
3600 |
-
"n_chars": 1145876
|
3601 |
-
},
|
3602 |
-
"olmo_7b.cc100-fa": {
|
3603 |
-
"vocab_size": 50280,
|
3604 |
-
"n_bytes": 2054052,
|
3605 |
-
"n_tokens": 866434,
|
3606 |
-
"n_chars": 1145876
|
3607 |
-
},
|
3608 |
-
"orion_14b_chat.cc100-fa": {
|
3609 |
-
"vocab_size": 84608,
|
3610 |
-
"n_bytes": 2054052,
|
3611 |
-
"n_tokens": 1131108,
|
3612 |
-
"n_chars": 1145876
|
3613 |
-
},
|
3614 |
-
"phi_1.cc100-fa": {
|
3615 |
-
"vocab_size": 50295,
|
3616 |
-
"n_bytes": 2054052,
|
3617 |
-
"n_tokens": 1292300,
|
3618 |
-
"n_chars": 1145876
|
3619 |
-
},
|
3620 |
-
"phi_2.cc100-fa": {
|
3621 |
-
"vocab_size": 50295,
|
3622 |
-
"n_bytes": 2054052,
|
3623 |
-
"n_tokens": 1292300,
|
3624 |
-
"n_chars": 1145876
|
3625 |
-
},
|
3626 |
-
"phi_3_mini.cc100-fa": {
|
3627 |
-
"vocab_size": 32011,
|
3628 |
-
"n_bytes": 2054052,
|
3629 |
-
"n_tokens": 1155076,
|
3630 |
-
"n_chars": 1145876
|
3631 |
-
},
|
3632 |
-
"pko_t5_large.cc100-fa": {
|
3633 |
-
"vocab_size": 50358,
|
3634 |
-
"n_bytes": 2054052,
|
3635 |
-
"n_tokens": 2061040,
|
3636 |
-
"n_chars": 1145876
|
3637 |
-
},
|
3638 |
-
"prompt_clue.cc100-fa": {
|
3639 |
-
"vocab_size": 32128,
|
3640 |
-
"n_bytes": 2054052,
|
3641 |
-
"n_tokens": 740377,
|
3642 |
-
"n_chars": 1145876
|
3643 |
-
},
|
3644 |
-
"qwen1_5_14b_chat.cc100-fa": {
|
3645 |
-
"vocab_size": 151646,
|
3646 |
-
"n_bytes": 2054052,
|
3647 |
-
"n_tokens": 643421,
|
3648 |
-
"n_chars": 1145876
|
3649 |
-
},
|
3650 |
-
"qwen_1_8b_chat.cc100-fa": {
|
3651 |
-
"vocab_size": 151851,
|
3652 |
-
"n_bytes": 2054052,
|
3653 |
-
"n_tokens": 643421,
|
3654 |
-
"n_chars": 1145876
|
3655 |
-
},
|
3656 |
-
"qwen_72b_chat.cc100-fa": {
|
3657 |
-
"vocab_size": 151851,
|
3658 |
-
"n_bytes": 2054052,
|
3659 |
-
"n_tokens": 643421,
|
3660 |
-
"n_chars": 1145876
|
3661 |
-
},
|
3662 |
-
"qwen_7b_chat.cc100-fa": {
|
3663 |
-
"vocab_size": 151851,
|
3664 |
-
"n_bytes": 2054052,
|
3665 |
-
"n_tokens": 643421,
|
3666 |
-
"n_chars": 1145876
|
3667 |
-
},
|
3668 |
-
"roberta_chinese_clue.cc100-fa": {
|
3669 |
-
"vocab_size": 8021,
|
3670 |
-
"n_bytes": 2054052,
|
3671 |
-
"n_tokens": 407763,
|
3672 |
-
"n_chars": 1145876
|
3673 |
-
},
|
3674 |
-
"skywork_13b_base.cc100-fa": {
|
3675 |
-
"vocab_size": 65519,
|
3676 |
-
"n_bytes": 2054052,
|
3677 |
-
"n_tokens": 1155072,
|
3678 |
-
"n_chars": 1145876
|
3679 |
-
},
|
3680 |
-
"skywork_13b_math.cc100-fa": {
|
3681 |
-
"vocab_size": 65519,
|
3682 |
-
"n_bytes": 2054052,
|
3683 |
-
"n_tokens": 1155072,
|
3684 |
-
"n_chars": 1145876
|
3685 |
-
},
|
3686 |
-
"solar_10_7b.cc100-fa": {
|
3687 |
-
"vocab_size": 32000,
|
3688 |
-
"n_bytes": 2054052,
|
3689 |
-
"n_tokens": 1133278,
|
3690 |
-
"n_chars": 1145876
|
3691 |
-
},
|
3692 |
-
"starchat_alpha.cc100-fa": {
|
3693 |
-
"vocab_size": 49156,
|
3694 |
-
"n_bytes": 2054052,
|
3695 |
-
"n_tokens": 851630,
|
3696 |
-
"n_chars": 1145876
|
3697 |
-
},
|
3698 |
-
"switch_c_2048.cc100-fa": {
|
3699 |
-
"vocab_size": 32100,
|
3700 |
-
"n_bytes": 2054052,
|
3701 |
-
"n_tokens": 493767,
|
3702 |
-
"n_chars": 1145876
|
3703 |
-
},
|
3704 |
-
"t5_base.cc100-fa": {
|
3705 |
-
"vocab_size": 32100,
|
3706 |
-
"n_bytes": 2054052,
|
3707 |
-
"n_tokens": 493767,
|
3708 |
-
"n_chars": 1145876
|
3709 |
-
},
|
3710 |
-
"t5_large.cc100-fa": {
|
3711 |
-
"vocab_size": 32100,
|
3712 |
-
"n_bytes": 2054052,
|
3713 |
-
"n_tokens": 493767,
|
3714 |
-
"n_chars": 1145876
|
3715 |
-
},
|
3716 |
-
"t5_small.cc100-fa": {
|
3717 |
-
"vocab_size": 32100,
|
3718 |
-
"n_bytes": 2054052,
|
3719 |
-
"n_tokens": 493767,
|
3720 |
-
"n_chars": 1145876
|
3721 |
-
},
|
3722 |
-
"text_davinci_003.cc100-fa": {
|
3723 |
-
"vocab_size": 50281,
|
3724 |
-
"n_bytes": 2054052,
|
3725 |
-
"n_tokens": 1292300,
|
3726 |
-
"n_chars": 1145876
|
3727 |
-
},
|
3728 |
-
"tigerbot_13b_chat_v2.cc100-fa": {
|
3729 |
-
"vocab_size": 60515,
|
3730 |
-
"n_bytes": 2054052,
|
3731 |
-
"n_tokens": 1145046,
|
3732 |
-
"n_chars": 1145876
|
3733 |
-
},
|
3734 |
-
"tigerbot_70b_chat_v4_4k.cc100-fa": {
|
3735 |
-
"vocab_size": 65110,
|
3736 |
-
"n_bytes": 2054052,
|
3737 |
-
"n_tokens": 1145048,
|
3738 |
-
"n_chars": 1145876
|
3739 |
-
},
|
3740 |
-
"wizardcoder_15b_v1.cc100-fa": {
|
3741 |
-
"vocab_size": 49153,
|
3742 |
-
"n_bytes": 2054052,
|
3743 |
-
"n_tokens": 851630,
|
3744 |
-
"n_chars": 1145876
|
3745 |
-
},
|
3746 |
-
"wizardcoder_python_7b_v1.cc100-fa": {
|
3747 |
-
"vocab_size": 32001,
|
3748 |
-
"n_bytes": 2054052,
|
3749 |
-
"n_tokens": 1155076,
|
3750 |
-
"n_chars": 1145876
|
3751 |
-
},
|
3752 |
-
"wizardlm_7b_v1.cc100-fa": {
|
3753 |
-
"vocab_size": 32001,
|
3754 |
-
"n_bytes": 2054052,
|
3755 |
-
"n_tokens": 1155076,
|
3756 |
-
"n_chars": 1145876
|
3757 |
-
},
|
3758 |
-
"wizardmath_70b_v1.cc100-fa": {
|
3759 |
-
"vocab_size": 32002,
|
3760 |
-
"n_bytes": 2054052,
|
3761 |
-
"n_tokens": 1155076,
|
3762 |
-
"n_chars": 1145876
|
3763 |
-
},
|
3764 |
-
"xlm_roberta.cc100-fa": {
|
3765 |
-
"vocab_size": 250002,
|
3766 |
-
"n_bytes": 2054052,
|
3767 |
-
"n_tokens": 330926,
|
3768 |
-
"n_chars": 1145876
|
3769 |
-
},
|
3770 |
-
"yi_34b.cc100-fa": {
|
3771 |
-
"vocab_size": 64000,
|
3772 |
-
"n_bytes": 2054052,
|
3773 |
-
"n_tokens": 1337264,
|
3774 |
-
"n_chars": 1145876
|
3775 |
-
},
|
3776 |
-
"yi_6b.cc100-fa": {
|
3777 |
-
"vocab_size": 64000,
|
3778 |
-
"n_bytes": 2054052,
|
3779 |
-
"n_tokens": 1337264,
|
3780 |
-
"n_chars": 1145876
|
3781 |
-
},
|
3782 |
-
"yi_vl34b.cc100-fa": {
|
3783 |
-
"vocab_size": 64000,
|
3784 |
-
"n_bytes": 2054052,
|
3785 |
-
"n_tokens": 1346819,
|
3786 |
-
"n_chars": 1145876
|
3787 |
-
},
|
3788 |
-
"zephyr_7b_beta.cc100-fa": {
|
3789 |
-
"vocab_size": 32000,
|
3790 |
-
"n_bytes": 2054052,
|
3791 |
-
"n_tokens": 1133278,
|
3792 |
-
"n_chars": 1145876
|
3793 |
-
},
|
3794 |
-
"llama_3_chinese_8b.cc100-fr": {
|
3795 |
-
"vocab_size": 128256,
|
3796 |
-
"n_bytes": 1540504,
|
3797 |
-
"n_tokens": 422146,
|
3798 |
-
"n_chars": 1484970
|
3799 |
-
},
|
3800 |
-
"llama_3_chinese_8b.cc100-ja": {
|
3801 |
-
"vocab_size": 128256,
|
3802 |
-
"n_bytes": 1774770,
|
3803 |
-
"n_tokens": 424715,
|
3804 |
-
"n_chars": 603065
|
3805 |
-
},
|
3806 |
-
"aya_101.cc100-ko": {
|
3807 |
-
"vocab_size": 250100,
|
3808 |
-
"n_bytes": 1524839,
|
3809 |
-
"n_tokens": 434586,
|
3810 |
-
"n_chars": 655190
|
3811 |
-
},
|
3812 |
-
"baichuan.cc100-ko": {
|
3813 |
-
"vocab_size": 64000,
|
3814 |
-
"n_bytes": 1524839,
|
3815 |
-
"n_tokens": 639258,
|
3816 |
-
"n_chars": 655190
|
3817 |
-
},
|
3818 |
-
"baichuan2.cc100-ko": {
|
3819 |
-
"vocab_size": 125696,
|
3820 |
-
"n_bytes": 1524839,
|
3821 |
-
"n_tokens": 623358,
|
3822 |
-
"n_chars": 655190
|
3823 |
-
},
|
3824 |
-
"bert_base_cased.cc100-ko": {
|
3825 |
-
"vocab_size": 28996,
|
3826 |
-
"n_bytes": 1524839,
|
3827 |
-
"n_tokens": 222828,
|
3828 |
-
"n_chars": 655190
|
3829 |
-
},
|
3830 |
-
"bert_base_chinese.cc100-ko": {
|
3831 |
-
"vocab_size": 21128,
|
3832 |
-
"n_bytes": 1524839,
|
3833 |
-
"n_tokens": 219752,
|
3834 |
-
"n_chars": 655190
|
3835 |
-
},
|
3836 |
-
"bert_base_uncased.cc100-ko": {
|
3837 |
-
"vocab_size": 30522,
|
3838 |
-
"n_bytes": 1524839,
|
3839 |
-
"n_tokens": 904756,
|
3840 |
-
"n_chars": 655190
|
3841 |
-
},
|
3842 |
-
"bloom.cc100-ko": {
|
3843 |
-
"vocab_size": 250680,
|
3844 |
-
"n_bytes": 1524839,
|
3845 |
-
"n_tokens": 742111,
|
3846 |
-
"n_chars": 655190
|
3847 |
-
},
|
3848 |
-
"byt5_small.cc100-ko": {
|
3849 |
-
"vocab_size": 384,
|
3850 |
-
"n_bytes": 1524839,
|
3851 |
-
"n_tokens": 1534839,
|
3852 |
-
"n_chars": 655190
|
3853 |
-
},
|
3854 |
-
"character_glm_6b.cc100-ko": {
|
3855 |
-
"vocab_size": 64789,
|
3856 |
-
"n_bytes": 1524839,
|
3857 |
-
"n_tokens": 672160,
|
3858 |
-
"n_chars": 655190
|
3859 |
-
},
|
3860 |
-
"chatglm2_6b.cc100-ko": {
|
3861 |
-
"vocab_size": 64787,
|
3862 |
-
"n_bytes": 1524839,
|
3863 |
-
"n_tokens": 672156,
|
3864 |
-
"n_chars": 655190
|
3865 |
-
},
|
3866 |
-
"chatglm3_6b.cc100-ko": {
|
3867 |
-
"vocab_size": 64796,
|
3868 |
-
"n_bytes": 1524839,
|
3869 |
-
"n_tokens": 672160,
|
3870 |
-
"n_chars": 655190
|
3871 |
-
},
|
3872 |
-
"chatglm_6b.cc100-ko": {
|
3873 |
-
"vocab_size": 150344,
|
3874 |
-
"n_bytes": 1524839,
|
3875 |
-
"n_tokens": 939630,
|
3876 |
-
"n_chars": 655190
|
3877 |
-
},
|
3878 |
-
"chatyuan_large_v2.cc100-ko": {
|
3879 |
-
"vocab_size": 32128,
|
3880 |
-
"n_bytes": 1524839,
|
3881 |
-
"n_tokens": 354411,
|
3882 |
-
"n_chars": 655190
|
3883 |
-
},
|
3884 |
-
"chinese_llama.cc100-ko": {
|
3885 |
-
"vocab_size": 49953,
|
3886 |
-
"n_bytes": 1524839,
|
3887 |
-
"n_tokens": 913553,
|
3888 |
-
"n_chars": 655190
|
3889 |
-
},
|
3890 |
-
"chinese_llama2.cc100-ko": {
|
3891 |
-
"vocab_size": 55296,
|
3892 |
-
"n_bytes": 1524839,
|
3893 |
-
"n_tokens": 963427,
|
3894 |
-
"n_chars": 655190
|
3895 |
-
},
|
3896 |
-
"code_davinci_002.cc100-ko": {
|
3897 |
-
"vocab_size": 50281,
|
3898 |
-
"n_bytes": 1524839,
|
3899 |
-
"n_tokens": 1308993,
|
3900 |
-
"n_chars": 655190
|
3901 |
-
},
|
3902 |
-
"crystal_coder.cc100-ko": {
|
3903 |
-
"vocab_size": 32022,
|
3904 |
-
"n_bytes": 1524839,
|
3905 |
-
"n_tokens": 954428,
|
3906 |
-
"n_chars": 655190
|
3907 |
-
},
|
3908 |
-
"dbrx_instruct.cc100-ko": {
|
3909 |
-
"vocab_size": 100280,
|
3910 |
-
"n_bytes": 1524839,
|
3911 |
-
"n_tokens": 652277,
|
3912 |
-
"n_chars": 655190
|
3913 |
-
},
|
3914 |
-
"deepseek_coder_33b_instruct.cc100-ko": {
|
3915 |
-
"vocab_size": 32022,
|
3916 |
-
"n_bytes": 1524839,
|
3917 |
-
"n_tokens": 1454805,
|
3918 |
-
"n_chars": 655190
|
3919 |
-
},
|
3920 |
-
"deepseek_llm_7b_base.cc100-ko": {
|
3921 |
-
"vocab_size": 100015,
|
3922 |
-
"n_bytes": 1524839,
|
3923 |
-
"n_tokens": 1081983,
|
3924 |
-
"n_chars": 655190
|
3925 |
-
},
|
3926 |
-
"falcon_180b.cc100-ko": {
|
3927 |
-
"vocab_size": 65024,
|
3928 |
-
"n_bytes": 1524839,
|
3929 |
-
"n_tokens": 1330568,
|
3930 |
-
"n_chars": 655190
|
3931 |
-
},
|
3932 |
-
"falcon_7b.cc100-ko": {
|
3933 |
-
"vocab_size": 65024,
|
3934 |
-
"n_bytes": 1524839,
|
3935 |
-
"n_tokens": 1330568,
|
3936 |
-
"n_chars": 655190
|
3937 |
-
},
|
3938 |
-
"fastchat_t5_3b.cc100-ko": {
|
3939 |
-
"vocab_size": 32110,
|
3940 |
-
"n_bytes": 1524839,
|
3941 |
-
"n_tokens": 484953,
|
3942 |
-
"n_chars": 655190
|
3943 |
-
},
|
3944 |
-
"flan_t5_base.cc100-ko": {
|
3945 |
-
"vocab_size": 32100,
|
3946 |
-
"n_bytes": 1524839,
|
3947 |
-
"n_tokens": 344457,
|
3948 |
-
"n_chars": 655190
|
3949 |
-
},
|
3950 |
-
"gemma_7b.cc100-ko": {
|
3951 |
-
"vocab_size": 256000,
|
3952 |
-
"n_bytes": 1524839,
|
3953 |
-
"n_tokens": 464410,
|
3954 |
-
"n_chars": 655190
|
3955 |
-
},
|
3956 |
-
"gpt2.cc100-ko": {
|
3957 |
-
"vocab_size": 50257,
|
3958 |
-
"n_bytes": 1524839,
|
3959 |
-
"n_tokens": 1309029,
|
3960 |
-
"n_chars": 655190
|
3961 |
-
},
|
3962 |
-
"gpt2_chinese.cc100-ko": {
|
3963 |
-
"vocab_size": 21128,
|
3964 |
-
"n_bytes": 1524839,
|
3965 |
-
"n_tokens": 1055974,
|
3966 |
-
"n_chars": 655190
|
3967 |
-
},
|
3968 |
-
"gpt_35_turbo.cc100-ko": {
|
3969 |
-
"vocab_size": 100277,
|
3970 |
-
"n_bytes": 1524839,
|
3971 |
-
"n_tokens": 652277,
|
3972 |
-
"n_chars": 655190
|
3973 |
-
},
|
3974 |
-
"gpt_4.cc100-ko": {
|
3975 |
-
"vocab_size": 100277,
|
3976 |
-
"n_bytes": 1524839,
|
3977 |
-
"n_tokens": 652277,
|
3978 |
-
"n_chars": 655190
|
3979 |
-
},
|
3980 |
-
"gpt_neox_japanese_2_7b.cc100-ko": {
|
3981 |
-
"vocab_size": 32000,
|
3982 |
-
"n_bytes": 1524839,
|
3983 |
-
"n_tokens": 1512832,
|
3984 |
-
"n_chars": 655190
|
3985 |
-
},
|
3986 |
-
"gpt_nexo_20b.cc100-ko": {
|
3987 |
-
"vocab_size": 50277,
|
3988 |
-
"n_bytes": 1524839,
|
3989 |
-
"n_tokens": 973288,
|
3990 |
-
"n_chars": 655190
|
3991 |
-
},
|
3992 |
-
"grok_1.cc100-ko": {
|
3993 |
-
"vocab_size": 131072,
|
3994 |
-
"n_bytes": 1524839,
|
3995 |
-
"n_tokens": 1152005,
|
3996 |
-
"n_chars": 655190
|
3997 |
-
},
|
3998 |
-
"internlm2_chat_7b.cc100-ko": {
|
3999 |
-
"vocab_size": 92544,
|
4000 |
-
"n_bytes": 1524839,
|
4001 |
-
"n_tokens": 1008524,
|
4002 |
-
"n_chars": 655190
|
4003 |
-
},
|
4004 |
-
"internlm2_math_7b.cc100-ko": {
|
4005 |
-
"vocab_size": 92544,
|
4006 |
-
"n_bytes": 1524839,
|
4007 |
-
"n_tokens": 1008524,
|
4008 |
-
"n_chars": 655190
|
4009 |
-
},
|
4010 |
-
"internlm_chat_7b.cc100-ko": {
|
4011 |
-
"vocab_size": 103168,
|
4012 |
-
"n_bytes": 1524839,
|
4013 |
-
"n_tokens": 839609,
|
4014 |
-
"n_chars": 655190
|
4015 |
-
},
|
4016 |
-
"internlm_xcomposer_7b.cc100-ko": {
|
4017 |
-
"vocab_size": 103168,
|
4018 |
-
"n_bytes": 1524839,
|
4019 |
-
"n_tokens": 839609,
|
4020 |
-
"n_chars": 655190
|
4021 |
-
},
|
4022 |
-
"jamba_v0_1.cc100-ko": {
|
4023 |
-
"vocab_size": 65536,
|
4024 |
-
"n_bytes": 1524839,
|
4025 |
-
"n_tokens": 715688,
|
4026 |
-
"n_chars": 655190
|
4027 |
-
},
|
4028 |
-
"kplug.cc100-ko": {
|
4029 |
-
"vocab_size": 10261,
|
4030 |
-
"n_bytes": 1524839,
|
4031 |
-
"n_tokens": 222771,
|
4032 |
-
"n_chars": 655190
|
4033 |
-
},
|
4034 |
-
"llama.cc100-ko": {
|
4035 |
-
"vocab_size": 32000,
|
4036 |
-
"n_bytes": 1524839,
|
4037 |
-
"n_tokens": 964428,
|
4038 |
-
"n_chars": 655190
|
4039 |
-
},
|
4040 |
-
"llama2.cc100-ko": {
|
4041 |
-
"vocab_size": 32001,
|
4042 |
-
"n_bytes": 1524839,
|
4043 |
-
"n_tokens": 964428,
|
4044 |
-
"n_chars": 655190
|
4045 |
-
},
|
4046 |
-
"llama3.cc100-ko": {
|
4047 |
-
"vocab_size": 128256,
|
4048 |
-
"n_bytes": 1524839,
|
4049 |
-
"n_tokens": 412595,
|
4050 |
-
"n_chars": 655190
|
4051 |
-
},
|
4052 |
-
"llama_3_chinese_8b.cc100-ko": {
|
4053 |
-
"vocab_size": 128256,
|
4054 |
-
"n_bytes": 1524839,
|
4055 |
-
"n_tokens": 422595,
|
4056 |
-
"n_chars": 655190
|
4057 |
-
},
|
4058 |
-
"mistral_7b.cc100-ko": {
|
4059 |
-
"vocab_size": 32000,
|
4060 |
-
"n_bytes": 1524839,
|
4061 |
-
"n_tokens": 728766,
|
4062 |
-
"n_chars": 655190
|
4063 |
-
},
|
4064 |
-
"mixtral_8_7b.cc100-ko": {
|
4065 |
-
"vocab_size": 32000,
|
4066 |
-
"n_bytes": 1524839,
|
4067 |
-
"n_tokens": 728766,
|
4068 |
-
"n_chars": 655190
|
4069 |
-
},
|
4070 |
-
"mobilebert_uncased.cc100-ko": {
|
4071 |
-
"vocab_size": 30522,
|
4072 |
-
"n_bytes": 1524839,
|
4073 |
-
"n_tokens": 904756,
|
4074 |
-
"n_chars": 655190
|
4075 |
-
},
|
4076 |
-
"moss.cc100-ko": {
|
4077 |
-
"vocab_size": 106072,
|
4078 |
-
"n_bytes": 1524839,
|
4079 |
-
"n_tokens": 1305249,
|
4080 |
-
"n_chars": 655190
|
4081 |
-
},
|
4082 |
-
"mt5_large.cc100-ko": {
|
4083 |
-
"vocab_size": 250100,
|
4084 |
-
"n_bytes": 1524839,
|
4085 |
-
"n_tokens": 434586,
|
4086 |
-
"n_chars": 655190
|
4087 |
-
},
|
4088 |
-
"olmo_7b.cc100-ko": {
|
4089 |
-
"vocab_size": 50280,
|
4090 |
-
"n_bytes": 1524839,
|
4091 |
-
"n_tokens": 973288,
|
4092 |
-
"n_chars": 655190
|
4093 |
-
},
|
4094 |
-
"orion_14b_chat.cc100-ko": {
|
4095 |
-
"vocab_size": 84608,
|
4096 |
-
"n_bytes": 1524839,
|
4097 |
-
"n_tokens": 351149,
|
4098 |
-
"n_chars": 655190
|
4099 |
-
},
|
4100 |
-
"phi_1.cc100-ko": {
|
4101 |
-
"vocab_size": 50295,
|
4102 |
-
"n_bytes": 1524839,
|
4103 |
-
"n_tokens": 1308988,
|
4104 |
-
"n_chars": 655190
|
4105 |
-
},
|
4106 |
-
"phi_2.cc100-ko": {
|
4107 |
-
"vocab_size": 50295,
|
4108 |
-
"n_bytes": 1524839,
|
4109 |
-
"n_tokens": 1308988,
|
4110 |
-
"n_chars": 655190
|
4111 |
-
},
|
4112 |
-
"phi_3_mini.cc100-ko": {
|
4113 |
-
"vocab_size": 32011,
|
4114 |
-
"n_bytes": 1524839,
|
4115 |
-
"n_tokens": 964428,
|
4116 |
-
"n_chars": 655190
|
4117 |
-
},
|
4118 |
-
"pko_t5_large.cc100-ko": {
|
4119 |
-
"vocab_size": 50358,
|
4120 |
-
"n_bytes": 1524839,
|
4121 |
-
"n_tokens": 471643,
|
4122 |
-
"n_chars": 655190
|
4123 |
-
},
|
4124 |
-
"prompt_clue.cc100-ko": {
|
4125 |
-
"vocab_size": 32128,
|
4126 |
-
"n_bytes": 1524839,
|
4127 |
-
"n_tokens": 354411,
|
4128 |
-
"n_chars": 655190
|
4129 |
-
},
|
4130 |
-
"qwen1_5_14b_chat.cc100-ko": {
|
4131 |
-
"vocab_size": 151646,
|
4132 |
-
"n_bytes": 1524839,
|
4133 |
-
"n_tokens": 457492,
|
4134 |
-
"n_chars": 655190
|
4135 |
-
},
|
4136 |
-
"qwen_1_8b_chat.cc100-ko": {
|
4137 |
-
"vocab_size": 151851,
|
4138 |
-
"n_bytes": 1524839,
|
4139 |
-
"n_tokens": 457492,
|
4140 |
-
"n_chars": 655190
|
4141 |
-
},
|
4142 |
-
"qwen_72b_chat.cc100-ko": {
|
4143 |
-
"vocab_size": 151851,
|
4144 |
-
"n_bytes": 1524839,
|
4145 |
-
"n_tokens": 457492,
|
4146 |
-
"n_chars": 655190
|
4147 |
-
},
|
4148 |
-
"qwen_7b_chat.cc100-ko": {
|
4149 |
-
"vocab_size": 151851,
|
4150 |
-
"n_bytes": 1524839,
|
4151 |
-
"n_tokens": 457492,
|
4152 |
-
"n_chars": 655190
|
4153 |
-
},
|
4154 |
-
"roberta_chinese_clue.cc100-ko": {
|
4155 |
-
"vocab_size": 8021,
|
4156 |
-
"n_bytes": 1524839,
|
4157 |
-
"n_tokens": 226812,
|
4158 |
-
"n_chars": 655190
|
4159 |
-
},
|
4160 |
-
"skywork_13b_base.cc100-ko": {
|
4161 |
-
"vocab_size": 65519,
|
4162 |
-
"n_bytes": 1524839,
|
4163 |
-
"n_tokens": 962744,
|
4164 |
-
"n_chars": 655190
|
4165 |
-
},
|
4166 |
-
"skywork_13b_math.cc100-ko": {
|
4167 |
-
"vocab_size": 65519,
|
4168 |
-
"n_bytes": 1524839,
|
4169 |
-
"n_tokens": 962744,
|
4170 |
-
"n_chars": 655190
|
4171 |
-
},
|
4172 |
-
"solar_10_7b.cc100-ko": {
|
4173 |
-
"vocab_size": 32000,
|
4174 |
-
"n_bytes": 1524839,
|
4175 |
-
"n_tokens": 728766,
|
4176 |
-
"n_chars": 655190
|
4177 |
-
},
|
4178 |
-
"starchat_alpha.cc100-ko": {
|
4179 |
-
"vocab_size": 49156,
|
4180 |
-
"n_bytes": 1524839,
|
4181 |
-
"n_tokens": 580873,
|
4182 |
-
"n_chars": 655190
|
4183 |
-
},
|
4184 |
-
"switch_c_2048.cc100-ko": {
|
4185 |
-
"vocab_size": 32100,
|
4186 |
-
"n_bytes": 1524839,
|
4187 |
-
"n_tokens": 344457,
|
4188 |
-
"n_chars": 655190
|
4189 |
-
},
|
4190 |
-
"t5_base.cc100-ko": {
|
4191 |
-
"vocab_size": 32100,
|
4192 |
-
"n_bytes": 1524839,
|
4193 |
-
"n_tokens": 344457,
|
4194 |
-
"n_chars": 655190
|
4195 |
-
},
|
4196 |
-
"t5_large.cc100-ko": {
|
4197 |
-
"vocab_size": 32100,
|
4198 |
-
"n_bytes": 1524839,
|
4199 |
-
"n_tokens": 344457,
|
4200 |
-
"n_chars": 655190
|
4201 |
-
},
|
4202 |
-
"t5_small.cc100-ko": {
|
4203 |
-
"vocab_size": 32100,
|
4204 |
-
"n_bytes": 1524839,
|
4205 |
-
"n_tokens": 344457,
|
4206 |
-
"n_chars": 655190
|
4207 |
-
},
|
4208 |
-
"text_davinci_003.cc100-ko": {
|
4209 |
-
"vocab_size": 50281,
|
4210 |
-
"n_bytes": 1524839,
|
4211 |
-
"n_tokens": 1308993,
|
4212 |
-
"n_chars": 655190
|
4213 |
-
},
|
4214 |
-
"tigerbot_13b_chat_v2.cc100-ko": {
|
4215 |
-
"vocab_size": 60515,
|
4216 |
-
"n_bytes": 1524839,
|
4217 |
-
"n_tokens": 793053,
|
4218 |
-
"n_chars": 655190
|
4219 |
-
},
|
4220 |
-
"tigerbot_70b_chat_v4_4k.cc100-ko": {
|
4221 |
-
"vocab_size": 65110,
|
4222 |
-
"n_bytes": 1524839,
|
4223 |
-
"n_tokens": 484082,
|
4224 |
-
"n_chars": 655190
|
4225 |
-
},
|
4226 |
-
"wizardcoder_15b_v1.cc100-ko": {
|
4227 |
-
"vocab_size": 49153,
|
4228 |
-
"n_bytes": 1524839,
|
4229 |
-
"n_tokens": 580873,
|
4230 |
-
"n_chars": 655190
|
4231 |
-
},
|
4232 |
-
"wizardcoder_python_7b_v1.cc100-ko": {
|
4233 |
-
"vocab_size": 32001,
|
4234 |
-
"n_bytes": 1524839,
|
4235 |
-
"n_tokens": 964428,
|
4236 |
-
"n_chars": 655190
|
4237 |
-
},
|
4238 |
-
"wizardlm_7b_v1.cc100-ko": {
|
4239 |
-
"vocab_size": 32001,
|
4240 |
-
"n_bytes": 1524839,
|
4241 |
-
"n_tokens": 964428,
|
4242 |
-
"n_chars": 655190
|
4243 |
-
},
|
4244 |
-
"wizardmath_70b_v1.cc100-ko": {
|
4245 |
-
"vocab_size": 32002,
|
4246 |
-
"n_bytes": 1524839,
|
4247 |
-
"n_tokens": 964428,
|
4248 |
-
"n_chars": 655190
|
4249 |
-
},
|
4250 |
-
"xlm_roberta.cc100-ko": {
|
4251 |
-
"vocab_size": 250002,
|
4252 |
-
"n_bytes": 1524839,
|
4253 |
-
"n_tokens": 374571,
|
4254 |
-
"n_chars": 655190
|
4255 |
-
},
|
4256 |
-
"yi_34b.cc100-ko": {
|
4257 |
-
"vocab_size": 64000,
|
4258 |
-
"n_bytes": 1524839,
|
4259 |
-
"n_tokens": 1203134,
|
4260 |
-
"n_chars": 655190
|
4261 |
-
},
|
4262 |
-
"yi_6b.cc100-ko": {
|
4263 |
-
"vocab_size": 64000,
|
4264 |
-
"n_bytes": 1524839,
|
4265 |
-
"n_tokens": 1203134,
|
4266 |
-
"n_chars": 655190
|
4267 |
-
},
|
4268 |
-
"yi_vl34b.cc100-ko": {
|
4269 |
-
"vocab_size": 64000,
|
4270 |
-
"n_bytes": 1524839,
|
4271 |
-
"n_tokens": 1210021,
|
4272 |
-
"n_chars": 655190
|
4273 |
-
},
|
4274 |
-
"zephyr_7b_beta.cc100-ko": {
|
4275 |
-
"vocab_size": 32000,
|
4276 |
-
"n_bytes": 1524839,
|
4277 |
-
"n_tokens": 728766,
|
4278 |
-
"n_chars": 655190
|
4279 |
-
},
|
4280 |
-
"llama_3_chinese_8b.cc100-zh-Hans": {
|
4281 |
-
"vocab_size": 128256,
|
4282 |
-
"n_bytes": 2633047,
|
4283 |
-
"n_tokens": 757405,
|
4284 |
-
"n_chars": 927311
|
4285 |
-
}
|
4286 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
stats/compression_rate.json
ADDED
The diff for this file is too large to render.
See raw diff
|
|
utils/byte_util.py
DELETED
File without changes
|
utils/character_util.py
DELETED
@@ -1,231 +0,0 @@
|
|
1 |
-
"""
|
2 |
-
TODO: 繁体、简体、语种、
|
3 |
-
"""
|
4 |
-
import os
|
5 |
-
import json
|
6 |
-
from collections import Counter
|
7 |
-
from vocab import load_tokener
|
8 |
-
from utils.log_util import logger
|
9 |
-
from utils.text_util import is_all_digit, has_digit, get_digit_count, get_space_count
|
10 |
-
from utils.lang_util import detect_language
|
11 |
-
from utils.lang_util_2 import is_zh_char, is_all_zh, get_zh_count
|
12 |
-
|
13 |
-
CURRENT_DIR = os.path.dirname(os.path.abspath(__file__))
|
14 |
-
|
15 |
-
zh_tokens = [line.strip() for line in open(os.path.join(CURRENT_DIR, "vocab.jd.txt.v2"), "r", encoding="utf-8") if
|
16 |
-
is_zh_char(line.strip())]
|
17 |
-
|
18 |
-
|
19 |
-
def digit_():
|
20 |
-
"""
|
21 |
-
qwen segments numbers by single digits.
|
22 |
-
"""
|
23 |
-
pass
|
24 |
-
|
25 |
-
|
26 |
-
def to_unicode(text):
|
27 |
-
return ''.join(r'\u{:04X}'.format(ord(chr)) for chr in text)
|
28 |
-
|
29 |
-
def zh_iterator():
|
30 |
-
for idx in range(ord(u'\u4e00'), ord(u'\u9fa5')):
|
31 |
-
yield (chr(idx))
|
32 |
-
|
33 |
-
|
34 |
-
def get_coding_length(tokenizer, vocab, filter=None):
|
35 |
-
"""
|
36 |
-
计算编码长度。(有些中文汉字被解码成多个token)
|
37 |
-
"""
|
38 |
-
all_length = []
|
39 |
-
for word in vocab:
|
40 |
-
if len(word) > 1:
|
41 |
-
continue
|
42 |
-
if filter is not None and filter(word):
|
43 |
-
continue
|
44 |
-
try:
|
45 |
-
tokens = tokenizer.encode(word)
|
46 |
-
except Exception as e:
|
47 |
-
print(e)
|
48 |
-
|
49 |
-
all_length.append(len(tokens))
|
50 |
-
# if len(tokens.ids) > 1:
|
51 |
-
# if len(tokens) > 3:
|
52 |
-
# print(word, tokens)
|
53 |
-
|
54 |
-
dist_length = Counter(all_length)
|
55 |
-
mean_length = round(sum(all_length) / len(all_length), 2)
|
56 |
-
return dist_length, mean_length
|
57 |
-
|
58 |
-
|
59 |
-
|
60 |
-
def remove_special_char():
|
61 |
-
"""
|
62 |
-
:return:
|
63 |
-
"""
|
64 |
-
# bert词典有 ##开头的
|
65 |
-
# byteBPE词典有带空格的
|
66 |
-
# decode_str = decode_str.strip().replace("#", "") # TODO, 按类型
|
67 |
-
pass
|
68 |
-
|
69 |
-
|
70 |
-
cache = {}
|
71 |
-
|
72 |
-
def _mean(datas):
|
73 |
-
return sum(datas) / len(datas)
|
74 |
-
|
75 |
-
def iter_vocab(tokenizer_name, from_cache=True, cache_dir="stats/iter_vocab"):
|
76 |
-
"""
|
77 |
-
由于速度较快,建议不采用文件缓存。
|
78 |
-
:param tokenizer:
|
79 |
-
:param from_cache:
|
80 |
-
:return:
|
81 |
-
"""
|
82 |
-
cache_dir = os.path.join(CURRENT_DIR, f"../{cache_dir}")
|
83 |
-
os.makedirs(cache_dir, exist_ok=True)
|
84 |
-
|
85 |
-
tokenizer = load_tokener(tokenizer_name)
|
86 |
-
|
87 |
-
|
88 |
-
# load from cache
|
89 |
-
if from_cache and tokenizer_name in cache:
|
90 |
-
logger.info(f"load {tokenizer_name} from cache")
|
91 |
-
return cache[tokenizer_name]
|
92 |
-
|
93 |
-
has_zh_tokens = []
|
94 |
-
all_zh_tokens = []
|
95 |
-
has_digit_tokens = []
|
96 |
-
all_digit_tokens = []
|
97 |
-
has_space_tokens = []
|
98 |
-
all_space_tokens = []
|
99 |
-
|
100 |
-
# zh_tags = ["all_zh", "has_zh"]
|
101 |
-
# digit_tags = ["all_digit", "has_digit"]
|
102 |
-
|
103 |
-
# zh_token_count = {"total": 0, "包含1个中文单字": 0, "中文多字": 0}
|
104 |
-
|
105 |
-
# symbol_count = 0
|
106 |
-
|
107 |
-
all_single_zh_tokens = set()
|
108 |
-
zh_symbol_count = 0
|
109 |
-
buffer = []
|
110 |
-
for token_id in range(tokenizer.vocab_size):
|
111 |
-
decode_str = tokenizer.decode([token_id], skip_special_tokens=False)
|
112 |
-
token = tokenizer.convert_ids_to_tokens([token_id], skip_special_tokens=False)[0]
|
113 |
-
# tokenizer.convert_tokens_to_string(tokens)
|
114 |
-
|
115 |
-
tags = []
|
116 |
-
|
117 |
-
if token is None: # 有些词典有空的id(不连续)
|
118 |
-
continue
|
119 |
-
if isinstance(token, bytes):
|
120 |
-
token = token.decode("utf-8", errors="ignore")
|
121 |
-
|
122 |
-
digit_count = get_digit_count(decode_str)
|
123 |
-
language_tags = detect_language(decode_str)
|
124 |
-
|
125 |
-
if "Chinese" in language_tags:
|
126 |
-
has_zh_tokens.append(decode_str)
|
127 |
-
|
128 |
-
if is_all_zh(decode_str):
|
129 |
-
tags.append("all_zh")
|
130 |
-
all_zh_tokens.append(decode_str)
|
131 |
-
|
132 |
-
|
133 |
-
if is_all_digit(decode_str):
|
134 |
-
tags.append("all_digit")
|
135 |
-
all_digit_tokens.append(decode_str)
|
136 |
-
if has_digit(decode_str):
|
137 |
-
tags.append("has_digit")
|
138 |
-
has_digit_tokens.append(decode_str)
|
139 |
-
|
140 |
-
|
141 |
-
space_count = get_space_count(decode_str)
|
142 |
-
if space_count > 0:
|
143 |
-
has_space_tokens.append(decode_str)
|
144 |
-
if space_count == len(decode_str):
|
145 |
-
all_space_tokens.append(decode_str)
|
146 |
-
|
147 |
-
zh_count = get_zh_count(decode_str)
|
148 |
-
|
149 |
-
buffer.append(json.dumps(
|
150 |
-
{"id": token_id,
|
151 |
-
"token": token,
|
152 |
-
"token_decode": decode_str,
|
153 |
-
"token_dumps": json.dumps(token),
|
154 |
-
"token_unicode": to_unicode(token),
|
155 |
-
"token_len": len(decode_str),
|
156 |
-
"zh_count": zh_count, # 包含汉字的数目
|
157 |
-
# "zh-smpli": zh_hans_count, # 简体中文 zh-Hans
|
158 |
-
"tags": tags,
|
159 |
-
"zh_symbol_count": zh_symbol_count,
|
160 |
-
},
|
161 |
-
ensure_ascii=False) + "\n")
|
162 |
-
|
163 |
-
# if zh_count >= 1:
|
164 |
-
# zh_token_count["total"] += 1
|
165 |
-
# if zh_count > 1:
|
166 |
-
# zh_token_count["中文多字"] += 1
|
167 |
-
# else:
|
168 |
-
# zh_token_count["中文单字"] += 1
|
169 |
-
# all_single_zh_tokens.add(decode_str.strip().replace("#", ""))
|
170 |
-
#
|
171 |
-
# zh_token_count["中文单字-去重���"] = len(all_single_zh_tokens)
|
172 |
-
|
173 |
-
dist_length, mean_length = get_coding_length(tokenizer, zh_tokens, filter=lambda k: not is_zh_char(k))
|
174 |
-
|
175 |
-
# TODO: 繁体字,简体字
|
176 |
-
|
177 |
-
result = {
|
178 |
-
"name": tokenizer_name,
|
179 |
-
"impl": str(tokenizer.__class__),
|
180 |
-
"vocab_size": len(tokenizer),
|
181 |
-
"中文token数": len(has_zh_tokens),
|
182 |
-
"中文token的平均长度": None,
|
183 |
-
"纯中文token的平均长度": None,
|
184 |
-
"中文标点数": zh_symbol_count,
|
185 |
-
"中文汉字编码长度均值": mean_length,
|
186 |
-
"中文汉字编码长度分布": json.dumps(dist_length),
|
187 |
-
"纯数字token数": len(all_digit_tokens),
|
188 |
-
"包含数字token数": len(has_digit_tokens),
|
189 |
-
"纯数字token的平均长度": round(_mean([len(item) for item in all_digit_tokens]), 2),
|
190 |
-
"纯中文token数": None, # all_zh
|
191 |
-
"纯space的token数": len(all_space_tokens),
|
192 |
-
"纯space的token数": len(all_space_tokens), # "#"
|
193 |
-
"纯space的token的平均长度": None, # avg_len( tokens_contains_space)
|
194 |
-
"contains_korea": None,
|
195 |
-
}
|
196 |
-
out_path = os.path.join(cache_dir, f"{tokenizer_name}.vocab.jsonl")
|
197 |
-
logger.info(f"saving vocab to {out_path}")
|
198 |
-
with open(out_path, "w", encoding="utf-8") as f_out:
|
199 |
-
f_out.write(json.dumps(result, ensure_ascii=False) + "\n")
|
200 |
-
for line in buffer:
|
201 |
-
f_out.write(line)
|
202 |
-
cache[tokenizer_name] = result
|
203 |
-
return result
|
204 |
-
|
205 |
-
|
206 |
-
|
207 |
-
|
208 |
-
|
209 |
-
|
210 |
-
if __name__ == "__main__":
|
211 |
-
# test_coding_length(jd_vocab_tokens, filter=lambda k: not is_chinese(k))
|
212 |
-
# test_coding_length(zh_punc)
|
213 |
-
# test_coding_length(zh_iterator())
|
214 |
-
|
215 |
-
# from vocab.chatglm2_6b import tokenizer; name = "chatglm2_6b"
|
216 |
-
# from vocab.chatglm_6b import tokenizer; name="chatglm_6b"
|
217 |
-
# from vocab.baichuan2 import tokenizer; name="baichuan2"
|
218 |
-
name="gpt_4"
|
219 |
-
# name="gpt2"
|
220 |
-
# name="qwen1_5_14b_chat"
|
221 |
-
# name="gpt_nexo_20b"
|
222 |
-
# name="fastchat_t5_3b"
|
223 |
-
|
224 |
-
|
225 |
-
print(iter_vocab(name))
|
226 |
-
|
227 |
-
|
228 |
-
|
229 |
-
|
230 |
-
|
231 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
utils/convert_sp_to_json.py
DELETED
@@ -1,4 +0,0 @@
|
|
1 |
-
|
2 |
-
from vocab.baichuan_7b import tokenizer
|
3 |
-
|
4 |
-
tokenizer.sp
|
|
|
|
|
|
|
|
|
|
utils/fn_util.py
DELETED
File without changes
|
utils/lang_util.py
CHANGED
@@ -18,43 +18,39 @@ import re
|
|
18 |
# 由于大部分是'latin',所以就不统计了。
|
19 |
common = ['Chinese', 'Japanese-Kana', 'Korean', 'Arabic', 'number']
|
20 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
21 |
def detect_language(s):
|
22 |
# 定义各语言字符的Unicode范围
|
23 |
-
language_ranges = {
|
24 |
-
'Arabic': r'[\u0600-\u06FF\u0750-\u077F\u08A0-\u08FF\uFB50-\uFDFF\uFE70-\uFEFF]',
|
25 |
-
# 'CJK' https://en.wikipedia.org/wiki/CJK_Unified_Ideographs
|
26 |
-
'Chinese': r'[\u4e00-\u9fff]',
|
27 |
-
'Japanese': r'[\u3040-\u309F\u30A0-\u30FF\u4E00-\u9FFF\u3400-\u4DBF]', # https://stackoverflow.com/questions/19899554/unicode-range-for-japanese
|
28 |
-
'Japanese-Kana': r'[\u3040-\u309F\u30A0-\u30FF]', # Hiragana & Katakana
|
29 |
-
# 'Korean': r'[\uac00-\ud7a3]',
|
30 |
-
'Hangul': r'[\uac00-\ud7a3]',
|
31 |
-
|
32 |
-
|
33 |
-
# 拉丁字母系列
|
34 |
-
'Latin': r'[\u0000-\u007F\u0080-\u00FF]',
|
35 |
-
'English': r'[A-Za-z]', # 这可能会与其他使用基本拉丁字母的语言重叠
|
36 |
-
'French': r'[\u00C0-\u00FF]',
|
37 |
-
'German': r'[\u00C4\u00D6\u00DC\u00E4\u00F6\u00FC\u00DF]',
|
38 |
-
'Spanish-': r'[\u00C1\u00E1\u00C9\u00E9\u00CD\u00ED\u00D3\u00F3\u00DA\u00FA\u00D1\u00F1\u00FC]', # 西班牙语特有字符集合
|
39 |
-
|
40 |
-
|
41 |
-
# 斯拉夫语族
|
42 |
-
'Cyrillic': r'[\u0400-\u04FF\u0500-\u052F\u2DE0-\u2DFF\uA640-\uA69F]',
|
43 |
-
|
44 |
-
#
|
45 |
-
'Greek': r'[\u0370-\u03FF\u1F00-\u1FFF]', # 希腊字母
|
46 |
-
'Hebrew': r'[\u0590-\u05FF\uFB1D-\uFB4F]', # 希伯来语
|
47 |
-
|
48 |
-
|
49 |
-
}
|
50 |
-
|
51 |
detected_languages = []
|
52 |
-
|
53 |
for language, pattern in language_ranges.items():
|
54 |
if re.search(pattern, s):
|
55 |
detected_languages.append(language)
|
56 |
|
57 |
-
return detected_languages
|
58 |
|
59 |
|
60 |
if __name__ == "__main__":
|
|
|
18 |
# 由于大部分是'latin',所以就不统计了。
|
19 |
common = ['Chinese', 'Japanese-Kana', 'Korean', 'Arabic', 'number']
|
20 |
|
21 |
+
language_ranges = {
|
22 |
+
('Arabic', 'ar'): r'[\u0600-\u06FF\u0750-\u077F\u08A0-\u08FF\uFB50-\uFDFF\uFE70-\uFEFF]',
|
23 |
+
# 'CJK' https://en.wikipedia.org/wiki/CJK_Unified_Ideographs
|
24 |
+
('Chinese', 'zh'): r'[\u4e00-\u9fff]',
|
25 |
+
('Japanese', 'ja'): r'[\u3040-\u309F\u30A0-\u30FF\u4E00-\u9FFF\u3400-\u4DBF]',
|
26 |
+
# https://stackoverflow.com/questions/19899554/unicode-range-for-japanese
|
27 |
+
# Kana type refers to Japanese hiragana and katakana characters that represent phonetic sounds in the Japanese language.
|
28 |
+
('Japanese-Kana', 'ja-kana'): r'[\u3040-\u309F\u30A0-\u30FF]', # Hiragana & Katakana
|
29 |
+
('Korean', 'ko'): r'[\uac00-\ud7a3]',
|
30 |
+
|
31 |
+
# 拉丁字母系列
|
32 |
+
# ('Latin', 'la'): r'[\u0000-\u007F\u0080-\u00FF]',
|
33 |
+
# ('English', 'en'): r'[A-Za-z]', # 这可能会与其他使用基本拉丁字母的语言重叠
|
34 |
+
# ('French', 'fr'): r'[\u00C0-\u00FF]',
|
35 |
+
# ('German', 'de'): r'[\u00C4\u00D6\u00DC\u00E4\u00F6\u00FC\u00DF]',
|
36 |
+
# ('Spanish-特有'): r'[\u00C1\u00E1\u00C9\u00E9\u00CD\u00ED\u00D3\u00F3\u00DA\u00FA\u00D1\u00F1\u00FC]', # 西班牙语特有字符集合
|
37 |
+
|
38 |
+
# 斯拉夫语系列
|
39 |
+
# ('Cyrillic', ''): r'[\u0400-\u04FF\u0500-\u052F\u2DE0-\u2DFF\uA640-\uA69F]',
|
40 |
+
|
41 |
+
#
|
42 |
+
# 'Greek': r'[\u0370-\u03FF\u1F00-\u1FFF]', # 希腊字母
|
43 |
+
# 'Hebrew': r'[\u0590-\u05FF\uFB1D-\uFB4F]', # 希伯来语
|
44 |
+
}
|
45 |
+
|
46 |
def detect_language(s):
|
47 |
# 定义各语言字符的Unicode范围
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
48 |
detected_languages = []
|
|
|
49 |
for language, pattern in language_ranges.items():
|
50 |
if re.search(pattern, s):
|
51 |
detected_languages.append(language)
|
52 |
|
53 |
+
return detected_languages
|
54 |
|
55 |
|
56 |
if __name__ == "__main__":
|
utils/lang_util_2.py
DELETED
@@ -1,115 +0,0 @@
|
|
1 |
-
"""
|
2 |
-
日语、韩语 等
|
3 |
-
https://www.cnblogs.com/luoganttcc/p/16605150.html
|
4 |
-
https://zhuanlan.zhihu.com/p/618684374
|
5 |
-
- https://zhuanlan.zhihu.com/p/84625185 赞
|
6 |
-
|
7 |
-
|
8 |
-
## 相关包
|
9 |
-
|
10 |
-
import opencc
|
11 |
-
import langid
|
12 |
-
imort langdetect
|
13 |
-
https://github.com/pemistahl/lingua-py
|
14 |
-
- 原理:
|
15 |
-
|
16 |
-
|
17 |
-
"""
|
18 |
-
|
19 |
-
|
20 |
-
|
21 |
-
from zhon.hanzi import punctuation as zh_punc
|
22 |
-
|
23 |
-
def is_zh_char(uchar):
|
24 |
-
"""
|
25 |
-
https://github.com/fxsjy/jieba/blob/master/jieba/__init__.py#L48
|
26 |
-
re.compile("([\u4E00-\u9FD5]+)", re.U)
|
27 |
-
"""
|
28 |
-
return u'\u4e00' <= uchar <= u'\u9fa5'
|
29 |
-
|
30 |
-
def has_zh_punc(text):
|
31 |
-
"""
|
32 |
-
是否包含中文标点
|
33 |
-
"""
|
34 |
-
return any(ch in zh_punc for ch in text)
|
35 |
-
|
36 |
-
|
37 |
-
def has_zh(text):
|
38 |
-
""" contains Chinese characters """
|
39 |
-
return any(is_zh_char(ch) for ch in text)
|
40 |
-
|
41 |
-
|
42 |
-
def get_zh_count(text):
|
43 |
-
return sum([is_zh_char(uchar) for uchar in text])
|
44 |
-
|
45 |
-
|
46 |
-
def is_all_zh(text):
|
47 |
-
return all(is_zh_char(char) for char in text)
|
48 |
-
|
49 |
-
|
50 |
-
def is_all_en(text):
|
51 |
-
return text.encode('utf-8').isalpha()
|
52 |
-
|
53 |
-
|
54 |
-
|
55 |
-
|
56 |
-
ranges = [
|
57 |
-
{"from": ord(u"\u3300"), "to": ord(u"\u33ff")}, # compatibility ideographs
|
58 |
-
{"from": ord(u"\ufe30"), "to": ord(u"\ufe4f")}, # compatibility ideographs
|
59 |
-
{"from": ord(u"\uf900"), "to": ord(u"\ufaff")}, # compatibility ideographs
|
60 |
-
{"from": ord(u"\U0002F800"), "to": ord(u"\U0002fa1f")}, # compatibility ideographs
|
61 |
-
{'from': ord(u'\u3040'), 'to': ord(u'\u309f')}, # Japanese Hiragana 日本平假名 96个
|
62 |
-
{"from": ord(u"\u30a0"), "to": ord(u"\u30ff")}, # Japanese Katakana 日语片假名 96个
|
63 |
-
{"from": ord(u"\u2e80"), "to": ord(u"\u2eff")}, # cjk radicals supplement
|
64 |
-
{"from": ord(u"\u4e00"), "to": ord(u"\u9fff")}, # 中文 u"\u4e00"-'\u9fa5',
|
65 |
-
{"from": ord(u"\u3400"), "to": ord(u"\u4dbf")}, #
|
66 |
-
{"from": ord(u"\U00020000"), "to": ord(u"\U0002a6df")},
|
67 |
-
{"from": ord(u"\U0002a700"), "to": ord(u"\U0002b73f")},
|
68 |
-
{"from": ord(u"\U0002b740"), "to": ord(u"\U0002b81f")},
|
69 |
-
{"from": ord(u"\U0002b820"), "to": ord(u"\U0002ceaf")} # included as of Unicode 8.0
|
70 |
-
]
|
71 |
-
|
72 |
-
# 韩语 [\uac00-\ud7ff]
|
73 |
-
|
74 |
-
|
75 |
-
def is_cjk(char):
|
76 |
-
"""
|
77 |
-
CJK(Chinese、Japanese、Korean)
|
78 |
-
日语中有很多汉字,日本汉字超过2万。
|
79 |
-
韩语有谚文,超过50个,有朝鲜汉字超过2万。
|
80 |
-
"""
|
81 |
-
return any([range["from"] <= ord(char) <= range["to"] for range in ranges])
|
82 |
-
|
83 |
-
|
84 |
-
def cjk_substrings(string):
|
85 |
-
i = 0
|
86 |
-
while i < len(string):
|
87 |
-
if is_cjk(string[i]):
|
88 |
-
start = i
|
89 |
-
while is_cjk(string[i]): i += 1
|
90 |
-
yield string[start:i]
|
91 |
-
i += 1
|
92 |
-
|
93 |
-
|
94 |
-
def aa():
|
95 |
-
# string = "sdf344asfasf天地方益3権sdfsdf".decode("utf-8")
|
96 |
-
for idx, item in enumerate(ranges):
|
97 |
-
print(idx, end=": ")
|
98 |
-
for j in range(10):
|
99 |
-
print(chr(item["from"] + j), end=", ")
|
100 |
-
print("")
|
101 |
-
# for sub in cjk_substrings(string):
|
102 |
-
# string = string.replace(sub, "(" + sub + ")")
|
103 |
-
# print(string)
|
104 |
-
|
105 |
-
|
106 |
-
def is_traditional_chinese(text):
|
107 |
-
cc = opencc.OpenCC('t2s')
|
108 |
-
converted_text = cc.convert(text)
|
109 |
-
if converted_text != text:
|
110 |
-
return True
|
111 |
-
return False
|
112 |
-
|
113 |
-
|
114 |
-
|
115 |
-
# aa()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
utils/oov.md
ADDED
@@ -0,0 +1,202 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
|
2 |
+
```sh
|
3 |
+
###################################
|
4 |
+
ClueAI/ChatYuan-large-v2, <class 'tokenizers.models.Unigram'>
|
5 |
+
reversible: false; unk_token: <unk>, 2, unk_ratio: 0.2000; oov: []
|
6 |
+
text[7] = "Замглавы управления развития; 특히 주소 15~17번 홀에선 3연속; 確実に春が近づいてること; a közoktatással? _ Belföld; pumë, i vjetër, vjeç; ئەردوغان ۋە قىرغىزىستان ; निम्न में से कौन सा हारडवेयर; ተለዋዋጭ የግድግዳ ; Дзейныя асобы:; « અમરેલીનાં મહિલા વિકાસ; \t\n\r🦙❤❥웃유♋☮✊;װיקיװערטערבוך ",
|
7 |
+
decoding[7] = "<unk>амглав<unk> у<unk>равления развития; <unk> <unk> 15~17<unk> <unk> 3<unk>; 確実に春が近づいてること; a közoktatással? _ Belföld; pum<unk>, i vjet<unk>r, vjeç; <unk>ا<unk> <unk> <unk>ا<unk> ; <unk> <unk> <unk> <unk> <unk> <unk>; <unk> <unk> ; <unk>зейн<unk>я асо<unk>:; <unk> <unk> <unk> <unk>; <unk>;<unk>"
|
8 |
+
|
9 |
+
|
10 |
+
###################################
|
11 |
+
ClueAI/PromptCLUE-base, <class 'tokenizers.models.Unigram'>
|
12 |
+
reversible: false; unk_token: <unk>, 2, unk_ratio: 0.2000; oov: []
|
13 |
+
text[7] = "Замглавы управления развития; 특히 주소 15~17번 홀에선 3연속; 確実に春が近づいてること; a közoktatással? _ Belföld; pumë, i vjetër, vjeç; ئەردوغان ۋە قىرغىزىستان ; निम्न में से कौन सा हारडवेयर; ተለዋዋጭ የግድግዳ ; Дзейныя асобы:; « અમરેલીનાં મહિલા વિકાસ; \t\n\r🦙❤❥웃유♋☮✊;װיקיװערטערבוך ",
|
14 |
+
decoding[7] = "<unk>амглав<unk> у<unk>равления развития; <unk> <unk> 15~17<unk> <unk> 3<unk>; 確実に春が近づいてること; a közoktatással? _ Belföld; pum<unk>, i vjet<unk>r, vjeç; <unk>ا<unk> <unk> <unk>ا<unk> ; <unk> <unk> <unk> <unk> <unk> <unk>; <unk> <unk> ; <unk>зейн<unk>я асо<unk>:; <unk> <unk> <unk> <unk>; <unk>;<unk>"
|
15 |
+
###################################
|
16 |
+
CohereForAI/aya-101, <class 'tokenizers.models.Unigram'>
|
17 |
+
reversible: false; unk_token: <unk>, 2, unk_ratio: 0.0079; oov: []
|
18 |
+
text[73] = " a közoktatással? _ Belföld; pumë, i vjetër, vjeç; ئەردوغان ۋە قىرغىزىستان ; निम्न में से कौन सा हारडवेयर; ተለዋዋጭ የግድግዳ ; Дзейныя асобы:; « અમરેલીનાં મહિલા વિકાસ; \t\n\r🦙❤❥웃유♋☮✊;װיקיװערטערבוך ",
|
19 |
+
decoding[73] = "a közoktatással? _ Belföld; pumë, i vjetër, vjeç; ئەردوغان ۋە قىرغىزىستان ; निम्न में से कौन सा हारडवेयर; ተለዋዋጭ የግድግዳ ; Дзейныя асобы:; « અમરેલીનાં મહિલા વિકાસ; <unk>❤❥웃유♋☮✊;װיקיװערטערבוך"
|
20 |
+
###################################
|
21 |
+
FacebookAI/xlm-roberta-base, <class 'tokenizers.models.Unigram'>
|
22 |
+
reversible: false; unk_token: <unk>, 3, unk_ratio: 0.0096; oov: []
|
23 |
+
text[73] = " a közoktatással? _ Belföld; pumë, i vjetër, vjeç; ئەردوغان ۋە قىرغىزىستان ; निम्न में से कौन सा हारडवेयर; ተለዋዋጭ የግድግዳ ; Дзейныя асобы:; « અમરેલીનાં મહિલા વિકાસ; \t\n\r🦙❤❥웃유♋☮✊;װיקיװערטערבוך ",
|
24 |
+
decoding[73] = "a közoktatással? _ Belföld; pumë, i vjetër, vjeç; ئەردوغان ۋە قىرغىزىستان ; निम्न में से कौन सा हारडवेयर; ተለዋዋጭ የግድግዳ ; Дзейныя асобы:; « અમરેલીનાં મહિલા વિકાસ; <unk>❤❥웃유♋☮✊;װיקיװערטערבוך"
|
25 |
+
###################################
|
26 |
+
OrionStarAI/Orion-14B-Chat, sp_model, byte_num: 0
|
27 |
+
reversible: false; unk_token: <unk>, 0, unk_ratio: 0.0495; oov: []
|
28 |
+
text[71] = "; a közoktatással? _ Belföld; pumë, i vjetër, vjeç; ئەردوغان ۋە قىرغىزىستان ; निम्न में से कौन सा हारडवेयर; ተለዋዋጭ የግድግዳ ; Дзейныя асобы:; « અમરેલીનાં મહિલા વિકાસ; \t\n\r🦙❤❥웃유♋☮✊;װיקיװערטערבוך ",
|
29 |
+
decoding[71] = "; a közoktatással? _ Belföld; pumë, i vjetër, vjeç; ئ<unk> ⁇ ردوغان <unk> ⁇ قىرغىزىستان ; निम्न में से कौन सा हारडवेयर; ተለ<unk> ⁇ ጭ የግድግ<unk> ⁇ ; Дзейныя асобы:; « <unk> ⁇ <unk> ⁇ <unk> ⁇ ; \t\n <unk> ⁇ ❤❥웃유♋☮✊; <unk> ⁇ יקי<unk> ⁇ ערטערבוך "
|
30 |
+
###################################
|
31 |
+
THUDM/chatglm-6b, byte_num: 256
|
32 |
+
reversible: false; unk_token: <unk>, 0, unk_ratio: 0.0000; oov: []
|
33 |
+
text[237] = "\r🦙❤❥웃유♋☮✊;װיקיװערטערבוך ",
|
34 |
+
decoding[237] = " 🦙❤❥웃유♋☮✊;װיקיװערטערבוך"
|
35 |
+
###################################
|
36 |
+
abeja/gpt-neox-japanese-2.7b, japanese-bpe: https://github.com/tanreinama/Japanese-BPEEncoder_V2
|
37 |
+
reversible: false; unk_token: <|endoftext|>, 31999, unk_ratio: 0.0000; oov: []
|
38 |
+
text[7] = "Замглавы управления развития; 특히 주소 15~17번 홀에선 3연속; 確実に春が近づいてること; a közoktatással? _ Belföld; pumë, i vjetër, vjeç; ئەردوغان ۋە قىرغىزىستان ; निम्न में से कौन सा हारडवेयर; ተለዋዋጭ የግድግዳ ; Дзейныя асобы:; « અમરેલીનાં મહિલા વિકાસ; \t\n\r🦙❤❥웃유♋☮✊;װיקיװערטערבוך ",
|
39 |
+
decoding[7] = "���������������� �������������������� ����������������; ������ ������ 15~17��� ��������� 3������; 確実に春が近づいてること; a k��zoktat��ssal? _ Belf��ld; pum��, i vjet��r, vje��; ���������������� ���� ���������������������� ; ��������������� ��������� ������ ��������� ������ ������������������������; ��������������� ��������������� ; �������������� ����������:; ǀ ��������������������������� ��������������� ���������������; \t\n\n🐯❤‖������🟥🟥🤚;��������������������������"
|
40 |
+
|
41 |
+
|
42 |
+
###################################
|
43 |
+
baichuan-inc/Baichuan-7B, sp_model, byte_num: 256
|
44 |
+
reversible: false; unk_token: <unk>, 0, unk_ratio: 0.0000; oov: []
|
45 |
+
text[237] = "\r🦙❤❥웃유♋☮✊;װיקיװערטערבוך ",
|
46 |
+
decoding[237] = " 🦙❤❥웃유♋☮✊;װיקיװערטערבוך "
|
47 |
+
###################################
|
48 |
+
ckiplab/gpt2-base-chinese, <class 'tokenizers.models.WordPiece'>
|
49 |
+
reversible: false; unk_token: [UNK], 100, unk_ratio: 0.1185; oov: []
|
50 |
+
text[5] = "; Замглавы управления развития; 특히 주소 15~17번 홀에선 3연속; 確実に春が近づいてること; a közoktatással? _ Belföld; pumë, i vjetër, vjeç; ئەردوغان ۋە قىرغىزىستان ; निम्न में से कौन सा हारडवेयर; ተለዋዋጭ የግድግዳ ; Дзейныя асобы:; « અમરેલીનાં મહિલા વિકાસ; \t\n\r🦙❤❥웃유♋☮✊;װיקיװערטערבוך ",
|
51 |
+
decoding[5] = " ; замглавы управления развития ; 특히 주소 15 ~ 17번 홀에선 3연속 ; 確 実 に 春 か 近 ついてること ; a kozoktatassal? _ belfold ; pume, i vjeter, vjec ; [UNK] [UNK] [UNK] ; [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] ; [UNK] [UNK] ; дзеиныя асобы : ; « [UNK] [UNK] [UNK] ; [UNK] ; [UNK]"
|
52 |
+
|
53 |
+
|
54 |
+
###################################
|
55 |
+
cl-tohoku/bert-base-japanese, wordpiece.MecabTokenizer, 支持byte-level https://github.com/polm/fugashi
|
56 |
+
reversible: false; unk_token: [UNK], 1, unk_ratio: 0.3951; oov: []
|
57 |
+
text[5] = "; Замглавы управления развития; 특히 주소 15~17번 홀에선 3연속; 確実に春が近づいてること; a közoktatással? _ Belföld; pumë, i vjetër, vjeç; ئەردوغان ۋە قىرغىزىستان ; निम्न में से कौन सा हारडवेयर; ተለዋዋጭ የግድግዳ ; Дзейныя асобы:; « અમરેલીનાં મહિલા વિકાસ; \t\n\r🦙❤❥웃유♋☮✊;װיקיװערטערבוך ",
|
58 |
+
decoding[5] = " ; [UNK] [UNK] [UNK] ; [UNK] [UNK] 15 ~ 17 [UNK] [UNK] 3 [UNK] ; 確実 に 春 が 近づい てる こと ; a közoktatással? _ Belföld ; [UNK], i [UNK], vjeç ; [UNK] [UNK] [UNK] ; [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] ; [UNK] [UNK] ; [UNK] [UNK] :; [UNK] [UNK] [UNK] [UNK] ; [UNK] [UNK] [UNK] [UNK] [UNK]"
|
59 |
+
|
60 |
+
|
61 |
+
###################################
|
62 |
+
clue/roberta_chinese_clue_tiny, <class 'tokenizers.models.WordPiece'>
|
63 |
+
reversible: false; unk_token: [UNK], 100, unk_ratio: 0.3580; oov: []
|
64 |
+
text[5] = "; Замглавы управления развития; 특히 주소 15~17번 홀에선 3연속; 確実に春が近づいてること; a közoktatással? _ Belföld; pumë, i vjetër, vjeç; ئەردوغان ۋە قىرغىزىستان ; निम्न में से कौन सा हारडवेयर; ተለዋዋጭ የግድግዳ ; Дзейныя асобы:; « અમરેલીનાં મહિલા વિકાસ; \t\n\r🦙❤❥웃유♋☮✊;װיקיװערטערבוך ",
|
65 |
+
decoding[5] = " ; [UNK] [UNK] [UNK] ; [UNK] [UNK] 15 ~ [UNK] [UNK] [UNK] ; [UNK] 実 [UNK] 春 [UNK] 近 [UNK] ; a kozoktatassal? _ belfold ; pume, i vjeter, vjec ; [UNK] [UNK] [UNK] ; [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] ; [UNK] [UNK] ; [UNK] асобы : ; « [UNK] [UNK] [UNK] ; [UNK] ; [UNK]"
|
66 |
+
|
67 |
+
|
68 |
+
###################################
|
69 |
+
dbmdz/bert-base-german-uncased, <class 'tokenizers.models.WordPiece'>
|
70 |
+
reversible: false; unk_token: [UNK], 101, unk_ratio: 0.4459; oov: []
|
71 |
+
text[5] = "; Замглавы управления развития; 특히 주소 15~17번 홀에선 3연속; 確実に春が近づいてること; a közoktatással? _ Belföld; pumë, i vjetër, vjeç; ئەردوغان ۋە قىرغىزىستان ; निम्न में से कौन सा हारडवेयर; ተለዋዋጭ የግድግዳ ; Дзейныя асобы:; « અમરેલીનાં મહિલા વિકાસ; \t\n\r🦙❤❥웃유♋☮✊;װיקיװערטערבוך ",
|
72 |
+
decoding[5] = " ; [UNK] [UNK] [UNK] ; [UNK] [UNK] 15 ~ [UNK] [UNK] [UNK] ; [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] ; a kozoktatassal? _ belfold ; pume, i vjeter, vjec ; [UNK] [UNK] [UNK] ; [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] ; [UNK] [UNK] ; [UNK] [UNK] : ; « [UNK] [UNK] [UNK] ; [UNK] ; [UNK]"
|
73 |
+
###################################
|
74 |
+
deepseek-ai/deepseek-coder-33b-instruct, <class 'tokenizers.models.BPE'>
|
75 |
+
reversible: false; unk_token: None, None, unk_ratio: 0.0000; oov: []
|
76 |
+
text[77] = "özoktatással? _ Belföld; pumë, i vjetër, vjeç; ئەردوغان ۋە قىرغىزىستان ; निम्न में से कौन सा हारडवेयर; ተለዋዋጭ የግድግዳ ; Дзейныя асобы:; « અમરેલીનાં મહિલા વિકાસ; \t\n\r🦙❤❥웃유♋☮✊;װיקיװערטערבוך ",
|
77 |
+
decoding[77] = "�zoktatással? _ Belf�ld; pumë, i vjetër, vjeç; ئەردوغان ۋە قىرغىزىستان ; निम्न में से कौन सा हारडवेयर; ተለዋዋጭ የግድግዳ ; Дзейныя асобы:; « અમરેલીનાં મહિલા વિકાસ; \t\n\r🦙❤❥웃유♋☮✊;װיקיװערטערבוך "
|
78 |
+
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
|
79 |
+
[2024-05-12 00:30:36] [INFO] [34044:7360] [__init__.py:343:load_tokenizer] loading tokenizer eson/kplug-base-encoder
|
80 |
+
###################################
|
81 |
+
deepseek-ai/deepseek-llm-7b-base, <class 'tokenizers.models.BPE'>
|
82 |
+
reversible: false; unk_token: None, None, unk_ratio: 0.0000; oov: []
|
83 |
+
text[77] = "özoktatással? _ Belföld; pumë, i vjetër, vjeç; ئەردوغان ۋە قىرغىزىستان ; निम्न में से कौन सा हारडवेयर; ተለዋዋጭ የግድግዳ ; Дзейныя асобы:; « અમરેલીનાં મહિલા વિકાસ; \t\n\r🦙❤❥웃유♋☮✊;װיקיװערטערבוך ",
|
84 |
+
decoding[77] = "�zoktatással? _ Belf�ld; pumë, i vjetër, vjeç; ئەردوغان ۋە قىرغىزىستان ; निम्न में से कौन सा हारडवेयर; ተለዋዋጭ የግድግዳ ; Дзейныя асобы:; « અમરેલીનાં મહિલા વિકાસ; \t\n\r🦙❤❥웃유♋☮✊;װיקיװערטערבוך "
|
85 |
+
[2024-05-12 00:30:56] [INFO] [34044:7360] [__init__.py:343:load_tokenizer] loading tokenizer fnlp/moss-moon-003-sft
|
86 |
+
###################################
|
87 |
+
eson/kplug-base-encoder, <class 'tokenizers.models.WordPiece'>
|
88 |
+
reversible: false; unk_token: [UNK], 100, unk_ratio: 0.3625; oov: []
|
89 |
+
text[5] = "; Замглавы управления развития; 특히 주소 15~17번 홀에선 3연속; 確実に春が近づいてること; a közoktatással? _ Belföld; pumë, i vjetër, vjeç; ئەردوغان ۋە قىرغىزىستان ; निम्न में से कौन सा हारडवेयर; ተለዋዋጭ የግድግዳ ; Дзейныя асобы:; « અમરેલીનાં મહિલા વિકાસ; \t\n\r🦙❤❥웃유♋☮✊;װיקיװערטערבוך ",
|
90 |
+
decoding[5] = " ; [UNK] [UNK] [UNK] ; [UNK] [UNK] 15 ~ [UNK] [UNK] [UNK] ; [UNK] 実 [UNK] 春 [UNK] 近 [UNK] ; a kozoktatassal? _ belfold ; pume, i vjeter, vjec ; [UNK] [UNK] [UNK] ; [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] ; [UNK] [UNK] ; [UNK] асобы : ; « [UNK] [UNK] [UNK] ; [UNK] ; [UNK]"
|
91 |
+
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
|
92 |
+
[2024-05-12 00:31:36] [INFO] [34044:7360] [__init__.py:343:load_tokenizer] loading tokenizer google-bert/bert-base-cased
|
93 |
+
###################################
|
94 |
+
fnlp/moss-moon-003-sft, 应该是 sentencepiece.byte_bpe,待确认
|
95 |
+
reversible: false; unk_token: <|endoftext|>, 106028, unk_ratio: 0.0000; oov: []
|
96 |
+
text[74] = "a közoktatással? _ Belföld; pumë, i vjetër, vjeç; ئەردوغان ۋە قىرغىزىستان ; निम्न में से कौन सा हारडवेयर; ተለዋዋጭ የግድግዳ ; Дзейныя асобы:; « અમરેલીનાં મહિલા વિકાસ; \t\n\r🦙❤❥웃유♋☮✊;װיקיװערטערבוך ",
|
97 |
+
decoding[74] = " a közoktatással? _ Belföld; pumë, i vjetër, vjeç; ئەردوغان ۋە قىرغىزىستان ; निम्न में से कौन सा हारडवेयर; ተለዋዋጭ የግድግዳ ; Дзейныя асобы:; « અમરેલીનાં મહિલા વિકાસ; \t\n\r🦙❤❥웃유♋☮✊;װיקיװערטערבוך "
|
98 |
+
###################################
|
99 |
+
google-bert/bert-base-cased, <class 'tokenizers.models.WordPiece'>
|
100 |
+
reversible: false; unk_token: [UNK], 100, unk_ratio: 0.1732; oov: []
|
101 |
+
text[5] = "; Замглавы управления развития; 특히 주소 15~17번 홀에선 3연속; 確実に春が近づいてること; a közoktatással? _ Belföld; pumë, i vjetër, vjeç; ئەردوغان ۋە قىرغىزىستان ; निम्न में से कौन सा हारडवेयर; ተለዋዋጭ የግድግዳ ; Дзейныя асобы:; « અમરેલીનાં મહિલા વિકાસ; \t\n\r🦙❤❥웃유♋☮✊;װיקיװערטערבוך ",
|
102 |
+
decoding[5] = " ; Замглавы управления развития ; [UNK] [UNK] 15 ~ [UNK] [UNK] [UNK] ; [UNK] [UNK] に [UNK] [UNK] [UNK] [UNK] ; a közoktatással? _ Belföld ; pumë, i vjetër, vjeç ; [UNK] [UNK] قىرغىزىستان ; निम्न में से [UNK] सा [UNK] ; [UNK] [UNK] ; Дзейныя асобы : ; « [UNK] [UNK] [UNK] ; [UNK] ; [UNK]"
|
103 |
+
[2024-05-12 00:31:56] [INFO] [34044:7360] [__init__.py:343:load_tokenizer] loading tokenizer google-bert/bert-base-chinese
|
104 |
+
[2024-05-12 00:32:16] [INFO] [34044:7360] [__init__.py:343:load_tokenizer] loading tokenizer google-bert/bert-base-german-cased
|
105 |
+
###################################
|
106 |
+
google-bert/bert-base-chinese, <class 'tokenizers.models.WordPiece'>
|
107 |
+
reversible: false; unk_token: [UNK], 100, unk_ratio: 0.3704; oov: []
|
108 |
+
text[5] = "; Замглавы управления развития; 특히 주소 15~17번 홀에선 3연속; 確実に春が近づいてること; a közoktatással? _ Belföld; pumë, i vjetër, vjeç; ئەردوغان ۋە قىرغىزىستان ; निम्न में से कौन सा हारडवेयर; ተለዋዋጭ የግድግዳ ; Дзейныя асобы:; « અમરેલીનાં મહિલા વિકાસ; \t\n\r🦙❤❥웃유♋☮✊;װיקיװערטערבוך ",
|
109 |
+
decoding[5] = " ; [UNK] управления развития ; [UNK] [UNK] 15 ~ [UNK] [UNK] [UNK] ; 確 実 に 春 [UNK] 近 [UNK] ; a [UNK]? _ [UNK] ; [UNK], i [UNK], [UNK] ; [UNK] [UNK] [UNK] ; [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] ; [UNK] [UNK] ; [UNK] асобы : ; « [UNK] [UNK] [UNK] ; [UNK] ; [UNK]"
|
110 |
+
###################################
|
111 |
+
google-bert/bert-base-german-cased, <class 'tokenizers.models.WordPiece'>
|
112 |
+
reversible: false; unk_token: [UNK], 2, unk_ratio: 0.5938; oov: []
|
113 |
+
text[5] = "; Замглавы управления развития; 특히 주소 15~17번 홀에선 3연속; 確実に春が近づいてること; a közoktatással? _ Belföld; pumë, i vjetër, vjeç; ئەردوغان ۋە قىرغىزىستان ; निम्न में से कौन सा हारडवेयर; ተለዋዋጭ የግድግዳ ; Дзейныя асобы:; « અમરેલીનાં મહિલા વિકાસ; \t\n\r🦙❤❥웃유♋☮✊;װיקיװערטערבוך ",
|
114 |
+
decoding[5] = " ; [UNK] [UNK] [UNK] ; [UNK] [UNK] 15 ~ [UNK] [UNK] [UNK] ; [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] ; a [UNK]? _ Belföld ; [UNK], i [UNK], [UNK] ; [UNK] [UNK] [UNK] ; [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] ; [UNK] [UNK] ; [UNK] [UNK] : ; [UNK] [UNK] [UNK] [UNK] ; [UNK] ; [UNK]"
|
115 |
+
[2024-05-12 00:32:36] [INFO] [34044:7360] [__init__.py:343:load_tokenizer] loading tokenizer google-bert/bert-base-multilingual-cased
|
116 |
+
[2024-05-12 00:32:57] [INFO] [34044:7360] [__init__.py:343:load_tokenizer] loading tokenizer google-bert/bert-base-multilingual-uncased
|
117 |
+
###################################
|
118 |
+
google-bert/bert-base-multilingual-cased, <class 'tokenizers.models.WordPiece'>
|
119 |
+
reversible: false; unk_token: [UNK], 100, unk_ratio: 0.0531; oov: []
|
120 |
+
text[5] = "; Замглавы управления развития; 특히 주소 15~17번 홀에선 3연속; 確実に春が近づいてること; a közoktatással? _ Belföld; pumë, i vjetër, vjeç; ئەردوغان ۋە قىرغىزىستان ; निम्न में से कौन सा हारडवेयर; ተለዋዋጭ የግድግዳ ; Дзейныя асобы:; « અમરેલીનાં મહિલા વિકાસ; \t\n\r🦙❤❥웃유♋☮✊;װיקיװערטערבוך ",
|
121 |
+
decoding[5] = " ; Замглавы управления развития ; 특히 주소 15 ~ 17번 홀에선 3연속 ; 確 実 に 春 が 近 づいてること ; a közoktatással? _ Belföld ; pumë, i vjetër, vjeç ; [UNK] [UNK] قىرغىزىستان ; निम्न में से कौन सा हारडवेयर ; [UNK] [UNK] ; Дзейныя асобы : ; « અમરેલીનાં મહિલા વિકાસ ; [UNK] ; [UNK]"
|
122 |
+
[2024-05-12 00:33:17] [INFO] [34044:7360] [__init__.py:343:load_tokenizer] loading tokenizer google-bert/bert-base-uncased
|
123 |
+
###################################
|
124 |
+
google-bert/bert-base-multilingual-uncased, <class 'tokenizers.models.WordPiece'>
|
125 |
+
reversible: false; unk_token: [UNK], 100, unk_ratio: 0.0360; oov: []
|
126 |
+
text[5] = "; Замглавы управления развития; 특히 주소 15~17번 홀에선 3연속; 確実に春が近づいてること; a közoktatással? _ Belföld; pumë, i vjetër, vjeç; ئەردوغان ۋە قىرغىزىستان ; निम्न में से कौन सा हारडवेयर; ተለዋዋጭ የግድግዳ ; Дзейныя асобы:; « અમરેલીનાં મહિલા વિકાસ; \t\n\r🦙❤❥웃유♋☮✊;װיקיװערטערבוך ",
|
127 |
+
decoding[5] = " ; замглавы управления развития ; 특히 주소 15 ~ 17번 홀에선 3연속 ; 確 実 に 春 か 近 ついてること ; a kozoktatassal? _ belfold ; pume, i vjeter, vjec ; يەردوغان ۋە قىرغىزىستان ; निमन म स कौन सा हारडवयर ; [UNK] [UNK] ; дзеиныя асобы : ; « અમરલીના મહિલા વિકાસ ; [UNK] ; [UNK]"
|
128 |
+
[2024-05-12 00:33:37] [INFO] [34044:7360] [__init__.py:343:load_tokenizer] loading tokenizer google-t5/t5-large
|
129 |
+
###################################
|
130 |
+
google-bert/bert-base-uncased, <class 'tokenizers.models.WordPiece'>
|
131 |
+
reversible: false; unk_token: [UNK], 100, unk_ratio: 0.0867; oov: []
|
132 |
+
text[5] = "; Замглавы управления развития; 특히 주소 15~17번 홀에선 3연속; 確実に春が近づいてること; a közoktatással? _ Belföld; pumë, i vjetër, vjeç; ئەردوغان ۋە قىرغىزىستان ; निम्न में से कौन सा हारडवेयर; ተለዋዋጭ የግድግዳ ; Дзейныя асобы:; « અમરેલીનાં મહિલા વિકાસ; \t\n\r🦙❤❥웃유♋☮✊;װיקיװערטערבוך ",
|
133 |
+
decoding[5] = " ; замглавы управления развития ; 특히 주소 15 ~ 17번 홀에선 3연속 ; [UNK] [UNK] に 春 か [UNK] ついてること ; a kozoktatassal? _ belfold ; pume, i vjeter, vjec ; [UNK] [UNK] قىرغىزىستان ; निमन म स [UNK] सा हारडवयर ; [UNK] [UNK] ; дзеиныя асобы : ; « [UNK] [UNK] [UNK] ; [UNK] ; [UNK]"
|
134 |
+
###################################
|
135 |
+
google-t5/t5-large, <class 'tokenizers.models.Unigram'>
|
136 |
+
reversible: false; unk_token: <unk>, 2, unk_ratio: 0.2769; oov: []
|
137 |
+
text[7] = "Замглавы управления развития; 특히 주소 15~17번 홀에선 3연속; 確実に春が近づいてること; a közoktatással? _ Belföld; pumë, i vjetër, vjeç; ئەردوغان ۋە قىرغىزىستان ; निम्न में से कौन सा हारडवेयर; ተለዋዋጭ የግድግዳ ; Дзейныя асобы:; « અમરેલીનાં મહિલા વિકાસ; \t\n\r🦙❤❥웃유♋☮✊;װיקיװערטערבוך ",
|
138 |
+
decoding[7] = "<unk>ам<unk>лав<unk> у<unk>равлени<unk> ра<unk>вити<unk>; <unk> <unk> 15<unk>17<unk> <unk> 3<unk>; <unk>; a közoktatással? _ Belföld; pum<unk>, i vjet<unk>r, vjeç; <unk> <unk> <unk> ; <unk> <unk> <unk> <unk> <unk> <unk>; <unk> <unk> ; <unk>е<unk>н<unk> асо<unk>:; « <unk> <unk> <unk>; <unk>;<unk>"
|
139 |
+
[2024-05-12 00:34:57] [INFO] [34044:7360] [__init__.py:343:load_tokenizer] loading tokenizer google/byt5-small
|
140 |
+
[2024-05-12 00:35:18] [INFO] [34044:7360] [__init__.py:343:load_tokenizer] loading tokenizer google/gemma-7b
|
141 |
+
[2024-05-12 00:35:39] [INFO] [34044:7360] [__init__.py:343:load_tokenizer] loading tokenizer google/mobilebert-uncased
|
142 |
+
[2024-05-12 00:36:59] [INFO] [34044:7360] [__init__.py:343:load_tokenizer] loading tokenizer google/mt5-large
|
143 |
+
###################################
|
144 |
+
google/mobilebert-uncased, <class 'tokenizers.models.WordPiece'>
|
145 |
+
reversible: false; unk_token: [UNK], 100, unk_ratio: 0.0867; oov: []
|
146 |
+
text[5] = "; Замглавы управления развития; 특히 주소 15~17번 홀에선 3연속; 確実に春が近づいてること; a közoktatással? _ Belföld; pumë, i vjetër, vjeç; ئەردوغان ۋە قىرغىزىستان ; निम्न में से कौन सा हारडवेयर; ተለዋዋጭ የግድግዳ ; Дзейныя асобы:; « અમરેલીનાં મહિલા વિકાસ; \t\n\r🦙❤❥웃유♋☮✊;װיקיװערטערבוך ",
|
147 |
+
decoding[5] = " ; замглавы управления развития ; 특히 주소 15 ~ 17번 홀에선 3연속 ; [UNK] [UNK] に 春 か [UNK] ついてること ; a kozoktatassal? _ belfold ; pume, i vjeter, vjec ; [UNK] [UNK] قىرغىزىستان ; निमन म स [UNK] सा हारडवयर ; [UNK] [UNK] ; дзеиныя асобы : ; « [UNK] [UNK] [UNK] ; [UNK] ; [UNK]"
|
148 |
+
C:\Users\xusong28\Miniconda3\envs\py3.10-torch1.13-hf.latest\lib\site-packages\transformers\convert_slow_tokenizer.py:560: UserWarning: The sentencepiece tokenizer that you are converting to a fast tokenizer uses the byte fallback option which is not implemented in the fast tokenizers. In practice this means that the fast version of the tokenizer can produce unknown tokens whereas the sentencepiece version would have converted these unknown tokens into a sequence of byte tokens matching the original piece of text.
|
149 |
+
warnings.warn(
|
150 |
+
[2024-05-12 00:37:23] [INFO] [34044:7360] [__init__.py:343:load_tokenizer] loading tokenizer google/switch-c-2048
|
151 |
+
###################################
|
152 |
+
google/mt5-large, <class 'tokenizers.models.Unigram'>
|
153 |
+
reversible: false; unk_token: <unk>, 2, unk_ratio: 0.0079; oov: []
|
154 |
+
text[73] = " a közoktatással? _ Belföld; pumë, i vjetër, vjeç; ئەردوغان ۋە قىرغىزىستان ; निम्न में से कौन सा हारडवेयर; ተለዋዋጭ የግድግዳ ; Дзейныя асобы:; « અમરેલીનાં મહિલા વિકાસ; \t\n\r🦙❤❥웃유♋☮✊;װיקיװערטערבוך ",
|
155 |
+
decoding[73] = "a közoktatással? _ Belföld; pumë, i vjetër, vjeç; ئەردوغان ۋە قىرغىزىستان ; निम्न में से कौन सा हारडवेयर; ተለዋዋጭ የግድግዳ ; Дзейныя асобы:; « અમરેલીનાં મહિલા વિકાસ; <unk>❤❥웃유♋☮✊;װיקיװערטערבוך"
|
156 |
+
[2024-05-12 00:37:43] [INFO] [34044:7360] [__init__.py:343:load_tokenizer] loading tokenizer hfl/chinese-alpaca-lora-7b
|
157 |
+
###################################
|
158 |
+
google/switch-c-2048, <class 'tokenizers.models.Unigram'>
|
159 |
+
reversible: false; unk_token: <unk>, 2, unk_ratio: 0.2769; oov: []
|
160 |
+
text[7] = "Замглавы управления развития; 특히 주소 15~17번 홀에선 3연속; 確実に春が近づいてること; a közoktatással? _ Belföld; pumë, i vjetër, vjeç; ئەردوغان ۋە قىرغىزىستان ; निम्न में से कौन सा हारडवेयर; ተለዋዋጭ የግድግዳ ; Дзейныя асобы:; « અમરેલીનાં મહિલા વિકાસ; \t\n\r🦙❤❥웃유♋☮✊;װיקיװערטערבוך ",
|
161 |
+
decoding[7] = "<unk>ам<unk>лав<unk> у<unk>равлени<unk> ра<unk>вити<unk>; <unk> <unk> 15<unk>17<unk> <unk> 3<unk>; <unk>; a közoktatással? _ Belföld; pum<unk>, i vjet<unk>r, vjeç; <unk> <unk> <unk> ; <unk> <unk> <unk> <unk> <unk> <unk>; <unk> <unk> ; <unk>е<unk>н<unk> асо<unk>:; « <unk> <unk> <unk>; <unk>;<unk>"
|
162 |
+
You are using the default legacy behaviour of the <class 'transformers.models.llama.tokenization_llama.LlamaTokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
|
163 |
+
[2024-05-12 00:38:04] [INFO] [34044:7360] [__init__.py:343:load_tokenizer] loading tokenizer hfl/chinese-llama-2-7b
|
164 |
+
[2024-05-12 00:38:25] [INFO] [34044:7360] [__init__.py:343:load_tokenizer] loading tokenizer hfl/chinese-llama-lora-7b
|
165 |
+
[2024-05-12 00:38:46] [INFO] [34044:7360] [__init__.py:343:load_tokenizer] loading tokenizer hfl/llama-3-chinese-8b
|
166 |
+
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
|
167 |
+
[2024-05-12 00:39:07] [INFO] [34044:7360] [__init__.py:343:load_tokenizer] loading tokenizer hpcai-tech/grok-1
|
168 |
+
[2024-05-12 00:39:28] [INFO] [34044:7360] [__init__.py:343:load_tokenizer] loading tokenizer internlm/internlm-chat-7b
|
169 |
+
[2024-05-12 00:40:09] [INFO] [34044:7360] [__init__.py:343:load_tokenizer] loading tokenizer internlm/internlm-xcomposer-7b
|
170 |
+
[2024-05-12 00:40:31] [INFO] [34044:7360] [__init__.py:343:load_tokenizer] loading tokenizer internlm/internlm2-chat-7b
|
171 |
+
[2024-05-12 00:41:13] [INFO] [34044:7360] [__init__.py:343:load_tokenizer] loading tokenizer internlm/internlm2-math-7b
|
172 |
+
[2024-05-12 00:41:35] [INFO] [34044:7360] [__init__.py:343:load_tokenizer] loading tokenizer lmsys/fastchat-t5-3b-v1.0
|
173 |
+
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
|
174 |
+
###################################
|
175 |
+
[2024-05-12 00:41:55] [INFO] [34044:7360] [__init__.py:343:load_tokenizer] loading tokenizer meta-llama/Llama-2-7b-chat
|
176 |
+
lmsys/fastchat-t5-3b-v1.0, sp_model, byte_num: 0
|
177 |
+
reversible: false; unk_token: <unk>, 2, unk_ratio: 0.2105; oov: []
|
178 |
+
text[7] = "Замглавы управления развития; 특히 주소 15~17번 홀에선 3연속; 確実に春が近づいてること; a közoktatással? _ Belföld; pumë, i vjetër, vjeç; ئەردوغان ۋە قىرغىزىستان ; निम्न में से कौन सा हारडवेयर; ተለዋዋጭ የግድግዳ ; Дзейныя асобы:; « અમરેલીનાં મહિલા વિકાસ; \t\n\r🦙❤❥웃유♋☮✊;װיקיװערטערבוך ",
|
179 |
+
decoding[7] = " <unk> ам<unk> лав<unk> у<unk> равлени<unk> ра<unk> вити<unk>; <unk> <unk> 15<unk> 17<unk> <unk> 3<unk>; <unk>; a közoktatással? _ Belföld; pum<unk>, i vjet<unk>r, vjeç; <unk> <unk> <unk> ; <unk> <unk> <unk> <unk> <unk> <unk>; <unk> <unk> ; <unk> е<unk> н<unk> асо<unk>:; « <unk> <unk> <unk>; \t \n <unk> ;<unk> "
|
180 |
+
[2024-05-12 00:41:55] [INFO] [34044:7360] [__init__.py:343:load_tokenizer] loading tokenizer meta-llama/Meta-Llama-3-8B
|
181 |
+
[2024-05-12 00:41:55] [INFO] [34044:7360] [__init__.py:343:load_tokenizer] loading tokenizer microsoft/Phi-3-mini-4k-instruct
|
182 |
+
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
|
183 |
+
[2024-05-12 00:42:16] [INFO] [34044:7360] [__init__.py:343:load_tokenizer] loading tokenizer microsoft/phi-1
|
184 |
+
[2024-05-12 00:42:36] [INFO] [34044:7360] [__init__.py:343:load_tokenizer] loading tokenizer microsoft/phi-2
|
185 |
+
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
|
186 |
+
[2024-05-12 00:42:56] [INFO] [34044:7360] [__init__.py:343:load_tokenizer] loading tokenizer mistralai/Mistral-7B-v0.1
|
187 |
+
[2024-05-12 00:43:16] [INFO] [34044:7360] [__init__.py:343:load_tokenizer] loading tokenizer mistralai/Mixtral-8x7B-v0.1
|
188 |
+
[2024-05-12 00:43:37] [INFO] [34044:7360] [__init__.py:343:load_tokenizer] loading tokenizer openai-community/gpt2
|
189 |
+
[2024-05-12 00:43:57] [INFO] [34044:7360] [__init__.py:343:load_tokenizer] loading tokenizer openai/code-davinci-002
|
190 |
+
[2024-05-12 00:43:57] [INFO] [34044:7360] [__init__.py:343:load_tokenizer] loading tokenizer openai/gpt-3.5-turbo
|
191 |
+
[2024-05-12 00:43:57] [INFO] [34044:7360] [__init__.py:343:load_tokenizer] loading tokenizer openai/gpt-4
|
192 |
+
[2024-05-12 00:43:57] [INFO] [34044:7360] [__init__.py:343:load_tokenizer] loading tokenizer openai/text-davinci-003
|
193 |
+
[2024-05-12 00:43:57] [INFO] [34044:7360] [__init__.py:343:load_tokenizer] loading tokenizer paust/pko-t5-large
|
194 |
+
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
|
195 |
+
[2024-05-12 00:44:18] [INFO] [34044:7360] [__init__.py:343:load_tokenizer] loading tokenizer thu-coai/CharacterGLM-6B
|
196 |
+
[2024-05-12 00:44:58] [INFO] [34044:7360] [__init__.py:343:load_tokenizer] loading tokenizer tiiuae/falcon-180b
|
197 |
+
[2024-05-12 00:45:19] [INFO] [34044:7360] [__init__.py:343:load_tokenizer] loading tokenizer tiiuae/falcon-7b
|
198 |
+
|
199 |
+
Process finished with exit code 0
|
200 |
+
|
201 |
+
|
202 |
+
```
|
utils/oov_util.py
CHANGED
@@ -2,11 +2,117 @@
|
|
2 |
|
3 |
|
4 |
import os
|
|
|
|
|
5 |
|
6 |
-
CURRENT_DIR = os.path.dirname(os.path.abspath(__file__))
|
7 |
|
8 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
9 |
|
10 |
|
11 |
|
12 |
-
docs = [line.strip() for line in open(os.path.join(CURRENT_DIR, "test.txt"), "r", encoding="utf-8")]
|
|
|
2 |
|
3 |
|
4 |
import os
|
5 |
+
import json
|
6 |
+
from vocab import all_tokenizer_config, load_tokenizer, TokenizerImpl
|
7 |
|
|
|
8 |
|
9 |
+
text = "hello; Замглавы управления развития; 특히 주소 15~17번 홀에선 3연속;" \
|
10 |
+
" 確実に春が近づいてること; a közoktatással? _ Belföld;" \
|
11 |
+
" pumë, i vjetër, vjeç; ئەردوغان ۋە قىرغىزىستان ;" \
|
12 |
+
" निम्न में से कौन सा हारडवेयर; ተለዋዋጭ የግድግዳ ; Дзейныя асобы:;" \
|
13 |
+
" « અમરેલીનાં મહિલા વિકાસ; 🦙❤❥웃유♋☮✊;" \
|
14 |
+
"װיקיװערטערבוך "
|
15 |
+
whitespace = "\t \n\n\r "
|
16 |
+
bytes = b"\x00\x01\x02\x03\x04".decode('utf-8')
|
17 |
+
|
18 |
+
text += whitespace
|
19 |
+
|
20 |
+
|
21 |
+
def get_unk(tokenizer_config):
|
22 |
+
tokenizer = load_tokenizer(tokenizer_config)
|
23 |
+
if hasattr(tokenizer, "unk_token"):
|
24 |
+
return f"{tokenizer.unk_token}, {tokenizer.unk_token_id}"
|
25 |
+
else:
|
26 |
+
return "unk_token not found"
|
27 |
+
|
28 |
+
|
29 |
+
# def infer_tokenizer_impl(tokenizer_config):
|
30 |
+
def infer_tokenizer_type(tokenizer_config):
|
31 |
+
tokenizer = load_tokenizer(tokenizer_config)
|
32 |
+
if tokenizer_config.impl == TokenizerImpl.TikToken:
|
33 |
+
return "tiktoken"
|
34 |
+
if hasattr(tokenizer, "backend_tokenizer"):
|
35 |
+
return str(type(tokenizer.backend_tokenizer.model)) # type(tokenizer._tokenizer.model))
|
36 |
+
# orion: sp_model.Load(vocab_file),继承 PreTrainedTokenizer
|
37 |
+
elif hasattr(tokenizer, "sp_model"): # 基于 sentencepiece 包
|
38 |
+
# for i in range(tokenizer.sp_model.piece_size()):
|
39 |
+
# if tokenizer.sp_model.is_byte(i):
|
40 |
+
# print("")
|
41 |
+
return f"sp_model, byte_num: {sum([tokenizer.sp_model.is_byte(i) for i in range(tokenizer.sp_model.piece_size())])}"
|
42 |
+
|
43 |
+
# sp.Load(model_path) ,并且包括image_tokenizer
|
44 |
+
elif "glm-" in tokenizer_config.name_or_path:
|
45 |
+
return f"byte_num: {sum([tokenizer.sp_tokenizer.text_tokenizer.sp.is_byte(i) for i in range(tokenizer.sp_tokenizer.text_tokenizer.sp.piece_size())])}"
|
46 |
+
# sp.Load(model_path) ,没有image_tokenizer
|
47 |
+
elif "glm2-" in tokenizer_config.name_or_path \
|
48 |
+
or "glm3-" in tokenizer_config.name_or_path \
|
49 |
+
or "CharacterGLM-6B" in tokenizer_config.name_or_path:
|
50 |
+
return f"byte_num: {sum([tokenizer.tokenizer.sp_model.is_byte(i) for i in range(tokenizer.tokenizer.sp_model.piece_size())])}"
|
51 |
+
elif "abeja/gpt-neox-japanese-2.7b" == tokenizer_config.name_or_path: # 支持 byte-level,解决oov问题
|
52 |
+
return f"japanese-bpe: https://github.com/tanreinama/Japanese-BPEEncoder_V2"
|
53 |
+
# bert-base-japanese: 特殊的地方在于 "word_tokenizer_type": "mecab",见 https://huggingface.co/tohoku-nlp/bert-base-japanese/blob/main/tokenizer_config.json
|
54 |
+
elif "bert-base-japanese" in tokenizer_config.name_or_path:
|
55 |
+
return "wordpiece.MecabTokenizer, 支持byte-level https://taku910.github.io/mecab/"
|
56 |
+
elif "moss" in tokenizer_config.name_or_path:
|
57 |
+
return "应该是 sentencepiece.byte_bpe,待确认"
|
58 |
+
elif "byt5" in tokenizer_config.name_or_path:
|
59 |
+
return "未知,待定"
|
60 |
+
else:
|
61 |
+
print("catch", tokenizer_config.name_or_path)
|
62 |
+
raise "error"
|
63 |
+
|
64 |
+
|
65 |
+
|
66 |
+
|
67 |
+
|
68 |
+
def test_reversible(tokenizer_config):
|
69 |
+
"""
|
70 |
+
xlm-roberta-base 为什么oov这么少?是因为有 byte吗?
|
71 |
+
:param tokenizer_config:
|
72 |
+
:return:
|
73 |
+
"""
|
74 |
+
tokenizer = load_tokenizer(tokenizer_config)
|
75 |
+
encoding = tokenizer.encode(text, add_special_tokens=False)
|
76 |
+
decoding = tokenizer.decode(encoding)
|
77 |
+
|
78 |
+
if text in decoding:
|
79 |
+
# print(tokenizer_config.name, tokenizer_config.impl, "reversible: true")
|
80 |
+
pass
|
81 |
+
else:
|
82 |
+
unk_count = sum([1 for token_id in encoding if token_id == tokenizer.unk_token_id])
|
83 |
+
oov_tokens = []
|
84 |
+
# if tokenizer_config.impl == TokenizerImpl.SentencePiece:
|
85 |
+
# print(sum([tokenizer.is_byte(i) for i in range(tokenizer.piece_size())]))
|
86 |
+
|
87 |
+
print("#######"*5)
|
88 |
+
print(f"{tokenizer_config.name_or_path}, {infer_tokenizer_type(tokenizer_config)}\n"
|
89 |
+
f"reversible: false; unk_token: {get_unk(tokenizer_config)},"
|
90 |
+
f" unk_ratio: {unk_count/len(encoding):.4f}; oov: []")
|
91 |
+
for i in range(len(text)):
|
92 |
+
if text[i] != decoding[i]:
|
93 |
+
# print(f"text[{i}] = {str(bytes(text[i:], 'utf-8'))}\n"
|
94 |
+
# f"decoding[{i}] = {str(bytes(decoding[i:], 'utf-8'))}")
|
95 |
+
print(f"text[{i}] = {json.dumps(text[i:], ensure_ascii=False)}, \n"
|
96 |
+
f"decoding[{i}] = {json.dumps(decoding[i:], ensure_ascii=False)}")
|
97 |
+
|
98 |
+
break
|
99 |
+
|
100 |
+
|
101 |
+
|
102 |
+
for config in all_tokenizer_config:
|
103 |
+
# if "xlm-roberta-base" in config.name:
|
104 |
+
# if "xlm-roberta-base" in config.name:
|
105 |
+
# if "chatglm3-6b" in config.name:
|
106 |
+
# if "bert-base-japanese" in config.name:
|
107 |
+
# if "moss" in config.name:
|
108 |
+
# if "byt5" in config.name:
|
109 |
+
if "baichuan" in config.name_or_path:
|
110 |
+
# if "CharacterGLM-6B" in config.name:
|
111 |
+
# if "fastchat-t5" in config.name: # 报错 pyo3_runtime.PanicException: AddedVocabulary bad split
|
112 |
+
# if True:
|
113 |
+
# test_unk(config)
|
114 |
+
test_reversible(config)
|
115 |
+
|
116 |
|
117 |
|
118 |
|
|
utils/speed_util.py
DELETED
@@ -1,9 +0,0 @@
|
|
1 |
-
"""
|
2 |
-
encode速度
|
3 |
-
decode速度
|
4 |
-
|
5 |
-
## examples
|
6 |
-
|
7 |
-
qwen的encode速度有点慢
|
8 |
-
|
9 |
-
"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
utils/symbol.py
DELETED
@@ -1,35 +0,0 @@
|
|
1 |
-
"""
|
2 |
-
special_symbols: https://github.com/google/sentencepiece/blob/master/doc/special_symbols.md
|
3 |
-
emoji:
|
4 |
-
"""
|
5 |
-
|
6 |
-
import sys
|
7 |
-
|
8 |
-
|
9 |
-
# 来自 https://github.com/NVIDIA/Megatron-LM/blob/main/megatron/tokenizer/gpt2_tokenization.py#L55
|
10 |
-
# 啥意思?
|
11 |
-
def bytes_to_unicode():
|
12 |
-
"""
|
13 |
-
Returns list of utf-8 byte and a corresponding list of unicode strings.
|
14 |
-
The reversible bpe codes work on unicode strings.
|
15 |
-
This means you need a large # of unicode characters in your vocab if you want to avoid UNKs.
|
16 |
-
When you're at something like a 10B token dataset you end up needing around 5K for decent coverage.
|
17 |
-
This is a signficant percentage of your normal, say, 32K bpe vocab.
|
18 |
-
To avoid that, we want lookup tables between utf-8 bytes and unicode strings.
|
19 |
-
And avoids mapping to whitespace/control characters the bpe code barfs on.
|
20 |
-
"""
|
21 |
-
_chr = unichr if sys.version_info[0] == 2 else chr
|
22 |
-
bs = list(range(ord("!"), ord("~") + 1)) + list(range(ord("¡"), ord("¬") + 1)) + \
|
23 |
-
list(range(ord("®"), ord("ÿ") + 1))
|
24 |
-
cs = bs[:]
|
25 |
-
n = 0
|
26 |
-
for b in range(2**8):
|
27 |
-
if b not in bs:
|
28 |
-
bs.append(b)
|
29 |
-
cs.append(2**8 + n)
|
30 |
-
n += 1
|
31 |
-
cs = [_chr(n) for n in cs]
|
32 |
-
return dict(zip(bs, cs))
|
33 |
-
|
34 |
-
aa = bytes_to_unicode()
|
35 |
-
print(aa)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
utils/text_util.py
CHANGED
@@ -1,12 +1,23 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
|
2 |
def is_digit_char(uchar):
|
3 |
return uchar in "0123456789"
|
4 |
|
5 |
|
6 |
-
def
|
7 |
return any(is_digit_char(ch) for ch in text)
|
8 |
|
9 |
|
|
|
|
|
|
|
10 |
def is_all_digit(text):
|
11 |
return all(is_digit_char(char) for char in text)
|
12 |
|
|
|
1 |
+
"""
|
2 |
+
char_
|
3 |
+
"""
|
4 |
+
|
5 |
+
|
6 |
+
def detect_lang_from_unicode():
|
7 |
+
pass
|
8 |
+
|
9 |
|
10 |
def is_digit_char(uchar):
|
11 |
return uchar in "0123456789"
|
12 |
|
13 |
|
14 |
+
def contains_digit(text):
|
15 |
return any(is_digit_char(ch) for ch in text)
|
16 |
|
17 |
|
18 |
+
def get_digit_count(text):
|
19 |
+
pass
|
20 |
+
|
21 |
def is_all_digit(text):
|
22 |
return all(is_digit_char(char) for char in text)
|
23 |
|
utils/vocab.jd.txt.v2
DELETED
@@ -1,10268 +0,0 @@
|
|
1 |
-
[PAD]
|
2 |
-
[unused1]
|
3 |
-
[unused2]
|
4 |
-
[unused3]
|
5 |
-
[unused4]
|
6 |
-
[unused5]
|
7 |
-
[unused6]
|
8 |
-
[unused7]
|
9 |
-
[unused8]
|
10 |
-
[unused9]
|
11 |
-
[unused10]
|
12 |
-
[unused11]
|
13 |
-
[unused12]
|
14 |
-
[unused13]
|
15 |
-
[unused14]
|
16 |
-
[unused15]
|
17 |
-
[unused16]
|
18 |
-
[unused17]
|
19 |
-
[unused18]
|
20 |
-
[unused19]
|
21 |
-
[unused20]
|
22 |
-
[unused21]
|
23 |
-
[unused22]
|
24 |
-
[unused23]
|
25 |
-
[unused24]
|
26 |
-
[unused25]
|
27 |
-
[unused26]
|
28 |
-
[unused27]
|
29 |
-
[unused28]
|
30 |
-
[unused29]
|
31 |
-
[unused30]
|
32 |
-
[unused31]
|
33 |
-
[unused32]
|
34 |
-
[unused33]
|
35 |
-
[unused34]
|
36 |
-
[unused35]
|
37 |
-
[unused36]
|
38 |
-
[unused37]
|
39 |
-
[unused38]
|
40 |
-
[unused39]
|
41 |
-
[unused40]
|
42 |
-
[unused41]
|
43 |
-
[unused42]
|
44 |
-
[unused43]
|
45 |
-
[unused44]
|
46 |
-
[unused45]
|
47 |
-
[unused46]
|
48 |
-
[unused47]
|
49 |
-
[unused48]
|
50 |
-
[unused49]
|
51 |
-
[unused50]
|
52 |
-
[unused51]
|
53 |
-
[unused52]
|
54 |
-
[unused53]
|
55 |
-
[unused54]
|
56 |
-
[unused55]
|
57 |
-
[unused56]
|
58 |
-
[unused57]
|
59 |
-
[unused58]
|
60 |
-
[unused59]
|
61 |
-
[unused60]
|
62 |
-
[unused61]
|
63 |
-
[unused62]
|
64 |
-
[unused63]
|
65 |
-
[unused64]
|
66 |
-
[unused65]
|
67 |
-
[unused66]
|
68 |
-
[unused67]
|
69 |
-
[unused68]
|
70 |
-
[unused69]
|
71 |
-
[unused70]
|
72 |
-
[unused71]
|
73 |
-
[unused72]
|
74 |
-
[unused73]
|
75 |
-
[unused74]
|
76 |
-
[unused75]
|
77 |
-
[unused76]
|
78 |
-
[unused77]
|
79 |
-
[unused78]
|
80 |
-
[unused79]
|
81 |
-
[unused80]
|
82 |
-
[unused81]
|
83 |
-
[unused82]
|
84 |
-
[unused83]
|
85 |
-
[unused84]
|
86 |
-
[unused85]
|
87 |
-
[unused86]
|
88 |
-
[unused87]
|
89 |
-
[unused88]
|
90 |
-
[unused89]
|
91 |
-
[unused90]
|
92 |
-
[unused91]
|
93 |
-
[unused92]
|
94 |
-
[unused93]
|
95 |
-
[unused94]
|
96 |
-
[unused95]
|
97 |
-
[unused96]
|
98 |
-
[unused97]
|
99 |
-
[unused98]
|
100 |
-
[unused99]
|
101 |
-
[UNK]
|
102 |
-
[CLS]
|
103 |
-
[SEP]
|
104 |
-
[MASK]
|
105 |
-
<S>
|
106 |
-
<T>
|
107 |
-
!
|
108 |
-
"
|
109 |
-
”
|
110 |
-
“
|
111 |
-
—
|
112 |
-
–
|
113 |
-
…
|
114 |
-
’
|
115 |
-
‘
|
116 |
-
#
|
117 |
-
$
|
118 |
-
%
|
119 |
-
&
|
120 |
-
'
|
121 |
-
(
|
122 |
-
)
|
123 |
-
*
|
124 |
-
+
|
125 |
-
,
|
126 |
-
-
|
127 |
-
.
|
128 |
-
/
|
129 |
-
0
|
130 |
-
1
|
131 |
-
2
|
132 |
-
3
|
133 |
-
4
|
134 |
-
5
|
135 |
-
6
|
136 |
-
7
|
137 |
-
8
|
138 |
-
9
|
139 |
-
10
|
140 |
-
11
|
141 |
-
12
|
142 |
-
13
|
143 |
-
14
|
144 |
-
15
|
145 |
-
16
|
146 |
-
17
|
147 |
-
18
|
148 |
-
19
|
149 |
-
20
|
150 |
-
21
|
151 |
-
22
|
152 |
-
23
|
153 |
-
24
|
154 |
-
25
|
155 |
-
26
|
156 |
-
27
|
157 |
-
28
|
158 |
-
29
|
159 |
-
30
|
160 |
-
31
|
161 |
-
32
|
162 |
-
33
|
163 |
-
34
|
164 |
-
35
|
165 |
-
36
|
166 |
-
37
|
167 |
-
38
|
168 |
-
39
|
169 |
-
40
|
170 |
-
41
|
171 |
-
42
|
172 |
-
43
|
173 |
-
44
|
174 |
-
45
|
175 |
-
46
|
176 |
-
47
|
177 |
-
48
|
178 |
-
49
|
179 |
-
50
|
180 |
-
51
|
181 |
-
52
|
182 |
-
53
|
183 |
-
54
|
184 |
-
55
|
185 |
-
56
|
186 |
-
57
|
187 |
-
58
|
188 |
-
59
|
189 |
-
60
|
190 |
-
61
|
191 |
-
62
|
192 |
-
63
|
193 |
-
64
|
194 |
-
65
|
195 |
-
66
|
196 |
-
67
|
197 |
-
68
|
198 |
-
69
|
199 |
-
70
|
200 |
-
71
|
201 |
-
72
|
202 |
-
73
|
203 |
-
74
|
204 |
-
75
|
205 |
-
76
|
206 |
-
77
|
207 |
-
78
|
208 |
-
79
|
209 |
-
80
|
210 |
-
81
|
211 |
-
82
|
212 |
-
83
|
213 |
-
84
|
214 |
-
85
|
215 |
-
86
|
216 |
-
87
|
217 |
-
88
|
218 |
-
89
|
219 |
-
90
|
220 |
-
91
|
221 |
-
92
|
222 |
-
93
|
223 |
-
94
|
224 |
-
95
|
225 |
-
96
|
226 |
-
97
|
227 |
-
98
|
228 |
-
99
|
229 |
-
100
|
230 |
-
120
|
231 |
-
128
|
232 |
-
180
|
233 |
-
200
|
234 |
-
256
|
235 |
-
304
|
236 |
-
360
|
237 |
-
500
|
238 |
-
512
|
239 |
-
1000
|
240 |
-
1080
|
241 |
-
2000
|
242 |
-
2014
|
243 |
-
2015
|
244 |
-
2016
|
245 |
-
2017
|
246 |
-
2018
|
247 |
-
2019
|
248 |
-
2020
|
249 |
-
2021
|
250 |
-
2022
|
251 |
-
:
|
252 |
-
;
|
253 |
-
<
|
254 |
-
=
|
255 |
-
>
|
256 |
-
?
|
257 |
-
@
|
258 |
-
[
|
259 |
-
\
|
260 |
-
]
|
261 |
-
^
|
262 |
-
_
|
263 |
-
a
|
264 |
-
b
|
265 |
-
c
|
266 |
-
d
|
267 |
-
e
|
268 |
-
f
|
269 |
-
g
|
270 |
-
h
|
271 |
-
i
|
272 |
-
j
|
273 |
-
k
|
274 |
-
l
|
275 |
-
m
|
276 |
-
n
|
277 |
-
o
|
278 |
-
p
|
279 |
-
q
|
280 |
-
r
|
281 |
-
s
|
282 |
-
t
|
283 |
-
u
|
284 |
-
v
|
285 |
-
w
|
286 |
-
x
|
287 |
-
y
|
288 |
-
z
|
289 |
-
{
|
290 |
-
|
|
291 |
-
}
|
292 |
-
~
|
293 |
-
£
|
294 |
-
¤
|
295 |
-
¥
|
296 |
-
§
|
297 |
-
«
|
298 |
-
°
|
299 |
-
±
|
300 |
-
²
|
301 |
-
³
|
302 |
-
µ
|
303 |
-
·
|
304 |
-
¹
|
305 |
-
º
|
306 |
-
»
|
307 |
-
¼
|
308 |
-
×
|
309 |
-
ß
|
310 |
-
æ
|
311 |
-
÷
|
312 |
-
ø
|
313 |
-
đ
|
314 |
-
ŋ
|
315 |
-
ɔ
|
316 |
-
ə
|
317 |
-
ɡ
|
318 |
-
ʰ
|
319 |
-
ˇ
|
320 |
-
ˈ
|
321 |
-
ˊ
|
322 |
-
ˋ
|
323 |
-
ˍ
|
324 |
-
ː
|
325 |
-
˙
|
326 |
-
˚
|
327 |
-
ˢ
|
328 |
-
α
|
329 |
-
β
|
330 |
-
γ
|
331 |
-
δ
|
332 |
-
ε
|
333 |
-
η
|
334 |
-
θ
|
335 |
-
ι
|
336 |
-
κ
|
337 |
-
λ
|
338 |
-
μ
|
339 |
-
ν
|
340 |
-
ο
|
341 |
-
π
|
342 |
-
ρ
|
343 |
-
ς
|
344 |
-
σ
|
345 |
-
τ
|
346 |
-
υ
|
347 |
-
φ
|
348 |
-
χ
|
349 |
-
ψ
|
350 |
-
ω
|
351 |
-
а
|
352 |
-
б
|
353 |
-
в
|
354 |
-
г
|
355 |
-
д
|
356 |
-
е
|
357 |
-
ж
|
358 |
-
з
|
359 |
-
и
|
360 |
-
к
|
361 |
-
л
|
362 |
-
м
|
363 |
-
н
|
364 |
-
о
|
365 |
-
п
|
366 |
-
р
|
367 |
-
с
|
368 |
-
т
|
369 |
-
у
|
370 |
-
ф
|
371 |
-
х
|
372 |
-
ц
|
373 |
-
ч
|
374 |
-
ш
|
375 |
-
ы
|
376 |
-
ь
|
377 |
-
я
|
378 |
-
і
|
379 |
-
ก
|
380 |
-
ง
|
381 |
-
น
|
382 |
-
ม
|
383 |
-
ย
|
384 |
-
ร
|
385 |
-
อ
|
386 |
-
า
|
387 |
-
เ
|
388 |
-
๑
|
389 |
-
་
|
390 |
-
ღ
|
391 |
-
ᵃ
|
392 |
-
ᵉ
|
393 |
-
ᵍ
|
394 |
-
ᵏ
|
395 |
-
ᵐ
|
396 |
-
ᵒ
|
397 |
-
ᵘ
|
398 |
-
‖
|
399 |
-
„
|
400 |
-
†
|
401 |
-
•
|
402 |
-
‥
|
403 |
-
‧
|
404 |
-
‰
|
405 |
-
′
|
406 |
-
″
|
407 |
-
‹
|
408 |
-
›
|
409 |
-
※
|
410 |
-
‿
|
411 |
-
⁄
|
412 |
-
ⁱ
|
413 |
-
⁺
|
414 |
-
ⁿ
|
415 |
-
₁
|
416 |
-
₂
|
417 |
-
₃
|
418 |
-
₄
|
419 |
-
€
|
420 |
-
℃
|
421 |
-
№
|
422 |
-
ⅰ
|
423 |
-
ⅱ
|
424 |
-
ⅲ
|
425 |
-
ⅳ
|
426 |
-
ⅴ
|
427 |
-
←
|
428 |
-
↑
|
429 |
-
→
|
430 |
-
↓
|
431 |
-
⇒
|
432 |
-
∀
|
433 |
-
−
|
434 |
-
∕
|
435 |
-
∙
|
436 |
-
√
|
437 |
-
∞
|
438 |
-
∟
|
439 |
-
∠
|
440 |
-
∣
|
441 |
-
∥
|
442 |
-
∩
|
443 |
-
∮
|
444 |
-
∶
|
445 |
-
∼
|
446 |
-
∽
|
447 |
-
≈
|
448 |
-
≒
|
449 |
-
≡
|
450 |
-
≤
|
451 |
-
≥
|
452 |
-
≦
|
453 |
-
≧
|
454 |
-
≪
|
455 |
-
≫
|
456 |
-
⊙
|
457 |
-
⋅
|
458 |
-
⋈
|
459 |
-
⋯
|
460 |
-
⌒
|
461 |
-
①
|
462 |
-
②
|
463 |
-
③
|
464 |
-
④
|
465 |
-
⑤
|
466 |
-
⑥
|
467 |
-
⑦
|
468 |
-
⑧
|
469 |
-
⑨
|
470 |
-
⑩
|
471 |
-
⑴
|
472 |
-
⑵
|
473 |
-
⑶
|
474 |
-
⑷
|
475 |
-
⑸
|
476 |
-
⒈
|
477 |
-
⒉
|
478 |
-
⒊
|
479 |
-
⒋
|
480 |
-
ⓒ
|
481 |
-
ⓔ
|
482 |
-
ⓘ
|
483 |
-
─
|
484 |
-
━
|
485 |
-
│
|
486 |
-
┃
|
487 |
-
┅
|
488 |
-
┆
|
489 |
-
┊
|
490 |
-
┌
|
491 |
-
└
|
492 |
-
├
|
493 |
-
┣
|
494 |
-
═
|
495 |
-
║
|
496 |
-
╚
|
497 |
-
╞
|
498 |
-
╠
|
499 |
-
╭
|
500 |
-
╮
|
501 |
-
╯
|
502 |
-
╰
|
503 |
-
╱
|
504 |
-
╳
|
505 |
-
▂
|
506 |
-
▃
|
507 |
-
▅
|
508 |
-
▇
|
509 |
-
█
|
510 |
-
▉
|
511 |
-
▋
|
512 |
-
▌
|
513 |
-
▍
|
514 |
-
▎
|
515 |
-
■
|
516 |
-
□
|
517 |
-
▬
|
518 |
-
▲
|
519 |
-
△
|
520 |
-
►
|
521 |
-
▼
|
522 |
-
▽
|
523 |
-
◆
|
524 |
-
◇
|
525 |
-
○
|
526 |
-
◎
|
527 |
-
●
|
528 |
-
◕
|
529 |
-
◠
|
530 |
-
◢
|
531 |
-
◤
|
532 |
-
★
|
533 |
-
☆
|
534 |
-
☞
|
535 |
-
☼
|
536 |
-
♡
|
537 |
-
♪
|
538 |
-
♫
|
539 |
-
♬
|
540 |
-
✕
|
541 |
-
✦
|
542 |
-
✪
|
543 |
-
✰
|
544 |
-
✿
|
545 |
-
❀
|
546 |
-
➜
|
547 |
-
➤
|
548 |
-
⦿
|
549 |
-
、
|
550 |
-
。
|
551 |
-
〃
|
552 |
-
々
|
553 |
-
〇
|
554 |
-
〈
|
555 |
-
〉
|
556 |
-
《
|
557 |
-
》
|
558 |
-
「
|
559 |
-
」
|
560 |
-
『
|
561 |
-
』
|
562 |
-
【
|
563 |
-
】
|
564 |
-
〓
|
565 |
-
〔
|
566 |
-
〕
|
567 |
-
〖
|
568 |
-
〗
|
569 |
-
〜
|
570 |
-
〝
|
571 |
-
〞
|
572 |
-
ㄧ
|
573 |
-
ㆍ
|
574 |
-
㈦
|
575 |
-
㊣
|
576 |
-
㎡
|
577 |
-
㗎
|
578 |
-
一
|
579 |
-
丁
|
580 |
-
七
|
581 |
-
万
|
582 |
-
丈
|
583 |
-
三
|
584 |
-
上
|
585 |
-
下
|
586 |
-
不
|
587 |
-
与
|
588 |
-
丐
|
589 |
-
丑
|
590 |
-
专
|
591 |
-
且
|
592 |
-
丕
|
593 |
-
世
|
594 |
-
丘
|
595 |
-
丙
|
596 |
-
业
|
597 |
-
丛
|
598 |
-
东
|
599 |
-
丝
|
600 |
-
丞
|
601 |
-
両
|
602 |
-
丢
|
603 |
-
两
|
604 |
-
严
|
605 |
-
丧
|
606 |
-
丨
|
607 |
-
个
|
608 |
-
丫
|
609 |
-
中
|
610 |
-
丰
|
611 |
-
串
|
612 |
-
临
|
613 |
-
丶
|
614 |
-
丸
|
615 |
-
丹
|
616 |
-
为
|
617 |
-
主
|
618 |
-
丼
|
619 |
-
丽
|
620 |
-
举
|
621 |
-
丿
|
622 |
-
乂
|
623 |
-
乃
|
624 |
-
久
|
625 |
-
么
|
626 |
-
义
|
627 |
-
之
|
628 |
-
乌
|
629 |
-
乍
|
630 |
-
乎
|
631 |
-
乏
|
632 |
-
乐
|
633 |
-
乒
|
634 |
-
乓
|
635 |
-
乔
|
636 |
-
乖
|
637 |
-
乗
|
638 |
-
乘
|
639 |
-
乙
|
640 |
-
乜
|
641 |
-
九
|
642 |
-
乞
|
643 |
-
也
|
644 |
-
习
|
645 |
-
乡
|
646 |
-
书
|
647 |
-
乩
|
648 |
-
买
|
649 |
-
乱
|
650 |
-
乳
|
651 |
-
亀
|
652 |
-
了
|
653 |
-
予
|
654 |
-
争
|
655 |
-
事
|
656 |
-
二
|
657 |
-
于
|
658 |
-
亏
|
659 |
-
云
|
660 |
-
互
|
661 |
-
五
|
662 |
-
井
|
663 |
-
亘
|
664 |
-
亚
|
665 |
-
些
|
666 |
-
亜
|
667 |
-
亟
|
668 |
-
亡
|
669 |
-
亢
|
670 |
-
交
|
671 |
-
亥
|
672 |
-
亦
|
673 |
-
产
|
674 |
-
亨
|
675 |
-
亩
|
676 |
-
享
|
677 |
-
京
|
678 |
-
亭
|
679 |
-
亮
|
680 |
-
亲
|
681 |
-
亳
|
682 |
-
亵
|
683 |
-
人
|
684 |
-
亿
|
685 |
-
什
|
686 |
-
仁
|
687 |
-
仃
|
688 |
-
仄
|
689 |
-
仅
|
690 |
-
仆
|
691 |
-
仇
|
692 |
-
今
|
693 |
-
介
|
694 |
-
仍
|
695 |
-
从
|
696 |
-
仏
|
697 |
-
仑
|
698 |
-
仓
|
699 |
-
仔
|
700 |
-
仕
|
701 |
-
他
|
702 |
-
仗
|
703 |
-
付
|
704 |
-
仙
|
705 |
-
仝
|
706 |
-
仞
|
707 |
-
仟
|
708 |
-
代
|
709 |
-
令
|
710 |
-
以
|
711 |
-
仨
|
712 |
-
仪
|
713 |
-
们
|
714 |
-
仮
|
715 |
-
仰
|
716 |
-
仲
|
717 |
-
件
|
718 |
-
价
|
719 |
-
任
|
720 |
-
份
|
721 |
-
仿
|
722 |
-
企
|
723 |
-
伉
|
724 |
-
伊
|
725 |
-
伍
|
726 |
-
伎
|
727 |
-
伏
|
728 |
-
伐
|
729 |
-
休
|
730 |
-
伕
|
731 |
-
众
|
732 |
-
优
|
733 |
-
伙
|
734 |
-
会
|
735 |
-
伝
|
736 |
-
伞
|
737 |
-
伟
|
738 |
-
传
|
739 |
-
伢
|
740 |
-
伤
|
741 |
-
伦
|
742 |
-
伪
|
743 |
-
伫
|
744 |
-
伯
|
745 |
-
估
|
746 |
-
伴
|
747 |
-
伶
|
748 |
-
伸
|
749 |
-
伺
|
750 |
-
似
|
751 |
-
伽
|
752 |
-
佃
|
753 |
-
但
|
754 |
-
位
|
755 |
-
低
|
756 |
-
住
|
757 |
-
佐
|
758 |
-
佑
|
759 |
-
体
|
760 |
-
佔
|
761 |
-
何
|
762 |
-
佗
|
763 |
-
佘
|
764 |
-
余
|
765 |
-
佚
|
766 |
-
佛
|
767 |
-
作
|
768 |
-
佝
|
769 |
-
佞
|
770 |
-
佟
|
771 |
-
你
|
772 |
-
佢
|
773 |
-
佣
|
774 |
-
佤
|
775 |
-
佥
|
776 |
-
佩
|
777 |
-
佬
|
778 |
-
佯
|
779 |
-
佰
|
780 |
-
佳
|
781 |
-
佶
|
782 |
-
佻
|
783 |
-
佼
|
784 |
-
使
|
785 |
-
侃
|
786 |
-
侄
|
787 |
-
侈
|
788 |
-
例
|
789 |
-
侍
|
790 |
-
侏
|
791 |
-
侑
|
792 |
-
侗
|
793 |
-
供
|
794 |
-
依
|
795 |
-
侠
|
796 |
-
価
|
797 |
-
侣
|
798 |
-
侥
|
799 |
-
侦
|
800 |
-
侧
|
801 |
-
侨
|
802 |
-
侬
|
803 |
-
侮
|
804 |
-
侯
|
805 |
-
侵
|
806 |
-
侷
|
807 |
-
便
|
808 |
-
促
|
809 |
-
俄
|
810 |
-
俊
|
811 |
-
俎
|
812 |
-
俏
|
813 |
-
俐
|
814 |
-
俑
|
815 |
-
俗
|
816 |
-
俘
|
817 |
-
俚
|
818 |
-
保
|
819 |
-
俞
|
820 |
-
俟
|
821 |
-
信
|
822 |
-
俨
|
823 |
-
俩
|
824 |
-
俪
|
825 |
-
俬
|
826 |
-
俭
|
827 |
-
修
|
828 |
-
俯
|
829 |
-
俱
|
830 |
-
俳
|
831 |
-
俸
|
832 |
-
俺
|
833 |
-
俾
|
834 |
-
倌
|
835 |
-
倍
|
836 |
-
倏
|
837 |
-
倒
|
838 |
-
倔
|
839 |
-
倖
|
840 |
-
倘
|
841 |
-
候
|
842 |
-
倚
|
843 |
-
倜
|
844 |
-
借
|
845 |
-
倡
|
846 |
-
値
|
847 |
-
倦
|
848 |
-
倩
|
849 |
-
倪
|
850 |
-
倬
|
851 |
-
倭
|
852 |
-
倶
|
853 |
-
债
|
854 |
-
值
|
855 |
-
倾
|
856 |
-
偃
|
857 |
-
假
|
858 |
-
偈
|
859 |
-
偌
|
860 |
-
偎
|
861 |
-
偏
|
862 |
-
偕
|
863 |
-
做
|
864 |
-
停
|
865 |
-
健
|
866 |
-
偶
|
867 |
-
偷
|
868 |
-
偻
|
869 |
-
偿
|
870 |
-
傀
|
871 |
-
傅
|
872 |
-
傍
|
873 |
-
傚
|
874 |
-
傣
|
875 |
-
傥
|
876 |
-
储
|
877 |
-
傩
|
878 |
-
催
|
879 |
-
傲
|
880 |
-
傻
|
881 |
-
働
|
882 |
-
像
|
883 |
-
僖
|
884 |
-
僚
|
885 |
-
僧
|
886 |
-
僭
|
887 |
-
僮
|
888 |
-
僱
|
889 |
-
僵
|
890 |
-
僻
|
891 |
-
儆
|
892 |
-
儋
|
893 |
-
儒
|
894 |
-
儡
|
895 |
-
儿
|
896 |
-
兀
|
897 |
-
允
|
898 |
-
元
|
899 |
-
兄
|
900 |
-
充
|
901 |
-
兆
|
902 |
-
先
|
903 |
-
光
|
904 |
-
克
|
905 |
-
免
|
906 |
-
児
|
907 |
-
兑
|
908 |
-
兔
|
909 |
-
兖
|
910 |
-
党
|
911 |
-
兜
|
912 |
-
兢
|
913 |
-
入
|
914 |
-
全
|
915 |
-
八
|
916 |
-
公
|
917 |
-
六
|
918 |
-
兮
|
919 |
-
兰
|
920 |
-
共
|
921 |
-
兲
|
922 |
-
关
|
923 |
-
兴
|
924 |
-
兵
|
925 |
-
其
|
926 |
-
具
|
927 |
-
典
|
928 |
-
兹
|
929 |
-
养
|
930 |
-
兼
|
931 |
-
兽
|
932 |
-
冀
|
933 |
-
内
|
934 |
-
円
|
935 |
-
冇
|
936 |
-
冈
|
937 |
-
冉
|
938 |
-
册
|
939 |
-
再
|
940 |
-
冏
|
941 |
-
��
|
942 |
-
冕
|
943 |
-
冗
|
944 |
-
写
|
945 |
-
军
|
946 |
-
农
|
947 |
-
冠
|
948 |
-
冢
|
949 |
-
冤
|
950 |
-
冥
|
951 |
-
冨
|
952 |
-
冬
|
953 |
-
冯
|
954 |
-
冰
|
955 |
-
冲
|
956 |
-
决
|
957 |
-
况
|
958 |
-
冶
|
959 |
-
冷
|
960 |
-
冻
|
961 |
-
冼
|
962 |
-
冽
|
963 |
-
冾
|
964 |
-
净
|
965 |
-
凄
|
966 |
-
准
|
967 |
-
凇
|
968 |
-
凉
|
969 |
-
凋
|
970 |
-
凌
|
971 |
-
减
|
972 |
-
凑
|
973 |
-
凛
|
974 |
-
凝
|
975 |
-
几
|
976 |
-
凡
|
977 |
-
凤
|
978 |
-
処
|
979 |
-
凪
|
980 |
-
凭
|
981 |
-
凯
|
982 |
-
凰
|
983 |
-
凳
|
984 |
-
凶
|
985 |
-
凸
|
986 |
-
凹
|
987 |
-
出
|
988 |
-
击
|
989 |
-
函
|
990 |
-
凿
|
991 |
-
刀
|
992 |
-
刁
|
993 |
-
刃
|
994 |
-
分
|
995 |
-
切
|
996 |
-
刈
|
997 |
-
刊
|
998 |
-
刍
|
999 |
-
刎
|
1000 |
-
刑
|
1001 |
-
划
|
1002 |
-
列
|
1003 |
-
刘
|
1004 |
-
则
|
1005 |
-
刚
|
1006 |
-
创
|
1007 |
-
初
|
1008 |
-
删
|
1009 |
-
判
|
1010 |
-
刨
|
1011 |
-
利
|
1012 |
-
别
|
1013 |
-
刮
|
1014 |
-
到
|
1015 |
-
制
|
1016 |
-
刷
|
1017 |
-
券
|
1018 |
-
刹
|
1019 |
-
刺
|
1020 |
-
刻
|
1021 |
-
刽
|
1022 |
-
剁
|
1023 |
-
剂
|
1024 |
-
剃
|
1025 |
-
剉
|
1026 |
-
削
|
1027 |
-
剌
|
1028 |
-
前
|
1029 |
-
剐
|
1030 |
-
剑
|
1031 |
-
剔
|
1032 |
-
剖
|
1033 |
-
剜
|
1034 |
-
剣
|
1035 |
-
剤
|
1036 |
-
剥
|
1037 |
-
剧
|
1038 |
-
剩
|
1039 |
-
剪
|
1040 |
-
副
|
1041 |
-
割
|
1042 |
-
剷
|
1043 |
-
剽
|
1044 |
-
剿
|
1045 |
-
劈
|
1046 |
-
力
|
1047 |
-
劝
|
1048 |
-
办
|
1049 |
-
功
|
1050 |
-
加
|
1051 |
-
务
|
1052 |
-
劣
|
1053 |
-
动
|
1054 |
-
助
|
1055 |
-
努
|
1056 |
-
劫
|
1057 |
-
劭
|
1058 |
-
励
|
1059 |
-
劲
|
1060 |
-
劳
|
1061 |
-
労
|
1062 |
-
劵
|
1063 |
-
効
|
1064 |
-
劾
|
1065 |
-
势
|
1066 |
-
勃
|
1067 |
-
勇
|
1068 |
-
勉
|
1069 |
-
勋
|
1070 |
-
勐
|
1071 |
-
勒
|
1072 |
-
勖
|
1073 |
-
勘
|
1074 |
-
募
|
1075 |
-
勤
|
1076 |
-
勧
|
1077 |
-
勳
|
1078 |
-
勺
|
1079 |
-
勾
|
1080 |
-
勿
|
1081 |
-
匀
|
1082 |
-
包
|
1083 |
-
匆
|
1084 |
-
匈
|
1085 |
-
匍
|
1086 |
-
匐
|
1087 |
-
匕
|
1088 |
-
化
|
1089 |
-
北
|
1090 |
-
匙
|
1091 |
-
匝
|
1092 |
-
匠
|
1093 |
-
匡
|
1094 |
-
匣
|
1095 |
-
匪
|
1096 |
-
匮
|
1097 |
-
匹
|
1098 |
-
区
|
1099 |
-
医
|
1100 |
-
匾
|
1101 |
-
匿
|
1102 |
-
十
|
1103 |
-
千
|
1104 |
-
卅
|
1105 |
-
升
|
1106 |
-
午
|
1107 |
-
卉
|
1108 |
-
半
|
1109 |
-
卍
|
1110 |
-
华
|
1111 |
-
协
|
1112 |
-
卑
|
1113 |
-
卒
|
1114 |
-
卓
|
1115 |
-
单
|
1116 |
-
卖
|
1117 |
-
南
|
1118 |
-
単
|
1119 |
-
博
|
1120 |
-
卜
|
1121 |
-
卞
|
1122 |
-
卟
|
1123 |
-
占
|
1124 |
-
卡
|
1125 |
-
卢
|
1126 |
-
卤
|
1127 |
-
卦
|
1128 |
-
卧
|
1129 |
-
卫
|
1130 |
-
卮
|
1131 |
-
卯
|
1132 |
-
印
|
1133 |
-
危
|
1134 |
-
即
|
1135 |
-
却
|
1136 |
-
卵
|
1137 |
-
卷
|
1138 |
-
卸
|
1139 |
-
卿
|
1140 |
-
厂
|
1141 |
-
厄
|
1142 |
-
厅
|
1143 |
-
历
|
1144 |
-
厉
|
1145 |
-
压
|
1146 |
-
厌
|
1147 |
-
厕
|
1148 |
-
厘
|
1149 |
-
厚
|
1150 |
-
厝
|
1151 |
-
原
|
1152 |
-
厢
|
1153 |
-
厥
|
1154 |
-
厦
|
1155 |
-
厨
|
1156 |
-
厩
|
1157 |
-
厮
|
1158 |
-
厳
|
1159 |
-
去
|
1160 |
-
县
|
1161 |
-
叁
|
1162 |
-
参
|
1163 |
-
又
|
1164 |
-
叉
|
1165 |
-
及
|
1166 |
-
友
|
1167 |
-
双
|
1168 |
-
反
|
1169 |
-
収
|
1170 |
-
发
|
1171 |
-
叔
|
1172 |
-
取
|
1173 |
-
受
|
1174 |
-
变
|
1175 |
-
叙
|
1176 |
-
叛
|
1177 |
-
叟
|
1178 |
-
叠
|
1179 |
-
叡
|
1180 |
-
口
|
1181 |
-
古
|
1182 |
-
句
|
1183 |
-
另
|
1184 |
-
叨
|
1185 |
-
叩
|
1186 |
-
只
|
1187 |
-
叫
|
1188 |
-
召
|
1189 |
-
叭
|
1190 |
-
叮
|
1191 |
-
可
|
1192 |
-
台
|
1193 |
-
叱
|
1194 |
-
史
|
1195 |
-
右
|
1196 |
-
叵
|
1197 |
-
叶
|
1198 |
-
号
|
1199 |
-
司
|
1200 |
-
叹
|
1201 |
-
叻
|
1202 |
-
叼
|
1203 |
-
叽
|
1204 |
-
吁
|
1205 |
-
吃
|
1206 |
-
各
|
1207 |
-
吆
|
1208 |
-
合
|
1209 |
-
吉
|
1210 |
-
吊
|
1211 |
-
吋
|
1212 |
-
同
|
1213 |
-
名
|
1214 |
-
后
|
1215 |
-
吏
|
1216 |
-
吐
|
1217 |
-
向
|
1218 |
-
吒
|
1219 |
-
吓
|
1220 |
-
吕
|
1221 |
-
吖
|
1222 |
-
吗
|
1223 |
-
君
|
1224 |
-
吝
|
1225 |
-
吞
|
1226 |
-
吟
|
1227 |
-
吠
|
1228 |
-
吡
|
1229 |
-
否
|
1230 |
-
吧
|
1231 |
-
吨
|
1232 |
-
吩
|
1233 |
-
含
|
1234 |
-
听
|
1235 |
-
吭
|
1236 |
-
吮
|
1237 |
-
启
|
1238 |
-
吱
|
1239 |
-
吴
|
1240 |
-
吵
|
1241 |
-
吸
|
1242 |
-
吹
|
1243 |
-
吻
|
1244 |
-
吼
|
1245 |
-
吽
|
1246 |
-
吾
|
1247 |
-
呀
|
1248 |
-
呃
|
1249 |
-
呆
|
1250 |
-
呈
|
1251 |
-
告
|
1252 |
-
呋
|
1253 |
-
呎
|
1254 |
-
呐
|
1255 |
-
呓
|
1256 |
-
呕
|
1257 |
-
呗
|
1258 |
-
员
|
1259 |
-
呛
|
1260 |
-
呜
|
1261 |
-
呢
|
1262 |
-
呤
|
1263 |
-
呦
|
1264 |
-
周
|
1265 |
-
呱
|
1266 |
-
呲
|
1267 |
-
味
|
1268 |
-
呵
|
1269 |
-
呷
|
1270 |
-
呸
|
1271 |
-
呻
|
1272 |
-
呼
|
1273 |
-
命
|
1274 |
-
咀
|
1275 |
-
咁
|
1276 |
-
咂
|
1277 |
-
咄
|
1278 |
-
咆
|
1279 |
-
咋
|
1280 |
-
和
|
1281 |
-
咎
|
1282 |
-
咏
|
1283 |
-
咐
|
1284 |
-
咒
|
1285 |
-
咔
|
1286 |
-
咕
|
1287 |
-
咖
|
1288 |
-
咗
|
1289 |
-
咘
|
1290 |
-
咙
|
1291 |
-
咚
|
1292 |
-
咛
|
1293 |
-
咣
|
1294 |
-
咤
|
1295 |
-
咦
|
1296 |
-
咧
|
1297 |
-
咨
|
1298 |
-
咩
|
1299 |
-
咪
|
1300 |
-
咫
|
1301 |
-
咬
|
1302 |
-
咭
|
1303 |
-
咯
|
1304 |
-
咱
|
1305 |
-
咲
|
1306 |
-
咳
|
1307 |
-
咸
|
1308 |
-
咻
|
1309 |
-
咽
|
1310 |
-
咿
|
1311 |
-
哀
|
1312 |
-
品
|
1313 |
-
哂
|
1314 |
-
哄
|
1315 |
-
哆
|
1316 |
-
哇
|
1317 |
-
哈
|
1318 |
-
哉
|
1319 |
-
哋
|
1320 |
-
哌
|
1321 |
-
响
|
1322 |
-
哎
|
1323 |
-
哏
|
1324 |
-
哐
|
1325 |
-
哑
|
1326 |
-
哒
|
1327 |
-
哔
|
1328 |
-
哗
|
1329 |
-
哟
|
1330 |
-
哥
|
1331 |
-
哦
|
1332 |
-
哧
|
1333 |
-
哨
|
1334 |
-
哩
|
1335 |
-
哪
|
1336 |
-
哭
|
1337 |
-
哮
|
1338 |
-
哲
|
1339 |
-
哺
|
1340 |
-
哼
|
1341 |
-
哽
|
1342 |
-
唁
|
1343 |
-
唆
|
1344 |
-
唇
|
1345 |
-
唉
|
1346 |
-
唏
|
1347 |
-
唐
|
1348 |
-
唑
|
1349 |
-
唔
|
1350 |
-
唠
|
1351 |
-
唤
|
1352 |
-
唧
|
1353 |
-
唬
|
1354 |
-
售
|
1355 |
-
唯
|
1356 |
-
唰
|
1357 |
-
唱
|
1358 |
-
唳
|
1359 |
-
唷
|
1360 |
-
唸
|
1361 |
-
唾
|
1362 |
-
啃
|
1363 |
-
啄
|
1364 |
-
商
|
1365 |
-
啉
|
1366 |
-
啊
|
1367 |
-
啕
|
1368 |
-
啖
|
1369 |
-
啜
|
1370 |
-
啡
|
1371 |
-
啤
|
1372 |
-
啥
|
1373 |
-
啦
|
1374 |
-
啧
|
1375 |
-
啪
|
1376 |
-
啫
|
1377 |
-
啬
|
1378 |
-
啮
|
1379 |
-
啰
|
1380 |
-
啱
|
1381 |
-
啲
|
1382 |
-
啵
|
1383 |
-
啶
|
1384 |
-
啷
|
1385 |
-
啸
|
1386 |
-
啻
|
1387 |
-
啼
|
1388 |
-
啾
|
1389 |
-
喀
|
1390 |
-
喂
|
1391 |
-
喃
|
1392 |
-
善
|
1393 |
-
喆
|
1394 |
-
喇
|
1395 |
-
喉
|
1396 |
-
喊
|
1397 |
-
喋
|
1398 |
-
喏
|
1399 |
-
喔
|
1400 |
-
喘
|
1401 |
-
喙
|
1402 |
-
喜
|
1403 |
-
喝
|
1404 |
-
喟
|
1405 |
-
喧
|
1406 |
-
喫
|
1407 |
-
喰
|
1408 |
-
喱
|
1409 |
-
喳
|
1410 |
-
喵
|
1411 |
-
営
|
1412 |
-
喷
|
1413 |
-
喹
|
1414 |
-
喺
|
1415 |
-
喻
|
1416 |
-
喽
|
1417 |
-
嗅
|
1418 |
-
嗑
|
1419 |
-
嗒
|
1420 |
-
嗓
|
1421 |
-
嗔
|
1422 |
-
嗖
|
1423 |
-
嗜
|
1424 |
-
嗝
|
1425 |
-
嗟
|
1426 |
-
嗡
|
1427 |
-
嗣
|
1428 |
-
嗤
|
1429 |
-
嗦
|
1430 |
-
嗨
|
1431 |
-
嗪
|
1432 |
-
嗬
|
1433 |
-
嗯
|
1434 |
-
嗰
|
1435 |
-
嗲
|
1436 |
-
嗳
|
1437 |
-
嗷
|
1438 |
-
嗽
|
1439 |
-
嘀
|
1440 |
-
嘅
|
1441 |
-
嘈
|
1442 |
-
嘉
|
1443 |
-
嘌
|
1444 |
-
嘎
|
1445 |
-
嘘
|
1446 |
-
嘚
|
1447 |
-
嘛
|
1448 |
-
嘞
|
1449 |
-
嘟
|
1450 |
-
嘢
|
1451 |
-
嘣
|
1452 |
-
嘤
|
1453 |
-
嘧
|
1454 |
-
嘭
|
1455 |
-
嘱
|
1456 |
-
嘲
|
1457 |
-
嘴
|
1458 |
-
嘶
|
1459 |
-
嘹
|
1460 |
-
嘻
|
1461 |
-
嘿
|
1462 |
-
噌
|
1463 |
-
噎
|
1464 |
-
噔
|
1465 |
-
噗
|
1466 |
-
噙
|
1467 |
-
噜
|
1468 |
-
噢
|
1469 |
-
噤
|
1470 |
-
器
|
1471 |
-
噩
|
1472 |
-
噪
|
1473 |
-
噬
|
1474 |
-
噱
|
1475 |
-
噶
|
1476 |
-
噻
|
1477 |
-
噼
|
1478 |
-
嚎
|
1479 |
-
嚏
|
1480 |
-
嚐
|
1481 |
-
嚓
|
1482 |
-
嚟
|
1483 |
-
嚣
|
1484 |
-
嚷
|
1485 |
-
嚼
|
1486 |
-
囉
|
1487 |
-
囊
|
1488 |
-
囍
|
1489 |
-
囔
|
1490 |
-
囗
|
1491 |
-
囚
|
1492 |
-
四
|
1493 |
-
囝
|
1494 |
-
回
|
1495 |
-
囟
|
1496 |
-
因
|
1497 |
-
囡
|
1498 |
-
团
|
1499 |
-
団
|
1500 |
-
囤
|
1501 |
-
囧
|
1502 |
-
囫
|
1503 |
-
园
|
1504 |
-
困
|
1505 |
-
囱
|
1506 |
-
囲
|
1507 |
-
図
|
1508 |
-
围
|
1509 |
-
囹
|
1510 |
-
固
|
1511 |
-
国
|
1512 |
-
图
|
1513 |
-
囿
|
1514 |
-
圃
|
1515 |
-
圄
|
1516 |
-
圆
|
1517 |
-
圈
|
1518 |
-
圏
|
1519 |
-
圜
|
1520 |
-
土
|
1521 |
-
圣
|
1522 |
-
圧
|
1523 |
-
在
|
1524 |
-
圩
|
1525 |
-
圭
|
1526 |
-
地
|
1527 |
-
圳
|
1528 |
-
场
|
1529 |
-
圻
|
1530 |
-
圾
|
1531 |
-
址
|
1532 |
-
坂
|
1533 |
-
均
|
1534 |
-
坊
|
1535 |
-
坍
|
1536 |
-
坎
|
1537 |
-
坏
|
1538 |
-
坐
|
1539 |
-
坑
|
1540 |
-
块
|
1541 |
-
坚
|
1542 |
-
坛
|
1543 |
-
坝
|
1544 |
-
坞
|
1545 |
-
坟
|
1546 |
-
坠
|
1547 |
-
坡
|
1548 |
-
坤
|
1549 |
-
坦
|
1550 |
-
坨
|
1551 |
-
坪
|
1552 |
-
坯
|
1553 |
-
坳
|
1554 |
-
坵
|
1555 |
-
坷
|
1556 |
-
垂
|
1557 |
-
垃
|
1558 |
-
垄
|
1559 |
-
型
|
1560 |
-
垒
|
1561 |
-
垚
|
1562 |
-
垛
|
1563 |
-
垠
|
1564 |
-
垢
|
1565 |
-
垣
|
1566 |
-
垦
|
1567 |
-
垩
|
1568 |
-
垫
|
1569 |
-
垭
|
1570 |
-
垮
|
1571 |
-
埂
|
1572 |
-
埃
|
1573 |
-
埋
|
1574 |
-
城
|
1575 |
-
埔
|
1576 |
-
埕
|
1577 |
-
埗
|
1578 |
-
域
|
1579 |
-
埠
|
1580 |
-
埤
|
1581 |
-
埵
|
1582 |
-
埸
|
1583 |
-
培
|
1584 |
-
基
|
1585 |
-
埼
|
1586 |
-
堀
|
1587 |
-
堂
|
1588 |
-
堃
|
1589 |
-
堆
|
1590 |
-
堇
|
1591 |
-
堑
|
1592 |
-
堕
|
1593 |
-
堙
|
1594 |
-
堡
|
1595 |
-
堤
|
1596 |
-
堪
|
1597 |
-
堰
|
1598 |
-
堵
|
1599 |
-
堺
|
1600 |
-
堿
|
1601 |
-
塌
|
1602 |
-
塑
|
1603 |
-
塔
|
1604 |
-
塘
|
1605 |
-
塞
|
1606 |
-
塩
|
1607 |
-
填
|
1608 |
-
塬
|
1609 |
-
塭
|
1610 |
-
塾
|
1611 |
-
墀
|
1612 |
-
境
|
1613 |
-
墅
|
1614 |
-
墉
|
1615 |
-
墒
|
1616 |
-
墓
|
1617 |
-
増
|
1618 |
-
墘
|
1619 |
-
墙
|
1620 |
-
增
|
1621 |
-
墟
|
1622 |
-
墨
|
1623 |
-
墩
|
1624 |
-
壁
|
1625 |
-
壅
|
1626 |
-
壆
|
1627 |
-
壊
|
1628 |
-
壑
|
1629 |
-
壕
|
1630 |
-
壤
|
1631 |
-
士
|
1632 |
-
壬
|
1633 |
-
壮
|
1634 |
-
声
|
1635 |
-
売
|
1636 |
-
壳
|
1637 |
-
壶
|
1638 |
-
壹
|
1639 |
-
处
|
1640 |
-
备
|
1641 |
-
変
|
1642 |
-
复
|
1643 |
-
夏
|
1644 |
-
夔
|
1645 |
-
夕
|
1646 |
-
外
|
1647 |
-
夙
|
1648 |
-
多
|
1649 |
-
夜
|
1650 |
-
够
|
1651 |
-
夥
|
1652 |
-
大
|
1653 |
-
天
|
1654 |
-
太
|
1655 |
-
夫
|
1656 |
-
夭
|
1657 |
-
央
|
1658 |
-
夯
|
1659 |
-
失
|
1660 |
-
头
|
1661 |
-
夷
|
1662 |
-
夸
|
1663 |
-
夹
|
1664 |
-
夺
|
1665 |
-
奂
|
1666 |
-
奄
|
1667 |
-
奇
|
1668 |
-
奈
|
1669 |
-
奉
|
1670 |
-
奋
|
1671 |
-
奎
|
1672 |
-
奏
|
1673 |
-
契
|
1674 |
-
奔
|
1675 |
-
奕
|
1676 |
-
奖
|
1677 |
-
套
|
1678 |
-
奘
|
1679 |
-
奚
|
1680 |
-
奠
|
1681 |
-
奢
|
1682 |
-
奥
|
1683 |
-
女
|
1684 |
-
奴
|
1685 |
-
奶
|
1686 |
-
奸
|
1687 |
-
她
|
1688 |
-
好
|
1689 |
-
如
|
1690 |
-
妃
|
1691 |
-
妄
|
1692 |
-
妆
|
1693 |
-
妇
|
1694 |
-
妈
|
1695 |
-
妊
|
1696 |
-
妍
|
1697 |
-
妒
|
1698 |
-
妓
|
1699 |
-
妖
|
1700 |
-
妘
|
1701 |
-
妙
|
1702 |
-
妞
|
1703 |
-
妣
|
1704 |
-
妤
|
1705 |
-
妥
|
1706 |
-
妨
|
1707 |
-
妩
|
1708 |
-
妪
|
1709 |
-
妮
|
1710 |
-
妲
|
1711 |
-
妳
|
1712 |
-
妹
|
1713 |
-
妻
|
1714 |
-
妾
|
1715 |
-
姆
|
1716 |
-
姉
|
1717 |
-
姊
|
1718 |
-
始
|
1719 |
-
姐
|
1720 |
-
姑
|
1721 |
-
姒
|
1722 |
-
姓
|
1723 |
-
委
|
1724 |
-
姗
|
1725 |
-
姚
|
1726 |
-
姜
|
1727 |
-
姝
|
1728 |
-
姣
|
1729 |
-
姥
|
1730 |
-
姨
|
1731 |
-
姪
|
1732 |
-
姫
|
1733 |
-
姬
|
1734 |
-
姹
|
1735 |
-
姻
|
1736 |
-
姿
|
1737 |
-
威
|
1738 |
-
娃
|
1739 |
-
娄
|
1740 |
-
娅
|
1741 |
-
娆
|
1742 |
-
娇
|
1743 |
-
娉
|
1744 |
-
娑
|
1745 |
-
娓
|
1746 |
-
娘
|
1747 |
-
娜
|
1748 |
-
娟
|
1749 |
-
娠
|
1750 |
-
娣
|
1751 |
-
娥
|
1752 |
-
娩
|
1753 |
-
娱
|
1754 |
-
娲
|
1755 |
-
娴
|
1756 |
-
娶
|
1757 |
-
娼
|
1758 |
-
婀
|
1759 |
-
婆
|
1760 |
-
婉
|
1761 |
-
婊
|
1762 |
-
婕
|
1763 |
-
婚
|
1764 |
-
婢
|
1765 |
-
婧
|
1766 |
-
婪
|
1767 |
-
婴
|
1768 |
-
婵
|
1769 |
-
婶
|
1770 |
-
婷
|
1771 |
-
婺
|
1772 |
-
婿
|
1773 |
-
媒
|
1774 |
-
媚
|
1775 |
-
媛
|
1776 |
-
媞
|
1777 |
-
媲
|
1778 |
-
媳
|
1779 |
-
媾
|
1780 |
-
嫁
|
1781 |
-
嫂
|
1782 |
-
嫉
|
1783 |
-
嫌
|
1784 |
-
嫑
|
1785 |
-
嫔
|
1786 |
-
嫖
|
1787 |
-
嫘
|
1788 |
-
嫚
|
1789 |
-
嫡
|
1790 |
-
嫣
|
1791 |
-
嫦
|
1792 |
-
嫩
|
1793 |
-
嫲
|
1794 |
-
嬅
|
1795 |
-
嬉
|
1796 |
-
嬗
|
1797 |
-
嬛
|
1798 |
-
嬢
|
1799 |
-
嬴
|
1800 |
-
嬷
|
1801 |
-
嬿
|
1802 |
-
孀
|
1803 |
-
孃
|
1804 |
-
子
|
1805 |
-
孑
|
1806 |
-
孔
|
1807 |
-
孕
|
1808 |
-
孖
|
1809 |
-
字
|
1810 |
-
存
|
1811 |
-
孙
|
1812 |
-
孚
|
1813 |
-
孛
|
1814 |
-
孜
|
1815 |
-
孝
|
1816 |
-
孟
|
1817 |
-
孢
|
1818 |
-
季
|
1819 |
-
孤
|
1820 |
-
学
|
1821 |
-
孩
|
1822 |
-
孪
|
1823 |
-
孬
|
1824 |
-
孰
|
1825 |
-
孱
|
1826 |
-
孳
|
1827 |
-
孵
|
1828 |
-
孺
|
1829 |
-
孽
|
1830 |
-
宁
|
1831 |
-
它
|
1832 |
-
宅
|
1833 |
-
宇
|
1834 |
-
守
|
1835 |
-
安
|
1836 |
-
宋
|
1837 |
-
完
|
1838 |
-
宏
|
1839 |
-
宓
|
1840 |
-
宕
|
1841 |
-
宗
|
1842 |
-
官
|
1843 |
-
宙
|
1844 |
-
定
|
1845 |
-
宛
|
1846 |
-
宜
|
1847 |
-
宝
|
1848 |
-
实
|
1849 |
-
実
|
1850 |
-
宠
|
1851 |
-
审
|
1852 |
-
客
|
1853 |
-
宣
|
1854 |
-
室
|
1855 |
-
宥
|
1856 |
-
宦
|
1857 |
-
宪
|
1858 |
-
宫
|
1859 |
-
宰
|
1860 |
-
害
|
1861 |
-
宴
|
1862 |
-
宵
|
1863 |
-
家
|
1864 |
-
宸
|
1865 |
-
容
|
1866 |
-
宽
|
1867 |
-
宾
|
1868 |
-
宿
|
1869 |
-
寂
|
1870 |
-
寄
|
1871 |
-
寅
|
1872 |
-
密
|
1873 |
-
寇
|
1874 |
-
富
|
1875 |
-
寐
|
1876 |
-
寒
|
1877 |
-
寓
|
1878 |
-
寛
|
1879 |
-
寝
|
1880 |
-
寞
|
1881 |
-
察
|
1882 |
-
寡
|
1883 |
-
寥
|
1884 |
-
寨
|
1885 |
-
寮
|
1886 |
-
寰
|
1887 |
-
寸
|
1888 |
-
对
|
1889 |
-
寺
|
1890 |
-
寻
|
1891 |
-
导
|
1892 |
-
対
|
1893 |
-
寿
|
1894 |
-
封
|
1895 |
-
専
|
1896 |
-
射
|
1897 |
-
将
|
1898 |
-
尉
|
1899 |
-
尊
|
1900 |
-
小
|
1901 |
-
少
|
1902 |
-
尔
|
1903 |
-
尕
|
1904 |
-
尖
|
1905 |
-
尘
|
1906 |
-
尚
|
1907 |
-
尝
|
1908 |
-
尤
|
1909 |
-
尧
|
1910 |
-
尬
|
1911 |
-
就
|
1912 |
-
尴
|
1913 |
-
尸
|
1914 |
-
尹
|
1915 |
-
尺
|
1916 |
-
尻
|
1917 |
-
尼
|
1918 |
-
尽
|
1919 |
-
尾
|
1920 |
-
尿
|
1921 |
-
局
|
1922 |
-
屁
|
1923 |
-
层
|
1924 |
-
屄
|
1925 |
-
居
|
1926 |
-
屈
|
1927 |
-
屉
|
1928 |
-
届
|
1929 |
-
屋
|
1930 |
-
屌
|
1931 |
-
屎
|
1932 |
-
屏
|
1933 |
-
屐
|
1934 |
-
屑
|
1935 |
-
展
|
1936 |
-
属
|
1937 |
-
屠
|
1938 |
-
屡
|
1939 |
-
履
|
1940 |
-
屯
|
1941 |
-
山
|
1942 |
-
屹
|
1943 |
-
屿
|
1944 |
-
岀
|
1945 |
-
岁
|
1946 |
-
岂
|
1947 |
-
岌
|
1948 |
-
岐
|
1949 |
-
岑
|
1950 |
-
岔
|
1951 |
-
岖
|
1952 |
-
岗
|
1953 |
-
岘
|
1954 |
-
岙
|
1955 |
-
岚
|
1956 |
-
岛
|
1957 |
-
岩
|
1958 |
-
岫
|
1959 |
-
岬
|
1960 |
-
岭
|
1961 |
-
岱
|
1962 |
-
岳
|
1963 |
-
岷
|
1964 |
-
岸
|
1965 |
-
��
|
1966 |
-
峋
|
1967 |
-
峒
|
1968 |
-
峙
|
1969 |
-
峡
|
1970 |
-
峤
|
1971 |
-
峥
|
1972 |
-
峦
|
1973 |
-
峨
|
1974 |
-
峪
|
1975 |
-
峭
|
1976 |
-
峯
|
1977 |
-
峰
|
1978 |
-
峻
|
1979 |
-
崁
|
1980 |
-
崂
|
1981 |
-
崆
|
1982 |
-
崇
|
1983 |
-
崎
|
1984 |
-
崑
|
1985 |
-
崔
|
1986 |
-
崖
|
1987 |
-
崙
|
1988 |
-
崛
|
1989 |
-
崧
|
1990 |
-
崩
|
1991 |
-
崭
|
1992 |
-
崴
|
1993 |
-
崽
|
1994 |
-
嵇
|
1995 |
-
嵊
|
1996 |
-
嵋
|
1997 |
-
嵌
|
1998 |
-
嵘
|
1999 |
-
嵩
|
2000 |
-
嵬
|
2001 |
-
嵯
|
2002 |
-
嶂
|
2003 |
-
嶋
|
2004 |
-
嶙
|
2005 |
-
巅
|
2006 |
-
巍
|
2007 |
-
巖
|
2008 |
-
川
|
2009 |
-
州
|
2010 |
-
巡
|
2011 |
-
巢
|
2012 |
-
工
|
2013 |
-
左
|
2014 |
-
巧
|
2015 |
-
巨
|
2016 |
-
巩
|
2017 |
-
巫
|
2018 |
-
差
|
2019 |
-
己
|
2020 |
-
已
|
2021 |
-
巳
|
2022 |
-
巴
|
2023 |
-
巷
|
2024 |
-
巻
|
2025 |
-
巽
|
2026 |
-
巾
|
2027 |
-
巿
|
2028 |
-
币
|
2029 |
-
市
|
2030 |
-
布
|
2031 |
-
帅
|
2032 |
-
帆
|
2033 |
-
师
|
2034 |
-
希
|
2035 |
-
帐
|
2036 |
-
帑
|
2037 |
-
帕
|
2038 |
-
帖
|
2039 |
-
帘
|
2040 |
-
帚
|
2041 |
-
帛
|
2042 |
-
帜
|
2043 |
-
帝
|
2044 |
-
带
|
2045 |
-
帧
|
2046 |
-
席
|
2047 |
-
帮
|
2048 |
-
帯
|
2049 |
-
帰
|
2050 |
-
帷
|
2051 |
-
常
|
2052 |
-
帼
|
2053 |
-
帽
|
2054 |
-
幂
|
2055 |
-
幄
|
2056 |
-
幅
|
2057 |
-
幌
|
2058 |
-
幔
|
2059 |
-
幕
|
2060 |
-
幡
|
2061 |
-
幢
|
2062 |
-
干
|
2063 |
-
平
|
2064 |
-
年
|
2065 |
-
并
|
2066 |
-
幸
|
2067 |
-
幻
|
2068 |
-
幼
|
2069 |
-
幽
|
2070 |
-
广
|
2071 |
-
庁
|
2072 |
-
広
|
2073 |
-
庄
|
2074 |
-
庆
|
2075 |
-
庇
|
2076 |
-
床
|
2077 |
-
序
|
2078 |
-
庐
|
2079 |
-
库
|
2080 |
-
应
|
2081 |
-
底
|
2082 |
-
庖
|
2083 |
-
店
|
2084 |
-
庙
|
2085 |
-
庚
|
2086 |
-
府
|
2087 |
-
庞
|
2088 |
-
废
|
2089 |
-
庠
|
2090 |
-
度
|
2091 |
-
座
|
2092 |
-
庭
|
2093 |
-
庵
|
2094 |
-
庶
|
2095 |
-
康
|
2096 |
-
庸
|
2097 |
-
庹
|
2098 |
-
庾
|
2099 |
-
廃
|
2100 |
-
廉
|
2101 |
-
廊
|
2102 |
-
廓
|
2103 |
-
廖
|
2104 |
-
延
|
2105 |
-
廷
|
2106 |
-
建
|
2107 |
-
廿
|
2108 |
-
开
|
2109 |
-
弁
|
2110 |
-
异
|
2111 |
-
弃
|
2112 |
-
弄
|
2113 |
-
弈
|
2114 |
-
弊
|
2115 |
-
弋
|
2116 |
-
式
|
2117 |
-
弑
|
2118 |
-
弓
|
2119 |
-
弔
|
2120 |
-
引
|
2121 |
-
弗
|
2122 |
-
弘
|
2123 |
-
弛
|
2124 |
-
弟
|
2125 |
-
张
|
2126 |
-
弥
|
2127 |
-
弦
|
2128 |
-
弧
|
2129 |
-
弩
|
2130 |
-
弭
|
2131 |
-
弯
|
2132 |
-
弱
|
2133 |
-
弹
|
2134 |
-
强
|
2135 |
-
弼
|
2136 |
-
弾
|
2137 |
-
彅
|
2138 |
-
归
|
2139 |
-
当
|
2140 |
-
录
|
2141 |
-
彗
|
2142 |
-
彝
|
2143 |
-
形
|
2144 |
-
彤
|
2145 |
-
彦
|
2146 |
-
彧
|
2147 |
-
彩
|
2148 |
-
彪
|
2149 |
-
彫
|
2150 |
-
彬
|
2151 |
-
彭
|
2152 |
-
彰
|
2153 |
-
影
|
2154 |
-
彷
|
2155 |
-
役
|
2156 |
-
彻
|
2157 |
-
彼
|
2158 |
-
彿
|
2159 |
-
往
|
2160 |
-
征
|
2161 |
-
径
|
2162 |
-
待
|
2163 |
-
徇
|
2164 |
-
很
|
2165 |
-
徉
|
2166 |
-
徊
|
2167 |
-
律
|
2168 |
-
徐
|
2169 |
-
徒
|
2170 |
-
従
|
2171 |
-
徕
|
2172 |
-
得
|
2173 |
-
徘
|
2174 |
-
徙
|
2175 |
-
徜
|
2176 |
-
御
|
2177 |
-
徨
|
2178 |
-
循
|
2179 |
-
徬
|
2180 |
-
微
|
2181 |
-
徳
|
2182 |
-
徴
|
2183 |
-
德
|
2184 |
-
徼
|
2185 |
-
徽
|
2186 |
-
心
|
2187 |
-
必
|
2188 |
-
忆
|
2189 |
-
忌
|
2190 |
-
忍
|
2191 |
-
忏
|
2192 |
-
忐
|
2193 |
-
忑
|
2194 |
-
忒
|
2195 |
-
忖
|
2196 |
-
志
|
2197 |
-
忘
|
2198 |
-
忙
|
2199 |
-
応
|
2200 |
-
忠
|
2201 |
-
忡
|
2202 |
-
忤
|
2203 |
-
忧
|
2204 |
-
忪
|
2205 |
-
快
|
2206 |
-
忱
|
2207 |
-
念
|
2208 |
-
忻
|
2209 |
-
忽
|
2210 |
-
忿
|
2211 |
-
怀
|
2212 |
-
态
|
2213 |
-
怂
|
2214 |
-
怅
|
2215 |
-
怆
|
2216 |
-
怎
|
2217 |
-
怏
|
2218 |
-
怒
|
2219 |
-
怔
|
2220 |
-
怕
|
2221 |
-
怖
|
2222 |
-
怙
|
2223 |
-
怜
|
2224 |
-
思
|
2225 |
-
怠
|
2226 |
-
怡
|
2227 |
-
急
|
2228 |
-
怦
|
2229 |
-
性
|
2230 |
-
怨
|
2231 |
-
怪
|
2232 |
-
怯
|
2233 |
-
怵
|
2234 |
-
总
|
2235 |
-
怼
|
2236 |
-
恁
|
2237 |
-
恃
|
2238 |
-
恋
|
2239 |
-
恍
|
2240 |
-
恐
|
2241 |
-
恒
|
2242 |
-
恕
|
2243 |
-
恙
|
2244 |
-
恚
|
2245 |
-
恢
|
2246 |
-
恣
|
2247 |
-
恤
|
2248 |
-
恨
|
2249 |
-
恩
|
2250 |
-
恪
|
2251 |
-
恫
|
2252 |
-
恬
|
2253 |
-
恭
|
2254 |
-
息
|
2255 |
-
恰
|
2256 |
-
恳
|
2257 |
-
恵
|
2258 |
-
恶
|
2259 |
-
恸
|
2260 |
-
恺
|
2261 |
-
恻
|
2262 |
-
恼
|
2263 |
-
恿
|
2264 |
-
悄
|
2265 |
-
悉
|
2266 |
-
悌
|
2267 |
-
悍
|
2268 |
-
悔
|
2269 |
-
悖
|
2270 |
-
悚
|
2271 |
-
悟
|
2272 |
-
悠
|
2273 |
-
患
|
2274 |
-
悦
|
2275 |
-
您
|
2276 |
-
悩
|
2277 |
-
悪
|
2278 |
-
悬
|
2279 |
-
悯
|
2280 |
-
悱
|
2281 |
-
悲
|
2282 |
-
悴
|
2283 |
-
悸
|
2284 |
-
悻
|
2285 |
-
悼
|
2286 |
-
悽
|
2287 |
-
情
|
2288 |
-
惆
|
2289 |
-
惇
|
2290 |
-
惊
|
2291 |
-
惋
|
2292 |
-
惑
|
2293 |
-
惕
|
2294 |
-
惘
|
2295 |
-
惚
|
2296 |
-
惜
|
2297 |
-
惟
|
2298 |
-
惠
|
2299 |
-
惦
|
2300 |
-
惧
|
2301 |
-
惨
|
2302 |
-
惩
|
2303 |
-
惫
|
2304 |
-
惬
|
2305 |
-
惭
|
2306 |
-
惮
|
2307 |
-
惯
|
2308 |
-
惰
|
2309 |
-
想
|
2310 |
-
惴
|
2311 |
-
惶
|
2312 |
-
惹
|
2313 |
-
惺
|
2314 |
-
愁
|
2315 |
-
愆
|
2316 |
-
愈
|
2317 |
-
愉
|
2318 |
-
愍
|
2319 |
-
意
|
2320 |
-
愕
|
2321 |
-
愚
|
2322 |
-
感
|
2323 |
-
愣
|
2324 |
-
愤
|
2325 |
-
愧
|
2326 |
-
愫
|
2327 |
-
愿
|
2328 |
-
慈
|
2329 |
-
慌
|
2330 |
-
慎
|
2331 |
-
慑
|
2332 |
-
慕
|
2333 |
-
慢
|
2334 |
-
慧
|
2335 |
-
慨
|
2336 |
-
慰
|
2337 |
-
慵
|
2338 |
-
慷
|
2339 |
-
慾
|
2340 |
-
憋
|
2341 |
-
憎
|
2342 |
-
憔
|
2343 |
-
憧
|
2344 |
-
憨
|
2345 |
-
憩
|
2346 |
-
憬
|
2347 |
-
憾
|
2348 |
-
懂
|
2349 |
-
懈
|
2350 |
-
懊
|
2351 |
-
懋
|
2352 |
-
懑
|
2353 |
-
懒
|
2354 |
-
懦
|
2355 |
-
懵
|
2356 |
-
懿
|
2357 |
-
戈
|
2358 |
-
戊
|
2359 |
-
戌
|
2360 |
-
戍
|
2361 |
-
戎
|
2362 |
-
戏
|
2363 |
-
成
|
2364 |
-
我
|
2365 |
-
戒
|
2366 |
-
戕
|
2367 |
-
或
|
2368 |
-
战
|
2369 |
-
戚
|
2370 |
-
戛
|
2371 |
-
戟
|
2372 |
-
戡
|
2373 |
-
戦
|
2374 |
-
截
|
2375 |
-
戬
|
2376 |
-
戮
|
2377 |
-
戳
|
2378 |
-
戴
|
2379 |
-
户
|
2380 |
-
戸
|
2381 |
-
戻
|
2382 |
-
戾
|
2383 |
-
房
|
2384 |
-
所
|
2385 |
-
扁
|
2386 |
-
扇
|
2387 |
-
扈
|
2388 |
-
扉
|
2389 |
-
手
|
2390 |
-
才
|
2391 |
-
扎
|
2392 |
-
扑
|
2393 |
-
扒
|
2394 |
-
打
|
2395 |
-
扔
|
2396 |
-
払
|
2397 |
-
托
|
2398 |
-
扛
|
2399 |
-
扣
|
2400 |
-
扦
|
2401 |
-
执
|
2402 |
-
扩
|
2403 |
-
扪
|
2404 |
-
扫
|
2405 |
-
扬
|
2406 |
-
扭
|
2407 |
-
扮
|
2408 |
-
扯
|
2409 |
-
扰
|
2410 |
-
扱
|
2411 |
-
扳
|
2412 |
-
扶
|
2413 |
-
批
|
2414 |
-
扼
|
2415 |
-
找
|
2416 |
-
承
|
2417 |
-
技
|
2418 |
-
抄
|
2419 |
-
抉
|
2420 |
-
把
|
2421 |
-
抑
|
2422 |
-
抒
|
2423 |
-
抓
|
2424 |
-
投
|
2425 |
-
抖
|
2426 |
-
抗
|
2427 |
-
折
|
2428 |
-
抚
|
2429 |
-
抛
|
2430 |
-
抜
|
2431 |
-
択
|
2432 |
-
抟
|
2433 |
-
抠
|
2434 |
-
抡
|
2435 |
-
抢
|
2436 |
-
护
|
2437 |
-
报
|
2438 |
-
抨
|
2439 |
-
披
|
2440 |
-
抬
|
2441 |
-
抱
|
2442 |
-
抵
|
2443 |
-
抹
|
2444 |
-
押
|
2445 |
-
抽
|
2446 |
-
抿
|
2447 |
-
拂
|
2448 |
-
拄
|
2449 |
-
担
|
2450 |
-
拆
|
2451 |
-
拇
|
2452 |
-
拈
|
2453 |
-
拉
|
2454 |
-
拌
|
2455 |
-
拍
|
2456 |
-
拎
|
2457 |
-
拐
|
2458 |
-
拒
|
2459 |
-
拓
|
2460 |
-
拔
|
2461 |
-
拖
|
2462 |
-
拗
|
2463 |
-
拘
|
2464 |
-
拙
|
2465 |
-
拚
|
2466 |
-
招
|
2467 |
-
拜
|
2468 |
-
拟
|
2469 |
-
拡
|
2470 |
-
拢
|
2471 |
-
拣
|
2472 |
-
拥
|
2473 |
-
拦
|
2474 |
-
拧
|
2475 |
-
拨
|
2476 |
-
择
|
2477 |
-
括
|
2478 |
-
拭
|
2479 |
-
拮
|
2480 |
-
拯
|
2481 |
-
拱
|
2482 |
-
拳
|
2483 |
-
拴
|
2484 |
-
拷
|
2485 |
-
拼
|
2486 |
-
拽
|
2487 |
-
拾
|
2488 |
-
拿
|
2489 |
-
持
|
2490 |
-
挂
|
2491 |
-
指
|
2492 |
-
挈
|
2493 |
-
按
|
2494 |
-
挎
|
2495 |
-
挑
|
2496 |
-
挖
|
2497 |
-
挙
|
2498 |
-
挚
|
2499 |
-
挛
|
2500 |
-
挝
|
2501 |
-
挞
|
2502 |
-
挟
|
2503 |
-
挠
|
2504 |
-
挡
|
2505 |
-
挣
|
2506 |
-
挤
|
2507 |
-
挥
|
2508 |
-
挨
|
2509 |
-
挪
|
2510 |
-
挫
|
2511 |
-
振
|
2512 |
-
挲
|
2513 |
-
挹
|
2514 |
-
挺
|
2515 |
-
挽
|
2516 |
-
捂
|
2517 |
-
捅
|
2518 |
-
捆
|
2519 |
-
捉
|
2520 |
-
捋
|
2521 |
-
捌
|
2522 |
-
捍
|
2523 |
-
捎
|
2524 |
-
捏
|
2525 |
-
捐
|
2526 |
-
捕
|
2527 |
-
捞
|
2528 |
-
损
|
2529 |
-
捡
|
2530 |
-
换
|
2531 |
-
捣
|
2532 |
-
捧
|
2533 |
-
捩
|
2534 |
-
据
|
2535 |
-
捱
|
2536 |
-
捲
|
2537 |
-
捶
|
2538 |
-
捷
|
2539 |
-
捺
|
2540 |
-
捻
|
2541 |
-
掀
|
2542 |
-
掂
|
2543 |
-
掇
|
2544 |
-
授
|
2545 |
-
掉
|
2546 |
-
掌
|
2547 |
-
掏
|
2548 |
-
掐
|
2549 |
-
排
|
2550 |
-
掖
|
2551 |
-
掘
|
2552 |
-
掠
|
2553 |
-
探
|
2554 |
-
掣
|
2555 |
-
接
|
2556 |
-
控
|
2557 |
-
推
|
2558 |
-
掩
|
2559 |
-
措
|
2560 |
-
掬
|
2561 |
-
掰
|
2562 |
-
掲
|
2563 |
-
掳
|
2564 |
-
掴
|
2565 |
-
掷
|
2566 |
-
掸
|
2567 |
-
掺
|
2568 |
-
揃
|
2569 |
-
揄
|
2570 |
-
揆
|
2571 |
-
揉
|
2572 |
-
揍
|
2573 |
-
描
|
2574 |
-
提
|
2575 |
-
插
|
2576 |
-
揖
|
2577 |
-
握
|
2578 |
-
揣
|
2579 |
-
揩
|
2580 |
-
揪
|
2581 |
-
揭
|
2582 |
-
援
|
2583 |
-
揶
|
2584 |
-
揸
|
2585 |
-
揹
|
2586 |
-
揽
|
2587 |
-
搀
|
2588 |
-
搁
|
2589 |
-
搂
|
2590 |
-
搅
|
2591 |
-
搏
|
2592 |
-
搐
|
2593 |
-
搓
|
2594 |
-
搔
|
2595 |
-
搜
|
2596 |
-
搞
|
2597 |
-
搡
|
2598 |
-
搪
|
2599 |
-
搬
|
2600 |
-
搭
|
2601 |
-
携
|
2602 |
-
搽
|
2603 |
-
摀
|
2604 |
-
摁
|
2605 |
-
摄
|
2606 |
-
摆
|
2607 |
-
摇
|
2608 |
-
摈
|
2609 |
-
摊
|
2610 |
-
摒
|
2611 |
-
摔
|
2612 |
-
摘
|
2613 |
-
摞
|
2614 |
-
摧
|
2615 |
-
摩
|
2616 |
-
摸
|
2617 |
-
摹
|
2618 |
-
撂
|
2619 |
-
撃
|
2620 |
-
撅
|
2621 |
-
撇
|
2622 |
-
撑
|
2623 |
-
撒
|
2624 |
-
撕
|
2625 |
-
撚
|
2626 |
-
撞
|
2627 |
-
撤
|
2628 |
-
撩
|
2629 |
-
撬
|
2630 |
-
播
|
2631 |
-
撮
|
2632 |
-
撰
|
2633 |
-
撵
|
2634 |
-
撷
|
2635 |
-
撸
|
2636 |
-
撼
|
2637 |
-
擀
|
2638 |
-
擂
|
2639 |
-
擅
|
2640 |
-
操
|
2641 |
-
擎
|
2642 |
-
擒
|
2643 |
-
擘
|
2644 |
-
擞
|
2645 |
-
擡
|
2646 |
-
擢
|
2647 |
-
擦
|
2648 |
-
攀
|
2649 |
-
攒
|
2650 |
-
攘
|
2651 |
-
攞
|
2652 |
-
攥
|
2653 |
-
攫
|
2654 |
-
支
|
2655 |
-
收
|
2656 |
-
攸
|
2657 |
-
改
|
2658 |
-
攻
|
2659 |
-
放
|
2660 |
-
政
|
2661 |
-
故
|
2662 |
-
效
|
2663 |
-
敌
|
2664 |
-
敍
|
2665 |
-
敎
|
2666 |
-
敏
|
2667 |
-
救
|
2668 |
-
敕
|
2669 |
-
敖
|
2670 |
-
教
|
2671 |
-
敛
|
2672 |
-
敝
|
2673 |
-
敞
|
2674 |
-
敢
|
2675 |
-
散
|
2676 |
-
敦
|
2677 |
-
敬
|
2678 |
-
数
|
2679 |
-
敲
|
2680 |
-
整
|
2681 |
-
敷
|
2682 |
-
文
|
2683 |
-
斋
|
2684 |
-
斌
|
2685 |
-
斎
|
2686 |
-
斐
|
2687 |
-
斑
|
2688 |
-
斓
|
2689 |
-
斗
|
2690 |
-
料
|
2691 |
-
斛
|
2692 |
-
斜
|
2693 |
-
斟
|
2694 |
-
斡
|
2695 |
-
斤
|
2696 |
-
斥
|
2697 |
-
斧
|
2698 |
-
斩
|
2699 |
-
斫
|
2700 |
-
断
|
2701 |
-
斯
|
2702 |
-
新
|
2703 |
-
方
|
2704 |
-
施
|
2705 |
-
旁
|
2706 |
-
旃
|
2707 |
-
旅
|
2708 |
-
旋
|
2709 |
-
旌
|
2710 |
-
旎
|
2711 |
-
族
|
2712 |
-
旖
|
2713 |
-
旗
|
2714 |
-
无
|
2715 |
-
既
|
2716 |
-
日
|
2717 |
-
旦
|
2718 |
-
旧
|
2719 |
-
旨
|
2720 |
-
早
|
2721 |
-
旬
|
2722 |
-
旭
|
2723 |
-
旮
|
2724 |
-
旱
|
2725 |
-
时
|
2726 |
-
旷
|
2727 |
-
旺
|
2728 |
-
旻
|
2729 |
-
昀
|
2730 |
-
昂
|
2731 |
-
昆
|
2732 |
-
昇
|
2733 |
-
昉
|
2734 |
-
昊
|
2735 |
-
昌
|
2736 |
-
明
|
2737 |
-
昏
|
2738 |
-
易
|
2739 |
-
昔
|
2740 |
-
昕
|
2741 |
-
昙
|
2742 |
-
星
|
2743 |
-
映
|
2744 |
-
春
|
2745 |
-
昧
|
2746 |
-
昨
|
2747 |
-
昭
|
2748 |
-
是
|
2749 |
-
昱
|
2750 |
-
昴
|
2751 |
-
昵
|
2752 |
-
昶
|
2753 |
-
昼
|
2754 |
-
显
|
2755 |
-
晁
|
2756 |
-
晃
|
2757 |
-
晋
|
2758 |
-
晌
|
2759 |
-
晏
|
2760 |
-
晒
|
2761 |
-
晓
|
2762 |
-
晔
|
2763 |
-
晕
|
2764 |
-
晖
|
2765 |
-
晗
|
2766 |
-
晚
|
2767 |
-
晞
|
2768 |
-
晟
|
2769 |
-
晤
|
2770 |
-
晦
|
2771 |
-
晨
|
2772 |
-
晩
|
2773 |
-
普
|
2774 |
-
景
|
2775 |
-
晰
|
2776 |
-
晴
|
2777 |
-
晶
|
2778 |
-
晷
|
2779 |
-
智
|
2780 |
-
晾
|
2781 |
-
暂
|
2782 |
-
暄
|
2783 |
-
暇
|
2784 |
-
暌
|
2785 |
-
暐
|
2786 |
-
暑
|
2787 |
-
暖
|
2788 |
-
暗
|
2789 |
-
暝
|
2790 |
-
暧
|
2791 |
-
暨
|
2792 |
-
暮
|
2793 |
-
暱
|
2794 |
-
暴
|
2795 |
-
暸
|
2796 |
-
暹
|
2797 |
-
曙
|
2798 |
-
曜
|
2799 |
-
曝
|
2800 |
-
曦
|
2801 |
-
曰
|
2802 |
-
曲
|
2803 |
-
曳
|
2804 |
-
更
|
2805 |
-
曹
|
2806 |
-
曼
|
2807 |
-
曾
|
2808 |
-
替
|
2809 |
-
最
|
2810 |
-
月
|
2811 |
-
有
|
2812 |
-
朋
|
2813 |
-
服
|
2814 |
-
朐
|
2815 |
-
朔
|
2816 |
-
朕
|
2817 |
-
朗
|
2818 |
-
望
|
2819 |
-
朝
|
2820 |
-
期
|
2821 |
-
朦
|
2822 |
-
木
|
2823 |
-
未
|
2824 |
-
末
|
2825 |
-
本
|
2826 |
-
札
|
2827 |
-
术
|
2828 |
-
朱
|
2829 |
-
朴
|
2830 |
-
朵
|
2831 |
-
机
|
2832 |
-
朽
|
2833 |
-
杀
|
2834 |
-
杂
|
2835 |
-
权
|
2836 |
-
杆
|
2837 |
-
杈
|
2838 |
-
杉
|
2839 |
-
李
|
2840 |
-
杏
|
2841 |
-
材
|
2842 |
-
村
|
2843 |
-
杓
|
2844 |
-
杖
|
2845 |
-
杜
|
2846 |
-
杞
|
2847 |
-
束
|
2848 |
-
杠
|
2849 |
-
条
|
2850 |
-
来
|
2851 |
-
杨
|
2852 |
-
杭
|
2853 |
-
杯
|
2854 |
-
杰
|
2855 |
-
杳
|
2856 |
-
杵
|
2857 |
-
杷
|
2858 |
-
杼
|
2859 |
-
松
|
2860 |
-
板
|
2861 |
-
极
|
2862 |
-
构
|
2863 |
-
枇
|
2864 |
-
枉
|
2865 |
-
枋
|
2866 |
-
析
|
2867 |
-
枕
|
2868 |
-
林
|
2869 |
-
枚
|
2870 |
-
果
|
2871 |
-
枝
|
2872 |
-
枢
|
2873 |
-
枣
|
2874 |
-
枪
|
2875 |
-
枫
|
2876 |
-
枭
|
2877 |
-
枯
|
2878 |
-
枰
|
2879 |
-
枱
|
2880 |
-
枳
|
2881 |
-
架
|
2882 |
-
枷
|
2883 |
-
枸
|
2884 |
-
柄
|
2885 |
-
柏
|
2886 |
-
某
|
2887 |
-
柑
|
2888 |
-
柒
|
2889 |
-
染
|
2890 |
-
柔
|
2891 |
-
柘
|
2892 |
-
柚
|
2893 |
-
柜
|
2894 |
-
柞
|
2895 |
-
柠
|
2896 |
-
柢
|
2897 |
-
查
|
2898 |
-
柩
|
2899 |
-
柬
|
2900 |
-
柯
|
2901 |
-
柱
|
2902 |
-
柳
|
2903 |
-
柴
|
2904 |
-
査
|
2905 |
-
柿
|
2906 |
-
栀
|
2907 |
-
栃
|
2908 |
-
栄
|
2909 |
-
栅
|
2910 |
-
标
|
2911 |
-
栈
|
2912 |
-
栉
|
2913 |
-
栋
|
2914 |
-
栎
|
2915 |
-
栏
|
2916 |
-
树
|
2917 |
-
栓
|
2918 |
-
栖
|
2919 |
-
栗
|
2920 |
-
校
|
2921 |
-
栩
|
2922 |
-
株
|
2923 |
-
样
|
2924 |
-
核
|
2925 |
-
根
|
2926 |
-
格
|
2927 |
-
栽
|
2928 |
-
栾
|
2929 |
-
桀
|
2930 |
-
桁
|
2931 |
-
桂
|
2932 |
-
桃
|
2933 |
-
桅
|
2934 |
-
框
|
2935 |
-
案
|
2936 |
-
桉
|
2937 |
-
桌
|
2938 |
-
桎
|
2939 |
-
桐
|
2940 |
-
桑
|
2941 |
-
桓
|
2942 |
-
桔
|
2943 |
-
桜
|
2944 |
-
桠
|
2945 |
-
桡
|
2946 |
-
桢
|
2947 |
-
档
|
2948 |
-
桥
|
2949 |
-
桦
|
2950 |
-
桧
|
2951 |
-
桨
|
2952 |
-
桩
|
2953 |
-
桶
|
2954 |
-
梁
|
2955 |
-
梅
|
2956 |
-
梆
|
2957 |
-
梏
|
2958 |
-
梓
|
2959 |
-
梗
|
2960 |
-
梢
|
2961 |
-
梦
|
2962 |
-
梧
|
2963 |
-
梨
|
2964 |
-
梭
|
2965 |
-
梯
|
2966 |
-
械
|
2967 |
-
梳
|
2968 |
-
梵
|
2969 |
-
梶
|
2970 |
-
检
|
2971 |
-
棂
|
2972 |
-
棉
|
2973 |
-
棋
|
2974 |
-
棍
|
2975 |
-
棒
|
2976 |
-
棕
|
2977 |
-
棘
|
2978 |
-
棚
|
2979 |
-
棠
|
2980 |
-
棣
|
2981 |
-
森
|
2982 |
-
棱
|
2983 |
-
棵
|
2984 |
-
棹
|
2985 |
-
棺
|
2986 |
-
椁
|
2987 |
-
椅
|
2988 |
-
椋
|
2989 |
-
��
|
2990 |
-
椎
|
2991 |
-
椒
|
2992 |
-
検
|
2993 |
-
椪
|
2994 |
-
椭
|
2995 |
-
椰
|
2996 |
-
椹
|
2997 |
-
椽
|
2998 |
-
椿
|
2999 |
-
楂
|
3000 |
-
楔
|
3001 |
-
楚
|
3002 |
-
楝
|
3003 |
-
楞
|
3004 |
-
楠
|
3005 |
-
楣
|
3006 |
-
楫
|
3007 |
-
楮
|
3008 |
-
楷
|
3009 |
-
楸
|
3010 |
-
楹
|
3011 |
-
楼
|
3012 |
-
楽
|
3013 |
-
概
|
3014 |
-
榄
|
3015 |
-
榆
|
3016 |
-
榈
|
3017 |
-
榉
|
3018 |
-
榔
|
3019 |
-
榕
|
3020 |
-
榖
|
3021 |
-
榛
|
3022 |
-
榜
|
3023 |
-
榨
|
3024 |
-
榫
|
3025 |
-
榭
|
3026 |
-
榱
|
3027 |
-
榴
|
3028 |
-
榷
|
3029 |
-
榻
|
3030 |
-
槁
|
3031 |
-
槃
|
3032 |
-
槌
|
3033 |
-
槎
|
3034 |
-
槐
|
3035 |
-
槓
|
3036 |
-
様
|
3037 |
-
槛
|
3038 |
-
槟
|
3039 |
-
槭
|
3040 |
-
槲
|
3041 |
-
槻
|
3042 |
-
槽
|
3043 |
-
槿
|
3044 |
-
樊
|
3045 |
-
樑
|
3046 |
-
樟
|
3047 |
-
模
|
3048 |
-
権
|
3049 |
-
横
|
3050 |
-
樫
|
3051 |
-
樯
|
3052 |
-
樱
|
3053 |
-
樵
|
3054 |
-
樽
|
3055 |
-
樾
|
3056 |
-
橄
|
3057 |
-
橇
|
3058 |
-
橐
|
3059 |
-
橘
|
3060 |
-
橙
|
3061 |
-
橡
|
3062 |
-
橱
|
3063 |
-
橹
|
3064 |
-
橼
|
3065 |
-
檀
|
3066 |
-
檄
|
3067 |
-
檎
|
3068 |
-
檐
|
3069 |
-
檗
|
3070 |
-
檬
|
3071 |
-
欠
|
3072 |
-
次
|
3073 |
-
欢
|
3074 |
-
欣
|
3075 |
-
欧
|
3076 |
-
欲
|
3077 |
-
欸
|
3078 |
-
欺
|
3079 |
-
款
|
3080 |
-
歆
|
3081 |
-
歇
|
3082 |
-
歉
|
3083 |
-
歌
|
3084 |
-
歎
|
3085 |
-
歓
|
3086 |
-
歙
|
3087 |
-
歛
|
3088 |
-
止
|
3089 |
-
正
|
3090 |
-
此
|
3091 |
-
步
|
3092 |
-
武
|
3093 |
-
歧
|
3094 |
-
歩
|
3095 |
-
歪
|
3096 |
-
歯
|
3097 |
-
歳
|
3098 |
-
歴
|
3099 |
-
歹
|
3100 |
-
死
|
3101 |
-
歼
|
3102 |
-
殁
|
3103 |
-
殃
|
3104 |
-
殆
|
3105 |
-
殇
|
3106 |
-
殉
|
3107 |
-
殊
|
3108 |
-
残
|
3109 |
-
殒
|
3110 |
-
殓
|
3111 |
-
殖
|
3112 |
-
殡
|
3113 |
-
殭
|
3114 |
-
殴
|
3115 |
-
段
|
3116 |
-
殷
|
3117 |
-
殿
|
3118 |
-
毁
|
3119 |
-
毂
|
3120 |
-
毅
|
3121 |
-
毋
|
3122 |
-
母
|
3123 |
-
毎
|
3124 |
-
每
|
3125 |
-
毒
|
3126 |
-
毓
|
3127 |
-
比
|
3128 |
-
毕
|
3129 |
-
毗
|
3130 |
-
毘
|
3131 |
-
毙
|
3132 |
-
毛
|
3133 |
-
毡
|
3134 |
-
毫
|
3135 |
-
毯
|
3136 |
-
毽
|
3137 |
-
氏
|
3138 |
-
氐
|
3139 |
-
民
|
3140 |
-
氓
|
3141 |
-
气
|
3142 |
-
氖
|
3143 |
-
気
|
3144 |
-
氙
|
3145 |
-
氛
|
3146 |
-
氟
|
3147 |
-
氡
|
3148 |
-
氢
|
3149 |
-
氤
|
3150 |
-
氦
|
3151 |
-
氧
|
3152 |
-
氨
|
3153 |
-
氪
|
3154 |
-
氮
|
3155 |
-
氯
|
3156 |
-
氰
|
3157 |
-
氲
|
3158 |
-
水
|
3159 |
-
氷
|
3160 |
-
永
|
3161 |
-
氹
|
3162 |
-
氾
|
3163 |
-
汀
|
3164 |
-
汁
|
3165 |
-
求
|
3166 |
-
汆
|
3167 |
-
汇
|
3168 |
-
汉
|
3169 |
-
汎
|
3170 |
-
汐
|
3171 |
-
汕
|
3172 |
-
汗
|
3173 |
-
汛
|
3174 |
-
汝
|
3175 |
-
汞
|
3176 |
-
江
|
3177 |
-
池
|
3178 |
-
污
|
3179 |
-
汤
|
3180 |
-
汨
|
3181 |
-
汩
|
3182 |
-
汪
|
3183 |
-
汰
|
3184 |
-
汲
|
3185 |
-
汴
|
3186 |
-
汶
|
3187 |
-
汹
|
3188 |
-
汽
|
3189 |
-
汾
|
3190 |
-
沁
|
3191 |
-
沂
|
3192 |
-
沃
|
3193 |
-
沅
|
3194 |
-
沈
|
3195 |
-
沉
|
3196 |
-
沌
|
3197 |
-
沏
|
3198 |
-
沐
|
3199 |
-
沓
|
3200 |
-
沙
|
3201 |
-
沛
|
3202 |
-
沟
|
3203 |
-
没
|
3204 |
-
沢
|
3205 |
-
沣
|
3206 |
-
沥
|
3207 |
-
沦
|
3208 |
-
沧
|
3209 |
-
沪
|
3210 |
-
沫
|
3211 |
-
沭
|
3212 |
-
沮
|
3213 |
-
沱
|
3214 |
-
河
|
3215 |
-
沸
|
3216 |
-
油
|
3217 |
-
治
|
3218 |
-
沼
|
3219 |
-
沽
|
3220 |
-
沾
|
3221 |
-
沿
|
3222 |
-
泄
|
3223 |
-
泉
|
3224 |
-
泊
|
3225 |
-
泌
|
3226 |
-
泓
|
3227 |
-
法
|
3228 |
-
泗
|
3229 |
-
泛
|
3230 |
-
泞
|
3231 |
-
泠
|
3232 |
-
泡
|
3233 |
-
波
|
3234 |
-
泣
|
3235 |
-
泥
|
3236 |
-
注
|
3237 |
-
泪
|
3238 |
-
泫
|
3239 |
-
泮
|
3240 |
-
泯
|
3241 |
-
泰
|
3242 |
-
泱
|
3243 |
-
泳
|
3244 |
-
泵
|
3245 |
-
泷
|
3246 |
-
泸
|
3247 |
-
泻
|
3248 |
-
泼
|
3249 |
-
泽
|
3250 |
-
泾
|
3251 |
-
洁
|
3252 |
-
洄
|
3253 |
-
洋
|
3254 |
-
洒
|
3255 |
-
洗
|
3256 |
-
洙
|
3257 |
-
洛
|
3258 |
-
洞
|
3259 |
-
津
|
3260 |
-
洩
|
3261 |
-
洪
|
3262 |
-
洮
|
3263 |
-
洱
|
3264 |
-
洲
|
3265 |
-
洵
|
3266 |
-
洸
|
3267 |
-
洹
|
3268 |
-
活
|
3269 |
-
洼
|
3270 |
-
洽
|
3271 |
-
派
|
3272 |
-
流
|
3273 |
-
浃
|
3274 |
-
浄
|
3275 |
-
浅
|
3276 |
-
浆
|
3277 |
-
浇
|
3278 |
-
浊
|
3279 |
-
测
|
3280 |
-
济
|
3281 |
-
浏
|
3282 |
-
浑
|
3283 |
-
浒
|
3284 |
-
浓
|
3285 |
-
浔
|
3286 |
-
浙
|
3287 |
-
浚
|
3288 |
-
浜
|
3289 |
-
浣
|
3290 |
-
浦
|
3291 |
-
浩
|
3292 |
-
浪
|
3293 |
-
浬
|
3294 |
-
浮
|
3295 |
-
浯
|
3296 |
-
浴
|
3297 |
-
海
|
3298 |
-
浸
|
3299 |
-
涂
|
3300 |
-
涅
|
3301 |
-
消
|
3302 |
-
涉
|
3303 |
-
涌
|
3304 |
-
涎
|
3305 |
-
涓
|
3306 |
-
涔
|
3307 |
-
涕
|
3308 |
-
涙
|
3309 |
-
涛
|
3310 |
-
涝
|
3311 |
-
涞
|
3312 |
-
涟
|
3313 |
-
涠
|
3314 |
-
涡
|
3315 |
-
涣
|
3316 |
-
涤
|
3317 |
-
润
|
3318 |
-
涧
|
3319 |
-
涨
|
3320 |
-
涩
|
3321 |
-
涪
|
3322 |
-
涮
|
3323 |
-
涯
|
3324 |
-
液
|
3325 |
-
涵
|
3326 |
-
涸
|
3327 |
-
涿
|
3328 |
-
淀
|
3329 |
-
淄
|
3330 |
-
淅
|
3331 |
-
淆
|
3332 |
-
淇
|
3333 |
-
淋
|
3334 |
-
淌
|
3335 |
-
淑
|
3336 |
-
淖
|
3337 |
-
淘
|
3338 |
-
淙
|
3339 |
-
淞
|
3340 |
-
淡
|
3341 |
-
淤
|
3342 |
-
淦
|
3343 |
-
淫
|
3344 |
-
淬
|
3345 |
-
淮
|
3346 |
-
深
|
3347 |
-
淳
|
3348 |
-
混
|
3349 |
-
淹
|
3350 |
-
添
|
3351 |
-
淼
|
3352 |
-
清
|
3353 |
-
済
|
3354 |
-
渉
|
3355 |
-
渊
|
3356 |
-
渋
|
3357 |
-
渍
|
3358 |
-
渎
|
3359 |
-
渐
|
3360 |
-
渔
|
3361 |
-
渗
|
3362 |
-
渚
|
3363 |
-
渝
|
3364 |
-
渠
|
3365 |
-
渡
|
3366 |
-
渣
|
3367 |
-
渤
|
3368 |
-
渥
|
3369 |
-
温
|
3370 |
-
渭
|
3371 |
-
港
|
3372 |
-
渲
|
3373 |
-
渴
|
3374 |
-
游
|
3375 |
-
渺
|
3376 |
-
湃
|
3377 |
-
湄
|
3378 |
-
湍
|
3379 |
-
湖
|
3380 |
-
湘
|
3381 |
-
湛
|
3382 |
-
湟
|
3383 |
-
湧
|
3384 |
-
湫
|
3385 |
-
湮
|
3386 |
-
湳
|
3387 |
-
湾
|
3388 |
-
湿
|
3389 |
-
満
|
3390 |
-
溃
|
3391 |
-
溅
|
3392 |
-
溉
|
3393 |
-
溏
|
3394 |
-
源
|
3395 |
-
溜
|
3396 |
-
溟
|
3397 |
-
溢
|
3398 |
-
溥
|
3399 |
-
溧
|
3400 |
-
溪
|
3401 |
-
溯
|
3402 |
-
溱
|
3403 |
-
溴
|
3404 |
-
溶
|
3405 |
-
溺
|
3406 |
-
溼
|
3407 |
-
滁
|
3408 |
-
滂
|
3409 |
-
滇
|
3410 |
-
滋
|
3411 |
-
滑
|
3412 |
-
滓
|
3413 |
-
滔
|
3414 |
-
滕
|
3415 |
-
滙
|
3416 |
-
滚
|
3417 |
-
滝
|
3418 |
-
滞
|
3419 |
-
滟
|
3420 |
-
满
|
3421 |
-
滢
|
3422 |
-
滤
|
3423 |
-
滥
|
3424 |
-
滦
|
3425 |
-
滨
|
3426 |
-
滩
|
3427 |
-
滴
|
3428 |
-
漂
|
3429 |
-
漆
|
3430 |
-
漉
|
3431 |
-
漏
|
3432 |
-
漓
|
3433 |
-
演
|
3434 |
-
漕
|
3435 |
-
漠
|
3436 |
-
漩
|
3437 |
-
漪
|
3438 |
-
漫
|
3439 |
-
漯
|
3440 |
-
漱
|
3441 |
-
漳
|
3442 |
-
漾
|
3443 |
-
潆
|
3444 |
-
潇
|
3445 |
-
潋
|
3446 |
-
潍
|
3447 |
-
潘
|
3448 |
-
潜
|
3449 |
-
潞
|
3450 |
-
潟
|
3451 |
-
潢
|
3452 |
-
潦
|
3453 |
-
潧
|
3454 |
-
潭
|
3455 |
-
潮
|
3456 |
-
潴
|
3457 |
-
潸
|
3458 |
-
潺
|
3459 |
-
潼
|
3460 |
-
澄
|
3461 |
-
澈
|
3462 |
-
澍
|
3463 |
-
澎
|
3464 |
-
澜
|
3465 |
-
澡
|
3466 |
-
澧
|
3467 |
-
澳
|
3468 |
-
澹
|
3469 |
-
激
|
3470 |
-
濂
|
3471 |
-
濑
|
3472 |
-
濒
|
3473 |
-
濠
|
3474 |
-
濡
|
3475 |
-
濬
|
3476 |
-
濮
|
3477 |
-
濯
|
3478 |
-
瀑
|
3479 |
-
瀚
|
3480 |
-
瀛
|
3481 |
-
瀞
|
3482 |
-
瀬
|
3483 |
-
灌
|
3484 |
-
灏
|
3485 |
-
灞
|
3486 |
-
火
|
3487 |
-
灬
|
3488 |
-
灭
|
3489 |
-
灯
|
3490 |
-
灰
|
3491 |
-
灵
|
3492 |
-
灶
|
3493 |
-
灸
|
3494 |
-
灼
|
3495 |
-
灾
|
3496 |
-
灿
|
3497 |
-
炀
|
3498 |
-
炁
|
3499 |
-
炅
|
3500 |
-
炉
|
3501 |
-
炊
|
3502 |
-
炎
|
3503 |
-
炒
|
3504 |
-
炔
|
3505 |
-
炕
|
3506 |
-
炖
|
3507 |
-
炙
|
3508 |
-
炜
|
3509 |
-
炫
|
3510 |
-
炬
|
3511 |
-
炭
|
3512 |
-
炮
|
3513 |
-
炯
|
3514 |
-
炳
|
3515 |
-
炷
|
3516 |
-
炸
|
3517 |
-
点
|
3518 |
-
炼
|
3519 |
-
炽
|
3520 |
-
烁
|
3521 |
-
烂
|
3522 |
-
烃
|
3523 |
-
烈
|
3524 |
-
烊
|
3525 |
-
烘
|
3526 |
-
烙
|
3527 |
-
烛
|
3528 |
-
烟
|
3529 |
-
烤
|
3530 |
-
烦
|
3531 |
-
烧
|
3532 |
-
烨
|
3533 |
-
烩
|
3534 |
-
烫
|
3535 |
-
烬
|
3536 |
-
热
|
3537 |
-
烯
|
3538 |
-
烷
|
3539 |
-
烹
|
3540 |
-
烽
|
3541 |
-
焉
|
3542 |
-
焊
|
3543 |
-
焕
|
3544 |
-
焖
|
3545 |
-
焗
|
3546 |
-
焘
|
3547 |
-
焙
|
3548 |
-
焚
|
3549 |
-
焜
|
3550 |
-
焦
|
3551 |
-
焯
|
3552 |
-
焰
|
3553 |
-
焱
|
3554 |
-
然
|
3555 |
-
焼
|
3556 |
-
煅
|
3557 |
-
煊
|
3558 |
-
煌
|
3559 |
-
煎
|
3560 |
-
煖
|
3561 |
-
煜
|
3562 |
-
煞
|
3563 |
-
煤
|
3564 |
-
煦
|
3565 |
-
照
|
3566 |
-
煨
|
3567 |
-
煮
|
3568 |
-
煲
|
3569 |
-
煸
|
3570 |
-
煽
|
3571 |
-
熄
|
3572 |
-
熊
|
3573 |
-
熏
|
3574 |
-
熔
|
3575 |
-
熙
|
3576 |
-
熟
|
3577 |
-
熠
|
3578 |
-
熨
|
3579 |
-
熬
|
3580 |
-
熵
|
3581 |
-
熹
|
3582 |
-
燃
|
3583 |
-
燄
|
3584 |
-
燊
|
3585 |
-
燎
|
3586 |
-
燔
|
3587 |
-
燕
|
3588 |
-
燥
|
3589 |
-
燧
|
3590 |
-
燮
|
3591 |
-
燻
|
3592 |
-
燿
|
3593 |
-
爆
|
3594 |
-
爪
|
3595 |
-
爬
|
3596 |
-
爰
|
3597 |
-
爱
|
3598 |
-
爵
|
3599 |
-
父
|
3600 |
-
爷
|
3601 |
-
爸
|
3602 |
-
爹
|
3603 |
-
爻
|
3604 |
-
爽
|
3605 |
-
片
|
3606 |
-
版
|
3607 |
-
牌
|
3608 |
-
牍
|
3609 |
-
牒
|
3610 |
-
牙
|
3611 |
-
牛
|
3612 |
-
牝
|
3613 |
-
牟
|
3614 |
-
牠
|
3615 |
-
牡
|
3616 |
-
牢
|
3617 |
-
牦
|
3618 |
-
牧
|
3619 |
-
物
|
3620 |
-
牯
|
3621 |
-
牲
|
3622 |
-
牴
|
3623 |
-
牵
|
3624 |
-
特
|
3625 |
-
牺
|
3626 |
-
犀
|
3627 |
-
犁
|
3628 |
-
犄
|
3629 |
-
犊
|
3630 |
-
犍
|
3631 |
-
犒
|
3632 |
-
犬
|
3633 |
-
犯
|
3634 |
-
状
|
3635 |
-
犷
|
3636 |
-
犸
|
3637 |
-
犹
|
3638 |
-
狂
|
3639 |
-
狄
|
3640 |
-
狈
|
3641 |
-
狎
|
3642 |
-
狐
|
3643 |
-
狒
|
3644 |
-
狗
|
3645 |
-
狙
|
3646 |
-
狞
|
3647 |
-
狠
|
3648 |
-
狡
|
3649 |
-
狩
|
3650 |
-
独
|
3651 |
-
狭
|
3652 |
-
狮
|
3653 |
-
狰
|
3654 |
-
狱
|
3655 |
-
狸
|
3656 |
-
狼
|
3657 |
-
猎
|
3658 |
-
猕
|
3659 |
-
猖
|
3660 |
-
猗
|
3661 |
-
猛
|
3662 |
-
猜
|
3663 |
-
猝
|
3664 |
-
猥
|
3665 |
-
猩
|
3666 |
-
猪
|
3667 |
-
猫
|
3668 |
-
猬
|
3669 |
-
献
|
3670 |
-
猴
|
3671 |
-
猷
|
3672 |
-
猾
|
3673 |
-
猿
|
3674 |
-
獐
|
3675 |
-
獒
|
3676 |
-
獗
|
3677 |
-
獠
|
3678 |
-
獣
|
3679 |
-
獭
|
3680 |
-
獾
|
3681 |
-
玄
|
3682 |
-
率
|
3683 |
-
玉
|
3684 |
-
王
|
3685 |
-
玑
|
3686 |
-
玖
|
3687 |
-
玛
|
3688 |
-
玟
|
3689 |
-
玠
|
3690 |
-
玥
|
3691 |
-
玩
|
3692 |
-
玫
|
3693 |
-
玮
|
3694 |
-
环
|
3695 |
-
现
|
3696 |
-
玲
|
3697 |
-
玳
|
3698 |
-
玷
|
3699 |
-
玺
|
3700 |
-
玻
|
3701 |
-
珀
|
3702 |
-
珂
|
3703 |
-
珅
|
3704 |
-
珈
|
3705 |
-
珉
|
3706 |
-
珊
|
3707 |
-
珍
|
3708 |
-
珏
|
3709 |
-
珐
|
3710 |
-
珑
|
3711 |
-
珙
|
3712 |
-
珞
|
3713 |
-
珠
|
3714 |
-
珣
|
3715 |
-
珥
|
3716 |
-
珩
|
3717 |
-
珪
|
3718 |
-
班
|
3719 |
-
珮
|
3720 |
-
珲
|
3721 |
-
珺
|
3722 |
-
球
|
3723 |
-
琅
|
3724 |
-
理
|
3725 |
-
琇
|
3726 |
-
琉
|
3727 |
-
琊
|
3728 |
-
琍
|
3729 |
-
琏
|
3730 |
-
琐
|
3731 |
-
琛
|
3732 |
-
琢
|
3733 |
-
琥
|
3734 |
-
琦
|
3735 |
-
琨
|
3736 |
-
琪
|
3737 |
-
琬
|
3738 |
-
琮
|
3739 |
-
琰
|
3740 |
-
琲
|
3741 |
-
琳
|
3742 |
-
琴
|
3743 |
-
琵
|
3744 |
-
琶
|
3745 |
-
琼
|
3746 |
-
瑀
|
3747 |
-
瑁
|
3748 |
-
瑄
|
3749 |
-
瑕
|
3750 |
-
瑗
|
3751 |
-
瑙
|
3752 |
-
瑚
|
3753 |
-
瑛
|
3754 |
-
瑜
|
3755 |
-
瑞
|
3756 |
-
瑟
|
3757 |
-
瑠
|
3758 |
-
瑯
|
3759 |
-
瑰
|
3760 |
-
瑶
|
3761 |
-
瑾
|
3762 |
-
璀
|
3763 |
-
璁
|
3764 |
-
璃
|
3765 |
-
璇
|
3766 |
-
璋
|
3767 |
-
璎
|
3768 |
-
璐
|
3769 |
-
璜
|
3770 |
-
璞
|
3771 |
-
璟
|
3772 |
-
璧
|
3773 |
-
璨
|
3774 |
-
璿
|
3775 |
-
瓒
|
3776 |
-
瓜
|
3777 |
-
瓢
|
3778 |
-
瓣
|
3779 |
-
瓤
|
3780 |
-
瓦
|
3781 |
-
瓮
|
3782 |
-
瓯
|
3783 |
-
瓴
|
3784 |
-
瓶
|
3785 |
-
瓷
|
3786 |
-
甄
|
3787 |
-
甕
|
3788 |
-
甘
|
3789 |
-
甙
|
3790 |
-
甚
|
3791 |
-
甜
|
3792 |
-
生
|
3793 |
-
甥
|
3794 |
-
甦
|
3795 |
-
用
|
3796 |
-
甩
|
3797 |
-
甫
|
3798 |
-
甬
|
3799 |
-
甭
|
3800 |
-
甯
|
3801 |
-
田
|
3802 |
-
由
|
3803 |
-
甲
|
3804 |
-
申
|
3805 |
-
电
|
3806 |
-
男
|
3807 |
-
甸
|
3808 |
-
町
|
3809 |
-
画
|
3810 |
-
甾
|
3811 |
-
畀
|
3812 |
-
畅
|
3813 |
-
界
|
3814 |
-
畏
|
3815 |
-
畑
|
3816 |
-
畔
|
3817 |
-
留
|
3818 |
-
畜
|
3819 |
-
略
|
3820 |
-
畦
|
3821 |
-
番
|
3822 |
-
畲
|
3823 |
-
畳
|
3824 |
-
畴
|
3825 |
-
畸
|
3826 |
-
畹
|
3827 |
-
畿
|
3828 |
-
疆
|
3829 |
-
疏
|
3830 |
-
疑
|
3831 |
-
疔
|
3832 |
-
疖
|
3833 |
-
疗
|
3834 |
-
疙
|
3835 |
-
疚
|
3836 |
-
疝
|
3837 |
-
疟
|
3838 |
-
疡
|
3839 |
-
疣
|
3840 |
-
疤
|
3841 |
-
疥
|
3842 |
-
疫
|
3843 |
-
疮
|
3844 |
-
疯
|
3845 |
-
疱
|
3846 |
-
疲
|
3847 |
-
疳
|
3848 |
-
疵
|
3849 |
-
疸
|
3850 |
-
疹
|
3851 |
-
疼
|
3852 |
-
疽
|
3853 |
-
疾
|
3854 |
-
痂
|
3855 |
-
病
|
3856 |
-
症
|
3857 |
-
痈
|
3858 |
-
痉
|
3859 |
-
痊
|
3860 |
-
痍
|
3861 |
-
痒
|
3862 |
-
痔
|
3863 |
-
痕
|
3864 |
-
痘
|
3865 |
-
痛
|
3866 |
-
痞
|
3867 |
-
痠
|
3868 |
-
痢
|
3869 |
-
痣
|
3870 |
-
痤
|
3871 |
-
痧
|
3872 |
-
痨
|
3873 |
-
痪
|
3874 |
-
痫
|
3875 |
-
痰
|
3876 |
-
痱
|
3877 |
-
痴
|
3878 |
-
痹
|
3879 |
-
痺
|
3880 |
-
痼
|
3881 |
-
痿
|
3882 |
-
瘀
|
3883 |
-
瘁
|
3884 |
-
瘘
|
3885 |
-
瘙
|
3886 |
-
瘟
|
3887 |
-
瘠
|
3888 |
-
瘢
|
3889 |
-
瘤
|
3890 |
-
瘦
|
3891 |
-
瘩
|
3892 |
-
瘪
|
3893 |
-
瘫
|
3894 |
-
瘴
|
3895 |
-
瘸
|
3896 |
-
瘾
|
3897 |
-
癌
|
3898 |
-
癒
|
3899 |
-
癖
|
3900 |
-
癜
|
3901 |
-
癞
|
3902 |
-
癡
|
3903 |
-
癣
|
3904 |
-
癫
|
3905 |
-
癸
|
3906 |
-
発
|
3907 |
-
登
|
3908 |
-
白
|
3909 |
-
百
|
3910 |
-
皂
|
3911 |
-
的
|
3912 |
-
皆
|
3913 |
-
皇
|
3914 |
-
皈
|
3915 |
-
皋
|
3916 |
-
皎
|
3917 |
-
皑
|
3918 |
-
皓
|
3919 |
-
皖
|
3920 |
-
皙
|
3921 |
-
皮
|
3922 |
-
皱
|
3923 |
-
皴
|
3924 |
-
皿
|
3925 |
-
盂
|
3926 |
-
盅
|
3927 |
-
盆
|
3928 |
-
盈
|
3929 |
-
益
|
3930 |
-
盎
|
3931 |
-
盏
|
3932 |
-
盐
|
3933 |
-
监
|
3934 |
-
盒
|
3935 |
-
盔
|
3936 |
-
盖
|
3937 |
-
盗
|
3938 |
-
盘
|
3939 |
-
盛
|
3940 |
-
盟
|
3941 |
-
盥
|
3942 |
-
目
|
3943 |
-
盯
|
3944 |
-
盱
|
3945 |
-
盲
|
3946 |
-
直
|
3947 |
-
相
|
3948 |
-
盹
|
3949 |
-
盼
|
3950 |
-
盾
|
3951 |
-
省
|
3952 |
-
眈
|
3953 |
-
眉
|
3954 |
-
看
|
3955 |
-
県
|
3956 |
-
眙
|
3957 |
-
眞
|
3958 |
-
真
|
3959 |
-
眠
|
3960 |
-
眦
|
3961 |
-
眨
|
3962 |
-
眩
|
3963 |
-
眯
|
3964 |
-
眶
|
3965 |
-
眷
|
3966 |
-
眸
|
3967 |
-
眺
|
3968 |
-
眼
|
3969 |
-
着
|
3970 |
-
睁
|
3971 |
-
睇
|
3972 |
-
睐
|
3973 |
-
睑
|
3974 |
-
睛
|
3975 |
-
睡
|
3976 |
-
睢
|
3977 |
-
督
|
3978 |
-
睥
|
3979 |
-
睦
|
3980 |
-
睨
|
3981 |
-
睪
|
3982 |
-
睫
|
3983 |
-
睬
|
3984 |
-
睹
|
3985 |
-
睽
|
3986 |
-
睾
|
3987 |
-
睿
|
3988 |
-
瞄
|
3989 |
-
瞅
|
3990 |
-
瞇
|
3991 |
-
瞋
|
3992 |
-
瞌
|
3993 |
-
瞎
|
3994 |
-
瞑
|
3995 |
-
瞒
|
3996 |
-
瞓
|
3997 |
-
瞟
|
3998 |
-
瞠
|
3999 |
-
瞥
|
4000 |
-
瞧
|
4001 |
-
瞩
|
4002 |
-
瞪
|
4003 |
-
瞬
|
4004 |
-
瞰
|
4005 |
-
瞳
|
4006 |
-
瞻
|
4007 |
-
瞿
|
4008 |
-
矍
|
4009 |
-
矗
|
4010 |
-
矛
|
4011 |
-
矜
|
4012 |
-
矢
|
4013 |
-
��
|
4014 |
-
知
|
4015 |
-
矩
|
4016 |
-
矫
|
4017 |
-
短
|
4018 |
-
矮
|
4019 |
-
石
|
4020 |
-
矶
|
4021 |
-
矽
|
4022 |
-
矾
|
4023 |
-
矿
|
4024 |
-
码
|
4025 |
-
砂
|
4026 |
-
砌
|
4027 |
-
砍
|
4028 |
-
砒
|
4029 |
-
研
|
4030 |
-
砖
|
4031 |
-
砗
|
4032 |
-
砚
|
4033 |
-
砝
|
4034 |
-
砣
|
4035 |
-
砥
|
4036 |
-
砧
|
4037 |
-
砭
|
4038 |
-
砰
|
4039 |
-
砲
|
4040 |
-
破
|
4041 |
-
砷
|
4042 |
-
砸
|
4043 |
-
砺
|
4044 |
-
砼
|
4045 |
-
砾
|
4046 |
-
础
|
4047 |
-
硅
|
4048 |
-
硐
|
4049 |
-
硒
|
4050 |
-
硕
|
4051 |
-
硝
|
4052 |
-
硫
|
4053 |
-
硬
|
4054 |
-
确
|
4055 |
-
硼
|
4056 |
-
碁
|
4057 |
-
碇
|
4058 |
-
碉
|
4059 |
-
碌
|
4060 |
-
碍
|
4061 |
-
碎
|
4062 |
-
碑
|
4063 |
-
碓
|
4064 |
-
碗
|
4065 |
-
碘
|
4066 |
-
碚
|
4067 |
-
碛
|
4068 |
-
碟
|
4069 |
-
碣
|
4070 |
-
碧
|
4071 |
-
碰
|
4072 |
-
碱
|
4073 |
-
碳
|
4074 |
-
碴
|
4075 |
-
碾
|
4076 |
-
磁
|
4077 |
-
磅
|
4078 |
-
磊
|
4079 |
-
磋
|
4080 |
-
磐
|
4081 |
-
磕
|
4082 |
-
磡
|
4083 |
-
磨
|
4084 |
-
磬
|
4085 |
-
磲
|
4086 |
-
磷
|
4087 |
-
磺
|
4088 |
-
礁
|
4089 |
-
礡
|
4090 |
-
礴
|
4091 |
-
示
|
4092 |
-
礼
|
4093 |
-
社
|
4094 |
-
祀
|
4095 |
-
祁
|
4096 |
-
祂
|
4097 |
-
祇
|
4098 |
-
祈
|
4099 |
-
祉
|
4100 |
-
祎
|
4101 |
-
祐
|
4102 |
-
祕
|
4103 |
-
祖
|
4104 |
-
祗
|
4105 |
-
祚
|
4106 |
-
祛
|
4107 |
-
祜
|
4108 |
-
祝
|
4109 |
-
神
|
4110 |
-
祟
|
4111 |
-
祠
|
4112 |
-
祢
|
4113 |
-
祥
|
4114 |
-
票
|
4115 |
-
祭
|
4116 |
-
祯
|
4117 |
-
祷
|
4118 |
-
祸
|
4119 |
-
祺
|
4120 |
-
禀
|
4121 |
-
禁
|
4122 |
-
禄
|
4123 |
-
禅
|
4124 |
-
福
|
4125 |
-
禛
|
4126 |
-
禧
|
4127 |
-
禹
|
4128 |
-
禺
|
4129 |
-
离
|
4130 |
-
禽
|
4131 |
-
禾
|
4132 |
-
秀
|
4133 |
-
私
|
4134 |
-
秃
|
4135 |
-
秆
|
4136 |
-
秉
|
4137 |
-
秋
|
4138 |
-
种
|
4139 |
-
科
|
4140 |
-
秒
|
4141 |
-
秘
|
4142 |
-
租
|
4143 |
-
秣
|
4144 |
-
秤
|
4145 |
-
秦
|
4146 |
-
秧
|
4147 |
-
秩
|
4148 |
-
秭
|
4149 |
-
积
|
4150 |
-
称
|
4151 |
-
秸
|
4152 |
-
移
|
4153 |
-
秽
|
4154 |
-
稀
|
4155 |
-
程
|
4156 |
-
稍
|
4157 |
-
税
|
4158 |
-
稔
|
4159 |
-
稗
|
4160 |
-
稚
|
4161 |
-
稜
|
4162 |
-
稞
|
4163 |
-
稠
|
4164 |
-
稣
|
4165 |
-
稲
|
4166 |
-
稳
|
4167 |
-
稷
|
4168 |
-
稹
|
4169 |
-
稻
|
4170 |
-
稼
|
4171 |
-
稽
|
4172 |
-
稿
|
4173 |
-
穂
|
4174 |
-
穆
|
4175 |
-
穗
|
4176 |
-
穴
|
4177 |
-
究
|
4178 |
-
穷
|
4179 |
-
穹
|
4180 |
-
空
|
4181 |
-
穿
|
4182 |
-
突
|
4183 |
-
窃
|
4184 |
-
窄
|
4185 |
-
窈
|
4186 |
-
窍
|
4187 |
-
窑
|
4188 |
-
窒
|
4189 |
-
窓
|
4190 |
-
窕
|
4191 |
-
窖
|
4192 |
-
窗
|
4193 |
-
窘
|
4194 |
-
窜
|
4195 |
-
窝
|
4196 |
-
窟
|
4197 |
-
窠
|
4198 |
-
窥
|
4199 |
-
窦
|
4200 |
-
窨
|
4201 |
-
窿
|
4202 |
-
立
|
4203 |
-
竖
|
4204 |
-
站
|
4205 |
-
竜
|
4206 |
-
竞
|
4207 |
-
竟
|
4208 |
-
章
|
4209 |
-
竣
|
4210 |
-
童
|
4211 |
-
竭
|
4212 |
-
端
|
4213 |
-
竹
|
4214 |
-
竺
|
4215 |
-
竽
|
4216 |
-
竿
|
4217 |
-
笃
|
4218 |
-
笆
|
4219 |
-
笈
|
4220 |
-
笋
|
4221 |
-
笏
|
4222 |
-
笑
|
4223 |
-
笔
|
4224 |
-
笙
|
4225 |
-
笛
|
4226 |
-
笞
|
4227 |
-
笠
|
4228 |
-
符
|
4229 |
-
笨
|
4230 |
-
第
|
4231 |
-
笹
|
4232 |
-
笺
|
4233 |
-
笼
|
4234 |
-
等
|
4235 |
-
筊
|
4236 |
-
筋
|
4237 |
-
筏
|
4238 |
-
筐
|
4239 |
-
筑
|
4240 |
-
筒
|
4241 |
-
答
|
4242 |
-
策
|
4243 |
-
筛
|
4244 |
-
筝
|
4245 |
-
筠
|
4246 |
-
筱
|
4247 |
-
筲
|
4248 |
-
筵
|
4249 |
-
筷
|
4250 |
-
筹
|
4251 |
-
签
|
4252 |
-
简
|
4253 |
-
箇
|
4254 |
-
箍
|
4255 |
-
箐
|
4256 |
-
箔
|
4257 |
-
箕
|
4258 |
-
算
|
4259 |
-
箝
|
4260 |
-
管
|
4261 |
-
箩
|
4262 |
-
箫
|
4263 |
-
箭
|
4264 |
-
箱
|
4265 |
-
箴
|
4266 |
-
箸
|
4267 |
-
篁
|
4268 |
-
篆
|
4269 |
-
篇
|
4270 |
-
篑
|
4271 |
-
篓
|
4272 |
-
篙
|
4273 |
-
篝
|
4274 |
-
篠
|
4275 |
-
篡
|
4276 |
-
篪
|
4277 |
-
篮
|
4278 |
-
篱
|
4279 |
-
篷
|
4280 |
-
簇
|
4281 |
-
簌
|
4282 |
-
簦
|
4283 |
-
簧
|
4284 |
-
簪
|
4285 |
-
簷
|
4286 |
-
簸
|
4287 |
-
簿
|
4288 |
-
籁
|
4289 |
-
籍
|
4290 |
-
籐
|
4291 |
-
籤
|
4292 |
-
米
|
4293 |
-
类
|
4294 |
-
籼
|
4295 |
-
籽
|
4296 |
-
粄
|
4297 |
-
粉
|
4298 |
-
粑
|
4299 |
-
粒
|
4300 |
-
粕
|
4301 |
-
粗
|
4302 |
-
粘
|
4303 |
-
粟
|
4304 |
-
粤
|
4305 |
-
粥
|
4306 |
-
粧
|
4307 |
-
粪
|
4308 |
-
粮
|
4309 |
-
粱
|
4310 |
-
粲
|
4311 |
-
粳
|
4312 |
-
粹
|
4313 |
-
粼
|
4314 |
-
粽
|
4315 |
-
精
|
4316 |
-
粿
|
4317 |
-
糅
|
4318 |
-
糊
|
4319 |
-
糍
|
4320 |
-
糕
|
4321 |
-
糖
|
4322 |
-
糗
|
4323 |
-
糙
|
4324 |
-
糜
|
4325 |
-
糟
|
4326 |
-
糠
|
4327 |
-
糬
|
4328 |
-
糯
|
4329 |
-
糸
|
4330 |
-
系
|
4331 |
-
紊
|
4332 |
-
素
|
4333 |
-
索
|
4334 |
-
紧
|
4335 |
-
紫
|
4336 |
-
紮
|
4337 |
-
累
|
4338 |
-
絃
|
4339 |
-
経
|
4340 |
-
絮
|
4341 |
-
絵
|
4342 |
-
綑
|
4343 |
-
継
|
4344 |
-
続
|
4345 |
-
綦
|
4346 |
-
総
|
4347 |
-
縁
|
4348 |
-
縄
|
4349 |
-
繁
|
4350 |
-
繇
|
4351 |
-
繋
|
4352 |
-
纂
|
4353 |
-
纠
|
4354 |
-
红
|
4355 |
-
纣
|
4356 |
-
纤
|
4357 |
-
约
|
4358 |
-
级
|
4359 |
-
纨
|
4360 |
-
纪
|
4361 |
-
纫
|
4362 |
-
纬
|
4363 |
-
纭
|
4364 |
-
纯
|
4365 |
-
纰
|
4366 |
-
纱
|
4367 |
-
纲
|
4368 |
-
纳
|
4369 |
-
纵
|
4370 |
-
纶
|
4371 |
-
纷
|
4372 |
-
纸
|
4373 |
-
纹
|
4374 |
-
纺
|
4375 |
-
纽
|
4376 |
-
纾
|
4377 |
-
线
|
4378 |
-
绀
|
4379 |
-
练
|
4380 |
-
组
|
4381 |
-
绅
|
4382 |
-
细
|
4383 |
-
织
|
4384 |
-
终
|
4385 |
-
绊
|
4386 |
-
绍
|
4387 |
-
绎
|
4388 |
-
经
|
4389 |
-
绑
|
4390 |
-
绒
|
4391 |
-
结
|
4392 |
-
绔
|
4393 |
-
绕
|
4394 |
-
绘
|
4395 |
-
给
|
4396 |
-
绚
|
4397 |
-
绛
|
4398 |
-
络
|
4399 |
-
绝
|
4400 |
-
绞
|
4401 |
-
统
|
4402 |
-
绡
|
4403 |
-
绢
|
4404 |
-
绣
|
4405 |
-
绥
|
4406 |
-
绦
|
4407 |
-
继
|
4408 |
-
绩
|
4409 |
-
绪
|
4410 |
-
绫
|
4411 |
-
续
|
4412 |
-
绮
|
4413 |
-
绯
|
4414 |
-
绰
|
4415 |
-
绳
|
4416 |
-
维
|
4417 |
-
绵
|
4418 |
-
绶
|
4419 |
-
绷
|
4420 |
-
绸
|
4421 |
-
绻
|
4422 |
-
综
|
4423 |
-
绽
|
4424 |
-
绾
|
4425 |
-
绿
|
4426 |
-
缀
|
4427 |
-
缄
|
4428 |
-
缅
|
4429 |
-
缆
|
4430 |
-
缇
|
4431 |
-
缈
|
4432 |
-
缉
|
4433 |
-
缎
|
4434 |
-
缓
|
4435 |
-
缔
|
4436 |
-
缕
|
4437 |
-
编
|
4438 |
-
缘
|
4439 |
-
缙
|
4440 |
-
缚
|
4441 |
-
缜
|
4442 |
-
缝
|
4443 |
-
缠
|
4444 |
-
缢
|
4445 |
-
缤
|
4446 |
-
缥
|
4447 |
-
缨
|
4448 |
-
缩
|
4449 |
-
缪
|
4450 |
-
缭
|
4451 |
-
缮
|
4452 |
-
缰
|
4453 |
-
缱
|
4454 |
-
缴
|
4455 |
-
缸
|
4456 |
-
缺
|
4457 |
-
罂
|
4458 |
-
罄
|
4459 |
-
罐
|
4460 |
-
网
|
4461 |
-
罔
|
4462 |
-
罕
|
4463 |
-
罗
|
4464 |
-
罚
|
4465 |
-
罡
|
4466 |
-
罢
|
4467 |
-
罩
|
4468 |
-
罪
|
4469 |
-
置
|
4470 |
-
署
|
4471 |
-
罹
|
4472 |
-
羁
|
4473 |
-
羊
|
4474 |
-
羌
|
4475 |
-
美
|
4476 |
-
羔
|
4477 |
-
羚
|
4478 |
-
羞
|
4479 |
-
羟
|
4480 |
-
羡
|
4481 |
-
羣
|
4482 |
-
群
|
4483 |
-
羧
|
4484 |
-
羨
|
4485 |
-
羯
|
4486 |
-
羲
|
4487 |
-
羸
|
4488 |
-
羹
|
4489 |
-
羽
|
4490 |
-
羿
|
4491 |
-
翁
|
4492 |
-
翅
|
4493 |
-
翊
|
4494 |
-
翌
|
4495 |
-
翎
|
4496 |
-
翔
|
4497 |
-
翘
|
4498 |
-
翟
|
4499 |
-
翠
|
4500 |
-
翡
|
4501 |
-
翦
|
4502 |
-
翩
|
4503 |
-
翰
|
4504 |
-
翱
|
4505 |
-
翳
|
4506 |
-
翻
|
4507 |
-
翼
|
4508 |
-
耀
|
4509 |
-
老
|
4510 |
-
考
|
4511 |
-
耄
|
4512 |
-
者
|
4513 |
-
耆
|
4514 |
-
耋
|
4515 |
-
而
|
4516 |
-
耍
|
4517 |
-
耐
|
4518 |
-
耒
|
4519 |
-
耕
|
4520 |
-
耗
|
4521 |
-
耘
|
4522 |
-
耙
|
4523 |
-
耦
|
4524 |
-
耨
|
4525 |
-
耳
|
4526 |
-
耶
|
4527 |
-
耷
|
4528 |
-
耸
|
4529 |
-
耻
|
4530 |
-
耽
|
4531 |
-
耿
|
4532 |
-
聂
|
4533 |
-
聆
|
4534 |
-
聊
|
4535 |
-
聋
|
4536 |
-
职
|
4537 |
-
聒
|
4538 |
-
联
|
4539 |
-
聘
|
4540 |
-
聚
|
4541 |
-
聪
|
4542 |
-
聴
|
4543 |
-
聿
|
4544 |
-
肃
|
4545 |
-
肄
|
4546 |
-
肆
|
4547 |
-
肇
|
4548 |
-
肉
|
4549 |
-
肋
|
4550 |
-
肌
|
4551 |
-
肏
|
4552 |
-
肓
|
4553 |
-
肖
|
4554 |
-
肘
|
4555 |
-
肚
|
4556 |
-
肛
|
4557 |
-
肝
|
4558 |
-
肠
|
4559 |
-
股
|
4560 |
-
肢
|
4561 |
-
肤
|
4562 |
-
肥
|
4563 |
-
肩
|
4564 |
-
肪
|
4565 |
-
肮
|
4566 |
-
肯
|
4567 |
-
肱
|
4568 |
-
育
|
4569 |
-
肴
|
4570 |
-
肺
|
4571 |
-
肽
|
4572 |
-
肾
|
4573 |
-
肿
|
4574 |
-
胀
|
4575 |
-
胁
|
4576 |
-
胃
|
4577 |
-
胄
|
4578 |
-
胆
|
4579 |
-
背
|
4580 |
-
胍
|
4581 |
-
胎
|
4582 |
-
胖
|
4583 |
-
胚
|
4584 |
-
胛
|
4585 |
-
胜
|
4586 |
-
胝
|
4587 |
-
胞
|
4588 |
-
胡
|
4589 |
-
胤
|
4590 |
-
胥
|
4591 |
-
胧
|
4592 |
-
胫
|
4593 |
-
胭
|
4594 |
-
胯
|
4595 |
-
胰
|
4596 |
-
胱
|
4597 |
-
胳
|
4598 |
-
胴
|
4599 |
-
胶
|
4600 |
-
胸
|
4601 |
-
胺
|
4602 |
-
能
|
4603 |
-
脂
|
4604 |
-
脆
|
4605 |
-
脇
|
4606 |
-
脉
|
4607 |
-
脊
|
4608 |
-
脍
|
4609 |
-
脏
|
4610 |
-
脐
|
4611 |
-
脑
|
4612 |
-
脓
|
4613 |
-
脖
|
4614 |
-
脘
|
4615 |
-
脚
|
4616 |
-
脣
|
4617 |
-
脩
|
4618 |
-
脯
|
4619 |
-
脱
|
4620 |
-
脲
|
4621 |
-
脳
|
4622 |
-
脸
|
4623 |
-
脾
|
4624 |
-
腆
|
4625 |
-
腈
|
4626 |
-
腊
|
4627 |
-
腋
|
4628 |
-
腌
|
4629 |
-
腐
|
4630 |
-
腑
|
4631 |
-
腓
|
4632 |
-
腔
|
4633 |
-
腕
|
4634 |
-
腥
|
4635 |
-
腩
|
4636 |
-
腭
|
4637 |
-
腮
|
4638 |
-
腰
|
4639 |
-
腱
|
4640 |
-
腴
|
4641 |
-
腹
|
4642 |
-
腺
|
4643 |
-
腻
|
4644 |
-
腼
|
4645 |
-
腾
|
4646 |
-
腿
|
4647 |
-
膀
|
4648 |
-
膈
|
4649 |
-
膊
|
4650 |
-
膏
|
4651 |
-
膑
|
4652 |
-
膘
|
4653 |
-
膛
|
4654 |
-
膜
|
4655 |
-
膝
|
4656 |
-
膦
|
4657 |
-
膨
|
4658 |
-
膳
|
4659 |
-
膺
|
4660 |
-
膻
|
4661 |
-
臀
|
4662 |
-
臂
|
4663 |
-
臃
|
4664 |
-
臆
|
4665 |
-
臊
|
4666 |
-
臓
|
4667 |
-
臣
|
4668 |
-
臧
|
4669 |
-
自
|
4670 |
-
臬
|
4671 |
-
臭
|
4672 |
-
至
|
4673 |
-
致
|
4674 |
-
臻
|
4675 |
-
臼
|
4676 |
-
臾
|
4677 |
-
舀
|
4678 |
-
舂
|
4679 |
-
舅
|
4680 |
-
舆
|
4681 |
-
舌
|
4682 |
-
舍
|
4683 |
-
舎
|
4684 |
-
舐
|
4685 |
-
舒
|
4686 |
-
舔
|
4687 |
-
舖
|
4688 |
-
舗
|
4689 |
-
舛
|
4690 |
-
舜
|
4691 |
-
舞
|
4692 |
-
舟
|
4693 |
-
航
|
4694 |
-
舫
|
4695 |
-
般
|
4696 |
-
舰
|
4697 |
-
舱
|
4698 |
-
舵
|
4699 |
-
舶
|
4700 |
-
舷
|
4701 |
-
舸
|
4702 |
-
船
|
4703 |
-
舺
|
4704 |
-
舾
|
4705 |
-
艇
|
4706 |
-
艋
|
4707 |
-
艘
|
4708 |
-
艮
|
4709 |
-
良
|
4710 |
-
艰
|
4711 |
-
色
|
4712 |
-
艳
|
4713 |
-
艹
|
4714 |
-
艺
|
4715 |
-
艾
|
4716 |
-
节
|
4717 |
-
芃
|
4718 |
-
芈
|
4719 |
-
芊
|
4720 |
-
芋
|
4721 |
-
芍
|
4722 |
-
芎
|
4723 |
-
芒
|
4724 |
-
芙
|
4725 |
-
芜
|
4726 |
-
芝
|
4727 |
-
芡
|
4728 |
-
芥
|
4729 |
-
芦
|
4730 |
-
芩
|
4731 |
-
芪
|
4732 |
-
芫
|
4733 |
-
芬
|
4734 |
-
芭
|
4735 |
-
芮
|
4736 |
-
芯
|
4737 |
-
花
|
4738 |
-
芳
|
4739 |
-
芷
|
4740 |
-
芸
|
4741 |
-
芹
|
4742 |
-
芽
|
4743 |
-
芾
|
4744 |
-
苁
|
4745 |
-
苄
|
4746 |
-
苇
|
4747 |
-
苋
|
4748 |
-
苍
|
4749 |
-
苏
|
4750 |
-
苑
|
4751 |
-
苒
|
4752 |
-
苓
|
4753 |
-
苔
|
4754 |
-
苕
|
4755 |
-
苗
|
4756 |
-
苛
|
4757 |
-
苜
|
4758 |
-
苞
|
4759 |
-
苟
|
4760 |
-
苡
|
4761 |
-
苣
|
4762 |
-
若
|
4763 |
-
苦
|
4764 |
-
苫
|
4765 |
-
苯
|
4766 |
-
英
|
4767 |
-
苷
|
4768 |
-
苹
|
4769 |
-
苻
|
4770 |
-
茁
|
4771 |
-
茂
|
4772 |
-
范
|
4773 |
-
茄
|
4774 |
-
茅
|
4775 |
-
茉
|
4776 |
-
茎
|
4777 |
-
茏
|
4778 |
-
茗
|
4779 |
-
茜
|
4780 |
-
茧
|
4781 |
-
茨
|
4782 |
-
茫
|
4783 |
-
茬
|
4784 |
-
茭
|
4785 |
-
茯
|
4786 |
-
茱
|
4787 |
-
茴
|
4788 |
-
茵
|
4789 |
-
茶
|
4790 |
-
茸
|
4791 |
-
茹
|
4792 |
-
茼
|
4793 |
-
荀
|
4794 |
-
荃
|
4795 |
-
荆
|
4796 |
-
草
|
4797 |
-
荏
|
4798 |
-
荐
|
4799 |
-
荒
|
4800 |
-
荔
|
4801 |
-
荖
|
4802 |
-
荘
|
4803 |
-
荚
|
4804 |
-
荞
|
4805 |
-
荟
|
4806 |
-
荠
|
4807 |
-
荡
|
4808 |
-
荣
|
4809 |
-
荤
|
4810 |
-
荥
|
4811 |
-
荧
|
4812 |
-
荨
|
4813 |
-
荪
|
4814 |
-
荫
|
4815 |
-
药
|
4816 |
-
荳
|
4817 |
-
荷
|
4818 |
-
荸
|
4819 |
-
荻
|
4820 |
-
荼
|
4821 |
-
荽
|
4822 |
-
莅
|
4823 |
-
莆
|
4824 |
-
莉
|
4825 |
-
莎
|
4826 |
-
莒
|
4827 |
-
莓
|
4828 |
-
莘
|
4829 |
-
莞
|
4830 |
-
莠
|
4831 |
-
莪
|
4832 |
-
莫
|
4833 |
-
莱
|
4834 |
-
莲
|
4835 |
-
莴
|
4836 |
-
获
|
4837 |
-
莹
|
4838 |
-
莺
|
4839 |
-
莽
|
4840 |
-
莿
|
4841 |
-
菀
|
4842 |
-
菁
|
4843 |
-
菅
|
4844 |
-
菇
|
4845 |
-
菈
|
4846 |
-
菊
|
4847 |
-
菌
|
4848 |
-
菏
|
4849 |
-
菓
|
4850 |
-
菖
|
4851 |
-
菘
|
4852 |
-
菜
|
4853 |
-
菟
|
4854 |
-
菠
|
4855 |
-
菡
|
4856 |
-
菩
|
4857 |
-
菱
|
4858 |
-
菲
|
4859 |
-
菸
|
4860 |
-
菽
|
4861 |
-
萁
|
4862 |
-
萃
|
4863 |
-
萄
|
4864 |
-
萋
|
4865 |
-
萌
|
4866 |
-
萍
|
4867 |
-
萎
|
4868 |
-
萘
|
4869 |
-
萝
|
4870 |
-
萤
|
4871 |
-
营
|
4872 |
-
萦
|
4873 |
-
萧
|
4874 |
-
萨
|
4875 |
-
萩
|
4876 |
-
萱
|
4877 |
-
萸
|
4878 |
-
萼
|
4879 |
-
落
|
4880 |
-
葆
|
4881 |
-
著
|
4882 |
-
葚
|
4883 |
-
葛
|
4884 |
-
葡
|
4885 |
-
董
|
4886 |
-
葩
|
4887 |
-
葫
|
4888 |
-
葬
|
4889 |
-
葭
|
4890 |
-
葱
|
4891 |
-
葳
|
4892 |
-
葵
|
4893 |
-
葺
|
4894 |
-
蒂
|
4895 |
-
蒋
|
4896 |
-
蒐
|
4897 |
-
蒙
|
4898 |
-
蒜
|
4899 |
-
蒟
|
4900 |
-
蒡
|
4901 |
-
蒨
|
4902 |
-
蒲
|
4903 |
-
蒸
|
4904 |
-
蒹
|
4905 |
-
蒻
|
4906 |
-
蒿
|
4907 |
-
蓁
|
4908 |
-
蓄
|
4909 |
-
蓆
|
4910 |
-
蓉
|
4911 |
-
蓑
|
4912 |
-
蓓
|
4913 |
-
蓖
|
4914 |
-
蓝
|
4915 |
-
蓟
|
4916 |
-
蓦
|
4917 |
-
蓬
|
4918 |
-
蓼
|
4919 |
-
蓿
|
4920 |
-
蔑
|
4921 |
-
蔓
|
4922 |
-
蔗
|
4923 |
-
蔘
|
4924 |
-
蔚
|
4925 |
-
蔡
|
4926 |
-
蔫
|
4927 |
-
蔬
|
4928 |
-
蔵
|
4929 |
-
蔷
|
4930 |
-
蔺
|
4931 |
-
蔻
|
4932 |
-
蔼
|
4933 |
-
蔽
|
4934 |
-
蕃
|
4935 |
-
蕈
|
4936 |
-
蕉
|
4937 |
-
蕊
|
4938 |
-
蕙
|
4939 |
-
蕤
|
4940 |
-
蕨
|
4941 |
-
蕲
|
4942 |
-
蕴
|
4943 |
-
蕻
|
4944 |
-
蕾
|
4945 |
-
薄
|
4946 |
-
薅
|
4947 |
-
薇
|
4948 |
-
薏
|
4949 |
-
薑
|
4950 |
-
薙
|
4951 |
-
薛
|
4952 |
-
薨
|
4953 |
-
薪
|
4954 |
-
薬
|
4955 |
-
薯
|
4956 |
-
薰
|
4957 |
-
薹
|
4958 |
-
藏
|
4959 |
-
藐
|
4960 |
-
藓
|
4961 |
-
藕
|
4962 |
-
藜
|
4963 |
-
藤
|
4964 |
-
藩
|
4965 |
-
藻
|
4966 |
-
藿
|
4967 |
-
蘑
|
4968 |
-
蘸
|
4969 |
-
蘼
|
4970 |
-
虎
|
4971 |
-
虏
|
4972 |
-
虐
|
4973 |
-
虑
|
4974 |
-
虔
|
4975 |
-
虚
|
4976 |
-
虞
|
4977 |
-
虢
|
4978 |
-
虫
|
4979 |
-
虬
|
4980 |
-
虱
|
4981 |
-
虹
|
4982 |
-
虻
|
4983 |
-
虽
|
4984 |
-
虾
|
4985 |
-
蚀
|
4986 |
-
蚁
|
4987 |
-
蚂
|
4988 |
-
蚊
|
4989 |
-
蚌
|
4990 |
-
蚓
|
4991 |
-
蚕
|
4992 |
-
蚜
|
4993 |
-
蚝
|
4994 |
-
蚣
|
4995 |
-
蚤
|
4996 |
-
蚩
|
4997 |
-
蚪
|
4998 |
-
蚯
|
4999 |
-
蚱
|
5000 |
-
蚵
|
5001 |
-
蛀
|
5002 |
-
蛆
|
5003 |
-
蛇
|
5004 |
-
蛊
|
5005 |
-
蛋
|
5006 |
-
蛎
|
5007 |
-
蛐
|
5008 |
-
蛔
|
5009 |
-
蛙
|
5010 |
-
蛛
|
5011 |
-
蛟
|
5012 |
-
蛤
|
5013 |
-
蛭
|
5014 |
-
蛮
|
5015 |
-
蛰
|
5016 |
-
蛳
|
5017 |
-
蛹
|
5018 |
-
蛾
|
5019 |
-
蜀
|
5020 |
-
蜂
|
5021 |
-
蜃
|
5022 |
-
蜇
|
5023 |
-
蜈
|
5024 |
-
蜊
|
5025 |
-
蜍
|
5026 |
-
蜒
|
5027 |
-
蜓
|
5028 |
-
蜕
|
5029 |
-
蜗
|
5030 |
-
蜘
|
5031 |
-
蜚
|
5032 |
-
蜜
|
5033 |
-
蜡
|
5034 |
-
蜢
|
5035 |
-
蜥
|
5036 |
-
蜱
|
5037 |
-
��
|
5038 |
-
蜷
|
5039 |
-
蜻
|
5040 |
-
蜿
|
5041 |
-
蝇
|
5042 |
-
蝈
|
5043 |
-
蝉
|
5044 |
-
蝌
|
5045 |
-
蝎
|
5046 |
-
蝗
|
5047 |
-
蝙
|
5048 |
-
蝠
|
5049 |
-
蝨
|
5050 |
-
蝴
|
5051 |
-
蝶
|
5052 |
-
蝼
|
5053 |
-
螂
|
5054 |
-
螃
|
5055 |
-
融
|
5056 |
-
螨
|
5057 |
-
螯
|
5058 |
-
螳
|
5059 |
-
螺
|
5060 |
-
蟀
|
5061 |
-
蟆
|
5062 |
-
蟋
|
5063 |
-
蟑
|
5064 |
-
蟒
|
5065 |
-
蟠
|
5066 |
-
蟹
|
5067 |
-
蟾
|
5068 |
-
蠍
|
5069 |
-
蠔
|
5070 |
-
蠕
|
5071 |
-
蠛
|
5072 |
-
蠡
|
5073 |
-
蠢
|
5074 |
-
蠹
|
5075 |
-
血
|
5076 |
-
衄
|
5077 |
-
衅
|
5078 |
-
行
|
5079 |
-
衍
|
5080 |
-
衔
|
5081 |
-
街
|
5082 |
-
衙
|
5083 |
-
衞
|
5084 |
-
衡
|
5085 |
-
衢
|
5086 |
-
衣
|
5087 |
-
补
|
5088 |
-
表
|
5089 |
-
衩
|
5090 |
-
衫
|
5091 |
-
衬
|
5092 |
-
衮
|
5093 |
-
衰
|
5094 |
-
衲
|
5095 |
-
衷
|
5096 |
-
衾
|
5097 |
-
衿
|
5098 |
-
袁
|
5099 |
-
袂
|
5100 |
-
袄
|
5101 |
-
袅
|
5102 |
-
袈
|
5103 |
-
袋
|
5104 |
-
袍
|
5105 |
-
袒
|
5106 |
-
袖
|
5107 |
-
袜
|
5108 |
-
袤
|
5109 |
-
袪
|
5110 |
-
被
|
5111 |
-
袭
|
5112 |
-
袱
|
5113 |
-
裁
|
5114 |
-
裂
|
5115 |
-
装
|
5116 |
-
裆
|
5117 |
-
裔
|
5118 |
-
裕
|
5119 |
-
裘
|
5120 |
-
裙
|
5121 |
-
裟
|
5122 |
-
裤
|
5123 |
-
裨
|
5124 |
-
裱
|
5125 |
-
裳
|
5126 |
-
裴
|
5127 |
-
裸
|
5128 |
-
裹
|
5129 |
-
裾
|
5130 |
-
褂
|
5131 |
-
褐
|
5132 |
-
褒
|
5133 |
-
褓
|
5134 |
-
褔
|
5135 |
-
褚
|
5136 |
-
褥
|
5137 |
-
褪
|
5138 |
-
褫
|
5139 |
-
褶
|
5140 |
-
襁
|
5141 |
-
襄
|
5142 |
-
襟
|
5143 |
-
西
|
5144 |
-
要
|
5145 |
-
覃
|
5146 |
-
覆
|
5147 |
-
覇
|
5148 |
-
覚
|
5149 |
-
覧
|
5150 |
-
観
|
5151 |
-
见
|
5152 |
-
观
|
5153 |
-
规
|
5154 |
-
觅
|
5155 |
-
视
|
5156 |
-
览
|
5157 |
-
觉
|
5158 |
-
觊
|
5159 |
-
觎
|
5160 |
-
觐
|
5161 |
-
觑
|
5162 |
-
角
|
5163 |
-
觞
|
5164 |
-
解
|
5165 |
-
觥
|
5166 |
-
触
|
5167 |
-
言
|
5168 |
-
訳
|
5169 |
-
証
|
5170 |
-
詹
|
5171 |
-
誉
|
5172 |
-
誓
|
5173 |
-
読
|
5174 |
-
諡
|
5175 |
-
譁
|
5176 |
-
警
|
5177 |
-
譬
|
5178 |
-
譲
|
5179 |
-
讚
|
5180 |
-
计
|
5181 |
-
订
|
5182 |
-
认
|
5183 |
-
讥
|
5184 |
-
讧
|
5185 |
-
讨
|
5186 |
-
让
|
5187 |
-
讪
|
5188 |
-
讫
|
5189 |
-
训
|
5190 |
-
议
|
5191 |
-
讯
|
5192 |
-
记
|
5193 |
-
讲
|
5194 |
-
讳
|
5195 |
-
讴
|
5196 |
-
讶
|
5197 |
-
讷
|
5198 |
-
许
|
5199 |
-
讹
|
5200 |
-
论
|
5201 |
-
讼
|
5202 |
-
讽
|
5203 |
-
设
|
5204 |
-
访
|
5205 |
-
诀
|
5206 |
-
证
|
5207 |
-
诃
|
5208 |
-
评
|
5209 |
-
诅
|
5210 |
-
识
|
5211 |
-
诈
|
5212 |
-
诉
|
5213 |
-
诊
|
5214 |
-
诋
|
5215 |
-
词
|
5216 |
-
诏
|
5217 |
-
译
|
5218 |
-
试
|
5219 |
-
诗
|
5220 |
-
诘
|
5221 |
-
诙
|
5222 |
-
诚
|
5223 |
-
诛
|
5224 |
-
话
|
5225 |
-
诞
|
5226 |
-
诟
|
5227 |
-
诠
|
5228 |
-
诡
|
5229 |
-
询
|
5230 |
-
诣
|
5231 |
-
诤
|
5232 |
-
该
|
5233 |
-
详
|
5234 |
-
诧
|
5235 |
-
诩
|
5236 |
-
诫
|
5237 |
-
诬
|
5238 |
-
语
|
5239 |
-
误
|
5240 |
-
诰
|
5241 |
-
诱
|
5242 |
-
诲
|
5243 |
-
说
|
5244 |
-
诵
|
5245 |
-
诶
|
5246 |
-
请
|
5247 |
-
诸
|
5248 |
-
诺
|
5249 |
-
读
|
5250 |
-
诽
|
5251 |
-
课
|
5252 |
-
诿
|
5253 |
-
谀
|
5254 |
-
谁
|
5255 |
-
调
|
5256 |
-
谄
|
5257 |
-
谅
|
5258 |
-
谆
|
5259 |
-
谈
|
5260 |
-
谊
|
5261 |
-
谋
|
5262 |
-
谌
|
5263 |
-
谍
|
5264 |
-
谎
|
5265 |
-
谏
|
5266 |
-
谐
|
5267 |
-
谑
|
5268 |
-
谒
|
5269 |
-
谓
|
5270 |
-
谔
|
5271 |
-
谕
|
5272 |
-
谗
|
5273 |
-
谘
|
5274 |
-
谙
|
5275 |
-
谚
|
5276 |
-
谛
|
5277 |
-
谜
|
5278 |
-
谟
|
5279 |
-
谢
|
5280 |
-
谣
|
5281 |
-
谤
|
5282 |
-
谥
|
5283 |
-
谦
|
5284 |
-
谧
|
5285 |
-
谨
|
5286 |
-
谩
|
5287 |
-
谪
|
5288 |
-
谬
|
5289 |
-
谭
|
5290 |
-
谯
|
5291 |
-
谱
|
5292 |
-
谲
|
5293 |
-
谴
|
5294 |
-
谶
|
5295 |
-
谷
|
5296 |
-
豁
|
5297 |
-
豆
|
5298 |
-
豇
|
5299 |
-
豉
|
5300 |
-
豊
|
5301 |
-
豌
|
5302 |
-
豔
|
5303 |
-
豚
|
5304 |
-
象
|
5305 |
-
豢
|
5306 |
-
豪
|
5307 |
-
豫
|
5308 |
-
豹
|
5309 |
-
豺
|
5310 |
-
貂
|
5311 |
-
貅
|
5312 |
-
貌
|
5313 |
-
貔
|
5314 |
-
貘
|
5315 |
-
贝
|
5316 |
-
贞
|
5317 |
-
负
|
5318 |
-
贡
|
5319 |
-
财
|
5320 |
-
责
|
5321 |
-
贤
|
5322 |
-
败
|
5323 |
-
账
|
5324 |
-
货
|
5325 |
-
质
|
5326 |
-
贩
|
5327 |
-
贪
|
5328 |
-
贫
|
5329 |
-
贬
|
5330 |
-
购
|
5331 |
-
贮
|
5332 |
-
贯
|
5333 |
-
贰
|
5334 |
-
贱
|
5335 |
-
贲
|
5336 |
-
贴
|
5337 |
-
贵
|
5338 |
-
贷
|
5339 |
-
贸
|
5340 |
-
费
|
5341 |
-
贺
|
5342 |
-
贻
|
5343 |
-
贼
|
5344 |
-
贾
|
5345 |
-
贿
|
5346 |
-
赁
|
5347 |
-
赂
|
5348 |
-
赃
|
5349 |
-
资
|
5350 |
-
赅
|
5351 |
-
赈
|
5352 |
-
赊
|
5353 |
-
赋
|
5354 |
-
赌
|
5355 |
-
赎
|
5356 |
-
赏
|
5357 |
-
赐
|
5358 |
-
赓
|
5359 |
-
赔
|
5360 |
-
赖
|
5361 |
-
赘
|
5362 |
-
赚
|
5363 |
-
赛
|
5364 |
-
赝
|
5365 |
-
赞
|
5366 |
-
赠
|
5367 |
-
赡
|
5368 |
-
赢
|
5369 |
-
赣
|
5370 |
-
赤
|
5371 |
-
赦
|
5372 |
-
赧
|
5373 |
-
赫
|
5374 |
-
赭
|
5375 |
-
走
|
5376 |
-
赳
|
5377 |
-
赴
|
5378 |
-
赵
|
5379 |
-
赶
|
5380 |
-
起
|
5381 |
-
趁
|
5382 |
-
超
|
5383 |
-
越
|
5384 |
-
趋
|
5385 |
-
趟
|
5386 |
-
趣
|
5387 |
-
足
|
5388 |
-
趴
|
5389 |
-
趵
|
5390 |
-
趸
|
5391 |
-
趺
|
5392 |
-
趾
|
5393 |
-
跃
|
5394 |
-
跄
|
5395 |
-
跆
|
5396 |
-
跋
|
5397 |
-
跌
|
5398 |
-
跎
|
5399 |
-
跑
|
5400 |
-
跖
|
5401 |
-
跚
|
5402 |
-
跛
|
5403 |
-
距
|
5404 |
-
跟
|
5405 |
-
跤
|
5406 |
-
跨
|
5407 |
-
跩
|
5408 |
-
跪
|
5409 |
-
路
|
5410 |
-
跳
|
5411 |
-
践
|
5412 |
-
跷
|
5413 |
-
跹
|
5414 |
-
跺
|
5415 |
-
跻
|
5416 |
-
踉
|
5417 |
-
踊
|
5418 |
-
踌
|
5419 |
-
踏
|
5420 |
-
踝
|
5421 |
-
踞
|
5422 |
-
踟
|
5423 |
-
踢
|
5424 |
-
踩
|
5425 |
-
踪
|
5426 |
-
踮
|
5427 |
-
踱
|
5428 |
-
踵
|
5429 |
-
踹
|
5430 |
-
蹂
|
5431 |
-
蹄
|
5432 |
-
蹇
|
5433 |
-
蹈
|
5434 |
-
蹉
|
5435 |
-
蹊
|
5436 |
-
蹋
|
5437 |
-
蹑
|
5438 |
-
蹒
|
5439 |
-
蹙
|
5440 |
-
蹟
|
5441 |
-
蹦
|
5442 |
-
蹩
|
5443 |
-
蹬
|
5444 |
-
蹭
|
5445 |
-
蹲
|
5446 |
-
蹴
|
5447 |
-
蹶
|
5448 |
-
蹼
|
5449 |
-
蹿
|
5450 |
-
躁
|
5451 |
-
躇
|
5452 |
-
躏
|
5453 |
-
身
|
5454 |
-
躬
|
5455 |
-
躯
|
5456 |
-
躲
|
5457 |
-
躺
|
5458 |
-
転
|
5459 |
-
軽
|
5460 |
-
车
|
5461 |
-
轧
|
5462 |
-
轨
|
5463 |
-
轩
|
5464 |
-
转
|
5465 |
-
轭
|
5466 |
-
轮
|
5467 |
-
软
|
5468 |
-
轰
|
5469 |
-
轲
|
5470 |
-
轴
|
5471 |
-
轶
|
5472 |
-
轻
|
5473 |
-
轼
|
5474 |
-
载
|
5475 |
-
轿
|
5476 |
-
较
|
5477 |
-
辄
|
5478 |
-
辅
|
5479 |
-
辆
|
5480 |
-
辇
|
5481 |
-
辈
|
5482 |
-
辉
|
5483 |
-
辊
|
5484 |
-
辍
|
5485 |
-
辐
|
5486 |
-
辑
|
5487 |
-
输
|
5488 |
-
辕
|
5489 |
-
辖
|
5490 |
-
辗
|
5491 |
-
辘
|
5492 |
-
辙
|
5493 |
-
辛
|
5494 |
-
辜
|
5495 |
-
辞
|
5496 |
-
辟
|
5497 |
-
辣
|
5498 |
-
辨
|
5499 |
-
辩
|
5500 |
-
辫
|
5501 |
-
辰
|
5502 |
-
辱
|
5503 |
-
边
|
5504 |
-
辺
|
5505 |
-
辻
|
5506 |
-
込
|
5507 |
-
辽
|
5508 |
-
达
|
5509 |
-
迁
|
5510 |
-
迂
|
5511 |
-
迄
|
5512 |
-
迅
|
5513 |
-
过
|
5514 |
-
迈
|
5515 |
-
迎
|
5516 |
-
运
|
5517 |
-
近
|
5518 |
-
返
|
5519 |
-
还
|
5520 |
-
这
|
5521 |
-
进
|
5522 |
-
远
|
5523 |
-
违
|
5524 |
-
连
|
5525 |
-
迟
|
5526 |
-
迢
|
5527 |
-
迤
|
5528 |
-
迥
|
5529 |
-
迦
|
5530 |
-
迩
|
5531 |
-
迪
|
5532 |
-
迫
|
5533 |
-
迭
|
5534 |
-
述
|
5535 |
-
迷
|
5536 |
-
迸
|
5537 |
-
迹
|
5538 |
-
迺
|
5539 |
-
追
|
5540 |
-
退
|
5541 |
-
送
|
5542 |
-
适
|
5543 |
-
逃
|
5544 |
-
逅
|
5545 |
-
逆
|
5546 |
-
选
|
5547 |
-
逊
|
5548 |
-
逍
|
5549 |
-
透
|
5550 |
-
逐
|
5551 |
-
递
|
5552 |
-
途
|
5553 |
-
逗
|
5554 |
-
通
|
5555 |
-
逛
|
5556 |
-
逝
|
5557 |
-
逞
|
5558 |
-
速
|
5559 |
-
造
|
5560 |
-
逢
|
5561 |
-
逮
|
5562 |
-
逵
|
5563 |
-
逶
|
5564 |
-
逸
|
5565 |
-
逻
|
5566 |
-
逼
|
5567 |
-
逾
|
5568 |
-
遁
|
5569 |
-
遂
|
5570 |
-
遅
|
5571 |
-
遇
|
5572 |
-
遍
|
5573 |
-
遏
|
5574 |
-
遐
|
5575 |
-
遑
|
5576 |
-
遒
|
5577 |
-
道
|
5578 |
-
遗
|
5579 |
-
遛
|
5580 |
-
遢
|
5581 |
-
遣
|
5582 |
-
遥
|
5583 |
-
遨
|
5584 |
-
遭
|
5585 |
-
遮
|
5586 |
-
遴
|
5587 |
-
遵
|
5588 |
-
遶
|
5589 |
-
遽
|
5590 |
-
避
|
5591 |
-
邀
|
5592 |
-
邂
|
5593 |
-
邃
|
5594 |
-
邈
|
5595 |
-
邋
|
5596 |
-
邑
|
5597 |
-
邓
|
5598 |
-
邕
|
5599 |
-
邛
|
5600 |
-
邝
|
5601 |
-
邢
|
5602 |
-
那
|
5603 |
-
邦
|
5604 |
-
邨
|
5605 |
-
邪
|
5606 |
-
邬
|
5607 |
-
邮
|
5608 |
-
邯
|
5609 |
-
邰
|
5610 |
-
邱
|
5611 |
-
邳
|
5612 |
-
邵
|
5613 |
-
邸
|
5614 |
-
邹
|
5615 |
-
邺
|
5616 |
-
邻
|
5617 |
-
郁
|
5618 |
-
郅
|
5619 |
-
郊
|
5620 |
-
郎
|
5621 |
-
郑
|
5622 |
-
郜
|
5623 |
-
郝
|
5624 |
-
郡
|
5625 |
-
郢
|
5626 |
-
郤
|
5627 |
-
郦
|
5628 |
-
郧
|
5629 |
-
部
|
5630 |
-
郫
|
5631 |
-
郭
|
5632 |
-
郴
|
5633 |
-
郷
|
5634 |
-
郸
|
5635 |
-
都
|
5636 |
-
鄂
|
5637 |
-
鄙
|
5638 |
-
鄞
|
5639 |
-
鄢
|
5640 |
-
鄱
|
5641 |
-
酉
|
5642 |
-
酊
|
5643 |
-
酋
|
5644 |
-
酌
|
5645 |
-
配
|
5646 |
-
酐
|
5647 |
-
酒
|
5648 |
-
酗
|
5649 |
-
酚
|
5650 |
-
酝
|
5651 |
-
酢
|
5652 |
-
酣
|
5653 |
-
酥
|
5654 |
-
酩
|
5655 |
-
酪
|
5656 |
-
酬
|
5657 |
-
酮
|
5658 |
-
酯
|
5659 |
-
酰
|
5660 |
-
酱
|
5661 |
-
酵
|
5662 |
-
酶
|
5663 |
-
酷
|
5664 |
-
酸
|
5665 |
-
酿
|
5666 |
-
醃
|
5667 |
-
醇
|
5668 |
-
醉
|
5669 |
-
醋
|
5670 |
-
醍
|
5671 |
-
醐
|
5672 |
-
醒
|
5673 |
-
醚
|
5674 |
-
醛
|
5675 |
-
醣
|
5676 |
-
醪
|
5677 |
-
醮
|
5678 |
-
醯
|
5679 |
-
醴
|
5680 |
-
醺
|
5681 |
-
采
|
5682 |
-
釉
|
5683 |
-
释
|
5684 |
-
里
|
5685 |
-
重
|
5686 |
-
野
|
5687 |
-
量
|
5688 |
-
金
|
5689 |
-
釜
|
5690 |
-
釦
|
5691 |
-
鈪
|
5692 |
-
鉄
|
5693 |
-
鉴
|
5694 |
-
銭
|
5695 |
-
銮
|
5696 |
-
鍊
|
5697 |
-
鎌
|
5698 |
-
鎏
|
5699 |
-
鎗
|
5700 |
-
鏖
|
5701 |
-
鑑
|
5702 |
-
鑫
|
5703 |
-
针
|
5704 |
-
钉
|
5705 |
-
钊
|
5706 |
-
钎
|
5707 |
-
钏
|
5708 |
-
钒
|
5709 |
-
钓
|
5710 |
-
钗
|
5711 |
-
钙
|
5712 |
-
钛
|
5713 |
-
钜
|
5714 |
-
钝
|
5715 |
-
钞
|
5716 |
-
钟
|
5717 |
-
钠
|
5718 |
-
钡
|
5719 |
-
钢
|
5720 |
-
钣
|
5721 |
-
钤
|
5722 |
-
钥
|
5723 |
-
钦
|
5724 |
-
钧
|
5725 |
-
钨
|
5726 |
-
钩
|
5727 |
-
钮
|
5728 |
-
钯
|
5729 |
-
钰
|
5730 |
-
钱
|
5731 |
-
钳
|
5732 |
-
钴
|
5733 |
-
钵
|
5734 |
-
钺
|
5735 |
-
钻
|
5736 |
-
钼
|
5737 |
-
钾
|
5738 |
-
钿
|
5739 |
-
铀
|
5740 |
-
铁
|
5741 |
-
铂
|
5742 |
-
铃
|
5743 |
-
铄
|
5744 |
-
铅
|
5745 |
-
铆
|
5746 |
-
铉
|
5747 |
-
铎
|
5748 |
-
铐
|
5749 |
-
铛
|
5750 |
-
铜
|
5751 |
-
铝
|
5752 |
-
铠
|
5753 |
-
铡
|
5754 |
-
铢
|
5755 |
-
铣
|
5756 |
-
铤
|
5757 |
-
铨
|
5758 |
-
铩
|
5759 |
-
铬
|
5760 |
-
铭
|
5761 |
-
铮
|
5762 |
-
铰
|
5763 |
-
铲
|
5764 |
-
铵
|
5765 |
-
银
|
5766 |
-
铸
|
5767 |
-
铺
|
5768 |
-
链
|
5769 |
-
铿
|
5770 |
-
销
|
5771 |
-
锁
|
5772 |
-
锂
|
5773 |
-
锄
|
5774 |
-
锅
|
5775 |
-
锆
|
5776 |
-
锈
|
5777 |
-
锉
|
5778 |
-
锋
|
5779 |
-
锌
|
5780 |
-
锏
|
5781 |
-
锐
|
5782 |
-
锑
|
5783 |
-
错
|
5784 |
-
锚
|
5785 |
-
锟
|
5786 |
-
锡
|
5787 |
-
锢
|
5788 |
-
锣
|
5789 |
-
锤
|
5790 |
-
锥
|
5791 |
-
锦
|
5792 |
-
锭
|
5793 |
-
键
|
5794 |
-
锯
|
5795 |
-
锰
|
5796 |
-
锲
|
5797 |
-
锵
|
5798 |
-
锹
|
5799 |
-
锺
|
5800 |
-
锻
|
5801 |
-
镀
|
5802 |
-
镁
|
5803 |
-
镂
|
5804 |
-
镇
|
5805 |
-
镉
|
5806 |
-
镌
|
5807 |
-
镍
|
5808 |
-
镐
|
5809 |
-
镑
|
5810 |
-
镕
|
5811 |
-
镖
|
5812 |
-
镗
|
5813 |
-
镛
|
5814 |
-
镜
|
5815 |
-
镣
|
5816 |
-
镭
|
5817 |
-
镯
|
5818 |
-
镰
|
5819 |
-
镳
|
5820 |
-
镶
|
5821 |
-
长
|
5822 |
-
閒
|
5823 |
-
関
|
5824 |
-
闇
|
5825 |
-
闘
|
5826 |
-
闢
|
5827 |
-
门
|
5828 |
-
闪
|
5829 |
-
闫
|
5830 |
-
闭
|
5831 |
-
问
|
5832 |
-
闯
|
5833 |
-
闰
|
5834 |
-
闲
|
5835 |
-
间
|
5836 |
-
闵
|
5837 |
-
闷
|
5838 |
-
闸
|
5839 |
-
闹
|
5840 |
-
闺
|
5841 |
-
闻
|
5842 |
-
闽
|
5843 |
-
闾
|
5844 |
-
阀
|
5845 |
-
阁
|
5846 |
-
阂
|
5847 |
-
阅
|
5848 |
-
阆
|
5849 |
-
阇
|
5850 |
-
阈
|
5851 |
-
阉
|
5852 |
-
阎
|
5853 |
-
阐
|
5854 |
-
阑
|
5855 |
-
阔
|
5856 |
-
阕
|
5857 |
-
阖
|
5858 |
-
阙
|
5859 |
-
阚
|
5860 |
-
阜
|
5861 |
-
队
|
5862 |
-
阡
|
5863 |
-
阮
|
5864 |
-
阱
|
5865 |
-
防
|
5866 |
-
阳
|
5867 |
-
阴
|
5868 |
-
阵
|
5869 |
-
阶
|
5870 |
-
阻
|
5871 |
-
阿
|
5872 |
-
陀
|
5873 |
-
陂
|
5874 |
-
附
|
5875 |
-
际
|
5876 |
-
陆
|
5877 |
-
陇
|
5878 |
-
陈
|
5879 |
-
陋
|
5880 |
-
陌
|
5881 |
-
降
|
5882 |
-
限
|
5883 |
-
陕
|
5884 |
-
陛
|
5885 |
-
陞
|
5886 |
-
陟
|
5887 |
-
陡
|
5888 |
-
院
|
5889 |
-
除
|
5890 |
-
陨
|
5891 |
-
险
|
5892 |
-
陪
|
5893 |
-
陲
|
5894 |
-
陵
|
5895 |
-
陶
|
5896 |
-
陷
|
5897 |
-
険
|
5898 |
-
隅
|
5899 |
-
隆
|
5900 |
-
隈
|
5901 |
-
隋
|
5902 |
-
隍
|
5903 |
-
随
|
5904 |
-
隐
|
5905 |
-
隔
|
5906 |
-
隘
|
5907 |
-
隙
|
5908 |
-
障
|
5909 |
-
隠
|
5910 |
-
隣
|
5911 |
-
隧
|
5912 |
-
隶
|
5913 |
-
隼
|
5914 |
-
隽
|
5915 |
-
难
|
5916 |
-
雀
|
5917 |
-
雁
|
5918 |
-
雄
|
5919 |
-
雅
|
5920 |
-
集
|
5921 |
-
雇
|
5922 |
-
雉
|
5923 |
-
雌
|
5924 |
-
雍
|
5925 |
-
雎
|
5926 |
-
雏
|
5927 |
-
雑
|
5928 |
-
雒
|
5929 |
-
雕
|
5930 |
-
雨
|
5931 |
-
雪
|
5932 |
-
雯
|
5933 |
-
雰
|
5934 |
-
雳
|
5935 |
-
零
|
5936 |
-
雷
|
5937 |
-
雹
|
5938 |
-
雾
|
5939 |
-
需
|
5940 |
-
霁
|
5941 |
-
霄
|
5942 |
-
霆
|
5943 |
-
震
|
5944 |
-
霈
|
5945 |
-
霉
|
5946 |
-
霊
|
5947 |
-
霍
|
5948 |
-
霎
|
5949 |
-
霏
|
5950 |
-
霑
|
5951 |
-
霓
|
5952 |
-
霖
|
5953 |
-
霜
|
5954 |
-
霞
|
5955 |
-
霭
|
5956 |
-
霰
|
5957 |
-
露
|
5958 |
-
霸
|
5959 |
-
霹
|
5960 |
-
霾
|
5961 |
-
青
|
5962 |
-
靓
|
5963 |
-
靖
|
5964 |
-
静
|
5965 |
-
靛
|
5966 |
-
非
|
5967 |
-
靠
|
5968 |
-
靡
|
5969 |
-
面
|
5970 |
-
靥
|
5971 |
-
革
|
5972 |
-
靳
|
5973 |
-
靴
|
5974 |
-
靶
|
5975 |
-
靼
|
5976 |
-
鞅
|
5977 |
-
鞋
|
5978 |
-
鞍
|
5979 |
-
鞑
|
5980 |
-
鞘
|
5981 |
-
鞠
|
5982 |
-
鞣
|
5983 |
-
鞭
|
5984 |
-
韦
|
5985 |
-
韧
|
5986 |
-
韩
|
5987 |
-
韬
|
5988 |
-
韭
|
5989 |
-
音
|
5990 |
-
韵
|
5991 |
-
韶
|
5992 |
-
頫
|
5993 |
-
頼
|
5994 |
-
页
|
5995 |
-
顶
|
5996 |
-
顷
|
5997 |
-
项
|
5998 |
-
顺
|
5999 |
-
须
|
6000 |
-
顼
|
6001 |
-
顽
|
6002 |
-
顾
|
6003 |
-
顿
|
6004 |
-
颁
|
6005 |
-
颂
|
6006 |
-
预
|
6007 |
-
颅
|
6008 |
-
领
|
6009 |
-
颇
|
6010 |
-
颈
|
6011 |
-
颉
|
6012 |
-
颊
|
6013 |
-
颌
|
6014 |
-
颍
|
6015 |
-
颐
|
6016 |
-
频
|
6017 |
-
颓
|
6018 |
-
颔
|
6019 |
-
颖
|
6020 |
-
颗
|
6021 |
-
题
|
6022 |
-
颚
|
6023 |
-
颛
|
6024 |
-
颜
|
6025 |
-
额
|
6026 |
-
颞
|
6027 |
-
颠
|
6028 |
-
颡
|
6029 |
-
颢
|
6030 |
-
颤
|
6031 |
-
颦
|
6032 |
-
颧
|
6033 |
-
风
|
6034 |
-
飒
|
6035 |
-
飓
|
6036 |
-
飕
|
6037 |
-
飘
|
6038 |
-
飙
|
6039 |
-
飚
|
6040 |
-
飞
|
6041 |
-
食
|
6042 |
-
飨
|
6043 |
-
餐
|
6044 |
-
餮
|
6045 |
-
餵
|
6046 |
-
饍
|
6047 |
-
饕
|
6048 |
-
饥
|
6049 |
-
饨
|
6050 |
-
饪
|
6051 |
-
饬
|
6052 |
-
饭
|
6053 |
-
饮
|
6054 |
-
饯
|
6055 |
-
饰
|
6056 |
-
饱
|
6057 |
-
饲
|
6058 |
-
饴
|
6059 |
-
饵
|
6060 |
-
饶
|
6061 |
-
��
|
6062 |
-
饺
|
6063 |
-
饼
|
6064 |
-
饽
|
6065 |
-
饿
|
6066 |
-
馀
|
6067 |
-
馁
|
6068 |
-
馄
|
6069 |
-
馅
|
6070 |
-
馆
|
6071 |
-
馈
|
6072 |
-
馋
|
6073 |
-
馍
|
6074 |
-
馏
|
6075 |
-
馒
|
6076 |
-
馔
|
6077 |
-
首
|
6078 |
-
馗
|
6079 |
-
香
|
6080 |
-
馥
|
6081 |
-
馨
|
6082 |
-
駄
|
6083 |
-
駅
|
6084 |
-
駆
|
6085 |
-
験
|
6086 |
-
騨
|
6087 |
-
驒
|
6088 |
-
马
|
6089 |
-
驭
|
6090 |
-
驮
|
6091 |
-
驯
|
6092 |
-
驰
|
6093 |
-
驱
|
6094 |
-
驳
|
6095 |
-
驴
|
6096 |
-
驶
|
6097 |
-
驷
|
6098 |
-
驸
|
6099 |
-
驹
|
6100 |
-
驻
|
6101 |
-
驼
|
6102 |
-
驾
|
6103 |
-
驿
|
6104 |
-
骁
|
6105 |
-
骂
|
6106 |
-
骄
|
6107 |
-
骅
|
6108 |
-
骆
|
6109 |
-
骇
|
6110 |
-
骈
|
6111 |
-
骊
|
6112 |
-
骋
|
6113 |
-
验
|
6114 |
-
骏
|
6115 |
-
骐
|
6116 |
-
骑
|
6117 |
-
骗
|
6118 |
-
骚
|
6119 |
-
骛
|
6120 |
-
骜
|
6121 |
-
骞
|
6122 |
-
骠
|
6123 |
-
骡
|
6124 |
-
骤
|
6125 |
-
骥
|
6126 |
-
骧
|
6127 |
-
骨
|
6128 |
-
骰
|
6129 |
-
骶
|
6130 |
-
骷
|
6131 |
-
骸
|
6132 |
-
骼
|
6133 |
-
髂
|
6134 |
-
髅
|
6135 |
-
髋
|
6136 |
-
髓
|
6137 |
-
高
|
6138 |
-
髦
|
6139 |
-
髪
|
6140 |
-
髯
|
6141 |
-
髻
|
6142 |
-
鬃
|
6143 |
-
鬓
|
6144 |
-
鬟
|
6145 |
-
鬣
|
6146 |
-
鬼
|
6147 |
-
魁
|
6148 |
-
魂
|
6149 |
-
魄
|
6150 |
-
魅
|
6151 |
-
魇
|
6152 |
-
魍
|
6153 |
-
魏
|
6154 |
-
魔
|
6155 |
-
鮨
|
6156 |
-
鱼
|
6157 |
-
鱿
|
6158 |
-
鲁
|
6159 |
-
鲈
|
6160 |
-
鲍
|
6161 |
-
鲑
|
6162 |
-
鲛
|
6163 |
-
鲜
|
6164 |
-
鲟
|
6165 |
-
鲢
|
6166 |
-
鲤
|
6167 |
-
鲨
|
6168 |
-
鲫
|
6169 |
-
鲱
|
6170 |
-
鲲
|
6171 |
-
鲶
|
6172 |
-
鲷
|
6173 |
-
鲸
|
6174 |
-
鳃
|
6175 |
-
鳄
|
6176 |
-
鳅
|
6177 |
-
鳌
|
6178 |
-
鳍
|
6179 |
-
鳕
|
6180 |
-
鳖
|
6181 |
-
鳗
|
6182 |
-
鳝
|
6183 |
-
鳞
|
6184 |
-
鵰
|
6185 |
-
鸟
|
6186 |
-
鸠
|
6187 |
-
鸡
|
6188 |
-
鸢
|
6189 |
-
鸣
|
6190 |
-
鸥
|
6191 |
-
鸦
|
6192 |
-
鸨
|
6193 |
-
鸪
|
6194 |
-
鸭
|
6195 |
-
鸯
|
6196 |
-
鸳
|
6197 |
-
鸵
|
6198 |
-
鸽
|
6199 |
-
鸾
|
6200 |
-
鸿
|
6201 |
-
鹂
|
6202 |
-
鹃
|
6203 |
-
鹄
|
6204 |
-
鹅
|
6205 |
-
鹈
|
6206 |
-
鹉
|
6207 |
-
鹊
|
6208 |
-
鹌
|
6209 |
-
鹏
|
6210 |
-
鹑
|
6211 |
-
鹕
|
6212 |
-
鹘
|
6213 |
-
鹜
|
6214 |
-
鹞
|
6215 |
-
鹤
|
6216 |
-
鹦
|
6217 |
-
鹧
|
6218 |
-
鹫
|
6219 |
-
鹭
|
6220 |
-
鹰
|
6221 |
-
鹳
|
6222 |
-
鹿
|
6223 |
-
麂
|
6224 |
-
麋
|
6225 |
-
麒
|
6226 |
-
麓
|
6227 |
-
麝
|
6228 |
-
麟
|
6229 |
-
麦
|
6230 |
-
麴
|
6231 |
-
麸
|
6232 |
-
麺
|
6233 |
-
麻
|
6234 |
-
麾
|
6235 |
-
黄
|
6236 |
-
黍
|
6237 |
-
黎
|
6238 |
-
黏
|
6239 |
-
黑
|
6240 |
-
黒
|
6241 |
-
黔
|
6242 |
-
默
|
6243 |
-
黛
|
6244 |
-
黜
|
6245 |
-
黝
|
6246 |
-
黠
|
6247 |
-
黯
|
6248 |
-
鼋
|
6249 |
-
鼎
|
6250 |
-
鼐
|
6251 |
-
鼓
|
6252 |
-
鼠
|
6253 |
-
鼬
|
6254 |
-
鼹
|
6255 |
-
鼻
|
6256 |
-
鼾
|
6257 |
-
齁
|
6258 |
-
齐
|
6259 |
-
齢
|
6260 |
-
齿
|
6261 |
-
龄
|
6262 |
-
龅
|
6263 |
-
龈
|
6264 |
-
龊
|
6265 |
-
龋
|
6266 |
-
龌
|
6267 |
-
龙
|
6268 |
-
龚
|
6269 |
-
龛
|
6270 |
-
龟
|
6271 |
-
呡
|
6272 |
-
乾
|
6273 |
-
绗
|
6274 |
-
楦
|
6275 |
-
硌
|
6276 |
-
袢
|
6277 |
-
钕
|
6278 |
-
蕞
|
6279 |
-
癀
|
6280 |
-
皲
|
6281 |
-
貉
|
6282 |
-
唛
|
6283 |
-
笕
|
6284 |
-
椴
|
6285 |
-
―
|
6286 |
-
胗
|
6287 |
-
旯
|
6288 |
-
鳙
|
6289 |
-
鲇
|
6290 |
-
鳐
|
6291 |
-
鳜
|
6292 |
-
鲅
|
6293 |
-
鳊
|
6294 |
-
鲳
|
6295 |
-
鲽
|
6296 |
-
鲣
|
6297 |
-
枞
|
6298 |
-
炝
|
6299 |
-
醅
|
6300 |
-
馊
|
6301 |
-
捯
|
6302 |
-
抻
|
6303 |
-
绉
|
6304 |
-
馐
|
6305 |
-
饧
|
6306 |
-
莜
|
6307 |
-
嘬
|
6308 |
-
腘
|
6309 |
-
肫
|
6310 |
-
鳟
|
6311 |
-
镊
|
6312 |
-
犽
|
6313 |
-
洌
|
6314 |
-
蝰
|
6315 |
-
铱
|
6316 |
-
髌
|
6317 |
-
锃
|
6318 |
-
镲
|
6319 |
-
锗
|
6320 |
-
甑
|
6321 |
-
戗
|
6322 |
-
裥
|
6323 |
-
弎
|
6324 |
-
粝
|
6325 |
-
霂
|
6326 |
-
猄
|
6327 |
-
轱
|
6328 |
-
苎
|
6329 |
-
偲
|
6330 |
-
兿
|
6331 |
-
铷
|
6332 |
-
栢
|
6333 |
-
帏
|
6334 |
-
黢
|
6335 |
-
洇
|
6336 |
-
沄
|
6337 |
-
誊
|
6338 |
-
忸
|
6339 |
-
怩
|
6340 |
-
蚬
|
6341 |
-
籺
|
6342 |
-
氚
|
6343 |
-
犇
|
6344 |
-
锒
|
6345 |
-
鸩
|
6346 |
-
噘
|
6347 |
-
偾
|
6348 |
-
髫
|
6349 |
-
︰
|
6350 |
-
︱
|
6351 |
-
︶
|
6352 |
-
︿
|
6353 |
-
﹁
|
6354 |
-
﹂
|
6355 |
-
﹍
|
6356 |
-
﹏
|
6357 |
-
﹐
|
6358 |
-
﹑
|
6359 |
-
﹒
|
6360 |
-
﹔
|
6361 |
-
﹕
|
6362 |
-
﹖
|
6363 |
-
﹗
|
6364 |
-
﹙
|
6365 |
-
﹚
|
6366 |
-
﹝
|
6367 |
-
﹞
|
6368 |
-
﹡
|
6369 |
-
﹣
|
6370 |
-
!
|
6371 |
-
(
|
6372 |
-
)
|
6373 |
-
,
|
6374 |
-
:
|
6375 |
-
;
|
6376 |
-
?
|
6377 |
-
|
|
6378 |
-
~
|
6379 |
-
。
|
6380 |
-
「
|
6381 |
-
」
|
6382 |
-
、
|
6383 |
-
・
|
6384 |
-
ッ
|
6385 |
-
ー
|
6386 |
-
゙
|
6387 |
-
゚
|
6388 |
-
 ̄
|
6389 |
-
¥
|
6390 |
-
...
|
6391 |
-
yam
|
6392 |
-
lofter
|
6393 |
-
##s
|
6394 |
-
by
|
6395 |
-
##0
|
6396 |
-
com
|
6397 |
-
##a
|
6398 |
-
##2
|
6399 |
-
##1
|
6400 |
-
##3
|
6401 |
-
##e
|
6402 |
-
##8
|
6403 |
-
##5
|
6404 |
-
##6
|
6405 |
-
##4
|
6406 |
-
##9
|
6407 |
-
##7
|
6408 |
-
##t
|
6409 |
-
##o
|
6410 |
-
##d
|
6411 |
-
##i
|
6412 |
-
##n
|
6413 |
-
app
|
6414 |
-
www
|
6415 |
-
the
|
6416 |
-
##m
|
6417 |
-
##c
|
6418 |
-
##l
|
6419 |
-
##y
|
6420 |
-
##r
|
6421 |
-
##g
|
6422 |
-
http
|
6423 |
-
qq
|
6424 |
-
##p
|
6425 |
-
##f
|
6426 |
-
google
|
6427 |
-
pixnet
|
6428 |
-
cookies
|
6429 |
-
tripadvisor
|
6430 |
-
##er
|
6431 |
-
##k
|
6432 |
-
##h
|
6433 |
-
facebook
|
6434 |
-
##b
|
6435 |
-
of
|
6436 |
-
##x
|
6437 |
-
##u
|
6438 |
-
iphone
|
6439 |
-
ip
|
6440 |
-
in
|
6441 |
-
##w
|
6442 |
-
##ing
|
6443 |
-
ctrip
|
6444 |
-
##on
|
6445 |
-
##v
|
6446 |
-
to
|
6447 |
-
id
|
6448 |
-
it
|
6449 |
-
windows
|
6450 |
-
llc
|
6451 |
-
top
|
6452 |
-
led
|
6453 |
-
at
|
6454 |
-
##an
|
6455 |
-
##z
|
6456 |
-
android
|
6457 |
-
and
|
6458 |
-
vr
|
6459 |
-
blogthis
|
6460 |
-
twitter
|
6461 |
-
##le
|
6462 |
-
ok
|
6463 |
-
cn
|
6464 |
-
no
|
6465 |
-
ios
|
6466 |
-
##in
|
6467 |
-
##mm
|
6468 |
-
on
|
6469 |
-
te
|
6470 |
-
ig
|
6471 |
-
lv
|
6472 |
-
##ng
|
6473 |
-
##us
|
6474 |
-
pc
|
6475 |
-
──
|
6476 |
-
##te
|
6477 |
-
##ed
|
6478 |
-
html
|
6479 |
-
ncc
|
6480 |
-
wifi
|
6481 |
-
email
|
6482 |
-
blog
|
6483 |
-
is
|
6484 |
-
mail
|
6485 |
-
online
|
6486 |
-
##al
|
6487 |
-
dvd
|
6488 |
-
##ic
|
6489 |
-
studio
|
6490 |
-
##℃
|
6491 |
-
##ia
|
6492 |
-
line
|
6493 |
-
vip
|
6494 |
-
##q
|
6495 |
-
##ce
|
6496 |
-
##en
|
6497 |
-
for
|
6498 |
-
##is
|
6499 |
-
##ra
|
6500 |
-
##es
|
6501 |
-
##j
|
6502 |
-
usb
|
6503 |
-
net
|
6504 |
-
cp
|
6505 |
-
asia
|
6506 |
-
##cm
|
6507 |
-
diy
|
6508 |
-
new
|
6509 |
-
ta
|
6510 |
-
language
|
6511 |
-
vs
|
6512 |
-
apple
|
6513 |
-
tw
|
6514 |
-
web
|
6515 |
-
##ne
|
6516 |
-
ipad
|
6517 |
-
you
|
6518 |
-
##re
|
6519 |
-
##tion
|
6520 |
-
ps
|
6521 |
-
de
|
6522 |
-
bt
|
6523 |
-
pony
|
6524 |
-
atm
|
6525 |
-
##ch
|
6526 |
-
ceo
|
6527 |
-
##or
|
6528 |
-
go
|
6529 |
-
##na
|
6530 |
-
av
|
6531 |
-
pro
|
6532 |
-
cafe
|
6533 |
-
pinterest
|
6534 |
-
pixstyleme3c
|
6535 |
-
##ta
|
6536 |
-
more
|
6537 |
-
said
|
6538 |
-
mp3
|
6539 |
-
##ll
|
6540 |
-
nba
|
6541 |
-
jun
|
6542 |
-
tv
|
6543 |
-
pm
|
6544 |
-
nbsp
|
6545 |
-
##ie
|
6546 |
-
linux
|
6547 |
-
##ma
|
6548 |
-
cd
|
6549 |
-
hd
|
6550 |
-
##ion
|
6551 |
-
am
|
6552 |
-
##th
|
6553 |
-
##st
|
6554 |
-
##se
|
6555 |
-
##et
|
6556 |
-
gdp
|
6557 |
-
my
|
6558 |
-
abc
|
6559 |
-
flash
|
6560 |
-
one
|
6561 |
-
##ck
|
6562 |
-
gps
|
6563 |
-
##ly
|
6564 |
-
web885
|
6565 |
-
##ge
|
6566 |
-
xd
|
6567 |
-
boss
|
6568 |
-
isbn
|
6569 |
-
org
|
6570 |
-
##ry
|
6571 |
-
me
|
6572 |
-
love
|
6573 |
-
##ter
|
6574 |
-
##ar
|
6575 |
-
##la
|
6576 |
-
hotel
|
6577 |
-
pk
|
6578 |
-
ie
|
6579 |
-
##os
|
6580 |
-
##el
|
6581 |
-
seo
|
6582 |
-
cpu
|
6583 |
-
##ml
|
6584 |
-
p2p
|
6585 |
-
may
|
6586 |
-
sun
|
6587 |
-
tue
|
6588 |
-
internet
|
6589 |
-
cc
|
6590 |
-
posted
|
6591 |
-
youtube
|
6592 |
-
##at
|
6593 |
-
##man
|
6594 |
-
ii
|
6595 |
-
abs
|
6596 |
-
nt
|
6597 |
-
pdf
|
6598 |
-
yahoo
|
6599 |
-
ago
|
6600 |
-
##it
|
6601 |
-
news
|
6602 |
-
mac
|
6603 |
-
##me
|
6604 |
-
java
|
6605 |
-
spa
|
6606 |
-
##de
|
6607 |
-
##nt
|
6608 |
-
hk
|
6609 |
-
all
|
6610 |
-
plus
|
6611 |
-
la
|
6612 |
-
##mb
|
6613 |
-
##ve
|
6614 |
-
west
|
6615 |
-
##da
|
6616 |
-
air
|
6617 |
-
##ps
|
6618 |
-
##to
|
6619 |
-
logo
|
6620 |
-
htc
|
6621 |
-
php
|
6622 |
-
https
|
6623 |
-
fi
|
6624 |
-
momo
|
6625 |
-
##son
|
6626 |
-
sat
|
6627 |
-
##ke
|
6628 |
-
ebd
|
6629 |
-
suv
|
6630 |
-
wi
|
6631 |
-
day
|
6632 |
-
apk
|
6633 |
-
##um
|
6634 |
-
mv
|
6635 |
-
galaxy
|
6636 |
-
wiki
|
6637 |
-
or
|
6638 |
-
brake
|
6639 |
-
this
|
6640 |
-
mon
|
6641 |
-
po
|
6642 |
-
javascript
|
6643 |
-
life
|
6644 |
-
home
|
6645 |
-
june
|
6646 |
-
##ss
|
6647 |
-
system
|
6648 |
-
pp
|
6649 |
-
world
|
6650 |
-
fb
|
6651 |
-
br
|
6652 |
-
##as
|
6653 |
-
ic
|
6654 |
-
ai
|
6655 |
-
leonardo
|
6656 |
-
safari
|
6657 |
-
live
|
6658 |
-
free
|
6659 |
-
xx
|
6660 |
-
wed
|
6661 |
-
win7
|
6662 |
-
kiehl
|
6663 |
-
##co
|
6664 |
-
lg
|
6665 |
-
o2o
|
6666 |
-
##go
|
6667 |
-
us
|
6668 |
-
mm
|
6669 |
-
vfm
|
6670 |
-
kanye
|
6671 |
-
##id
|
6672 |
-
jr
|
6673 |
-
##ey
|
6674 |
-
rss
|
6675 |
-
##sa
|
6676 |
-
##ro
|
6677 |
-
##am
|
6678 |
-
##no
|
6679 |
-
thu
|
6680 |
-
fri
|
6681 |
-
##sh
|
6682 |
-
##ki
|
6683 |
-
comments
|
6684 |
-
name
|
6685 |
-
##pe
|
6686 |
-
##ine
|
6687 |
-
max
|
6688 |
-
uber
|
6689 |
-
##mi
|
6690 |
-
##ton
|
6691 |
-
wordpress
|
6692 |
-
office
|
6693 |
-
##ment
|
6694 |
-
bd
|
6695 |
-
win10
|
6696 |
-
##ld
|
6697 |
-
##li
|
6698 |
-
gmail
|
6699 |
-
bb
|
6700 |
-
dior
|
6701 |
-
##rs
|
6702 |
-
##ri
|
6703 |
-
##rd
|
6704 |
-
up
|
6705 |
-
cad
|
6706 |
-
dr
|
6707 |
-
read
|
6708 |
-
##io
|
6709 |
-
url
|
6710 |
-
pvc
|
6711 |
-
paypal
|
6712 |
-
show
|
6713 |
-
policy
|
6714 |
-
##ty
|
6715 |
-
with
|
6716 |
-
txt
|
6717 |
-
##ba
|
6718 |
-
dna
|
6719 |
-
from
|
6720 |
-
post
|
6721 |
-
mini
|
6722 |
-
ar
|
6723 |
-
taiwan
|
6724 |
-
john
|
6725 |
-
##ga
|
6726 |
-
privacy
|
6727 |
-
agoda
|
6728 |
-
##ny
|
6729 |
-
word
|
6730 |
-
##by
|
6731 |
-
##ur
|
6732 |
-
##hz
|
6733 |
-
##ang
|
6734 |
-
cookie
|
6735 |
-
netscape
|
6736 |
-
##ka
|
6737 |
-
##~
|
6738 |
-
##ad
|
6739 |
-
house
|
6740 |
-
share
|
6741 |
-
note
|
6742 |
-
ibm
|
6743 |
-
code
|
6744 |
-
hello
|
6745 |
-
nike
|
6746 |
-
sim
|
6747 |
-
survey
|
6748 |
-
wikia
|
6749 |
-
cbc
|
6750 |
-
##tor
|
6751 |
-
##kg
|
6752 |
-
##rt
|
6753 |
-
campaign
|
6754 |
-
store
|
6755 |
-
os
|
6756 |
-
##ct
|
6757 |
-
##ts
|
6758 |
-
##°
|
6759 |
-
api
|
6760 |
-
##ns
|
6761 |
-
excel
|
6762 |
-
##ao
|
6763 |
-
##nd
|
6764 |
-
university
|
6765 |
-
##ya
|
6766 |
-
##il
|
6767 |
-
pierre
|
6768 |
-
ipo
|
6769 |
-
hotels
|
6770 |
-
##ian
|
6771 |
-
years
|
6772 |
-
##ers
|
6773 |
-
high
|
6774 |
-
##day
|
6775 |
-
time
|
6776 |
-
##ay
|
6777 |
-
bug
|
6778 |
-
##line
|
6779 |
-
##be
|
6780 |
-
xp
|
6781 |
-
talk2yam
|
6782 |
-
yamservice
|
6783 |
-
coco
|
6784 |
-
##dy
|
6785 |
-
sony
|
6786 |
-
##ies
|
6787 |
-
microsoft
|
6788 |
-
david
|
6789 |
-
people
|
6790 |
-
##ha
|
6791 |
-
instagram
|
6792 |
-
intel
|
6793 |
-
##ot
|
6794 |
-
iso
|
6795 |
-
##va
|
6796 |
-
##mo
|
6797 |
-
##land
|
6798 |
-
xxx
|
6799 |
-
man
|
6800 |
-
co
|
6801 |
-
ltxsw
|
6802 |
-
##ation
|
6803 |
-
baby
|
6804 |
-
##pa
|
6805 |
-
##ol
|
6806 |
-
tag
|
6807 |
-
##ue
|
6808 |
-
msn
|
6809 |
-
oppo
|
6810 |
-
##ca
|
6811 |
-
control
|
6812 |
-
##om
|
6813 |
-
st
|
6814 |
-
chrome
|
6815 |
-
##ure
|
6816 |
-
be
|
6817 |
-
lol
|
6818 |
-
##bo
|
6819 |
-
lady
|
6820 |
-
##way
|
6821 |
-
##ko
|
6822 |
-
##do
|
6823 |
-
##un
|
6824 |
-
corporation
|
6825 |
-
##ni
|
6826 |
-
herme
|
6827 |
-
##up
|
6828 |
-
ui
|
6829 |
-
##ds
|
6830 |
-
ppt
|
6831 |
-
admin
|
6832 |
-
three
|
6833 |
-
bbc
|
6834 |
-
re
|
6835 |
-
ca
|
6836 |
-
hp
|
6837 |
-
##ee
|
6838 |
-
tpp
|
6839 |
-
##ive
|
6840 |
-
root
|
6841 |
-
##cc
|
6842 |
-
##ble
|
6843 |
-
##ity
|
6844 |
-
adobe
|
6845 |
-
park
|
6846 |
-
et
|
6847 |
-
oled
|
6848 |
-
city
|
6849 |
-
##ex
|
6850 |
-
##ler
|
6851 |
-
##ap
|
6852 |
-
china
|
6853 |
-
##book
|
6854 |
-
view
|
6855 |
-
##ice
|
6856 |
-
global
|
6857 |
-
##km
|
6858 |
-
your
|
6859 |
-
hong
|
6860 |
-
##mg
|
6861 |
-
out
|
6862 |
-
##ms
|
6863 |
-
ng
|
6864 |
-
ebay
|
6865 |
-
menu
|
6866 |
-
ubuntu
|
6867 |
-
##cy
|
6868 |
-
rom
|
6869 |
-
##view
|
6870 |
-
open
|
6871 |
-
ktv
|
6872 |
-
do
|
6873 |
-
server
|
6874 |
-
##lo
|
6875 |
-
if
|
6876 |
-
english
|
6877 |
-
##oo
|
6878 |
-
step1
|
6879 |
-
kong
|
6880 |
-
club
|
6881 |
-
july
|
6882 |
-
inc
|
6883 |
-
mr
|
6884 |
-
hi
|
6885 |
-
##net
|
6886 |
-
touch
|
6887 |
-
##ls
|
6888 |
-
##ii
|
6889 |
-
michael
|
6890 |
-
lcd
|
6891 |
-
phone
|
6892 |
-
james
|
6893 |
-
step2
|
6894 |
-
ios9
|
6895 |
-
##box
|
6896 |
-
dc
|
6897 |
-
##ley
|
6898 |
-
samsung
|
6899 |
-
pokemon
|
6900 |
-
css
|
6901 |
-
##ent
|
6902 |
-
##les
|
6903 |
-
s8
|
6904 |
-
atom
|
6905 |
-
play
|
6906 |
-
bmw
|
6907 |
-
##said
|
6908 |
-
sa
|
6909 |
-
etf
|
6910 |
-
ctrl
|
6911 |
-
adidas
|
6912 |
-
amazon
|
6913 |
-
##ber
|
6914 |
-
##ner
|
6915 |
-
visa
|
6916 |
-
##der
|
6917 |
-
connectivity
|
6918 |
-
##hi
|
6919 |
-
firefox
|
6920 |
-
hr
|
6921 |
-
so
|
6922 |
-
style
|
6923 |
-
mark
|
6924 |
-
pop
|
6925 |
-
ol
|
6926 |
-
skip
|
6927 |
-
as
|
6928 |
-
##ir
|
6929 |
-
mba
|
6930 |
-
##ai
|
6931 |
-
le
|
6932 |
-
##ver
|
6933 |
-
cafe2017
|
6934 |
-
lte
|
6935 |
-
super
|
6936 |
-
##ron
|
6937 |
-
amd
|
6938 |
-
like
|
6939 |
-
are
|
6940 |
-
##ster
|
6941 |
-
we
|
6942 |
-
##sk
|
6943 |
-
paul
|
6944 |
-
data
|
6945 |
-
international
|
6946 |
-
##ft
|
6947 |
-
longchamp
|
6948 |
-
ssd
|
6949 |
-
good
|
6950 |
-
##ti
|
6951 |
-
reply
|
6952 |
-
##my
|
6953 |
-
apr
|
6954 |
-
star
|
6955 |
-
##ker
|
6956 |
-
source
|
6957 |
-
js
|
6958 |
-
get
|
6959 |
-
force
|
6960 |
-
photo
|
6961 |
-
##one
|
6962 |
-
##ow
|
6963 |
-
link
|
6964 |
-
bbs
|
6965 |
-
goods
|
6966 |
-
##lin
|
6967 |
-
python
|
6968 |
-
##ip
|
6969 |
-
game
|
6970 |
-
##ics
|
6971 |
-
blue
|
6972 |
-
page
|
6973 |
-
itunes
|
6974 |
-
gt
|
6975 |
-
gif
|
6976 |
-
##ff
|
6977 |
-
group
|
6978 |
-
about
|
6979 |
-
bar
|
6980 |
-
ganji
|
6981 |
-
##nce
|
6982 |
-
music
|
6983 |
-
lee
|
6984 |
-
not
|
6985 |
-
##per
|
6986 |
-
an
|
6987 |
-
faq
|
6988 |
-
comment
|
6989 |
-
days
|
6990 |
-
##ock
|
6991 |
-
##bs
|
6992 |
-
v1
|
6993 |
-
player
|
6994 |
-
xbox
|
6995 |
-
sql
|
6996 |
-
fm
|
6997 |
-
f1
|
6998 |
-
##ah
|
6999 |
-
##lv
|
7000 |
-
##mp
|
7001 |
-
melody
|
7002 |
-
xml
|
7003 |
-
market
|
7004 |
-
##au
|
7005 |
-
what
|
7006 |
-
gl
|
7007 |
-
##age
|
7008 |
-
tips
|
7009 |
-
book
|
7010 |
-
##ting
|
7011 |
-
mysql
|
7012 |
-
can
|
7013 |
-
##ung
|
7014 |
-
wonderland
|
7015 |
-
watch
|
7016 |
-
##ction
|
7017 |
-
mar
|
7018 |
-
mobile
|
7019 |
-
article
|
7020 |
-
##db
|
7021 |
-
part
|
7022 |
-
party
|
7023 |
-
##ore
|
7024 |
-
##op
|
7025 |
-
dj
|
7026 |
-
main
|
7027 |
-
##ong
|
7028 |
-
art
|
7029 |
-
ad
|
7030 |
-
pm2
|
7031 |
-
japan
|
7032 |
-
ts
|
7033 |
-
##ica
|
7034 |
-
der
|
7035 |
-
sm
|
7036 |
-
##wa
|
7037 |
-
ct
|
7038 |
-
homemesh
|
7039 |
-
search
|
7040 |
-
##tv
|
7041 |
-
##di
|
7042 |
-
macbook
|
7043 |
-
service
|
7044 |
-
type
|
7045 |
-
##ier
|
7046 |
-
##si
|
7047 |
-
##ok
|
7048 |
-
best
|
7049 |
-
goris
|
7050 |
-
lock
|
7051 |
-
cf
|
7052 |
-
big
|
7053 |
-
##ut
|
7054 |
-
ftp
|
7055 |
-
carol
|
7056 |
-
##vi
|
7057 |
-
happy
|
7058 |
-
sd
|
7059 |
-
##ac
|
7060 |
-
anti
|
7061 |
-
pe
|
7062 |
-
cnn
|
7063 |
-
iii
|
7064 |
-
esp
|
7065 |
-
jan
|
7066 |
-
tags
|
7067 |
-
august
|
7068 |
-
vol
|
7069 |
-
##fs
|
7070 |
-
##sion
|
7071 |
-
design
|
7072 |
-
ac
|
7073 |
-
press
|
7074 |
-
jordan
|
7075 |
-
ppp
|
7076 |
-
that
|
7077 |
-
key
|
7078 |
-
check
|
7079 |
-
##tt
|
7080 |
-
##㎡
|
7081 |
-
##lt
|
7082 |
-
power
|
7083 |
-
##bc
|
7084 |
-
vivi
|
7085 |
-
he
|
7086 |
-
jpg
|
7087 |
-
##rry
|
7088 |
-
nb
|
7089 |
-
##ted
|
7090 |
-
##rn
|
7091 |
-
usd
|
7092 |
-
##t00
|
7093 |
-
master
|
7094 |
-
model
|
7095 |
-
al
|
7096 |
-
ram
|
7097 |
-
goo
|
7098 |
-
##ui
|
7099 |
-
red
|
7100 |
-
##ary
|
7101 |
-
rpg
|
7102 |
-
item
|
7103 |
-
##pm
|
7104 |
-
##za
|
7105 |
-
project
|
7106 |
-
hot
|
7107 |
-
td
|
7108 |
-
blogabstract
|
7109 |
-
##ger
|
7110 |
-
gr2
|
7111 |
-
black
|
7112 |
-
electronic
|
7113 |
-
nfc
|
7114 |
-
year
|
7115 |
-
asus
|
7116 |
-
html5
|
7117 |
-
cindy
|
7118 |
-
##hd
|
7119 |
-
m3
|
7120 |
-
esc
|
7121 |
-
##od
|
7122 |
-
booking
|
7123 |
-
fed
|
7124 |
-
tvb
|
7125 |
-
##ina
|
7126 |
-
mit
|
7127 |
-
chan
|
7128 |
-
distribution
|
7129 |
-
next
|
7130 |
-
peter
|
7131 |
-
bios
|
7132 |
-
steam
|
7133 |
-
cm
|
7134 |
-
pk10
|
7135 |
-
##ix
|
7136 |
-
dec
|
7137 |
-
nasa
|
7138 |
-
##ana
|
7139 |
-
icecat
|
7140 |
-
b1
|
7141 |
-
will
|
7142 |
-
li
|
7143 |
-
se
|
7144 |
-
##ji
|
7145 |
-
##ard
|
7146 |
-
oct
|
7147 |
-
##ain
|
7148 |
-
jp
|
7149 |
-
##ze
|
7150 |
-
##bi
|
7151 |
-
cio
|
7152 |
-
smart
|
7153 |
-
h5
|
7154 |
-
##port
|
7155 |
-
curve
|
7156 |
-
vpn
|
7157 |
-
##nm
|
7158 |
-
##dia
|
7159 |
-
utc
|
7160 |
-
rmvb
|
7161 |
-
chanel
|
7162 |
-
a4
|
7163 |
-
miss
|
7164 |
-
##and
|
7165 |
-
##im
|
7166 |
-
media
|
7167 |
-
who
|
7168 |
-
she
|
7169 |
-
girl
|
7170 |
-
vera
|
7171 |
-
class
|
7172 |
-
vivo
|
7173 |
-
king
|
7174 |
-
##ei
|
7175 |
-
national
|
7176 |
-
ab
|
7177 |
-
ipod
|
7178 |
-
ap
|
7179 |
-
ms
|
7180 |
-
mp4
|
7181 |
-
msci
|
7182 |
-
##po
|
7183 |
-
mg
|
7184 |
-
index
|
7185 |
-
##bit
|
7186 |
-
##out
|
7187 |
-
##zz
|
7188 |
-
apec
|
7189 |
-
photoshop
|
7190 |
-
opec
|
7191 |
-
##tes
|
7192 |
-
##ast
|
7193 |
-
○○
|
7194 |
-
##ling
|
7195 |
-
##ory
|
7196 |
-
##ical
|
7197 |
-
kitty
|
7198 |
-
content
|
7199 |
-
step3
|
7200 |
-
##cn
|
7201 |
-
win8
|
7202 |
-
vc
|
7203 |
-
iphone7
|
7204 |
-
robert
|
7205 |
-
tcl
|
7206 |
-
beauty
|
7207 |
-
en
|
7208 |
-
dollars
|
7209 |
-
##ys
|
7210 |
-
##oc
|
7211 |
-
step
|
7212 |
-
pay
|
7213 |
-
yy
|
7214 |
-
a1
|
7215 |
-
##lly
|
7216 |
-
##ks
|
7217 |
-
download
|
7218 |
-
sep
|
7219 |
-
exe
|
7220 |
-
ph
|
7221 |
-
school
|
7222 |
-
gb
|
7223 |
-
center
|
7224 |
-
pr
|
7225 |
-
street
|
7226 |
-
##board
|
7227 |
-
uv
|
7228 |
-
##lan
|
7229 |
-
winrar
|
7230 |
-
##que
|
7231 |
-
##ua
|
7232 |
-
##com
|
7233 |
-
gpu
|
7234 |
-
ettoday
|
7235 |
-
fu
|
7236 |
-
tom
|
7237 |
-
##ren
|
7238 |
-
##via
|
7239 |
-
b2b
|
7240 |
-
##tch
|
7241 |
-
rose
|
7242 |
-
arm
|
7243 |
-
mb
|
7244 |
-
##ial
|
7245 |
-
##nn
|
7246 |
-
nvidia
|
7247 |
-
step4
|
7248 |
-
mvp
|
7249 |
-
york
|
7250 |
-
how
|
7251 |
-
cpi
|
7252 |
-
gov
|
7253 |
-
kg
|
7254 |
-
joe
|
7255 |
-
##xx
|
7256 |
-
mandy
|
7257 |
-
pa
|
7258 |
-
##ser
|
7259 |
-
copyright
|
7260 |
-
fashion
|
7261 |
-
don
|
7262 |
-
ecu
|
7263 |
-
##ist
|
7264 |
-
##art
|
7265 |
-
erp
|
7266 |
-
wap
|
7267 |
-
have
|
7268 |
-
##lm
|
7269 |
-
talk
|
7270 |
-
##ek
|
7271 |
-
##ning
|
7272 |
-
##if
|
7273 |
-
ch
|
7274 |
-
##ite
|
7275 |
-
video
|
7276 |
-
cs
|
7277 |
-
san
|
7278 |
-
iot
|
7279 |
-
look
|
7280 |
-
##ku
|
7281 |
-
october
|
7282 |
-
##ux
|
7283 |
-
trump
|
7284 |
-
##hs
|
7285 |
-
##ide
|
7286 |
-
box
|
7287 |
-
first
|
7288 |
-
##ins
|
7289 |
-
april
|
7290 |
-
##ight
|
7291 |
-
angel
|
7292 |
-
protected
|
7293 |
-
aa
|
7294 |
-
x1
|
7295 |
-
m2
|
7296 |
-
##fe
|
7297 |
-
##×
|
7298 |
-
##ho
|
7299 |
-
size
|
7300 |
-
min
|
7301 |
-
ofo
|
7302 |
-
fun
|
7303 |
-
gomaji
|
7304 |
-
ex
|
7305 |
-
hdmi
|
7306 |
-
food
|
7307 |
-
dns
|
7308 |
-
march
|
7309 |
-
chris
|
7310 |
-
kevin
|
7311 |
-
##lla
|
7312 |
-
##pp
|
7313 |
-
##ec
|
7314 |
-
ag
|
7315 |
-
ems
|
7316 |
-
##rm
|
7317 |
-
##ham
|
7318 |
-
off
|
7319 |
-
asp
|
7320 |
-
team
|
7321 |
-
fandom
|
7322 |
-
ed
|
7323 |
-
##ell
|
7324 |
-
info
|
7325 |
-
sina
|
7326 |
-
##able
|
7327 |
-
##ctor
|
7328 |
-
dll
|
7329 |
-
rights
|
7330 |
-
ltd
|
7331 |
-
idc
|
7332 |
-
jul
|
7333 |
-
ma
|
7334 |
-
surface
|
7335 |
-
mall
|
7336 |
-
eps
|
7337 |
-
green
|
7338 |
-
map
|
7339 |
-
space
|
7340 |
-
donald
|
7341 |
-
v2
|
7342 |
-
sodu
|
7343 |
-
##light
|
7344 |
-
reserved
|
7345 |
-
htm
|
7346 |
-
##han
|
7347 |
-
mod
|
7348 |
-
##ise
|
7349 |
-
##tions
|
7350 |
-
ti
|
7351 |
-
##shi
|
7352 |
-
doc
|
7353 |
-
icp
|
7354 |
-
wang
|
7355 |
-
##ram
|
7356 |
-
shopping
|
7357 |
-
aug
|
7358 |
-
##pi
|
7359 |
-
##well
|
7360 |
-
now
|
7361 |
-
wam
|
7362 |
-
b2
|
7363 |
-
##hu
|
7364 |
-
##gb
|
7365 |
-
f2
|
7366 |
-
mix
|
7367 |
-
##ef
|
7368 |
-
##uan
|
7369 |
-
bwl
|
7370 |
-
##plus
|
7371 |
-
##res
|
7372 |
-
core
|
7373 |
-
##ess
|
7374 |
-
tea
|
7375 |
-
hktvmall
|
7376 |
-
nhk
|
7377 |
-
##ate
|
7378 |
-
list
|
7379 |
-
##ese
|
7380 |
-
feb
|
7381 |
-
inn
|
7382 |
-
nov
|
7383 |
-
daniel
|
7384 |
-
##ci
|
7385 |
-
pass
|
7386 |
-
##bet
|
7387 |
-
##nk
|
7388 |
-
coffee
|
7389 |
-
ssl
|
7390 |
-
airbnb
|
7391 |
-
##ute
|
7392 |
-
fbi
|
7393 |
-
woshipm
|
7394 |
-
skype
|
7395 |
-
ea
|
7396 |
-
cg
|
7397 |
-
sp
|
7398 |
-
##fc
|
7399 |
-
##www
|
7400 |
-
yes
|
7401 |
-
edge
|
7402 |
-
alt
|
7403 |
-
fpga
|
7404 |
-
##ght
|
7405 |
-
##gs
|
7406 |
-
iso9001
|
7407 |
-
##ile
|
7408 |
-
##wood
|
7409 |
-
##uo
|
7410 |
-
image
|
7411 |
-
lin
|
7412 |
-
icon
|
7413 |
-
american
|
7414 |
-
##em
|
7415 |
-
set
|
7416 |
-
says
|
7417 |
-
##king
|
7418 |
-
##tive
|
7419 |
-
blogger
|
7420 |
-
##ox
|
7421 |
-
##zy
|
7422 |
-
##red
|
7423 |
-
##ium
|
7424 |
-
##lf
|
7425 |
-
nokia
|
7426 |
-
claire
|
7427 |
-
##ding
|
7428 |
-
november
|
7429 |
-
lohas
|
7430 |
-
##tic
|
7431 |
-
##cs
|
7432 |
-
##che
|
7433 |
-
##ire
|
7434 |
-
##gy
|
7435 |
-
##ult
|
7436 |
-
db
|
7437 |
-
january
|
7438 |
-
win
|
7439 |
-
road
|
7440 |
-
ptt
|
7441 |
-
##fa
|
7442 |
-
##mer
|
7443 |
-
anna
|
7444 |
-
pchome
|
7445 |
-
udn
|
7446 |
-
ef
|
7447 |
-
##time
|
7448 |
-
##tte
|
7449 |
-
g20
|
7450 |
-
white
|
7451 |
-
garden
|
7452 |
-
eleven
|
7453 |
-
di
|
7454 |
-
chen
|
7455 |
-
young
|
7456 |
-
cosplay
|
7457 |
-
bat
|
7458 |
-
##tra
|
7459 |
-
kindle
|
7460 |
-
npc
|
7461 |
-
steve
|
7462 |
-
etc
|
7463 |
-
##ern
|
7464 |
-
call
|
7465 |
-
xperia
|
7466 |
-
ces
|
7467 |
-
travel
|
7468 |
-
sk
|
7469 |
-
s7
|
7470 |
-
##ous
|
7471 |
-
##int
|
7472 |
-
edu
|
7473 |
-
file
|
7474 |
-
cho
|
7475 |
-
qr
|
7476 |
-
##car
|
7477 |
-
##our
|
7478 |
-
##ant
|
7479 |
-
eric
|
7480 |
-
rends
|
7481 |
-
##jo
|
7482 |
-
mastercard
|
7483 |
-
kb
|
7484 |
-
##min
|
7485 |
-
##ino
|
7486 |
-
vista
|
7487 |
-
##ris
|
7488 |
-
##ud
|
7489 |
-
jack
|
7490 |
-
##set
|
7491 |
-
pos
|
7492 |
-
##her
|
7493 |
-
##ou
|
7494 |
-
taipei
|
7495 |
-
beta
|
7496 |
-
##fi
|
7497 |
-
express
|
7498 |
-
body
|
7499 |
-
##ill
|
7500 |
-
aphojoy
|
7501 |
-
user
|
7502 |
-
december
|
7503 |
-
meiki
|
7504 |
-
##ick
|
7505 |
-
tweet
|
7506 |
-
richard
|
7507 |
-
##av
|
7508 |
-
iphone6
|
7509 |
-
##dd
|
7510 |
-
views
|
7511 |
-
##mark
|
7512 |
-
pd
|
7513 |
-
times
|
7514 |
-
level
|
7515 |
-
##ash
|
7516 |
-
point
|
7517 |
-
##ome
|
7518 |
-
koreanmall
|
7519 |
-
##ak
|
7520 |
-
george
|
7521 |
-
q2
|
7522 |
-
wma
|
7523 |
-
tcp
|
7524 |
-
full
|
7525 |
-
mlb
|
7526 |
-
##lle
|
7527 |
-
##watch
|
7528 |
-
tm
|
7529 |
-
run
|
7530 |
-
smith
|
7531 |
-
business
|
7532 |
-
##und
|
7533 |
-
color
|
7534 |
-
##tal
|
7535 |
-
##less
|
7536 |
-
moon
|
7537 |
-
##rl
|
7538 |
-
update
|
7539 |
-
pcb
|
7540 |
-
shop
|
7541 |
-
little
|
7542 |
-
end
|
7543 |
-
##mhz
|
7544 |
-
van
|
7545 |
-
dsp
|
7546 |
-
easy
|
7547 |
-
##house
|
7548 |
-
##key
|
7549 |
-
history
|
7550 |
-
oh
|
7551 |
-
##hy
|
7552 |
-
##web
|
7553 |
-
oem
|
7554 |
-
let
|
7555 |
-
was
|
7556 |
-
##gg
|
7557 |
-
review
|
7558 |
-
##wan
|
7559 |
-
##°c
|
7560 |
-
uc
|
7561 |
-
title
|
7562 |
-
##val
|
7563 |
-
united
|
7564 |
-
##ons
|
7565 |
-
doi
|
7566 |
-
trivago
|
7567 |
-
overdope
|
7568 |
-
sbs
|
7569 |
-
##ance
|
7570 |
-
grand
|
7571 |
-
special
|
7572 |
-
imf
|
7573 |
-
wx17house
|
7574 |
-
##so
|
7575 |
-
audi
|
7576 |
-
##he
|
7577 |
-
london
|
7578 |
-
william
|
7579 |
-
##rp
|
7580 |
-
##ake
|
7581 |
-
science
|
7582 |
-
beach
|
7583 |
-
cfa
|
7584 |
-
amp
|
7585 |
-
ps4
|
7586 |
-
##link
|
7587 |
-
##hp
|
7588 |
-
crm
|
7589 |
-
ferragamo
|
7590 |
-
bell
|
7591 |
-
make
|
7592 |
-
##eng
|
7593 |
-
under
|
7594 |
-
zh
|
7595 |
-
photos
|
7596 |
-
##style
|
7597 |
-
via
|
7598 |
-
da
|
7599 |
-
##gi
|
7600 |
-
company
|
7601 |
-
i7
|
7602 |
-
##ray
|
7603 |
-
thomas
|
7604 |
-
ufo
|
7605 |
-
i5
|
7606 |
-
##max
|
7607 |
-
plc
|
7608 |
-
ben
|
7609 |
-
back
|
7610 |
-
research
|
7611 |
-
mike
|
7612 |
-
##pc
|
7613 |
-
september
|
7614 |
-
##ace
|
7615 |
-
vps
|
7616 |
-
february
|
7617 |
-
pantos
|
7618 |
-
wp
|
7619 |
-
lisa
|
7620 |
-
jquery
|
7621 |
-
night
|
7622 |
-
long
|
7623 |
-
offer
|
7624 |
-
##berg
|
7625 |
-
##news
|
7626 |
-
ray
|
7627 |
-
fks
|
7628 |
-
wto
|
7629 |
-
over
|
7630 |
-
##all
|
7631 |
-
##rus
|
7632 |
-
##works
|
7633 |
-
blogtitle
|
7634 |
-
loftpermalink
|
7635 |
-
martin
|
7636 |
-
test
|
7637 |
-
ling
|
7638 |
-
km
|
7639 |
-
fda
|
7640 |
-
v3
|
7641 |
-
##ja
|
7642 |
-
outlet
|
7643 |
-
family
|
7644 |
-
##ea
|
7645 |
-
##top
|
7646 |
-
story
|
7647 |
-
##ness
|
7648 |
-
salvatore
|
7649 |
-
##lu
|
7650 |
-
swift
|
7651 |
-
room
|
7652 |
-
oracle
|
7653 |
-
##ul
|
7654 |
-
sam
|
7655 |
-
b2c
|
7656 |
-
week
|
7657 |
-
pi
|
7658 |
-
rock
|
7659 |
-
##ean
|
7660 |
-
##gle
|
7661 |
-
cctv
|
7662 |
-
after
|
7663 |
-
chinese
|
7664 |
-
##back
|
7665 |
-
powered
|
7666 |
-
x2
|
7667 |
-
##tan
|
7668 |
-
##nes
|
7669 |
-
canon
|
7670 |
-
only
|
7671 |
-
##zi
|
7672 |
-
##las
|
7673 |
-
say
|
7674 |
-
##oe
|
7675 |
-
##sd
|
7676 |
-
##bot
|
7677 |
-
##world
|
7678 |
-
##zo
|
7679 |
-
sky
|
7680 |
-
made
|
7681 |
-
top100
|
7682 |
-
just
|
7683 |
-
pmi
|
7684 |
-
gap
|
7685 |
-
##vr
|
7686 |
-
les
|
7687 |
-
ball
|
7688 |
-
vogue
|
7689 |
-
vi
|
7690 |
-
ing
|
7691 |
-
ofweek
|
7692 |
-
cos
|
7693 |
-
##list
|
7694 |
-
##ort
|
7695 |
-
##lon
|
7696 |
-
last
|
7697 |
-
##tc
|
7698 |
-
##of
|
7699 |
-
##bus
|
7700 |
-
##gen
|
7701 |
-
real
|
7702 |
-
eva
|
7703 |
-
a3
|
7704 |
-
nas
|
7705 |
-
##lie
|
7706 |
-
##ria
|
7707 |
-
##coin
|
7708 |
-
##bt
|
7709 |
-
his
|
7710 |
-
cat
|
7711 |
-
nata
|
7712 |
-
vive
|
7713 |
-
health
|
7714 |
-
drive
|
7715 |
-
sir
|
7716 |
-
du
|
7717 |
-
cup
|
7718 |
-
##ook
|
7719 |
-
##sy
|
7720 |
-
alex
|
7721 |
-
msg
|
7722 |
-
tour
|
7723 |
-
##word
|
7724 |
-
ebooks
|
7725 |
-
r8
|
7726 |
-
block
|
7727 |
-
nice
|
7728 |
-
pvp
|
7729 |
-
months
|
7730 |
-
rewards
|
7731 |
-
##ther
|
7732 |
-
##xi
|
7733 |
-
##sc
|
7734 |
-
micro
|
7735 |
-
gg
|
7736 |
-
blogfp
|
7737 |
-
op
|
7738 |
-
daily
|
7739 |
-
m1
|
7740 |
-
true
|
7741 |
-
##bb
|
7742 |
-
ml
|
7743 |
-
##tar
|
7744 |
-
##ky
|
7745 |
-
anthony
|
7746 |
-
##yo
|
7747 |
-
state
|
7748 |
-
##ara
|
7749 |
-
##aa
|
7750 |
-
##rc
|
7751 |
-
##tz
|
7752 |
-
##ston
|
7753 |
-
gear
|
7754 |
-
##eo
|
7755 |
-
##ade
|
7756 |
-
ge
|
7757 |
-
see
|
7758 |
-
##win
|
7759 |
-
##ura
|
7760 |
-
ss
|
7761 |
-
heart
|
7762 |
-
##den
|
7763 |
-
##ita
|
7764 |
-
down
|
7765 |
-
##sm
|
7766 |
-
el
|
7767 |
-
png
|
7768 |
-
rakuten
|
7769 |
-
whatsapp
|
7770 |
-
bay
|
7771 |
-
dream
|
7772 |
-
add
|
7773 |
-
##use
|
7774 |
-
pad
|
7775 |
-
gucci
|
7776 |
-
mpv
|
7777 |
-
##ode
|
7778 |
-
##fo
|
7779 |
-
island
|
7780 |
-
jason
|
7781 |
-
chicago
|
7782 |
-
##hone
|
7783 |
-
io
|
7784 |
-
sogo
|
7785 |
-
be2
|
7786 |
-
##ology
|
7787 |
-
cloud
|
7788 |
-
vcd
|
7789 |
-
##con
|
7790 |
-
##ford
|
7791 |
-
##joy
|
7792 |
-
##kb
|
7793 |
-
##rade
|
7794 |
-
but
|
7795 |
-
##ach
|
7796 |
-
docker
|
7797 |
-
##ful
|
7798 |
-
rfid
|
7799 |
-
ul
|
7800 |
-
##ase
|
7801 |
-
hit
|
7802 |
-
ford
|
7803 |
-
##star
|
7804 |
-
a2
|
7805 |
-
sdk
|
7806 |
-
reading
|
7807 |
-
edited
|
7808 |
-
##are
|
7809 |
-
cmos
|
7810 |
-
##mc
|
7811 |
-
siri
|
7812 |
-
light
|
7813 |
-
##ella
|
7814 |
-
bloomberg
|
7815 |
-
##read
|
7816 |
-
pizza
|
7817 |
-
##ison
|
7818 |
-
jimmy
|
7819 |
-
##vm
|
7820 |
-
college
|
7821 |
-
node
|
7822 |
-
journal
|
7823 |
-
ba
|
7824 |
-
##play
|
7825 |
-
##cer
|
7826 |
-
magic
|
7827 |
-
##yu
|
7828 |
-
jump
|
7829 |
-
tt
|
7830 |
-
##ings
|
7831 |
-
asr
|
7832 |
-
##lia
|
7833 |
-
step5
|
7834 |
-
network
|
7835 |
-
##cd
|
7836 |
-
mc
|
7837 |
-
pixstyleme
|
7838 |
-
money
|
7839 |
-
bl
|
7840 |
-
act
|
7841 |
-
##tus
|
7842 |
-
tokyo
|
7843 |
-
##rial
|
7844 |
-
##life
|
7845 |
-
emba
|
7846 |
-
##ae
|
7847 |
-
saas
|
7848 |
-
tcs
|
7849 |
-
##rk
|
7850 |
-
##wang
|
7851 |
-
summer
|
7852 |
-
##sp
|
7853 |
-
ko
|
7854 |
-
##ving
|
7855 |
-
premium
|
7856 |
-
netflix
|
7857 |
-
uk
|
7858 |
-
mt
|
7859 |
-
##lton
|
7860 |
-
right
|
7861 |
-
frank
|
7862 |
-
two
|
7863 |
-
##ple
|
7864 |
-
##cal
|
7865 |
-
##sen
|
7866 |
-
##ville
|
7867 |
-
hold
|
7868 |
-
nexus
|
7869 |
-
dd
|
7870 |
-
##ius
|
7871 |
-
##mah
|
7872 |
-
tila
|
7873 |
-
zero
|
7874 |
-
ce
|
7875 |
-
##tin
|
7876 |
-
resort
|
7877 |
-
##ws
|
7878 |
-
charles
|
7879 |
-
old
|
7880 |
-
p10
|
7881 |
-
report
|
7882 |
-
##ru
|
7883 |
-
bus
|
7884 |
-
vans
|
7885 |
-
lt
|
7886 |
-
##est
|
7887 |
-
pv
|
7888 |
-
links
|
7889 |
-
rebecca
|
7890 |
-
##dm
|
7891 |
-
azure
|
7892 |
-
limited
|
7893 |
-
bit
|
7894 |
-
##mon
|
7895 |
-
moto
|
7896 |
-
##eam
|
7897 |
-
var
|
7898 |
-
eos
|
7899 |
-
blogspot
|
7900 |
-
e3
|
7901 |
-
dos
|
7902 |
-
dm
|
7903 |
-
fc
|
7904 |
-
##ments
|
7905 |
-
##ik
|
7906 |
-
##kw
|
7907 |
-
boy
|
7908 |
-
##bin
|
7909 |
-
##ata
|
7910 |
-
er
|
7911 |
-
##vin
|
7912 |
-
##tu
|
7913 |
-
##ula
|
7914 |
-
station
|
7915 |
-
##ature
|
7916 |
-
files
|
7917 |
-
zara
|
7918 |
-
hdr
|
7919 |
-
top10
|
7920 |
-
nature
|
7921 |
-
magazine
|
7922 |
-
s6
|
7923 |
-
marriott
|
7924 |
-
avira
|
7925 |
-
case
|
7926 |
-
tab
|
7927 |
-
##ran
|
7928 |
-
tony
|
7929 |
-
##home
|
7930 |
-
oculus
|
7931 |
-
im
|
7932 |
-
##ral
|
7933 |
-
jean
|
7934 |
-
saint
|
7935 |
-
cry
|
7936 |
-
rosie
|
7937 |
-
##force
|
7938 |
-
##ini
|
7939 |
-
ice
|
7940 |
-
##bert
|
7941 |
-
##nder
|
7942 |
-
##mber
|
7943 |
-
pet
|
7944 |
-
plurk
|
7945 |
-
##sis
|
7946 |
-
##ence
|
7947 |
-
tim
|
7948 |
-
##nc
|
7949 |
-
##name
|
7950 |
-
log
|
7951 |
-
ips
|
7952 |
-
great
|
7953 |
-
ikea
|
7954 |
-
malaysia
|
7955 |
-
unix
|
7956 |
-
##ncy
|
7957 |
-
##nie
|
7958 |
-
akb48
|
7959 |
-
##ye
|
7960 |
-
##oid
|
7961 |
-
##chi
|
7962 |
-
oa
|
7963 |
-
xuehai
|
7964 |
-
##orm
|
7965 |
-
##rf
|
7966 |
-
##ware
|
7967 |
-
ho
|
7968 |
-
##pro
|
7969 |
-
text
|
7970 |
-
##era
|
7971 |
-
bob
|
7972 |
-
##ub
|
7973 |
-
scp
|
7974 |
-
avi
|
7975 |
-
##zen
|
7976 |
-
mi
|
7977 |
-
wu
|
7978 |
-
museum
|
7979 |
-
qvod
|
7980 |
-
apache
|
7981 |
-
lake
|
7982 |
-
jcb
|
7983 |
-
ni
|
7984 |
-
##hr
|
7985 |
-
hill
|
7986 |
-
ne
|
7987 |
-
weibo
|
7988 |
-
ruby
|
7989 |
-
##row
|
7990 |
-
iv
|
7991 |
-
##ish
|
7992 |
-
github
|
7993 |
-
mate
|
7994 |
-
##lot
|
7995 |
-
##ane
|
7996 |
-
andrew
|
7997 |
-
##tina
|
7998 |
-
t1
|
7999 |
-
rf
|
8000 |
-
ed2k
|
8001 |
-
##vel
|
8002 |
-
way
|
8003 |
-
final
|
8004 |
-
ns
|
8005 |
-
sweet
|
8006 |
-
bytes
|
8007 |
-
##ene
|
8008 |
-
##cker
|
8009 |
-
##px
|
8010 |
-
topapp
|
8011 |
-
helpapp
|
8012 |
-
rs
|
8013 |
-
low
|
8014 |
-
g4g
|
8015 |
-
care
|
8016 |
-
ldquo
|
8017 |
-
##fork
|
8018 |
-
leave
|
8019 |
-
rm
|
8020 |
-
edition
|
8021 |
-
##gan
|
8022 |
-
##zon
|
8023 |
-
##qq
|
8024 |
-
##google
|
8025 |
-
##ism
|
8026 |
-
gold
|
8027 |
-
explorer
|
8028 |
-
##zer
|
8029 |
-
toyota
|
8030 |
-
category
|
8031 |
-
select
|
8032 |
-
visual
|
8033 |
-
##labels
|
8034 |
-
restaurant
|
8035 |
-
##md
|
8036 |
-
posts
|
8037 |
-
s1
|
8038 |
-
##ico
|
8039 |
-
angelababy
|
8040 |
-
sports
|
8041 |
-
s3
|
8042 |
-
mbc
|
8043 |
-
shell
|
8044 |
-
x86
|
8045 |
-
candy
|
8046 |
-
##new
|
8047 |
-
kbs
|
8048 |
-
face
|
8049 |
-
xl
|
8050 |
-
##here
|
8051 |
-
swissinfo
|
8052 |
-
v8
|
8053 |
-
dram
|
8054 |
-
##ual
|
8055 |
-
##vice
|
8056 |
-
##wer
|
8057 |
-
sport
|
8058 |
-
q1
|
8059 |
-
ios10
|
8060 |
-
public
|
8061 |
-
int
|
8062 |
-
card
|
8063 |
-
ep
|
8064 |
-
au
|
8065 |
-
rt
|
8066 |
-
bill
|
8067 |
-
##mll
|
8068 |
-
kim
|
8069 |
-
wan
|
8070 |
-
##uk
|
8071 |
-
x3
|
8072 |
-
scott
|
8073 |
-
##ming
|
8074 |
-
e5
|
8075 |
-
h7n9
|
8076 |
-
worldcat
|
8077 |
-
brown
|
8078 |
-
##vo
|
8079 |
-
##led
|
8080 |
-
##ax
|
8081 |
-
##ert
|
8082 |
-
paris
|
8083 |
-
polo
|
8084 |
-
##lr
|
8085 |
-
capital
|
8086 |
-
##hing
|
8087 |
-
bank
|
8088 |
-
cv
|
8089 |
-
##chat
|
8090 |
-
adc
|
8091 |
-
##ule
|
8092 |
-
digital
|
8093 |
-
hotmail
|
8094 |
-
##pad
|
8095 |
-
bbq
|
8096 |
-
quot
|
8097 |
-
##ring
|
8098 |
-
before
|
8099 |
-
wali
|
8100 |
-
mcu
|
8101 |
-
costco
|
8102 |
-
north
|
8103 |
-
switch
|
8104 |
-
##city
|
8105 |
-
philips
|
8106 |
-
##mann
|
8107 |
-
management
|
8108 |
-
panasonic
|
8109 |
-
##cl
|
8110 |
-
##vd
|
8111 |
-
##ping
|
8112 |
-
##rge
|
8113 |
-
alice
|
8114 |
-
##lk
|
8115 |
-
css3
|
8116 |
-
##ney
|
8117 |
-
vision
|
8118 |
-
alpha
|
8119 |
-
##ular
|
8120 |
-
##tter
|
8121 |
-
lz
|
8122 |
-
mode
|
8123 |
-
gre
|
8124 |
-
pci
|
8125 |
-
##tm
|
8126 |
-
##yan
|
8127 |
-
##let
|
8128 |
-
work
|
8129 |
-
war
|
8130 |
-
coach
|
8131 |
-
ah
|
8132 |
-
mary
|
8133 |
-
huang
|
8134 |
-
##pt
|
8135 |
-
a8
|
8136 |
-
pt
|
8137 |
-
follow
|
8138 |
-
##berry
|
8139 |
-
##ew
|
8140 |
-
a5
|
8141 |
-
ghost
|
8142 |
-
##wn
|
8143 |
-
##og
|
8144 |
-
south
|
8145 |
-
##code
|
8146 |
-
girls
|
8147 |
-
##rid
|
8148 |
-
action
|
8149 |
-
villa
|
8150 |
-
git
|
8151 |
-
r11
|
8152 |
-
table
|
8153 |
-
games
|
8154 |
-
##cket
|
8155 |
-
error
|
8156 |
-
##anonymoussaid
|
8157 |
-
##ag
|
8158 |
-
here
|
8159 |
-
##ame
|
8160 |
-
##gc
|
8161 |
-
qa
|
8162 |
-
##lis
|
8163 |
-
gmp
|
8164 |
-
##gin
|
8165 |
-
vmalife
|
8166 |
-
##cher
|
8167 |
-
yu
|
8168 |
-
wedding
|
8169 |
-
##tis
|
8170 |
-
demo
|
8171 |
-
dragon
|
8172 |
-
soho
|
8173 |
-
social
|
8174 |
-
bye
|
8175 |
-
##rant
|
8176 |
-
river
|
8177 |
-
orz
|
8178 |
-
acer
|
8179 |
-
##ats
|
8180 |
-
del
|
8181 |
-
##ven
|
8182 |
-
ups
|
8183 |
-
value
|
8184 |
-
macd
|
8185 |
-
yougou
|
8186 |
-
##dn
|
8187 |
-
##ano
|
8188 |
-
ll
|
8189 |
-
##urt
|
8190 |
-
##rent
|
8191 |
-
continue
|
8192 |
-
script
|
8193 |
-
##wen
|
8194 |
-
##ect
|
8195 |
-
paper
|
8196 |
-
shift
|
8197 |
-
##chel
|
8198 |
-
##cat
|
8199 |
-
x5
|
8200 |
-
fox
|
8201 |
-
car
|
8202 |
-
aaa
|
8203 |
-
##blog
|
8204 |
-
loading
|
8205 |
-
##yn
|
8206 |
-
##tp
|
8207 |
-
kuso
|
8208 |
-
si
|
8209 |
-
sns
|
8210 |
-
rmb
|
8211 |
-
vdc
|
8212 |
-
forest
|
8213 |
-
central
|
8214 |
-
prime
|
8215 |
-
help
|
8216 |
-
ultra
|
8217 |
-
##rmb
|
8218 |
-
square
|
8219 |
-
##field
|
8220 |
-
##reen
|
8221 |
-
##ors
|
8222 |
-
##ju
|
8223 |
-
c1
|
8224 |
-
start
|
8225 |
-
##air
|
8226 |
-
##map
|
8227 |
-
cdn
|
8228 |
-
##wo
|
8229 |
-
cba
|
8230 |
-
stephen
|
8231 |
-
m8
|
8232 |
-
##get
|
8233 |
-
opera
|
8234 |
-
##base
|
8235 |
-
##ood
|
8236 |
-
vsa
|
8237 |
-
com™
|
8238 |
-
##aw
|
8239 |
-
##ail
|
8240 |
-
count
|
8241 |
-
t2
|
8242 |
-
##een
|
8243 |
-
hop
|
8244 |
-
##gp
|
8245 |
-
vsc
|
8246 |
-
tree
|
8247 |
-
##eg
|
8248 |
-
##ose
|
8249 |
-
##ories
|
8250 |
-
##shop
|
8251 |
-
alphago
|
8252 |
-
v4
|
8253 |
-
simon
|
8254 |
-
fluke62max
|
8255 |
-
zip
|
8256 |
-
##sta
|
8257 |
-
louis
|
8258 |
-
cr
|
8259 |
-
bas
|
8260 |
-
bc
|
8261 |
-
##yer
|
8262 |
-
hadoop
|
8263 |
-
##ube
|
8264 |
-
##wi
|
8265 |
-
hola
|
8266 |
-
##low
|
8267 |
-
place
|
8268 |
-
centre
|
8269 |
-
d3
|
8270 |
-
##fer
|
8271 |
-
##media
|
8272 |
-
exchange
|
8273 |
-
series
|
8274 |
-
##san
|
8275 |
-
eb
|
8276 |
-
##bank
|
8277 |
-
q3
|
8278 |
-
##nge
|
8279 |
-
##mail
|
8280 |
-
take
|
8281 |
-
##lp
|
8282 |
-
client
|
8283 |
-
east
|
8284 |
-
cache
|
8285 |
-
event
|
8286 |
-
vincent
|
8287 |
-
##nse
|
8288 |
-
sui
|
8289 |
-
adchoice
|
8290 |
-
##stry
|
8291 |
-
##zone
|
8292 |
-
ga
|
8293 |
-
apps
|
8294 |
-
sea
|
8295 |
-
##ab
|
8296 |
-
cisco
|
8297 |
-
##rner
|
8298 |
-
kymco
|
8299 |
-
##care
|
8300 |
-
dha
|
8301 |
-
##pu
|
8302 |
-
##yi
|
8303 |
-
minkoff
|
8304 |
-
royal
|
8305 |
-
p1
|
8306 |
-
annie
|
8307 |
-
collection
|
8308 |
-
kpi
|
8309 |
-
playstation
|
8310 |
-
bh
|
8311 |
-
##bar
|
8312 |
-
queen
|
8313 |
-
radio
|
8314 |
-
andy
|
8315 |
-
armani
|
8316 |
-
##xy
|
8317 |
-
manager
|
8318 |
-
iherb
|
8319 |
-
##ery
|
8320 |
-
##share
|
8321 |
-
spring
|
8322 |
-
raid
|
8323 |
-
johnson
|
8324 |
-
##ob
|
8325 |
-
volvo
|
8326 |
-
hall
|
8327 |
-
##ball
|
8328 |
-
v6
|
8329 |
-
our
|
8330 |
-
taylor
|
8331 |
-
##hk
|
8332 |
-
bi
|
8333 |
-
##cp
|
8334 |
-
kate
|
8335 |
-
bo
|
8336 |
-
water
|
8337 |
-
technology
|
8338 |
-
##rie
|
8339 |
-
##ona
|
8340 |
-
##sl
|
8341 |
-
hpv
|
8342 |
-
gtx
|
8343 |
-
hip
|
8344 |
-
rdquo
|
8345 |
-
jayz
|
8346 |
-
stone
|
8347 |
-
##lex
|
8348 |
-
##rum
|
8349 |
-
namespace
|
8350 |
-
##ale
|
8351 |
-
##atic
|
8352 |
-
des
|
8353 |
-
##erson
|
8354 |
-
##ql
|
8355 |
-
##ves
|
8356 |
-
##type
|
8357 |
-
enter
|
8358 |
-
d2
|
8359 |
-
##mix
|
8360 |
-
##bian
|
8361 |
-
a9
|
8362 |
-
jj
|
8363 |
-
ky
|
8364 |
-
##lc
|
8365 |
-
access
|
8366 |
-
movie
|
8367 |
-
##hc
|
8368 |
-
tower
|
8369 |
-
##ration
|
8370 |
-
##mit
|
8371 |
-
##nch
|
8372 |
-
ua
|
8373 |
-
tel
|
8374 |
-
prefix
|
8375 |
-
##o2
|
8376 |
-
##point
|
8377 |
-
ott
|
8378 |
-
##http
|
8379 |
-
##ury
|
8380 |
-
baidu
|
8381 |
-
##ink
|
8382 |
-
member
|
8383 |
-
##logy
|
8384 |
-
bigbang
|
8385 |
-
nownews
|
8386 |
-
##js
|
8387 |
-
##shot
|
8388 |
-
##tb
|
8389 |
-
eba
|
8390 |
-
##tics
|
8391 |
-
##lus
|
8392 |
-
v5
|
8393 |
-
spark
|
8394 |
-
##ama
|
8395 |
-
there
|
8396 |
-
##ions
|
8397 |
-
god
|
8398 |
-
##lls
|
8399 |
-
##down
|
8400 |
-
hiv
|
8401 |
-
##ress
|
8402 |
-
burberry
|
8403 |
-
day2
|
8404 |
-
##kv
|
8405 |
-
jeff
|
8406 |
-
related
|
8407 |
-
film
|
8408 |
-
edit
|
8409 |
-
joseph
|
8410 |
-
##ark
|
8411 |
-
cx
|
8412 |
-
order
|
8413 |
-
g9
|
8414 |
-
##ans
|
8415 |
-
##tty
|
8416 |
-
s5
|
8417 |
-
##bee
|
8418 |
-
thread
|
8419 |
-
xr
|
8420 |
-
buy
|
8421 |
-
sh
|
8422 |
-
land
|
8423 |
-
spotify
|
8424 |
-
mx
|
8425 |
-
##ari
|
8426 |
-
##verse
|
8427 |
-
sf
|
8428 |
-
why
|
8429 |
-
nego
|
8430 |
-
sunny
|
8431 |
-
dom
|
8432 |
-
exo
|
8433 |
-
positioning
|
8434 |
-
fit
|
8435 |
-
rgb
|
8436 |
-
##tton
|
8437 |
-
kiss
|
8438 |
-
alexa
|
8439 |
-
adam
|
8440 |
-
lp
|
8441 |
-
mp
|
8442 |
-
##ties
|
8443 |
-
##llow
|
8444 |
-
amy
|
8445 |
-
##du
|
8446 |
-
np
|
8447 |
-
institute
|
8448 |
-
##rth
|
8449 |
-
##lar
|
8450 |
-
##des
|
8451 |
-
sidebar
|
8452 |
-
imax
|
8453 |
-
site
|
8454 |
-
##cky
|
8455 |
-
##kit
|
8456 |
-
##ime
|
8457 |
-
season
|
8458 |
-
##fun
|
8459 |
-
gogoro
|
8460 |
-
a7
|
8461 |
-
pu
|
8462 |
-
lily
|
8463 |
-
fire
|
8464 |
-
twd600
|
8465 |
-
##vis
|
8466 |
-
##cture
|
8467 |
-
information
|
8468 |
-
close
|
8469 |
-
friday
|
8470 |
-
yi
|
8471 |
-
nick
|
8472 |
-
##tta
|
8473 |
-
##tel
|
8474 |
-
##lock
|
8475 |
-
cbd
|
8476 |
-
economy
|
8477 |
-
tinker
|
8478 |
-
double
|
8479 |
-
voice
|
8480 |
-
##app
|
8481 |
-
oops
|
8482 |
-
channel
|
8483 |
-
today
|
8484 |
-
##right
|
8485 |
-
raw
|
8486 |
-
xyz
|
8487 |
-
jim
|
8488 |
-
edm
|
8489 |
-
##cent
|
8490 |
-
supreme
|
8491 |
-
ds
|
8492 |
-
##its
|
8493 |
-
##asia
|
8494 |
-
dropbox
|
8495 |
-
##tti
|
8496 |
-
books
|
8497 |
-
##tle
|
8498 |
-
##ller
|
8499 |
-
##ken
|
8500 |
-
##more
|
8501 |
-
##boy
|
8502 |
-
sex
|
8503 |
-
##dom
|
8504 |
-
t3
|
8505 |
-
##ider
|
8506 |
-
##unch
|
8507 |
-
feel
|
8508 |
-
##put
|
8509 |
-
s2
|
8510 |
-
mo
|
8511 |
-
##gh
|
8512 |
-
men
|
8513 |
-
ka
|
8514 |
-
amoled
|
8515 |
-
div
|
8516 |
-
##tr
|
8517 |
-
##n1
|
8518 |
-
port
|
8519 |
-
howard
|
8520 |
-
##tags
|
8521 |
-
ken
|
8522 |
-
dnf
|
8523 |
-
##nus
|
8524 |
-
adsense
|
8525 |
-
ide
|
8526 |
-
buff
|
8527 |
-
thunder
|
8528 |
-
##town
|
8529 |
-
##ique
|
8530 |
-
has
|
8531 |
-
##body
|
8532 |
-
auto
|
8533 |
-
pin
|
8534 |
-
##erry
|
8535 |
-
tee
|
8536 |
-
number
|
8537 |
-
##the
|
8538 |
-
object
|
8539 |
-
psp
|
8540 |
-
cool
|
8541 |
-
udnbkk
|
8542 |
-
##mic
|
8543 |
-
miui
|
8544 |
-
##tro
|
8545 |
-
most
|
8546 |
-
r2
|
8547 |
-
##alk
|
8548 |
-
##nity
|
8549 |
-
s4
|
8550 |
-
law
|
8551 |
-
version
|
8552 |
-
##oa
|
8553 |
-
n1
|
8554 |
-
sgs
|
8555 |
-
docomo
|
8556 |
-
##tf
|
8557 |
-
##ack
|
8558 |
-
henry
|
8559 |
-
fc2
|
8560 |
-
##ded
|
8561 |
-
##sco
|
8562 |
-
##rite
|
8563 |
-
linkedin
|
8564 |
-
##ada
|
8565 |
-
##now
|
8566 |
-
wii
|
8567 |
-
##ndy
|
8568 |
-
ucbug
|
8569 |
-
sputniknews
|
8570 |
-
legalminer
|
8571 |
-
##ika
|
8572 |
-
##xp
|
8573 |
-
##bu
|
8574 |
-
q10
|
8575 |
-
oo
|
8576 |
-
b6
|
8577 |
-
come
|
8578 |
-
##rman
|
8579 |
-
cheese
|
8580 |
-
ming
|
8581 |
-
maker
|
8582 |
-
##gm
|
8583 |
-
nikon
|
8584 |
-
##fig
|
8585 |
-
ppi
|
8586 |
-
kelly
|
8587 |
-
jchere
|
8588 |
-
ted
|
8589 |
-
md
|
8590 |
-
fgo
|
8591 |
-
tech
|
8592 |
-
##tto
|
8593 |
-
dan
|
8594 |
-
soc
|
8595 |
-
##gl
|
8596 |
-
##len
|
8597 |
-
hair
|
8598 |
-
earth
|
8599 |
-
img
|
8600 |
-
##pper
|
8601 |
-
##a1
|
8602 |
-
acca
|
8603 |
-
##ition
|
8604 |
-
##ference
|
8605 |
-
suite
|
8606 |
-
##ig
|
8607 |
-
outlook
|
8608 |
-
##mond
|
8609 |
-
##cation
|
8610 |
-
##pr
|
8611 |
-
airport
|
8612 |
-
##over
|
8613 |
-
jones
|
8614 |
-
##ith
|
8615 |
-
lab
|
8616 |
-
##su
|
8617 |
-
co2
|
8618 |
-
town
|
8619 |
-
piece
|
8620 |
-
##llo
|
8621 |
-
no1
|
8622 |
-
vmware
|
8623 |
-
##qi
|
8624 |
-
focus
|
8625 |
-
reader
|
8626 |
-
##admin
|
8627 |
-
##ora
|
8628 |
-
tb
|
8629 |
-
false
|
8630 |
-
##log
|
8631 |
-
know
|
8632 |
-
lan
|
8633 |
-
##ces
|
8634 |
-
f4
|
8635 |
-
##ume
|
8636 |
-
motel
|
8637 |
-
stop
|
8638 |
-
##oper
|
8639 |
-
na
|
8640 |
-
flickr
|
8641 |
-
netcomponents
|
8642 |
-
##af
|
8643 |
-
pose
|
8644 |
-
williams
|
8645 |
-
local
|
8646 |
-
##ound
|
8647 |
-
##cg
|
8648 |
-
##site
|
8649 |
-
##iko
|
8650 |
-
gsm
|
8651 |
-
con
|
8652 |
-
##ath
|
8653 |
-
friends
|
8654 |
-
##hip
|
8655 |
-
cell
|
8656 |
-
##rey
|
8657 |
-
cream
|
8658 |
-
##cks
|
8659 |
-
##dp
|
8660 |
-
facebooktwitterpinterestgoogle
|
8661 |
-
sso
|
8662 |
-
shtml
|
8663 |
-
song
|
8664 |
-
swiss
|
8665 |
-
##mw
|
8666 |
-
lumia
|
8667 |
-
xdd
|
8668 |
-
string
|
8669 |
-
tiffany
|
8670 |
-
marc
|
8671 |
-
insee
|
8672 |
-
russell
|
8673 |
-
sc
|
8674 |
-
dell
|
8675 |
-
##ations
|
8676 |
-
camera
|
8677 |
-
##vs
|
8678 |
-
##flow
|
8679 |
-
##late
|
8680 |
-
classic
|
8681 |
-
##nter
|
8682 |
-
stay
|
8683 |
-
g1
|
8684 |
-
mtv
|
8685 |
-
##ever
|
8686 |
-
##lab
|
8687 |
-
##nger
|
8688 |
-
qe
|
8689 |
-
sata
|
8690 |
-
ryan
|
8691 |
-
d1
|
8692 |
-
cms
|
8693 |
-
##cing
|
8694 |
-
su
|
8695 |
-
editor
|
8696 |
-
##nap
|
8697 |
-
security
|
8698 |
-
sunday
|
8699 |
-
association
|
8700 |
-
##ens
|
8701 |
-
##bra
|
8702 |
-
acg
|
8703 |
-
sofascore
|
8704 |
-
mkv
|
8705 |
-
##ign
|
8706 |
-
jonathan
|
8707 |
-
gary
|
8708 |
-
build
|
8709 |
-
labels
|
8710 |
-
##oto
|
8711 |
-
tesla
|
8712 |
-
moba
|
8713 |
-
qi
|
8714 |
-
gohappy
|
8715 |
-
general
|
8716 |
-
ajax
|
8717 |
-
society
|
8718 |
-
##test
|
8719 |
-
##urs
|
8720 |
-
wps
|
8721 |
-
fedora
|
8722 |
-
##ich
|
8723 |
-
mozilla
|
8724 |
-
##dr
|
8725 |
-
usa
|
8726 |
-
urn
|
8727 |
-
##lina
|
8728 |
-
grace
|
8729 |
-
##die
|
8730 |
-
##try
|
8731 |
-
##ader
|
8732 |
-
elle
|
8733 |
-
##chen
|
8734 |
-
price
|
8735 |
-
##ten
|
8736 |
-
uhz
|
8737 |
-
##ough
|
8738 |
-
eq
|
8739 |
-
##hen
|
8740 |
-
states
|
8741 |
-
push
|
8742 |
-
session
|
8743 |
-
balance
|
8744 |
-
wow
|
8745 |
-
##cus
|
8746 |
-
##py
|
8747 |
-
when
|
8748 |
-
##ward
|
8749 |
-
##ep
|
8750 |
-
wong
|
8751 |
-
library
|
8752 |
-
prada
|
8753 |
-
##cle
|
8754 |
-
running
|
8755 |
-
##ree
|
8756 |
-
ck
|
8757 |
-
date
|
8758 |
-
q4
|
8759 |
-
##ctive
|
8760 |
-
##ool
|
8761 |
-
mk
|
8762 |
-
##ira
|
8763 |
-
die
|
8764 |
-
secret
|
8765 |
-
rq
|
8766 |
-
dota
|
8767 |
-
buffet
|
8768 |
-
e6
|
8769 |
-
##ez
|
8770 |
-
pan
|
8771 |
-
ha
|
8772 |
-
##card
|
8773 |
-
##cha
|
8774 |
-
alan
|
8775 |
-
day3
|
8776 |
-
eye
|
8777 |
-
f3
|
8778 |
-
##end
|
8779 |
-
france
|
8780 |
-
keep
|
8781 |
-
adi
|
8782 |
-
rna
|
8783 |
-
tvbs
|
8784 |
-
##ala
|
8785 |
-
solo
|
8786 |
-
nova
|
8787 |
-
##tail
|
8788 |
-
support
|
8789 |
-
##ries
|
8790 |
-
##ved
|
8791 |
-
base
|
8792 |
-
copy
|
8793 |
-
iis
|
8794 |
-
fps
|
8795 |
-
##ways
|
8796 |
-
hero
|
8797 |
-
hgih
|
8798 |
-
profile
|
8799 |
-
fish
|
8800 |
-
mu
|
8801 |
-
ssh
|
8802 |
-
entertainment
|
8803 |
-
chang
|
8804 |
-
##wd
|
8805 |
-
click
|
8806 |
-
cake
|
8807 |
-
##ond
|
8808 |
-
pre
|
8809 |
-
##tom
|
8810 |
-
kic
|
8811 |
-
pixel
|
8812 |
-
##ov
|
8813 |
-
##fl
|
8814 |
-
product
|
8815 |
-
##pd
|
8816 |
-
dear
|
8817 |
-
##gate
|
8818 |
-
es
|
8819 |
-
yumi
|
8820 |
-
audio
|
8821 |
-
##²
|
8822 |
-
##sky
|
8823 |
-
echo
|
8824 |
-
bin
|
8825 |
-
where
|
8826 |
-
##ture
|
8827 |
-
##ape
|
8828 |
-
find
|
8829 |
-
sap
|
8830 |
-
isis
|
8831 |
-
nand
|
8832 |
-
##load
|
8833 |
-
##ream
|
8834 |
-
band
|
8835 |
-
a6
|
8836 |
-
never
|
8837 |
-
##post
|
8838 |
-
festival
|
8839 |
-
##we
|
8840 |
-
guide
|
8841 |
-
zenfone
|
8842 |
-
##ike
|
8843 |
-
gd
|
8844 |
-
forum
|
8845 |
-
jessica
|
8846 |
-
strong
|
8847 |
-
alexander
|
8848 |
-
##ould
|
8849 |
-
software
|
8850 |
-
allen
|
8851 |
-
##ious
|
8852 |
-
program
|
8853 |
-
else
|
8854 |
-
lohasthree
|
8855 |
-
##gar
|
8856 |
-
please
|
8857 |
-
rc
|
8858 |
-
##ggle
|
8859 |
-
##ric
|
8860 |
-
bim
|
8861 |
-
##own
|
8862 |
-
eclipse
|
8863 |
-
brian
|
8864 |
-
##side
|
8865 |
-
##other
|
8866 |
-
##tech
|
8867 |
-
##ator
|
8868 |
-
engine
|
8869 |
-
##ged
|
8870 |
-
plaza
|
8871 |
-
##fit
|
8872 |
-
cia
|
8873 |
-
ngo
|
8874 |
-
westbrook
|
8875 |
-
shi
|
8876 |
-
tbs
|
8877 |
-
sci
|
8878 |
-
reuters
|
8879 |
-
##ily
|
8880 |
-
contextlink
|
8881 |
-
##hn
|
8882 |
-
af
|
8883 |
-
##cil
|
8884 |
-
bridge
|
8885 |
-
very
|
8886 |
-
##cel
|
8887 |
-
cambridge
|
8888 |
-
##ize
|
8889 |
-
##aid
|
8890 |
-
##data
|
8891 |
-
frm
|
8892 |
-
##head
|
8893 |
-
award
|
8894 |
-
butler
|
8895 |
-
##sun
|
8896 |
-
meta
|
8897 |
-
##mar
|
8898 |
-
america
|
8899 |
-
ps3
|
8900 |
-
puma
|
8901 |
-
pmid
|
8902 |
-
lc
|
8903 |
-
kitchen
|
8904 |
-
##lic
|
8905 |
-
day1
|
8906 |
-
future
|
8907 |
-
##text
|
8908 |
-
##page
|
8909 |
-
##rris
|
8910 |
-
pm1
|
8911 |
-
##ket
|
8912 |
-
fans
|
8913 |
-
christian
|
8914 |
-
bot
|
8915 |
-
kids
|
8916 |
-
trackback
|
8917 |
-
##hai
|
8918 |
-
c3
|
8919 |
-
display
|
8920 |
-
##hl
|
8921 |
-
n2
|
8922 |
-
idea
|
8923 |
-
##sent
|
8924 |
-
airmail
|
8925 |
-
##ug
|
8926 |
-
##men
|
8927 |
-
pwm
|
8928 |
-
##lution
|
8929 |
-
awards
|
8930 |
-
schemas
|
8931 |
-
asics
|
8932 |
-
wikipedia
|
8933 |
-
font
|
8934 |
-
##tional
|
8935 |
-
##vy
|
8936 |
-
c2
|
8937 |
-
##dget
|
8938 |
-
##ein
|
8939 |
-
contact
|
8940 |
-
pepper
|
8941 |
-
##uel
|
8942 |
-
##ument
|
8943 |
-
##hang
|
8944 |
-
q5
|
8945 |
-
##sue
|
8946 |
-
rain
|
8947 |
-
##ndi
|
8948 |
-
wei
|
8949 |
-
swatch
|
8950 |
-
##cept
|
8951 |
-
popular
|
8952 |
-
##ste
|
8953 |
-
##tag
|
8954 |
-
p2
|
8955 |
-
trc
|
8956 |
-
##west
|
8957 |
-
##live
|
8958 |
-
justin
|
8959 |
-
honda
|
8960 |
-
ping
|
8961 |
-
messenger
|
8962 |
-
##rap
|
8963 |
-
v9
|
8964 |
-
unity
|
8965 |
-
appqq
|
8966 |
-
leo
|
8967 |
-
##tone
|
8968 |
-
##ass
|
8969 |
-
uniqlo
|
8970 |
-
her
|
8971 |
-
jane
|
8972 |
-
memory
|
8973 |
-
moneydj
|
8974 |
-
##tical
|
8975 |
-
human
|
8976 |
-
##m2
|
8977 |
-
coc
|
8978 |
-
miacare
|
8979 |
-
##mn
|
8980 |
-
tmt
|
8981 |
-
##core
|
8982 |
-
vim
|
8983 |
-
kk
|
8984 |
-
##may
|
8985 |
-
fan
|
8986 |
-
target
|
8987 |
-
use
|
8988 |
-
too
|
8989 |
-
fast
|
8990 |
-
services
|
8991 |
-
##ope
|
8992 |
-
omega
|
8993 |
-
energy
|
8994 |
-
pinkoi
|
8995 |
-
##rain
|
8996 |
-
jackson
|
8997 |
-
##ement
|
8998 |
-
p9
|
8999 |
-
rd
|
9000 |
-
##tier
|
9001 |
-
##vic
|
9002 |
-
zone
|
9003 |
-
dl
|
9004 |
-
isofix
|
9005 |
-
cpa
|
9006 |
-
m4
|
9007 |
-
kimi
|
9008 |
-
davis
|
9009 |
-
##lay
|
9010 |
-
lulu
|
9011 |
-
##uck
|
9012 |
-
weeks
|
9013 |
-
qs
|
9014 |
-
##hop
|
9015 |
-
ae
|
9016 |
-
##ear
|
9017 |
-
eia
|
9018 |
-
##fly
|
9019 |
-
korea
|
9020 |
-
jpeg
|
9021 |
-
boost
|
9022 |
-
##ship
|
9023 |
-
small
|
9024 |
-
eur
|
9025 |
-
valley
|
9026 |
-
##iel
|
9027 |
-
simple
|
9028 |
-
##ude
|
9029 |
-
rn
|
9030 |
-
k2
|
9031 |
-
##ena
|
9032 |
-
non
|
9033 |
-
patrick
|
9034 |
-
feed
|
9035 |
-
process
|
9036 |
-
well
|
9037 |
-
qqmei
|
9038 |
-
##thing
|
9039 |
-
they
|
9040 |
-
aws
|
9041 |
-
lu
|
9042 |
-
pink
|
9043 |
-
##ters
|
9044 |
-
##kin
|
9045 |
-
board
|
9046 |
-
##vertisement
|
9047 |
-
wine
|
9048 |
-
##ien
|
9049 |
-
unicode
|
9050 |
-
##dge
|
9051 |
-
r1
|
9052 |
-
##tant
|
9053 |
-
##twitter
|
9054 |
-
cool1
|
9055 |
-
isp
|
9056 |
-
standard
|
9057 |
-
matt
|
9058 |
-
##fu
|
9059 |
-
##iner
|
9060 |
-
googlemsn
|
9061 |
-
pixnetfacebookyahoo
|
9062 |
-
x7
|
9063 |
-
##uce
|
9064 |
-
sao
|
9065 |
-
##ev
|
9066 |
-
##file
|
9067 |
-
xddd
|
9068 |
-
shirt
|
9069 |
-
##rio
|
9070 |
-
##hat
|
9071 |
-
givenchy
|
9072 |
-
ya
|
9073 |
-
bang
|
9074 |
-
##lio
|
9075 |
-
monday
|
9076 |
-
crystal
|
9077 |
-
##abc
|
9078 |
-
head
|
9079 |
-
ubuntuforumwikilinuxpastechat
|
9080 |
-
##vc
|
9081 |
-
##rity
|
9082 |
-
cnc
|
9083 |
-
ipv6
|
9084 |
-
null
|
9085 |
-
##ost
|
9086 |
-
yang
|
9087 |
-
imsean
|
9088 |
-
tiger
|
9089 |
-
##fet
|
9090 |
-
dji
|
9091 |
-
ji
|
9092 |
-
maria
|
9093 |
-
##come
|
9094 |
-
foundation
|
9095 |
-
##beth
|
9096 |
-
active
|
9097 |
-
##aft
|
9098 |
-
##don
|
9099 |
-
sr
|
9100 |
-
emma
|
9101 |
-
##khz
|
9102 |
-
living
|
9103 |
-
sas
|
9104 |
-
x6
|
9105 |
-
##face
|
9106 |
-
pptv
|
9107 |
-
x4
|
9108 |
-
##mate
|
9109 |
-
han
|
9110 |
-
sophie
|
9111 |
-
##jing
|
9112 |
-
fifa
|
9113 |
-
##mand
|
9114 |
-
other
|
9115 |
-
sale
|
9116 |
-
inwedding
|
9117 |
-
##gn
|
9118 |
-
##mmy
|
9119 |
-
##pmlast
|
9120 |
-
bad
|
9121 |
-
nana
|
9122 |
-
nbc
|
9123 |
-
##wu
|
9124 |
-
note7
|
9125 |
-
single
|
9126 |
-
##bel
|
9127 |
-
window
|
9128 |
-
##dio
|
9129 |
-
##ht
|
9130 |
-
union
|
9131 |
-
age
|
9132 |
-
##ivity
|
9133 |
-
domain
|
9134 |
-
neo
|
9135 |
-
##isa
|
9136 |
-
##lter
|
9137 |
-
f5
|
9138 |
-
steven
|
9139 |
-
##cts
|
9140 |
-
powerpoint
|
9141 |
-
tft
|
9142 |
-
self
|
9143 |
-
g2
|
9144 |
-
ft
|
9145 |
-
zol
|
9146 |
-
##act
|
9147 |
-
mwc
|
9148 |
-
nbapop
|
9149 |
-
eds
|
9150 |
-
ace
|
9151 |
-
##room
|
9152 |
-
previous
|
9153 |
-
author
|
9154 |
-
tomtom
|
9155 |
-
il
|
9156 |
-
##ets
|
9157 |
-
hu
|
9158 |
-
financial
|
9159 |
-
bp
|
9160 |
-
chi
|
9161 |
-
##hg
|
9162 |
-
fairmont
|
9163 |
-
cross
|
9164 |
-
gay
|
9165 |
-
h2
|
9166 |
-
function
|
9167 |
-
also
|
9168 |
-
##raph
|
9169 |
-
##ils
|
9170 |
-
i3
|
9171 |
-
avenue
|
9172 |
-
##host
|
9173 |
-
##bon
|
9174 |
-
##tsu
|
9175 |
-
message
|
9176 |
-
navigation
|
9177 |
-
fintech
|
9178 |
-
h6
|
9179 |
-
##ject
|
9180 |
-
##vas
|
9181 |
-
##firm
|
9182 |
-
credit
|
9183 |
-
##wf
|
9184 |
-
xxxx
|
9185 |
-
form
|
9186 |
-
##nor
|
9187 |
-
##space
|
9188 |
-
huawei
|
9189 |
-
plan
|
9190 |
-
json
|
9191 |
-
sbl
|
9192 |
-
##dc
|
9193 |
-
machine
|
9194 |
-
wish
|
9195 |
-
##sol
|
9196 |
-
windows7
|
9197 |
-
edward
|
9198 |
-
development
|
9199 |
-
washington
|
9200 |
-
##nsis
|
9201 |
-
lo
|
9202 |
-
##sio
|
9203 |
-
##ym
|
9204 |
-
##bor
|
9205 |
-
planet
|
9206 |
-
##wt
|
9207 |
-
ieee
|
9208 |
-
gpa
|
9209 |
-
camp
|
9210 |
-
ann
|
9211 |
-
gm
|
9212 |
-
##tw
|
9213 |
-
##oka
|
9214 |
-
connect
|
9215 |
-
##rss
|
9216 |
-
##work
|
9217 |
-
##atus
|
9218 |
-
wall
|
9219 |
-
chicken
|
9220 |
-
soul
|
9221 |
-
##times
|
9222 |
-
fa
|
9223 |
-
##ather
|
9224 |
-
##cord
|
9225 |
-
##eep
|
9226 |
-
hitachi
|
9227 |
-
gui
|
9228 |
-
harry
|
9229 |
-
##pan
|
9230 |
-
e1
|
9231 |
-
disney
|
9232 |
-
##press
|
9233 |
-
wind
|
9234 |
-
frigidaire
|
9235 |
-
##tl
|
9236 |
-
liu
|
9237 |
-
hsu
|
9238 |
-
basic
|
9239 |
-
von
|
9240 |
-
ev
|
9241 |
-
learning
|
9242 |
-
##ull
|
9243 |
-
expedia
|
9244 |
-
archives
|
9245 |
-
change
|
9246 |
-
##wei
|
9247 |
-
santa
|
9248 |
-
cut
|
9249 |
-
ins
|
9250 |
-
turbo
|
9251 |
-
brand
|
9252 |
-
cf1
|
9253 |
-
return
|
9254 |
-
##rip
|
9255 |
-
h1
|
9256 |
-
##nis
|
9257 |
-
application
|
9258 |
-
emc
|
9259 |
-
rx
|
9260 |
-
##oon
|
9261 |
-
quick
|
9262 |
-
wilson
|
9263 |
-
wing
|
9264 |
-
chapter
|
9265 |
-
##bug
|
9266 |
-
beyond
|
9267 |
-
##cms
|
9268 |
-
##dar
|
9269 |
-
##oh
|
9270 |
-
zoom
|
9271 |
-
e2
|
9272 |
-
trip
|
9273 |
-
sb
|
9274 |
-
##nba
|
9275 |
-
rcep
|
9276 |
-
aspx
|
9277 |
-
ci
|
9278 |
-
gc
|
9279 |
-
gnu
|
9280 |
-
##count
|
9281 |
-
advanced
|
9282 |
-
dance
|
9283 |
-
dv
|
9284 |
-
##url
|
9285 |
-
##ging
|
9286 |
-
am09
|
9287 |
-
shadow
|
9288 |
-
battle
|
9289 |
-
##cia
|
9290 |
-
emily
|
9291 |
-
##tation
|
9292 |
-
host
|
9293 |
-
ff
|
9294 |
-
techorz
|
9295 |
-
sars
|
9296 |
-
##mini
|
9297 |
-
##mporary
|
9298 |
-
##ering
|
9299 |
-
nc
|
9300 |
-
##next
|
9301 |
-
cma
|
9302 |
-
##mbps
|
9303 |
-
##gas
|
9304 |
-
##ift
|
9305 |
-
##dot
|
9306 |
-
amana
|
9307 |
-
##ros
|
9308 |
-
ir
|
9309 |
-
##eet
|
9310 |
-
##ible
|
9311 |
-
##aka
|
9312 |
-
dcs
|
9313 |
-
iq
|
9314 |
-
l1
|
9315 |
-
##lor
|
9316 |
-
maggie
|
9317 |
-
##iu
|
9318 |
-
##gt
|
9319 |
-
articles
|
9320 |
-
create
|
9321 |
-
##burg
|
9322 |
-
##iki
|
9323 |
-
database
|
9324 |
-
fantasy
|
9325 |
-
##rex
|
9326 |
-
##cam
|
9327 |
-
dlc
|
9328 |
-
dean
|
9329 |
-
##you
|
9330 |
-
hard
|
9331 |
-
path
|
9332 |
-
gaming
|
9333 |
-
victoria
|
9334 |
-
maps
|
9335 |
-
cb
|
9336 |
-
##lee
|
9337 |
-
##itor
|
9338 |
-
overchicstoretvhome
|
9339 |
-
systems
|
9340 |
-
##xt
|
9341 |
-
p3
|
9342 |
-
sarah
|
9343 |
-
##nan
|
9344 |
-
x9
|
9345 |
-
install
|
9346 |
-
second
|
9347 |
-
##ann
|
9348 |
-
##ph
|
9349 |
-
##rcle
|
9350 |
-
##nic
|
9351 |
-
##nar
|
9352 |
-
ec
|
9353 |
-
metro
|
9354 |
-
chocolate
|
9355 |
-
##rian
|
9356 |
-
##table
|
9357 |
-
skin
|
9358 |
-
##sn
|
9359 |
-
mountain
|
9360 |
-
inparadise
|
9361 |
-
ib
|
9362 |
-
##jia
|
9363 |
-
eeworld
|
9364 |
-
creative
|
9365 |
-
g5
|
9366 |
-
g3
|
9367 |
-
parker
|
9368 |
-
ecfa
|
9369 |
-
village
|
9370 |
-
sylvia
|
9371 |
-
hbl
|
9372 |
-
##ques
|
9373 |
-
##onsored
|
9374 |
-
##x2
|
9375 |
-
##v4
|
9376 |
-
##tein
|
9377 |
-
ie6
|
9378 |
-
##stack
|
9379 |
-
ver
|
9380 |
-
##ads
|
9381 |
-
##baby
|
9382 |
-
sound
|
9383 |
-
bbe
|
9384 |
-
##lone
|
9385 |
-
##uid
|
9386 |
-
ads
|
9387 |
-
gundam
|
9388 |
-
thinkpad
|
9389 |
-
scrum
|
9390 |
-
match
|
9391 |
-
##ave
|
9392 |
-
mems
|
9393 |
-
##oy
|
9394 |
-
##talk
|
9395 |
-
glass
|
9396 |
-
lamigo
|
9397 |
-
span
|
9398 |
-
##eme
|
9399 |
-
job
|
9400 |
-
##a5
|
9401 |
-
jay
|
9402 |
-
wade
|
9403 |
-
kde
|
9404 |
-
##lace
|
9405 |
-
ocean
|
9406 |
-
tvg
|
9407 |
-
##covery
|
9408 |
-
##r3
|
9409 |
-
##ners
|
9410 |
-
##rea
|
9411 |
-
junior
|
9412 |
-
think
|
9413 |
-
##aine
|
9414 |
-
cover
|
9415 |
-
##ision
|
9416 |
-
##sia
|
9417 |
-
##bow
|
9418 |
-
msi
|
9419 |
-
##love
|
9420 |
-
soft
|
9421 |
-
z2
|
9422 |
-
##pl
|
9423 |
-
mobil
|
9424 |
-
mind
|
9425 |
-
##uy
|
9426 |
-
nginx
|
9427 |
-
##oi
|
9428 |
-
##rr
|
9429 |
-
##mple
|
9430 |
-
##sson
|
9431 |
-
##nts
|
9432 |
-
comhd
|
9433 |
-
crv3000
|
9434 |
-
##uard
|
9435 |
-
deep
|
9436 |
-
lost
|
9437 |
-
field
|
9438 |
-
gallery
|
9439 |
-
##bia
|
9440 |
-
rate
|
9441 |
-
spf
|
9442 |
-
redis
|
9443 |
-
traction
|
9444 |
-
icloud
|
9445 |
-
fe
|
9446 |
-
jose
|
9447 |
-
##tory
|
9448 |
-
into
|
9449 |
-
sohu
|
9450 |
-
fx
|
9451 |
-
kicstart2
|
9452 |
-
##hia
|
9453 |
-
##sit
|
9454 |
-
ra
|
9455 |
-
##walk
|
9456 |
-
##xure
|
9457 |
-
##pact
|
9458 |
-
pacific
|
9459 |
-
xa
|
9460 |
-
natural
|
9461 |
-
carlo
|
9462 |
-
##walker
|
9463 |
-
##can
|
9464 |
-
cto
|
9465 |
-
gigi
|
9466 |
-
pen
|
9467 |
-
##hoo
|
9468 |
-
ob
|
9469 |
-
matlab
|
9470 |
-
##yy
|
9471 |
-
##iti
|
9472 |
-
mango
|
9473 |
-
##bbs
|
9474 |
-
sense
|
9475 |
-
c5
|
9476 |
-
oxford
|
9477 |
-
walker
|
9478 |
-
jennifer
|
9479 |
-
##ola
|
9480 |
-
course
|
9481 |
-
##bre
|
9482 |
-
##pus
|
9483 |
-
##rder
|
9484 |
-
lucky
|
9485 |
-
ivy
|
9486 |
-
##nia
|
9487 |
-
sotheby
|
9488 |
-
side
|
9489 |
-
##ugh
|
9490 |
-
joy
|
9491 |
-
##orage
|
9492 |
-
##ush
|
9493 |
-
##bat
|
9494 |
-
##dt
|
9495 |
-
r9
|
9496 |
-
##gio
|
9497 |
-
country
|
9498 |
-
wear
|
9499 |
-
##lax
|
9500 |
-
##moon
|
9501 |
-
seven
|
9502 |
-
study
|
9503 |
-
lonzo
|
9504 |
-
evolution
|
9505 |
-
##kk
|
9506 |
-
gs
|
9507 |
-
kd
|
9508 |
-
arduino
|
9509 |
-
b12
|
9510 |
-
##lux
|
9511 |
-
arpg
|
9512 |
-
##rdon
|
9513 |
-
cook
|
9514 |
-
##x5
|
9515 |
-
dark
|
9516 |
-
five
|
9517 |
-
##als
|
9518 |
-
##ida
|
9519 |
-
sign
|
9520 |
-
something
|
9521 |
-
##nda
|
9522 |
-
##posted
|
9523 |
-
fresh
|
9524 |
-
tf
|
9525 |
-
cam
|
9526 |
-
##mine
|
9527 |
-
##skip
|
9528 |
-
##form
|
9529 |
-
##ssion
|
9530 |
-
education
|
9531 |
-
##tee
|
9532 |
-
dyson
|
9533 |
-
stage
|
9534 |
-
##jie
|
9535 |
-
want
|
9536 |
-
##night
|
9537 |
-
epson
|
9538 |
-
pack
|
9539 |
-
##ppy
|
9540 |
-
##█
|
9541 |
-
wd
|
9542 |
-
##eh
|
9543 |
-
##rence
|
9544 |
-
left
|
9545 |
-
##lvin
|
9546 |
-
golden
|
9547 |
-
mhz
|
9548 |
-
discovery
|
9549 |
-
##trix
|
9550 |
-
##n2
|
9551 |
-
loft
|
9552 |
-
##uch
|
9553 |
-
##dra
|
9554 |
-
##sse
|
9555 |
-
speed
|
9556 |
-
sorry
|
9557 |
-
welcome
|
9558 |
-
##urn
|
9559 |
-
wave
|
9560 |
-
gaga
|
9561 |
-
##lmer
|
9562 |
-
teddy
|
9563 |
-
rp
|
9564 |
-
##sha
|
9565 |
-
rar
|
9566 |
-
holiday
|
9567 |
-
##vg
|
9568 |
-
##nos
|
9569 |
-
##rail
|
9570 |
-
gartner
|
9571 |
-
gi
|
9572 |
-
##dium
|
9573 |
-
kit
|
9574 |
-
b3
|
9575 |
-
eco
|
9576 |
-
sean
|
9577 |
-
##stone
|
9578 |
-
autocad
|
9579 |
-
nu
|
9580 |
-
##np
|
9581 |
-
f16
|
9582 |
-
write
|
9583 |
-
m5
|
9584 |
-
##ias
|
9585 |
-
images
|
9586 |
-
atp
|
9587 |
-
##dk
|
9588 |
-
fsm
|
9589 |
-
ve
|
9590 |
-
##xxx
|
9591 |
-
##cake
|
9592 |
-
unit
|
9593 |
-
lim
|
9594 |
-
ru
|
9595 |
-
##ification
|
9596 |
-
published
|
9597 |
-
angela
|
9598 |
-
analytics
|
9599 |
-
ak
|
9600 |
-
##nel
|
9601 |
-
gmt
|
9602 |
-
##icon
|
9603 |
-
again
|
9604 |
-
##₂
|
9605 |
-
##bby
|
9606 |
-
ios11
|
9607 |
-
waze
|
9608 |
-
##ust
|
9609 |
-
framework
|
9610 |
-
iptv
|
9611 |
-
delete
|
9612 |
-
cl
|
9613 |
-
wwdc
|
9614 |
-
##fw
|
9615 |
-
##xon
|
9616 |
-
brandt
|
9617 |
-
##ses
|
9618 |
-
##dragon
|
9619 |
-
tc
|
9620 |
-
vetements
|
9621 |
-
anne
|
9622 |
-
monte
|
9623 |
-
modern
|
9624 |
-
official
|
9625 |
-
##ere
|
9626 |
-
##nne
|
9627 |
-
##oud
|
9628 |
-
etnews
|
9629 |
-
##a2
|
9630 |
-
##graphy
|
9631 |
-
##rtex
|
9632 |
-
l2
|
9633 |
-
##gma
|
9634 |
-
mount
|
9635 |
-
ccd
|
9636 |
-
archive
|
9637 |
-
morning
|
9638 |
-
tan
|
9639 |
-
ddos
|
9640 |
-
e7
|
9641 |
-
day4
|
9642 |
-
gis
|
9643 |
-
its
|
9644 |
-
factory
|
9645 |
-
bruce
|
9646 |
-
pg
|
9647 |
-
##ito
|
9648 |
-
guest
|
9649 |
-
cdma
|
9650 |
-
##lling
|
9651 |
-
n3
|
9652 |
-
mega
|
9653 |
-
eyes
|
9654 |
-
ro
|
9655 |
-
women
|
9656 |
-
dac
|
9657 |
-
church
|
9658 |
-
##jun
|
9659 |
-
singapore
|
9660 |
-
##facebook
|
9661 |
-
starbucks
|
9662 |
-
##tos
|
9663 |
-
##stin
|
9664 |
-
##shine
|
9665 |
-
zen
|
9666 |
-
##mu
|
9667 |
-
tina
|
9668 |
-
request
|
9669 |
-
##gence
|
9670 |
-
qt
|
9671 |
-
q7
|
9672 |
-
##zzi
|
9673 |
-
diary
|
9674 |
-
##tore
|
9675 |
-
##ead
|
9676 |
-
cst
|
9677 |
-
##osa
|
9678 |
-
canada
|
9679 |
-
agent
|
9680 |
-
va
|
9681 |
-
##jiang
|
9682 |
-
##lam
|
9683 |
-
sg
|
9684 |
-
##nix
|
9685 |
-
##sday
|
9686 |
-
g6
|
9687 |
-
##master
|
9688 |
-
bing
|
9689 |
-
##zl
|
9690 |
-
charlie
|
9691 |
-
nb40
|
9692 |
-
thai
|
9693 |
-
ln284ct
|
9694 |
-
##itz
|
9695 |
-
bonnie
|
9696 |
-
##food
|
9697 |
-
##lent
|
9698 |
-
originals
|
9699 |
-
##stro
|
9700 |
-
##lts
|
9701 |
-
##bscribe
|
9702 |
-
children
|
9703 |
-
ntd
|
9704 |
-
yesstyle
|
9705 |
-
hmv
|
9706 |
-
##tment
|
9707 |
-
d5
|
9708 |
-
arts
|
9709 |
-
sms
|
9710 |
-
##pn
|
9711 |
-
topios9
|
9712 |
-
lifestyle
|
9713 |
-
virtual
|
9714 |
-
##ague
|
9715 |
-
xz
|
9716 |
-
##deo
|
9717 |
-
muji
|
9718 |
-
unt
|
9719 |
-
##nnis
|
9720 |
-
faq1
|
9721 |
-
##ette
|
9722 |
-
fly
|
9723 |
-
curry
|
9724 |
-
##pop
|
9725 |
-
release
|
9726 |
-
##cast
|
9727 |
-
##ews
|
9728 |
-
##stle
|
9729 |
-
ios7
|
9730 |
-
##ima
|
9731 |
-
dog
|
9732 |
-
lenovo
|
9733 |
-
##r4
|
9734 |
-
roger
|
9735 |
-
cbs
|
9736 |
-
vornado
|
9737 |
-
##desk
|
9738 |
-
##ald
|
9739 |
-
##van
|
9740 |
-
oil
|
9741 |
-
some
|
9742 |
-
break
|
9743 |
-
common
|
9744 |
-
##jy
|
9745 |
-
##lines
|
9746 |
-
g7
|
9747 |
-
twice
|
9748 |
-
ella
|
9749 |
-
nano
|
9750 |
-
belle
|
9751 |
-
##mes
|
9752 |
-
##self
|
9753 |
-
##note
|
9754 |
-
jb
|
9755 |
-
benz
|
9756 |
-
##ova
|
9757 |
-
save
|
9758 |
-
##wing
|
9759 |
-
kai
|
9760 |
-
##hua
|
9761 |
-
##rect
|
9762 |
-
rainer
|
9763 |
-
##unge
|
9764 |
-
adsl
|
9765 |
-
guestname
|
9766 |
-
##uma
|
9767 |
-
##kins
|
9768 |
-
##zu
|
9769 |
-
tokichoi
|
9770 |
-
##price
|
9771 |
-
county
|
9772 |
-
##med
|
9773 |
-
##mus
|
9774 |
-
rmk
|
9775 |
-
address
|
9776 |
-
vm
|
9777 |
-
openload
|
9778 |
-
##group
|
9779 |
-
##hin
|
9780 |
-
##iginal
|
9781 |
-
amg
|
9782 |
-
urban
|
9783 |
-
##oz
|
9784 |
-
jobs
|
9785 |
-
emi
|
9786 |
-
##public
|
9787 |
-
beautiful
|
9788 |
-
##sch
|
9789 |
-
album
|
9790 |
-
##dden
|
9791 |
-
##bell
|
9792 |
-
jerry
|
9793 |
-
works
|
9794 |
-
hostel
|
9795 |
-
miller
|
9796 |
-
##drive
|
9797 |
-
##rmin
|
9798 |
-
boot
|
9799 |
-
##fx
|
9800 |
-
##nome
|
9801 |
-
##ctionary
|
9802 |
-
##oman
|
9803 |
-
##lish
|
9804 |
-
##cr
|
9805 |
-
##hm
|
9806 |
-
##how
|
9807 |
-
francis
|
9808 |
-
xi
|
9809 |
-
c919
|
9810 |
-
b5
|
9811 |
-
evernote
|
9812 |
-
##uc
|
9813 |
-
vga
|
9814 |
-
coupe
|
9815 |
-
##urg
|
9816 |
-
##cca
|
9817 |
-
##uality
|
9818 |
-
multi
|
9819 |
-
##ett
|
9820 |
-
em
|
9821 |
-
hey
|
9822 |
-
##ani
|
9823 |
-
##tax
|
9824 |
-
##rma
|
9825 |
-
inside
|
9826 |
-
than
|
9827 |
-
leonnhurt
|
9828 |
-
##jin
|
9829 |
-
ict
|
9830 |
-
bird
|
9831 |
-
notes
|
9832 |
-
##dical
|
9833 |
-
##lli
|
9834 |
-
result
|
9835 |
-
iu
|
9836 |
-
ee
|
9837 |
-
smap
|
9838 |
-
gopro
|
9839 |
-
##last
|
9840 |
-
yin
|
9841 |
-
pure
|
9842 |
-
##dan
|
9843 |
-
##rame
|
9844 |
-
mama
|
9845 |
-
##oot
|
9846 |
-
bean
|
9847 |
-
marketing
|
9848 |
-
##hur
|
9849 |
-
bella
|
9850 |
-
sync
|
9851 |
-
xuite
|
9852 |
-
##ground
|
9853 |
-
discuz
|
9854 |
-
##getrelax
|
9855 |
-
##ince
|
9856 |
-
##bay
|
9857 |
-
cj
|
9858 |
-
gmat
|
9859 |
-
apt
|
9860 |
-
##pass
|
9861 |
-
jing
|
9862 |
-
##rix
|
9863 |
-
c4
|
9864 |
-
rich
|
9865 |
-
niusnews
|
9866 |
-
##ello
|
9867 |
-
bag
|
9868 |
-
##eting
|
9869 |
-
##mobile
|
9870 |
-
culture
|
9871 |
-
area
|
9872 |
-
##ience
|
9873 |
-
details
|
9874 |
-
gp
|
9875 |
-
universal
|
9876 |
-
silver
|
9877 |
-
dit
|
9878 |
-
private
|
9879 |
-
ddd
|
9880 |
-
u11
|
9881 |
-
kanshu
|
9882 |
-
##ified
|
9883 |
-
fung
|
9884 |
-
##nny
|
9885 |
-
dx
|
9886 |
-
tai
|
9887 |
-
##fr
|
9888 |
-
##lean
|
9889 |
-
##pin
|
9890 |
-
##rin
|
9891 |
-
ly
|
9892 |
-
rick
|
9893 |
-
##bility
|
9894 |
-
usb3
|
9895 |
-
banner
|
9896 |
-
##baru
|
9897 |
-
##gion
|
9898 |
-
metal
|
9899 |
-
dt
|
9900 |
-
vdf
|
9901 |
-
karl
|
9902 |
-
qualcomm
|
9903 |
-
bear
|
9904 |
-
oldid
|
9905 |
-
ian
|
9906 |
-
jo
|
9907 |
-
##tors
|
9908 |
-
population
|
9909 |
-
##ernel
|
9910 |
-
mmorpg
|
9911 |
-
##mv
|
9912 |
-
##bike
|
9913 |
-
ww
|
9914 |
-
friend
|
9915 |
-
##ager
|
9916 |
-
exhibition
|
9917 |
-
##del
|
9918 |
-
##pods
|
9919 |
-
fpx
|
9920 |
-
structure
|
9921 |
-
##free
|
9922 |
-
##tings
|
9923 |
-
kl
|
9924 |
-
##rley
|
9925 |
-
##copyright
|
9926 |
-
##mma
|
9927 |
-
california
|
9928 |
-
orange
|
9929 |
-
yoga
|
9930 |
-
canmake
|
9931 |
-
honey
|
9932 |
-
##anda
|
9933 |
-
nikkie
|
9934 |
-
dhl
|
9935 |
-
publishing
|
9936 |
-
##mall
|
9937 |
-
##gnet
|
9938 |
-
##┅
|
9939 |
-
e88
|
9940 |
-
##dog
|
9941 |
-
fishbase
|
9942 |
-
##!
|
9943 |
-
##"
|
9944 |
-
###
|
9945 |
-
##$
|
9946 |
-
##%
|
9947 |
-
##&
|
9948 |
-
##'
|
9949 |
-
##(
|
9950 |
-
##)
|
9951 |
-
##*
|
9952 |
-
##+
|
9953 |
-
##,
|
9954 |
-
##-
|
9955 |
-
##.
|
9956 |
-
##/
|
9957 |
-
##:
|
9958 |
-
##;
|
9959 |
-
##<
|
9960 |
-
##=
|
9961 |
-
##>
|
9962 |
-
##?
|
9963 |
-
##@
|
9964 |
-
##[
|
9965 |
-
##\
|
9966 |
-
##]
|
9967 |
-
##^
|
9968 |
-
##_
|
9969 |
-
##{
|
9970 |
-
##|
|
9971 |
-
##}
|
9972 |
-
##~
|
9973 |
-
##£
|
9974 |
-
##¤
|
9975 |
-
##¥
|
9976 |
-
##§
|
9977 |
-
##«
|
9978 |
-
##±
|
9979 |
-
##³
|
9980 |
-
##µ
|
9981 |
-
##·
|
9982 |
-
##¹
|
9983 |
-
##º
|
9984 |
-
##»
|
9985 |
-
##¼
|
9986 |
-
##ß
|
9987 |
-
##æ
|
9988 |
-
##÷
|
9989 |
-
##ø
|
9990 |
-
##đ
|
9991 |
-
##ŋ
|
9992 |
-
##ɔ
|
9993 |
-
##ə
|
9994 |
-
##ɡ
|
9995 |
-
##ʰ
|
9996 |
-
##ˇ
|
9997 |
-
##ˈ
|
9998 |
-
##ˊ
|
9999 |
-
##ˋ
|
10000 |
-
##ˍ
|
10001 |
-
##ː
|
10002 |
-
##˙
|
10003 |
-
##˚
|
10004 |
-
##ˢ
|
10005 |
-
##α
|
10006 |
-
##β
|
10007 |
-
##γ
|
10008 |
-
##δ
|
10009 |
-
##ε
|
10010 |
-
##η
|
10011 |
-
##θ
|
10012 |
-
##ι
|
10013 |
-
##κ
|
10014 |
-
##λ
|
10015 |
-
##μ
|
10016 |
-
##ν
|
10017 |
-
##ο
|
10018 |
-
##��
|
10019 |
-
##ρ
|
10020 |
-
##ς
|
10021 |
-
##σ
|
10022 |
-
##τ
|
10023 |
-
##υ
|
10024 |
-
##φ
|
10025 |
-
##χ
|
10026 |
-
##ψ
|
10027 |
-
##б
|
10028 |
-
##в
|
10029 |
-
##г
|
10030 |
-
##д
|
10031 |
-
##е
|
10032 |
-
##ж
|
10033 |
-
##з
|
10034 |
-
##к
|
10035 |
-
##л
|
10036 |
-
##м
|
10037 |
-
##н
|
10038 |
-
##о
|
10039 |
-
##п
|
10040 |
-
##р
|
10041 |
-
##с
|
10042 |
-
##т
|
10043 |
-
##у
|
10044 |
-
##ф
|
10045 |
-
##х
|
10046 |
-
##ц
|
10047 |
-
##ч
|
10048 |
-
##ш
|
10049 |
-
##ы
|
10050 |
-
##ь
|
10051 |
-
##і
|
10052 |
-
##ก
|
10053 |
-
##ง
|
10054 |
-
##น
|
10055 |
-
##ม
|
10056 |
-
##ย
|
10057 |
-
##ร
|
10058 |
-
##อ
|
10059 |
-
##า
|
10060 |
-
##เ
|
10061 |
-
##๑
|
10062 |
-
##་
|
10063 |
-
##ღ
|
10064 |
-
##ᵃ
|
10065 |
-
##ᵉ
|
10066 |
-
##ᵍ
|
10067 |
-
##ᵏ
|
10068 |
-
##ᵐ
|
10069 |
-
##ᵒ
|
10070 |
-
##ᵘ
|
10071 |
-
##‖
|
10072 |
-
##„
|
10073 |
-
##†
|
10074 |
-
##•
|
10075 |
-
##‥
|
10076 |
-
##‧
|
10077 |
-
##
|
10078 |
-
##‰
|
10079 |
-
##′
|
10080 |
-
##″
|
10081 |
-
##‹
|
10082 |
-
##›
|
10083 |
-
##※
|
10084 |
-
##‿
|
10085 |
-
##⁄
|
10086 |
-
##ⁱ
|
10087 |
-
##⁺
|
10088 |
-
##ⁿ
|
10089 |
-
##₁
|
10090 |
-
##₃
|
10091 |
-
##₄
|
10092 |
-
##€
|
10093 |
-
##№
|
10094 |
-
##ⅰ
|
10095 |
-
##ⅱ
|
10096 |
-
##ⅲ
|
10097 |
-
##ⅳ
|
10098 |
-
##ⅴ
|
10099 |
-
##⇒
|
10100 |
-
##∀
|
10101 |
-
##−
|
10102 |
-
##∕
|
10103 |
-
##∙
|
10104 |
-
##√
|
10105 |
-
##∞
|
10106 |
-
##∟
|
10107 |
-
##∠
|
10108 |
-
##∣
|
10109 |
-
##∩
|
10110 |
-
##∮
|
10111 |
-
##∶
|
10112 |
-
##∼
|
10113 |
-
##∽
|
10114 |
-
##≈
|
10115 |
-
##≒
|
10116 |
-
##≡
|
10117 |
-
##≤
|
10118 |
-
##≥
|
10119 |
-
##≦
|
10120 |
-
##≧
|
10121 |
-
##≪
|
10122 |
-
##≫
|
10123 |
-
##⊙
|
10124 |
-
##⋅
|
10125 |
-
##⋈
|
10126 |
-
##⋯
|
10127 |
-
##⌒
|
10128 |
-
##①
|
10129 |
-
##②
|
10130 |
-
##③
|
10131 |
-
##④
|
10132 |
-
##⑤
|
10133 |
-
##⑥
|
10134 |
-
##⑦
|
10135 |
-
##⑧
|
10136 |
-
##⑨
|
10137 |
-
##⑩
|
10138 |
-
##⑴
|
10139 |
-
##⑵
|
10140 |
-
##⑶
|
10141 |
-
##⑷
|
10142 |
-
##⑸
|
10143 |
-
##⒈
|
10144 |
-
##⒉
|
10145 |
-
##⒊
|
10146 |
-
##⒋
|
10147 |
-
##ⓒ
|
10148 |
-
##ⓔ
|
10149 |
-
##ⓘ
|
10150 |
-
##━
|
10151 |
-
##┃
|
10152 |
-
##┆
|
10153 |
-
##┊
|
10154 |
-
##┌
|
10155 |
-
##└
|
10156 |
-
##├
|
10157 |
-
##┣
|
10158 |
-
##═
|
10159 |
-
##║
|
10160 |
-
##╚
|
10161 |
-
##╞
|
10162 |
-
##╠
|
10163 |
-
##╭
|
10164 |
-
##╮
|
10165 |
-
##╯
|
10166 |
-
##╰
|
10167 |
-
##╱
|
10168 |
-
##╳
|
10169 |
-
##▂
|
10170 |
-
##▃
|
10171 |
-
##▅
|
10172 |
-
##▇
|
10173 |
-
##▉
|
10174 |
-
##▋
|
10175 |
-
##▌
|
10176 |
-
##▍
|
10177 |
-
##▎
|
10178 |
-
##□
|
10179 |
-
##▬
|
10180 |
-
##△
|
10181 |
-
##►
|
10182 |
-
##▽
|
10183 |
-
##◇
|
10184 |
-
##◕
|
10185 |
-
##◠
|
10186 |
-
##◢
|
10187 |
-
##◤
|
10188 |
-
##☞
|
10189 |
-
##☼
|
10190 |
-
##♡
|
10191 |
-
##♫
|
10192 |
-
##♬
|
10193 |
-
##✕
|
10194 |
-
##✦
|
10195 |
-
##✪
|
10196 |
-
##✰
|
10197 |
-
##✿
|
10198 |
-
##❀
|
10199 |
-
##➜
|
10200 |
-
##➤
|
10201 |
-
##⦿
|
10202 |
-
##、
|
10203 |
-
##。
|
10204 |
-
##〃
|
10205 |
-
##々
|
10206 |
-
##〇
|
10207 |
-
##〈
|
10208 |
-
##〉
|
10209 |
-
##《
|
10210 |
-
##》
|
10211 |
-
##「
|
10212 |
-
##」
|
10213 |
-
##『
|
10214 |
-
##』
|
10215 |
-
##【
|
10216 |
-
##】
|
10217 |
-
##〓
|
10218 |
-
##〔
|
10219 |
-
##〕
|
10220 |
-
##〖
|
10221 |
-
##〗
|
10222 |
-
##〜
|
10223 |
-
##〝
|
10224 |
-
##〞
|
10225 |
-
##ㄧ
|
10226 |
-
##ㆍ
|
10227 |
-
##㈦
|
10228 |
-
##㊣
|
10229 |
-
##㗎
|
10230 |
-
##︰
|
10231 |
-
##︱
|
10232 |
-
##︶
|
10233 |
-
##︿
|
10234 |
-
##﹁
|
10235 |
-
##﹂
|
10236 |
-
##﹍
|
10237 |
-
##﹏
|
10238 |
-
##﹐
|
10239 |
-
##﹑
|
10240 |
-
##﹒
|
10241 |
-
##﹔
|
10242 |
-
##﹕
|
10243 |
-
##﹖
|
10244 |
-
##﹗
|
10245 |
-
##﹙
|
10246 |
-
##﹚
|
10247 |
-
##﹝
|
10248 |
-
##﹞
|
10249 |
-
##﹡
|
10250 |
-
##﹣
|
10251 |
-
##!
|
10252 |
-
##(
|
10253 |
-
##)
|
10254 |
-
##,
|
10255 |
-
##:
|
10256 |
-
##;
|
10257 |
-
##?
|
10258 |
-
##。
|
10259 |
-
##「
|
10260 |
-
##」
|
10261 |
-
##、
|
10262 |
-
##・
|
10263 |
-
##ッ
|
10264 |
-
##ー
|
10265 |
-
##゙
|
10266 |
-
##゚
|
10267 |
-
## ̄
|
10268 |
-
##¥
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
vocab.py
ADDED
@@ -0,0 +1,453 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from patcher import tiktoken_patch
|
2 |
+
import tiktoken
|
3 |
+
from transformers import AutoTokenizer
|
4 |
+
from enum import Enum, auto
|
5 |
+
from dataclasses import dataclass, field
|
6 |
+
|
7 |
+
from utils.log_util import logger
|
8 |
+
from typing import Dict, Any, Union
|
9 |
+
|
10 |
+
"""Interface:
|
11 |
+
tokenizer.encode
|
12 |
+
tokenizer.decode
|
13 |
+
tokenizer.convert_tokens_to_string # gpt4 没有这个方法
|
14 |
+
tokenizer.convert_ids_to_tokens
|
15 |
+
|
16 |
+
|
17 |
+
tokenizer.parent = ""
|
18 |
+
tokenizer.vocab_size
|
19 |
+
tokenizer.get_vocab() # gpt-neox-20b, llama
|
20 |
+
tokenizer.type = TokenizerType.ByteBPE.name
|
21 |
+
tokenizer.implementation = TokenizerImpl.SentencePiece.name # https://github.com/facebookresearch/llama/blob/main/llama/tokenizer.py
|
22 |
+
"HFGPT2Tokenizer", "HFTokenizer", "GPT2BPETokenizer", "CharLevelTokenizer", "TiktokenTokenizer", "SPMTokenizer", https://github.com/EleutherAI/gpt-neox/blob/main/tools/preprocess_data.py
|
23 |
+
|
24 |
+
|
25 |
+
tokenizer.comments = "split all numbers into individual digits, " \
|
26 |
+
"and fallback to bytes to decompose unknown UTF-8 characters"
|
27 |
+
|
28 |
+
tokenizer.all_special_tokens # baichuan
|
29 |
+
tokenizer.special_tokens_set # gpt3.5_turbo
|
30 |
+
tokenizer.special_tokens_map
|
31 |
+
"""
|
32 |
+
|
33 |
+
|
34 |
+
class TokenizerImpl(Enum):
|
35 |
+
"""
|
36 |
+
- https://github.com/huggingface/tokenizers/blob/main/bindings/python/py_src/tokenizers/implementations/__init__.py
|
37 |
+
- https://huggingface.co/docs/transformers/tokenizer_summary
|
38 |
+
- https://github.com/EleutherAI/gpt-neox/blob/main/megatron/tokenizer/tokenizer.py
|
39 |
+
|
40 |
+
## google/BertTokenizer
|
41 |
+
- https://github.com/huggingface/tokenizers/blob/main/bindings/python/py_src/tokenizers/implementations/bert_wordpiece.py
|
42 |
+
- 特征
|
43 |
+
- 算法:BERT的编码器是 BPE-WordPiece,将单词拆分成多个前缀符号(比如BERT中的##)最小单元
|
44 |
+
- 词典:有##开头的token,表示subword,
|
45 |
+
- 中文采用char粒度分词
|
46 |
+
- 英文采用 WordPiece
|
47 |
+
|
48 |
+
|
49 |
+
|
50 |
+
|
51 |
+
## google/sentencepiece
|
52 |
+
- https://github.com/google/sentencepiece/blob/3863f7648e5d8edb571ac592f3ac4f5f0695275a/src/sentencepiece_model.proto#L48
|
53 |
+
- 支持 sentencepiece 和 wordpiece
|
54 |
+
- sentencepiece 有byte-bpe吗?
|
55 |
+
- UNIGRAM = 1; // Unigram language model with dynamic algorithm
|
56 |
+
- BPE = 2; // Byte Pair Encoding
|
57 |
+
- WORD = 3; // Delimitered by whitespace.
|
58 |
+
- CHAR = 4; // tokenizes into character sequence
|
59 |
+
- wordpiece
|
60 |
+
- 特征:
|
61 |
+
- 训练: spm_train --model_type unigram/bpe/char/word
|
62 |
+
- 特殊符号: Ġ
|
63 |
+
- 文件: *.sp_model 或 *.model (可选文件 .vocab,) spm简称 (其他格式比如 tokenizer.json是给hf_tokenizer兼容用的)
|
64 |
+
- 实现:
|
65 |
+
- 依赖: protobuf
|
66 |
+
- 训练: `import sentencepiece as spm; spm.SentencePieceTrainer.train` 或 `spm_train`
|
67 |
+
- 加载: `import sentencepiece as spm; spm.SentencePieceProcessor().Load(vocab_file)`
|
68 |
+
- 方法: 是SentencePieceProcessor类型,sp_model.id_to_piece,有tokenizer.json tokenizer.model,
|
69 |
+
- 分词:
|
70 |
+
- pre_tokenizers.ByteLevel(add_prefix_space=True, use_regex=False)
|
71 |
+
- 词典: 词典字符有 ▁ (U+2581) ,表示空格或句首。
|
72 |
+
- 示例:google-t5, llama,baichuan, orion,
|
73 |
+
- llama: tokenizer.json(包含model.vocab model.merges) tokenizer.model
|
74 |
+
- grok: 原始是 .model文件,后面转成了 tokenizer.json
|
75 |
+
- google-t5: tokenizer.json, spiece.model
|
76 |
+
- Skywork-13B-Math: tokenizer.model
|
77 |
+
- xlm_roberta: sentencepiece.bpe.model
|
78 |
+
- GPT2Tokenizer
|
79 |
+
- tokenizer.json, vocab.json, merges.txt (https://huggingface.co/openai-community/gpt2)
|
80 |
+
- vocab.bpe, encoder.json, dict.txt (fairseq版本,不常用,可以忽略这个版本)
|
81 |
+
|
82 |
+
|
83 |
+
|
84 |
+
## thu/icetk
|
85 |
+
- icetk: sentencepiece的分支,支持image_tokenizer。
|
86 |
+
- glm, chatglm1, chatglm2
|
87 |
+
|
88 |
+
## huggingface/tokenizers
|
89 |
+
- https://github.com/huggingface/tokenizers
|
90 |
+
- VS sentencepiece
|
91 |
+
- 支持sentencepiece
|
92 |
+
- .model转化为 (merges.txt + vocab.json) 或者 tokenizer.json
|
93 |
+
- https://github.com/huggingface/tokenizers/blob/main/bindings/python/scripts/sentencepiece_extractor.py
|
94 |
+
- 加载 merges.txt, vocab.json
|
95 |
+
- SentencePieceBPETokenizer https://github.com/huggingface/tokenizers/blob/v0.19.1/bindings/python/py_src/tokenizers/implementations/sentencepiece_bpe.py#L10
|
96 |
+
- 在 sentencepiece基础上,hf_tokenizer支持pre-tokenization的正则表达式,对tab和换行支持更好,支持special token
|
97 |
+
- 类型: 支持 BBPE, WordPiece or Unigram
|
98 |
+
- 特征:
|
99 |
+
- 文件: tokenizer.json(包含后两个文件的内容), merges.txt, vocab.json
|
100 |
+
- added_tokens 在vocab中不一定存在。
|
101 |
+
- 实现:
|
102 |
+
- 训练: `from tokenizers.trainers import BpeTrainer, UnigramTrainer, WordLevelTrainer, WordPieceTrainer`
|
103 |
+
- 加载:
|
104 |
+
- 方法: .model.from_file .model.save .model.token_to_id .model.tokenize
|
105 |
+
- .model 是 tokenizer.models.BPE 类型
|
106 |
+
- 词典有 Ġ "\u0120" 开头
|
107 |
+
- 优势
|
108 |
+
-
|
109 |
+
- 示例:gpt2, gpt_neox_20b, moss, bloom, qwen2
|
110 |
+
- 优势:相对sentence piece,
|
111 |
+
- ss
|
112 |
+
|
113 |
+
## openai/tiktoken
|
114 |
+
- 特征:空格就是空格,
|
115 |
+
- 示例:gpt3.5 gpt4, qwen,
|
116 |
+
"""
|
117 |
+
""" 算法体系 https://www.huaxiaozhuan.com/%E5%B7%A5%E5%85%B7/huggingface_transformer/chapters/1_tokenizer.html
|
118 |
+
- word-base tokenizer:
|
119 |
+
- char-base tokenizer:
|
120 |
+
- subword-based Tokenizer
|
121 |
+
- BPE
|
122 |
+
- byte-bpe: base vocabulary大小是256
|
123 |
+
- WordPiece:
|
124 |
+
- 相比BPE,WordPiece 仅保存最终词表,而不保存学到的 merge rule
|
125 |
+
- Unigram
|
126 |
+
- SentencePiece
|
127 |
+
|
128 |
+
"""
|
129 |
+
|
130 |
+
# 分类体系:https://github.com/huggingface/tokenizers/blob/main/bindings/python/py_src/tokenizers/implementations/
|
131 |
+
BertTokenizer = "wordpiece.BertTokenizer"
|
132 |
+
JapaneseTokenizer = ("wordpiece.MecabTokenizer", "https://github.com/polm/fugashi") # 常用日语包 ipadic,fugashi,
|
133 |
+
ByteLevelBPETokenizer = "byte_level_bpe" # BBPE
|
134 |
+
SentencePieceBPETokenizer = "sentencepiece_bpe"
|
135 |
+
|
136 |
+
# 分类体系
|
137 |
+
|
138 |
+
# SentencePeice(BPE)
|
139 |
+
SentencePiece = auto() # sentencepiece.bpe, sentencepiece.unigram, sentencepiece.char, sentencepiece.word,
|
140 |
+
byte_level_bpe = auto()
|
141 |
+
# HFTokenizer = auto() # , 支持
|
142 |
+
TikToken = auto()
|
143 |
+
# subword-nmt
|
144 |
+
# WordPiece
|
145 |
+
|
146 |
+
|
147 |
+
# load_vocab_with_SPECIAL_TOKEN = True # 如果不包含会导致计算词典大小错误、overlap_token计算不一致。
|
148 |
+
|
149 |
+
|
150 |
+
@dataclass
|
151 |
+
class TokenizerConfig:
|
152 |
+
"""
|
153 |
+
https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/blob/main/src/leaderboard/read_evals.py
|
154 |
+
"""
|
155 |
+
name_or_path: str # org/model (path on hub), as unique id
|
156 |
+
name_display: str = None #
|
157 |
+
impl: TokenizerImpl = None # implementation, tokenizer_class/type
|
158 |
+
org: str = None
|
159 |
+
link: str = None # http://**
|
160 |
+
desc: str = None # description
|
161 |
+
meta: str = None
|
162 |
+
level: str = None # char-level, word-level, byte-level
|
163 |
+
init_kwargs: Dict[str, Any] = field(default_factory=dict, )
|
164 |
+
|
165 |
+
def __post_init__(self):
|
166 |
+
if self.link is None:
|
167 |
+
self.link = "https://huggingface.co/" + self.name_or_path # TODO + revision
|
168 |
+
if self.name_display is None:
|
169 |
+
self.name_display = self.name_or_path
|
170 |
+
|
171 |
+
@classmethod
|
172 |
+
def init_from_json_file(cls, json_filepath: str) -> 'TokenizerConfig':
|
173 |
+
pass
|
174 |
+
|
175 |
+
def __eq__(self, other):
|
176 |
+
if isinstance(other, self.__class__):
|
177 |
+
return self.__dict__ == other.__dict__
|
178 |
+
else:
|
179 |
+
return False
|
180 |
+
|
181 |
+
def __hash__(self):
|
182 |
+
return hash(self.name_or_path)
|
183 |
+
|
184 |
+
|
185 |
+
# format: , description, hf_path, tokenizer_class/type, comments, Organization
|
186 |
+
# TODO: append link and description to the end of dropdown button.
|
187 |
+
_all_tokenizer_config = [
|
188 |
+
##### bert 系列
|
189 |
+
TokenizerConfig("google-bert/bert-base-cased", impl=TokenizerImpl.BertTokenizer, org="Google",
|
190 |
+
desc="first add whitespace around any CJK character, then perform wordpiece tokenization."),
|
191 |
+
TokenizerConfig("google-bert/bert-base-uncased", impl=TokenizerImpl.BertTokenizer, org="Google",
|
192 |
+
desc="first add whitespace around any CJK character, then perform wordpiece tokenization."),
|
193 |
+
TokenizerConfig("google-bert/bert-base-chinese", impl=TokenizerImpl.BertTokenizer, org="Google",
|
194 |
+
desc="first add whitespace around any CJK character, then perform wordpiece tokenization."),
|
195 |
+
TokenizerConfig("google-bert/bert-base-german-cased", impl=TokenizerImpl.BertTokenizer, org="Google"),
|
196 |
+
TokenizerConfig("dbmdz/bert-base-german-uncased", impl=TokenizerImpl.BertTokenizer, org="dbmdz"),
|
197 |
+
TokenizerConfig("google-bert/bert-base-multilingual-uncased", impl=TokenizerImpl.BertTokenizer, org="Google"),
|
198 |
+
TokenizerConfig("google-bert/bert-base-multilingual-cased", impl=TokenizerImpl.BertTokenizer, org="Google"),
|
199 |
+
TokenizerConfig("tohoku-nlp/bert-base-japanese", impl=TokenizerImpl.BertTokenizer, org="Tohoku",
|
200 |
+
desc="The texts are first tokenized by MeCab morphological parser with the IPA dictionary, "
|
201 |
+
"then split into subwords by the WordPiece algorithm."),
|
202 |
+
TokenizerConfig("clue/roberta_chinese_clue_tiny", name_display="clue/roberta-chinese-clue",
|
203 |
+
impl=TokenizerImpl.BertTokenizer, org="CLUE",
|
204 |
+
init_kwargs={"revision": "refs/pr/1"},
|
205 |
+
desc="",
|
206 |
+
meta="去掉了繁体字, https://github.com/CLUEbenchmark/CLUEPretrainedModels/blob/master/README.md"),
|
207 |
+
TokenizerConfig("eson/kplug-base-encoder", name_display="eson/kplug", impl=TokenizerImpl.BertTokenizer, org="JD"),
|
208 |
+
TokenizerConfig("ckiplab/gpt2-base-chinese", impl=TokenizerImpl.BertTokenizer, org="SINICA"), # 台湾中央研究院
|
209 |
+
# WoBERT
|
210 |
+
# WoBERT Plus https://github.com/ZhuiyiTechnology/WoBERT
|
211 |
+
|
212 |
+
|
213 |
+
##### GPT2Tokenizer
|
214 |
+
TokenizerConfig("openai-community/gpt2", impl=TokenizerImpl.SentencePiece, org="OpenAI"),
|
215 |
+
# byte-level BPE,没有byte,是unicode-level的吗?
|
216 |
+
TokenizerConfig("ClassCat/gpt2-base-french", impl=TokenizerImpl.SentencePiece, org="ClassCat"),
|
217 |
+
TokenizerConfig("ClassCat/gpt2-base-spanish", impl=TokenizerImpl.SentencePiece, org="ClassCat"),
|
218 |
+
TokenizerConfig("fnlp/moss-moon-003-sft", impl=TokenizerImpl.SentencePiece, init_kwargs={"revision": "refs/pr/6"},
|
219 |
+
org="Fudan",
|
220 |
+
desc="This tokenizer has been trained to treat spaces like parts of the tokens "
|
221 |
+
"(a bit like sentencepiece) so a word will be encoded differently whether "
|
222 |
+
"it is at the beginning of the sentence (without space) or not",
|
223 |
+
meta="在gpt2词典基础上,扩充了5万中文"),
|
224 |
+
TokenizerConfig("bigscience/bloom", impl=TokenizerImpl.SentencePiece, org="BigScience",
|
225 |
+
meta="比gpt_neox的词典 对中文支持更好。"),
|
226 |
+
# ("bloomz_6b4_zh",
|
227 |
+
# ("BelleGroup/BELLE-7B-2M", # 模型和词典都基于bloom
|
228 |
+
#
|
229 |
+
TokenizerConfig("EleutherAI/gpt-neox-20b", impl=TokenizerImpl.SentencePiece, org="EleutherAI"), # 5万
|
230 |
+
TokenizerConfig("cyberagent/open-calm-7b", impl=TokenizerImpl.SentencePiece, org="CyberAgent"), # GPTNeoXTokenizer
|
231 |
+
TokenizerConfig("abeja/gpt-neox-japanese-2.7b", impl=TokenizerImpl.SentencePiece, org="ABEJA"),
|
232 |
+
TokenizerConfig("Qwen/Qwen1.5-14B-Chat", name_display="Qwen/Qwen1.5", impl=TokenizerImpl.SentencePiece, org="Alibaba"), # 15万,速度有点慢
|
233 |
+
TokenizerConfig("HuggingFaceH4/starchat-alpha", impl=TokenizerImpl.SentencePiece, org="-"),
|
234 |
+
|
235 |
+
####### google/sentencepiece tokenizer:
|
236 |
+
# T5 llama internlm
|
237 |
+
TokenizerConfig("google-t5/t5-large", name_display="google-t5/t5", impl=TokenizerImpl.SentencePiece, org="Google"),
|
238 |
+
# t5_small, t5_base, t5_large, flan_t5_base,
|
239 |
+
# ("t5_base", "", "sentencepiece"),
|
240 |
+
# TokenizerConfig("google/flan-t5-base", impl=TokenizerImpl.SentencePiece, ),
|
241 |
+
TokenizerConfig("lmsys/fastchat-t5-3b-v1.0", impl=TokenizerImpl.SentencePiece,
|
242 |
+
org="LMSYS",
|
243 |
+
init_kwargs={"use_fast": False} # 解决 pyo3_runtime.PanicException: AddedVocabulary bad split
|
244 |
+
),
|
245 |
+
TokenizerConfig("CohereForAI/aya-101", org="Cohere For AI"), # "tokenizer_class": "T5Tokenizer",
|
246 |
+
|
247 |
+
TokenizerConfig("ClueAI/ChatYuan-large-v2", impl=TokenizerImpl.SentencePiece, org="CLUE"),
|
248 |
+
TokenizerConfig("ClueAI/PromptCLUE-base", impl=TokenizerImpl.SentencePiece, org="CLUE"),
|
249 |
+
TokenizerConfig("gradientai/Llama-3-8B-Instruct-Gradient-1048k", name_display="Meta/llama3",
|
250 |
+
impl=TokenizerImpl.SentencePiece, org="Meta",
|
251 |
+
desc="llama split all numbers into individual digits, and fallback to bytes to decompose unknown UTF-8 characters"),
|
252 |
+
# byte-level BPE
|
253 |
+
# '中文单字': 700, '中文多字': 0
|
254 |
+
TokenizerConfig("NousResearch/Llama-2-7b-chat-hf", name_display="Meta/llama2", impl=TokenizerImpl.SentencePiece,
|
255 |
+
org="Meta"),
|
256 |
+
TokenizerConfig("huggyllama/llama-7b", name_display="Meta/llama", impl=TokenizerImpl.SentencePiece, org="Meta"),
|
257 |
+
TokenizerConfig("hpcai-tech/grok-1", name_display="xai-org/grok-1", impl=TokenizerImpl.SentencePiece, org="xAI"),
|
258 |
+
# 由.model文件转化为了
|
259 |
+
TokenizerConfig("hfl/chinese-llama-lora-7b", impl=TokenizerImpl.SentencePiece, org="-",
|
260 |
+
meta="向原始LLaMA的词汇表中添加2w个中文词汇,针对原版LLaMA模型扩充了中文词表, 提升了中文编解码效率"),
|
261 |
+
#
|
262 |
+
TokenizerConfig("hfl/chinese-llama-2-7b", impl=TokenizerImpl.SentencePiece, org="-",
|
263 |
+
meta="重新设计了新词表(大小:55296),进一步提升了中文字词的覆盖程度"), #
|
264 |
+
TokenizerConfig("hfl/llama-3-chinese-8b", impl=TokenizerImpl.SentencePiece, org="-"),
|
265 |
+
TokenizerConfig("hfl/chinese-alpaca-lora-7b", impl=TokenizerImpl.SentencePiece, org="-"),
|
266 |
+
# 中文Alpaca模型在上述中文LLaMA模型的基础上进一步使用了指令数据进行精调。 "比chinese_llama词典多一个`[PAD]`,请勿混用"
|
267 |
+
#
|
268 |
+
# ("belle_llama_ext_7b",
|
269 |
+
# ("alpaca_7b",
|
270 |
+
TokenizerConfig("baichuan-inc/Baichuan-7B", name_display="baichuan-inc/baichuan",
|
271 |
+
impl=TokenizerImpl.SentencePiece,
|
272 |
+
level="byte-level", org="Baichuan"),
|
273 |
+
TokenizerConfig("baichuan-inc/Baichuan2-7B-Chat", name_display="baichuan-inc/baichuan2",
|
274 |
+
impl=TokenizerImpl.SentencePiece, org="Baichuan",
|
275 |
+
desc="expand the vocabulary size from 64000 in Baichuan1 to 125696"),
|
276 |
+
TokenizerConfig("internlm/internlm-chat-7b", impl=TokenizerImpl.SentencePiece, org="Shanghai AI Lab"),
|
277 |
+
# 上海AI实验室 + 商汤
|
278 |
+
TokenizerConfig("internlm/internlm2-chat-7b", impl=TokenizerImpl.SentencePiece, org="Shanghai AI Lab"),
|
279 |
+
TokenizerConfig("internlm/internlm2-math-7b", impl=TokenizerImpl.SentencePiece, org="Shanghai AI Lab"),
|
280 |
+
TokenizerConfig("internlm/internlm-xcomposer-7b", impl=TokenizerImpl.SentencePiece, org="Shanghai AI Lab"),
|
281 |
+
TokenizerConfig("tiiuae/falcon-7b", impl=TokenizerImpl.SentencePiece, org="TII"),
|
282 |
+
TokenizerConfig("tiiuae/falcon-180b", impl=TokenizerImpl.SentencePiece, org="TII"),
|
283 |
+
TokenizerConfig("Skywork/Skywork-13B-base", impl=TokenizerImpl.SentencePiece, org="Kunlun"),
|
284 |
+
TokenizerConfig("Skywork/Skywork-13B-Math", impl=TokenizerImpl.SentencePiece, org="Kunlun"), # 文件:tokenizer.model
|
285 |
+
TokenizerConfig("FacebookAI/xlm-roberta-base", impl=TokenizerImpl.SentencePiece, org="Facebook"),
|
286 |
+
# 这个的tokenizer.json 为什么没有merges? vocab里为什么有概率值?
|
287 |
+
# "goat",
|
288 |
+
|
289 |
+
# ##### glm系列
|
290 |
+
# "glm_chinese",),
|
291 |
+
TokenizerConfig("THUDM/chatglm-6b", impl=TokenizerImpl.SentencePiece, org="Tsinghua",
|
292 |
+
meta=f"num_image_tokens: {12}; num_image_tokens: {34} ",
|
293 |
+
init_kwargs={"revision": "refs/pr/100"}),
|
294 |
+
TokenizerConfig("THUDM/chatglm2-6b", impl=TokenizerImpl.SentencePiece, org="Tsinghua", ),
|
295 |
+
TokenizerConfig("THUDM/chatglm3-6b", impl=TokenizerImpl.SentencePiece, org="Tsinghua", ),
|
296 |
+
TokenizerConfig("thu-coai/CharacterGLM-6B", impl=TokenizerImpl.SentencePiece, org="Tsinghua", ),
|
297 |
+
|
298 |
+
# tiktoken 系列
|
299 |
+
TokenizerConfig("openai/text-davinci-003", impl=TokenizerImpl.TikToken, org="OpenAI",
|
300 |
+
link="https://github.com/openai/tiktoken"),
|
301 |
+
#
|
302 |
+
TokenizerConfig("openai/code-davinci-002", impl=TokenizerImpl.TikToken, org="OpenAI",
|
303 |
+
link="https://github.com/openai/tiktoken"),
|
304 |
+
TokenizerConfig("openai/gpt-3.5-turbo", impl=TokenizerImpl.TikToken, org="OpenAI",
|
305 |
+
link="https://github.com/openai/tiktoken",
|
306 |
+
desc="tiktoken is a fast BPE tokeniser for use with OpenAI's models. There are 16 tokens KeyError"),
|
307 |
+
TokenizerConfig("openai/gpt-4", impl=TokenizerImpl.TikToken, org="OpenAI",
|
308 |
+
link="https://github.com/openai/tiktoken", ),
|
309 |
+
TokenizerConfig("openai/gpt-4o", impl=TokenizerImpl.TikToken, org="OpenAI",
|
310 |
+
link="https://github.com/openai/tiktoken", ),
|
311 |
+
TokenizerConfig("Qwen/Qwen-7B-Chat", name_display="Qwen/Qwen", impl=TokenizerImpl.TikToken, org="Alibaba",
|
312 |
+
init_kwargs={"revision": "refs/pr/56"},
|
313 |
+
meta="在gpt4词典基础上,删除了100个多数字token,增加10000中文词token;并优化了special_token的分词"),
|
314 |
+
# https://huggingface.co/Qwen/Qwen-7B-Chat#%E6%A8%A1%E5%9E%8B%E7%BB%86%E8%8A%82%EF%BC%88model%EF%BC%89
|
315 |
+
# 该词表在GPT-4使用的BPE词表cl100k_base基础上,对中文、多语言进行了优化,在对中、英、代码数据的高效编解码的基础上,
|
316 |
+
# 对部分多语言更加友好,方便用户在不扩展词表的情况下对部分语种进行能力增强。 词表对数字按单个数字位切分。
|
317 |
+
|
318 |
+
# TokenizerConfig("Qwen/Qwen-72B-Chat", impl=TokenizerImpl.TikToken),
|
319 |
+
|
320 |
+
# 未分类
|
321 |
+
# ("amber", ""),
|
322 |
+
TokenizerConfig("LLM360/CrystalCoder", org="MBZUAI"),
|
323 |
+
TokenizerConfig("mistralai/Mistral-7B-v0.1", org="Mistral"),
|
324 |
+
TokenizerConfig("mistralai/Mixtral-8x7B-v0.1", org="Mistral"),
|
325 |
+
|
326 |
+
TokenizerConfig("paust/pko-t5-large", org="PAUST"),
|
327 |
+
|
328 |
+
TokenizerConfig("01-ai/Yi-6B", org="Yi"),
|
329 |
+
TokenizerConfig("01-ai/Yi-34B", org="Yi"),
|
330 |
+
TokenizerConfig("01-ai/Yi-VL-34B", org="Yi"),
|
331 |
+
TokenizerConfig("OrionStarAI/Orion-14B-Chat", org="OrionStar"),
|
332 |
+
TokenizerConfig("microsoft/phi-1", org="Microsoft"),
|
333 |
+
TokenizerConfig("microsoft/phi-2", org="Microsoft"),
|
334 |
+
TokenizerConfig("microsoft/Phi-3-mini-4k-instruct", org="Microsoft", meta="即llama vocab"),
|
335 |
+
TokenizerConfig("Upstage/SOLAR-10.7B-v1.0", org="-"),
|
336 |
+
TokenizerConfig("google/mobilebert-uncased", org="Google"),
|
337 |
+
# ("google/mobilenet_v2_1.0_224",), # error
|
338 |
+
TokenizerConfig("google/switch-c-2048", org="Google"),
|
339 |
+
TokenizerConfig("google/byt5-small", org="Google"),
|
340 |
+
TokenizerConfig("google/mt5-large", org="Google"),
|
341 |
+
TokenizerConfig("WizardLM/WizardCoder-Python-7B-V1.0", org="Microsoft"),
|
342 |
+
TokenizerConfig("WizardLM/WizardCoder-15B-V1.0", org="Microsoft"),
|
343 |
+
TokenizerConfig("WizardLM/WizardLM-7B-V1.0", org="Microsoft"),
|
344 |
+
TokenizerConfig("WizardLM/WizardMath-70B-V1.0", org="Microsoft"),
|
345 |
+
TokenizerConfig("TigerResearch/tigerbot-70b-chat-v4-4k", org="Tigerobo"),
|
346 |
+
TokenizerConfig("TigerResearch/tigerbot-13b-chat-v2", org="Tigerobo"),
|
347 |
+
TokenizerConfig("deepseek-ai/deepseek-coder-33b-instruct", org="DeepSeek"),
|
348 |
+
TokenizerConfig("deepseek-ai/deepseek-llm-7b-base", org="DeepSeek"),
|
349 |
+
TokenizerConfig("deepseek-ai/DeepSeek-V2", org="DeepSeek"),
|
350 |
+
TokenizerConfig("google/gemma-7b", org="Google"),
|
351 |
+
TokenizerConfig("allenai/OLMo-7B", org="Allen AI"),
|
352 |
+
TokenizerConfig("HuggingFaceH4/zephyr-7b-beta", org="HuggingFace"),
|
353 |
+
TokenizerConfig("ai21labs/Jamba-v0.1", org="AI21"),
|
354 |
+
TokenizerConfig("databricks/dbrx-instruct", org="Databricks"),
|
355 |
+
|
356 |
+
# ("claude",),
|
357 |
+
# https://github.com/Duxiaoman-DI/XuanYuan
|
358 |
+
|
359 |
+
# https://huggingface.co/apple/OpenELM-3B-Instruct https://huggingface.co/apple/OpenELM-3B
|
360 |
+
|
361 |
+
]
|
362 |
+
|
363 |
+
def _assert_no_duplicate(keys, what):
    """Fail loudly, naming the offending identifiers, when *keys* contains duplicates."""
    duplicates = sorted({k for k in keys if keys.count(k) > 1})
    assert not duplicates, f"duplicate tokenizer {what}: {duplicates}"


# Every alias form must be unique, otherwise TokenizerFactory lookups would be ambiguous.
_assert_no_duplicate([config.name_display for config in _all_tokenizer_config], "name_display")
_assert_no_duplicate([config.name_or_path for config in _all_tokenizer_config], "name_or_path")
_assert_no_duplicate([config.name_or_path.split("/")[-1] for config in _all_tokenizer_config], "short name")
|
366 |
+
|
367 |
+
|
368 |
+
class TokenizerFactory:
    """Registry that resolves tokenizer identifiers to loaded tokenizer objects.

    A tokenizer can be looked up by any of three aliases:
      * the full hub path      (``config.name_or_path``, e.g. "Qwen/Qwen-7B-Chat")
      * the display path       (``config.name_display``, e.g. "Qwen/Qwen")
      * the bare display name  (e.g. "Qwen")
    Loaded tokenizers are cached per config, so each model is loaded at most once.
    """

    def __init__(self):
        self.all_tokenizer_configs = sorted(_all_tokenizer_config, key=lambda k: k.name_or_path)
        self.all_tokenizer_names = [config.name_or_path for config in self.all_tokenizer_configs]
        # Lookup tables tried in order by get_tokenizer_config(); all three
        # alias forms map to the same TokenizerConfig objects.
        self.name_to_config_list = [
            {config.name_or_path: config for config in self.all_tokenizer_configs},
            {config.name_display: config for config in self.all_tokenizer_configs},
            {config.name_display.split("/")[-1]: config for config in self.all_tokenizer_configs},
        ]
        self.tokenizer_cache = {}

    def get_tokenizer_config(self, tokenizer_name: str) -> "TokenizerConfig":
        """Return the config registered under *tokenizer_name*.

        Tries the full path, the display path, then the bare display name.
        Returns None when no alias matches (kept for backward compatibility).
        """
        for name_to_config in self.name_to_config_list:
            if tokenizer_name in name_to_config:
                return name_to_config[tokenizer_name]
        return None

    def get_tokenizer(self, tokenizer_name: str):
        """Load (or fetch from cache) the tokenizer for *tokenizer_name*.

        :param tokenizer_name: any alias accepted by get_tokenizer_config()
        :return: a tiktoken Encoding or a HuggingFace tokenizer
        :raises KeyError: if the name is not registered
        """
        tokenizer_config = self.get_tokenizer_config(tokenizer_name)
        if tokenizer_config is None:
            # Fail early with a clear message instead of an AttributeError below.
            raise KeyError(f"unknown tokenizer: {tokenizer_name!r}")

        # 1. load from cache
        if tokenizer_config in self.tokenizer_cache:
            return self.tokenizer_cache[tokenizer_config]

        # 2. load tokenizer
        logger.info(f"loading tokenizer {tokenizer_config.name_or_path}")
        if tokenizer_config.impl == TokenizerImpl.TikToken and "openai" in tokenizer_config.name_or_path:
            # OpenAI models are not on the HF hub; resolve them through tiktoken.
            tokenizer = tiktoken.encoding_for_model(tokenizer_config.name_or_path.replace("openai/", ""))
        else:
            tokenizer = AutoTokenizer.from_pretrained(
                tokenizer_config.name_or_path,
                trust_remote_code=True,
                **tokenizer_config.init_kwargs
            )
        self.tokenizer_cache[tokenizer_config] = tokenizer
        return tokenizer

    def get_name_with_hyperlink(self, tokenizer_name):
        """Return an HTML anchor linking the tokenizer's short display name to its homepage.

        :raises KeyError: if the name is not registered
        """
        def model_hyperlink(link, model_name):
            return f'<a target="_blank" href="{link}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{model_name}</a>'

        tokenizer_config = self.get_tokenizer_config(tokenizer_name)
        if tokenizer_config is None:
            raise KeyError(f"unknown tokenizer: {tokenizer_name!r}")
        return model_hyperlink(tokenizer_config.link, tokenizer_config.name_display.split("/")[-1])
|
417 |
+
|
418 |
+
|
419 |
+
# Module-level singleton shared by the demo apps; tokenizers are loaded lazily
# and cached inside this instance.
tokenizer_factory = TokenizerFactory()
|
420 |
+
|
421 |
+
# class TokenizerType(Enum):
|
422 |
+
#
|
423 |
+
# # BERTTokenizer
|
424 |
+
# # 依赖一个txt文件
|
425 |
+
#
|
426 |
+
#
|
427 |
+
# # https://github.com/EleutherAI/gpt-neox/blob/v2.0/megatron/tokenizer/tokenizer.py#L231
|
428 |
+
# # 依赖一个json文件,Tokenizer.from_file(vocab_file)
|
429 |
+
# # 案例:gpt-neox-20B
|
430 |
+
# HFTokenizer = auto()
|
431 |
+
#
|
432 |
+
# # 依赖: model_file, sentencepiece.SentencePieceProcessor(model_file)
|
433 |
+
# # 案例:
|
434 |
+
# SentencePieceTokenizer = auto()
|
435 |
+
#
|
436 |
+
#
|
437 |
+
# # 依赖: 3个json文件:vocab.json, merges.txt, special_tokens.txt
|
438 |
+
# # 源码:
|
439 |
+
# # - https://github.com/NVIDIA/Megatron-LM/blob/main/megatron/tokenizer/gpt2_tokenization.py#L92
|
440 |
+
# # Byte-level BPE
|
441 |
+
# GPT2BPETokenizer = auto()
|
442 |
+
|
443 |
+
|
444 |
+
if __name__ == "__main__":
    # Smoke test: every registered tokenizer must resolve identically through
    # all three of its aliases (full path, display path, bare display name).
    for cfg in tokenizer_factory.all_tokenizer_configs:
        # if "t5" not in cfg.name_or_path: continue  # narrow down while debugging
        by_path = tokenizer_factory.get_tokenizer(cfg.name_or_path)
        by_display = tokenizer_factory.get_tokenizer(cfg.name_display)
        by_short = tokenizer_factory.get_tokenizer(cfg.name_display.split("/")[-1])
        assert by_path == by_display == by_short
        print(cfg.name_or_path, len(by_path))
|
vocab/Intern_gpt/README.md
DELETED
File without changes
|
vocab/__init__.py
DELETED
@@ -1,260 +0,0 @@
|
|
1 |
-
import importlib
|
2 |
-
from enum import Enum, auto
|
3 |
-
|
4 |
-
"""Interface:
|
5 |
-
tokenizer.encode
|
6 |
-
tokenizer.decode
|
7 |
-
tokenizer.convert_tokens_to_string # gpt4 没有这个方法
|
8 |
-
tokenizer.convert_ids_to_tokens
|
9 |
-
|
10 |
-
|
11 |
-
tokenizer.parent = ""
|
12 |
-
tokenizer.vocab_size
|
13 |
-
tokenizer.get_vocab() # gpt-neox-20b, llama
|
14 |
-
tokenizer.type = TokenizerType.ByteBPE.name
|
15 |
-
tokenizer.implementation = TokenizerImpl.SentencePiece.name # https://github.com/facebookresearch/llama/blob/main/llama/tokenizer.py
|
16 |
-
"HFGPT2Tokenizer", "HFTokenizer", "GPT2BPETokenizer", "CharLevelTokenizer", "TiktokenTokenizer", "SPMTokenizer", https://github.com/EleutherAI/gpt-neox/blob/main/tools/preprocess_data.py
|
17 |
-
|
18 |
-
- google/bert
|
19 |
-
- 特征
|
20 |
-
- 词典:有##开头的token,表示subword
|
21 |
-
- 示例:
|
22 |
-
- bpe-google/sentencepiece:
|
23 |
-
- 特征:
|
24 |
-
- 训练:
|
25 |
-
- 文件: *.sp_model 或 *.model (可选文件 .vocab,) spm简称
|
26 |
-
- 实现:
|
27 |
-
- 依赖: protobuf
|
28 |
-
- 训练: `import sentencepiece as spm; spm.SentencePieceTrainer.train` 或 `spm_train`
|
29 |
-
- 加载: `import sentencepiece as spm; spm.SentencePieceProcessor().Load(vocab_file)`
|
30 |
-
- 方法: 是SentencePieceProcessor类型,sp_model.id_to_piece,有tokenizer.json tokenizer.model,
|
31 |
-
- 分词:
|
32 |
-
- pre_tokenizers.ByteLevel(add_prefix_space=True, use_regex=False)
|
33 |
-
- 词典: 词典字符有 ▁ (U+2581) ,表示空格或句首。
|
34 |
-
- 示例:google-t5, llama,baichuan, orion,
|
35 |
-
- icetk: sentencepiece的分支,支持image_tokenizer
|
36 |
-
- glm, chatglm1, chatglm2
|
37 |
-
- openai/tiktoken
|
38 |
-
- bpe-hf_tokenizer
|
39 |
-
- ss
|
40 |
-
- 特征:
|
41 |
-
- 文件: tokenizer.json(包含后两个文件的内容), merges.txt, vocab.json
|
42 |
-
- added_tokens 在vocab中不一定存在。
|
43 |
-
- 实现:
|
44 |
-
- 训练: `from tokenizers.trainers import BpeTrainer, UnigramTrainer, WordLevelTrainer, WordPieceTrainer`
|
45 |
-
- 加载:
|
46 |
-
- 方法: .model.from_file .model.save .model.token_to_id .model.tokenize
|
47 |
-
- .model 是 tokenizer.models.BPE 类型
|
48 |
-
- 词典有 Ġ "\u0120" 开头
|
49 |
-
- 优势
|
50 |
-
-
|
51 |
-
- 示例:gpt2, gpt_neox_20b, moss, bloom, qwen2
|
52 |
-
- 优势:相对sentence piece,hf_tokenizer支持pre-tokenization的正则表达式,对tab和换行支持更好 ()
|
53 |
-
- ss
|
54 |
-
- tiktoken
|
55 |
-
- 特征:空格就是空格,
|
56 |
-
- 示例:gpt3.5 gpt4, qwen,
|
57 |
-
tokenizer.comments = "split all numbers into individual digits, " \
|
58 |
-
"and fallback to bytes to decompose unknown UTF-8 characters"
|
59 |
-
|
60 |
-
tokenizer.all_special_tokens # baichuan
|
61 |
-
tokenizer.special_tokens_set # gpt3.5_turbo
|
62 |
-
tokenizer.special_tokens_map
|
63 |
-
|
64 |
-
tokenizer.dependency [sentencepiece, tiktoken, icetk]
|
65 |
-
"""
|
66 |
-
|
67 |
-
from utils.log_util import logger
|
68 |
-
|
69 |
-
# Animal = Enum('Animal', 'ANT BEE CAT DOG')
|
70 |
-
|
71 |
-
uniq_tokenizers = [
|
72 |
-
""
|
73 |
-
]
|
74 |
-
|
75 |
-
# format: alias/abbr, description, hf_path, tokenizer_class/type, comments, Organization
|
76 |
-
# TODO: append link and description to the end of dropdown button.
|
77 |
-
all_tokenizers = [
|
78 |
-
##### bert 系列
|
79 |
-
("bert_base_cased", "", "bert"),
|
80 |
-
("bert_base_uncased", "", "bert"),
|
81 |
-
("bert_base_chinese", "", "bert"),
|
82 |
-
("roberta_chinese_clue", "", "bert"),
|
83 |
-
("kplug",),
|
84 |
-
("gpt2_chinese",),
|
85 |
-
|
86 |
-
##### GPT2Tokenizer
|
87 |
-
("gpt2", "", "GPT2Tokenizer",), #
|
88 |
-
("moss", "", "GPT2Tokenizer",),
|
89 |
-
("bloom", "", "GPT2Tokenizer",),
|
90 |
-
# ("bloomz_6b4_zh",
|
91 |
-
# ("belle_7b_2m", # 模型和词典都基于bloom
|
92 |
-
#
|
93 |
-
("gpt_nexo_20b", "", "GPT2Tokenizer",), # 5万
|
94 |
-
("qwen1_5_14b_chat", "", "GPT2Tokenizer",), # 15万,速度有点慢
|
95 |
-
("starchat_alpha", "", "GPT2Tokenizer",),
|
96 |
-
|
97 |
-
####### google/sentencepiece tokenizer:
|
98 |
-
# T5 llama internlm
|
99 |
-
("t5_small", "", "sentencepiece"),
|
100 |
-
("t5_base", "", "sentencepiece"),
|
101 |
-
("t5_large", "", "sentencepiece"),
|
102 |
-
("chatyuan_large_v2", "", "sentencepiece"),
|
103 |
-
("prompt_clue", "", "sentencepiece"),
|
104 |
-
|
105 |
-
("llama", "", "sentencepiece", "llama use single digits and thus uses 4 tokens to encode the number 1000"), # '中文单字': 700, '中文多字': 0
|
106 |
-
("llama2", "", "sentencepiece"),
|
107 |
-
("llama3", "", "sentencepiece"),
|
108 |
-
("chinese_llama", "", "sentencepiece"), #
|
109 |
-
("chinese_llama2", "", "sentencepiece"), #
|
110 |
-
("llama_3_chinese_8b", "sentencepiece"),
|
111 |
-
# ("chinese_alpaca_lora_7b", # 中文Alpaca模型在上述中文LLaMA模型的基础上进一步使用了指令数据进行精调。
|
112 |
-
# ("belle_llama_ext_7b",
|
113 |
-
# ("alpaca_7b",
|
114 |
-
("baichuan", "", "sentencepiece"),
|
115 |
-
("baichuan2", "", "sentencepiece"),
|
116 |
-
("internlm_chat_7b", "", "sentencepiece"),
|
117 |
-
("internlm2_chat_7b", "", "sentencepiece"),
|
118 |
-
("internlm2_math_7b", "", "sentencepiece"),
|
119 |
-
("internlm_xcomposer_7b", "", "sentencepiece"),
|
120 |
-
("falcon_7b", "", "sentencepiece"),
|
121 |
-
("falcon_180b", "", "sentencepiece"),
|
122 |
-
("skywork_13b_base",),
|
123 |
-
("skywork_13b_math",),
|
124 |
-
("xlm_roberta", ),
|
125 |
-
# "goat",
|
126 |
-
|
127 |
-
# ##### glm系列
|
128 |
-
# "glm_chinese",),
|
129 |
-
("chatglm_6b", "", "sentencepiece"),
|
130 |
-
("chatglm2_6b", "", "sentencepiece"),
|
131 |
-
("chatglm3_6b", "", "sentencepiece"),
|
132 |
-
("character_glm_6b", "", "sentencepiece"),
|
133 |
-
|
134 |
-
# tiktoken 系列
|
135 |
-
("qwen_1_8b_chat", "", "tiktoken"),
|
136 |
-
("qwen_7b_chat", "", "tiktoken"),
|
137 |
-
("qwen_72b_chat", "", "tiktoken"),
|
138 |
-
("text_davinci_003", "", "tiktoken"),
|
139 |
-
("code_davinci_002", "", "tiktoken"),
|
140 |
-
("gpt_35_turbo", "", "tiktoken"),
|
141 |
-
("gpt_4", "", "tiktoken"),
|
142 |
-
|
143 |
-
# 未分类
|
144 |
-
# ("amber", ""),
|
145 |
-
("crystal_coder", ""),
|
146 |
-
("mistral_7b",),
|
147 |
-
("mixtral_8_7b",),
|
148 |
-
|
149 |
-
|
150 |
-
("flan_t5_base",),
|
151 |
-
("fastchat_t5_3b",),
|
152 |
-
("pko_t5_large",),
|
153 |
-
("wizardcoder_15b_v1",),
|
154 |
-
("yi_6b",),
|
155 |
-
("yi_34b",),
|
156 |
-
("yi_vl34b",),
|
157 |
-
("orion_14b_chat",),
|
158 |
-
("phi_1",),
|
159 |
-
("phi_2",),
|
160 |
-
("phi_3_mini", "即llama vocab"),
|
161 |
-
("solar_10_7b",),
|
162 |
-
("mobilebert_uncased",),
|
163 |
-
# ("mobilenet_v2",), # error
|
164 |
-
("switch_c_2048",),
|
165 |
-
("byt5_small",),
|
166 |
-
("mt5_large",),
|
167 |
-
("wizardcoder_python_7b_v1",),
|
168 |
-
("wizardlm_7b_v1",),
|
169 |
-
("wizardmath_70b_v1",),
|
170 |
-
("tigerbot_70b_chat_v4_4k",),
|
171 |
-
("tigerbot_13b_chat_v2",),
|
172 |
-
("deepseek_coder_33b_instruct",),
|
173 |
-
("deepseek_llm_7b_base",),
|
174 |
-
("gemma_7b",),
|
175 |
-
("olmo_7b",),
|
176 |
-
("aya_101",),
|
177 |
-
("zephyr_7b_beta",),
|
178 |
-
("jamba_v0_1", ),
|
179 |
-
("dbrx_instruct", ),
|
180 |
-
("grok_1",),
|
181 |
-
# ("claude",),
|
182 |
-
("gpt_nexo_20b", ),
|
183 |
-
("gpt_neox_japanese_2_7b", ),
|
184 |
-
|
185 |
-
]
|
186 |
-
|
187 |
-
all_tokenizers = [tokenizer[0] for tokenizer in all_tokenizers]
|
188 |
-
all_tokenizers = sorted(all_tokenizers)
|
189 |
-
|
190 |
-
|
191 |
-
class TokenizerType(Enum):
    """Coarse taxonomy of tokenization algorithms.

    References:
    - https://huggingface.co/docs/transformers/tokenizer_summary
    - https://github.com/EleutherAI/gpt-neox/blob/main/megatron/tokenizer/tokenizer.py
    - https://github.com/google/sentencepiece/blob/3863f7648e5d8edb571ac592f3ac4f5f0695275a/src/sentencepiece_model.proto#L48
      - UNIGRAM = 1; // Unigram language model with dynamic algorithm
      - BPE = 2; // Byte Pair Encoding
      - WORD = 3; // Delimitered by whitespace.
      - CHAR = 4; // tokenizes into character sequence
    """
    BPE = auto()
    ByteBPE = auto()  # BBPE, Byte-Level BPE
    GPT2BPETokenizer = auto()
    BERTTokenizer = auto()
|
205 |
-
|
206 |
-
|
207 |
-
# class TokenizerType(Enum):
|
208 |
-
#
|
209 |
-
# # BERTTokenizer
|
210 |
-
# # 依赖一个txt文件
|
211 |
-
#
|
212 |
-
#
|
213 |
-
# # https://github.com/EleutherAI/gpt-neox/blob/v2.0/megatron/tokenizer/tokenizer.py#L231
|
214 |
-
# # 依赖一个json文件,Tokenizer.from_file(vocab_file)
|
215 |
-
# # 案例:gpt-neox-20B
|
216 |
-
# HFTokenizer = auto()
|
217 |
-
#
|
218 |
-
# # 依赖: model_file, sentencepiece.SentencePieceProcessor(model_file)
|
219 |
-
# # 案例:
|
220 |
-
# SentencePieceTokenizer = auto()
|
221 |
-
#
|
222 |
-
#
|
223 |
-
# # 依赖: 3个json文件:vocab.json, merges.txt, special_tokens.txt
|
224 |
-
# # 源码:
|
225 |
-
# # - https://github.com/NVIDIA/Megatron-LM/blob/main/megatron/tokenizer/gpt2_tokenization.py#L92
|
226 |
-
# # Byte-level BPE
|
227 |
-
# GPT2BPETokenizer = auto()
|
228 |
-
|
229 |
-
|
230 |
-
class TokenizerImpl(Enum):
    """Library used to implement/load a tokenizer.

    https://github.com/google/sentencepiece supports the sentencepiece
    model types (BPE, unigram, char, word) plus wordpiece:
        spm_train --model_type unigram/bpe/char/word
    """
    SentencePiece = auto()

    # https://github.com/huggingface/transformers/blob/v4.30.2/src/transformers/models/gpt2/tokenization_gpt2.py#L104
    # vocabulary construction:
    # GPT2Tokenizer = auto()
    # BertTokenizer = auto()

    HFTokenizer = auto()  # https://github.com/huggingface/tokenizers
|
246 |
-
|
247 |
-
|
248 |
-
# Cache of already-loaded tokenizers, keyed by model alias.
cache = {}


def load_tokener(model_name):
    """Import and return the tokenizer defined in ``vocab.<model_name>``.

    Each vocab submodule is expected to expose a module-level ``tokenizer``
    object; the loaded tokenizer is tagged with its alias for later display.
    Results are memoized in ``cache`` so each submodule is imported only once.

    :param model_name: name of a submodule of the ``vocab`` package
    :return: the submodule's ``tokenizer`` object
    """
    if model_name in cache:
        return cache[model_name]
    logger.info(f"loading tokenizer {model_name}")
    tokenizer = importlib.import_module("." + model_name, 'vocab').tokenizer
    tokenizer.alias = model_name
    # fix: the cache was never populated, so memoization never took effect
    cache[model_name] = tokenizer
    return tokenizer
|
257 |
-
|
258 |
-
|
259 |
-
if __name__ == "__main__":
|
260 |
-
pass
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
vocab/_alpaca_7b/README.md
DELETED
File without changes
|
vocab/_goat/README.md
DELETED
File without changes
|
vocab/_goat/__init__.py
DELETED
File without changes
|
vocab/albert/__init__.py
DELETED
@@ -1,6 +0,0 @@
|
|
1 |
-
"""
|
2 |
-
|
3 |
-
SentencePiece(unigram)
|
4 |
-
|
5 |
-
https://huggingface.co/docs/transformers/tokenizer_summary#sentencepiece
|
6 |
-
"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
vocab/aya_101/__init__.py
DELETED
@@ -1,5 +0,0 @@
|
|
1 |
-
|
2 |
-
|
3 |
-
from transformers import AutoTokenizer
|
4 |
-
|
5 |
-
tokenizer = AutoTokenizer.from_pretrained("CohereForAI/aya-101")
|
|
|
|
|
|
|
|
|
|
|
|
vocab/baichuan/Baichuan-7B/config.json
DELETED
@@ -1,26 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"architectures": [
|
3 |
-
"BaiChuanForCausalLM"
|
4 |
-
],
|
5 |
-
"auto_map": {
|
6 |
-
"AutoConfig": "configuration_baichuan.BaiChuanConfig",
|
7 |
-
"AutoModelForCausalLM": "modeling_baichuan.BaiChuanForCausalLM"
|
8 |
-
},
|
9 |
-
"bos_token_id": 1,
|
10 |
-
"eos_token_id": 2,
|
11 |
-
"hidden_act": "silu",
|
12 |
-
"hidden_size": 4096,
|
13 |
-
"initializer_range": 0.02,
|
14 |
-
"intermediate_size": 11008,
|
15 |
-
"max_position_embeddings": 4096,
|
16 |
-
"model_type": "baichuan",
|
17 |
-
"num_attention_heads": 32,
|
18 |
-
"num_hidden_layers": 32,
|
19 |
-
"pad_token_id": 0,
|
20 |
-
"rms_norm_eps": 1e-06,
|
21 |
-
"tie_word_embeddings": false,
|
22 |
-
"torch_dtype": "float32",
|
23 |
-
"transformers_version": "4.29.1",
|
24 |
-
"use_cache": true,
|
25 |
-
"vocab_size": 64000
|
26 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
vocab/baichuan/Baichuan-7B/configuration_baichuan.py
DELETED
@@ -1,66 +0,0 @@
|
|
1 |
-
# coding=utf-8
|
2 |
-
# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved.
|
3 |
-
#
|
4 |
-
# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
|
5 |
-
# and OPT implementations in this library. It has been modified from its
|
6 |
-
# original forms to accommodate minor architectural differences compared
|
7 |
-
# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
|
8 |
-
#
|
9 |
-
# Licensed under the Apache License, Version 2.0 (the "License");
|
10 |
-
# you may not use this file except in compliance with the License.
|
11 |
-
# You may obtain a copy of the License at
|
12 |
-
#
|
13 |
-
# http://www.apache.org/licenses/LICENSE-2.0
|
14 |
-
#
|
15 |
-
# Unless required by applicable law or agreed to in writing, software
|
16 |
-
# distributed under the License is distributed on an "AS IS" BASIS,
|
17 |
-
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
18 |
-
# See the License for the specific language governing permissions and
|
19 |
-
# limitations under the License.
|
20 |
-
|
21 |
-
from transformers.configuration_utils import PretrainedConfig
|
22 |
-
from transformers.utils import logging
|
23 |
-
|
24 |
-
|
25 |
-
logger = logging.get_logger(__name__)
|
26 |
-
|
27 |
-
|
28 |
-
class BaiChuanConfig(PretrainedConfig):
    """Configuration for the BaiChuan causal-LM architecture.

    LLaMA-style hyper-parameters; the defaults correspond to the released
    Baichuan-7B checkpoint (64k vocab, 32 layers, 32 heads, 4096 hidden).
    """
    model_type = "baichuan"
    # transient generation state, not part of the model signature
    keys_to_ignore_at_inference = ["past_key_values"]

    def __init__(
        self,
        vocab_size=64000,
        hidden_size=4096,
        intermediate_size=11008,
        num_hidden_layers=32,
        num_attention_heads=32,
        hidden_act="silu",
        max_position_embeddings=4096,
        initializer_range=0.02,
        rms_norm_eps=1e-6,
        use_cache=True,
        pad_token_id=0,
        bos_token_id=1,
        eos_token_id=2,
        tie_word_embeddings=False,
        **kwargs,
    ):
        self.vocab_size = vocab_size
        self.max_position_embeddings = max_position_embeddings
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.hidden_act = hidden_act
        self.initializer_range = initializer_range
        self.rms_norm_eps = rms_norm_eps
        self.use_cache = use_cache
        # token ids and embedding tying are handled by the PretrainedConfig base
        super().__init__(
            pad_token_id=pad_token_id,
            bos_token_id=bos_token_id,
            eos_token_id=eos_token_id,
            tie_word_embeddings=tie_word_embeddings,
            **kwargs,
        )
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
vocab/baichuan/Baichuan-7B/special_tokens_map.json
DELETED
@@ -1,23 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"bos_token": {
|
3 |
-
"content": "<s>",
|
4 |
-
"lstrip": false,
|
5 |
-
"normalized": true,
|
6 |
-
"rstrip": false,
|
7 |
-
"single_word": false
|
8 |
-
},
|
9 |
-
"eos_token": {
|
10 |
-
"content": "</s>",
|
11 |
-
"lstrip": false,
|
12 |
-
"normalized": true,
|
13 |
-
"rstrip": false,
|
14 |
-
"single_word": false
|
15 |
-
},
|
16 |
-
"unk_token": {
|
17 |
-
"content": "<unk>",
|
18 |
-
"lstrip": false,
|
19 |
-
"normalized": true,
|
20 |
-
"rstrip": false,
|
21 |
-
"single_word": false
|
22 |
-
}
|
23 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
vocab/baichuan/Baichuan-7B/tokenization_baichuan.py
DELETED
@@ -1,250 +0,0 @@
|
|
1 |
-
# coding=utf-8
|
2 |
-
# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved.
|
3 |
-
#
|
4 |
-
# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
|
5 |
-
# and OPT implementations in this library. It has been modified from its
|
6 |
-
# original forms to accommodate minor architectural differences compared
|
7 |
-
# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
|
8 |
-
#
|
9 |
-
# Licensed under the Apache License, Version 2.0 (the "License");
|
10 |
-
# you may not use this file except in compliance with the License.
|
11 |
-
# You may obtain a copy of the License at
|
12 |
-
#
|
13 |
-
# http://www.apache.org/licenses/LICENSE-2.0
|
14 |
-
#
|
15 |
-
# Unless required by applicable law or agreed to in writing, software
|
16 |
-
# distributed under the License is distributed on an "AS IS" BASIS,
|
17 |
-
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
18 |
-
# See the License for the specific language governing permissions and
|
19 |
-
# limitations under the License.
|
20 |
-
|
21 |
-
import os
|
22 |
-
from shutil import copyfile
|
23 |
-
from typing import Any, Dict, List, Optional, Tuple
|
24 |
-
|
25 |
-
import sentencepiece as spm
|
26 |
-
|
27 |
-
from transformers.tokenization_utils import AddedToken, PreTrainedTokenizer
|
28 |
-
from transformers.utils import logging
|
29 |
-
|
30 |
-
|
31 |
-
logger = logging.get_logger(__name__)
|
32 |
-
|
33 |
-
VOCAB_FILES_NAMES = {"vocab_file": "tokenizer.model"}
|
34 |
-
|
35 |
-
PRETRAINED_VOCAB_FILES_MAP = {
|
36 |
-
"vocab_file": {},
|
37 |
-
"tokenizer_file": {},
|
38 |
-
}
|
39 |
-
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {}
|
40 |
-
|
41 |
-
|
42 |
-
class BaiChuanTokenizer(PreTrainedTokenizer):
|
43 |
-
"""
|
44 |
-
Construct a BaiChuan tokenizer. Based on byte-level Byte-Pair-Encoding.
|
45 |
-
|
46 |
-
Args:
|
47 |
-
vocab_file (`str`):
|
48 |
-
Path to the vocabulary file.
|
49 |
-
"""
|
50 |
-
|
51 |
-
vocab_files_names = VOCAB_FILES_NAMES
|
52 |
-
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
|
53 |
-
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
|
54 |
-
model_input_names = ["input_ids", "attention_mask"]
|
55 |
-
|
56 |
-
def __init__(
|
57 |
-
self,
|
58 |
-
vocab_file,
|
59 |
-
unk_token="<unk>",
|
60 |
-
bos_token="<s>",
|
61 |
-
eos_token="</s>",
|
62 |
-
pad_token=None,
|
63 |
-
sp_model_kwargs: Optional[Dict[str, Any]] = None,
|
64 |
-
add_bos_token=True,
|
65 |
-
add_eos_token=False,
|
66 |
-
clean_up_tokenization_spaces=False,
|
67 |
-
**kwargs,
|
68 |
-
):
|
69 |
-
self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs
|
70 |
-
bos_token = AddedToken(bos_token, lstrip=False, rstrip=False) if isinstance(bos_token, str) else bos_token
|
71 |
-
eos_token = AddedToken(eos_token, lstrip=False, rstrip=False) if isinstance(eos_token, str) else eos_token
|
72 |
-
unk_token = AddedToken(unk_token, lstrip=False, rstrip=False) if isinstance(unk_token, str) else unk_token
|
73 |
-
pad_token = AddedToken(pad_token, lstrip=False, rstrip=False) if isinstance(pad_token, str) else pad_token
|
74 |
-
self.vocab_file = vocab_file
|
75 |
-
self.add_bos_token = add_bos_token
|
76 |
-
self.add_eos_token = add_eos_token
|
77 |
-
self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
|
78 |
-
self.sp_model.Load(vocab_file)
|
79 |
-
super().__init__(
|
80 |
-
bos_token=bos_token,
|
81 |
-
eos_token=eos_token,
|
82 |
-
unk_token=unk_token,
|
83 |
-
pad_token=pad_token,
|
84 |
-
add_bos_token=add_bos_token,
|
85 |
-
add_eos_token=add_eos_token,
|
86 |
-
sp_model_kwargs=self.sp_model_kwargs,
|
87 |
-
clean_up_tokenization_spaces=clean_up_tokenization_spaces,
|
88 |
-
**kwargs,
|
89 |
-
)
|
90 |
-
|
91 |
-
def __getstate__(self):
|
92 |
-
state = self.__dict__.copy()
|
93 |
-
state["sp_model"] = None
|
94 |
-
return state
|
95 |
-
|
96 |
-
def __setstate__(self, d):
|
97 |
-
self.__dict__ = d
|
98 |
-
self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
|
99 |
-
self.sp_model.Load(self.vocab_file)
|
100 |
-
|
101 |
-
@property
|
102 |
-
def vocab_size(self):
|
103 |
-
"""Returns vocab size"""
|
104 |
-
return self.sp_model.get_piece_size()
|
105 |
-
|
106 |
-
def get_vocab(self):
|
107 |
-
"""Returns vocab as a dict"""
|
108 |
-
vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)}
|
109 |
-
vocab.update(self.added_tokens_encoder)
|
110 |
-
return vocab
|
111 |
-
|
112 |
-
def _tokenize(self, text):
|
113 |
-
"""Returns a tokenized string."""
|
114 |
-
return self.sp_model.encode(text, out_type=str)
|
115 |
-
|
116 |
-
def _convert_token_to_id(self, token):
|
117 |
-
"""Converts a token (str) in an id using the vocab."""
|
118 |
-
return self.sp_model.piece_to_id(token)
|
119 |
-
|
120 |
-
def _convert_id_to_token(self, index):
|
121 |
-
"""Converts an index (integer) in a token (str) using the vocab."""
|
122 |
-
token = self.sp_model.IdToPiece(index)
|
123 |
-
return token
|
124 |
-
|
125 |
-
def convert_tokens_to_string(self, tokens):
    """Converts a sequence of tokens (string) in a single string.

    Ordinary tokens are buffered and decoded through the sentencepiece model;
    special tokens are emitted verbatim, with a single space inserted before a
    special token that follows ordinary tokens.
    """
    current_sub_tokens = []
    out_string = ""
    prev_is_special = False
    for i, token in enumerate(tokens):
        # make sure that special tokens are not decoded using sentencepiece model
        if token in self.all_special_tokens:
            if not prev_is_special and i != 0:
                out_string += " "
            # flush pending ordinary tokens, then append the special token as-is
            out_string += self.sp_model.decode(current_sub_tokens) + token
            prev_is_special = True
            current_sub_tokens = []
        else:
            current_sub_tokens.append(token)
            prev_is_special = False
    # flush whatever ordinary tokens remain after the last special token
    out_string += self.sp_model.decode(current_sub_tokens)
    return out_string
|
143 |
-
|
144 |
-
def save_vocabulary(self, save_directory, filename_prefix: Optional[str] = None) -> Tuple[str]:
    """
    Save the vocabulary and special tokens file to a directory.

    Args:
        save_directory (`str`):
            The directory in which to save the vocabulary.

    Returns:
        `Tuple(str)`: Paths to the files saved.
    """
    if not os.path.isdir(save_directory):
        logger.error(f"Vocabulary path ({save_directory}) should be a directory")
        # NOTE(review): returns None here, diverging from the Tuple[str] annotation
        return
    out_vocab_file = os.path.join(
        save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
    )

    if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file) and os.path.isfile(self.vocab_file):
        # source model file exists on disk: copy it verbatim
        copyfile(self.vocab_file, out_vocab_file)
    elif not os.path.isfile(self.vocab_file):
        # no file to copy: serialize the in-memory sentencepiece model instead
        with open(out_vocab_file, "wb") as fi:
            content_spiece_model = self.sp_model.serialized_model_proto()
            fi.write(content_spiece_model)

    return (out_vocab_file,)
|
170 |
-
|
171 |
-
def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
|
172 |
-
bos_token_id = [self.bos_token_id] if self.add_bos_token else []
|
173 |
-
eos_token_id = [self.eos_token_id] if self.add_eos_token else []
|
174 |
-
|
175 |
-
output = bos_token_id + token_ids_0 + eos_token_id
|
176 |
-
|
177 |
-
if token_ids_1 is not None:
|
178 |
-
output = output + bos_token_id + token_ids_1 + eos_token_id
|
179 |
-
|
180 |
-
return output
|
181 |
-
|
182 |
-
def get_special_tokens_mask(
|
183 |
-
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
|
184 |
-
) -> List[int]:
|
185 |
-
"""
|
186 |
-
Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
|
187 |
-
special tokens using the tokenizer `prepare_for_model` method.
|
188 |
-
|
189 |
-
Args:
|
190 |
-
token_ids_0 (`List[int]`):
|
191 |
-
List of IDs.
|
192 |
-
token_ids_1 (`List[int]`, *optional*):
|
193 |
-
Optional second list of IDs for sequence pairs.
|
194 |
-
already_has_special_tokens (`bool`, *optional*, defaults to `False`):
|
195 |
-
Whether or not the token list is already formatted with special tokens for the model.
|
196 |
-
|
197 |
-
Returns:
|
198 |
-
`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
|
199 |
-
"""
|
200 |
-
if already_has_special_tokens:
|
201 |
-
return super().get_special_tokens_mask(
|
202 |
-
token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
|
203 |
-
)
|
204 |
-
|
205 |
-
bos_token_id = [1] if self.add_bos_token else []
|
206 |
-
eos_token_id = [1] if self.add_eos_token else []
|
207 |
-
|
208 |
-
if token_ids_1 is None:
|
209 |
-
return bos_token_id + ([0] * len(token_ids_0)) + eos_token_id
|
210 |
-
return (
|
211 |
-
bos_token_id
|
212 |
-
+ ([0] * len(token_ids_0))
|
213 |
-
+ eos_token_id
|
214 |
-
+ bos_token_id
|
215 |
-
+ ([0] * len(token_ids_1))
|
216 |
-
+ eos_token_id
|
217 |
-
)
|
218 |
-
|
219 |
-
def create_token_type_ids_from_sequences(
|
220 |
-
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
|
221 |
-
) -> List[int]:
|
222 |
-
"""
|
223 |
-
Creates a mask from the two sequences passed to be used in a sequence-pair classification task. An ALBERT
|
224 |
-
sequence pair mask has the following format:
|
225 |
-
|
226 |
-
```
|
227 |
-
0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
|
228 |
-
| first sequence | second sequence |
|
229 |
-
```
|
230 |
-
|
231 |
-
if token_ids_1 is None, only returns the first portion of the mask (0s).
|
232 |
-
|
233 |
-
Args:
|
234 |
-
token_ids_0 (`List[int]`):
|
235 |
-
List of ids.
|
236 |
-
token_ids_1 (`List[int]`, *optional*):
|
237 |
-
Optional second list of IDs for sequence pairs.
|
238 |
-
|
239 |
-
Returns:
|
240 |
-
`List[int]`: List of [token type IDs](../glossary#token-type-ids) according to the given sequence(s).
|
241 |
-
"""
|
242 |
-
bos_token_id = [self.bos_token_id] if self.add_bos_token else []
|
243 |
-
eos_token_id = [self.eos_token_id] if self.add_eos_token else []
|
244 |
-
|
245 |
-
output = [0] * len(bos_token_id + token_ids_0 + eos_token_id)
|
246 |
-
|
247 |
-
if token_ids_1 is not None:
|
248 |
-
output += [1] * len(bos_token_id + token_ids_1 + eos_token_id)
|
249 |
-
|
250 |
-
return output
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
vocab/baichuan/Baichuan-7B/tokenizer.model
DELETED
@@ -1,3 +0,0 @@
|
|
1 |
-
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:4be54af290d93c113bcbf421115ae9eed9d6340408f564898f1e966dc738ef01
|
3 |
-
size 1136699
|
|
|
|
|
|
|
|
vocab/baichuan/Baichuan-7B/tokenizer_config.json
DELETED
@@ -1,35 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"auto_map": {
|
3 |
-
"AutoTokenizer": ["tokenization_baichuan.BaiChuanTokenizer", null]
|
4 |
-
},
|
5 |
-
"add_bos_token": false,
|
6 |
-
"add_eos_token": false,
|
7 |
-
"bos_token": {
|
8 |
-
"__type": "AddedToken",
|
9 |
-
"content": "<s>",
|
10 |
-
"lstrip": false,
|
11 |
-
"normalized": true,
|
12 |
-
"rstrip": false,
|
13 |
-
"single_word": false
|
14 |
-
},
|
15 |
-
"clean_up_tokenization_spaces": false,
|
16 |
-
"eos_token": {
|
17 |
-
"__type": "AddedToken",
|
18 |
-
"content": "</s>",
|
19 |
-
"lstrip": false,
|
20 |
-
"normalized": true,
|
21 |
-
"rstrip": false,
|
22 |
-
"single_word": false
|
23 |
-
},
|
24 |
-
"model_max_length": 1000000000000000019884624838656,
|
25 |
-
"sp_model_kwargs": {},
|
26 |
-
"tokenizer_class": "BaiChuanTokenizer",
|
27 |
-
"unk_token": {
|
28 |
-
"__type": "AddedToken",
|
29 |
-
"content": "<unk>",
|
30 |
-
"lstrip": false,
|
31 |
-
"normalized": true,
|
32 |
-
"rstrip": false,
|
33 |
-
"single_word": false
|
34 |
-
}
|
35 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
vocab/baichuan/__init__.py
DELETED
@@ -1,19 +0,0 @@
|
|
1 |
-
import os
|
2 |
-
import config
|
3 |
-
from transformers import AutoTokenizer
|
4 |
-
from vocab import TokenizerType
|
5 |
-
|
6 |
-
|
7 |
-
if config.USE_REMOTE:
|
8 |
-
tokenizer = AutoTokenizer.from_pretrained("baichuan-inc/Baichuan-7B", trust_remote_code=True)
|
9 |
-
else:
|
10 |
-
CURRENT_DIR = os.path.dirname(os.path.abspath(__file__))
|
11 |
-
TOKENIZER_DIR = os.path.join(CURRENT_DIR, "Baichuan-7B")
|
12 |
-
tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_DIR, trust_remote_code=True)
|
13 |
-
|
14 |
-
|
15 |
-
# byte-bpe sentencepiece
|
16 |
-
tokenizer.type = TokenizerType.ByteBPE
|
17 |
-
|
18 |
-
tokenizer.comments = "使用 SentencePiece 中的 Byte-Pair Encoding (BPE) 作为分词算法"
|
19 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
vocab/baichuan/demo.py
DELETED
@@ -1,6 +0,0 @@
|
|
1 |
-
|
2 |
-
from vocab.baichuan import tokenizer
|
3 |
-
|
4 |
-
id1 = tokenizer.encode("<pad>")
|
5 |
-
token1 = tokenizer.decode(125696)
|
6 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
vocab/baichuan/error.md
DELETED
@@ -1,8 +0,0 @@
|
|
1 |
-
|
2 |
-
|
3 |
-
## AttributeError: 'BaichuanTokenizer' object has no attribute 'sp_model'
|
4 |
-
|
5 |
-
https://huggingface.co/baichuan-inc/Baichuan2-13B-Chat/discussions/18
|
6 |
-
|
7 |
-
|
8 |
-
transfomers 4.34 doesn't work for me either. Degrading to 4.33.1 works in my case
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|