update

This view is limited to 50 files because it contains too many changes. See raw diff.
- .gitattributes +37 -35
- README.2.md +0 -136
- README.md +17 -15
- app.py +39 -24
- character_app.py +83 -79
- character_util.py +216 -216
- compression_app.py +187 -130
- compression_util.py +320 -302
- css/style.css +62 -59
- playground_app.py +233 -264
- playground_util.py +181 -181
- requirements.txt +11 -10
- stats/character_stats.json +0 -0
- stats/compression_rate.json +0 -0
- stats/compression_rate/ClassCat.gpt2-base-french @ cc100.ar.diff.json +3 -0
- stats/compression_rate/ClassCat.gpt2-base-french @ cc100.de.diff.json +3 -0
- stats/compression_rate/ClassCat.gpt2-base-french @ cc100.en.diff.json +3 -0
- stats/compression_rate/ClassCat.gpt2-base-french @ cc100.es.diff.json +3 -0
- stats/compression_rate/ClassCat.gpt2-base-french @ cc100.fa.diff.json +3 -0
- stats/compression_rate/ClassCat.gpt2-base-french @ cc100.fr.diff.json +3 -0
- stats/compression_rate/ClassCat.gpt2-base-french @ cc100.ja.diff.json +3 -0
- stats/compression_rate/ClassCat.gpt2-base-french @ cc100.ko.diff.json +3 -0
- stats/compression_rate/ClassCat.gpt2-base-french @ cc100.zh-Hans.diff.json +3 -0
- stats/compression_rate/ClassCat.gpt2-base-spanish @ cc100.ar.diff.json +3 -0
- stats/compression_rate/ClassCat.gpt2-base-spanish @ cc100.de.diff.json +3 -0
- stats/compression_rate/ClassCat.gpt2-base-spanish @ cc100.en.diff.json +3 -0
- stats/compression_rate/ClassCat.gpt2-base-spanish @ cc100.es.diff.json +3 -0
- stats/compression_rate/ClassCat.gpt2-base-spanish @ cc100.fa.diff.json +3 -0
- stats/compression_rate/ClassCat.gpt2-base-spanish @ cc100.fr.diff.json +3 -0
- stats/compression_rate/ClassCat.gpt2-base-spanish @ cc100.ja.diff.json +3 -0
- stats/compression_rate/ClassCat.gpt2-base-spanish @ cc100.ko.diff.json +3 -0
- stats/compression_rate/ClassCat.gpt2-base-spanish @ cc100.zh-Hans.diff.json +3 -0
- stats/compression_rate/ClueAI.ChatYuan-large-v2 @ cc100.ar.diff.json +3 -0
- stats/compression_rate/ClueAI.ChatYuan-large-v2 @ cc100.de.diff.json +3 -0
- stats/compression_rate/ClueAI.ChatYuan-large-v2 @ cc100.en.diff.json +3 -0
- stats/compression_rate/ClueAI.ChatYuan-large-v2 @ cc100.es.diff.json +3 -0
- stats/compression_rate/ClueAI.ChatYuan-large-v2 @ cc100.fa.diff.json +3 -0
- stats/compression_rate/ClueAI.ChatYuan-large-v2 @ cc100.fr.diff.json +3 -0
- stats/compression_rate/ClueAI.ChatYuan-large-v2 @ cc100.ja.diff.json +3 -0
- stats/compression_rate/ClueAI.ChatYuan-large-v2 @ cc100.ko.diff.json +3 -0
- stats/compression_rate/ClueAI.ChatYuan-large-v2 @ cc100.zh-Hans.diff.json +3 -0
- stats/compression_rate/ClueAI.PromptCLUE-base @ cc100.ar.diff.json +3 -0
- stats/compression_rate/ClueAI.PromptCLUE-base @ cc100.de.diff.json +3 -0
- stats/compression_rate/ClueAI.PromptCLUE-base @ cc100.en.diff.json +3 -0
- stats/compression_rate/ClueAI.PromptCLUE-base @ cc100.es.diff.json +3 -0
- stats/compression_rate/ClueAI.PromptCLUE-base @ cc100.fa.diff.json +3 -0
- stats/compression_rate/ClueAI.PromptCLUE-base @ cc100.fr.diff.json +3 -0
- stats/compression_rate/ClueAI.PromptCLUE-base @ cc100.ja.diff.json +3 -0
- stats/compression_rate/ClueAI.PromptCLUE-base @ cc100.ko.diff.json +3 -0
- stats/compression_rate/ClueAI.PromptCLUE-base @ cc100.zh-Hans.diff.json +3 -0
.gitattributes
CHANGED
@@ -1,35 +1,37 @@
```diff
 *.7z filter=lfs diff=lfs merge=lfs -text
 *.arrow filter=lfs diff=lfs merge=lfs -text
 *.bin filter=lfs diff=lfs merge=lfs -text
 *.bz2 filter=lfs diff=lfs merge=lfs -text
 *.ckpt filter=lfs diff=lfs merge=lfs -text
 *.ftz filter=lfs diff=lfs merge=lfs -text
 *.gz filter=lfs diff=lfs merge=lfs -text
 *.h5 filter=lfs diff=lfs merge=lfs -text
 *.joblib filter=lfs diff=lfs merge=lfs -text
 *.lfs.* filter=lfs diff=lfs merge=lfs -text
 *.mlmodel filter=lfs diff=lfs merge=lfs -text
 *.model filter=lfs diff=lfs merge=lfs -text
 *.msgpack filter=lfs diff=lfs merge=lfs -text
 *.npy filter=lfs diff=lfs merge=lfs -text
 *.npz filter=lfs diff=lfs merge=lfs -text
 *.onnx filter=lfs diff=lfs merge=lfs -text
 *.ot filter=lfs diff=lfs merge=lfs -text
 *.parquet filter=lfs diff=lfs merge=lfs -text
 *.pb filter=lfs diff=lfs merge=lfs -text
 *.pickle filter=lfs diff=lfs merge=lfs -text
 *.pkl filter=lfs diff=lfs merge=lfs -text
 *.pt filter=lfs diff=lfs merge=lfs -text
 *.pth filter=lfs diff=lfs merge=lfs -text
 *.rar filter=lfs diff=lfs merge=lfs -text
 *.safetensors filter=lfs diff=lfs merge=lfs -text
 saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.tar.* filter=lfs diff=lfs merge=lfs -text
 *.tar filter=lfs diff=lfs merge=lfs -text
 *.tflite filter=lfs diff=lfs merge=lfs -text
 *.tgz filter=lfs diff=lfs merge=lfs -text
 *.wasm filter=lfs diff=lfs merge=lfs -text
 *.xz filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+stats/iter_vocab/*.jsonl filter=lfs diff=lfs merge=lfs -text
+stats/compression_rate/*.json filter=lfs diff=lfs merge=lfs -text
```
README.2.md
DELETED
@@ -1,136 +0,0 @@
Deleted file contents:

````markdown
https://arxiv.org/abs/2308.16692 SpeechTokenizer

For OpenAI's models, token efficiency in English is 8-12 times that of Chinese;
previously, prompts over roughly three hundred Chinese characters made Turbo 3.5 16k produce logically inverted answers, and after switching the prompt to English the problem never reappeared.

## Vocabulary construction

bert vocab
gpt vocab
gpt-neox vocab

## encode


## decode

The bert vocab has a special character #

What about the gpt-neox vocab?
- A leading _ marks a space or the start of a sentence


## On tokenization granularity


## ss


bert-chinese vocab_size: 21128
bert-en
clue
glm
chatglm
bloom


## Minimal vocabulary

mobilenet


## ss


## bert

```
[PAD]
...
[unused99]
[UNK]
[CLS]
[SEP]
[MASK]
<S>
<T>
!
...

big
##ut
ftp
carol
##vi
```


## @@

https://github.com/pytorch/fairseq/blob/master/tests/test_noising.py#L37

```
"he@@", "llo", "n@@", "ew", "y@@", "or@@", "k"
```

Similar to BERT, except that BERT marks word suffixes while @@ marks word prefixes.

This should be https://github.com/rsennrich/subword-nmt


## GPT2

Vocabulary: https://huggingface.co/gpt2/raw/main/vocab.json

```
['What', "'s", 'Ġup', 'Ġwith', 'Ġthe', 'Ġtoken', 'izer', '?']
```
Unlike BERT, which uses a special symbol to mark "joining", GPT2 uses a special symbol to mark "space".

See gpt2/README.md for details.

- Functional symbols: `<|endoftext|>` indicates a line break. Tab? Space?
- Many digits are encoded as standalone tokens, close to a thousand of them.

- Similar: moss


### What is Ġ

It's a feature of byte-level BPE (an encoded space character).
Ġ represents a space; some versions use Ä instead of Ġ.


```sh
What's up with the tokenizer?
# after BPE
['What', "'s", 'Ġup', 'Ġwith', 'Ġthe', 'Ġtoken', 'izer', '?']
# after encoding with vocab.json
[ 2061, 338, 510, 351, 262, 11241, 7509, 30]
# after encoding with dict.txt (fairseq-specific)
[ other ids ]
```
<>
Question: up gets a Ġ, so why doesn't what? Because of the prefix handling.

- https://github.com/pytorch/fairseq/issues/1716
- https://github.com/huggingface/transformers/issues/1083


## Space, tab, newline


## reversible and lossless

It's reversible and lossless, so you can convert tokens back into the original text


## diff
````
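The Ġ marker and the lossless round trip described in these notes can be reproduced with a small sketch, assuming the Hugging Face `transformers` GPT-2 tokenizer (the printed values are the ones quoted above):

```python
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("gpt2")
text = "What's up with the tokenizer?"

# Byte-level BPE marks a preceding space with Ġ; "What" gets no Ġ because
# nothing precedes it.
print(tok.tokenize(text))
# ['What', "'s", 'Ġup', 'Ġwith', 'Ġthe', 'Ġtoken', 'izer', '?']

ids = tok.encode(text)
print(ids)  # [2061, 338, 510, 351, 262, 11241, 7509, 30]

# Reversible and lossless: decoding recovers the original text exactly.
assert tok.decode(ids) == text
```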
README.md
CHANGED
@@ -1,15 +1,17 @@
```diff
 ---
 title: Tokenizer Arena
-emoji:
+emoji: ⚔
 colorFrom: red
 colorTo: gray
 sdk: gradio
-sdk_version: 4.
+sdk_version: 4.32.2
 app_file: app.py
 pinned: false
 datasets:
 - cc100
 ---
 
 
+
+Please visit our GitHub repo for more information: https://github.com/xu-song/tokenizer-arena
```
app.py
CHANGED
@@ -1,24 +1,39 @@
Updated file contents:

```python
"""Gradio app to showcase the LLM tokenization."""

import os
import gradio as gr
from huggingface_hub import login
from playground_app import demo as playground_tab
from compression_app import demo as compression_tab
from character_app import demo as character_tab

auth_token = os.environ.get('HF_TOKEN', None)
if auth_token:
    login(token=auth_token)


title = '<div align="center">Tokenizer Arena ⚔️</div>'
interface_list = [playground_tab, compression_tab, character_tab]
tab_names = [" ⚔️ Playground", "🏆 Compression Leaderboard", "📊 Character Statistics"]

with gr.Blocks(css="css/style.css", js="js/onload.js") as demo:
    gr.HTML(
        f"<h1 style='text-align: center; margin-bottom: 1rem'>{title}</h1>"
    )
    with gr.Tabs():
        for interface, tab_name in zip(interface_list, tab_names):
            with gr.Tab(label=tab_name):
                interface.render()

    model_name = gr.Textbox(
        placeholder="🔍 Add tokenizer from Hugging Face (e.g. Xenova/gpt-4o) and press ENTER...",
        show_label=False,
    )

    model_name.submit()

    # demo.load(js=open("js/onload.js", "r", encoding="utf-8").read())

if __name__ == "__main__":
    demo.launch()
    # demo.queue(max_size=1024, default_concurrency_limit=80).launch()
```
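A minimal, self-contained sketch of the render-into-tabs pattern that `app.py` relies on, with toy tabs standing in for the real playground/compression/character apps (the names here are illustrative only):

```python
import gradio as gr

with gr.Blocks() as tab_a:
    gr.Markdown("content of tab A")

with gr.Blocks() as tab_b:
    gr.Markdown("content of tab B")

with gr.Blocks() as demo:
    with gr.Tabs():
        # Pre-built Blocks are rendered inside tabs of a parent Blocks,
        # exactly as app.py does with interface.render().
        for blocks, name in [(tab_a, "A"), (tab_b, "B")]:
            with gr.Tab(label=name):
                blocks.render()

if __name__ == "__main__":
    demo.launch()
```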
character_app.py
CHANGED
@@ -1,79 +1,83 @@
Updated file contents:

```python
"""
##
"""

import gradio as gr
from character_util import get_character_table, default_columns

all_columns = [
    ("digit", "digit"),
    ("space", "space"),
    ("lang-chinese", 'zh'),
    ("lang-korea", 'ko'),
    ("lang-japanese", 'ja'),
    # ("byte", "byte"),
    # ("oov", "oov")
]


# columns = ["lang-zh", "lang-korea", "lang-japanese", "number", "space", "bytes", "oov"]

abbr2name = {column[1]: column[0].split('-')[-1] for column in all_columns}


def get_column_info(columns):
    markdown = ""
    for column in columns:
        markdown += f"- `num({column})`: num of tokens containing {abbr2name[column]} characters\n" \
                    f"- `len({column})`: `min,median,max` length of tokens containing {abbr2name[column]} characters\n"
    return markdown


with gr.Blocks() as demo:
    gr.Markdown("## 🛠️ Setting")  # ⚙
    with gr.Accordion("Please select the type of character you want to count.", open=True):
        # file size 💽 🖴, tokens 🧮
        with gr.Row():
            with gr.Column():
                columns = gr.Checkboxgroup(
                    all_columns,
                    value=default_columns,
                    label="character type",
                    # info=""
                )
                gr.Markdown(
                    "To count other types of characters, you can modify [lang_util.py]"
                    "(https://huggingface.co/spaces/eson/tokenizer-arena/blob/main/utils/lang_util.py). "
                )
            column_info = gr.Markdown(
                get_column_info(default_columns)
            )

    gr.Markdown("## 📊 Character Statistics")
    search_bar = gr.Textbox(
        placeholder="🔍 Search by tokenizer or organization (e.g., 'llama', 'openai') and press ENTER...",
        show_label=False,
        elem_id="search-bar",
    )
    compress_rate_table = gr.Dataframe(datatype="html", wrap=True)

    search_bar.submit(
        get_character_table,
        inputs=[search_bar, columns],
        outputs=compress_rate_table
    )
    columns.change(
        get_character_table,
        inputs=[search_bar, columns],
        outputs=compress_rate_table
    )
    columns.change(
        get_column_info,
        inputs=[columns],
        outputs=column_info
    )

    demo.load(
        get_character_table,
        inputs=[search_bar, columns],
        outputs=compress_rate_table
    )

if __name__ == "__main__":
    demo.launch()
```
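For the default selection (`digit`, `zh`), `get_column_info` renders the legend below; this call is only illustrative and assumes the Space's repository (including its `vocab` and `utils` packages) is importable locally:

```python
from character_app import get_column_info

print(get_column_info(["digit", "zh"]))
# - `num(digit)`: num of tokens containing digit characters
# - `len(digit)`: `min,median,max` length of tokens containing digit characters
# - `num(zh)`: num of tokens containing chinese characters
# - `len(zh)`: `min,median,max` length of tokens containing chinese characters
```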
character_util.py
CHANGED
@@ -1,216 +1,216 @@
Updated file contents (the visible changes are the completed `utils.lang_util` import and the `language_tags = detect_language_by_unicode(...)` call):

```python
"""
TODO:
1. add more language
2. check space count of bert
3. add token_impl
4.
"""
import os
import json
import numpy as np
import pandas as pd
from collections import Counter, defaultdict
from vocab import tokenizer_factory
from typing import Optional, Union, Literal
from utils.log_util import logger
from utils.text_util import contains_digit, get_space_count
from utils.lang_util import detect_language_by_unicode, language_ranges

CURRENT_DIR = os.path.dirname(os.path.abspath(__file__))

default_columns = ["digit", "zh"]

def _to_unicode(text):
    return ''.join(r'\u{:04X}'.format(ord(chr)) for chr in text)


def _get_coding_length(tokenizer, vocab, filter=None):
    """
    oov character may be tokenized into more than one token.
    """
    all_length = []
    for word in vocab:
        if len(word) > 1:
            continue
        if filter is not None and filter(word):
            continue
        try:
            tokens = tokenizer.encode(word)
        except Exception as e:
            print(e)

        all_length.append(len(tokens))
        # if len(tokens.ids) > 1:
        # if len(tokens) > 3:
        #     print(word, tokens)

    dist_length = Counter(all_length)
    mean_length = round(sum(all_length) / len(all_length), 2)
    return dist_length, mean_length


cache = {}


def _dist(token_lens):
    """
    :param token_lens:
    :return: min,median,max of token_lens
    """
    if not token_lens:
        return "-"
    return f"{min(token_lens)},{round(np.median(token_lens))},{max(token_lens)}"


def iter_vocab(
        tokenizer_name: str,
        from_cache: bool = True,
        cache_dir: str = "stats",
) -> Union[pd.DataFrame, dict]:
    """
    :param tokenizer_name:
    :param from_cache:
    :param cache_dir:
    :return:
    """
    tokenizer_config = tokenizer_factory.get_tokenizer_config(tokenizer_name)

    cache_dir = os.path.join(CURRENT_DIR, cache_dir)
    os.makedirs(cache_dir, exist_ok=True)

    # load from cache
    cache_path = os.path.join(cache_dir, "character_stats.json")
    if not cache and os.path.exists(cache_path):
        with open(cache_path, "r", encoding="utf-8") as f_tmp:
            cache.update(json.load(f_tmp))
    if from_cache and tokenizer_name in cache:
        # logger.info(f"load {tokenizer_config.name_or_path} from cache")
        return cache[tokenizer_name]

    tokenizer = tokenizer_factory.get_tokenizer(tokenizer_name)

    tokens_by_lang = {lang[1]: [] for lang in language_ranges.keys()}
    digit_tokens = []
    space_tokens = []
    byte_tokens = []

    buffer = []
    for token_id in range(tokenizer.vocab_size):
        # for token_id in tokenizer.get_vocab():
        # for token_id in range(len(tokenizer)):
        decode_str = tokenizer.decode([token_id], skip_special_tokens=False)
        token = tokenizer.convert_ids_to_tokens([token_id], skip_special_tokens=False)[0]
        tags = []
        if token is None:  # some vocabs have empty (non-contiguous) ids
            continue
        if isinstance(token, bytes):
            token = token.decode("utf-8", errors="ignore")

        if hasattr(tokenizer, "sp_model"):  # based on the sentencepiece package
            if tokenizer.sp_model.is_byte(token_id):
                tags.append("is_byte")
                byte_tokens.append(token)

        language_tags = detect_language_by_unicode(decode_str)
        for language in language_tags:
            tokens_by_lang[language[1]].append(decode_str)

        if contains_digit(decode_str):
            tags.append("digit")
            digit_tokens.append(decode_str)

        space_count = get_space_count(decode_str)
        if space_count > 0:
            space_tokens.append(decode_str)

        buffer.append(json.dumps(
            {
                "id": token_id,
                "token": token,
                "token_decode": decode_str,
                "token_dumps": json.dumps(token),
                "token_unicode": _to_unicode(token),
                "token_len": len(decode_str),
            },
            ensure_ascii=False) + "\n")

    result = {
        "tokenizer": tokenizer_factory.get_name_with_hyperlink(tokenizer_name),
        "organization": tokenizer_config.org,
        # "impl": str(tokenizer.__class__),
        # "vocab_size-": tokenizer.vocab_size,  # vocab_size_without_added_token
        "vocab_size": len(tokenizer),

        # "mean coding length of Chinese chars": mean_length,  # not tracked: a vocab containing many Chinese chars generally implies a short coding length for them.
        # "coding length distribution of Chinese chars": json.dumps(dist_length),

        "num(digit)": len(digit_tokens),
        "len(digit)": _dist([len(token) for token in digit_tokens]),
        "num(space)": len(space_tokens),
        "len(space)": _dist([len(token) for token in space_tokens]),

        # "num(byte)": len(byte_tokens)
    }

    for lang, tokens in tokens_by_lang.items():
        result[f"num({lang})"] = len(tokens)
        result["len(" + lang + ")"] = _dist([len(token) for token in tokens])

    out_path = os.path.join(cache_dir, f"iter_vocab/{tokenizer_name.replace('/', '_')}.vocab.jsonl")
    with open(out_path, "w", encoding="utf-8") as f_out:
        for line in buffer:
            f_out.write(line)
    len_before = len(cache)
    cache[tokenizer_name] = result
    len_after = len(cache)
    logger.info(f"saving {tokenizer_name} to memory and file cache: {len_before}->{len_after}")
    with open(cache_path, "w", encoding="utf-8") as f_out:
        f_out.write(json.dumps(cache, ensure_ascii=False, indent=2))
    return result


def to_dataframe(stats, columns):
    table = []
    for stat in stats.values():
        filtered_stat = {}
        for k, v in stat.items():
            if not k.startswith("num") and not k.startswith("len"):
                filtered_stat[k] = v
            if any(column in k for column in columns):
                k = k.replace("ja-kana", "kana")
                filtered_stat[k] = v
        table.append(filtered_stat)
    df = pd.DataFrame(table)
    return df


def get_character_table(
        tokenizer_filter: Optional[str] = None,
        columns: Optional[list] = None,
        return_type: Optional[Literal["dict", "dataframe"]] = "dataframe"
) -> Union[pd.DataFrame, dict]:
    """
    """
    logger.info(f"columns: {columns}, tokenizer_filter: {tokenizer_filter}")
    stats = {}
    if columns is None:
        columns = default_columns
    if tokenizer_filter is not None:
        tokenizer_names = [tokenizer_config.name_or_path for tokenizer_config in tokenizer_factory.all_tokenizer_configs
                           if tokenizer_filter.lower() in tokenizer_config.name_or_path.lower()]
    else:
        tokenizer_names = tokenizer_factory.all_tokenizer_names

    for tokenizer_name in tokenizer_names:
        stat = iter_vocab(tokenizer_name)
        stats[tokenizer_name] = stat

    if return_type == "dataframe":
        stats = to_dataframe(stats, columns)
    return stats


if __name__ == "__main__":
    # aa = get_character_table(tokenizer_filter="baichuan")
    df = get_character_table()
    logger.info(f"\n{df.to_markdown(index=False)}")
```
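A hedged usage sketch for the module above; it assumes the Space's `vocab` and `utils` packages are importable and that the tokenizer names are ones registered in `tokenizer_factory` ("gpt2" and "llama" are only illustrative):

```python
from character_util import iter_vocab, get_character_table

# Per-tokenizer character statistics, cached in stats/character_stats.json.
stats = iter_vocab("gpt2")
print(stats["vocab_size"], stats["num(digit)"], stats["len(digit)"])

# Leaderboard-style table, filtered by a name substring.
df = get_character_table(tokenizer_filter="llama", columns=["digit", "zh"])
print(df.to_markdown(index=False))
```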
compression_app.py
CHANGED
@@ -1,130 +1,187 @@
Updated file contents:

````python
"""
TODO:
- count tokenizer_impl
- count OOV
- count reversal
- add math, code


## balance

- high compression rate VS vocab_size:
  - a high compression rate means fewer tokens after encoding, so tokens get longer --> vocab_size becomes too large
- high compression rate VS lossless
  - s
- OOV
  - many OOV chars may produce many UNKs (one UNK per char) --> more tokens -> lower compression rate
  - many OOV chars may produce few UNKs () --> more tokens -> lower compression rate

"""

import gradio as gr
from compression_util import get_compression_leaderboard, common_corpuses


# From the perspective of compression
# exactly reconstructed from compressed tokens
docs = """## 📖 What is a good tokenizer?

From a compression perspective, a good tokenizer should be lossless, and keep high compression rate (less tokens).
The encoding and decoding process can be formulated as
```python
token_ids = tokenizer.encode(input_text)   # compressed tokens
decoded_text = tokenizer.decode(token_ids) # reconstructed text
```

- **Lossless** <br>
Lossless tokenization preserves the exact original text, i.e. `decoded_text = input_text`.

  - Most lossy tokenizers get many out-of-vocabulary tokens. 👉 Check the [oov of bert-base-uncased](https://huggingface.co/spaces/eson/tokenizer-arena/raw/main/stats/google-bert.bert-base-casedcc100.zh-Hans.diff.json).
  - Some other tokenizers have no oov, but still be lossy due to text normalization. For example qwen performs [unicode normalization](https://github.com/huggingface/transformers/blob/v4.42.3/src/transformers/models/qwen2/tokenization_qwen2.py#L338),
  which may bring some [slight difference](https://huggingface.co/spaces/eson/tokenizer-arena/raw/main/stats/compression_rate/Qwen.Qwen1.5-1.8B%20@%20cc100.ja.diff.jsonn) to the reconstructed text.

- **Compression Rate** <br>
There are mainly two types of metric to represent the `input_text`:
  - `byte-level`: the number of bytes in the given text
  - `char-level`: the number of characters in the given text.

To evaluate compression rate, simple metrics can be "how many bytes per token" or "how many chars per token". <br>
In this leaderboard, we adopt more frequently used metric: "how many billion tokens per gigabytes corpus" and "how many chars
per token", i.e. `b_tokens/g_bytes` and `char/token`.


💬 [Discussions is Welcome](https://huggingface.co/spaces/eson/tokenizer-arena/discussions)
"""


# theme = gr.themes.Monochrome()
theme = gr.themes.Default()
# theme.set(accordion_text_weight=600)  # not supported yet
with gr.Blocks(theme=theme) as demo:
    # gr.Markdown("## Convertor")
    # with gr.Accordion("Convertor", open=False):
    #     gr.Markdown("Tokenize {} corpus")
    #     with gr.Row(elem_classes="no-border"):
    #         gr.Button("File Size", min_width=50)
    #         file_size = gr.Textbox(
    #             show_label=False,
    #             min_width=50,
    #             # elem_classes="textbox-as-text"
    #         )
    #         gr.Dropdown(
    #             choices=['MB', 'GB', 'TB'],
    #             show_label=False,
    #             min_width=15,
    #             # elem_classes="textbox-as-text"
    #         )
    #         # gr.Markdown('<h2 align="center">≈</h2>')
    #         # gr.HTML('<h2 style="margin: auto;">≈</h2>')
    #         gr.Button(
    #             "≈",
    #             min_width=10,
    #             elem_classes="button-white h2-font"
    #
    #         )
    #
    #         gr.Button(
    #             "Tokens",
    #             min_width=50
    #         )
    #         gr.Textbox(
    #             show_label=False,
    #             min_width=50
    #         )
    #         gr.Dropdown(
    #             ['million', 'billion', 'trillion'],
    #             show_label=False,
    #             min_width=15,
    #             elem_classes="button-white"
    #         )


    gr.Markdown(docs)
    gr.Markdown("## 🛠️ Setting")  # ⚙
    gr.Markdown("We perform tokenization on different corpus, and calculate the compression rate."
                "")
    with gr.Accordion("Please select the corpus and measure of compression rate.", open=True):
        # file size 💽 🖴, tokens 🧮
        # Total amount of disk used
        with gr.Row():
            with gr.Column():
                compress_rate_corpus = gr.Dropdown(
                    common_corpuses,  # , "code"
                    value=["cc100/en", "cc100/zh-Hans", "cc100/fr", "cc100/es"],
                    label="corpus",
                    multiselect=True
                    # info=""
                )

                # unit of file_size: gigabyte terabyte
                # unit of token_num: million billion trillion
                # The most common units of measurement include length (meter, inch, foot), weight (gram, kilogram, pound), volume (liter, gallon, milliliter), time (second, minute, hour)
                compress_rate_unit = gr.Radio(
                    ["b_tokens/g_bytes", "t_tokens/t_bytes"],
                    value="b_tokens/g_bytes",
                    label="measure",  # evaluation metric
                )

            gr.Markdown(
                # "Note:\n\n explanation"
                # "Supported languages are (20): arabic (ar), bulgarian (bg), german (de), modern greek (el), english (en), spanish (es), french (fr), hindi (hi), italian (it), japanese (ja), dutch (nl), polish (pl), portuguese (pt), russian (ru), swahili (sw), thai (th), turkish (tr), urdu (ur), vietnamese (vi), and chinese (zh)."
                # " arabic (ar), english (en), spanish (es), french (fr), italian (it), japanese (ja), portuguese (pt), russian (ru), and chinese (zh)."
                "- `corpus`: tokenization is performed on the selected subsets of [cc100](https://huggingface.co/datasets/statmt/cc100) corpus.\n"
                "- measure\n"
                " - `b_tokens/g_bytes` measures how many billion tokens per gigabytes corpus.\n"
                " - `t_tokens/t_bytes` measures how many trillion tokens per terabytes corpus.\n"
                # "- `g_bytes/b_tokens` measures how many gigabytes corpus per billion tokens.\n"
                # "- `t_bytes/t_tokens` measures how many terabytes corpus per trillion tokens.\n"
                " - `char/token` measures how many chars per token on the tokenized corpus.\n"
                " - `oov_ratio`: out-of-vocabulary ratio on the selected corpus, 👉 get [oov charset](https://huggingface.co/spaces/eson/tokenizer-arena/raw/main/stats/compression_rate.json)\n\n"
                "You can reproduce this procedure with [compression_util.py](https://huggingface.co/spaces/eson/tokenizer-arena/blob/main/compression_util.py)."
            )

    gr.Markdown("## 🏆 Compression Rate Leaderboard")
    search_bar = gr.Textbox(
        placeholder="🔍 Search by tokenizer or organization (e.g., 'llama', 'openai') and press ENTER...",
        show_label=False,
        elem_id="search-bar",
    )
    compress_rate_table = gr.Dataframe(datatype="html")

    # func call
    compress_rate_corpus.change(
        get_compression_leaderboard,
        inputs=[compress_rate_corpus, compress_rate_unit, search_bar],
        outputs=compress_rate_table
    )
    compress_rate_unit.change(
        get_compression_leaderboard,
        inputs=[compress_rate_corpus, compress_rate_unit, search_bar],
        outputs=compress_rate_table
    )
    # file_size.change(
    #     get_all_compress_rate,
    #     outputs=compress_rate_table
    # )

    search_bar.submit(
        get_compression_leaderboard,
        inputs=[
            compress_rate_corpus,
            compress_rate_unit,
            search_bar,
        ],
        outputs=compress_rate_table
    )

    demo.load(
        get_compression_leaderboard,
        inputs=[compress_rate_corpus, compress_rate_unit],
        outputs=compress_rate_table
    )

if __name__ == "__main__":
    demo.launch()
````
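The two measures explained in the `docs` string above can be computed directly. This is a rough sketch on a toy string rather than the cc100 subsets the Space uses; it mirrors the conversions in `unit_convertor` from `compression_util.py`:

```python
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("gpt2")
text = "Tokenizers compress text into integer ids."

n_bytes = len(text.encode("utf-8"))
n_chars = len(text)
n_tokens = len(tok.encode(text, add_special_tokens=False))

# billion tokens per gigabyte of corpus, and average characters per token
b_tokens_per_g_bytes = (n_tokens / 1000 ** 3) / (n_bytes / 1024 ** 3)
chars_per_token = n_chars / n_tokens
print(round(b_tokens_per_g_bytes, 3), round(chars_per_token, 3))
```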
compression_util.py
CHANGED
@@ -1,302 +1,320 @@
|
|
1 |
-
"""
|
2 |
-
|
3 |
-
|
4 |
-
|
5 |
-
|
6 |
-
whitespace:
|
7 |
-
|
8 |
-
|
9 |
-
|
10 |
-
|
11 |
-
import
|
12 |
-
import
|
13 |
-
import
|
14 |
-
from
|
15 |
-
|
16 |
-
from
|
17 |
-
from
|
18 |
-
|
19 |
-
|
20 |
-
|
21 |
-
|
22 |
-
|
23 |
-
|
24 |
-
|
25 |
-
|
26 |
-
|
27 |
-
|
28 |
-
|
29 |
-
"
|
30 |
-
"
|
31 |
-
"
|
32 |
-
"
|
33 |
-
"
|
34 |
-
"
|
35 |
-
|
36 |
-
|
37 |
-
|
38 |
-
|
39 |
-
|
40 |
-
|
41 |
-
|
42 |
-
|
43 |
-
|
44 |
-
|
45 |
-
|
46 |
-
|
47 |
-
|
48 |
-
|
49 |
-
|
50 |
-
|
51 |
-
|
52 |
-
|
53 |
-
|
54 |
-
|
55 |
-
|
56 |
-
|
57 |
-
|
58 |
-
|
59 |
-
|
60 |
-
|
61 |
-
|
62 |
-
|
63 |
-
|
64 |
-
value =
|
65 |
-
elif unit in ["token
|
66 |
-
value =
|
67 |
-
elif unit
|
68 |
-
value =
|
69 |
-
elif unit == "b_tokens
|
70 |
-
value =
|
71 |
-
elif unit == "
|
72 |
-
value =
|
73 |
-
elif unit == "t_tokens
|
74 |
-
value =
|
75 |
-
|
76 |
-
|
77 |
-
|
78 |
-
|
79 |
-
|
80 |
-
|
81 |
-
|
82 |
-
|
83 |
-
|
84 |
-
|
85 |
-
|
86 |
-
|
87 |
-
|
88 |
-
|
89 |
-
|
90 |
-
|
91 |
-
"
|
92 |
-
"
|
93 |
-
"
|
94 |
-
"
|
95 |
-
"
|
96 |
-
"
|
97 |
-
|
98 |
-
|
99 |
-
|
100 |
-
|
101 |
-
|
102 |
-
|
103 |
-
|
104 |
-
|
105 |
-
|
106 |
-
|
107 |
-
|
108 |
-
|
109 |
-
|
110 |
-
|
111 |
-
|
112 |
-
|
113 |
-
|
114 |
-
|
115 |
-
|
116 |
-
|
117 |
-
units =
|
118 |
-
|
119 |
-
|
120 |
-
|
121 |
-
|
122 |
-
|
123 |
-
|
124 |
-
|
125 |
-
|
126 |
-
|
127 |
-
|
128 |
-
|
129 |
-
|
130 |
-
|
131 |
-
|
132 |
-
|
133 |
-
|
134 |
-
|
135 |
-
|
136 |
-
|
137 |
-
|
138 |
-
|
139 |
-
|
140 |
-
|
141 |
-
|
142 |
-
|
143 |
-
:param
|
144 |
-
:
|
145 |
-
|
146 |
-
|
147 |
-
|
148 |
-
|
149 |
-
|
150 |
-
|
151 |
-
|
152 |
-
|
153 |
-
|
154 |
-
oov_charset =
|
155 |
-
|
156 |
-
|
157 |
-
|
158 |
-
|
159 |
-
|
160 |
-
|
161 |
-
|
162 |
-
|
163 |
-
|
164 |
-
|
165 |
-
|
166 |
-
|
167 |
-
|
168 |
-
|
169 |
-
|
170 |
-
|
171 |
-
|
172 |
-
|
173 |
-
|
174 |
-
|
175 |
-
|
176 |
-
|
177 |
-
|
178 |
-
|
179 |
-
|
180 |
-
|
181 |
-
|
182 |
-
|
183 |
-
|
184 |
-
|
185 |
-
|
186 |
-
|
187 |
-
|
188 |
-
|
189 |
-
|
190 |
-
|
191 |
-
|
192 |
-
|
193 |
-
|
194 |
-
|
195 |
-
|
196 |
-
|
197 |
-
|
198 |
-
|
199 |
-
|
200 |
-
|
201 |
-
|
202 |
-
|
203 |
-
|
204 |
-
|
205 |
-
|
206 |
-
|
207 |
-
|
208 |
-
|
209 |
-
|
210 |
-
|
211 |
-
|
212 |
-
|
213 |
-
|
214 |
-
|
215 |
-
|
216 |
-
|
217 |
-
|
218 |
-
|
219 |
-
|
220 |
-
|
221 |
-
|
222 |
-
|
223 |
-
|
224 |
-
|
225 |
-
|
226 |
-
|
227 |
-
|
228 |
-
|
229 |
-
|
230 |
-
|
231 |
-
|
232 |
-
|
233 |
-
|
234 |
-
|
235 |
-
|
236 |
-
|
237 |
-
|
238 |
-
|
239 |
-
|
240 |
-
}
|
241 |
-
|
242 |
-
os.
|
243 |
-
|
244 |
-
|
245 |
-
|
246 |
-
|
247 |
-
|
248 |
-
|
249 |
-
|
250 |
-
|
251 |
-
|
252 |
-
|
253 |
-
|
254 |
-
|
255 |
-
|
256 |
-
|
257 |
-
|
258 |
-
|
259 |
-
|
260 |
-
|
261 |
-
|
262 |
-
|
263 |
-
|
264 |
-
|
265 |
-
|
266 |
-
|
267 |
-
|
268 |
-
|
269 |
-
|
270 |
-
|
271 |
-
|
272 |
-
|
273 |
-
|
274 |
-
|
275 |
-
|
276 |
-
|
277 |
-
|
278 |
-
|
279 |
-
|
280 |
-
|
281 |
-
|
282 |
-
|
283 |
-
|
284 |
-
|
285 |
-
|
286 |
-
|
287 |
-
|
288 |
-
|
289 |
-
|
290 |
-
|
291 |
-
|
292 |
-
|
293 |
-
|
294 |
-
|
295 |
-
|
296 |
-
|
297 |
-
|
298 |
-
|
299 |
-
|
300 |
-
|
301 |
-
|
302 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""
|
2 |
+
## more statistics
|
3 |
+
code:
|
4 |
+
math:
|
5 |
+
digit:
|
6 |
+
whitespace:
|
7 |
+
top_oov: most frequent oov chars
|
8 |
+
ranking: thumb_up thumb_down
|
9 |
+
"""
|
10 |
+
|
11 |
+
import json
|
12 |
+
import os
|
13 |
+
import sys
|
14 |
+
from difflib import SequenceMatcher
|
15 |
+
import pandas as pd
|
16 |
+
from datasets import load_dataset
|
17 |
+
from utils.log_util import logger
|
18 |
+
from vocab import tokenizer_factory, TokenizerConfig
|
19 |
+
from typing import List, Optional, Union, Literal
|
20 |
+
|
21 |
+
CURRENT_DIR = os.path.dirname(os.path.abspath(__file__))
|
22 |
+
|
23 |
+
common_units = ["g_bytes/b_tokens", "b_tokens/g_bytes", "t_bytes/t_tokens", "t_tokens/t_bytes", "n_chars/n_tokens", ]
|
24 |
+
|
25 |
+
common_corpuses = sorted(["cc100/en", "cc100/zh-Hans", "cc100/es", "cc100/fr", "cc100/de", "cc100/ko",
|
26 |
+
"cc100/fa", "cc100/ar", "cc100/ja"])
|
27 |
+
|
28 |
+
VALID_CODES_CC100 = [
|
29 |
+
"am", "ar", "as", "az", "be", "bg", "bn", "bn_rom", "br", "bs", "ca", "cs", "cy", "da", "de",
|
30 |
+
"el", "en", "eo", "es", "et", "eu", "fa", "ff", "fi", "fr", "fy", "ga", "gd", "gl", "gn", "gu",
|
31 |
+
"ha", "he", "hi", "hi_rom", "hr", "ht", "hu", "hy", "id", "ig", "is", "it", "ja", "jv", "ka",
|
32 |
+
"kk", "km", "kn", "ko", "ku", "ky", "la", "lg", "li", "ln", "lo", "lt", "lv", "mg", "mk", "ml",
|
33 |
+
"mn", "mr", "ms", "my", "my_zaw", "ne", "nl", "no", "ns", "om", "or", "pa", "pl", "ps", "pt",
|
34 |
+
"qu", "rm", "ro", "ru", "sa", "si", "sc", "sd", "sk", "sl", "so", "sq", "sr", "ss", "su", "sv",
|
35 |
+
"sw", "ta", "ta_rom", "te", "te_rom", "th", "tl", "tn", "tr", "ug", "uk", "ur", "ur_rom", "uz",
|
36 |
+
"vi", "wo", "xh", "yi", "yo", "zh-Hans", "zh-Hant", "zu",
|
37 |
+
]
|
38 |
+
|
39 |
+
|
40 |
+
# code: https://huggingface.co/datasets/codeparrot/github-code-clean python java c sql html
|
41 |
+
# math:
|
42 |
+
|
43 |
+
def get_n_bytes_of_string(string_text):
|
44 |
+
n_bytes = len(string_text.encode("utf-8"))
|
45 |
+
return n_bytes
|
46 |
+
|
47 |
+
|
48 |
+
def unit_convertor(stat, unit):
|
49 |
+
n_tokens = stat["_n_tokens"]
|
50 |
+
n_chars = stat["_n_chars"]
|
51 |
+
n_bytes = stat["_n_bytes"]
|
52 |
+
|
53 |
+
if n_tokens is None:
|
54 |
+
return None
|
55 |
+
|
56 |
+
n_tokens_in_billion = n_tokens / (1000 * 1000 * 1000)
|
57 |
+
n_tokens_in_trillion = n_tokens / (1000 * 1000 * 1000 * 1000)
|
58 |
+
n_bytes_in_mb = n_bytes / (1024 * 1024)
|
59 |
+
n_bytes_in_gb = n_bytes_in_mb / 1024
|
60 |
+
n_bytes_in_tb = n_bytes_in_gb / 1024
|
61 |
+
# n_chars_in_billion = n_chars / (1000 * 1000 * 1000)
|
62 |
+
|
63 |
+
if unit == "n_tokens/n_bytes":
|
64 |
+
value = n_tokens / n_bytes
|
65 |
+
elif unit in ["char/token", "chars_per_token"]: # 重要:平均一个token包含多少个字符。
|
66 |
+
value = n_chars / n_tokens
|
67 |
+
elif unit in ["token/char", "tokens_per_char"]: # 一个中文汉字需要几个token?
|
68 |
+
value = n_tokens / n_chars
|
69 |
+
elif unit == "g_bytes/b_tokens":
|
70 |
+
value = n_bytes_in_gb / n_tokens_in_billion
|
71 |
+
elif unit == "b_tokens/g_bytes":
|
72 |
+
value = n_tokens_in_billion / n_bytes_in_gb
|
73 |
+
elif unit == "t_bytes/t_tokens": # 重要:
|
74 |
+
value = n_bytes_in_tb / n_tokens_in_trillion
|
75 |
+
elif unit == "t_tokens/t_bytes":
|
76 |
+
value = n_tokens_in_trillion / n_bytes_in_tb
|
77 |
+
else:
|
78 |
+
raise "measure not support"
|
79 |
+
return round(value, 3)
|
80 |
+
|
81 |
+
|
82 |
+
def _merge_stats_by_corpus(stats_by_corpus, oov_threshold=0.3):
|
83 |
+
"""
|
84 |
+
"""
|
85 |
+
all_stats = list(stats_by_corpus.values())
|
86 |
+
assert len(set([stats["tokenizer"] for stats in all_stats])) == 1
|
87 |
+
lossless = all(stat['lossless'] for stat in all_stats)
|
88 |
+
is_support = all(stat['oov_ratio'] < oov_threshold for stat in all_stats)
|
89 |
+
|
90 |
+
merged_stats = {
|
91 |
+
"tokenizer": all_stats[0]["tokenizer"],
|
92 |
+
"organization": all_stats[0]["organization"],
|
93 |
+
"vocab_size": all_stats[0]["vocab_size"],
|
94 |
+
"_n_bytes": 0,
|
95 |
+
"_n_tokens": 0 if is_support else None,
|
96 |
+
"_n_chars": 0,
|
97 |
+
"_n_oov_chars": 0,
|
98 |
+
"lossless": True,
|
99 |
+
}
|
100 |
+
for stats in all_stats:
|
101 |
+
merged_stats["_n_bytes"] += stats["_n_bytes"]
|
102 |
+
merged_stats["_n_chars"] += stats["_n_chars"]
|
103 |
+
if is_support: # The number of tokens cannot be accurately counted, when there are too many UNKs.
|
104 |
+
merged_stats["_n_tokens"] += stats["_n_tokens"]
|
105 |
+
merged_stats["_n_oov_chars"] += stats["_n_oov_chars"]
|
106 |
+
merged_stats["lossless"] &= stats['lossless']
|
107 |
+
|
108 |
+
merged_stats.update({
|
109 |
+
"oov_ratio": float("%.4g" % (stats["_n_oov_chars"] / stats["_n_chars"])),
|
110 |
+
"lossless": lossless
|
111 |
+
})
|
112 |
+
return merged_stats
|
113 |
+
|
114 |
+
|
115 |
+
def to_dataframe(stats, units=None):
|
116 |
+
if units is None:
|
117 |
+
units = common_units
|
118 |
+
elif not isinstance(units, list):
|
119 |
+
units = [units]
|
120 |
+
table = []
|
121 |
+
|
122 |
+
for stat in stats.values():
|
123 |
+
columns = {k: v for k, v in stat.items() if not k.startswith("_")}
|
124 |
+
for unit in units:
|
125 |
+
if unit not in stat:
|
126 |
+
columns[unit] = unit_convertor(stat, unit)
|
127 |
+
else:
|
128 |
+
logger.error(f"unit {unit} not support")
|
129 |
+
table.append(columns)
|
130 |
+
df = pd.DataFrame(table)
|
131 |
+
return df
|
132 |
+
|
133 |
+
|
134 |
+
cache = {}
|
135 |
+
|
136 |
+
|
137 |
+
def tokenize_corpus(
|
138 |
+
tokenizer_name: str,
|
139 |
+
corpuses: List[str],
|
140 |
+
cache_dir: str = "stats"
|
141 |
+
) -> dict:
|
142 |
+
"""
|
143 |
+
:param tokenizer_name:
|
144 |
+
:param corpuses:
|
145 |
+
:param cache_dir:
|
146 |
+
:return:
|
147 |
+
"""
|
148 |
+
|
149 |
+
def _assert_oov(tokenizer, oov_candidate):
|
150 |
+
|
151 |
+
tokenizer.encode()
|
152 |
+
|
153 |
+
def _char_based_oov(src_text, decoded_text, tokenizer):
|
154 |
+
oov_charset = [] # keep the order in src_text
|
155 |
+
decoded_charset = set(decoded_text)
|
156 |
+
for char in dict.fromkeys(src_text):
|
157 |
+
if char not in decoded_charset \
|
158 |
+
and char != tokenizer.decode(tokenizer.encode(char, add_special_tokens=False)):
|
159 |
+
oov_charset.append(char)
|
160 |
+
|
161 |
+
n_oov_chars = sum([1 for char in src_text if char in oov_charset])
|
162 |
+
return n_oov_chars, oov_charset
|
163 |
+
|
164 |
+
def _diff_path(src_text, decoded_text):
|
165 |
+
s = SequenceMatcher(a=src_text, b=decoded_text)
|
166 |
+
changes = []
|
167 |
+
for tag, i1, i2, j1, j2 in s.get_opcodes():
|
168 |
+
if tag != "equal":
|
169 |
+
changes.append('{:7} text[{}:{}] --> decoded_text[{}:{}] {!r:>8} --> {!r}'.format(
|
170 |
+
tag, i1, i2, j1, j2, src_text[i1:i2], decoded_text[j1:j2]))
|
171 |
+
return changes
|
172 |
+
|
173 |
+
def _tokenize(tokenizer, datasets, detail_path=None):
|
174 |
+
"""
|
175 |
+
:param tokenizer:
|
176 |
+
:param datasets:
|
177 |
+
:param detail_path:
|
178 |
+
:return:
|
179 |
+
"""
|
180 |
+
n_bytes = 0
|
181 |
+
n_tokens = 0
|
182 |
+
n_chars = 0
|
183 |
+
n_oov_chars = 0
|
184 |
+
diff_details = []
|
185 |
+
oov_charset = set()
|
186 |
+
unk_token_id = None
|
187 |
+
if hasattr(tokenizer, "unk_token"):
|
188 |
+
unk_token_id = tokenizer.unk_token_id
|
189 |
+
for dataset in datasets:
|
190 |
+
for item in dataset:
|
191 |
+
text = item["text"]
|
192 |
+
n_bytes += get_n_bytes_of_string(text)
|
193 |
+
n_chars += len(text)
|
194 |
+
ids = tokenizer.encode(text, add_special_tokens=False)
|
195 |
+
|
196 |
+
# detect oov
|
197 |
+
decoded_text = tokenizer.decode(ids)
|
198 |
+
decoded_text_without_unk = tokenizer.decode([token_id for token_id in ids if token_id != unk_token_id])
|
199 |
+
if decoded_text != text:
|
200 |
+
_n_oov_chars, _oov_charset = _char_based_oov(text, decoded_text_without_unk, tokenizer)
|
201 |
+
diffs = _diff_path(text, decoded_text)
|
202 |
+
diff_details.append(
|
203 |
+
{
|
204 |
+
"text": text,
|
205 |
+
"decoded_text": decoded_text,
|
206 |
+
"diff": diffs,
|
207 |
+
"n_oov_chars": _n_oov_chars,
|
208 |
+
'oov_ratio': _n_oov_chars / len(text),
|
209 |
+
'oov_charset': json.dumps(_oov_charset, ensure_ascii=False),
|
                }
            )
            n_oov_chars += _n_oov_chars
            oov_charset.update(_oov_charset)
            n_tokens += len(ids)
        stat = {
            "_n_bytes": n_bytes,
            "_n_tokens": n_tokens,
            "_n_chars": n_chars,
            "_n_oov_chars": n_oov_chars,
            "oov_ratio": n_oov_chars / n_chars,
            '_oov_charset': json.dumps(list(oov_charset), ensure_ascii=False),
            "lossless": len(diff_details) == 0
        }

        if detail_path and diff_details:
            logger.info(f"saving tokenization detail to '{detail_path}'")
            with open(detail_path, "w", encoding="utf-8") as f:
                f.write(json.dumps(diff_details, ensure_ascii=False, indent=2))
                # print(f"{tokenizer_config.name_or_path}, {infer_tokenizer_type(tokenizer_config)}\n"
                #       f"lossless: false; unk_token: {get_unk(tokenizer_config)},"
                #       f" unk_ratio: {unk_count / len(encoding):.4f}; oov: []")
                # for diff_detail in diff_details:
                #     # print(f"text[{i}] = {str(bytes(text[i:], 'utf-8'))}\n"
                #     #       f"decoding[{i}] = {str(bytes(decoding[i:], 'utf-8'))}")
                #     f.write(f"text= {json.dumps(text[i:], ensure_ascii=False)}, \n"
                #             f"decoding[{i}] = {json.dumps(decoding[i:], ensure_ascii=False)}")
        return stat

    # load from cache
    cache_id = f"{tokenizer_name} @ {'.'.join(corpuses)}"
    cache_path = os.path.join(cache_dir, "compression_rate.json")
    if not cache and os.path.exists(cache_path):
        with open(cache_path, "r", encoding="utf-8") as f_tmp:
            cache.update(json.load(f_tmp))
    if cache_id in cache:
        # logger.info(f"loading {cache_id} from in-memory cache")
        return cache[cache_id]

    # tokenize corpus
    tokenizer = tokenizer_factory.get_tokenizer(tokenizer_name)
    datasets = [load_dataset("eson/cc100-samples", corpus.replace("cc100/", ""), split="train") for corpus in corpuses]

    stat = {
        "tokenizer": tokenizer_factory.get_name_with_hyperlink(tokenizer_name),
        "organization": tokenizer_factory.get_tokenizer_config(tokenizer_name).org,
        "vocab_size": len(tokenizer),
    }
    tokenize_detail_dir = os.path.join(cache_dir, "compression_rate")
    os.makedirs(tokenize_detail_dir, exist_ok=True)
    tokenize_detail_path = os.path.join(tokenize_detail_dir, cache_id.replace("/", ".") + ".diff.json")
    stat.update(_tokenize(tokenizer, datasets, detail_path=tokenize_detail_path))
    # add basic info

    # save to cache
    len_before = len(cache)
    cache[cache_id] = stat
    len_after = len(cache)
    logger.info(f"saving '{cache_id}' to memory and file cache '{cache_path}': {len_before}->{len_after}")
    with open(cache_path, "w", encoding="utf-8") as f_tmp:
        json.dump(cache, f_tmp, ensure_ascii=False, indent=2)
    return stat


def get_compression_leaderboard(
        corpuses: List[str] = ['cc100/en'],
        unit: str = "b_tokens/g_bytes",
        tokenizer_filter: Optional[str] = None,
        return_type: Optional[Literal["dict", "dataframe"]] = "dataframe"
) -> Union[pd.DataFrame, dict]:
    """
    """
    logger.info(f"corpuses: {corpuses}; unit: {unit}; tokenizer_filter: {tokenizer_filter}")
    stats = {}
    if tokenizer_filter is not None:
        tokenizer_names = [tokenizer_name for tokenizer_name in tokenizer_factory.all_tokenizer_names
                           if tokenizer_filter.lower() in tokenizer_name.lower()]
    else:
        tokenizer_names = tokenizer_factory.all_tokenizer_names
    for tokenizer_name in tokenizer_names:
        stats_by_corpus = {}
        for corpus in corpuses:
            stats_by_corpus[corpus] = tokenize_corpus(tokenizer_name, [corpus])
        stats[tokenizer_name] = _merge_stats_by_corpus(stats_by_corpus)

    if return_type == "dataframe":
        token_number_unit, file_size_unit = unit.split("/")
        reverse_unit = f"{file_size_unit}/{token_number_unit}"
        stats = to_dataframe(stats, [unit, reverse_unit, "char/token"])
        stats = stats.sort_values(["oov_ratio", unit], ascending=[True, True])
        stats = stats.rename(columns={"oov_ratio": f' ⬆️oov_ratio'}).rename(columns={unit: f' ⬆️{unit}'})  # ⬇
    return stats


def main():
    if len(sys.argv) == 3:
        tokenizer_filter = [sys.argv[1]]
        corpuses = [sys.argv[2]]
    else:
        tokenizer_filter, corpuses = None, common_corpuses
        # tokenizer_filter, corpuses = "openai", ["cc100/en", "cc100/zh-Hans"]
        # tokenizer_filter, corpuses = "Qwen/Qwen1.5-14B", ["cc100/de"]
        # tokenizer_filter, corpuses = "Qwen/Qwen1.5-14B", ["cc100/ja"]  # especially many OOV chars
        # tokenizer_filter, corpuses = "google-bert/bert-base-uncased", ["cc100/ja", "cc100/zh-Hans"]  # especially many OOV chars
    df = get_compression_leaderboard(corpuses, tokenizer_filter=tokenizer_filter)
    # print(df.to_markdown(index=False, tablefmt='fancy_grid'))
    logger.info(f"\n{df.to_markdown(index=False)}")


if __name__ == "__main__":
    main()
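For orientation, here is a minimal standalone sketch of the metric tokenize_corpus computes above. It is not the Space's implementation: it skips the caching, OOV accounting and lossless check, assumes the cc100 samples expose a "text" column, and takes "b_tokens/g_bytes" to mean billions of tokens per 10^9 bytes of raw UTF-8 text. The helper name simple_compression_rate and the "gpt2" example are placeholders, not names from this repository.

# Hedged sketch, not the implementation above: token count vs. UTF-8 byte count
# for one tokenizer over a handful of cc100 samples.
from datasets import load_dataset
from transformers import AutoTokenizer


def simple_compression_rate(tokenizer_name: str, lang: str = "en", n_samples: int = 100) -> dict:
    tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
    samples = load_dataset("eson/cc100-samples", lang, split="train")
    n_bytes, n_tokens = 0, 0
    for text in samples["text"][:n_samples]:  # assumes a "text" column, as in cc100
        n_bytes += len(text.encode("utf-8"))
        n_tokens += len(tokenizer.encode(text, add_special_tokens=False))
    return {
        "_n_bytes": n_bytes,
        "_n_tokens": n_tokens,
        "b_tokens/g_bytes": (n_tokens / 1e9) / (n_bytes / 1e9),  # reduces to n_tokens / n_bytes
    }


if __name__ == "__main__":
    print(simple_compression_rate("gpt2", "en"))  # "gpt2" is only an example tokenizer name

A lower value means the tokenizer packs the same bytes into fewer tokens, which is why the leaderboard above sorts on this column (after oov_ratio).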
css/style.css
CHANGED
@@ -1,59 +1,62 @@
(The previous 59-line version of this stylesheet is collapsed in this view.)

/* hide legend of HighlightText; has been integrated in gradio 4.32.0 with `show_inline_category`
.category-legend {
    display: none !important;
}
*/

/* show space in HighlightText: https://blog.csdn.net/liuxiao723846/article/details/118994673
   TODO: integrate in gradio with `show_single_whitespace=True` or `strip_token=False`
*/
.space-show {
    white-space: pre-wrap;
}

.cell-wrap {
    white-space: pre-wrap;
}


/* white button */
.button-as-text {
    background: #fff;
    border-color: #fff;
}

.textbox-as-text {
    border-style: hidden;
    background: #fff;
    border-color: #fff;
}


.h2-font {
    font-size: 30px;
}

.no-border {
    border: 0px none;
}


.statistics {
    min-width: min(50px, 100%) !important;
}

.statistics textarea {
    min-width: min(50px, 100%) !important;
    font-size: 20px !important;
    font-weight: 600 !important;
    text-align: center !important;
    border: none !important;
}

.statistics label {
    text-align: center !important;
}

/* align-self: flex-end; */
.example-style {
    max-width: 150px;
    align-self: self-end;
}
playground_app.py
CHANGED
@@ -1,264 +1,233 @@
(The previous 264-line version of playground_app.py is collapsed in this view; the fragments that do render, the header comments, the commented-out compress-rate handlers, the dropdown_examples/demo.load wiring and the __main__ block, all reappear unchanged in the new version below.)
# coding=utf-8
# author: xusong
# time: 2022/8/23 16:06

import gradio as gr
from vocab import tokenizer_factory
from playground_examples import example_types, example_fn
from playground_util import tokenize, tokenize_pair, basic_count, get_overlap_token_size, on_load


get_window_url_params = """
    function(url_params) {
        const params = new URLSearchParams(window.location.search);
        url_params = JSON.stringify(Object.fromEntries(params));
        return url_params;
    }
    """

all_tokenizer_name = [(config.name_display, config.name_or_path) for config in tokenizer_factory.all_tokenizer_configs]

with gr.Blocks() as demo:
    # links: https://www.coderstool.com/utf8-encoding-decoding
    # feature: type in text and tokenize it
    # tokenizers: several common tokenizers are available
    # background: convenient tokenization, inspecting token granularity, side-by-side comparison

    with gr.Row():
        gr.Markdown("## Input Text")
        dropdown_examples = gr.Dropdown(
            example_types,
            value="Examples",
            type="index",
            allow_custom_value=True,
            show_label=False,
            container=False,
            scale=0,
            elem_classes="example-style"
        )
    user_input = gr.Textbox(
        # value=default_user_input,
        label="Input Text",
        lines=5,
        show_label=False,
    )
    gr.Markdown("## Tokenization")

    # compress rate setting  TODO: move this block further down
    # with gr.Accordion("Compress Rate Setting", open=True):
    #     gr.Markdown(
    #         "Please select corpus and unit of compress rate, get more details at [github](https://github.com/xu-song/tokenizer-arena/). ")
    #     with gr.Row():
    #         compress_rate_corpus = gr.CheckboxGroup(
    #             common_corpuses,  # , "code"
    #             value=["cc100-en", "cc100-zh-Hans"],
    #             label="corpus",
    #             # info=""
    #         )
    #         compress_rate_unit = gr.Radio(
    #             common_units,
    #             value="b_tokens/g_bytes",
    #             label="unit",
    #         )
    # TODO: Token Setting
    # with gr.Accordion("Token Filter Setting", open=False):
    #     gr.Markdown(
    #         "Get total number of tokens which contain the following character)")
    #     gr.Radio(
    #         ["zh-Hans", "", "number", "space"],
    #         value="zh",
    #     )

    with gr.Row():
        with gr.Column(scale=6):
            with gr.Group():
                tokenizer_name_1 = gr.Dropdown(
                    all_tokenizer_name,
                    label="Tokenizer 1",
                )
            with gr.Group():
                with gr.Row():
                    organization_1 = gr.TextArea(
                        label="Organization",
                        lines=1,
                        elem_classes="statistics",
                    )
                    stats_vocab_size_1 = gr.TextArea(
                        label="Vocab Size",
                        lines=1,
                        elem_classes="statistics"
                    )
                    # stats_zh_token_size_1 = gr.TextArea(
                    #     label="ZH char/word",
                    #     lines=1,
                    #     elem_classes="statistics",
                    # )
                    # stats_compress_rate_1 = gr.TextArea(
                    #     label="Compress Rate",
                    #     lines=1,
                    #     elem_classes="statistics",
                    # )
                    stats_overlap_token_size_1 = gr.TextArea(
                        # value=default_stats_overlap_token_size,
                        label="Overlap Tokens",
                        lines=1,
                        elem_classes="statistics"
                    )
                    # stats_3 = gr.TextArea(
                    #     label="Compress Rate",
                    #     lines=1,
                    #     elem_classes="statistics"
                    # )
        # https://www.onlinewebfonts.com/icon/418591
        gr.Image("images/VS.svg", scale=1, show_label=False,
                 show_download_button=False, container=False,
                 show_share_button=False)
        with gr.Column(scale=6):
            with gr.Group():
                tokenizer_name_2 = gr.Dropdown(
                    all_tokenizer_name,
                    label="Tokenizer 2",
                )
            with gr.Group():
                with gr.Row():
                    organization_2 = gr.TextArea(
                        label="Organization",
                        lines=1,
                        elem_classes="statistics",
                    )
                    stats_vocab_size_2 = gr.TextArea(
                        label="Vocab Size",
                        lines=1,
                        elem_classes="statistics"
                    )
                    # stats_zh_token_size_2 = gr.TextArea(
                    #     label="ZH char/word",  # Chinese chars / words
                    #     lines=1,
                    #     elem_classes="statistics",
                    # )
                    # stats_compress_rate_2 = gr.TextArea(
                    #     label="Compress Rate",
                    #     lines=1,
                    #     elem_classes="statistics"
                    # )
                    stats_filtered_token_2 = gr.TextArea(
                        label="filtered tokens",
                        lines=1,
                        elem_classes="statistics",
                        visible=False
                    )
                    stats_overlap_token_size_2 = gr.TextArea(
                        label="Overlap Tokens",
                        lines=1,
                        elem_classes="statistics"
                    )

    # TODO: charts, tables, compression rate
    with gr.Row():
        # dynamic change label
        with gr.Column():
            output_text_1 = gr.Highlightedtext(
                show_legend=False,
                show_inline_category=False,
                elem_classes="space-show"
            )
        with gr.Column():
            output_text_2 = gr.Highlightedtext(
                show_legend=False,
                show_inline_category=False,
                elem_classes="space-show"
            )

    with gr.Row():
        output_table_1 = gr.Dataframe()
        output_table_2 = gr.Dataframe()

    # setting
    # compress_rate_unit.change(compress_rate_unit_change, [compress_rate_unit],
    #                           [stats_compress_rate_1, stats_compress_rate_2])

    tokenizer_name_1.change(tokenize, [user_input, tokenizer_name_1],
                            [output_text_1, output_table_1])
    tokenizer_name_1.change(basic_count, [tokenizer_name_1], [stats_vocab_size_1, organization_1])
    tokenizer_name_1.change(get_overlap_token_size, [tokenizer_name_1, tokenizer_name_2],
                            [stats_overlap_token_size_1, stats_overlap_token_size_2])
    # tokenizer_type_1.change(get_compress_rate, [tokenizer_type_1, compress_rate_corpus, compress_rate_unit],
    #                         [stats_compress_rate_1])

    # TODO: every=3
    user_input.change(tokenize_pair,
                      [user_input, tokenizer_name_1, tokenizer_name_2],
                      [output_text_1, output_table_1, output_text_2, output_table_2])  # , pass_request=1

    tokenizer_name_2.change(tokenize, [user_input, tokenizer_name_2],
                            [output_text_2, output_table_2])
    tokenizer_name_2.change(basic_count, [tokenizer_name_2], [stats_vocab_size_2, organization_2])
    tokenizer_name_2.change(get_overlap_token_size, [tokenizer_name_1, tokenizer_name_2],
                            [stats_overlap_token_size_1, stats_overlap_token_size_2])
    # tokenizer_type_2.change(get_compress_rate,
    #                         [tokenizer_type_2, compress_rate_corpus, compress_rate_unit],
    #                         [stats_compress_rate_2])
    #
    # compress_rate_unit.change(get_compress_rate,
    #                           [tokenizer_type_1, compress_rate_corpus, compress_rate_unit],
    #                           [stats_compress_rate_1])
    # compress_rate_unit.change(get_compress_rate,
    #                           [tokenizer_type_2, compress_rate_corpus, compress_rate_unit],
    #                           [stats_compress_rate_2])
    # compress_rate_corpus.change(get_compress_rate,
    #                             [tokenizer_type_1, compress_rate_corpus, compress_rate_unit],
    #                             [stats_compress_rate_1])
    # compress_rate_corpus.change(get_compress_rate,
    #                             [tokenizer_type_2, compress_rate_corpus, compress_rate_unit],
    #                             [stats_compress_rate_2])

    dropdown_examples.change(
        example_fn,
        dropdown_examples,
        [user_input, tokenizer_name_1, tokenizer_name_2]
    )

    demo.load(
        fn=on_load,
        inputs=[user_input],  # passing an empty object is enough here
        outputs=[user_input, tokenizer_name_1, tokenizer_name_2],
        js=get_window_url_params
    )

if __name__ == "__main__":
    # demo.queue(max_size=20).launch()
    demo.launch()
    # demo.launch(share=True)
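A usage note on the wiring above: demo.load passes the browser query string (via get_window_url_params) to on_load, which, as defined in playground_util.py below, reads the keys tokenizer1, tokenizer2 and text. A deep link can therefore pre-fill the playground. The sketch below only assembles such a URL; the host name is a placeholder, not something stated in this diff.

# Hedged sketch: build a URL that pre-selects both tokenizers and the input text.
from urllib.parse import urlencode

base_url = "https://your-space-host.example"  # placeholder host, not from this diff
params = {
    "tokenizer1": "gradientai/Llama-3-8B-Instruct-Gradient-1048k",
    "tokenizer2": "openai/gpt-4o",
    "text": "华为发布Mate60手机。",
}
print(base_url + "/?" + urlencode(params))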
playground_util.py
CHANGED
@@ -1,181 +1,181 @@
(The previous version of playground_util.py is collapsed here because it is nearly identical to the new version that follows. The visible differences: line 17 set default_tokenizer_name_2 to a different "openai/gpt-..." model, truncated in this view; the token loop did not compute decoded_text before extending pos_tokens; and the "Text" entry of the per-token table was rendered without a value, its old line being truncated in this view as well.)
import gradio as gr
import json
import copy
import pandas as pd
from vocab import tokenizer_factory
from character_util import iter_vocab
from utils.log_util import logger
from functools import lru_cache

default_user_input = """\
Replace this text in the input field to see how tokenization works.
Buenos días!
华为发布Mate60手机。
ラグビーワールドカップ2023フランス"""
# default_tokenizer_name_1 = "Meta/llama3"
default_tokenizer_name_1 = "gradientai/Llama-3-8B-Instruct-Gradient-1048k"
default_tokenizer_name_2 = "openai/gpt-4o"


@lru_cache
def _tokenize(
        text: str,
        tokenizer_name: str,
        color_num: int = 5,
        add_special_token: bool = False
):
    logger.info("param=" + json.dumps({"text": text, "tokenizer_type": tokenizer_name}, ensure_ascii=False))
    pos_tokens = []
    tokenizer = tokenizer_factory.get_tokenizer(tokenizer_name)
    if add_special_token:
        encoding = tokenizer.encode(text, add_special_tokens=True)
    else:
        encoding = tokenizer.encode(text, add_special_tokens=False)

    table = []

    for idx, token_id in enumerate(encoding):
        decoded_text = tokenizer.decode([token_id])  # special byte tokens decode to the replacement char �, i.e. "\ufffd"
        pos_tokens.extend([(decoded_text, str(idx % color_num))])

        # token "Byte":  # this is the utf-8 encoding, presumably
        token = tokenizer.convert_ids_to_tokens([token_id], skip_special_tokens=False)[0]
        if isinstance(token, bytes):
            try:
                token_str = token.decode("utf-8")
            except:
                token_str = token.decode("utf-8", errors="ignore")
                logger.error(f"{idx}: decode_error: " + json.dumps(  # gpt_35_turbo often has tokens that fail to decode; log them here
                    {"tokenizer_type": tokenizer_name, "token": str(token), "token_str": token_str},
                    ensure_ascii=False))

            token_bytes = token
            # json_dumps = json.dumps(token_str)
        elif isinstance(token, str):
            token_str = token
            token_bytes = bytes(token_str, "utf-8")
            # json_dumps = json.dumps(token_str)
        else:
            logger.error(f"{idx}: wrong type for token {token_id} {type(token)} " + json.dumps(
                {"text": text, "tokenizer_type": tokenizer_name}, ensure_ascii=False))
            token_str = token
            token_bytes = token
            # continue

        # ⭐
        # TODO: gpt3.5_turbo bug: only the id and text are correct; token and utf8 are both wrong, so convert_ids_to_tokens misbehaves.
        table.append(
            {"TokenID": token_id,
             "Token": token_str,  # string after utf-8 decoding; why are some shown as <0xE7>, and what does that mean? e.g. llama
             "Text": decoded_text,  #
             # "Bytes": token_bytes,  # bytes get decoded to a string on the gradio frontend, e.g. b'\xe4\xb8\xad' still renders as "中", hence str(token_bytes)
             "UTF8 Bytes": str(token_bytes),
             # "Unicode": json_dumps  # unicode: shown directly if ascii, otherwise shown as the unicode escape
             }
        )

    table_df = pd.DataFrame(table)
    logger.info(f"tokenizer_type={tokenizer_name}, Tokens={table[:4]}")
    return pos_tokens, len(encoding), table_df


def tokenize(
        text: str,
        tokenizer_name: str,
        color_num: int = 5,
        add_special_token: bool = False
):
    """ tokenize wrapper
    As gr.Update would be overwritten after passing to frontend, we apply lru_cache in _tokenize.
    """
    pos_tokens, num_tokens, table_df = _tokenize(text, tokenizer_name, color_num, add_special_token)
    return gr.update(value=pos_tokens, label=f"Tokens: {num_tokens}"), table_df


def tokenize_pair(text, tokenizer_type_1, tokenizer_type_2):
    """
    input_text.change
    """
    pos_tokens_1, table_df_1 = tokenize(text, tokenizer_type_1)
    pos_tokens_2, table_df_2 = tokenize(text, tokenizer_type_2)
    return pos_tokens_1, table_df_1, pos_tokens_2, table_df_2


@lru_cache
def basic_count(tokenizer_name):
    stats = iter_vocab(tokenizer_name)
    return stats['vocab_size'], f'{stats["organization"]}'
    # return tokenizer.vocab_size, f'{stats["中文汉字数"]["中文单字"]}/{stats["中文汉字数"]["中文多字"]}'


# def get_compress_rate(tokenizer_name, all_corpus, unit):
#     tokenizer = tokenizer_factory.get_tokenizer(tokenizer_name)
#     compress_rate_stats = tokenize_corpus(tokenizer, all_corpus)
#     compress_rate = unit_convertor(compress_rate_stats, unit)
#     return compress_rate


@lru_cache
def get_overlap_token_size(tokenizer_name_1, tokenizer_name_2):
    tokenizer1 = tokenizer_factory.get_tokenizer(tokenizer_name_1)
    tokenizer2 = tokenizer_factory.get_tokenizer(tokenizer_name_2)

    vocab_set_1 = tokenizer1.get_vocab().keys()
    vocab_set_2 = tokenizer2.get_vocab().keys()

    token1 = next(iter(vocab_set_1))
    token2 = next(iter(vocab_set_2))
    if type(token1) != type(token2):  # bytes vs str
        if isinstance(token1, str):
            vocab_set_1 = set([token.encode("utf-8") for token in vocab_set_1])
        if isinstance(token2, str):
            vocab_set_2 = set([token.encode("utf-8") for token in vocab_set_2])

    overlap_tokens = vocab_set_1 & vocab_set_2
    overlap_token_size = len(overlap_tokens)
    logger.info(
        f"{overlap_token_size} OverlapTokens of {tokenizer_name_1} {tokenizer_name_2}: {list(overlap_tokens)[:10]}")
    return overlap_token_size, overlap_token_size


def on_load(url_params, request: gr.Request):
    """
    onLoad
    """
    text = None
    tokenizer_type_1 = None
    tokenizer_type_2 = None
    try:
        url_params = json.loads(url_params)
    except:
        url_params = {}
    if request:
        logger.info(str(request.headers))
        client_ip = request.client.host
        # local_ip = socket.gethostbyname(socket.gethostbyname(""))
        # headers = request.kwargs['headers']
        # if headers and 'x-forwarded-for' in headers:
        #     x_forwarded_for = headers['x-forwarded-for']
        #     client_ip = x_forwarded_for.split(' ')[0] if x_forwarded_for else ""
        # if "referer" in request.headers:  # does not work for huggingface-space
        #     url_params = parse_qs(urlparse(request.headers["referer"]).query)
        #     url_params = {k: v[0] for k, v in url_params.items() if len(v) > 0}
        tokenizer_type_1 = url_params.get("tokenizer1", default_tokenizer_name_1)
        tokenizer_type_2 = url_params.get("tokenizer2", default_tokenizer_name_2)
        text = url_params.get("text", default_user_input)
        logger.info(f"client_ip: {client_ip}; params: {url_params}")
    return text, tokenizer_type_1, tokenizer_type_2


# def compress_rate_unit_change(unit):
#     return gr.update(label=f"Compress Rate: {unit}"), gr.update(label=f"Compress Rate: {unit}"),


def test_coding():
    bytes1 = b'\xe4\xb8\xad'
    print(bytes1)  # b'\xe4\xb8\xad'


if __name__ == "__main__":
    print(get_overlap_token_size("gpt-35-turbo", "gpt-4"))
    # print(basic_count("internlm_chat_7b"))
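To see what _tokenize produces without the Gradio wrapper, here is a small sketch that rebuilds the same per-token table (TokenID, Token, Text, UTF8 Bytes) with transformers and pandas only. The tokenizer name is just an example, and the byte-token branch is simplified compared with the error handling above.

# Hedged sketch of the per-token table built in _tokenize() above.
import pandas as pd
from transformers import AutoTokenizer


def token_table(text: str, tokenizer_name: str = "gpt2") -> pd.DataFrame:
    tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
    rows = []
    for token_id in tokenizer.encode(text, add_special_tokens=False):
        token = tokenizer.convert_ids_to_tokens([token_id], skip_special_tokens=False)[0]
        token_bytes = token.encode("utf-8") if isinstance(token, str) else token
        rows.append({
            "TokenID": token_id,
            "Token": token if isinstance(token, str) else str(token),
            "Text": tokenizer.decode([token_id]),  # partial UTF-8 sequences decode to "\ufffd"
            "UTF8 Bytes": str(token_bytes),
        })
    return pd.DataFrame(rows)


if __name__ == "__main__":
    print(token_table("华为发布Mate60手机。"))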
requirements.txt
CHANGED
@@ -1,11 +1,12 @@
(The previous 10 requirement lines are collapsed in this view; "fugashi" was already present and is kept.)
gradio>=4.32.0
transformers
sentencepiece
tiktoken
icetk
torch
nltk
boto3
protobuf==4.25.3
ai2-olmo==0.2.4
ipadic
fugashi
stats/character_stats.json: CHANGED (the diff for this file is too large to render; see raw diff)
stats/compression_rate.json: CHANGED (the diff for this file is too large to render; see raw diff)

The following tokenization-detail files were ADDED as 3-line Git LFS pointers. Each pointer begins with "version https://git-lfs.github.com/spec/v1" and records only the object id and size listed here:

stats/compression_rate/ClassCat.gpt2-base-french @ cc100.ar.diff.json  oid sha256:bfb1c2be8bf13e5989a95b5f401f92aaad6cadde8ecc704ebaf9b9578bb359a2  size 2145294
stats/compression_rate/ClassCat.gpt2-base-french @ cc100.de.diff.json  oid sha256:554a365ce0da76ae5d93642b496bb1bc3d8d78c1112523545a2219f7fe213a91  size 10978507
stats/compression_rate/ClassCat.gpt2-base-french @ cc100.en.diff.json  oid sha256:21c349b2602379affd0aa388d75addece67a14d0afaaf5b4980c90e9cc875e8e  size 5261108
stats/compression_rate/ClassCat.gpt2-base-french @ cc100.es.diff.json  oid sha256:e829c4c544a9e8d98701b3d3bf1e3593b63e59ab5ba244c1ab376f6002fbd0f9  size 6853004
stats/compression_rate/ClassCat.gpt2-base-french @ cc100.fa.diff.json  oid sha256:908327a56262f721590d9479faa579156ba8bd155242262943797be697bc2655  size 1058478
stats/compression_rate/ClassCat.gpt2-base-french @ cc100.fr.diff.json  oid sha256:8f02e17dfe25c4c1526c8adee812a7141d92ccbd3b1160e7c73fc325d9fbfe4e  size 6385085
stats/compression_rate/ClassCat.gpt2-base-french @ cc100.ja.diff.json  oid sha256:0abf3a91ddeeaa12d4732eaf1b4ff2a207b3d85fc54a079b4ac853696d831148  size 2529096
stats/compression_rate/ClassCat.gpt2-base-french @ cc100.ko.diff.json  oid sha256:8fd64f035328b88bb4389ee820bb6d2bed510e0e4259cc4f38a0f573d2c003c2  size 2491144
stats/compression_rate/ClassCat.gpt2-base-french @ cc100.zh-Hans.diff.json  oid sha256:7781b5bc9b2c3f45764842bf551a6e039ddef4f1bafd85ce12446834a26dd241  size 10841058
stats/compression_rate/ClassCat.gpt2-base-spanish @ cc100.ar.diff.json  oid sha256:bfb1c2be8bf13e5989a95b5f401f92aaad6cadde8ecc704ebaf9b9578bb359a2  size 2145294
stats/compression_rate/ClassCat.gpt2-base-spanish @ cc100.de.diff.json  oid sha256:554a365ce0da76ae5d93642b496bb1bc3d8d78c1112523545a2219f7fe213a91  size 10978507
stats/compression_rate/ClassCat.gpt2-base-spanish @ cc100.en.diff.json  oid sha256:21c349b2602379affd0aa388d75addece67a14d0afaaf5b4980c90e9cc875e8e  size 5261108
stats/compression_rate/ClassCat.gpt2-base-spanish @ cc100.es.diff.json  oid sha256:e829c4c544a9e8d98701b3d3bf1e3593b63e59ab5ba244c1ab376f6002fbd0f9  size 6853004
stats/compression_rate/ClassCat.gpt2-base-spanish @ cc100.fa.diff.json  oid sha256:908327a56262f721590d9479faa579156ba8bd155242262943797be697bc2655  size 1058478
stats/compression_rate/ClassCat.gpt2-base-spanish @ cc100.fr.diff.json  oid sha256:8f02e17dfe25c4c1526c8adee812a7141d92ccbd3b1160e7c73fc325d9fbfe4e  size 6385085
stats/compression_rate/ClassCat.gpt2-base-spanish @ cc100.ja.diff.json  oid sha256:0abf3a91ddeeaa12d4732eaf1b4ff2a207b3d85fc54a079b4ac853696d831148  size 2529096
stats/compression_rate/ClassCat.gpt2-base-spanish @ cc100.ko.diff.json  oid sha256:8fd64f035328b88bb4389ee820bb6d2bed510e0e4259cc4f38a0f573d2c003c2  size 2491144
stats/compression_rate/ClassCat.gpt2-base-spanish @ cc100.zh-Hans.diff.json  oid sha256:7781b5bc9b2c3f45764842bf551a6e039ddef4f1bafd85ce12446834a26dd241  size 10841058
stats/compression_rate/ClueAI.ChatYuan-large-v2 @ cc100.ar.diff.json  oid sha256:5b56af2e07e0c6ae80ed6c212d92a11eaad7dc654c187c7471738ba3c830a588  size 20780798
stats/compression_rate/ClueAI.ChatYuan-large-v2 @ cc100.de.diff.json  oid sha256:00492605965dd0637b79fe80e3d2428065cba551a9a7198bd7a0b505ce85d81b  size 2751629
stats/compression_rate/ClueAI.ChatYuan-large-v2 @ cc100.en.diff.json  oid sha256:e823bebc4f8f42e03b8e621baa23b07072a4199eb0fd293e92d11c96003f3433  size 163424
stats/compression_rate/ClueAI.ChatYuan-large-v2 @ cc100.es.diff.json  oid sha256:b3003793b062ae28b5b4f202b8f0d9f725e46f024acc38f7f9ef08e8b3381fc0  size 2030664
stats/compression_rate/ClueAI.ChatYuan-large-v2 @ cc100.fa.diff.json  oid sha256:6c07f75c1eb80e59bab44b7b6ced9aec1404dbf56a5abd85779846c83974a7de  size 18041636
stats/compression_rate/ClueAI.ChatYuan-large-v2 @ cc100.fr.diff.json  oid sha256:341e747d07dd8276b90de8c7d725a45e10d39084bc819ffd54cab6460ddcba63  size 3129632
stats/compression_rate/ClueAI.ChatYuan-large-v2 @ cc100.ja.diff.json  oid sha256:b46c604a75d0288f253c3439a2a7333c38e900ebb42ba39dd1c2ecbe4229f304  size 6425383
stats/compression_rate/ClueAI.ChatYuan-large-v2 @ cc100.ko.diff.json  oid sha256:eeab167e9f566512c3065d362e720f1930bd51ca5b9c14c207a252fa9380e7fa  size 15893128
stats/compression_rate/ClueAI.ChatYuan-large-v2 @ cc100.zh-Hans.diff.json  oid sha256:e851ffd1f4f9bf8949cb0e77cc15ea65223fe4a54ac5a13ec9e43c27a550388f  size 10563259
stats/compression_rate/ClueAI.PromptCLUE-base @ cc100.ar.diff.json  oid sha256:5b56af2e07e0c6ae80ed6c212d92a11eaad7dc654c187c7471738ba3c830a588  size 20780798
stats/compression_rate/ClueAI.PromptCLUE-base @ cc100.de.diff.json  oid sha256:00492605965dd0637b79fe80e3d2428065cba551a9a7198bd7a0b505ce85d81b  size 2751629
stats/compression_rate/ClueAI.PromptCLUE-base @ cc100.en.diff.json  oid sha256:e823bebc4f8f42e03b8e621baa23b07072a4199eb0fd293e92d11c96003f3433  size 163424
stats/compression_rate/ClueAI.PromptCLUE-base @ cc100.es.diff.json  oid sha256:b3003793b062ae28b5b4f202b8f0d9f725e46f024acc38f7f9ef08e8b3381fc0  size 2030664
stats/compression_rate/ClueAI.PromptCLUE-base @ cc100.fa.diff.json  oid sha256:6c07f75c1eb80e59bab44b7b6ced9aec1404dbf56a5abd85779846c83974a7de  size 18041636
stats/compression_rate/ClueAI.PromptCLUE-base @ cc100.fr.diff.json  oid sha256:341e747d07dd8276b90de8c7d725a45e10d39084bc819ffd54cab6460ddcba63  size 3129632
stats/compression_rate/ClueAI.PromptCLUE-base @ cc100.ja.diff.json  oid sha256:b46c604a75d0288f253c3439a2a7333c38e900ebb42ba39dd1c2ecbe4229f304  size 6425383
stats/compression_rate/ClueAI.PromptCLUE-base @ cc100.ko.diff.json  oid sha256:eeab167e9f566512c3065d362e720f1930bd51ca5b9c14c207a252fa9380e7fa  size 15893128
stats/compression_rate/ClueAI.PromptCLUE-base @ cc100.zh-Hans.diff.json  oid sha256:e851ffd1f4f9bf8949cb0e77cc15ea65223fe4a54ac5a13ec9e43c27a550388f  size 10563259
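These *.diff.json files are the tokenization-detail dumps written by _tokenize() in compression_util.py (its detail_path argument). In the repository they are stored as Git LFS pointers, so the real JSON must be fetched first (for example with git lfs pull, or huggingface_hub's hf_hub_download); the sketch below then simply loads one of them. The exact shape of each record is not visible in this diff, so treat the printout as exploratory.

# Hedged sketch: inspect one tokenization-detail file after the LFS object has been fetched.
import json

path = "stats/compression_rate/ClassCat.gpt2-base-french @ cc100.en.diff.json"
with open(path, "r", encoding="utf-8") as f:
    diff_details = json.load(f)  # a JSON array written with json.dumps(..., indent=2)

print(f"{len(diff_details)} records")
print(json.dumps(diff_details[:1], ensure_ascii=False, indent=2))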