File size: 9,870 Bytes
751936e
 
 
 
 
d10ecd7
6551d2c
 
a173fe5
 
 
 
 
309a593
 
 
 
7156337
79b95c3
0177868
8e0e4e9
d10ecd7
751936e
 
 
 
 
 
428b731
 
 
 
751936e
 
 
 
 
 
 
d10ecd7
 
751936e
428b731
6551d2c
0177868
428b731
d10ecd7
428b731
0177868
6551d2c
0177868
6551d2c
 
 
 
 
428b731
751936e
6551d2c
79b95c3
 
 
 
428b731
d10ecd7
 
428b731
751936e
b15345c
d2551ad
 
 
b15345c
 
d2551ad
 
b15345c
 
 
 
 
0177868
 
b15345c
d10ecd7
428b731
751936e
428b731
 
 
751936e
428b731
751936e
d10ecd7
 
 
d2551ad
 
d10ecd7
 
 
 
 
 
 
751936e
 
b15345c
428b731
 
 
751936e
d10ecd7
 
 
 
751936e
428b731
 
 
 
 
 
 
b15345c
428b731
 
 
 
 
 
 
 
b15345c
428b731
 
 
 
d10ecd7
b15345c
d10ecd7
428b731
 
 
d10ecd7
b15345c
d10ecd7
428b731
 
 
d10ecd7
 
 
 
 
428b731
e4187ae
79b95c3
309a593
428b731
 
 
 
309a593
428b731
 
 
 
 
b15345c
428b731
 
 
 
7156337
b15345c
7156337
428b731
 
 
d10ecd7
 
 
 
 
 
b15345c
d10ecd7
428b731
 
 
 
751936e
 
 
 
b15345c
0177868
751936e
 
 
 
 
b15345c
0177868
751936e
 
 
 
428b731
 
b15345c
d10ecd7
 
428b731
 
 
b15345c
d10ecd7
 
428b731
 
d10ecd7
 
 
 
 
 
428b731
 
 
6f9d07b
d10ecd7
 
 
 
 
 
 
 
 
 
 
428b731
751936e
d10ecd7
a173fe5
751936e
 
d10ecd7
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
# coding=utf-8
# author: xusong
# time: 2022/8/23 16:06

"""
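## TODO:
- Read parameters from the HTTP GET query string (high priority)
- i18n (internationalization)  https://blog.csdn.net/qq_26212731/article/details/78457198   the request headers also carry a language field
- Warm-up for iter_vocab
- Toggle for add_special_token
- Theme toggle: light/dark
- Toggle for token_id/tokens/bytes
- Add hover_text via JavaScript
- Add caching to functions to avoid repeated calls
- UTF-8 encoding for English text
- Support downloading the vocabulary
- Chinese character/word statistics: should characters such as _ and G be included?
- Why does baichuan have more than twenty thousand single-character tokens?
- OOV
- Placement of the feedback entry
- The overlap tokens for gpt4 and gpt3.5 are incorrect.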


plots

table

## related demo
- [](http://text-processing.com/demo/tokenize/)
- [gpt-tokenizer](https://gpt-tokenizer.dev/)
- [llama-tokenizer-js](https://belladoreai.github.io/llama-tokenizer-js/example-demo/build/)
- [](https://huggingface.co/spaces/Xenova/the-tokenizer-playground)

## Visualization

[ The, 2, QUICK, Brown, Foxes, jumped, over, the, lazy, dog's, bone ]
"""

import gradio as gr
from vocab import all_tokenizers
from util import *

# llama chatglm_6b gpt_nexo_20b baichuan  baichuan_7b
# Chinese-language example rows; each row is [input text, tokenizer name, tokenizer name].
# NOTE(review): examples_zh is not referenced anywhere else in the visible part of this file.
examples_zh = [
    ["空格测试:  2个空格        8个空格", "llama", "chatglm_6b"],  # chatglm has blank_n
    ["标点测试:,。!?;", "baichuan_7b", "llama"],
    ["符号测试:🦙❤❥웃유♋☮✊☏☢☚✔☑♚▢♪✈✞÷↑↓▤▥⊙■□▣▽¿─│♥❣▬▫☿Ⓐ ✋✉☣☤", "baichuan_7b", "llama"],
    ["数字测试:(10086 + 98) = 100184", "baichuan_7b", "llama"],
    ["中文简体:宽带,繁体:樂來", "baichuan_7b", "llama"],
]

# English example rows returned by example_fn; each row is written to
# [user_input, tokenizer_type_1, tokenizer_type_2] by the examples dropdown handler.
# The row order matches the dropdown labels: spaces, punctuations, symbols, digits.
examples = [
    ["spaces:  2spaces        8spaces", "llama", "chatglm_6b"],  # chatglm has blank_n
    ["punctuations: ,./?\",。!?;", "baichuan_7b", "llama"],
    ["symbols: 🦙❤❥웃유♋☮✊☏☢☚✔☑♚▢♪✈✞÷↑↓▤▥⊙■□▣▽¿─│♥❣▬▫☿Ⓐ ✋✉☣☤", "baichuan_7b", "llama"],
    ["digits: (10086 + 98) = 100184", "baichuan_7b", "llama"],
]


# jieba.enable_parallel()  # parallel mode cannot be used inside flask




def example_fn(example_idx):
    """Return the example row stored at position ``example_idx``.

    The examples dropdown is wired to this function and writes the returned
    row into the input textbox and the two tokenizer dropdowns.

    Args:
        example_idx: Index into the module-level ``examples`` list.

    Returns:
        The selected row: ``[input text, tokenizer name, tokenizer name]``.
    """
    selected_example = examples[example_idx]
    return selected_example


# Inert module-level string: it is evaluated and discarded, and repeats the
# first line of the default input defined right below.
"""Replace this text in the input field to see how tokenization works


"""

# Text shown in the input textbox when the page is first rendered.
default_user_input = """Replace this text in the input field to see how tokenization works
华为发布Mate60手机
ラグビーワールドカップ2023フランス"""

# Tokenizers preselected in the two tokenizer dropdowns.
default_tokenizer_type_1 = "llama"
default_tokenizer_type_2 = "internlm_chat_7b"

# Initial contents of the statistics boxes, computed once at import time.
(
    default_stats_vocab_size_1,
    default_stats_zh_token_size_1,
) = basic_count(default_tokenizer_type_1)
(
    default_stats_vocab_size_2,
    default_stats_zh_token_size_2,
) = basic_count(default_tokenizer_type_2)

# Only the first element of the overlap result is kept; the same value seeds
# the "Overlap Tokens" box on both sides.
default_stats_overlap_token_size = get_overlap_token_size(
    default_tokenizer_type_1,
    default_tokenizer_type_2,
)[0]

# Initial highlighted text, token table and token count for each side.
(
    default_output_text_1,
    default_output_table_1,
    default_output_len_1,
) = tokenize(
    default_user_input,
    default_tokenizer_type_1,
    update=False,
)
(
    default_output_text_2,
    default_output_table_2,
    default_output_len_2,
) = tokenize(
    default_user_input,
    default_tokenizer_type_2,
    update=False,
)

# Build the whole page: the input area, two tokenizer panels with statistics,
# the highlighted-token views, the token tables, and the event wiring.
with gr.Blocks(css="style.css") as demo:
    gr.HTML("""<h1 align="center">Tokenizer Arena ⚔️</h1>""")
    # links: https://www.coderstool.com/utf8-encoding-decoding
    # Function: take the input text and tokenize it
    # Tokenizers: there are several common tokenizers
    # Background: makes it easy to tokenize, inspect token granularity, and compare
    #
    # Byte: denotes the tokenization

    # Header row: section title plus the example picker.
    with gr.Row():
        gr.Markdown("## Input Text")
        dropdown_examples = gr.Dropdown(
            # Chinese-language labels (disabled): ["空格测试", "标点测试", "符号测试", "数字测试"],
            ["spaces", "punctuations", "symbols", "digits"],
            # NOTE(review): "Examples" is not one of the choices; it looks like a
            # placeholder label for the collapsed dropdown — confirm.
            value="Examples",
            # The handler receives the position of the selected choice, which
            # example_fn uses to index the examples list.
            type="index",
            show_label=False,
            container=False,
            scale=0,
            elem_classes="example-style"
        )

    # Free-text input shared by both tokenizers.
    user_input = gr.Textbox(
        value=default_user_input,
        label="Input Text",
        lines=5,
        show_label=False,
    )  # placeholder="Enter sentence here..."
    # gr.Examples(
    #     examples,
    #     None,
    # )

    gr.Markdown("## Tokenization")

    # Two side-by-side panels (tokenizer picker + statistics) separated by a "VS" image.
    with gr.Row():
        with gr.Column(scale=6):
            with gr.Group():
                tokenizer_type_1 = gr.Dropdown(
                    all_tokenizers,
                    value=default_tokenizer_type_1,
                    label="Tokenizer 1",
                )
                with gr.Group():
                    # The string below is an inert expression statement (an HTML
                    # snippet kept as a note); it is not rendered.
                    """
                    <div class="stat"><div class="stat-value">69</div><div class="stat-label">Characters</div></div>
                    """
                    with gr.Row():
                        stats_vocab_size_1 = gr.TextArea(
                            value=default_stats_vocab_size_1,
                            label="VocabSize",
                            lines=1,
                            elem_classes="statistics"
                        )
                        stats_zh_token_size_1 = gr.TextArea(
                            value=default_stats_zh_token_size_1,
                            label="ZH char/word",
                            lines=1,
                            elem_classes="statistics"
                        )
                        stats_overlap_token_size_1 = gr.TextArea(
                            value=default_stats_overlap_token_size,
                            label="Overlap Tokens",
                            lines=1,
                            elem_classes="statistics"
                        )
                        # stats_3 = gr.TextArea(
                        #     label="Compress Rate",
                        #     lines=1,
                        #     elem_classes="statistics"
                        # )
        # https://www.onlinewebfonts.com/icon/418591
        gr.Image("images/VS.svg", scale=1, show_label=False,
                 show_download_button=False, container=False,
                 show_share_button=False)
        with gr.Column(scale=6):
            with gr.Group():
                tokenizer_type_2 = gr.Dropdown(
                    all_tokenizers,
                    value=default_tokenizer_type_2,
                    label="Tokenizer 2",
                )
                with gr.Group():
                    with gr.Row():
                        stats_vocab_size_2 = gr.TextArea(
                            value=default_stats_vocab_size_2,
                            label="VocabSize",
                            lines=1,
                            elem_classes="statistics"
                        )
                        stats_zh_token_size_2 = gr.TextArea(
                            value=default_stats_zh_token_size_2,
                            label="ZH char/word",  # Chinese characters/words
                            lines=1,
                            elem_classes="statistics"
                        )
                        # stats_6 = gr.TextArea(
                        #     label="Compress Rate",
                        #     lines=1,
                        #     elem_classes="statistics"
                        # )
                        stats_overlap_token_size_2 = gr.TextArea(
                            value=default_stats_overlap_token_size,
                            label="Overlap Tokens",
                            lines=1,
                            elem_classes="statistics"
                        )

    # TODO: charts, tables, compression rate
    # Highlighted tokenization result for each side; the label carries the initial token count.
    with gr.Row():
        with gr.Column():
            output_text_1 = gr.Highlightedtext(
                value=default_output_text_1,
                label=f"Tokens: {default_output_len_1}",
                show_legend=True,
                elem_classes="space-show"
            )
        with gr.Column():
            output_text_2 = gr.Highlightedtext(
                value=default_output_text_2,
                label=f"Tokens: {default_output_len_2}",
                show_legend=True,
                elem_classes="space-show"
            )

    # Per-token tables for each side.
    # NOTE(review): the second header is "Byte" for table 1 but "Token" for
    # table 2 — confirm which label is intended.
    with gr.Row():
        output_table_1 = gr.Dataframe(
            value=default_output_table_1,
            headers=["TokenID", "Byte", "Text"],
            datatype=["str", "str", "str"],
            # elem_classes="space-show",   # applying this CSS class to the whole Dataframe has no effect, so cell-wrap is modified directly instead
        )
        output_table_2 = gr.Dataframe(
            value=default_output_table_2,
            headers=["TokenID", "Token", "Text"],
            datatype=["str", "str", "str"],
        )

    # Changing tokenizer 1 re-tokenizes the left side and refreshes its statistics.
    # NOTE(review): tokenize is unpacked into three values when called with
    # update=False at module level, but is bound to two outputs here —
    # presumably its default mode returns a different shape; verify in util.
    tokenizer_type_1.change(tokenize, [user_input, tokenizer_type_1],
                            [output_text_1, output_table_1])
    # The two handlers below could probably be merged
    tokenizer_type_1.change(basic_count, [tokenizer_type_1], [stats_vocab_size_1, stats_zh_token_size_1])
    # The overlap boxes on both sides are refreshed whenever either tokenizer changes.
    tokenizer_type_1.change(get_overlap_token_size, [tokenizer_type_1, tokenizer_type_2],
                            [stats_overlap_token_size_1, stats_overlap_token_size_2])

    # Editing the text re-tokenizes both sides in a single call.
    user_input.change(tokenize_pair,
                      [user_input, tokenizer_type_1, tokenizer_type_2],
                      [output_text_1, output_table_1, output_text_2, output_table_2]) # , pass_request=1

    # Changing tokenizer 2 mirrors the tokenizer 1 wiring for the right side.
    tokenizer_type_2.change(tokenize, [user_input, tokenizer_type_2],
                            [output_text_2, output_table_2])
    tokenizer_type_2.change(basic_count, [tokenizer_type_2], [stats_vocab_size_2, stats_zh_token_size_2])
    tokenizer_type_2.change(get_overlap_token_size, [tokenizer_type_1, tokenizer_type_2],
                            [stats_overlap_token_size_1, stats_overlap_token_size_2])

    # Picking an example fills the input text and both tokenizer dropdowns.
    dropdown_examples.change(
        example_fn,
        dropdown_examples,
        [user_input, tokenizer_type_1, tokenizer_type_2]
    )

    # start up 初始化
    # user_input.update(user_input.value + "___")

if __name__ == "__main__":
    # Configure the request queue with max_size=20, then launch the app on
    # whatever object queue() returned.
    queued_demo = demo.queue(max_size=20)
    queued_demo.launch()
    # Alternative without a queue: demo.launch()