Taranosaurus committed
Commit 7dae6b7 • 1 Parent(s): 47edf6c
Re-adjusting how the tokenizer and vocabulary loading works

Made it more reliable so your analysis gets loaded and processed more predictably.
app.py CHANGED

@@ -2,9 +2,10 @@ from transformers import AutoTokenizer
 import gradio as gr
 import random
 
-checkpoint = "
+checkpoint = "dslim/bert-base-NER"
 checkpoints = [
     checkpoint,
+    "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
     "microsoft/phi-2",
     "openai/whisper-large-v3",
     "NousResearch/Nous-Hermes-2-Yi-34B",
@@ -27,43 +28,53 @@ def randomize_sequence():
 
 sequence = randomize_sequence
 
+def load_vocab(target_model, current_model):
+    checkpoint = target_model
+    if target_model == current_model:
+        gr.Info(f"Tokenizer already loaded: {checkpoint}")
+    else:
+        load_tokenizer(checkpoint)
+        gr.Info(f"Tokenizer loaded: {checkpoint}")
+    vocab = dict(sorted(tokenizer.vocab.items(), key=lambda item: item[1]))
+    unk = next(iter(vocab))
+    vocab.pop(unk)
+    vocab_sorted = "\n".join(vocab)
+    vocab_size = len(vocab)
+    gr.Info(f"Tokenizer vocab size: {vocab_size}")
+    return checkpoint, vocab_size, unk, vocab_sorted
+
 def load_tokenizer(checkpoint):
     if not "tokenizer" in globals():
         global tokenizer
-
-
-        if checkpoint == tokenizer.name_or_path:
-            gr.Info(f"Tokenizer already loaded '{checkpoint}'")
-        else:
+    if len(checkpoint) > 0:
+        try:
             tokenizer = AutoTokenizer.from_pretrained(checkpoint)
-
-
-
-
-
-        gr.Info(f"Tokenizer vocab size: {vocab_size}")
-        return vocab_size, unk, vocab_sorted
-    except Exception as error:
-        gr.Warning(f"An unexpected error occurred while loading the Tokenizer.")
-        gr.Warning(f"{error}")
-        return None, None, None
+        except Exception as error:
+            gr.Warning("Unexpected error!")
+            raise gr.Error(f"{error}")
+    else:
+        return ValueError("Tokenizer cannot be empty!")
 
 def tokenize_er(checkpoint, sequence):
-    vocab_size, unk, vocab_sorted = load_tokenizer(checkpoint)
     try:
+        load_tokenizer(checkpoint)
         tokens = tokenizer.tokenize(sequence)
        ids = tokenizer.convert_tokens_to_ids(tokens)
        token_id_pair = []
        if len(tokens) == len(ids):
            for i in range(len(ids)):
                token_id_pair.append([tokens[i],ids[i]])
-        return token_id_pair
+        return token_id_pair
     except NameError:
         gr.Warning("Select Tokenizer before sequencing.")
-        return [[None, None]]
+        return [[None, None]]
+    except Exception as error:
+        gr.Warning("Unexpected error!")
+        raise gr.Error(f"{error}")
 
-def de_tokenize_er(pairs):
+def de_tokenize_er(checkpoint, pairs):
     try:
+        load_tokenizer(checkpoint)
         tokens = []
         ids = []
         for row in pairs:
@@ -79,15 +90,19 @@ def de_tokenize_er(pairs):
     except NameError:
         gr.Warning("Tokenize sequence before decoding.")
         return None, None, None
+    except Exception as error:
+        gr.Warning("Unexpected error!")
+        raise gr.Error(f"{error}")
 
 with gr.Blocks() as frontend:
     with gr.Row():
         with gr.Column(scale=3):
-            gr.Markdown("# 🪙 Tokenizaminer\n### The Tokenizer Examiner, or the Tokeniza Miner... 🕵️🕳️\nThe purpose of this tool is to examine the vocabulary and tokens of a models tokenizer and play with the results.\nNote how the Vocabulary ID lines up with the full Vocabulary index on the right ➡️\n\n⚠️ Loading the vocabulary can take a few seconds.")
+            gr.Markdown("# 🪙 Tokenizaminer\n### The Tokenizer Examiner, or the Tokeniza Miner... 🕵️🕳️\nThe purpose of this tool is to examine the vocabulary and tokens of a models tokenizer and play with the results.\nNote how the Vocabulary ID lines up with the full Vocabulary index on the right ➡️\n\n⚠️ Loading the full vocabulary can take a few seconds and the browser might stutter.")
             with gr.Row():
                 gr.Markdown("\n#### 1. Select Tokenizer\nSelect from the list or enter any model from 🤗 Hugging Face Models, it will only download the Tokenizer data! Image models won't work here.")
-            with gr.
+            with gr.Row():
                 input_checkpoint = gr.Dropdown(label="Tokenizer", choices=checkpoints, value=checkpoint, allow_custom_value=True, show_label=False, container=False)
+                #btn_load_vocab = gr.Button(value="Load Vocabulary")
             with gr.Row():
                 gr.Markdown("\n#### 2. Sequence & Tokenize")
             with gr.Row():
@@ -110,13 +125,16 @@ with gr.Blocks() as frontend:
                 output_decoded_ids = gr.TextArea(label="Decoded IDs", interactive=False)
         with gr.Column(scale=1):
             with gr.Group():
-                gr.Markdown("
+                gr.Markdown("### 🎲 Tokenizer Data")
+                output_checkpoint = gr.Textbox(visible=False)
                 output_vocab_count = gr.Number(label="Vocab Size", interactive=False)
                 output_unknown_token = gr.Textbox(label="Unknown Token", interactive=False)
                 output_vocab = gr.Code(label="Vocabulary IDs")
 
-
+    input_checkpoint.change(fn=load_vocab, inputs=[input_checkpoint, output_checkpoint], outputs=[output_checkpoint, output_vocab_count, output_unknown_token, output_vocab], queue=True)
+    btn_tokenize.click(fn=tokenize_er, inputs=[input_checkpoint, input_sequence], outputs=[token_id_pair], queue=True)
     btn_random_seq.click(fn=randomize_sequence, inputs=[], outputs=[input_sequence])
-    btn_decode.click(fn=de_tokenize_er, inputs=[token_id_pair], outputs=[output_decoded_token_ids,output_decoded_tokens, output_decoded_ids])
+    btn_decode.click(fn=de_tokenize_er, inputs=[input_checkpoint, token_id_pair], outputs=[output_decoded_token_ids,output_decoded_tokens, output_decoded_ids], queue=True)
+    frontend.load(fn=load_vocab, inputs=[input_checkpoint, output_checkpoint], outputs=[output_checkpoint, output_vocab_count, output_unknown_token, output_vocab], queue=True)
 
 frontend.launch()
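For reference, the reworked loading path can also be exercised outside the Gradio UI. The sketch below is a minimal, standalone approximation of the committed load_tokenizer/load_vocab logic: the gr.Info/gr.Warning calls and the module-level tokenizer global are stripped out, the signatures are simplified, and an empty checkpoint raises instead of returning a ValueError as the committed code does; only the dslim/bert-base-NER checkpoint and the vocabulary handling are taken from the diff.

from transformers import AutoTokenizer

# Minimal sketch of the committed loading flow (assumptions: no Gradio side
# effects, no module-level global; errors are raised rather than returned).
def load_tokenizer(checkpoint):
    if not checkpoint:
        raise ValueError("Tokenizer cannot be empty!")
    return AutoTokenizer.from_pretrained(checkpoint)

def load_vocab(target_model, current_model=None, tokenizer=None):
    # Reload only when the requested checkpoint differs from the one already loaded.
    if tokenizer is None or target_model != current_model:
        tokenizer = load_tokenizer(target_model)
    # Sort the vocabulary by token id; the entry with the lowest id is popped off
    # and reported separately, mirroring the "Unknown Token" field in the UI.
    vocab = dict(sorted(tokenizer.vocab.items(), key=lambda item: item[1]))
    unk = next(iter(vocab))
    vocab.pop(unk)
    return target_model, len(vocab), unk, "\n".join(vocab)

if __name__ == "__main__":
    checkpoint, vocab_size, unk, vocab_sorted = load_vocab("dslim/bert-base-NER")
    print(checkpoint, vocab_size, unk)

In the committed app, the hidden output_checkpoint textbox plays the role of current_model: input_checkpoint.change and frontend.load both feed it back into load_vocab, so the tokenizer and vocabulary are only reloaded when the dropdown actually switches to a new checkpoint.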