Spaces: Runtime error
sayanbanerjee32 committed
Commit cbab219 • 1 Parent(s): a0ea6aa

Upload folder using huggingface_hub
Files changed:
- app.py           +53 -0
- requirements.txt  +2 -0
- utils.py         +61 -0
app.py
ADDED
@@ -0,0 +1,53 @@
+import gradio as gr
+import os
+
+from huggingface_hub import hf_hub_download
+import joblib
+
+from utils import encode, decode
+
+
+REPO_ID = "sayanbanerjee32/bengali_tokenizer"
+data_file = "bengali_tokenizer.pkl"
+
+data_dict = joblib.load(
+    hf_hub_download(repo_id=REPO_ID, filename=data_file)
+)
+vocab = data_dict['vocab']
+merges = data_dict['merges']
+regex_pat = data_dict['regex_pat']
+
+
+
+def encode_decode(text):
+    ids = encode(text, regex_pat, merges)
+    return ' '.join([str(i) for i in ids]), decode(ids, vocab)
+
+
+with gr.Blocks() as demo:
+    gr.HTML("<h1 align='center'>Bengali BPE Tokenizer</h1>")
+    gr.HTML("<h4 align='center'>Tokenizes Bengali text using the Byte Pair Encoding algorithm</h4>")
+
+    content = gr.Textbox(label="Enter the Bengali text for tokenization")
+    inputs = [
+        content,
+    ]
+    gr.Examples(["বাইরে এতই গরম যে আমি পুরোদিন আমার শীততাপ নিয়ন্ত্রিত বাড়িতে থাকতে চাই।",
+                 "খুব ভালোভাবেই নিজের দায়িত্ব পালন করেছেন তিনি।",
+                 "আয়কর উঠে যাচ্ছে অনেকটা।",
+                 "যদি কোনো ব্যক্তি এ ব্যাপারে দোষী সাব্যস্ত হয় তা হলে ব্যবস্থা নেওয়া হবে।",
+                 "বছরের বারোটা মাস হলো জানুয়ারি, ফেব্রুয়ারি, মার্চ, এপ্রিল, মে, জুন জুলাই, আগস্ট, সেপ্টেম্বর, অক্টোবর, নভেম্বর আর ডিসেম্বর।"],
+                inputs=inputs)
+
+    generate_btn = gr.Button(value='Tokenize')
+    with gr.Row():
+        encoded = gr.Textbox(label="Tokens")
+        decoded = gr.Textbox(label="Regenerated text")
+    outputs = [encoded, decoded]
+    generate_btn.click(fn=encode_decode, inputs=inputs, outputs=outputs)
+
+## for Colab
+# demo.launch(debug=True)
+
+if __name__ == '__main__':
+    demo.launch()
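For a quick check outside the UI, the same pipeline can be driven directly. This is a minimal sketch, under the assumption that the pickle on the Hub keeps the vocab / merges / regex_pat keys used above:

    # smoke test: round-trip one of the example sentences
    import joblib
    from huggingface_hub import hf_hub_download
    from utils import encode, decode

    data_dict = joblib.load(
        hf_hub_download(repo_id="sayanbanerjee32/bengali_tokenizer",
                        filename="bengali_tokenizer.pkl")
    )
    ids = encode("আয়কর উঠে যাচ্ছে অনেকটা।",
                 data_dict['regex_pat'], data_dict['merges'])
    print(' '.join(str(i) for i in ids))    # token ids, as the app displays them
    print(decode(ids, data_dict['vocab']))  # should reproduce the input text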
requirements.txt
ADDED
@@ -0,0 +1,2 @@
+joblib
+regex
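Note that app.py also imports gradio and huggingface_hub; those come preinstalled in the Gradio Spaces runtime, which is presumably why only joblib and regex are pinned here. Running the demo outside Spaces would need all four, e.g.:

    pip install joblib regex gradio huggingface_hub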
utils.py
ADDED
@@ -0,0 +1,61 @@
+import regex as re
+
+def get_stats(ids, counts=None):
+    counts = {} if counts is None else counts
+    for pair in zip(ids, ids[1:]):
+        counts[pair] = counts.get(pair, 0) + 1
+    return counts
+
+def merge(ids, pair, idx):
+    newids = []
+    i = 0
+    while i < len(ids):
+        if i < len(ids) - 1 and ids[i] == pair[0] and ids[i+1] == pair[1]:
+            newids.append(idx)
+            i += 2
+        else:
+            newids.append(ids[i])
+            i += 1
+    return newids
+
+def _encode_chunk(text_bytes, merges):
+    # return the token ids
+    # let's begin. first, convert all bytes to integers in range 0..255
+    ids = list(text_bytes)
+    while len(ids) >= 2:
+        # find the pair with the lowest merge index
+        stats = get_stats(ids)
+        pair = min(stats, key=lambda p: merges.get(p, float("inf")))
+        # subtle: if there are no more merges available, the key will
+        # result in an inf for every single pair, and the min will be
+        # just the first pair in the list, arbitrarily
+        # we can detect this terminating case by a membership check
+        if pair not in merges:
+            break  # nothing else can be merged anymore
+        # otherwise let's merge the best pair (lowest merge index)
+        idx = merges[pair]
+        ids = merge(ids, pair, idx)
+    return ids
+
+def encode(text, regex_pat, merges):
+    # split text into chunks of text by categories defined in regex pattern
+    text_chunks = re.findall(regex_pat, text)
+    # all chunks of text are encoded separately, then results are joined
+    ids = []
+    for chunk in text_chunks:
+        chunk_bytes = chunk.encode("utf-8")  # raw bytes
+        chunk_ids = _encode_chunk(chunk_bytes, merges)
+        ids.extend(chunk_ids)
+    return ids
+
+def decode(ids, vocab):
+    # given ids (list of integers), return Python string
+    part_bytes = []
+    for idx in ids:
+        if idx in vocab:
+            part_bytes.append(vocab[idx])
+        else:
+            raise ValueError(f"invalid token id: {idx}")
+    text_bytes = b"".join(part_bytes)
+    text = text_bytes.decode("utf-8", errors="replace")
+    return text
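To make the data structures concrete, here is a toy round trip through these helpers (a sketch with a single made-up merge; the real merges and vocab are trained offline and shipped in bengali_tokenizer.pkl). merges maps a pair of token ids to the new id minted for it, and vocab maps every id back to its raw bytes:

    from utils import get_stats, merge, _encode_chunk, decode

    # hypothetical model with one learned merge: the bytes of "hi" -> id 256
    merges = {(104, 105): 256}
    vocab = {i: bytes([i]) for i in range(256)}
    vocab[256] = vocab[104] + vocab[105]             # id 256 decodes to b"hi"

    text_bytes = "hihi".encode("utf-8")              # ids [104, 105, 104, 105]
    print(get_stats(list(text_bytes)))               # {(104, 105): 2, (105, 104): 1}
    print(merge(list(text_bytes), (104, 105), 256))  # [256, 256]
    ids = _encode_chunk(text_bytes, merges)          # applies merges until none fit
    print(decode(ids, vocab))                        # "hihi" (lossless round trip)

encode itself only adds the regex pre-split on top of _encode_chunk, so each chunk is merged independently and token ids never cross chunk boundaries.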