sayanbanerjee32 committed
Commit cbab219
1 Parent(s): a0ea6aa

Upload folder using huggingface_hub

Files changed (3)
  1. app.py +53 -0
  2. requirements.txt +2 -0
  3. utils.py +61 -0
app.py ADDED
@@ -0,0 +1,53 @@
+ import gradio as gr
+ import os
+
+ from huggingface_hub import hf_hub_download
+ import joblib
+
+ from utils import encode, decode
+
+
+ REPO_ID = "sayanbanerjee32/bengali_tokenizer"
+ data_file = "bengali_tokenizer.pkl"
+
+ # download the trained tokenizer artifacts from the Hub and load them
+ data_dict = joblib.load(
+     hf_hub_download(repo_id=REPO_ID, filename=data_file)
+ )
+ vocab = data_dict['vocab']
+ merges = data_dict['merges']
+ regex_pat = data_dict['regex_pat']
+
+
+ def encode_decode(text):
+     ids = encode(text, regex_pat, merges)
+     return ' '.join([str(i) for i in ids]), decode(ids, vocab)
+
+
+ with gr.Blocks() as demo:
+     gr.HTML("<h1 align='center'>Bengali BPE Tokenizer</h1>")
+     gr.HTML("<h4 align='center'>Tokenizes Bengali text using the Byte Pair Encoding algorithm</h4>")
+
+     content = gr.Textbox(label="Enter the Bengali text for tokenization")
+     inputs = [
+         content,
+     ]
+     gr.Examples(["বাইরে এতই গরম যে আমি পুরোদিন আমার শীততাপ নিয়ন্ত্রিত বাড়িতে থাকতে চাই।",
+                  "খুব ভালোভাবেই নিজের দায়িত্ব পালন করেছেন তিনি।",
+                  "আয়কর উঠে যাচ্ছে অনেকটা।",
+                  "যদি কোনো ব্যক্তি এ ব্যাপারে দোষী সাব্যস্ত হয় তা হলে ব্যবস্থা নেওয়া হবে।",
+                  "বছরের বারোটা মাস হলো জানুয়ারি, ফেব্রুয়ারি, মার্চ, এপ্রিল, মে, জুন, জুলাই, আগস্ট, সেপ্টেম্বর, অক্টোবর, নভেম্বর আর ডিসেম্বর।"],
+                 inputs=inputs)
+
+     generate_btn = gr.Button(value='Tokenize')
+     with gr.Row():
+         encoded = gr.Textbox(label="Tokens")
+         decoded = gr.Textbox(label="Regenerated text")
+     outputs = [encoded, decoded]
+     generate_btn.click(fn=encode_decode, inputs=inputs, outputs=outputs)
+
+ # for Colab
+ # demo.launch(debug=True)
+
+ if __name__ == '__main__':
+     demo.launch()
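The app expects bengali_tokenizer.pkl to bundle three objects, inferred from the keys read above: vocab maps a token id to its bytes (decode joins these values), merges maps a byte-pair tuple to its new token id, and regex_pat is the chunking pattern. A minimal sketch of how such an artifact might be exported; the base vocabulary, empty merge table, and placeholder pattern below are assumptions, not part of this commit:

import joblib

# hypothetical export step mirroring the keys app.py reads
vocab = {i: bytes([i]) for i in range(256)}  # base byte-level vocabulary
merges = {}                                  # (id, id) -> new id, filled in by BPE training
regex_pat = r"\S+|\s+"                       # placeholder split pattern (assumption)
joblib.dump({'vocab': vocab, 'merges': merges, 'regex_pat': regex_pat},
            "bengali_tokenizer.pkl")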
requirements.txt ADDED
@@ -0,0 +1,2 @@
+ joblib
+ regex
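On a Gradio Space the SDK image is expected to provide gradio and huggingface_hub already, so requirements.txt pins only the extras. To run the demo locally, a sketch (assuming a fresh virtual environment):

pip install gradio joblib regex
python app.py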
utils.py ADDED
@@ -0,0 +1,61 @@
+ import regex as re
+
+ def get_stats(ids, counts=None):
+     # count occurrences of each consecutive pair of token ids
+     counts = {} if counts is None else counts
+     for pair in zip(ids, ids[1:]):
+         counts[pair] = counts.get(pair, 0) + 1
+     return counts
+
+ def merge(ids, pair, idx):
+     # replace every occurrence of pair in ids with the new token idx
+     newids = []
+     i = 0
+     while i < len(ids):
+         if i < len(ids) - 1 and ids[i] == pair[0] and ids[i+1] == pair[1]:
+             newids.append(idx)
+             i += 2
+         else:
+             newids.append(ids[i])
+             i += 1
+     return newids
+
+ def _encode_chunk(text_bytes, merges):
+     # return the token ids for one chunk of raw bytes
+     # first, convert all bytes to integers in range 0..255
+     ids = list(text_bytes)
+     while len(ids) >= 2:
+         # find the pair with the lowest merge index
+         stats = get_stats(ids)
+         pair = min(stats, key=lambda p: merges.get(p, float("inf")))
+         # subtle: if there are no more merges available, the key will
+         # result in an inf for every single pair, and the min will be
+         # just the first pair in the list, arbitrarily
+         # we can detect this terminating case by a membership check
+         if pair not in merges:
+             break  # nothing else can be merged anymore
+         # otherwise, merge the best pair (lowest merge index)
+         idx = merges[pair]
+         ids = merge(ids, pair, idx)
+     return ids
+
+ def encode(text, regex_pat, merges):
+     # split text into chunks by the categories defined in the regex pattern
+     text_chunks = re.findall(regex_pat, text)
+     # all chunks are encoded separately, then the results are joined
+     ids = []
+     for chunk in text_chunks:
+         chunk_bytes = chunk.encode("utf-8")  # raw bytes
+         chunk_ids = _encode_chunk(chunk_bytes, merges)
+         ids.extend(chunk_ids)
+     return ids
+
+ def decode(ids, vocab):
+     # given ids (list of integers), return a Python string
+     part_bytes = []
+     for idx in ids:
+         if idx in vocab:
+             part_bytes.append(vocab[idx])
+         else:
+             raise ValueError(f"invalid token id: {idx}")
+     text_bytes = b"".join(part_bytes)
+     text = text_bytes.decode("utf-8", errors="replace")
+     return text
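A quick round trip through these helpers with a toy one-merge table; the vocab, merges, and regex_pat below are made-up stand-ins for the pickled ones, chosen only to illustrate the mechanics:

from utils import encode, decode

# base vocabulary: one token per byte value
vocab = {i: bytes([i]) for i in range(256)}
# pretend training learned a single merge: the UTF-8 prefix pair
# (0xE0, 0xA6), shared by many Bengali characters, becomes token 256
merges = {(224, 166): 256}
vocab[256] = vocab[224] + vocab[166]

regex_pat = r"\S+|\s+"  # placeholder chunking pattern (assumption)
ids = encode("বাংলা", regex_pat, merges)   # token 256 now stands in for each (224, 166) pair
assert decode(ids, vocab) == "বাংলা"       # byte-level BPE round-trips losslessly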