sayanbanerjee32 committed
Commit cbab219
1 Parent(s): a0ea6aa

Upload folder using huggingface_hub

Files changed (3)
  1. app.py +53 -0
  2. requirements.txt +2 -0
  3. utils.py +61 -0
app.py ADDED
@@ -0,0 +1,53 @@
+ import gradio as gr
+ import os
+
+ from huggingface_hub import hf_hub_download
+ import joblib
+
+ from utils import encode, decode
+
+
+ REPO_ID = "sayanbanerjee32/bengali_tokenizer"
+ data_file = "bengali_tokenizer.pkl"
+
+ # download the trained tokenizer artifacts from the Hub and load them
+ data_dict = joblib.load(
+     hf_hub_download(repo_id=REPO_ID, filename=data_file)
+ )
+ vocab = data_dict['vocab']
+ merges = data_dict['merges']
+ regex_pat = data_dict['regex_pat']
+
+
+ def encode_decode(text):
+     ids = encode(text, regex_pat, merges)
+     return ' '.join([str(i) for i in ids]), decode(ids, vocab)
+
+
+ with gr.Blocks() as demo:
+     gr.HTML("<h1 align='center'>Bengali BPE Tokenizer</h1>")
+     gr.HTML("<h4 align='center'>Tokenizes Bengali text using the Byte Pair Encoding algorithm</h4>")
+
+     content = gr.Textbox(label="Enter the Bengali text for tokenization")
+     inputs = [
+         content,
+     ]
+     gr.Examples(["বাইরে এতই গরম যে আমি পুরোদিন আমার শীততাপ নিয়ন্ত্রিত বাড়িতে থাকতে চাই।",
+                  "খুব ভালোভাবেই নিজের দায়িত্ব পালন করেছেন তিনি।",
+                  "আয়কর উঠে যাচ্ছে অনেকটা।",
+                  "যদি কোনো ব্যক্তি এ ব্যাপারে দোষী সাব্যস্ত হয় তা হলে ব্যবস্থা নেওয়া হবে।",
+                  "বছরের বারোটা মাস হলো জানুয়ারি, ফেব্রুয়ারি, মার্চ, এপ্রিল, মে, জুন, জুলাই, আগস্ট, সেপ্টেম্বর, অক্টোবর, নভেম্বর আর ডিসেম্বর।"],
+                 inputs=inputs)
+
+     generate_btn = gr.Button(value='Tokenize')
+     with gr.Row():
+         encoded = gr.Textbox(label="Tokens")
+         decoded = gr.Textbox(label="Regenerated text")
+     outputs = [encoded, decoded]
+     generate_btn.click(fn=encode_decode, inputs=inputs, outputs=outputs)
+
+ # for Colab
+ # demo.launch(debug=True)
+
+ if __name__ == '__main__':
+     demo.launch()
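The app expects bengali_tokenizer.pkl to bundle three objects, inferred from the keys read above: vocab maps a token id to its bytes (decode joins these values), merges maps a byte-pair tuple to its new token id, and regex_pat is the chunking pattern. A minimal sketch of how such an artifact might be exported; the base vocabulary, empty merge table, and placeholder pattern below are assumptions, not part of this commit:

import joblib

# hypothetical export step mirroring the keys app.py reads
vocab = {i: bytes([i]) for i in range(256)}  # base byte-level vocabulary
merges = {}                                  # (id, id) -> new id, filled in by BPE training
regex_pat = r"\S+|\s+"                       # placeholder split pattern (assumption)
joblib.dump({'vocab': vocab, 'merges': merges, 'regex_pat': regex_pat},
            "bengali_tokenizer.pkl")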
requirements.txt ADDED
@@ -0,0 +1,2 @@
+ joblib
+ regex
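On a Gradio Space the SDK image is expected to provide gradio and huggingface_hub already, so requirements.txt pins only the extras. To run the demo locally, a sketch (assuming a fresh virtual environment):

pip install gradio joblib regex
python app.py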
utils.py ADDED
@@ -0,0 +1,61 @@
+ import regex as re
+
+ def get_stats(ids, counts=None):
+     # count occurrences of each consecutive pair of token ids
+     counts = {} if counts is None else counts
+     for pair in zip(ids, ids[1:]):
+         counts[pair] = counts.get(pair, 0) + 1
+     return counts
+
+ def merge(ids, pair, idx):
+     # replace every occurrence of pair in ids with the new token idx
+     newids = []
+     i = 0
+     while i < len(ids):
+         if i < len(ids) - 1 and ids[i] == pair[0] and ids[i+1] == pair[1]:
+             newids.append(idx)
+             i += 2
+         else:
+             newids.append(ids[i])
+             i += 1
+     return newids
+
+ def _encode_chunk(text_bytes, merges):
+     # return the token ids for one chunk of raw bytes
+     # first, convert all bytes to integers in range 0..255
+     ids = list(text_bytes)
+     while len(ids) >= 2:
+         # find the pair with the lowest merge index
+         stats = get_stats(ids)
+         pair = min(stats, key=lambda p: merges.get(p, float("inf")))
+         # subtle: if there are no more merges available, the key will
+         # result in an inf for every single pair, and the min will be
+         # just the first pair in the list, arbitrarily
+         # we can detect this terminating case by a membership check
+         if pair not in merges:
+             break  # nothing else can be merged anymore
+         # otherwise, merge the best pair (lowest merge index)
+         idx = merges[pair]
+         ids = merge(ids, pair, idx)
+     return ids
+
+ def encode(text, regex_pat, merges):
+     # split text into chunks by the categories defined in the regex pattern
+     text_chunks = re.findall(regex_pat, text)
+     # all chunks are encoded separately, then the results are joined
+     ids = []
+     for chunk in text_chunks:
+         chunk_bytes = chunk.encode("utf-8")  # raw bytes
+         chunk_ids = _encode_chunk(chunk_bytes, merges)
+         ids.extend(chunk_ids)
+     return ids
+
+ def decode(ids, vocab):
+     # given ids (list of integers), return a Python string
+     part_bytes = []
+     for idx in ids:
+         if idx in vocab:
+             part_bytes.append(vocab[idx])
+         else:
+             raise ValueError(f"invalid token id: {idx}")
+     text_bytes = b"".join(part_bytes)
+     text = text_bytes.decode("utf-8", errors="replace")
+     return text
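A quick round trip through these helpers with a toy one-merge table; the vocab, merges, and regex_pat below are made-up stand-ins for the pickled ones, chosen only to illustrate the mechanics:

from utils import encode, decode

# base vocabulary: one token per byte value
vocab = {i: bytes([i]) for i in range(256)}
# pretend training learned a single merge: the UTF-8 prefix pair
# (0xE0, 0xA6), shared by many Bengali characters, becomes token 256
merges = {(224, 166): 256}
vocab[256] = vocab[224] + vocab[166]

regex_pat = r"\S+|\s+"  # placeholder chunking pattern (assumption)
ids = encode("বাংলা", regex_pat, merges)   # token 256 now stands in for each (224, 166) pair
assert decode(ids, vocab) == "বাংলা"       # byte-level BPE round-trips losslessly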