Upload 12 files
- app.py +77 -0
- requirements.txt +2 -0
- saved_vocabs/batch_2_Hindi_Tokenizer-test-all_batches-100_000_batchsize-initial_vocab_size_5000.model +3 -0
- src/Basictokenizer.py +172 -0
- src/HindiTokenizer.py +473 -0
- src/HuggingFace-based-tokenizer.py +77 -0
- src/__init__.py +3 -0
- src/__pycache__/Basictokenizer.cpython-312.pyc +0 -0
- src/__pycache__/HindiTokenizer.cpython-312.pyc +0 -0
- src/__pycache__/__init__.cpython-312.pyc +0 -0
- src/__pycache__/base.cpython-312.pyc +0 -0
- src/base.py +163 -0
app.py
ADDED
@@ -0,0 +1,77 @@
import pathlib
import random

import gradio as gr
from src import HindiTokenizer, BasicTokenizer

Basic = BasicTokenizer()
Basic._build_vocab()

Hindi = HindiTokenizer()
Hindi.load(
    model_file_path=pathlib.Path(
        "saved_vocabs/batch_1_Hindi_Tokenizer-test-all_batches-100_000_batchsize-initial_vocab_size_5000.model"))


def tokenize_and_color(text, tokenizer_choice="HindiTokenizer"):
    if tokenizer_choice == "BasicTokenizer":
        tokenizer = Basic
    else:
        tokenizer = Hindi

    tokens = tokenizer.encode(text)

    # colors = [
    #     "#FF5733", "#33FF57", "#3357FF", "#F333FF",
    #     "#33FFF3", "#F3FF33", "#FF3380", "#3380FF",
    #     "#83FF33", "#FF8333"
    # ]
    colors = [
        "#FF5733", "#33FF57", "#3357FF", "#F333FF",
        "#33FFF3", "#F3FF33", "#FF3380", "#3380FF",
        "#83FF33", "#FF8333", "#7FDBFF", "#0074D9",
        "#39CCCC", "#3D9970", "#2ECC40", "#01FF70",
        "#FFDC00", "#FF851B", "#FF4136", "#85144b",
        "#F012BE", "#B10DC9", "#AAAAAA", "#DDDDDD"
    ]

    colored_text = '<div style="word-wrap: break-word; white-space: pre-wrap;">'
    token_color_mapping = {}
    last_color = ""
    for index, token in enumerate(tokens):
        token_id = token
        if token_id in token_color_mapping:
            color = token_color_mapping[token_id]
        else:
            color = random.choice([c for c in colors if c != last_color])
            last_color = color
            token_color_mapping[token_id] = color
        colored_text += f'<span id="{token_id}" style="color: {color}; margin-right: 20px;">{token}</span>'
    colored_text += '</div>'

    return colored_text


examples = [
    ["आप कैसे हैं??"],
    ["यह एक परीक्षण है।"],
    ["लोरेम इप्सम एक छद्म-लैटिन पाठ है जिसका उपयोग मुद्रण और टाइपसेटिंग उद्योगों में किया जाता है।"]
]

iface = gr.Interface(fn=tokenize_and_color,
                     title="Hindi Text Tokenizer",
                     description="Enter text to see the tokenized output with each token colored differently.",
                     inputs=[
                         gr.Textbox(lines=2, label="Input Text"),
                         # gr.Radio(choices=["BasicTokenizer", "HindiTokenizer"], label="Tokenizer Choice",
                         #          value="HindiTokenizer")
                     ],
                     outputs=[
                         gr.HTML(label="Tokenized and Colored Text")
                     ],
                     examples=examples,
                     # theme=gr.themes.Soft()
                     theme=gr.themes.Base()
                     )
if __name__ == "__main__":
    iface.launch()
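A quick, hedged sketch of exercising the handler above outside Gradio (illustrative only; the exact HTML and token ids depend on the loaded vocabulary):

# illustrative check, assuming the model file referenced above loads without error
html = tokenize_and_color("आप कैसे हैं??")
print(html[:120])  # a <div> containing one colored <span> per token id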
requirements.txt
ADDED
@@ -0,0 +1,2 @@
gradio
scrapy
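These two packages are installed when the Space builds; locally, `pip install -r requirements.txt` pulls in the same pair. `scrapy` is presumably used for collecting the training corpus rather than by the Gradio demo itself; note that `src/HindiTokenizer.py` also imports `regex`, which is not pinned here.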
saved_vocabs/batch_2_Hindi_Tokenizer-test-all_batches-100_000_batchsize-initial_vocab_size_5000.model
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:6a08206e8219876b874bdb5aedbd4080a0504e1de86b794cc4655b3d1847ee59
size 47214
src/Basictokenizer.py
ADDED
@@ -0,0 +1,172 @@
"""
Minimal (byte-level) Byte Pair Encoding tokenizer.

Algorithmically follows along the GPT tokenizer:
https://github.com/openai/gpt-2/blob/master/src/encoder.py

But:
- Does not handle the regular expression splitting pattern.
- Does not handle any special tokens.
"""
import copy

from .base import Tokenizer, get_stats, merge


# class BasicTokenizer(Tokenizer):
#
#     def __init__(self):
#         super().__init__()
#
#     def train(self, text, vocab_size, verbose=False):
#         assert vocab_size >= 256
#         num_merges = vocab_size - 256
#
#         # input text preprocessing
#         text_bytes = text.encode("utf-8")  # raw bytes
#         ids = list(text_bytes)  # list of integers in range 0..255
#
#         # iteratively merge the most common pairs to create new tokens
#         merges = {}  # (int, int) -> int
#         vocab = {idx: bytes([idx]) for idx in range(256)}  # int -> bytes
#         for i in range(num_merges):
#             # count up the number of times every consecutive pair appears
#             stats = get_stats(ids)
#             # find the pair with the highest count
#             pair = max(stats, key=stats.get)
#             # mint a new token: assign it the next available id
#             idx = 256 + i
#             # replace all occurrences of pair in ids with idx
#             ids = merge(ids, pair, idx)
#             # save the merge
#             merges[pair] = idx
#             vocab[idx] = vocab[pair[0]] + vocab[pair[1]]
#             # prints
#             if verbose:
#                 print(f"merge {i + 1}/{num_merges}: {pair} -> {idx} ({vocab[idx]}) had {stats[pair]} occurrences")
#
#         # save class variables
#         self.merges = merges  # used in encode()
#         self.vocab = vocab  # used in decode()
#
#     def decode(self, ids):
#         # given ids (list of integers), return Python string
#         text_bytes = b"".join(self.vocab[idx] for idx in ids)
#         text = text_bytes.decode("utf-8", errors="replace")
#         return text
#
#     def encode(self, text):
#         # given a string text, return the token ids
#         text_bytes = text.encode("utf-8")  # raw bytes
#         ids = list(text_bytes)  # list of integers in range 0..255
#         while len(ids) >= 2:
#             # find the pair with the lowest merge index
#             stats = get_stats(ids)
#             pair = min(stats, key=lambda p: self.merges.get(p, float("inf")))
#             # subtle: if there are no more merges available, the key will
#             # result in an inf for every single pair, and the min will be
#             # just the first pair in the list, arbitrarily
#             # we can detect this terminating case by a membership check
#             if pair not in self.merges:
#                 break  # nothing else can be merged anymore
#             # otherwise let's merge the best pair (lowest merge index)
#             idx = self.merges[pair]
#             ids = merge(ids, pair, idx)
#         return ids


class BasicTokenizer(Tokenizer):

    def __init__(self):
        super().__init__()
        self.merge_counter = 0

    def train(self, text, vocab_size, verbose=False):
        # left the assert in place to keep a hard, consistent check between the vocab size increase and the number of merges
        assert vocab_size >= 256
        num_merges = vocab_size - 256

        current_batch_merge_counter = 0  # in case not exactly `num_merges` merges happen

        # input text preprocessing
        text_bytes = text.encode("utf-8")  # encode to get all raw bytes
        ids = list(text_bytes)  # represent the bytes as ints

        # use the same merges dict if it exists
        self.merges = {} if self.merges is None else self.merges  # to hold all merges (int, int) -> int

        # Use the same vocab for this Tokenizer object if it exists
        # Tokenizer vocab: int -> bytes
        self.vocab = {idx: bytes([idx]) for idx in range(256)} if self.vocab is None else self.vocab

        # iteratively merge the MOST COMMON pair from the text
        for i in range(num_merges):
            # get count of pairs
            stats = get_stats(ids)

            # find the pair with the highest count
            # pair = max(stats, key=stats.get)

            # tmp_stats = copy.deepcopy(stats)

            # get the most occurring pair from ids
            pair = max(stats, key=stats.get)

            while pair in self.merges:
                # pair was previously merged ... use this first to update ids
                # No need to add to merges and vocab, use the previously stored token
                already_merged_idx = self.merges[pair]

                # just replace already merged pairs in ids to get new ids; no need to add to merges and vocab again
                ids = merge(ids, pair, already_merged_idx)

                stats = get_stats(ids)

                if stats and len(ids) >= 2:
                    pair = max(stats, key=stats.get)
                else:
                    # no new merges found in this incoming data batch
                    print(f"\n\nstopping merges as no new byte pair found in the current batch")
                    break

            # this most occurring pair has not been merged yet in any data batch
            # generate a new token considering how many have been generated so far for the same tokenizer
            idx = len(self.vocab) + 1

            # track newly generated tokens to add to self.merge_counter later
            current_batch_merge_counter += 1

            # replace all occurrences of `pair` above in `ids` with the NEW `idx` token, and add it to merges & vocab
            # Note: this pair has never been seen for merging
            ids = merge(ids, pair, idx)
            self.merges[pair] = idx
            self.vocab[idx] = self.vocab[pair[0]] + self.vocab[pair[1]]
            if verbose:
                print(f"merge {i + 1}/{num_merges}: {pair} -> {idx} ({self.vocab[idx]}) had {stats[pair]} count")

        self.merge_counter += current_batch_merge_counter

    def decode(self, ids):
        # given ids (list of integers), return Python string
        text_bytes = b"".join(self.vocab[idx] for idx in ids)
        text = text_bytes.decode("utf-8", errors="replace")
        return text

    def encode(self, text):
        # input a string text, returns the token ids
        text_bytes = text.encode("utf-8")
        ids = list(text_bytes)
        while len(ids) >= 2:
            # here find the pair with the lowest merge index
            stats = get_stats(ids)
            pair = min(stats, key=lambda p: self.merges.get(p, float("inf")))
            # if no merges apply, i.e. the pair is not in the merges dict,
            # the key will result in an `inf` for every single pair,
            # and the min will be just the first pair in the list;
            # we can detect this terminating case by a membership check
            if pair not in self.merges:
                break  # nothing else can be merged anymore
            # otherwise merge the best pair NOTE: (lowest merge index)
            idx = self.merges[pair]
            ids = merge(ids, pair, idx)
        return ids
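A short, hedged usage sketch for the class above (the training text and vocab size are made up; the exact merges depend on the data):

# illustrative round-trip check for BasicTokenizer
from src import BasicTokenizer

tok = BasicTokenizer()
tok.train("नमस्ते नमस्ते दुनिया", vocab_size=260)   # at most 4 new merges
ids = tok.encode("नमस्ते दुनिया")
assert tok.decode(ids) == "नमस्ते दुनिया"           # byte-level BPE round-trips exactly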
src/HindiTokenizer.py
ADDED
@@ -0,0 +1,473 @@
import os
import pathlib
import time
from textwrap import dedent

import regex as re
import unicodedata

import utilities
from src.base import Tokenizer, get_stats, merge

whitespace = ' \t\n\r\v\f'
ascii_lowercase = 'abcdefghijklmnopqrstuvwxyz'
ascii_uppercase = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
ascii_letters = ascii_lowercase + ascii_uppercase
digits = '0123456789'
hexdigits = digits + 'abcdef' + 'ABCDEF'
octdigits = '01234567'
punctuation = r"""!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~"""

ascii_printable = whitespace + ascii_letters + hexdigits + punctuation

# the main GPT text split patterns, see
# https://github.com/openai/tiktoken/blob/main/tiktoken_ext/openai_public.py
GPT2_SPLIT_PATTERN = r"""'(?:[sdmt]|ll|ve|re)| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+"""
GPT4_SPLIT_PATTERN = r"""'(?i:[sdmt]|ll|ve|re)|[^\r\n\p{L}\p{N}]?+\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]++[\r\n]*|\s*[\r\n]|\s+(?!\S)|\s+"""

"""
Basic Devanagari: \u0900 to \u097F
Vedic Extensions: \u1CD0 to \u1CFF
Extended Devanagari: \uA8E0 to \uA8FF
"""
# ignore case in compile below
SIMPLE_HINDI_PATTERN = r"""[\t\n\r\f\v]?|[^\r\n\p{Devanagari}\p{N}]?+\p{Devanagari}+|\\p{N}{1,}| ?[^\s\p{Devanagari}+\p{N}]++[\r\n]*|\s*[\r\n]*|\s+(?!\S)|\s+"""
EXTENDED_HINDI_PATTERN = r"""[\t\n\r\f\v]?|[^\r\n\p{Devanagari}\uA8E0-\uA8FF\u1CD0-\u1CFF\p{N}]?+[\p{Devanagari}\uA8E0-\uA8FF\u1CD0-\u1CFF]+|\p{N}{1,}| ?[^\s\p{Devanagari}+\p{N}\uA8E0-\uA8FF\u1CD0-\u1CFF]++[\r\n]*|\s*[\r\n]*|\s+(?!\S)|\s+"""


def replace_control_characters(s: str) -> str:
    chars = []
    for ch in s:
        if unicodedata.category(ch)[0] != "C":
            chars.append(ch)  # this character is ok
        else:
            chars.append(f"\\u{ord(ch):04x}")  # escape
    return "".join(chars)


def render_token(t: bytes) -> str:
    # pretty print a token, escaping control characters
    s = t.decode('utf-8', errors='replace')
    s = replace_control_characters(s)
    return s


class HindiTokenizer:
    def __init__(self, pattern=None, encoding="utf-8"):
        self.pattern = SIMPLE_HINDI_PATTERN if pattern is None else pattern
        self.compiled_pattern = re.compile(self.pattern, re.IGNORECASE | re.UNICODE)
        self.inverse_special_tokens = {}
        self.merges = None
        self.vocab = None
        self.encoding = encoding
        self.hindi_varnmala_and_key_units = dedent("""
            अ आ इ ई उ ऊ ए ऐ ओ औ अं अः ऋ ॠ
            ा ि ी ु ू ृॄ ॅॆ े ैॉ ॊ ो ौ
            क ख ग घ ङ क़ ख़ ग़ घ़ ङ़
            च छ ज झ ञ ज़ झ़ ञ़
            ट ठ ड ढ ण ड़ ढ़ ण़
            त थ द ध न त़ थ़ द़ ध़ ऩ
            प फ ब भ म प़ फ़ ब़ म़
            य र ल ळ व य़ ऱ ल़ ऴ व़
            श ष ॺ स ह श़ ष़ स़ ह़
            ० १ २ ३ ४ ५ ६ ७ ८ ९
            ॥
            """)
        self.special_tokens = {}
        super().__init__()

    def _build_vocab(self):
        '''add other important ASCII units except English letters'''

        print("\n====================================\n\n"
              "Building initial Hindi vocabulary with basic Hindi letters and key tokens")
        self.vocab = {}
        ascii_letters_encoded = ascii_letters.encode(
            encoding="utf-8")  # was using this to ignore ASCII English letters; revisit/todo: day-to-day Hindi usage and chats may include English letters, and what should fill those blank idxes?
        for idx in range(256):
            self.vocab[idx] = bytes([idx])

        max_idx = max(self.vocab.keys()) + 1

        basic_hindi_alphabet = self.hindi_varnmala_and_key_units.strip().split()

        for idx in range(len(basic_hindi_alphabet)):
            encoded_char = basic_hindi_alphabet[idx].encode(encoding=self.encoding)

            new_idx = idx + max_idx
            self.vocab[new_idx] = encoded_char

        for (pos0, pos1), idx in (self.merges or {}).items():
            self.vocab[idx] = self.vocab[pos0] + self.vocab[pos1]

        # NOW add special tokens defined in __init__()
        # NOTE encode special tokens using .encode with UTF-8 encoding
        for tok, idx in self.special_tokens.items():
            self.vocab[idx] = tok.encode("utf-8")

        print("\n=================\nVocab initialisation done...")
        # verified the resumed letter from the .model file b'\xe0\xa4\x85'.decode("utf-8") is indeed the character 'अ';
        # One index extra is skipped (number idx 357), so had to add +1 where needed when re-building the vocab 😅
        # not needed here though.
        return self.vocab

    # @utilities.log_to_file("HindiTokenizer-train.log")
    def train(self, text, vocab_size, verbose=False,
              default_initial_vocab_size=256 + 101,
              encoding="utf-8",
              save_tokenizer_at_train_end: bool = False,
              prefix_for_save: str = "Hindi_Tokenizer",
              just_replacing_already_seen_tokens_counter_threshold=100,
              minting_new_token_for_merge_threshold=10,
              current_batch_num=None,
              save_at_every_nth_iteration=100
              ):
        """
        text: the incoming text data as str

        vocab_size: int: the new target vocab size to build, used to determine how many merges to run

        verbose: bool: print when a new token is generated and used to merge pairs in the data's ids

        encoding: str = "utf-8": the encoding to use

        save_tokenizer_at_train_end: bool: a flag to save the growing vocab and merges dictionaries so they can later be resumed and re-used

        prefix_for_save: str: the prefix for saving tokenizer files

        just_replacing_already_seen_tokens_counter_threshold: int = 100: a threshold to check whether the replacements in the current batch are mostly for pairs created previously;
            the idea is that if a new data batch has no, or very few, pairs that can become new entries, then quickly stop and move on to a new data batch

        minting_new_token_for_merge_threshold: int = 10: another threshold checking whether the count of newly minted tokens is below or above this value, used in conjunction with the previous threshold

        current_batch_num: int or None: indicates which batch number is currently running, used for print logs and save-file names
        """
        if self.vocab is None:
            self._build_vocab()

        print("\n`Training`...for HindiTokenizer")

        assert vocab_size >= default_initial_vocab_size
        num_merges = vocab_size - default_initial_vocab_size
        stop_this_batch = False

        if current_batch_num is not None and isinstance(current_batch_num, int):
            current_batch_num = "batch_" + str(current_batch_num) + "_"
            prefix_for_save = current_batch_num + prefix_for_save

        # split the text up into text chunks
        text_chunks = re.findall(self.compiled_pattern, text)

        # input text preprocessing
        ids = [list(ch.encode("utf-8")) for ch in text_chunks if len(ch) > 1]

        # iteratively merge the MOST COMMON pair from the text
        # use the same merges dict if it exists
        self.merges = {} if self.merges is None else self.merges  # to hold all merges (int, int) -> int

        '''Some counters to help check whether the running batch's work all goes into replacing already
        created/existing tokens OR actually finds something new to mint a new token & add to merges and vocab'''
        minting_new_token_for_merge_counter = 0
        just_replacing_already_seen_tokens_counter = 0

        # run merging iteratively
        for i in range(num_merges):
            if (i + 1) % save_at_every_nth_iteration == 0:
                self.save(file_prefix=prefix_for_save + f"_at_{i}_iteration_",
                          save_to_folder=pathlib.Path("saved_vocabs"))

            merge_start_time = time.perf_counter()
            # count the number of times every consecutive pair appears
            stats = {}
            for chunk_ids in ids:
                # passing in stats will update it in place, adding up counts
                get_stats(chunk_ids, stats)

            # find the pair with the highest count
            pair = max(stats, key=stats.get)

            while pair in self.merges:
                replacing_time_start = time.perf_counter()
                just_replacing_already_seen_tokens_counter += 1

                '''A simple check that says: if pairs are already seen in this batch
                and what happens most is just replacement of already existing pairs,
                way more than generating new tokens, the best is to skip this batch...
                [use those thresholds to experiment further]'''

                if just_replacing_already_seen_tokens_counter > just_replacing_already_seen_tokens_counter_threshold \
                        and minting_new_token_for_merge_counter < minting_new_token_for_merge_threshold:
                    print("\n\n===========\nStopping current batch as replacing previously learned merges is way"
                          f" higher than creating new merges\njust_replacing_already_seen_tokens_counter:"
                          f" {just_replacing_already_seen_tokens_counter}"
                          f" and minting_new_token_for_merge_counter: {minting_new_token_for_merge_counter}")
                    stop_this_batch = True
                    break

                # pair was previously merged ... use this first to update ids
                # No need to add to merges and vocab, use the previously seen and stored token
                already_merged_idx = self.merges[pair]
                print(f"\nPair: {pair} already in merged tokens... replacing in IDS...")
                print(f"with.. id.. {already_merged_idx}")

                # just replace already merged pairs in ids to get new ids; no need to add to merges and vocab again
                ids = [merge(chunk_ids, pair, already_merged_idx) for chunk_ids in ids]

                print(
                    f"\nReplacing existing pair:{pair} in IDs took :{time.perf_counter() - replacing_time_start} seconds")

                # get updated stats now; here ids is a list of lists, so use the above way of updating stats
                stats = {}
                for chunk_ids in ids:
                    # passing in stats will update it in place
                    get_stats(chunk_ids, stats)

                # just avoiding merging when ids become less than 2
                if stats and len(ids) >= 2:
                    pair = max(stats, key=stats.get)
                else:
                    # no new merges found in this incoming data batch
                    print(f"\n\nstopping merges as no new byte pair found in the current batch")
                    stop_this_batch = True
                    break

            if stop_this_batch is True:
                break

            # mint a new token as the pair was not already in merges: assign it the next available id
            idx = len(self.vocab) + 1

            minting_new_token_for_merge_counter += 1

            # replace all occurrences of pair in ids with idx
            ids = [merge(chunk_ids, pair, idx) for chunk_ids in ids]

            # save the merge
            self.merges[pair] = idx
            self.vocab[idx] = self.vocab[pair[0]] + self.vocab[pair[1]]

            if verbose:
                print(
                    f"\n\nmerge {i + 1}/{num_merges}: {pair} -> {idx} ({self.vocab[idx]}) had"
                    f" {stats[pair]:_} occurrences."
                    f"\ntime taken: {time.perf_counter() - merge_start_time} seconds")

        if save_tokenizer_at_train_end:
            self.save(file_prefix=prefix_for_save, save_to_folder=pathlib.Path("saved_vocabs"))

    def register_special_tokens(self, special_tokens):
        # special_tokens is a dictionary of str -> int
        # example: {"<|endoftext|>": 100257}
        self.special_tokens = special_tokens
        self.inverse_special_tokens = {v: k for k, v in special_tokens.items()}

    @utilities.log_to_file("HindiTokenizer-decode.log")
    def decode(self, ids):
        print("\nDecoding...for HindiTokenizer")
        # given ids (list of integers), return Python string
        part_bytes = []
        for idx in ids:
            if idx in self.vocab:
                part_bytes.append(self.vocab[idx])
            elif idx in self.inverse_special_tokens:
                part_bytes.append(self.inverse_special_tokens[idx].encode("utf-8"))
            else:
                raise ValueError(f"invalid token id: {idx}")
        text_bytes = b"".join(part_bytes)
        text = text_bytes.decode("utf-8", errors="replace")
        return text

    def _encode_chunk(self, text_bytes):
        # return the token ids
        # let's begin. first, convert all bytes to integers in range 0..255
        ids = list(text_bytes)
        while len(ids) >= 2:
            # find the pair with the lowest merge index
            stats = get_stats(ids)
            pair = min(stats, key=lambda p: self.merges.get(p, float("inf")))
            # subtle: if there are no more merges available, the key will
            # result in an inf for every single pair, and the min will be
            # just the first pair in the list, arbitrarily
            # we can detect this terminating case by a membership check
            if pair not in self.merges:
                break  # nothing else can be merged anymore
            # otherwise let's merge the best pair (lowest merge index)
            idx = self.merges[pair]
            ids = merge(ids, pair, idx)
        return ids

    def encode_ordinary(self, text):
        """Encoding that ignores any special tokens."""
        # split text into chunks of text by categories defined in the regex pattern
        text_chunks = re.findall(self.compiled_pattern, text)
        # all chunks of text are encoded separately, then results are joined
        ids = []
        for chunk in text_chunks:
            chunk_bytes = chunk.encode("utf-8")  # raw bytes
            chunk_ids = self._encode_chunk(chunk_bytes)
            ids.extend(chunk_ids)
        return ids

    @utilities.log_to_file("HindiTokenizer-encode.log")
    def encode(self, text, allowed_special="none_raise"):
        """
        Unlike encode_ordinary, this function handles special tokens.
        allowed_special: can be "all"|"none"|"none_raise" or a custom set of special tokens
        if none_raise, then an error is raised if any special token is encountered in text
        this is the default tiktoken behavior right now as well
        any other behavior is either annoying, or a major footgun
        """
        # decode the user desire w.r.t. handling of special tokens
        special = None
        if allowed_special == "all":
            special = self.special_tokens
        elif allowed_special == "none":
            special = {}
        elif allowed_special == "none_raise":
            special = {}
            assert all(token not in text for token in self.special_tokens)
        elif isinstance(allowed_special, set):
            special = {k: v for k, v in self.special_tokens.items() if k in allowed_special}
        else:
            raise ValueError(f"allowed_special={allowed_special} not understood")
        if not special:
            # shortcut: if no special tokens, just use the ordinary encoding
            return self.encode_ordinary(text)
        # otherwise, we have to be careful with potential special tokens in text
        # we handle special tokens by splitting the text
        # based on the occurrence of any exact match with any of the special tokens
        # we can use re.split for this. note that surrounding the pattern with ()
        # makes it into a capturing group, so the special tokens will be included
        special_pattern = "(" + "|".join(re.escape(k) for k in special) + ")"
        special_chunks = re.split(special_pattern, text)
        # now all the special characters are separated from the rest of the text
        # all chunks of text are encoded separately, then results are joined
        ids = []
        for part in special_chunks:
            if part in special:
                # this is a special token, encode it separately as a special case
                ids.append(special[part])
            else:
                # this is an ordinary sequence, encode it normally
                ids.extend(self.encode_ordinary(part))
        return ids

    # directly from BPE repo
    def save(self, file_prefix, save_to_folder: pathlib.Path, version=1):
        """
        Saves two files: file_prefix.vocab and file_prefix.model
        This is inspired by (but not equivalent to!) sentencepiece's model saving:
        - model file is the critical one, intended for load()
        - vocab file is just a pretty printed version for human inspection only
        """
        print("Saving tokenizer...")
        # write the model: to be used in load() later
        assert save_to_folder is not None and isinstance(save_to_folder, pathlib.Path), \
            "the Path passed to store vocab and models seems to be wrong"

        model_file = file_prefix + ".model"
        model_file = os.path.join(os.path.abspath(save_to_folder), model_file)

        with open(model_file, 'w') as f:
            f.write(f"version:{version}\n")
            f.write(f"{self.pattern}\n")
            # write the special tokens, first the number of them, then each one
            f.write(f"{len(self.special_tokens)}\n")
            for special, idx in self.special_tokens.items():
                f.write(f"{special} {idx}\n")
            # the merges dict
            for idx1, idx2 in self.merges:
                f.write(f"{idx1} {idx2}\n")

        # write the vocab
        vocab_file = file_prefix + ".vocab"
        vocab_file = os.path.join(save_to_folder, vocab_file)
        inverted_merges = {idx: pair for pair, idx in self.merges.items()}
        with open(vocab_file, "w", encoding="utf-8") as f:
            for idx, token in self.vocab.items():
                # note: many tokens may be partial utf-8 sequences
                # and cannot be decoded into valid strings. Here we're using
                # errors='replace' to replace them with the replacement char �.
                # this also means that we couldn't possibly use .vocab in load()
                # because decoding in this way is a lossy operation!
                s = render_token(token)
                # find the children of this token, if any
                if idx in inverted_merges:
                    # if this token has children, render it nicely as a merge
                    idx0, idx1 = inverted_merges[idx]
                    s0 = render_token(self.vocab[idx0])
                    s1 = render_token(self.vocab[idx1])
                    f.write(f"[{s0}][{s1}] -> [{s}] {idx}\n")
                else:
                    # otherwise this is a leaf token, just print it
                    # (this should just be the first 256 tokens, the bytes)
                    f.write(f"[{s}] {idx}\n")

    def load(self, model_file_path):
        """Inverse of save() but only for the model file"""
        if isinstance(model_file_path, pathlib.Path):
            model_file_path = str(model_file_path.absolute())
        assert model_file_path.endswith(".model")
        # read the model file
        merges = {}
        special_tokens = {}
        # 256 for the default first 256 bytes, plus the next 101 for Hindi
        idx = 256 + 101 + 1  # One index extra is skipped initially when creating merges (number idx 357), so had to add +1 where needed when re-building the vocab 😅
        with open(model_file_path, 'r', encoding="utf-8") as f:
            # read the version
            version = f.readline().strip()
            print(version)

            # read the pattern
            self.pattern = f.readline().strip()

            # read the special tokens
            num_special = int(f.readline().strip())
            for _ in range(num_special):
                special, special_idx = f.readline().strip().split()
                special_tokens[special] = int(special_idx)
            # read the merges
            for line in f:
                idx1, idx2 = map(int, line.split())
                merges[(idx1, idx2)] = idx
                idx += 1
        self.merges = merges
        self.special_tokens = special_tokens
        self.vocab = self._build_vocab()

# if __name__ == "__main__":
#     custom_text = """
#     <|endoftext|>ूज रहा है जहाँ चकित हो जन-जन देख अकाज
#     सात वर्ष हो गये राह में, अटका कहाँ स्वराज?
#
#     अटका कहाँ स्वराज? बोल दिल्ली! तू क्या कहती है?
#     तू रानी बन गयी वेदना जनता क्यों सहती है?
#     सबके भाग्य दबा रखे हैं किसने अपने कर में?
#     उतरी थी जो विभा, हुई बंदिनी बता किस घर में
#
#     समर शेष है, यह प्रकाश बंदीगृह से छूटेगा
#     और नहीं तो तुझ पर पापिनी! महावज्र टूटेगा
#
#     समर शेष है, उस स्वराज को सत्य बनाना होगा
#     जिसका है ये न्यास उसे सत्वर पहुँचाना होगा
#     धारा के मग में अनेक जो पर्वत खडे हुए हैं
#     गंगा का पथ रोक इन्द्र के गज जो अडे हुए हैं
#
#     कह दो उनसे झुके अगर तो जग मे यश पाएंगे
#     अड़े रहे अगर तो ऐरावत पत्तों से बह जाऐंगे<|fim_prefix|><|endofprompt|>
#     """.strip()
#     special_tokens = {
#         '<|endoftext|>': 100257,
#         '<|fim_prefix|>': 100258,
#         '<|fim_middle|>': 100259,
#         '<|fim_suffix|>': 100260,
#         '<|endofprompt|>': 100276
#     }
#     text = custom_text
#     # create a Tokenizer and do 64 merges
#     tokenizer = HindiTokenizer()
#     tokenizer.train(text, 256 + 2, verbose=True)
#     tokenizer.register_special_tokens(special_tokens)
#     # verify that decode(encode(x)) == x
#     assert tokenizer.decode(tokenizer.encode(text, "all")) == text
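For reference, a hedged sketch of the load/encode/decode flow, using the model file added in this commit (the printed ids are whatever the saved merges produce):

# illustrative only; mirrors how app.py loads and queries the tokenizer
import pathlib
from src import HindiTokenizer

hindi = HindiTokenizer()
hindi.load(model_file_path=pathlib.Path(
    "saved_vocabs/batch_2_Hindi_Tokenizer-test-all_batches-100_000_batchsize-initial_vocab_size_5000.model"))
ids = hindi.encode("यह एक परीक्षण है।")
print(ids)                # learned token ids
print(hindi.decode(ids))  # round-trips back to the input text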
src/HuggingFace-based-tokenizer.py
ADDED
@@ -0,0 +1,77 @@
# source: https://huggingface.co/learn/nlp-course/en/chapter6/8?fw=pt

from tokenizers import normalizers, models, decoders, pre_tokenizers, trainers, Tokenizer, processors
from datasets import load_dataset

dataset = load_dataset("wikitext", name="wikitext-2-raw-v1", split="train")


def get_training_corpus(batch_size=1000):
    for i in range(0, len(dataset), batch_size):
        yield dataset[i: i + batch_size]["text"]


tokenizer = Tokenizer(model=models.WordPiece(unk_token="[UNK]"))

tokenizer.normalizer = normalizers.Sequence([normalizers.NFD(), normalizers.Lowercase(), normalizers.StripAccents()])

print(tokenizer.normalizer.normalize_str("Héllò hôw are ü?"))

tokenizer.pre_tokenizer = pre_tokenizers.Whitespace()  # pre_tokenizers.BertPreTokenizer()

print(tokenizer.pre_tokenizer.pre_tokenize_str("Let's test my pre-tokenizer."))
pre_tokenizer = pre_tokenizers.WhitespaceSplit()
print(pre_tokenizer.pre_tokenize_str("Let's test my pre-tokenizer."))

# manually selecting individual splitters
pre_tokenizer = pre_tokenizers.Sequence(
    [pre_tokenizers.WhitespaceSplit(), pre_tokenizers.Punctuation()]
)
print(pre_tokenizer.pre_tokenize_str("Let's test my pre-tokenizer."))

special_tokens = ["[UNK]", "[PAD]", "[CLS]", "[SEP]", "[MASK]"]
trainer = trainers.WordPieceTrainer(vocab_size=25000, special_tokens=special_tokens)

# train from an iterator
tokenizer.train_from_iterator(get_training_corpus(), trainer=trainer)
cls_token_id = tokenizer.token_to_id("[CLS]")
sep_token_id = tokenizer.token_to_id("[SEP]")

print(cls_token_id, sep_token_id)

"""
To write the template for the TemplateProcessor, we have to specify how to treat a single sentence and a pair of sentences.
For both, we write the special tokens we want to use; the first (or single) sentence is represented by $A,
while the second sentence (if encoding a pair) is represented by $B. For each of these (special tokens and sentences),
we also specify the corresponding token type ID after a colon.

The classic BERT template is thus defined as follows:
"""

tokenizer.post_processor = processors.TemplateProcessing(
    single=f"[CLS]:0 $A:0 [SEP]:0",
    pair=f"[CLS]:0 $A:0 [SEP]:0 $B:1 [SEP]:1",
    special_tokens=[("[CLS]", cls_token_id), ("[SEP]", sep_token_id)],
)

encoding = tokenizer.encode("Let's test this tokenizer.")
print(encoding.tokens)

encoding = tokenizer.encode("Let's test this tokenizer...", "on a pair of sentences.")
print(encoding.tokens)
print(encoding.type_ids)

tokenizer.decoder = decoders.WordPiece(prefix="##")

from transformers import PreTrainedTokenizerFast

wrapped_tokenizer = PreTrainedTokenizerFast(
    tokenizer_object=tokenizer,
    # tokenizer_file="tokenizer.json", # You can load from the tokenizer file, alternatively
    unk_token="[UNK]",
    pad_token="[PAD]",
    cls_token="[CLS]",
    sep_token="[SEP]",
    mask_token="[MASK]",
)
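A small follow-on sketch, not part of the uploaded script, showing one common way to persist the trained WordPiece tokenizer; the file and folder names are illustrative:

# persist the tokenizer built above
tokenizer.save("tokenizer.json")                          # raw tokenizers-library JSON
wrapped_tokenizer.save_pretrained("wordpiece-wikitext")   # transformers-compatible folder
# it can later be reloaded with
# PreTrainedTokenizerFast(tokenizer_file="tokenizer.json", unk_token="[UNK]", ...)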
src/__init__.py
ADDED
@@ -0,0 +1,3 @@
from .base import Tokenizer
from .Basictokenizer import BasicTokenizer
from .HindiTokenizer import HindiTokenizer
src/__pycache__/Basictokenizer.cpython-312.pyc
ADDED
Binary file (4.35 kB)
src/__pycache__/HindiTokenizer.cpython-312.pyc
ADDED
Binary file (20.3 kB)
src/__pycache__/__init__.cpython-312.pyc
ADDED
Binary file (280 Bytes)
src/__pycache__/base.cpython-312.pyc
ADDED
Binary file (7.25 kB)
src/base.py
ADDED
@@ -0,0 +1,163 @@
import unicodedata


def get_stats(ids, counts=None):
    """
    Given a list of ints/ids, count the pairwise occurrences
    Returns the count dict
    """
    counts = {} if counts is None else counts
    for pair in zip(ids, ids[1:]):
        counts[pair] = counts.get(pair, 0) + 1

    return counts


def merge(ids, pair_to_merge, idx_to_use):
    """
    find the given `pair_to_merge` and replace it with the given `idx_to_use` in the given list of ints/ids
    Return the updated list
    """
    new_ids = []

    i = 0

    while i < len(ids):
        # check pair match AND that the 0th position is NOT the last element
        if i < len(ids) - 1 and (pair_to_merge[0] == ids[i] and pair_to_merge[1] == ids[i + 1]):
            new_ids.append(idx_to_use)  # pair found, append to new list of ids
            i += 2  # skip by two elements as the pair was found
        else:
            # pair not found at this position, normal 1-element update
            new_ids.append(ids[i])  # append the current item from the old list as it is not a pair
            i += 1
    return new_ids


# helper functions taken directly from Karpathy's BPE repo
def replace_control_characters(s: str) -> str:
    chars = []
    for ch in s:
        if unicodedata.category(ch)[0] != "C":
            chars.append(ch)  # this character is ok
        else:
            chars.append(f"\\u{ord(ch):04x}")  # escape
    return "".join(chars)


def render_token(t: bytes) -> str:
    # pretty print a token, escaping control characters
    s = t.decode('utf-8', errors='replace')
    s = replace_control_characters(s)
    return s


# base Tokenizer class

class Tokenizer:
    """Base Tokenizer class, MUST inherit for use"""

    def __init__(self) -> None:
        # defaults -> no patterns used, no merges, use the usual first 256 bytes as mapping/vocab items
        self.merges = {}  # this will hold the actual merged data e.g. (101, 32) -> 256; here, say, 101 (chr 'e') and 32 (' ' space) had the max pair count -> replace this pair with the next ID in order
        self.pattern = ""  # any regular expression pattern, if one is to be used on raw text
        self.special_tokens = {}  # a mapping to hold any special tokens, empty here, to be used by subclasses, str -> int, e.g. {'<|endoftext|>': 90257}
        self.vocab = self._build_vocab()  # int -> bytes

    def train(self, text, vocab_size, verbose=False):
        # Tokenizer can train a vocabulary of size vocab_size from text
        raise NotImplementedError

    def encode(self, text):
        # Tokenizer can encode a string into a list of integers
        raise NotImplementedError

    def decode(self, ids):
        # Tokenizer can decode a list of integers into a string
        raise NotImplementedError

    def _build_vocab(self):
        # here vocab starts from the normal 256 byte ints, with merges after it
        vocab = {idx: bytes([idx]) for idx in range(256)}

        for (pos0, pos1), idx in self.merges.items():
            vocab[idx] = vocab[pos0] + vocab[pos1]

        # NOW add special tokens defined in __init__()
        # NOTE encode special tokens using .encode with UTF-8 encoding
        for tok, idx in self.special_tokens.items():
            vocab[idx] = tok.encode("utf-8")

        return vocab

    # directly from BPE repo
    def save(self, file_prefix):
        """
        Saves two files: file_prefix.vocab and file_prefix.model
        This is inspired by (but not equivalent to!) sentencepiece's model saving:
        - model file is the critical one, intended for load()
        - vocab file is just a pretty printed version for human inspection only
        """
        print("Saving tokenizer...")
        # write the model: to be used in load() later
        model_file = file_prefix + ".model"
        with open(model_file, 'w') as f:
            # write the version, pattern and merges, that's all that's needed
            f.write("base v1\n")
            f.write(f"{self.pattern}\n")
            # write the special tokens, first the number of them, then each one
            f.write(f"{len(self.special_tokens)}\n")
            for special, idx in self.special_tokens.items():
                f.write(f"{special} {idx}\n")
            # the merges dict
            for idx1, idx2 in self.merges:
                f.write(f"{idx1} {idx2}\n")
        # write the vocab: for the human to look at
        vocab_file = file_prefix + ".vocab"
        inverted_merges = {idx: pair for pair, idx in self.merges.items()}
        with open(vocab_file, "w", encoding="utf-8") as f:
            for idx, token in self.vocab.items():
                # note: many tokens may be partial utf-8 sequences
                # and cannot be decoded into valid strings. Here we're using
                # errors='replace' to replace them with the replacement char �.
                # this also means that we couldn't possibly use .vocab in load()
                # because decoding in this way is a lossy operation!
                s = render_token(token)
                # find the children of this token, if any
                if idx in inverted_merges:
                    # if this token has children, render it nicely as a merge
                    idx0, idx1 = inverted_merges[idx]
                    s0 = render_token(self.vocab[idx0])
                    s1 = render_token(self.vocab[idx1])
                    f.write(f"[{s0}][{s1}] -> [{s}] {idx}\n")
                else:
                    # otherwise this is a leaf token, just print it
                    # (this should just be the first 256 tokens, the bytes)
                    f.write(f"[{s}] {idx}\n")

    def load(self, model_file):
        """Inverse of save() but only for the model file"""
        assert model_file.endswith(".model")
        # read the model file
        merges = {}
        special_tokens = {}
        idx = 256
        with open(model_file, 'r', encoding="utf-8") as f:
            # read the version
            version = f.readline().strip()
            print(version)

            # read the pattern
            self.pattern = f.readline().strip()

            # read the special tokens
            num_special = int(f.readline().strip())
            for _ in range(num_special):
                special, special_idx = f.readline().strip().split()
                special_tokens[special] = int(special_idx)
            # read the merges
            for line in f:
                idx1, idx2 = map(int, line.split())
                merges[(idx1, idx2)] = idx
                idx += 1
        self.merges = merges
        self.special_tokens = special_tokens
        self.vocab = self._build_vocab()
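A tiny worked example of the `get_stats` and `merge` helpers defined at the top of base.py (the id values are chosen purely for illustration):

# get_stats counts adjacent pairs; merge replaces a chosen pair with a new id
ids = [101, 32, 101, 32, 104]
print(get_stats(ids))               # {(101, 32): 2, (32, 101): 1, (32, 104): 1}
print(merge(ids, (101, 32), 256))   # [256, 256, 104]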