import pathlib import random import gradio as gr from src import HindiTokenizer, BasicTokenizer Basic = BasicTokenizer() Basic._build_vocab() Hindi = HindiTokenizer() Hindi.load( model_file_path=pathlib.Path( "saved_vocabs/batch_1_Hindi_Tokenizer-test-all_batches-100_000_batchsize-initial_vocab_size_5000.model")) def tokenize_and_color(text, tokenizer_choice="HindiTokenizer"): if tokenizer_choice == "BasicTokenizer": tokenizer = Basic else: tokenizer = Hindi tokens = tokenizer.encode(text) # colors = [ # "#FF5733", "#33FF57", "#3357FF", "#F333FF", # "#33FFF3", "#F3FF33", "#FF3380", "#3380FF", # "#83FF33", "#FF8333" # ] # colors = [ # "#FF5733", "#33FF57", "#3357FF", "#F333FF", # "#33FFF3", "#FF3380", "#3380FF", # "#83FF33", "#FF8333", "#7FDBFF", "#0074D9", # "#39CCCC", "#3D9970", "#2ECC40", "#01FF70", # "#FFDC00", "#FF851B", "#FF4136", "#85144b", # "#F012BE", "#B10DC9", "#AAAAAA", "#DDDDDD" # ] colors = [ "#33FF57", "#3357FF", "#F333FF", "#33FFF3", "#FF3380", "#3380FF", "#FF8333", "#7FDBFF", "#0074D9", "#39CCCC", "#3D9970", "#2ECC40", "#FFDC00", "#FF851B", "#0000FF", "#85144b", "#F012BE", "#B10DC9", "#AAAAAA", "#DDDDDD", "#800000", "#808000", "#008080", "#800080", "#FFC0CB", "#000080", "#FFD700", "#FF69B4", "#8A2BE2", "#5F9EA0" ] colored_text = '
' token_color_mapping = {} last_color = "" for index, token in enumerate(tokens): token_id = token if token_id in token_color_mapping: color = token_color_mapping[token_id] else: color = random.choice([c for c in colors if c != last_color]) last_color = color token_color_mapping[token_id] = color colored_text += f'{token}' colored_text += '
' return colored_text examples = [ ["आप कैसे हैं??"], ["यह एक परीक्षण है।"], ["लोरेम इप्सम एक छद्म-लैटिन पाठ है जिसका उपयोग मुद्रण और टाइपसेटिंग उद्योगों में किया जाता है।"], ["This is just English text for testing purposes."] ] iface = gr.Interface(fn=tokenize_and_color, title="Hindi Text Tokenizer", description="Enter text to see the tokenized output with each token colored differently.", inputs=[ gr.Textbox(lines=2, label="Input Text"), gr.Radio(choices=["BasicTokenizer", "HindiTokenizer"], label="Tokenizer Choice", value="HindiTokenizer") ], outputs=[ gr.HTML(label="Tokenized and Colored Text") ], examples=examples, # theme=gr.themes.Soft() theme=gr.themes.Base() ) if __name__ == "__main__": iface.launch()