import gradio as gr

from data import download_dataset, tokenize_dataset, load_tokenized_dataset
from infer import get_model_and_tokenizer, batch_embed

# TODO: add instructor models
# "hkunlp/instructor-xl",
# "hkunlp/instructor-large",
# "hkunlp/instructor-base",

# model ids and hidden sizes
models_and_hidden_sizes = [
    ("intfloat/e5-small-v2", 384),
    ("intfloat/e5-base-v2", 768),
    ("intfloat/e5-large-v2", 1024),
    ("intfloat/multilingual-e5-small", 384),
    ("intfloat/multilingual-e5-base", 768),
    ("intfloat/multilingual-e5-large", 1024),
    ("sentence-transformers/all-MiniLM-L6-v2", 384),
    ("sentence-transformers/all-MiniLM-L12-v2", 384),
    ("sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2", 384),
]

model_options = [
    f"{model_name} (hidden_size = {hidden_size})"
    for model_name, hidden_size in models_and_hidden_sizes
]

opt2desc = {
    "O2": "Most precise, slowest (O2: basic and extended general optimizations, transformers-specific fusions)",
    "O3": "Less precise, faster (O3: O2 + gelu approx)",
    "O4": "Least precise, fastest (O4: O3 + fp16/bf16)",
}

desc2opt = {v: k for k, v in opt2desc.items()}

optimization_options = list(opt2desc.values())
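
# For reference, a sketch of how these O2/O3/O4 levels are typically applied
# with optimum's ONNX Runtime integration (infer.py presumably does something
# similar; this helper is illustrative only and is never called by the app):
def _optimize_model_sketch(model_name: str, opt_level: str, save_dir: str):
    from optimum.onnxruntime import ORTModelForFeatureExtraction, ORTOptimizer
    from optimum.onnxruntime.configuration import AutoOptimizationConfig

    # export the checkpoint to ONNX, then optimize it at the chosen level
    model = ORTModelForFeatureExtraction.from_pretrained(model_name, export=True)
    optimizer = ORTOptimizer.from_pretrained(model)
    config = AutoOptimizationConfig.with_optimization_level(
        optimization_level=opt_level,
        for_gpu=True,  # this Space targets T4/A10 GPUs; O4 (fp16) needs a GPU
    )
    optimizer.optimize(save_dir=save_dir, optimization_config=config)
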
def download_and_tokenize(
    ds_name,
    ds_config,
    column_name,
    ds_split,
    model_choice,
    opt_desc,
    num2skip,
    num2embed,
    progress=gr.Progress(track_tqdm=True),
):
    num_samples = download_dataset(ds_name, ds_config, ds_split)

    opt_level = desc2opt[opt_desc]
    # model_choice looks like "<model_name> (hidden_size = <n>)"
    model_name = model_choice.split()[0]

    tokenize_dataset(
        ds_name=ds_name,
        ds_config=ds_config,
        model_name=model_name,
        opt_level=opt_level,
        column_name=column_name,
        num2skip=num2skip,
        num2embed=num2embed,
    )

    # num_samples is already a count, so don't wrap it in len()
    return f"Downloaded! It has {num_samples} docs."


def embed(
    ds_name,
    ds_config,
    column_name,
    ds_split,
    model_choice,
    opt_desc,
    new_dataset_id,
    num2skip,
    num2embed,
    progress=gr.Progress(track_tqdm=True),
):
    ds = load_tokenized_dataset(ds_name, ds_config, ds_split)

    opt_level = desc2opt[opt_desc]
    model_name = model_choice.split()[0]

    if progress is not None:
        progress(0.2, "Downloading model and tokenizer...")

    model, tokenizer = get_model_and_tokenizer(model_name, opt_level, progress)

    doc_count, seconds_taken = batch_embed(
        ds,
        model,
        tokenizer,
        model_name=model_name,
        column_name=column_name,
        new_dataset_id=new_dataset_id,
        opt_level=opt_level,
        num2skip=num2skip,
        num2embed=num2embed,
        progress=progress,
    )

    return (
        f"Embedded {doc_count} docs in {seconds_taken / 60:.2f} minutes "
        f"({doc_count / seconds_taken:.1f} docs/sec)"
    )


with gr.Blocks(title="Bulk embeddings") as demo:
    gr.Markdown(
        """
# Bulk Embeddings

This Space lets you embed a large dataset easily. For instance, it can create vectors for Wikipedia articles, taking about __ hours and costing approximately $__.

It uses state-of-the-art open-source embedding models and optimizes them for inference with Hugging Face [optimum](https://github.com/huggingface/optimum). Several optimization levels can be applied; embedding quality degrades as the optimization level increases.

Currently available options: O2/O3/O4 on T4/A10 GPUs using ONNX Runtime.

Future options:
- OpenVINO for CPU inference
- TensorRT for GPU inference
- Quantized models
- Instructor models
- Text splitting options
- More control over which rows to embed (skip some, stop early)
- Dynamic padding

## Steps

1. Upload the dataset to the Hugging Face Hub.
2. Enter the dataset details into the form below.
3. Choose a model. These are taken from the top of the [MTEB leaderboard](https://huggingface.co/spaces/mteb/leaderboard).
4. Choose an optimization level. See [here](https://huggingface.co/docs/optimum/onnxruntime/usage_guides/optimization#optimization-configuration) for details.
5. Choose a name for the new dataset.
6. Hit run!

### Note:

O4 requires the tokenized documents to be padded to max length, so O3 will be faster than O4 for short documents, while O4 will be faster for long documents.
"""
    )
    # (see the padding sketch at the bottom of this file for what "padded to
    # max length" means at the tokenizer level)

    with gr.Row():
        ds_name = gr.Textbox(
            lines=1,
            label="Dataset to load from Hugging Face Hub",
            value="wikipedia",
        )
        ds_config = gr.Textbox(
            lines=1,
            label="Dataset config (leave blank to use default)",
            value="20220301.en",
        )
        column_name = gr.Textbox(lines=1, label="Enter column to embed", value="text")
        ds_split = gr.Dropdown(
            choices=["train", "validation", "test"],
            label="Dataset split",
            value="train",
        )

    # TODO: idx column
    # TODO: text splitting options

    with gr.Row():
        model_choice = gr.Dropdown(
            choices=model_options, label="Embedding model", value=model_options[0]
        )
        opt_desc = gr.Dropdown(
            choices=optimization_options,
            label="Optimization level",
            value=optimization_options[0],
        )

    with gr.Row():
        new_dataset_id = gr.Textbox(
            lines=1,
            label="New dataset name, including username",
            value="wiki-embeds",
        )
        num2skip = gr.Slider(
            value=0,
            minimum=0,
            maximum=100_000_000,
            step=1,
            label="Number of rows to skip",
        )
        num2embed = gr.Slider(
            value=30000,
            minimum=-1,
            maximum=100_000_000,
            step=1,
            label="Number of rows to embed (-1 = all)",
        )
        # NOTE: num2upload is not yet wired into either callback below
        num2upload = gr.Slider(
            value=10000,
            minimum=1000,
            maximum=100000,
            step=1000,
            label="Chunk size for uploading",
        )

    with gr.Row():
        download_btn = gr.Button(value="Download and tokenize dataset!")
        embed_btn = gr.Button(value="Embed texts!")

    last = gr.Textbox(value="")

    download_btn.click(
        fn=download_and_tokenize,
        inputs=[
            ds_name,
            ds_config,
            column_name,
            ds_split,
            model_choice,
            opt_desc,
            num2skip,
            num2embed,
        ],
        outputs=last,
    )

    embed_btn.click(
        fn=embed,
        inputs=[
            ds_name,
            ds_config,
            column_name,
            ds_split,
            model_choice,
            opt_desc,
            new_dataset_id,
            num2skip,
            num2embed,
        ],
        outputs=last,
    )


if __name__ == "__main__":
    demo.queue(concurrency_count=20).launch(show_error=True, debug=True)
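
# A sketch of the padding behavior mentioned in the note above, assuming a
# standard transformers tokenizer (the real tokenization lives in data.py /
# infer.py; `texts` and `max_length=512` here are illustrative):
#
#     # O4 requires fixed-length batches, so pad everything to max length:
#     enc = tokenizer(texts, padding="max_length", truncation=True,
#                     max_length=512, return_tensors="np")
#
#     # O2/O3 can instead pad dynamically to the longest doc in each batch,
#     # which is why they are faster on short documents:
#     enc = tokenizer(texts, padding=True, truncation=True, max_length=512,
#                     return_tensors="np")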