Spaces:

SwastikM
/

Embedding-Quantization

Runtime error

App Files Files Community

SwastikM committed on May 31

Commit

ef56763

•

1 Parent(s): 6376a51

Upload 19 files

Browse files

Files changed (20) hide show

.gitattributes +2 -0
Makefile +3 -0
app.py +94 -0
binary_index.py +15 -0
conala.index +3 -0
conala/cache-98d56627cf79bda0.arrow +3 -0
conala/data-00000-of-00001.arrow +3 -0
conala/dataset_info.json +58 -0
conala/state.json +20 -0
conala_int8_usearch.index +3 -0
create_mbedding-vecotor.ipynb +189 -0
query_search.ipynb +303 -0
requirements.txt +6 -0
retrieve_dataset.ipynb +125 -0
save_int8_index.py +16 -0
vectorized_dataset/data-00000-of-00003.arrow +3 -0
vectorized_dataset/data-00001-of-00003.arrow +3 -0
vectorized_dataset/data-00002-of-00003.arrow +3 -0
vectorized_dataset/dataset_info.json +65 -0
vectorized_dataset/state.json +27 -0

.gitattributes CHANGED Viewed

@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text

 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+conala_int8_usearch.index filter=lfs diff=lfs merge=lfs -text
+conala.index filter=lfs diff=lfs merge=lfs -text

Makefile ADDED Viewed

	@@ -0,0 +1,3 @@

+install:
+		pip install --upgrade pip &&\
+			pip install -r requirements.txt

app.py ADDED Viewed

	@@ -0,0 +1,94 @@

+import gradio as gr
+from datasets import load_from_disk
+import pandas as pd
+from sentence_transformers import SentenceTransformer
+from sentence_transformers.quantization import quantize_embeddings
+import faiss
+from usearch.index import Index
+import numpy as np
+import os
+# Resolve every artifact relative to the current working directory.
+base_path = os.getcwd()
+full_path = os.path.join(base_path, 'conala')
+# Dataset loaded from the local 'conala' directory; search() indexes it by row id.
+conala_dataset = load_from_disk(full_path)
+# int8 index restored with view=True; search() only looks vectors up by key in it.
+int8_view = Index.restore(os.path.join(base_path, 'conala_int8_usearch.index'), view=True)
+# Binary index that search() queries for candidate ids.
+binary_index: faiss.IndexBinaryFlat = faiss.read_index_binary(os.path.join(base_path, 'conala.index'))
+# Sentence embedding model used by search() to encode incoming queries.
+model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
+def search(query, top_k: int = 20):
+    # 1. Embed the query as float32
+    query_embedding = model.encode(query)
+    # 2. Quantize the query to ubinary. To perform actual search with faiss
+    query_embedding_ubinary = quantize_embeddings(query_embedding.reshape(1, -1), "ubinary")
+    # 3. Search the binary index
+    index =  binary_index
+    _scores, binary_ids = index.search(query_embedding_ubinary, top_k)
+    binary_ids = binary_ids[0]
+    # 4. Load the corresponding int8 embeddings. To perform rescoring to calculate score of fetched documents.
+    int8_embeddings = int8_view[binary_ids].astype(int)
+    # 5. Rescore the top_k * rescore_multiplier using the float32 query embedding and the int8 document embeddings
+    scores = query_embedding @ int8_embeddings.T
+    # 6. Sort the scores and return the top_k
+    indices = scores.argsort()[::-1][:top_k]
+    top_k_indices = binary_ids[indices]
+    top_k_scores = scores[indices]
+    top_k_codes = conala_dataset[top_k_indices]
+    return top_k_codes
+def response_generator(user_prompt):
+    top_k_outputs = search(user_prompt)
+    probs = top_k_outputs['prob']
+    snippets = top_k_outputs['snippet']
+    idx = np.argsort(probs)[::-1]
+    results = np.array(snippets)[idx]
+    filtered_results = []
+    for item in results:
+        if len(filtered_results)<3:
+            if item not in filtered_results:
+                filtered_results.append(item)
+    output_template = "User Query: {user_query}\nBelow are some examples of previous conversations.\nQuery: {query1} Solution: {solution1}\nQuery: {query2} Solution: {solution2}\nYou may use the above examples for reference only. Create your own solution and provide only the solution"
+    output_template = "The top three most relevant code snippets from the database are:\n\n1. {snippet1}\n\n2. {snippet2}\n\n3. {snippet3}"
+    output = f'{output_template.format(snippet1=filtered_results[0],snippet2=filtered_results[1],snippet3=filtered_results[2])}'
+    return {output_box:output}
+# Assemble the UI: a Markdown header, a query box, an answer box and the
+# Interface that wires them to response_generator.
+with gr.Blocks() as demo:
+    gr.Markdown(
+    """
+    # Embedding Quantization
+    ## Quantized Semantic Search
+    - ***Embedding:*** all-MiniLM-L6-v2
+    - ***Vetor DB:*** faiss, USearch
+    - ***Vector_DB Size:*** `5,93,891`
+    """)
+    # NOTE(review): state_var is not referenced again in the code shown here.
+    state_var = gr.State([])
+    # Query box, pre-filled with an example question.
+    input_box = gr.Textbox(autoscroll=True,visible=True,label='User',info="Enter a query.",value="How to extract the n-th elements from a list of tuples in python?")
+    # Answer box; response_generator returns its text keyed by this component.
+    output_box = gr.Textbox(autoscroll=True,max_lines=30,value="Output",label='Assistant')
+    # NOTE(review): delete_cache=(20,10) is presumably (frequency, age) in
+    # seconds; confirm against the Gradio documentation.
+    gr.Interface(fn=response_generator, inputs=[input_box], outputs=[output_box],
+                 delete_cache=(20,10),
+                 allow_flagging='never')
+# Enable request queuing, then start the app.
+demo.queue()
+demo.launch()

binary_index.py ADDED Viewed

	@@ -0,0 +1,15 @@

+from datasets import load_from_disk
+import numpy as np
+from faiss import IndexBinaryFlat, write_index_binary
+from sentence_transformers.quantization import quantize_embeddings
+import os
+# Build the binary index file 'conala.index' from the precomputed embeddings.
+path_to_vectorised_dataset = os.path.join(os.getcwd(),'vectorized_dataset')
+dataset = load_from_disk(path_to_vectorised_dataset)
+# Materialise the 'embedding' column as one float32 array.
+embeddings = np.array(dataset["embedding"], dtype=np.float32)
+# Quantize the float32 embeddings to the "ubinary" precision.
+ubinary_embeddings = quantize_embeddings(embeddings, "ubinary")
+# NOTE(review): 384 is the embedding dimension; faiss binary indexes presumably
+# take this size in bits, which would match the packed ubinary vectors. Confirm.
+index = IndexBinaryFlat(384)    ## embedding dimension
+index.add(ubinary_embeddings)
+# Written to the current working directory.
+write_index_binary(index, "conala.index")

conala.index ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:9168e2e320dd7d85ff9b01636a43dbfce18b1922cda518864c923366e88f0b8c
+size 28506801

conala/cache-98d56627cf79bda0.arrow ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:0aba3bba56615d8284d26413efc5bb8a314814d8c2009d0a61a30cb718d6e576
+size 1019270752

conala/data-00000-of-00001.arrow ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:84d9591a024c23ec27e4162ae208c0b7fe657a8923760a95aa353b1f478c0b4f
+size 104616624

conala/dataset_info.json ADDED Viewed

	@@ -0,0 +1,58 @@

+{
+  "builder_name": "parquet",
+  "citation": "@inproceedings{yin2018learning,\n  title={Learning to mine aligned code and natural language pairs from stack overflow},\n  author={Yin, Pengcheng and Deng, Bowen and Chen, Edgar and Vasilescu, Bogdan and Neubig, Graham},\n  booktitle={2018 IEEE/ACM 15th international conference on mining software repositories (MSR)},\n  pages={476--486},\n  year={2018},\n  organization={IEEE}\n}\n",
+  "config_name": "mined",
+  "dataset_name": "conala",
+  "dataset_size": 104561297,
+  "description": "CoNaLa is a dataset of code and natural language pairs crawled from Stack Overflow, for more details please refer to this paper: https://arxiv.org/pdf/1805.08949.pdf or the dataset page https://conala-corpus.github.io/.\n",
+  "download_checksums": {
+    "hf://datasets/neulab/conala@798cef31a9b480d9c31aed21e745c9e485ed2647/mined/train/0000.parquet": {
+      "num_bytes": 74356953,
+      "checksum": null
+    }
+  },
+  "download_size": 74356953,
+  "features": {
+    "question_id": {
+      "dtype": "int64",
+      "_type": "Value"
+    },
+    "parent_answer_post_id": {
+      "dtype": "int64",
+      "_type": "Value"
+    },
+    "prob": {
+      "dtype": "float64",
+      "_type": "Value"
+    },
+    "snippet": {
+      "dtype": "string",
+      "_type": "Value"
+    },
+    "intent": {
+      "dtype": "string",
+      "_type": "Value"
+    },
+    "id": {
+      "dtype": "string",
+      "_type": "Value"
+    }
+  },
+  "homepage": "https://conala-corpus.github.io/",
+  "license": "",
+  "size_in_bytes": 178918250,
+  "splits": {
+    "train": {
+      "name": "train",
+      "num_bytes": 104561297,
+      "num_examples": 593891,
+      "dataset_name": "conala"
+    }
+  },
+  "version": {
+    "version_str": "1.1.0",
+    "major": 1,
+    "minor": 1,
+    "patch": 0
+  }
+}

conala/state.json ADDED Viewed

	@@ -0,0 +1,20 @@

+{
+  "_data_files": [
+    {
+      "filename": "data-00000-of-00001.arrow"
+    }
+  ],
+  "_fingerprint": "cd29c6cc7b846bee",
+  "_format_columns": [
+    "question_id",
+    "parent_answer_post_id",
+    "prob",
+    "snippet",
+    "intent",
+    "id"
+  ],
+  "_format_kwargs": {},
+  "_format_type": null,
+  "_output_all_columns": false,
+  "_split": null
+}

conala_int8_usearch.index ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:32e4de66d49ef092bf814e753769b84aae93bb6068b24c5d0826a9e44ad32917
+size 316247436

create_mbedding-vecotor.ipynb ADDED Viewed

	@@ -0,0 +1,189 @@

+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Import Libraries"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/home/swastikm/work/senetence-transformer-in-action-main/semantic_search_env/lib/python3.10/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
+      "  from .autonotebook import tqdm as notebook_tqdm\n"
+     ]
+    }
+   ],
+   "source": [
+    "from sentence_transformers import SentenceTransformer"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Load Dataset"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from datasets import load_from_disk\n",
+    "import os"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "base_path = os.getcwd()\n",
+    "dataset_folder = \"conala\"\n",
+    "path_to_dataset = os.path.join(base_path,dataset_folder)\n",
+    "dataset = load_from_disk(path_to_dataset)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Create vector embedding"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/home/swastikm/work/senetence-transformer-in-action-main/semantic_search_env/lib/python3.10/site-packages/huggingface_hub/file_download.py:1132: FutureWarning: `resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. If you want to force a new download, use `force_download=True`.\n",
+      "  warnings.warn(\n"
+     ]
+    }
+   ],
+   "source": [
+    "model = SentenceTransformer(\"sentence-transformers/all-MiniLM-L6-v2\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def get_embeddings(examples):\n",
+    "    vectors = {}\n",
+    "    model_input = examples['intent']\n",
+    "    out =  model.encode(model_input)\n",
+    "    vectors['embedding'] = out\n",
+    "    return vectors"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Map: 100%|██████████| 593891/593891 [20:14<00:00, 488.98 examples/s]\n"
+     ]
+    }
+   ],
+   "source": [
+    "vectorized_dataset = dataset.map(get_embeddings,batched=True)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "Dataset({\n",
+       "    features: ['question_id', 'parent_answer_post_id', 'prob', 'snippet', 'intent', 'id', 'embedding'],\n",
+       "    num_rows: 593891\n",
+       "})"
+      ]
+     },
+     "execution_count": 7,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "vectorized_dataset"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Saving the dataset (3/3 shards): 100%|██████████| 593891/593891 [00:08<00:00, 69697.72 examples/s] \n"
+     ]
+    }
+   ],
+   "source": [
+    "vectorized_dataset.save_to_disk('vectorized_dataset')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "semactic_env",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.12"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}

query_search.ipynb ADDED Viewed

	@@ -0,0 +1,303 @@

+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Import Libraries"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/home/swastikm/work/senetence-transformer-in-action-main/semantic_search_env/lib/python3.10/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
+      "  from .autonotebook import tqdm as notebook_tqdm\n"
+     ]
+    }
+   ],
+   "source": [
+    "import time\n",
+    "from datasets import load_from_disk\n",
+    "import pandas as pd\n",
+    "from sentence_transformers import SentenceTransformer\n",
+    "from sentence_transformers.quantization import quantize_embeddings\n",
+    "import faiss\n",
+    "from usearch.index import Index\n",
+    "import numpy as np\n",
+    "import os"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Load Dataset"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "base_path = os.getcwd()\n",
+    "full_path = os.path.join(base_path, 'conala')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "conala_dataset = load_from_disk(full_path)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "```Text\n",
+    "Load the int8 and binary indices. Int8 is loaded as a view to save memory, as we never actually perform search with it.\n",
+    "Int8 embedding is required to perform rescoring of fetched document. Rescoring is done by performing inner product with F32 embedding of Query\n",
+    "```\n",
+    "[Efficient Passage Retrieval with Hashing for Open-domain Question Answering](https://arxiv.org/abs/2106.00882)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Load the int8 and binary indices. Int8 is loaded as a view to save memory, as we never actually perform search with it.\n",
+    "int8_view = Index.restore(os.path.join(base_path, 'conala_int8_usearch.index'), view=True)\n",
+    "binary_index: faiss.IndexBinaryFlat = faiss.read_index_binary(os.path.join(base_path, 'conala.index'))"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Import Model to generate embedding"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/home/swastikm/work/senetence-transformer-in-action-main/semantic_search_env/lib/python3.10/site-packages/huggingface_hub/file_download.py:1132: FutureWarning: `resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. If you want to force a new download, use `force_download=True`.\n",
+      "  warnings.warn(\n"
+     ]
+    }
+   ],
+   "source": [
+    "model = SentenceTransformer(\"sentence-transformers/all-MiniLM-L6-v2\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def search(query, top_k: int = 20):\n",
+    "    # 1. Embed the query as float32\n",
+    "    query_embedding = model.encode(query)\n",
+    "\n",
+    "    # 2. Quantize the query to ubinary. To perform actual search with faiss\n",
+    "    query_embedding_ubinary = quantize_embeddings(query_embedding.reshape(1, -1), \"ubinary\")\n",
+    "\n",
+    "\n",
+    "    # 3. Search the binary index \n",
+    "    index =  binary_index\n",
+    "    _scores, binary_ids = index.search(query_embedding_ubinary, top_k)\n",
+    "    binary_ids = binary_ids[0]\n",
+    "\n",
+    "\n",
+    "    # 4. Load the corresponding int8 embeddings. To perform rescoring to calculate score of fetched documents.\n",
+    "    int8_embeddings = int8_view[binary_ids].astype(int)\n",
+    "\n",
+    "    # 5. Rescore the top_k * rescore_multiplier using the float32 query embedding and the int8 document embeddings\n",
+    "    scores = query_embedding @ int8_embeddings.T\n",
+    "\n",
+    "    # 6. Sort the scores and return the top_k\n",
+    "    start_time = time.time()\n",
+    "    indices = scores.argsort()[::-1][:top_k]\n",
+    "    top_k_indices = binary_ids[indices]\n",
+    "    top_k_scores = scores[indices]\n",
+    "\n",
+    "    top_k_codes = conala_dataset[top_k_indices]\n",
+    "\n",
+    "    return top_k_codes\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "user_prompt = input('Enter python coding query')\n",
+    "top_k_outputs = search(user_prompt)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "probs = top_k_outputs['prob']\n",
+    "snippets = top_k_outputs['snippet']"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "idx = np.argsort(probs)[::-1]\n",
+    "results = np.array(snippets)[idx]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "['[x[1] for x in elements]',\n",
+       " 'map(itemgetter(1), elements)',\n",
+       " 'zip(*elements)[1]']"
+      ]
+     },
+     "execution_count": 10,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "filtered_results = []\n",
+    "for item in results:\n",
+    "    if len(filtered_results)<3:\n",
+    "        if item not in filtered_results:\n",
+    "            filtered_results.append(item)\n",
+    "filtered_results"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "output_template = \"User Query: {user_query}\\nBelow are some examples of previous conversations.\\nQuery: {query1} Solution: {solution1}\\nQuery: {query2} Solution: {solution2}\\nYou may use the above examples for reference only. Create your own solution and provide only the solution\""
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 12,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "output_template = \"The top three most relevant code snippets from the database are:\\n\\n1. {snippet1}\\n\\n2. {snippet2}\\n\\n3. {snippet3}\""
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 13,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "output = f'{output_template.format(snippet1=filtered_results[0],snippet2=filtered_results[1],snippet3=filtered_results[2])}'"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 14,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "The top three most relevant code snippets from the database are:\n",
+      "\n",
+      "1. [x[1] for x in elements]\n",
+      "\n",
+      "2. map(itemgetter(1), elements)\n",
+      "\n",
+      "3. zip(*elements)[1]\n"
+     ]
+    }
+   ],
+   "source": [
+    "print(output)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.12"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}

requirements.txt ADDED Viewed

	@@ -0,0 +1,6 @@

+einops==0.8.0
+datasets==2.19.0
+faiss-cpu==1.8.0
+sentence-transformers==2.7.0
+usearch==2.12.0
+gradio==4.28.3

retrieve_dataset.ipynb ADDED Viewed

	@@ -0,0 +1,125 @@

+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/home/swastikm/work/senetence-transformer-in-action-main/semantic_search_env/lib/python3.10/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
+      "  from .autonotebook import tqdm as notebook_tqdm\n"
+     ]
+    }
+   ],
+   "source": [
+    "import datasets\n",
+    "from datasets import load_dataset"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "raw_datasets = load_dataset(\"neulab/conala\", \"mined\", trust_remote_code=True)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "DatasetDict({\n",
+       "    train: Dataset({\n",
+       "        features: ['question_id', 'parent_answer_post_id', 'prob', 'snippet', 'intent', 'id'],\n",
+       "        num_rows: 593891\n",
+       "    })\n",
+       "})"
+      ]
+     },
+     "execution_count": 3,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "raw_datasets"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "raw_datasets = datasets.concatenate_datasets([raw_datasets[\"train\"]])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Saving the dataset (1/1 shards): 100%|██████████| 593891/593891 [00:00<00:00, 1373525.97 examples/s]\n"
+     ]
+    }
+   ],
+   "source": [
+    "raw_datasets.save_to_disk(\"conala\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "semantic_search_env",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.12"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}

save_int8_index.py ADDED Viewed

	@@ -0,0 +1,16 @@

+from datasets import load_from_disk
+import numpy as np
+from usearch.index import Index
+from sentence_transformers.quantization import quantize_embeddings
+import os
+# Build the int8 USearch index file 'conala_int8_usearch.index' from the
+# precomputed embeddings.
+path_to_vectorised_dataset = os.path.join(os.getcwd(),'vectorized_dataset')
+dataset = load_from_disk(path_to_vectorised_dataset)
+# Materialise the 'embedding' column as one float32 array.
+embeddings = np.array(dataset["embedding"], dtype=np.float32)
+# Quantize the float32 embeddings to the "int8" precision.
+int8_embeddings = quantize_embeddings(embeddings, "int8")
+# 384-dimensional vectors, inner-product metric, int8 storage.
+index = Index(ndim=384, metric="ip", dtype="i8")             ### embedding dimension
+# Keys are 0..N-1, i.e. each vector's position in the embeddings array.
+index.add(np.arange(len(int8_embeddings)), int8_embeddings)
+# Written to the current working directory.
+index.save("conala_int8_usearch.index")

vectorized_dataset/data-00000-of-00003.arrow ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:bfa90c3110ec533d52d8e89cc4bc140cc5bda6e0b8687108e6d7cd051092c8de
+size 334430032

vectorized_dataset/data-00001-of-00003.arrow ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:7f5f342829b8bdc85295656da9ea17ffea90519c47161505a5d0eb7084ecb881
+size 340242056

vectorized_dataset/data-00002-of-00003.arrow ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:dd6209626da7cf6dfc257a73354aece4bd593afe2ec669683889fff7a98cb356
+size 344600608

vectorized_dataset/dataset_info.json ADDED Viewed

	@@ -0,0 +1,65 @@

+{
+  "builder_name": "parquet",
+  "citation": "@inproceedings{yin2018learning,\n  title={Learning to mine aligned code and natural language pairs from stack overflow},\n  author={Yin, Pengcheng and Deng, Bowen and Chen, Edgar and Vasilescu, Bogdan and Neubig, Graham},\n  booktitle={2018 IEEE/ACM 15th international conference on mining software repositories (MSR)},\n  pages={476--486},\n  year={2018},\n  organization={IEEE}\n}\n",
+  "config_name": "mined",
+  "dataset_name": "conala",
+  "dataset_size": 104561297,
+  "description": "CoNaLa is a dataset of code and natural language pairs crawled from Stack Overflow, for more details please refer to this paper: https://arxiv.org/pdf/1805.08949.pdf or the dataset page https://conala-corpus.github.io/.\n",
+  "download_checksums": {
+    "hf://datasets/neulab/conala@798cef31a9b480d9c31aed21e745c9e485ed2647/mined/train/0000.parquet": {
+      "num_bytes": 74356953,
+      "checksum": null
+    }
+  },
+  "download_size": 74356953,
+  "features": {
+    "question_id": {
+      "dtype": "int64",
+      "_type": "Value"
+    },
+    "parent_answer_post_id": {
+      "dtype": "int64",
+      "_type": "Value"
+    },
+    "prob": {
+      "dtype": "float64",
+      "_type": "Value"
+    },
+    "snippet": {
+      "dtype": "string",
+      "_type": "Value"
+    },
+    "intent": {
+      "dtype": "string",
+      "_type": "Value"
+    },
+    "id": {
+      "dtype": "string",
+      "_type": "Value"
+    },
+    "embedding": {
+      "feature": {
+        "dtype": "float32",
+        "_type": "Value"
+      },
+      "_type": "Sequence"
+    }
+  },
+  "homepage": "https://conala-corpus.github.io/",
+  "license": "",
+  "size_in_bytes": 178918250,
+  "splits": {
+    "train": {
+      "name": "train",
+      "num_bytes": 104561297,
+      "num_examples": 593891,
+      "dataset_name": "conala"
+    }
+  },
+  "version": {
+    "version_str": "1.1.0",
+    "major": 1,
+    "minor": 1,
+    "patch": 0
+  }
+}

vectorized_dataset/state.json ADDED Viewed

	@@ -0,0 +1,27 @@

+{
+  "_data_files": [
+    {
+      "filename": "data-00000-of-00003.arrow"
+    },
+    {
+      "filename": "data-00001-of-00003.arrow"
+    },
+    {
+      "filename": "data-00002-of-00003.arrow"
+    }
+  ],
+  "_fingerprint": "98d56627cf79bda0",
+  "_format_columns": [
+    "embedding",
+    "id",
+    "intent",
+    "parent_answer_post_id",
+    "prob",
+    "question_id",
+    "snippet"
+  ],
+  "_format_kwargs": {},
+  "_format_type": null,
+  "_output_all_columns": false,
+  "_split": null
+}