SwastikM committed on
Commit
ef56763
1 Parent(s): 6376a51

Upload 19 files

Browse files
.gitattributes CHANGED
@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ conala_int8_usearch.index filter=lfs diff=lfs merge=lfs -text
37
+ conala.index filter=lfs diff=lfs merge=lfs -text
Makefile ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ install:
2
+ pip install --upgrade pip &&\
3
+ pip install -r requirements.txt
app.py ADDED
@@ -0,0 +1,94 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ import gradio as gr
3
+ from datasets import load_from_disk
4
+ import pandas as pd
5
+ from sentence_transformers import SentenceTransformer
6
+ from sentence_transformers.quantization import quantize_embeddings
7
+ import faiss
8
+ from usearch.index import Index
9
+ import numpy as np
10
+ import os
11
+
12
# Runtime resources for the demo. Every path is resolved against the current
# working directory, so the app has to be started from the directory that
# holds the dataset folder and the two index files.
base_path = os.getcwd()
full_path = os.path.join(base_path, 'conala')

# CoNaLa rows; search results are looked up here by row number.
conala_dataset = load_from_disk(full_path)

# int8 index, restored with view=True. This app only reads stored vectors
# out of it for rescoring; it never runs a search against it.
int8_view = Index.restore(
    os.path.join(base_path, 'conala_int8_usearch.index'),
    view=True,
)

# Binary index that answers the actual nearest-neighbour query.
binary_index: faiss.IndexBinaryFlat = faiss.read_index_binary(
    os.path.join(base_path, 'conala.index')
)

# Sentence encoder used to embed incoming queries.
model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
20
+
21
def search(query, top_k: int = 20, rescore_multiplier: int = 1):
    """Return the dataset rows that best match ``query``.

    The query is embedded once as float32. A binarized copy is used to pull
    candidates from the binary index; the candidates are then rescored with
    the float32 query against their stored int8 embeddings and the best
    ``top_k`` are returned.

    Args:
        query: Text to search for.
        top_k: Maximum number of rows to return.
        rescore_multiplier: Oversampling factor for the binary search. The
            binary index is asked for ``top_k * rescore_multiplier``
            candidates before rescoring. The default of 1 keeps the previous
            behaviour, where rescoring only reorders the ``top_k`` hits.

    Returns:
        The result of indexing ``conala_dataset`` with the selected row ids,
        ordered from highest to lowest rescored similarity.

    Raises:
        ValueError: If ``rescore_multiplier`` is smaller than 1.
    """
    if rescore_multiplier < 1:
        raise ValueError("rescore_multiplier must be >= 1")

    # 1. Embed the query as float32.
    query_embedding = model.encode(query)

    # 2. Quantize the query to ubinary; the faiss binary index is searched
    #    with this representation.
    query_embedding_ubinary = quantize_embeddings(
        query_embedding.reshape(1, -1), "ubinary"
    )

    # 3. Search the binary index for the candidate pool.
    _scores, binary_ids = binary_index.search(
        query_embedding_ubinary, top_k * rescore_multiplier
    )
    binary_ids = binary_ids[0]
    # faiss fills the label array with -1 when it finds fewer neighbours than
    # requested; those are not valid row ids and must not reach the lookups.
    binary_ids = binary_ids[binary_ids >= 0]

    # 4. Load the int8 embeddings of the candidates for rescoring.
    int8_embeddings = int8_view[binary_ids].astype(int)

    # 5. Rescore: inner product of the float32 query with each int8 document.
    scores = query_embedding @ int8_embeddings.T

    # 6. Keep the top_k best-scoring candidates, best first.
    indices = scores.argsort()[::-1][:top_k]
    top_k_indices = binary_ids[indices]

    return conala_dataset[top_k_indices]
49
+
50
+
51
def response_generator(user_prompt):
    """Build the text shown in the output box for ``user_prompt``.

    Runs ``search`` on the prompt, orders the returned rows by their ``prob``
    column (highest first), keeps the first three distinct snippets and
    formats them as a numbered list.

    Args:
        user_prompt: Query text entered by the user.

    Returns:
        A dict mapping the ``output_box`` component to the formatted string.
        When fewer than three distinct snippets are available only those are
        listed; when none are available a short "not found" message is
        returned instead of raising ``IndexError``.
    """
    max_snippets = 3

    top_k_outputs = search(user_prompt)
    probs = top_k_outputs['prob']
    snippets = top_k_outputs['snippet']

    # Positions of the rows sorted by descending `prob`.
    order = np.argsort(probs)[::-1]

    # Collect distinct snippets in that order and stop once enough are found.
    unique_snippets = []
    for position in order:
        snippet = snippets[position]
        if snippet in unique_snippets:
            continue
        unique_snippets.append(snippet)
        if len(unique_snippets) == max_snippets:
            break

    if not unique_snippets:
        output = "No matching code snippets were found in the database."
    else:
        header = "The top three most relevant code snippets from the database are:"
        numbered = [
            f"{rank}. {snippet}"
            for rank, snippet in enumerate(unique_snippets, start=1)
        ]
        # Same layout as before: header and entries separated by blank lines.
        output = "\n\n".join([header, *numbered])

    return {output_box: output}
68
+
69
+
70
# Gradio front end: a short description, one input textbox and one output
# textbox wired to response_generator.
with gr.Blocks() as demo:

    # "5,93,891" is the dataset row count (593891) written with Indian digit
    # grouping.
    gr.Markdown(
    """
    # Embedding Quantization

    ## Quantized Semantic Search

    - ***Embedding:*** all-MiniLM-L6-v2
    - ***Vector DB:*** faiss, USearch
    - ***Vector_DB Size:*** `5,93,891`

    """)

    # NOTE(review): state_var is not referenced anywhere else in the visible
    # code; confirm whether it is still needed.
    state_var = gr.State([])


    input_box = gr.Textbox(autoscroll=True,visible=True,label='User',info="Enter a query.",value="How to extract the n-th elements from a list of tuples in python?")
    output_box = gr.Textbox(autoscroll=True,max_lines=30,value="Output",label='Assistant')
    # response_generator returns {output_box: text}, so output_box must exist
    # before the interface is created.
    gr.Interface(fn=response_generator, inputs=[input_box], outputs=[output_box],
                 delete_cache=(20,10),
                 allow_flagging='never')

demo.queue()
demo.launch()
binary_index.py ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from datasets import load_from_disk
2
+ import numpy as np
3
+ from faiss import IndexBinaryFlat, write_index_binary
4
+ from sentence_transformers.quantization import quantize_embeddings
5
+
6
+ import os
7
# Build the faiss binary index from the saved float32 embeddings and write it
# to "conala.index" in the current working directory.
path_to_vectorised_dataset = os.path.join(os.getcwd(), 'vectorized_dataset')

dataset = load_from_disk(path_to_vectorised_dataset)
embeddings = np.array(dataset["embedding"], dtype=np.float32)

ubinary_embeddings = quantize_embeddings(embeddings, "ubinary")

# IndexBinaryFlat takes the vector size in bits, which equals the number of
# float dimensions before binarization. Reading it from the data (384 for the
# embeddings used here) keeps the script valid if the encoder changes.
index = IndexBinaryFlat(embeddings.shape[1])
index.add(ubinary_embeddings)
write_index_binary(index, "conala.index")
conala.index ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9168e2e320dd7d85ff9b01636a43dbfce18b1922cda518864c923366e88f0b8c
3
+ size 28506801
conala/cache-98d56627cf79bda0.arrow ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0aba3bba56615d8284d26413efc5bb8a314814d8c2009d0a61a30cb718d6e576
3
+ size 1019270752
conala/data-00000-of-00001.arrow ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:84d9591a024c23ec27e4162ae208c0b7fe657a8923760a95aa353b1f478c0b4f
3
+ size 104616624
conala/dataset_info.json ADDED
@@ -0,0 +1,58 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "builder_name": "parquet",
3
+ "citation": "@inproceedings{yin2018learning,\n title={Learning to mine aligned code and natural language pairs from stack overflow},\n author={Yin, Pengcheng and Deng, Bowen and Chen, Edgar and Vasilescu, Bogdan and Neubig, Graham},\n booktitle={2018 IEEE/ACM 15th international conference on mining software repositories (MSR)},\n pages={476--486},\n year={2018},\n organization={IEEE}\n}\n",
4
+ "config_name": "mined",
5
+ "dataset_name": "conala",
6
+ "dataset_size": 104561297,
7
+ "description": "CoNaLa is a dataset of code and natural language pairs crawled from Stack Overflow, for more details please refer to this paper: https://arxiv.org/pdf/1805.08949.pdf or the dataset page https://conala-corpus.github.io/.\n",
8
+ "download_checksums": {
9
+ "hf://datasets/neulab/conala@798cef31a9b480d9c31aed21e745c9e485ed2647/mined/train/0000.parquet": {
10
+ "num_bytes": 74356953,
11
+ "checksum": null
12
+ }
13
+ },
14
+ "download_size": 74356953,
15
+ "features": {
16
+ "question_id": {
17
+ "dtype": "int64",
18
+ "_type": "Value"
19
+ },
20
+ "parent_answer_post_id": {
21
+ "dtype": "int64",
22
+ "_type": "Value"
23
+ },
24
+ "prob": {
25
+ "dtype": "float64",
26
+ "_type": "Value"
27
+ },
28
+ "snippet": {
29
+ "dtype": "string",
30
+ "_type": "Value"
31
+ },
32
+ "intent": {
33
+ "dtype": "string",
34
+ "_type": "Value"
35
+ },
36
+ "id": {
37
+ "dtype": "string",
38
+ "_type": "Value"
39
+ }
40
+ },
41
+ "homepage": "https://conala-corpus.github.io/",
42
+ "license": "",
43
+ "size_in_bytes": 178918250,
44
+ "splits": {
45
+ "train": {
46
+ "name": "train",
47
+ "num_bytes": 104561297,
48
+ "num_examples": 593891,
49
+ "dataset_name": "conala"
50
+ }
51
+ },
52
+ "version": {
53
+ "version_str": "1.1.0",
54
+ "major": 1,
55
+ "minor": 1,
56
+ "patch": 0
57
+ }
58
+ }
conala/state.json ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_data_files": [
3
+ {
4
+ "filename": "data-00000-of-00001.arrow"
5
+ }
6
+ ],
7
+ "_fingerprint": "cd29c6cc7b846bee",
8
+ "_format_columns": [
9
+ "question_id",
10
+ "parent_answer_post_id",
11
+ "prob",
12
+ "snippet",
13
+ "intent",
14
+ "id"
15
+ ],
16
+ "_format_kwargs": {},
17
+ "_format_type": null,
18
+ "_output_all_columns": false,
19
+ "_split": null
20
+ }
conala_int8_usearch.index ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:32e4de66d49ef092bf814e753769b84aae93bb6068b24c5d0826a9e44ad32917
3
+ size 316247436
create_mbedding-vecotor.ipynb ADDED
@@ -0,0 +1,189 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": null,
6
+ "metadata": {},
7
+ "outputs": [],
8
+ "source": []
9
+ },
10
+ {
11
+ "cell_type": "markdown",
12
+ "metadata": {},
13
+ "source": [
14
+ "# Import Libraries"
15
+ ]
16
+ },
17
+ {
18
+ "cell_type": "code",
19
+ "execution_count": 1,
20
+ "metadata": {},
21
+ "outputs": [
22
+ {
23
+ "name": "stderr",
24
+ "output_type": "stream",
25
+ "text": [
26
+ "/home/swastikm/work/senetence-transformer-in-action-main/semantic_search_env/lib/python3.10/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
27
+ " from .autonotebook import tqdm as notebook_tqdm\n"
28
+ ]
29
+ }
30
+ ],
31
+ "source": [
32
+ "from sentence_transformers import SentenceTransformer"
33
+ ]
34
+ },
35
+ {
36
+ "cell_type": "markdown",
37
+ "metadata": {},
38
+ "source": [
39
+ "## Load Dataset"
40
+ ]
41
+ },
42
+ {
43
+ "cell_type": "code",
44
+ "execution_count": 2,
45
+ "metadata": {},
46
+ "outputs": [],
47
+ "source": [
48
+ "from datasets import load_from_disk\n",
49
+ "import os"
50
+ ]
51
+ },
52
+ {
53
+ "cell_type": "code",
54
+ "execution_count": 3,
55
+ "metadata": {},
56
+ "outputs": [],
57
+ "source": [
58
+ "base_path = os.getcwd()\n",
59
+ "dataset_folder = \"conala\"\n",
60
+ "path_to_dataset = os.path.join(base_path,dataset_folder)\n",
61
+ "dataset = load_from_disk(path_to_dataset)"
62
+ ]
63
+ },
64
+ {
65
+ "cell_type": "markdown",
66
+ "metadata": {},
67
+ "source": [
68
+ "# Create vector embedding"
69
+ ]
70
+ },
71
+ {
72
+ "cell_type": "code",
73
+ "execution_count": 4,
74
+ "metadata": {},
75
+ "outputs": [
76
+ {
77
+ "name": "stderr",
78
+ "output_type": "stream",
79
+ "text": [
80
+ "/home/swastikm/work/senetence-transformer-in-action-main/semantic_search_env/lib/python3.10/site-packages/huggingface_hub/file_download.py:1132: FutureWarning: `resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. If you want to force a new download, use `force_download=True`.\n",
81
+ " warnings.warn(\n"
82
+ ]
83
+ }
84
+ ],
85
+ "source": [
86
+ "model = SentenceTransformer(\"sentence-transformers/all-MiniLM-L6-v2\")"
87
+ ]
88
+ },
89
+ {
90
+ "cell_type": "code",
91
+ "execution_count": 5,
92
+ "metadata": {},
93
+ "outputs": [],
94
+ "source": [
95
+ "def get_embeddings(examples):\n",
96
+ " vectors = {}\n",
97
+ " model_input = examples['intent']\n",
98
+ " out = model.encode(model_input)\n",
99
+ " vectors['embedding'] = out\n",
100
+ " return vectors"
101
+ ]
102
+ },
103
+ {
104
+ "cell_type": "code",
105
+ "execution_count": 6,
106
+ "metadata": {},
107
+ "outputs": [
108
+ {
109
+ "name": "stderr",
110
+ "output_type": "stream",
111
+ "text": [
112
+ "Map: 100%|██████████| 593891/593891 [20:14<00:00, 488.98 examples/s]\n"
113
+ ]
114
+ }
115
+ ],
116
+ "source": [
117
+ "vectorized_dataset = dataset.map(get_embeddings,batched=True)"
118
+ ]
119
+ },
120
+ {
121
+ "cell_type": "code",
122
+ "execution_count": 7,
123
+ "metadata": {},
124
+ "outputs": [
125
+ {
126
+ "data": {
127
+ "text/plain": [
128
+ "Dataset({\n",
129
+ " features: ['question_id', 'parent_answer_post_id', 'prob', 'snippet', 'intent', 'id', 'embedding'],\n",
130
+ " num_rows: 593891\n",
131
+ "})"
132
+ ]
133
+ },
134
+ "execution_count": 7,
135
+ "metadata": {},
136
+ "output_type": "execute_result"
137
+ }
138
+ ],
139
+ "source": [
140
+ "vectorized_dataset"
141
+ ]
142
+ },
143
+ {
144
+ "cell_type": "code",
145
+ "execution_count": 8,
146
+ "metadata": {},
147
+ "outputs": [
148
+ {
149
+ "name": "stderr",
150
+ "output_type": "stream",
151
+ "text": [
152
+ "Saving the dataset (3/3 shards): 100%|██████████| 593891/593891 [00:08<00:00, 69697.72 examples/s] \n"
153
+ ]
154
+ }
155
+ ],
156
+ "source": [
157
+ "vectorized_dataset.save_to_disk('vectorized_dataset')"
158
+ ]
159
+ },
160
+ {
161
+ "cell_type": "code",
162
+ "execution_count": null,
163
+ "metadata": {},
164
+ "outputs": [],
165
+ "source": []
166
+ }
167
+ ],
168
+ "metadata": {
169
+ "kernelspec": {
170
+ "display_name": "semactic_env",
171
+ "language": "python",
172
+ "name": "python3"
173
+ },
174
+ "language_info": {
175
+ "codemirror_mode": {
176
+ "name": "ipython",
177
+ "version": 3
178
+ },
179
+ "file_extension": ".py",
180
+ "mimetype": "text/x-python",
181
+ "name": "python",
182
+ "nbconvert_exporter": "python",
183
+ "pygments_lexer": "ipython3",
184
+ "version": "3.10.12"
185
+ }
186
+ },
187
+ "nbformat": 4,
188
+ "nbformat_minor": 2
189
+ }
query_search.ipynb ADDED
@@ -0,0 +1,303 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": null,
6
+ "metadata": {},
7
+ "outputs": [],
8
+ "source": []
9
+ },
10
+ {
11
+ "cell_type": "code",
12
+ "execution_count": null,
13
+ "metadata": {},
14
+ "outputs": [],
15
+ "source": []
16
+ },
17
+ {
18
+ "cell_type": "markdown",
19
+ "metadata": {},
20
+ "source": [
21
+ "# Import Libraries"
22
+ ]
23
+ },
24
+ {
25
+ "cell_type": "code",
26
+ "execution_count": 1,
27
+ "metadata": {},
28
+ "outputs": [
29
+ {
30
+ "name": "stderr",
31
+ "output_type": "stream",
32
+ "text": [
33
+ "/home/swastikm/work/senetence-transformer-in-action-main/semantic_search_env/lib/python3.10/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
34
+ " from .autonotebook import tqdm as notebook_tqdm\n"
35
+ ]
36
+ }
37
+ ],
38
+ "source": [
39
+ "import time\n",
40
+ "from datasets import load_from_disk\n",
41
+ "import pandas as pd\n",
42
+ "from sentence_transformers import SentenceTransformer\n",
43
+ "from sentence_transformers.quantization import quantize_embeddings\n",
44
+ "import faiss\n",
45
+ "from usearch.index import Index\n",
46
+ "import numpy as np\n",
47
+ "import os"
48
+ ]
49
+ },
50
+ {
51
+ "cell_type": "markdown",
52
+ "metadata": {},
53
+ "source": [
54
+ "# Load Dataset"
55
+ ]
56
+ },
57
+ {
58
+ "cell_type": "code",
59
+ "execution_count": 2,
60
+ "metadata": {},
61
+ "outputs": [],
62
+ "source": [
63
+ "base_path = os.getcwd()\n",
64
+ "full_path = os.path.join(base_path, 'conala')"
65
+ ]
66
+ },
67
+ {
68
+ "cell_type": "code",
69
+ "execution_count": 3,
70
+ "metadata": {},
71
+ "outputs": [],
72
+ "source": [
73
+ "conala_dataset = load_from_disk(full_path)"
74
+ ]
75
+ },
76
+ {
77
+ "cell_type": "markdown",
78
+ "metadata": {},
79
+ "source": [
80
+ "```Text\n",
81
+ "Load the int8 and binary indices. Int8 is loaded as a view to save memory, as we never actually perform search with it.\n",
82
+ "Int8 embedding is required to perform rescoring of fetched document. Rescoring is done by performing inner product with F32 embedding of Query\n",
83
+ "```\n",
84
+ "[Efficient Passage Retrieval with Hashing for Open-domain Question Answering](https://arxiv.org/abs/2106.00882)"
85
+ ]
86
+ },
87
+ {
88
+ "cell_type": "code",
89
+ "execution_count": 4,
90
+ "metadata": {},
91
+ "outputs": [],
92
+ "source": [
93
+ "# Load the int8 and binary indices. Int8 is loaded as a view to save memory, as we never actually perform search with it.\n",
94
+ "int8_view = Index.restore(os.path.join(base_path, 'conala_int8_usearch.index'), view=True)\n",
95
+ "binary_index: faiss.IndexBinaryFlat = faiss.read_index_binary(os.path.join(base_path, 'conala.index'))"
96
+ ]
97
+ },
98
+ {
99
+ "cell_type": "markdown",
100
+ "metadata": {},
101
+ "source": [
102
+ "# Import Model to generate embedding"
103
+ ]
104
+ },
105
+ {
106
+ "cell_type": "code",
107
+ "execution_count": 5,
108
+ "metadata": {},
109
+ "outputs": [
110
+ {
111
+ "name": "stderr",
112
+ "output_type": "stream",
113
+ "text": [
114
+ "/home/swastikm/work/senetence-transformer-in-action-main/semantic_search_env/lib/python3.10/site-packages/huggingface_hub/file_download.py:1132: FutureWarning: `resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. If you want to force a new download, use `force_download=True`.\n",
115
+ " warnings.warn(\n"
116
+ ]
117
+ }
118
+ ],
119
+ "source": [
120
+ "model = SentenceTransformer(\"sentence-transformers/all-MiniLM-L6-v2\")"
121
+ ]
122
+ },
123
+ {
124
+ "cell_type": "code",
125
+ "execution_count": 6,
126
+ "metadata": {},
127
+ "outputs": [],
128
+ "source": [
129
+ "def search(query, top_k: int = 20):\n",
130
+ " # 1. Embed the query as float32\n",
131
+ " query_embedding = model.encode(query)\n",
132
+ "\n",
133
+ " # 2. Quantize the query to ubinary. To perform actual search with faiss\n",
134
+ " query_embedding_ubinary = quantize_embeddings(query_embedding.reshape(1, -1), \"ubinary\")\n",
135
+ "\n",
136
+ "\n",
137
+ " # 3. Search the binary index \n",
138
+ " index = binary_index\n",
139
+ " _scores, binary_ids = index.search(query_embedding_ubinary, top_k)\n",
140
+ " binary_ids = binary_ids[0]\n",
141
+ "\n",
142
+ "\n",
143
+ " # 4. Load the corresponding int8 embeddings. To perform rescoring to calculate score of fetched documents.\n",
144
+ " int8_embeddings = int8_view[binary_ids].astype(int)\n",
145
+ "\n",
146
+ " # 5. Rescore the top_k * rescore_multiplier using the float32 query embedding and the int8 document embeddings\n",
147
+ " scores = query_embedding @ int8_embeddings.T\n",
148
+ "\n",
149
+ " # 6. Sort the scores and return the top_k\n",
150
+ " start_time = time.time()\n",
151
+ " indices = scores.argsort()[::-1][:top_k]\n",
152
+ " top_k_indices = binary_ids[indices]\n",
153
+ " top_k_scores = scores[indices]\n",
154
+ "\n",
155
+ " top_k_codes = conala_dataset[top_k_indices]\n",
156
+ "\n",
157
+ " return top_k_codes\n"
158
+ ]
159
+ },
160
+ {
161
+ "cell_type": "code",
162
+ "execution_count": 7,
163
+ "metadata": {},
164
+ "outputs": [],
165
+ "source": [
166
+ "user_prompt = input('Enter python coding query')\n",
167
+ "top_k_outputs = search(user_prompt)"
168
+ ]
169
+ },
170
+ {
171
+ "cell_type": "code",
172
+ "execution_count": 8,
173
+ "metadata": {},
174
+ "outputs": [],
175
+ "source": [
176
+ "probs = top_k_outputs['prob']\n",
177
+ "snippets = top_k_outputs['snippet']"
178
+ ]
179
+ },
180
+ {
181
+ "cell_type": "code",
182
+ "execution_count": 9,
183
+ "metadata": {},
184
+ "outputs": [],
185
+ "source": [
186
+ "idx = np.argsort(probs)[::-1]\n",
187
+ "results = np.array(snippets)[idx]"
188
+ ]
189
+ },
190
+ {
191
+ "cell_type": "code",
192
+ "execution_count": 10,
193
+ "metadata": {},
194
+ "outputs": [
195
+ {
196
+ "data": {
197
+ "text/plain": [
198
+ "['[x[1] for x in elements]',\n",
199
+ " 'map(itemgetter(1), elements)',\n",
200
+ " 'zip(*elements)[1]']"
201
+ ]
202
+ },
203
+ "execution_count": 10,
204
+ "metadata": {},
205
+ "output_type": "execute_result"
206
+ }
207
+ ],
208
+ "source": [
209
+ "filtered_results = []\n",
210
+ "for item in results:\n",
211
+ " if len(filtered_results)<3:\n",
212
+ " if item not in filtered_results:\n",
213
+ " filtered_results.append(item)\n",
214
+ "filtered_results"
215
+ ]
216
+ },
217
+ {
218
+ "cell_type": "code",
219
+ "execution_count": 11,
220
+ "metadata": {},
221
+ "outputs": [],
222
+ "source": [
223
+ "output_template = \"User Query: {user_query}\\nBelow are some examples of previous conversations.\\nQuery: {query1} Solution: {solution1}\\nQuery: {query2} Solution: {solution2}\\nYou may use the above examples for reference only. Create your own solution and provide only the solution\""
224
+ ]
225
+ },
226
+ {
227
+ "cell_type": "code",
228
+ "execution_count": 12,
229
+ "metadata": {},
230
+ "outputs": [],
231
+ "source": [
232
+ "output_template = \"The top three most relevant code snippets from the database are:\\n\\n1. {snippet1}\\n\\n2. {snippet2}\\n\\n3. {snippet3}\""
233
+ ]
234
+ },
235
+ {
236
+ "cell_type": "code",
237
+ "execution_count": 13,
238
+ "metadata": {},
239
+ "outputs": [],
240
+ "source": [
241
+ "output = f'{output_template.format(snippet1=filtered_results[0],snippet2=filtered_results[1],snippet3=filtered_results[2])}'"
242
+ ]
243
+ },
244
+ {
245
+ "cell_type": "code",
246
+ "execution_count": 14,
247
+ "metadata": {},
248
+ "outputs": [
249
+ {
250
+ "name": "stdout",
251
+ "output_type": "stream",
252
+ "text": [
253
+ "The top three most relevant code snippets from the database are:\n",
254
+ "\n",
255
+ "1. [x[1] for x in elements]\n",
256
+ "\n",
257
+ "2. map(itemgetter(1), elements)\n",
258
+ "\n",
259
+ "3. zip(*elements)[1]\n"
260
+ ]
261
+ }
262
+ ],
263
+ "source": [
264
+ "print(output)"
265
+ ]
266
+ },
267
+ {
268
+ "cell_type": "code",
269
+ "execution_count": null,
270
+ "metadata": {},
271
+ "outputs": [],
272
+ "source": []
273
+ },
274
+ {
275
+ "cell_type": "code",
276
+ "execution_count": null,
277
+ "metadata": {},
278
+ "outputs": [],
279
+ "source": []
280
+ }
281
+ ],
282
+ "metadata": {
283
+ "kernelspec": {
284
+ "display_name": "Python 3",
285
+ "language": "python",
286
+ "name": "python3"
287
+ },
288
+ "language_info": {
289
+ "codemirror_mode": {
290
+ "name": "ipython",
291
+ "version": 3
292
+ },
293
+ "file_extension": ".py",
294
+ "mimetype": "text/x-python",
295
+ "name": "python",
296
+ "nbconvert_exporter": "python",
297
+ "pygments_lexer": "ipython3",
298
+ "version": "3.10.12"
299
+ }
300
+ },
301
+ "nbformat": 4,
302
+ "nbformat_minor": 2
303
+ }
requirements.txt ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ einops==0.8.0
2
+ datasets==2.19.0
3
+ faiss-cpu==1.8.0
4
+ sentence-transformers==2.7.0
5
+ usearch==2.12.0
6
+ gradio==4.28.3
retrieve_dataset.ipynb ADDED
@@ -0,0 +1,125 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": null,
6
+ "metadata": {},
7
+ "outputs": [],
8
+ "source": []
9
+ },
10
+ {
11
+ "cell_type": "code",
12
+ "execution_count": 1,
13
+ "metadata": {},
14
+ "outputs": [
15
+ {
16
+ "name": "stderr",
17
+ "output_type": "stream",
18
+ "text": [
19
+ "/home/swastikm/work/senetence-transformer-in-action-main/semantic_search_env/lib/python3.10/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
20
+ " from .autonotebook import tqdm as notebook_tqdm\n"
21
+ ]
22
+ }
23
+ ],
24
+ "source": [
25
+ "import datasets\n",
26
+ "from datasets import load_dataset"
27
+ ]
28
+ },
29
+ {
30
+ "cell_type": "code",
31
+ "execution_count": 2,
32
+ "metadata": {},
33
+ "outputs": [],
34
+ "source": [
35
+ "raw_datasets = load_dataset(\"neulab/conala\", \"mined\", trust_remote_code=True)"
36
+ ]
37
+ },
38
+ {
39
+ "cell_type": "code",
40
+ "execution_count": 3,
41
+ "metadata": {},
42
+ "outputs": [
43
+ {
44
+ "data": {
45
+ "text/plain": [
46
+ "DatasetDict({\n",
47
+ " train: Dataset({\n",
48
+ " features: ['question_id', 'parent_answer_post_id', 'prob', 'snippet', 'intent', 'id'],\n",
49
+ " num_rows: 593891\n",
50
+ " })\n",
51
+ "})"
52
+ ]
53
+ },
54
+ "execution_count": 3,
55
+ "metadata": {},
56
+ "output_type": "execute_result"
57
+ }
58
+ ],
59
+ "source": [
60
+ "raw_datasets"
61
+ ]
62
+ },
63
+ {
64
+ "cell_type": "code",
65
+ "execution_count": 4,
66
+ "metadata": {},
67
+ "outputs": [],
68
+ "source": [
69
+ "raw_datasets = datasets.concatenate_datasets([raw_datasets[\"train\"]])"
70
+ ]
71
+ },
72
+ {
73
+ "cell_type": "code",
74
+ "execution_count": 5,
75
+ "metadata": {},
76
+ "outputs": [
77
+ {
78
+ "name": "stderr",
79
+ "output_type": "stream",
80
+ "text": [
81
+ "Saving the dataset (1/1 shards): 100%|██████████| 593891/593891 [00:00<00:00, 1373525.97 examples/s]\n"
82
+ ]
83
+ }
84
+ ],
85
+ "source": [
86
+ "raw_datasets.save_to_disk(\"conala\")"
87
+ ]
88
+ },
89
+ {
90
+ "cell_type": "code",
91
+ "execution_count": null,
92
+ "metadata": {},
93
+ "outputs": [],
94
+ "source": []
95
+ },
96
+ {
97
+ "cell_type": "code",
98
+ "execution_count": null,
99
+ "metadata": {},
100
+ "outputs": [],
101
+ "source": []
102
+ }
103
+ ],
104
+ "metadata": {
105
+ "kernelspec": {
106
+ "display_name": "semantic_search_env",
107
+ "language": "python",
108
+ "name": "python3"
109
+ },
110
+ "language_info": {
111
+ "codemirror_mode": {
112
+ "name": "ipython",
113
+ "version": 3
114
+ },
115
+ "file_extension": ".py",
116
+ "mimetype": "text/x-python",
117
+ "name": "python",
118
+ "nbconvert_exporter": "python",
119
+ "pygments_lexer": "ipython3",
120
+ "version": "3.10.12"
121
+ }
122
+ },
123
+ "nbformat": 4,
124
+ "nbformat_minor": 2
125
+ }
save_int8_index.py ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from datasets import load_from_disk
2
+ import numpy as np
3
+ from usearch.index import Index
4
+ from sentence_transformers.quantization import quantize_embeddings
5
+
6
+
7
+ import os
8
# Build the USearch int8 index from the saved float32 embeddings and write it
# to "conala_int8_usearch.index" in the current working directory.
path_to_vectorised_dataset = os.path.join(os.getcwd(), 'vectorized_dataset')

dataset = load_from_disk(path_to_vectorised_dataset)
embeddings = np.array(dataset["embedding"], dtype=np.float32)

int8_embeddings = quantize_embeddings(embeddings, "int8")

# Take the dimensionality from the data (384 for the embeddings used here)
# instead of hard-coding it, so a different encoder does not need a code edit.
index = Index(ndim=embeddings.shape[1], metric="ip", dtype="i8")
# Keys are the row numbers, so a key can be used directly to index the dataset.
index.add(np.arange(len(int8_embeddings)), int8_embeddings)
index.save("conala_int8_usearch.index")
vectorized_dataset/data-00000-of-00003.arrow ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bfa90c3110ec533d52d8e89cc4bc140cc5bda6e0b8687108e6d7cd051092c8de
3
+ size 334430032
vectorized_dataset/data-00001-of-00003.arrow ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7f5f342829b8bdc85295656da9ea17ffea90519c47161505a5d0eb7084ecb881
3
+ size 340242056
vectorized_dataset/data-00002-of-00003.arrow ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:dd6209626da7cf6dfc257a73354aece4bd593afe2ec669683889fff7a98cb356
3
+ size 344600608
vectorized_dataset/dataset_info.json ADDED
@@ -0,0 +1,65 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "builder_name": "parquet",
3
+ "citation": "@inproceedings{yin2018learning,\n title={Learning to mine aligned code and natural language pairs from stack overflow},\n author={Yin, Pengcheng and Deng, Bowen and Chen, Edgar and Vasilescu, Bogdan and Neubig, Graham},\n booktitle={2018 IEEE/ACM 15th international conference on mining software repositories (MSR)},\n pages={476--486},\n year={2018},\n organization={IEEE}\n}\n",
4
+ "config_name": "mined",
5
+ "dataset_name": "conala",
6
+ "dataset_size": 104561297,
7
+ "description": "CoNaLa is a dataset of code and natural language pairs crawled from Stack Overflow, for more details please refer to this paper: https://arxiv.org/pdf/1805.08949.pdf or the dataset page https://conala-corpus.github.io/.\n",
8
+ "download_checksums": {
9
+ "hf://datasets/neulab/conala@798cef31a9b480d9c31aed21e745c9e485ed2647/mined/train/0000.parquet": {
10
+ "num_bytes": 74356953,
11
+ "checksum": null
12
+ }
13
+ },
14
+ "download_size": 74356953,
15
+ "features": {
16
+ "question_id": {
17
+ "dtype": "int64",
18
+ "_type": "Value"
19
+ },
20
+ "parent_answer_post_id": {
21
+ "dtype": "int64",
22
+ "_type": "Value"
23
+ },
24
+ "prob": {
25
+ "dtype": "float64",
26
+ "_type": "Value"
27
+ },
28
+ "snippet": {
29
+ "dtype": "string",
30
+ "_type": "Value"
31
+ },
32
+ "intent": {
33
+ "dtype": "string",
34
+ "_type": "Value"
35
+ },
36
+ "id": {
37
+ "dtype": "string",
38
+ "_type": "Value"
39
+ },
40
+ "embedding": {
41
+ "feature": {
42
+ "dtype": "float32",
43
+ "_type": "Value"
44
+ },
45
+ "_type": "Sequence"
46
+ }
47
+ },
48
+ "homepage": "https://conala-corpus.github.io/",
49
+ "license": "",
50
+ "size_in_bytes": 178918250,
51
+ "splits": {
52
+ "train": {
53
+ "name": "train",
54
+ "num_bytes": 104561297,
55
+ "num_examples": 593891,
56
+ "dataset_name": "conala"
57
+ }
58
+ },
59
+ "version": {
60
+ "version_str": "1.1.0",
61
+ "major": 1,
62
+ "minor": 1,
63
+ "patch": 0
64
+ }
65
+ }
vectorized_dataset/state.json ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_data_files": [
3
+ {
4
+ "filename": "data-00000-of-00003.arrow"
5
+ },
6
+ {
7
+ "filename": "data-00001-of-00003.arrow"
8
+ },
9
+ {
10
+ "filename": "data-00002-of-00003.arrow"
11
+ }
12
+ ],
13
+ "_fingerprint": "98d56627cf79bda0",
14
+ "_format_columns": [
15
+ "embedding",
16
+ "id",
17
+ "intent",
18
+ "parent_answer_post_id",
19
+ "prob",
20
+ "question_id",
21
+ "snippet"
22
+ ],
23
+ "_format_kwargs": {},
24
+ "_format_type": null,
25
+ "_output_all_columns": false,
26
+ "_split": null
27
+ }