Spaces:
Runtime error
Runtime error
Upload 19 files
Browse files- .gitattributes +2 -0
- Makefile +3 -0
- app.py +94 -0
- binary_index.py +15 -0
- conala.index +3 -0
- conala/cache-98d56627cf79bda0.arrow +3 -0
- conala/data-00000-of-00001.arrow +3 -0
- conala/dataset_info.json +58 -0
- conala/state.json +20 -0
- conala_int8_usearch.index +3 -0
- create_mbedding-vecotor.ipynb +189 -0
- query_search.ipynb +303 -0
- requirements.txt +6 -0
- retrieve_dataset.ipynb +125 -0
- save_int8_index.py +16 -0
- vectorized_dataset/data-00000-of-00003.arrow +3 -0
- vectorized_dataset/data-00001-of-00003.arrow +3 -0
- vectorized_dataset/data-00002-of-00003.arrow +3 -0
- vectorized_dataset/dataset_info.json +65 -0
- vectorized_dataset/state.json +27 -0
.gitattributes
CHANGED
@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
36 |
+
conala_int8_usearch.index filter=lfs diff=lfs merge=lfs -text
|
37 |
+
conala.index filter=lfs diff=lfs merge=lfs -text
|
Makefile
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
# `install` is a command, not a file; declare it phony so that a stray file
# named `install` can never make the target look up to date.
.PHONY: install

# Upgrade pip, then install the pinned project dependencies.
install:
	pip install --upgrade pip &&\
		pip install -r requirements.txt
|
app.py
ADDED
@@ -0,0 +1,94 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
|
2 |
+
import gradio as gr
|
3 |
+
from datasets import load_from_disk
|
4 |
+
import pandas as pd
|
5 |
+
from sentence_transformers import SentenceTransformer
|
6 |
+
from sentence_transformers.quantization import quantize_embeddings
|
7 |
+
import faiss
|
8 |
+
from usearch.index import Index
|
9 |
+
import numpy as np
|
10 |
+
import os
|
11 |
+
|
12 |
+
# All artefacts are resolved against the process working directory.
base_path = os.getcwd()

# Dataset rows that search hits are mapped back to.
full_path = os.path.join(base_path, 'conala')
conala_dataset = load_from_disk(full_path)

# The int8 index is opened with view=True and is only used to read stored
# vectors for rescoring; the faiss binary index is the one that is searched.
_int8_index_path = os.path.join(base_path, 'conala_int8_usearch.index')
int8_view = Index.restore(_int8_index_path, view=True)

_binary_index_path = os.path.join(base_path, 'conala.index')
binary_index: faiss.IndexBinaryFlat = faiss.read_index_binary(_binary_index_path)

# Sentence encoder used to embed incoming queries.
model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
|
20 |
+
|
21 |
+
def search(query, top_k: int = 20):
    """Retrieve the dataset rows most similar to ``query``.

    The query is embedded with the module-level ``model``, quantized to
    unsigned binary and looked up in ``binary_index``. The candidates found
    there are rescored with the inner product between the float32 query
    embedding and their int8 embeddings read from ``int8_view``.

    Args:
        query: Text to search for.
        top_k: Number of candidates requested from the binary index, and the
            maximum number of rows returned.

    Returns:
        The result of indexing ``conala_dataset`` with the selected row ids,
        ordered from the highest to the lowest rescored similarity.
    """
    # 1. Embed the query as float32.
    query_embedding = model.encode(query)

    # 2. Quantize the query to ubinary, the format the faiss index is searched with.
    query_embedding_ubinary = quantize_embeddings(query_embedding.reshape(1, -1), "ubinary")

    # 3. Search the binary index (a single query, hence row 0 of the result).
    _scores, binary_ids = binary_index.search(query_embedding_ubinary, top_k)
    binary_ids = binary_ids[0]
    # faiss pads the id array with -1 when it finds fewer than top_k results;
    # those are not valid keys and must not reach the int8 lookup below.
    binary_ids = binary_ids[binary_ids >= 0]

    # 4. Load the int8 embeddings of the candidates for rescoring.
    int8_embeddings = int8_view[binary_ids].astype(int)

    # 5. Rescore the candidates with the float32 query embedding.
    scores = query_embedding @ int8_embeddings.T

    # 6. Order the candidates by descending score and keep at most top_k.
    indices = scores.argsort()[::-1][:top_k]
    top_k_indices = binary_ids[indices]

    return conala_dataset[top_k_indices]
|
49 |
+
|
50 |
+
|
51 |
+
def response_generator(user_prompt):
    """Build the text shown to the user for ``user_prompt``.

    Runs ``search`` on the prompt, orders the retrieved snippets by their
    ``prob`` value (highest first), removes duplicates and lists up to three
    of them.

    Args:
        user_prompt: The query typed by the user.

    Returns:
        A dict mapping the ``output_box`` component to the formatted text.
    """
    top_k_outputs = search(user_prompt)
    probs = top_k_outputs['prob']
    snippets = top_k_outputs['snippet']

    # Order the retrieved snippets by descending ``prob``.
    idx = np.argsort(probs)[::-1]
    results = np.array(snippets)[idx]

    # Keep the first three distinct snippets; stop scanning once we have them.
    filtered_results = []
    for item in results:
        if item not in filtered_results:
            filtered_results.append(item)
            if len(filtered_results) == 3:
                break

    if not filtered_results:
        output = "No relevant code snippets were found in the database."
    else:
        # Number whatever was found instead of indexing three fixed slots,
        # which raised IndexError when fewer than three distinct snippets
        # came back. With three snippets the text is unchanged.
        header = "The top three most relevant code snippets from the database are:"
        numbered = [
            f"{rank}. {snippet}"
            for rank, snippet in enumerate(filtered_results, start=1)
        ]
        output = "\n\n".join([header, *numbered])

    return {output_box: output}
|
68 |
+
|
69 |
+
|
70 |
+
# Gradio front end: a header, a query box and an answer box wired to
# response_generator.
with gr.Blocks() as demo:

    gr.Markdown(
    """
    # Embedding Quantization

    ## Quantized Semantic Search

    - ***Embedding:*** all-MiniLM-L6-v2
    - ***Vector DB:*** faiss, USearch
    - ***Vector_DB Size:*** `5,93,891`

    """)

    # NOTE(review): not referenced anywhere in the visible code; kept in case
    # something outside this file relies on it.
    state_var = gr.State([])

    input_box = gr.Textbox(autoscroll=True,visible=True,label='User',info="Enter a query.",value="How to extract the n-th elements from a list of tuples in python?")
    output_box = gr.Textbox(autoscroll=True,max_lines=30,value="Output",label='Assistant')
    # response_generator returns {output_box: text}, so output_box must be the
    # same component object that is registered as the output here.
    # NOTE(review): delete_cache presumably is (frequency, age) in seconds -
    # confirm against the Gradio version pinned in requirements.txt.
    gr.Interface(fn=response_generator, inputs=[input_box], outputs=[output_box],
                 delete_cache=(20,10),
                 allow_flagging='never')

demo.queue()
demo.launch()
|
binary_index.py
ADDED
@@ -0,0 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from datasets import load_from_disk
|
2 |
+
import numpy as np
|
3 |
+
from faiss import IndexBinaryFlat, write_index_binary
|
4 |
+
from sentence_transformers.quantization import quantize_embeddings
|
5 |
+
|
6 |
+
import os
|
7 |
+
# Build the faiss binary index over the stored sentence embeddings and write
# it to "conala.index" in the current working directory.
path_to_vectorised_dataset = os.path.join(os.getcwd(), 'vectorized_dataset')

dataset = load_from_disk(path_to_vectorised_dataset)
embeddings = np.array(dataset["embedding"], dtype=np.float32)

ubinary_embeddings = quantize_embeddings(embeddings, "ubinary")

# IndexBinaryFlat is sized in bits, i.e. one bit per float dimension of the
# original embedding. Take it from the data instead of hard-coding 384 so the
# script keeps working if the embedding model changes.
embedding_dim = embeddings.shape[1]
index = IndexBinaryFlat(embedding_dim)
index.add(ubinary_embeddings)
write_index_binary(index, "conala.index")
|
conala.index
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:9168e2e320dd7d85ff9b01636a43dbfce18b1922cda518864c923366e88f0b8c
|
3 |
+
size 28506801
|
conala/cache-98d56627cf79bda0.arrow
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:0aba3bba56615d8284d26413efc5bb8a314814d8c2009d0a61a30cb718d6e576
|
3 |
+
size 1019270752
|
conala/data-00000-of-00001.arrow
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:84d9591a024c23ec27e4162ae208c0b7fe657a8923760a95aa353b1f478c0b4f
|
3 |
+
size 104616624
|
conala/dataset_info.json
ADDED
@@ -0,0 +1,58 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"builder_name": "parquet",
|
3 |
+
"citation": "@inproceedings{yin2018learning,\n title={Learning to mine aligned code and natural language pairs from stack overflow},\n author={Yin, Pengcheng and Deng, Bowen and Chen, Edgar and Vasilescu, Bogdan and Neubig, Graham},\n booktitle={2018 IEEE/ACM 15th international conference on mining software repositories (MSR)},\n pages={476--486},\n year={2018},\n organization={IEEE}\n}\n",
|
4 |
+
"config_name": "mined",
|
5 |
+
"dataset_name": "conala",
|
6 |
+
"dataset_size": 104561297,
|
7 |
+
"description": "CoNaLa is a dataset of code and natural language pairs crawled from Stack Overflow, for more details please refer to this paper: https://arxiv.org/pdf/1805.08949.pdf or the dataset page https://conala-corpus.github.io/.\n",
|
8 |
+
"download_checksums": {
|
9 |
+
"hf://datasets/neulab/conala@798cef31a9b480d9c31aed21e745c9e485ed2647/mined/train/0000.parquet": {
|
10 |
+
"num_bytes": 74356953,
|
11 |
+
"checksum": null
|
12 |
+
}
|
13 |
+
},
|
14 |
+
"download_size": 74356953,
|
15 |
+
"features": {
|
16 |
+
"question_id": {
|
17 |
+
"dtype": "int64",
|
18 |
+
"_type": "Value"
|
19 |
+
},
|
20 |
+
"parent_answer_post_id": {
|
21 |
+
"dtype": "int64",
|
22 |
+
"_type": "Value"
|
23 |
+
},
|
24 |
+
"prob": {
|
25 |
+
"dtype": "float64",
|
26 |
+
"_type": "Value"
|
27 |
+
},
|
28 |
+
"snippet": {
|
29 |
+
"dtype": "string",
|
30 |
+
"_type": "Value"
|
31 |
+
},
|
32 |
+
"intent": {
|
33 |
+
"dtype": "string",
|
34 |
+
"_type": "Value"
|
35 |
+
},
|
36 |
+
"id": {
|
37 |
+
"dtype": "string",
|
38 |
+
"_type": "Value"
|
39 |
+
}
|
40 |
+
},
|
41 |
+
"homepage": "https://conala-corpus.github.io/",
|
42 |
+
"license": "",
|
43 |
+
"size_in_bytes": 178918250,
|
44 |
+
"splits": {
|
45 |
+
"train": {
|
46 |
+
"name": "train",
|
47 |
+
"num_bytes": 104561297,
|
48 |
+
"num_examples": 593891,
|
49 |
+
"dataset_name": "conala"
|
50 |
+
}
|
51 |
+
},
|
52 |
+
"version": {
|
53 |
+
"version_str": "1.1.0",
|
54 |
+
"major": 1,
|
55 |
+
"minor": 1,
|
56 |
+
"patch": 0
|
57 |
+
}
|
58 |
+
}
|
conala/state.json
ADDED
@@ -0,0 +1,20 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"_data_files": [
|
3 |
+
{
|
4 |
+
"filename": "data-00000-of-00001.arrow"
|
5 |
+
}
|
6 |
+
],
|
7 |
+
"_fingerprint": "cd29c6cc7b846bee",
|
8 |
+
"_format_columns": [
|
9 |
+
"question_id",
|
10 |
+
"parent_answer_post_id",
|
11 |
+
"prob",
|
12 |
+
"snippet",
|
13 |
+
"intent",
|
14 |
+
"id"
|
15 |
+
],
|
16 |
+
"_format_kwargs": {},
|
17 |
+
"_format_type": null,
|
18 |
+
"_output_all_columns": false,
|
19 |
+
"_split": null
|
20 |
+
}
|
conala_int8_usearch.index
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:32e4de66d49ef092bf814e753769b84aae93bb6068b24c5d0826a9e44ad32917
|
3 |
+
size 316247436
|
create_mbedding-vecotor.ipynb
ADDED
@@ -0,0 +1,189 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"cells": [
|
3 |
+
{
|
4 |
+
"cell_type": "code",
|
5 |
+
"execution_count": null,
|
6 |
+
"metadata": {},
|
7 |
+
"outputs": [],
|
8 |
+
"source": []
|
9 |
+
},
|
10 |
+
{
|
11 |
+
"cell_type": "markdown",
|
12 |
+
"metadata": {},
|
13 |
+
"source": [
|
14 |
+
"# Import Libraries"
|
15 |
+
]
|
16 |
+
},
|
17 |
+
{
|
18 |
+
"cell_type": "code",
|
19 |
+
"execution_count": 1,
|
20 |
+
"metadata": {},
|
21 |
+
"outputs": [
|
22 |
+
{
|
23 |
+
"name": "stderr",
|
24 |
+
"output_type": "stream",
|
25 |
+
"text": [
|
26 |
+
"/home/swastikm/work/senetence-transformer-in-action-main/semantic_search_env/lib/python3.10/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
|
27 |
+
" from .autonotebook import tqdm as notebook_tqdm\n"
|
28 |
+
]
|
29 |
+
}
|
30 |
+
],
|
31 |
+
"source": [
|
32 |
+
"from sentence_transformers import SentenceTransformer"
|
33 |
+
]
|
34 |
+
},
|
35 |
+
{
|
36 |
+
"cell_type": "markdown",
|
37 |
+
"metadata": {},
|
38 |
+
"source": [
|
39 |
+
"## Load Dataset"
|
40 |
+
]
|
41 |
+
},
|
42 |
+
{
|
43 |
+
"cell_type": "code",
|
44 |
+
"execution_count": 2,
|
45 |
+
"metadata": {},
|
46 |
+
"outputs": [],
|
47 |
+
"source": [
|
48 |
+
"from datasets import load_from_disk\n",
|
49 |
+
"import os"
|
50 |
+
]
|
51 |
+
},
|
52 |
+
{
|
53 |
+
"cell_type": "code",
|
54 |
+
"execution_count": 3,
|
55 |
+
"metadata": {},
|
56 |
+
"outputs": [],
|
57 |
+
"source": [
|
58 |
+
"base_path = os.getcwd()\n",
|
59 |
+
"dataset_folder = \"conala\"\n",
|
60 |
+
"path_to_dataset = os.path.join(base_path,dataset_folder)\n",
|
61 |
+
"dataset = load_from_disk(path_to_dataset)"
|
62 |
+
]
|
63 |
+
},
|
64 |
+
{
|
65 |
+
"cell_type": "markdown",
|
66 |
+
"metadata": {},
|
67 |
+
"source": [
|
68 |
+
"# Create vector embedding"
|
69 |
+
]
|
70 |
+
},
|
71 |
+
{
|
72 |
+
"cell_type": "code",
|
73 |
+
"execution_count": 4,
|
74 |
+
"metadata": {},
|
75 |
+
"outputs": [
|
76 |
+
{
|
77 |
+
"name": "stderr",
|
78 |
+
"output_type": "stream",
|
79 |
+
"text": [
|
80 |
+
"/home/swastikm/work/senetence-transformer-in-action-main/semantic_search_env/lib/python3.10/site-packages/huggingface_hub/file_download.py:1132: FutureWarning: `resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. If you want to force a new download, use `force_download=True`.\n",
|
81 |
+
" warnings.warn(\n"
|
82 |
+
]
|
83 |
+
}
|
84 |
+
],
|
85 |
+
"source": [
|
86 |
+
"model = SentenceTransformer(\"sentence-transformers/all-MiniLM-L6-v2\")"
|
87 |
+
]
|
88 |
+
},
|
89 |
+
{
|
90 |
+
"cell_type": "code",
|
91 |
+
"execution_count": 5,
|
92 |
+
"metadata": {},
|
93 |
+
"outputs": [],
|
94 |
+
"source": [
|
95 |
+
"def get_embeddings(examples):\n",
|
96 |
+
" vectors = {}\n",
|
97 |
+
" model_input = examples['intent']\n",
|
98 |
+
" out = model.encode(model_input)\n",
|
99 |
+
" vectors['embedding'] = out\n",
|
100 |
+
" return vectors"
|
101 |
+
]
|
102 |
+
},
|
103 |
+
{
|
104 |
+
"cell_type": "code",
|
105 |
+
"execution_count": 6,
|
106 |
+
"metadata": {},
|
107 |
+
"outputs": [
|
108 |
+
{
|
109 |
+
"name": "stderr",
|
110 |
+
"output_type": "stream",
|
111 |
+
"text": [
|
112 |
+
"Map: 100%|██████████| 593891/593891 [20:14<00:00, 488.98 examples/s]\n"
|
113 |
+
]
|
114 |
+
}
|
115 |
+
],
|
116 |
+
"source": [
|
117 |
+
"vectorized_dataset = dataset.map(get_embeddings,batched=True)"
|
118 |
+
]
|
119 |
+
},
|
120 |
+
{
|
121 |
+
"cell_type": "code",
|
122 |
+
"execution_count": 7,
|
123 |
+
"metadata": {},
|
124 |
+
"outputs": [
|
125 |
+
{
|
126 |
+
"data": {
|
127 |
+
"text/plain": [
|
128 |
+
"Dataset({\n",
|
129 |
+
" features: ['question_id', 'parent_answer_post_id', 'prob', 'snippet', 'intent', 'id', 'embedding'],\n",
|
130 |
+
" num_rows: 593891\n",
|
131 |
+
"})"
|
132 |
+
]
|
133 |
+
},
|
134 |
+
"execution_count": 7,
|
135 |
+
"metadata": {},
|
136 |
+
"output_type": "execute_result"
|
137 |
+
}
|
138 |
+
],
|
139 |
+
"source": [
|
140 |
+
"vectorized_dataset"
|
141 |
+
]
|
142 |
+
},
|
143 |
+
{
|
144 |
+
"cell_type": "code",
|
145 |
+
"execution_count": 8,
|
146 |
+
"metadata": {},
|
147 |
+
"outputs": [
|
148 |
+
{
|
149 |
+
"name": "stderr",
|
150 |
+
"output_type": "stream",
|
151 |
+
"text": [
|
152 |
+
"Saving the dataset (3/3 shards): 100%|██████████| 593891/593891 [00:08<00:00, 69697.72 examples/s] \n"
|
153 |
+
]
|
154 |
+
}
|
155 |
+
],
|
156 |
+
"source": [
|
157 |
+
"vectorized_dataset.save_to_disk('vectorized_dataset')"
|
158 |
+
]
|
159 |
+
},
|
160 |
+
{
|
161 |
+
"cell_type": "code",
|
162 |
+
"execution_count": null,
|
163 |
+
"metadata": {},
|
164 |
+
"outputs": [],
|
165 |
+
"source": []
|
166 |
+
}
|
167 |
+
],
|
168 |
+
"metadata": {
|
169 |
+
"kernelspec": {
|
170 |
+
"display_name": "semactic_env",
|
171 |
+
"language": "python",
|
172 |
+
"name": "python3"
|
173 |
+
},
|
174 |
+
"language_info": {
|
175 |
+
"codemirror_mode": {
|
176 |
+
"name": "ipython",
|
177 |
+
"version": 3
|
178 |
+
},
|
179 |
+
"file_extension": ".py",
|
180 |
+
"mimetype": "text/x-python",
|
181 |
+
"name": "python",
|
182 |
+
"nbconvert_exporter": "python",
|
183 |
+
"pygments_lexer": "ipython3",
|
184 |
+
"version": "3.10.12"
|
185 |
+
}
|
186 |
+
},
|
187 |
+
"nbformat": 4,
|
188 |
+
"nbformat_minor": 2
|
189 |
+
}
|
query_search.ipynb
ADDED
@@ -0,0 +1,303 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"cells": [
|
3 |
+
{
|
4 |
+
"cell_type": "code",
|
5 |
+
"execution_count": null,
|
6 |
+
"metadata": {},
|
7 |
+
"outputs": [],
|
8 |
+
"source": []
|
9 |
+
},
|
10 |
+
{
|
11 |
+
"cell_type": "code",
|
12 |
+
"execution_count": null,
|
13 |
+
"metadata": {},
|
14 |
+
"outputs": [],
|
15 |
+
"source": []
|
16 |
+
},
|
17 |
+
{
|
18 |
+
"cell_type": "markdown",
|
19 |
+
"metadata": {},
|
20 |
+
"source": [
|
21 |
+
"# Import Libraries"
|
22 |
+
]
|
23 |
+
},
|
24 |
+
{
|
25 |
+
"cell_type": "code",
|
26 |
+
"execution_count": 1,
|
27 |
+
"metadata": {},
|
28 |
+
"outputs": [
|
29 |
+
{
|
30 |
+
"name": "stderr",
|
31 |
+
"output_type": "stream",
|
32 |
+
"text": [
|
33 |
+
"/home/swastikm/work/senetence-transformer-in-action-main/semantic_search_env/lib/python3.10/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
|
34 |
+
" from .autonotebook import tqdm as notebook_tqdm\n"
|
35 |
+
]
|
36 |
+
}
|
37 |
+
],
|
38 |
+
"source": [
|
39 |
+
"import time\n",
|
40 |
+
"from datasets import load_from_disk\n",
|
41 |
+
"import pandas as pd\n",
|
42 |
+
"from sentence_transformers import SentenceTransformer\n",
|
43 |
+
"from sentence_transformers.quantization import quantize_embeddings\n",
|
44 |
+
"import faiss\n",
|
45 |
+
"from usearch.index import Index\n",
|
46 |
+
"import numpy as np\n",
|
47 |
+
"import os"
|
48 |
+
]
|
49 |
+
},
|
50 |
+
{
|
51 |
+
"cell_type": "markdown",
|
52 |
+
"metadata": {},
|
53 |
+
"source": [
|
54 |
+
"# Load Dataset"
|
55 |
+
]
|
56 |
+
},
|
57 |
+
{
|
58 |
+
"cell_type": "code",
|
59 |
+
"execution_count": 2,
|
60 |
+
"metadata": {},
|
61 |
+
"outputs": [],
|
62 |
+
"source": [
|
63 |
+
"base_path = os.getcwd()\n",
|
64 |
+
"full_path = os.path.join(base_path, 'conala')"
|
65 |
+
]
|
66 |
+
},
|
67 |
+
{
|
68 |
+
"cell_type": "code",
|
69 |
+
"execution_count": 3,
|
70 |
+
"metadata": {},
|
71 |
+
"outputs": [],
|
72 |
+
"source": [
|
73 |
+
"conala_dataset = load_from_disk(full_path)"
|
74 |
+
]
|
75 |
+
},
|
76 |
+
{
|
77 |
+
"cell_type": "markdown",
|
78 |
+
"metadata": {},
|
79 |
+
"source": [
|
80 |
+
"```Text\n",
|
81 |
+
"Load the int8 and binary indices. Int8 is loaded as a view to save memory, as we never actually perform search with it.\n",
|
82 |
+
"Int8 embedding is required to perform rescoring of fetched document. Rescoring is done by performing inner product with F32 embedding of Query\n",
|
83 |
+
"```\n",
|
84 |
+
"[Efficient Passage Retrieval with Hashing for Open-domain Question Answering](https://arxiv.org/abs/2106.00882)"
|
85 |
+
]
|
86 |
+
},
|
87 |
+
{
|
88 |
+
"cell_type": "code",
|
89 |
+
"execution_count": 4,
|
90 |
+
"metadata": {},
|
91 |
+
"outputs": [],
|
92 |
+
"source": [
|
93 |
+
"# Load the int8 and binary indices. Int8 is loaded as a view to save memory, as we never actually perform search with it.\n",
|
94 |
+
"int8_view = Index.restore(os.path.join(base_path, 'conala_int8_usearch.index'), view=True)\n",
|
95 |
+
"binary_index: faiss.IndexBinaryFlat = faiss.read_index_binary(os.path.join(base_path, 'conala.index'))"
|
96 |
+
]
|
97 |
+
},
|
98 |
+
{
|
99 |
+
"cell_type": "markdown",
|
100 |
+
"metadata": {},
|
101 |
+
"source": [
|
102 |
+
"# Import Model to generate embedding"
|
103 |
+
]
|
104 |
+
},
|
105 |
+
{
|
106 |
+
"cell_type": "code",
|
107 |
+
"execution_count": 5,
|
108 |
+
"metadata": {},
|
109 |
+
"outputs": [
|
110 |
+
{
|
111 |
+
"name": "stderr",
|
112 |
+
"output_type": "stream",
|
113 |
+
"text": [
|
114 |
+
"/home/swastikm/work/senetence-transformer-in-action-main/semantic_search_env/lib/python3.10/site-packages/huggingface_hub/file_download.py:1132: FutureWarning: `resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. If you want to force a new download, use `force_download=True`.\n",
|
115 |
+
" warnings.warn(\n"
|
116 |
+
]
|
117 |
+
}
|
118 |
+
],
|
119 |
+
"source": [
|
120 |
+
"model = SentenceTransformer(\"sentence-transformers/all-MiniLM-L6-v2\")"
|
121 |
+
]
|
122 |
+
},
|
123 |
+
{
|
124 |
+
"cell_type": "code",
|
125 |
+
"execution_count": 6,
|
126 |
+
"metadata": {},
|
127 |
+
"outputs": [],
|
128 |
+
"source": [
|
129 |
+
"def search(query, top_k: int = 20):\n",
|
130 |
+
" # 1. Embed the query as float32\n",
|
131 |
+
" query_embedding = model.encode(query)\n",
|
132 |
+
"\n",
|
133 |
+
" # 2. Quantize the query to ubinary. To perform actual search with faiss\n",
|
134 |
+
" query_embedding_ubinary = quantize_embeddings(query_embedding.reshape(1, -1), \"ubinary\")\n",
|
135 |
+
"\n",
|
136 |
+
"\n",
|
137 |
+
" # 3. Search the binary index \n",
|
138 |
+
" index = binary_index\n",
|
139 |
+
" _scores, binary_ids = index.search(query_embedding_ubinary, top_k)\n",
|
140 |
+
" binary_ids = binary_ids[0]\n",
|
141 |
+
"\n",
|
142 |
+
"\n",
|
143 |
+
" # 4. Load the corresponding int8 embeddings. To perform rescoring to calculate score of fetched documents.\n",
|
144 |
+
" int8_embeddings = int8_view[binary_ids].astype(int)\n",
|
145 |
+
"\n",
|
146 |
+
" # 5. Rescore the top_k * rescore_multiplier using the float32 query embedding and the int8 document embeddings\n",
|
147 |
+
" scores = query_embedding @ int8_embeddings.T\n",
|
148 |
+
"\n",
|
149 |
+
" # 6. Sort the scores and return the top_k\n",
|
150 |
+
" start_time = time.time()\n",
|
151 |
+
" indices = scores.argsort()[::-1][:top_k]\n",
|
152 |
+
" top_k_indices = binary_ids[indices]\n",
|
153 |
+
" top_k_scores = scores[indices]\n",
|
154 |
+
"\n",
|
155 |
+
" top_k_codes = conala_dataset[top_k_indices]\n",
|
156 |
+
"\n",
|
157 |
+
" return top_k_codes\n"
|
158 |
+
]
|
159 |
+
},
|
160 |
+
{
|
161 |
+
"cell_type": "code",
|
162 |
+
"execution_count": 7,
|
163 |
+
"metadata": {},
|
164 |
+
"outputs": [],
|
165 |
+
"source": [
|
166 |
+
"user_prompt = input('Enter python coding query')\n",
|
167 |
+
"top_k_outputs = search(user_prompt)"
|
168 |
+
]
|
169 |
+
},
|
170 |
+
{
|
171 |
+
"cell_type": "code",
|
172 |
+
"execution_count": 8,
|
173 |
+
"metadata": {},
|
174 |
+
"outputs": [],
|
175 |
+
"source": [
|
176 |
+
"probs = top_k_outputs['prob']\n",
|
177 |
+
"snippets = top_k_outputs['snippet']"
|
178 |
+
]
|
179 |
+
},
|
180 |
+
{
|
181 |
+
"cell_type": "code",
|
182 |
+
"execution_count": 9,
|
183 |
+
"metadata": {},
|
184 |
+
"outputs": [],
|
185 |
+
"source": [
|
186 |
+
"idx = np.argsort(probs)[::-1]\n",
|
187 |
+
"results = np.array(snippets)[idx]"
|
188 |
+
]
|
189 |
+
},
|
190 |
+
{
|
191 |
+
"cell_type": "code",
|
192 |
+
"execution_count": 10,
|
193 |
+
"metadata": {},
|
194 |
+
"outputs": [
|
195 |
+
{
|
196 |
+
"data": {
|
197 |
+
"text/plain": [
|
198 |
+
"['[x[1] for x in elements]',\n",
|
199 |
+
" 'map(itemgetter(1), elements)',\n",
|
200 |
+
" 'zip(*elements)[1]']"
|
201 |
+
]
|
202 |
+
},
|
203 |
+
"execution_count": 10,
|
204 |
+
"metadata": {},
|
205 |
+
"output_type": "execute_result"
|
206 |
+
}
|
207 |
+
],
|
208 |
+
"source": [
|
209 |
+
"filtered_results = []\n",
|
210 |
+
"for item in results:\n",
|
211 |
+
" if len(filtered_results)<3:\n",
|
212 |
+
" if item not in filtered_results:\n",
|
213 |
+
" filtered_results.append(item)\n",
|
214 |
+
"filtered_results"
|
215 |
+
]
|
216 |
+
},
|
217 |
+
{
|
218 |
+
"cell_type": "code",
|
219 |
+
"execution_count": 11,
|
220 |
+
"metadata": {},
|
221 |
+
"outputs": [],
|
222 |
+
"source": [
|
223 |
+
"output_template = \"User Query: {user_query}\\nBelow are some examples of previous conversations.\\nQuery: {query1} Solution: {solution1}\\nQuery: {query2} Solution: {solution2}\\nYou may use the above examples for reference only. Create your own solution and provide only the solution\""
|
224 |
+
]
|
225 |
+
},
|
226 |
+
{
|
227 |
+
"cell_type": "code",
|
228 |
+
"execution_count": 12,
|
229 |
+
"metadata": {},
|
230 |
+
"outputs": [],
|
231 |
+
"source": [
|
232 |
+
"output_template = \"The top three most relevant code snippets from the database are:\\n\\n1. {snippet1}\\n\\n2. {snippet2}\\n\\n3. {snippet3}\""
|
233 |
+
]
|
234 |
+
},
|
235 |
+
{
|
236 |
+
"cell_type": "code",
|
237 |
+
"execution_count": 13,
|
238 |
+
"metadata": {},
|
239 |
+
"outputs": [],
|
240 |
+
"source": [
|
241 |
+
"output = f'{output_template.format(snippet1=filtered_results[0],snippet2=filtered_results[1],snippet3=filtered_results[2])}'"
|
242 |
+
]
|
243 |
+
},
|
244 |
+
{
|
245 |
+
"cell_type": "code",
|
246 |
+
"execution_count": 14,
|
247 |
+
"metadata": {},
|
248 |
+
"outputs": [
|
249 |
+
{
|
250 |
+
"name": "stdout",
|
251 |
+
"output_type": "stream",
|
252 |
+
"text": [
|
253 |
+
"The top three most relevant code snippets from the database are:\n",
|
254 |
+
"\n",
|
255 |
+
"1. [x[1] for x in elements]\n",
|
256 |
+
"\n",
|
257 |
+
"2. map(itemgetter(1), elements)\n",
|
258 |
+
"\n",
|
259 |
+
"3. zip(*elements)[1]\n"
|
260 |
+
]
|
261 |
+
}
|
262 |
+
],
|
263 |
+
"source": [
|
264 |
+
"print(output)"
|
265 |
+
]
|
266 |
+
},
|
267 |
+
{
|
268 |
+
"cell_type": "code",
|
269 |
+
"execution_count": null,
|
270 |
+
"metadata": {},
|
271 |
+
"outputs": [],
|
272 |
+
"source": []
|
273 |
+
},
|
274 |
+
{
|
275 |
+
"cell_type": "code",
|
276 |
+
"execution_count": null,
|
277 |
+
"metadata": {},
|
278 |
+
"outputs": [],
|
279 |
+
"source": []
|
280 |
+
}
|
281 |
+
],
|
282 |
+
"metadata": {
|
283 |
+
"kernelspec": {
|
284 |
+
"display_name": "Python 3",
|
285 |
+
"language": "python",
|
286 |
+
"name": "python3"
|
287 |
+
},
|
288 |
+
"language_info": {
|
289 |
+
"codemirror_mode": {
|
290 |
+
"name": "ipython",
|
291 |
+
"version": 3
|
292 |
+
},
|
293 |
+
"file_extension": ".py",
|
294 |
+
"mimetype": "text/x-python",
|
295 |
+
"name": "python",
|
296 |
+
"nbconvert_exporter": "python",
|
297 |
+
"pygments_lexer": "ipython3",
|
298 |
+
"version": "3.10.12"
|
299 |
+
}
|
300 |
+
},
|
301 |
+
"nbformat": 4,
|
302 |
+
"nbformat_minor": 2
|
303 |
+
}
|
requirements.txt
ADDED
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
einops==0.8.0
|
2 |
+
datasets==2.19.0
|
3 |
+
faiss-cpu==1.8.0
|
4 |
+
sentence-transformers==2.7.0
|
5 |
+
usearch==2.12.0
|
6 |
+
gradio==4.28.3
|
retrieve_dataset.ipynb
ADDED
@@ -0,0 +1,125 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"cells": [
|
3 |
+
{
|
4 |
+
"cell_type": "code",
|
5 |
+
"execution_count": null,
|
6 |
+
"metadata": {},
|
7 |
+
"outputs": [],
|
8 |
+
"source": []
|
9 |
+
},
|
10 |
+
{
|
11 |
+
"cell_type": "code",
|
12 |
+
"execution_count": 1,
|
13 |
+
"metadata": {},
|
14 |
+
"outputs": [
|
15 |
+
{
|
16 |
+
"name": "stderr",
|
17 |
+
"output_type": "stream",
|
18 |
+
"text": [
|
19 |
+
"/home/swastikm/work/senetence-transformer-in-action-main/semantic_search_env/lib/python3.10/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
|
20 |
+
" from .autonotebook import tqdm as notebook_tqdm\n"
|
21 |
+
]
|
22 |
+
}
|
23 |
+
],
|
24 |
+
"source": [
|
25 |
+
"import datasets\n",
|
26 |
+
"from datasets import load_dataset"
|
27 |
+
]
|
28 |
+
},
|
29 |
+
{
|
30 |
+
"cell_type": "code",
|
31 |
+
"execution_count": 2,
|
32 |
+
"metadata": {},
|
33 |
+
"outputs": [],
|
34 |
+
"source": [
|
35 |
+
"raw_datasets = load_dataset(\"neulab/conala\", \"mined\", trust_remote_code=True)"
|
36 |
+
]
|
37 |
+
},
|
38 |
+
{
|
39 |
+
"cell_type": "code",
|
40 |
+
"execution_count": 3,
|
41 |
+
"metadata": {},
|
42 |
+
"outputs": [
|
43 |
+
{
|
44 |
+
"data": {
|
45 |
+
"text/plain": [
|
46 |
+
"DatasetDict({\n",
|
47 |
+
" train: Dataset({\n",
|
48 |
+
" features: ['question_id', 'parent_answer_post_id', 'prob', 'snippet', 'intent', 'id'],\n",
|
49 |
+
" num_rows: 593891\n",
|
50 |
+
" })\n",
|
51 |
+
"})"
|
52 |
+
]
|
53 |
+
},
|
54 |
+
"execution_count": 3,
|
55 |
+
"metadata": {},
|
56 |
+
"output_type": "execute_result"
|
57 |
+
}
|
58 |
+
],
|
59 |
+
"source": [
|
60 |
+
"raw_datasets"
|
61 |
+
]
|
62 |
+
},
|
63 |
+
{
|
64 |
+
"cell_type": "code",
|
65 |
+
"execution_count": 4,
|
66 |
+
"metadata": {},
|
67 |
+
"outputs": [],
|
68 |
+
"source": [
|
69 |
+
"raw_datasets = datasets.concatenate_datasets([raw_datasets[\"train\"]])"
|
70 |
+
]
|
71 |
+
},
|
72 |
+
{
|
73 |
+
"cell_type": "code",
|
74 |
+
"execution_count": 5,
|
75 |
+
"metadata": {},
|
76 |
+
"outputs": [
|
77 |
+
{
|
78 |
+
"name": "stderr",
|
79 |
+
"output_type": "stream",
|
80 |
+
"text": [
|
81 |
+
"Saving the dataset (1/1 shards): 100%|██████████| 593891/593891 [00:00<00:00, 1373525.97 examples/s]\n"
|
82 |
+
]
|
83 |
+
}
|
84 |
+
],
|
85 |
+
"source": [
|
86 |
+
"raw_datasets.save_to_disk(\"conala\")"
|
87 |
+
]
|
88 |
+
},
|
89 |
+
{
|
90 |
+
"cell_type": "code",
|
91 |
+
"execution_count": null,
|
92 |
+
"metadata": {},
|
93 |
+
"outputs": [],
|
94 |
+
"source": []
|
95 |
+
},
|
96 |
+
{
|
97 |
+
"cell_type": "code",
|
98 |
+
"execution_count": null,
|
99 |
+
"metadata": {},
|
100 |
+
"outputs": [],
|
101 |
+
"source": []
|
102 |
+
}
|
103 |
+
],
|
104 |
+
"metadata": {
|
105 |
+
"kernelspec": {
|
106 |
+
"display_name": "semantic_search_env",
|
107 |
+
"language": "python",
|
108 |
+
"name": "python3"
|
109 |
+
},
|
110 |
+
"language_info": {
|
111 |
+
"codemirror_mode": {
|
112 |
+
"name": "ipython",
|
113 |
+
"version": 3
|
114 |
+
},
|
115 |
+
"file_extension": ".py",
|
116 |
+
"mimetype": "text/x-python",
|
117 |
+
"name": "python",
|
118 |
+
"nbconvert_exporter": "python",
|
119 |
+
"pygments_lexer": "ipython3",
|
120 |
+
"version": "3.10.12"
|
121 |
+
}
|
122 |
+
},
|
123 |
+
"nbformat": 4,
|
124 |
+
"nbformat_minor": 2
|
125 |
+
}
|
save_int8_index.py
ADDED
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Build an int8-quantized USearch index over the vectorized dataset.
#
# Steps: load the dataset saved under ./vectorized_dataset (relative to the
# current working directory), quantize its "embedding" column to int8, add
# every vector to an inner-product USearch index keyed by row position, and
# save the index to "conala_int8_usearch.index".
import os

import numpy as np
from datasets import load_from_disk
from sentence_transformers.quantization import quantize_embeddings
from usearch.index import Index

# The dataset directory is resolved against the working directory, so the
# script has to be launched from the folder that contains it.
path_to_vectorised_dataset = os.path.join(os.getcwd(),'vectorized_dataset')

dataset = load_from_disk(path_to_vectorised_dataset)
embeddings = np.array(dataset["embedding"], dtype=np.float32)

# Fail early with a clear message instead of an opaque error from the
# quantizer or the index when the column is empty or not a matrix.
if embeddings.ndim != 2 or embeddings.shape[0] == 0:
    raise ValueError(
        "expected a non-empty 2-D embedding matrix, got shape "
        f"{embeddings.shape}"
    )

# NOTE(review): no explicit ranges / calibration embeddings are passed, so the
# int8 buckets are presumably derived from this corpus itself; query vectors
# would then need to be quantized against the same corpus - confirm at the
# query site.
int8_embeddings = quantize_embeddings(embeddings, "int8")

# Take the dimensionality from the data rather than hard-coding 384, so the
# script keeps working if the embedding model (and its output size) changes.
embedding_dim = int8_embeddings.shape[1]
index = Index(ndim=embedding_dim, metric="ip", dtype="i8")

# Keys are the row positions, so a search hit maps straight back to a row of
# the dataset.
index.add(np.arange(len(int8_embeddings)), int8_embeddings)
index.save("conala_int8_usearch.index")
|
vectorized_dataset/data-00000-of-00003.arrow
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:bfa90c3110ec533d52d8e89cc4bc140cc5bda6e0b8687108e6d7cd051092c8de
|
3 |
+
size 334430032
|
vectorized_dataset/data-00001-of-00003.arrow
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:7f5f342829b8bdc85295656da9ea17ffea90519c47161505a5d0eb7084ecb881
|
3 |
+
size 340242056
|
vectorized_dataset/data-00002-of-00003.arrow
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:dd6209626da7cf6dfc257a73354aece4bd593afe2ec669683889fff7a98cb356
|
3 |
+
size 344600608
|
vectorized_dataset/dataset_info.json
ADDED
@@ -0,0 +1,65 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"builder_name": "parquet",
|
3 |
+
"citation": "@inproceedings{yin2018learning,\n title={Learning to mine aligned code and natural language pairs from stack overflow},\n author={Yin, Pengcheng and Deng, Bowen and Chen, Edgar and Vasilescu, Bogdan and Neubig, Graham},\n booktitle={2018 IEEE/ACM 15th international conference on mining software repositories (MSR)},\n pages={476--486},\n year={2018},\n organization={IEEE}\n}\n",
|
4 |
+
"config_name": "mined",
|
5 |
+
"dataset_name": "conala",
|
6 |
+
"dataset_size": 104561297,
|
7 |
+
"description": "CoNaLa is a dataset of code and natural language pairs crawled from Stack Overflow, for more details please refer to this paper: https://arxiv.org/pdf/1805.08949.pdf or the dataset page https://conala-corpus.github.io/.\n",
|
8 |
+
"download_checksums": {
|
9 |
+
"hf://datasets/neulab/conala@798cef31a9b480d9c31aed21e745c9e485ed2647/mined/train/0000.parquet": {
|
10 |
+
"num_bytes": 74356953,
|
11 |
+
"checksum": null
|
12 |
+
}
|
13 |
+
},
|
14 |
+
"download_size": 74356953,
|
15 |
+
"features": {
|
16 |
+
"question_id": {
|
17 |
+
"dtype": "int64",
|
18 |
+
"_type": "Value"
|
19 |
+
},
|
20 |
+
"parent_answer_post_id": {
|
21 |
+
"dtype": "int64",
|
22 |
+
"_type": "Value"
|
23 |
+
},
|
24 |
+
"prob": {
|
25 |
+
"dtype": "float64",
|
26 |
+
"_type": "Value"
|
27 |
+
},
|
28 |
+
"snippet": {
|
29 |
+
"dtype": "string",
|
30 |
+
"_type": "Value"
|
31 |
+
},
|
32 |
+
"intent": {
|
33 |
+
"dtype": "string",
|
34 |
+
"_type": "Value"
|
35 |
+
},
|
36 |
+
"id": {
|
37 |
+
"dtype": "string",
|
38 |
+
"_type": "Value"
|
39 |
+
},
|
40 |
+
"embedding": {
|
41 |
+
"feature": {
|
42 |
+
"dtype": "float32",
|
43 |
+
"_type": "Value"
|
44 |
+
},
|
45 |
+
"_type": "Sequence"
|
46 |
+
}
|
47 |
+
},
|
48 |
+
"homepage": "https://conala-corpus.github.io/",
|
49 |
+
"license": "",
|
50 |
+
"size_in_bytes": 178918250,
|
51 |
+
"splits": {
|
52 |
+
"train": {
|
53 |
+
"name": "train",
|
54 |
+
"num_bytes": 104561297,
|
55 |
+
"num_examples": 593891,
|
56 |
+
"dataset_name": "conala"
|
57 |
+
}
|
58 |
+
},
|
59 |
+
"version": {
|
60 |
+
"version_str": "1.1.0",
|
61 |
+
"major": 1,
|
62 |
+
"minor": 1,
|
63 |
+
"patch": 0
|
64 |
+
}
|
65 |
+
}
|
vectorized_dataset/state.json
ADDED
@@ -0,0 +1,27 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"_data_files": [
|
3 |
+
{
|
4 |
+
"filename": "data-00000-of-00003.arrow"
|
5 |
+
},
|
6 |
+
{
|
7 |
+
"filename": "data-00001-of-00003.arrow"
|
8 |
+
},
|
9 |
+
{
|
10 |
+
"filename": "data-00002-of-00003.arrow"
|
11 |
+
}
|
12 |
+
],
|
13 |
+
"_fingerprint": "98d56627cf79bda0",
|
14 |
+
"_format_columns": [
|
15 |
+
"embedding",
|
16 |
+
"id",
|
17 |
+
"intent",
|
18 |
+
"parent_answer_post_id",
|
19 |
+
"prob",
|
20 |
+
"question_id",
|
21 |
+
"snippet"
|
22 |
+
],
|
23 |
+
"_format_kwargs": {},
|
24 |
+
"_format_type": null,
|
25 |
+
"_output_all_columns": false,
|
26 |
+
"_split": null
|
27 |
+
}
|