lotrlol ivan-savchuk committed on
Commit
31c5069
β€’
0 Parent(s):

Duplicate from ivan-savchuk/medical-search

Browse files

Co-authored-by: Ivan Savchuk <ivan-savchuk@users.noreply.huggingface.co>

Files changed (6) hide show
  1. .gitattributes +32 -0
  2. README.md +15 -0
  3. app.py +114 -0
  4. docs.json +0 -0
  5. idx_vectors.index +3 -0
  6. requirements.txt +3 -0
.gitattributes ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ftz filter=lfs diff=lfs merge=lfs -text
6
+ *.gz filter=lfs diff=lfs merge=lfs -text
7
+ *.h5 filter=lfs diff=lfs merge=lfs -text
8
+ *.joblib filter=lfs diff=lfs merge=lfs -text
9
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
10
+ *.model filter=lfs diff=lfs merge=lfs -text
11
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
12
+ *.npy filter=lfs diff=lfs merge=lfs -text
13
+ *.npz filter=lfs diff=lfs merge=lfs -text
14
+ *.onnx filter=lfs diff=lfs merge=lfs -text
15
+ *.ot filter=lfs diff=lfs merge=lfs -text
16
+ *.parquet filter=lfs diff=lfs merge=lfs -text
17
+ *.pickle filter=lfs diff=lfs merge=lfs -text
18
+ *.pkl filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pt filter=lfs diff=lfs merge=lfs -text
21
+ *.pth filter=lfs diff=lfs merge=lfs -text
22
+ *.rar filter=lfs diff=lfs merge=lfs -text
23
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
24
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
25
+ *.tflite filter=lfs diff=lfs merge=lfs -text
26
+ *.tgz filter=lfs diff=lfs merge=lfs -text
27
+ *.wasm filter=lfs diff=lfs merge=lfs -text
28
+ *.xz filter=lfs diff=lfs merge=lfs -text
29
+ *.zip filter=lfs diff=lfs merge=lfs -text
30
+ *.zstandard filter=lfs diff=lfs merge=lfs -text
31
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
32
+ idx_vectors.index filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: Medical Search
3
+ metaTitle: Medical Search
4
+ emoji: πŸ—‚πŸ—‚πŸ—‚
5
+ colorFrom: red
6
+ colorTo: blue
7
+ sdk: streamlit
8
+ sdk_version: 1.10.0
9
+ app_file: app.py
10
+ pinned: false
11
+ license: afl-3.0
12
+ duplicated_from: ivan-savchuk/medical-search
13
+ ---
14
+
15
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py ADDED
@@ -0,0 +1,114 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import json
3
+ import time
4
+ import faiss
5
+ from sentence_transformers import SentenceTransformer
6
+ from sentence_transformers.cross_encoder import CrossEncoder
7
+
8
+
class DocumentSearch:
    """Semantic document search over a pre-built medical corpus.

    Combines previously trained artifacts:
      * faiss: a FlatIP index over document vectors (``idx_path``),
      * sbert: a bi-encoder that embeds the query (``enc_path``),
      * sbert: a cross-encoder re-ranker (``cross_enc_path``) — currently
        disabled, presumably for latency; TODO confirm before re-enabling.
    """

    # Paths / model ids for every artifact needed to run the models
    # and search over our data.
    enc_path = "ivan-savchuk/msmarco-distilbert-dot-v5-tuned-full-v1"
    idx_path = "idx_vectors.index"
    cross_enc_path = "ivan-savchuk/cross-encoder-ms-marco-MiniLM-L-12-v2-tuned_mediqa-v1"
    docs_path = "docs.json"

    def __init__(self):
        """Load the corpus, the query encoder, and the faiss index."""
        # docs holds (text, url) pairs; explicit encoding avoids
        # platform-dependent decoding of the JSON corpus.
        with open(DocumentSearch.docs_path, "r", encoding="utf-8") as json_file:
            self.docs = json.load(json_file)

        # Bi-encoder mapping a query string to a dense vector.
        self.encoder = SentenceTransformer(DocumentSearch.enc_path)
        # Pre-built faiss index over the document vectors.
        self.index = faiss.read_index(DocumentSearch.idx_path)
        # Cross-encoder re-ranking is disabled; see class docstring.
        # self.cross_encoder = CrossEncoder(DocumentSearch.cross_enc_path)

    def search(self, query: str, k: int) -> list:
        """Return the top-``k`` documents most similar to ``query``.

        Each result is a dict ``{'doc': text, 'url': url, 'score': sim}``,
        ordered by decreasing inner-product similarity.
        """
        # Vector representation of the text query.
        query_vector = self.encoder.encode([query])
        # Over-fetch k*10 candidates so the (disabled) cross-encoder
        # re-ranker could be restored without touching this call site.
        # faiss returns hits already sorted by score, so the first k of
        # the candidates are the final answer — build only those dicts
        # instead of materializing all k*10 and slicing afterwards.
        distances, indices = self.index.search(query_vector, k * 10)

        return [
            {"doc": self.docs[idx][0], "url": self.docs[idx][1], "score": dist}
            for idx, dist in zip(indices[0][:k], distances[0][:k])
        ]
62
+
if __name__ == "__main__":
    # Build the search engine once at app start-up
    # (loads the corpus, the encoder model and the faiss index).
    surfer = DocumentSearch()

    # Streamlit part starts here with a centered, colored title.
    title = """
    <h1 style='
    text-align: center;
    color: #3CB371'>
    Medical Search
    </h1>
    """
    st.markdown(title, unsafe_allow_html=True)

    # Query input form: Streamlit re-runs the script on every submit.
    with st.form("my_form"):
        query = st.text_input("Enter query about our Medical Data",
                              placeholder="Type query here...",
                              max_chars=200)
        # Every form must have a submit button.
        submitted = st.form_submit_button("Search")

    if submitted:
        # perf_counter is a monotonic clock — the right tool for
        # measuring elapsed time (time.time can jump with clock changes).
        stt = time.perf_counter()
        # Retrieve the top 10 documents for the query.
        results = surfer.search(query, k=10)
        elapsed_time = round(time.perf_counter() - stt, 2)

        # Echo the query and how long the search took.
        st.write(f"**Results Related to:** \"{query}\" ({elapsed_time} sec.)")
        for i, answer in enumerate(results):
            # Each answer gets a header, a cropped preview,
            # and a link to the full document.
            st.subheader(f"Answer {i+1}")
            doc = answer["doc"][:250] + "..."
            url = answer["url"]
            st.markdown(f'{doc}\n[**Read More**]({url})\n', unsafe_allow_html=True)

        st.markdown("---")
        st.markdown("**Author:** Ivan Savchuk. 2022")
    else:
        # No query yet — show example queries instead of results.
        st.markdown("Typical queries looks like this: _**\"What is flu?\"**_, "
                    "_**\"How to cure breast cancer?\"**_, "
                    "_**\"I have headache, what should I do?\"**_")
docs.json ADDED
The diff for this file is too large to render. See raw diff
 
idx_vectors.index ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a2b9cc233e7d8bdb268acacf62de81179fded35d2fb41b96ba45b79191e54329
3
+ size 7366701
requirements.txt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ torch
2
+ sentence_transformers
3
+ faiss-cpu