gabriel lopez committed on
Commit
b1709c2
1 Parent(s): b920efd
Files changed (8) hide show
  1. .gitignore +5 -0
  2. LICENSE +0 -0
  3. README.rst +0 -0
  4. arxiv_tool/app.py +34 -0
  5. arxiv_tool/core.py +74 -0
  6. arxiv_tool/plot.py +40 -0
  7. data/arxiv.csv +0 -0
  8. requirements.txt +10 -0
.gitignore ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ data/ARXIV_CSV.ipynb
2
+ data/arxiv-metadata-oai-snapshot.json
3
+ __pycache__
4
+ data/.ipynb_checkpoints
5
+ *~
LICENSE ADDED
File without changes
README.rst ADDED
File without changes
arxiv_tool/app.py ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ from core import SentenceEncoder
3
+ from plot import EmbeddingPlotter
4
+
5
# Static text shown by the interface.
TITLE = "Search tool for ArXiv papers"
DESCRIPTION = "<center>Find your most beloved ArXiv papers!</center>"

# Sample queries offered to the user.
EXAMPLES = [
    "RoBERTa optimisation",
    "Permutation invariant AI models",
    "Gradient descent",
    "Black hole information theory",
]

# Footer HTML; the adjacent raw literals are joined into a single string.
ARTICLE = (
    r"<center>Done by dr. Gabriel Lopez<br> "
    r"For more please visit: "
    r"<a href='https://sites.google.com/view/dr-gabriel-lopez/home'>My Page</a>"
    r"</center>"
)
9
+
10
+ # interface function
11
# Query-independent corpus state, built on first use and then reused.
_CORPUS_CACHE = None


def _load_corpus():
    """Return the cached ``(df, model, embeddings)`` triple, building it once.

    ``SentenceEncoder.load_and_encode`` reads the CSV, instantiates the
    transformer and encodes every title. None of that depends on the query,
    so it is done a single time per process instead of once per search.
    """
    global _CORPUS_CACHE
    if _CORPUS_CACHE is None:
        _CORPUS_CACHE = SentenceEncoder().load_and_encode()
    return _CORPUS_CACHE


def search_and_plot(querry):
    """Run one search and build the two embedding plots.

    Args:
        querry: Free-text search string typed by the user.

    Returns:
        A 3-tuple: the ``title``/``similarity`` columns of the search result,
        the 3D sphere figure, and the 2D projected figure.
    """
    # search
    df, model, embeddings = _load_corpus()
    # transform() and the plotter add columns to the frame they are given;
    # work on a copy so the cached frame is not modified between requests.
    df, result = SentenceEncoder().transform(df.copy(), querry, model, embeddings)
    # plot
    fig1, fig2 = EmbeddingPlotter().transform(df, embeddings)
    return result[['title', 'similarity']], fig1, fig2
18
+
19
# --- gradio elements ---
# Single-line text box holding the search query.
in_textbox = gr.Textbox(
    label="Search on ArXiv:",
    placeholder="what do you want to learn today?...",
    lines=1,
)
# Table of hits plus the two embedding figures; the 2D figure is created
# hidden (visible=False).
out_dataframe = gr.DataFrame(label="Most similar papers on ArXiv:")
out_plot_sphere = gr.Plot(label="Embedding projection over a unit sphere")
out_plot_projected_sphere = gr.Plot(
    label="Lambert-conformal projection over a plane",
    visible=False,
)

# --- launch interface ---
demo = gr.Interface(
    fn=search_and_plot,
    inputs=in_textbox,
    outputs=[out_dataframe, out_plot_sphere, out_plot_projected_sphere],
    examples=EXAMPLES,
    title=TITLE,
    description=DESCRIPTION,
    article=ARTICLE,
)
demo.launch(share=True)
arxiv_tool/core.py ADDED
@@ -0,0 +1,74 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ import numpy as np
3
+ import nmslib
4
+ from sentence_transformers import SentenceTransformer
5
+ # TODO: Use pipe, remove embeddings
6
+
7
+
8
class SentenceEncoder:
    """Encode paper titles and a query, then retrieve the closest titles.

    Retrieval uses an nmslib HNSW index in cosine space, and each hit's
    ``similarity`` is reported as ``1 - distance``. The papers returned are
    therefore the ones with the *highest* cosine similarity to the query.
    """

    # Fixed column layout of the search result, so that an empty result
    # still exposes the columns callers select.
    _RESULT_COLUMNS = ['index', 'title', 'abstract', 'similarity']

    def load_and_encode(self):
        """Load the papers CSV and encode every title.

        Returns:
            Tuple ``(df, model, embeddings)`` as produced by
            ``_encode_papers``.
        """
        df = self._load()
        df, model, embeddings = self._encode_papers(df)
        return df, model, embeddings

    def transform(self, df, querry, model, embeddings):
        """Search ``df`` for the titles closest to ``querry``.

        Args:
            df: Papers frame; plotting columns are added to it in place.
            querry: Free-text search string.
            model: Object exposing ``encode`` (as returned by
                ``load_and_encode``).
            embeddings: Title embeddings, one row per row of ``df``.

        Returns:
            Tuple ``(df, result)``: the augmented papers frame and the frame
            of hits (``index``, ``title``, ``abstract``, ``similarity``).
        """
        emb_querry = self._econde_querry(querry, model)
        result = self._make_search(df, emb_querry, embeddings)
        df = self._add_relevant_columns(df, result)
        return df, result

    @staticmethod
    def _as_numpy(values):
        """Return ``values`` as a NumPy array, copying tensors to host first.

        Objects exposing ``detach`` (e.g. torch tensors) are detached and
        moved to CPU before conversion, since a device tensor cannot be
        converted directly.
        """
        if hasattr(values, 'detach'):
            values = values.detach().cpu().numpy()
        return np.asarray(values)

    def _load(self):
        """Read the papers data set from ``data/arxiv.csv``."""
        return pd.read_csv("data/arxiv.csv")

    def _encode_papers(self, df):
        """Encode the ``title`` column and store the vectors on ``df``.

        Returns:
            Tuple ``(df, model, embeddings)``; ``df`` gains an ``embeddings``
            column holding one list of floats per row.
        """
        checkpoint = 'distilbert-base-uncased'
        model = SentenceTransformer(checkpoint)
        # Hand over a plain list so the encoder never relies on the Series'
        # index labels when it accesses individual sentences.
        embeddings = model.encode(df['title'].tolist(), convert_to_tensor=True)
        # embeddings column
        df['embeddings'] = self._as_numpy(embeddings).tolist()
        return df, model, embeddings

    def _econde_querry(self, querry, model):
        """Encode the query string as a one-element batch."""
        return model.encode([querry])

    def _make_search(self, df, emb_querry, embeddings):
        """Build an HNSW cosine index over ``embeddings`` and query it.

        Returns:
            Frame with the 10 nearest papers (see ``_extract_search_result``).
        """
        # initialize a new index, using a HNSW index on Cosine Similarity
        index = nmslib.init(method='hnsw', space='cosinesimil')
        index.addDataPointBatch(self._as_numpy(embeddings))
        index.createIndex({'post': 2}, print_progress=True)
        return self._extract_search_result(index, emb_querry, df, k=10)

    def _extract_search_result(self, index, emb_querry, df, k):
        """Query ``index`` and collect the ``k`` nearest papers.

        Args:
            index: Object exposing ``knnQuery(vector, k=...)`` that returns
                ``(ids, distances)``.
            emb_querry: Encoded query; flattened to a single vector.
            df: Papers frame with ``title`` and ``abstract`` columns.
            k: Number of neighbours requested.

        Returns:
            Frame with columns ``index``, ``title``, ``abstract`` and
            ``similarity`` (``1 - distance``); empty but with those columns
            when there are no hits.
        """
        # The query was encoded as a batch of one; the lookup is for a
        # single vector.
        query_vector = self._as_numpy(emb_querry).reshape(-1)
        idx, distances = index.knnQuery(query_vector, k=k)
        titles = df['title']
        abstracts = df['abstract']
        # The ids are positions in the indexed batch, so look rows up by
        # position rather than by index label.
        data = [
            {
                'index': i,
                'title': titles.iloc[i],
                'abstract': abstracts.iloc[i],
                'similarity': 1.0 - j,
            }
            for i, j in zip(idx, distances)
        ]
        return pd.DataFrame(data, columns=self._RESULT_COLUMNS)

    def _add_relevant_columns(self, df, result):
        """Add the columns used for plotting to ``df`` (in place).

        Adds ``categories_parsed`` (first category, text before the first
        dot), ``index_papers`` (copy of the index) and ``selected`` (whether
        the row appears in ``result['index']``). Missing or empty category
        strings yield a missing value instead of raising.
        """
        # get categories
        first_category = df['categories'].str.split().str[0]
        df['categories_parsed'] = first_category.str.split('.').str[0]
        # create columns for plotting
        df['index_papers'] = df.index
        df['selected'] = df['index_papers'].isin(result['index'])
        return df
72
+
73
+
74
+
arxiv_tool/plot.py ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from sklearn.manifold import TSNE
2
+ import umap
3
+ import plotly.express as px
4
+ from pandas import DataFrame
5
+ import numpy as np
6
+
7
class EmbeddingPlotter:
    """Embed the vectors on a sphere with UMAP and draw them with Plotly."""

    def transform(self, df, embeddings):
        """Compute the spherical embedding and return both figures.

        Args:
            df: Papers frame; must carry a ``categories_parsed`` column for
                colouring. Embedding columns are added to it in place.
            embeddings: One vector per row of ``df``.

        Returns:
            Tuple ``(fig1, fig2)``: the 3D sphere scatter and the 2D scatter.
        """
        df = self.umap_embedding(df, embeddings)
        fig1, fig2 = self.plot(df)
        return fig1, fig2

    @staticmethod
    def _as_numpy(values):
        """Return ``values`` as a NumPy array, copying tensors to host first.

        Objects exposing ``detach`` (e.g. torch tensors) are detached and
        moved to CPU before conversion, since a device tensor cannot be
        converted directly.
        """
        if hasattr(values, 'detach'):
            values = values.detach().cpu().numpy()
        return np.asarray(values)

    @staticmethod
    def _sphere_to_cartesian(theta, phi):
        """Map the two embedding angles to unit-sphere ``(x, y, z)``."""
        sin_theta = np.sin(theta)
        return sin_theta * np.cos(phi), sin_theta * np.sin(phi), np.cos(theta)

    def umap_embedding(self, df, embeddings):
        """Fit UMAP with a haversine output metric and store coordinates.

        Adds ``spherical_emb_X/Y/Z`` and ``lambert_conformal_emb_x/y`` to
        ``df`` and returns it.
        """
        # UMAP - Spherical
        sphere_mapper = umap.UMAP(output_metric='haversine', random_state=42).fit(
            self._as_numpy(embeddings)
        )
        theta = sphere_mapper.embedding_[:, 0]
        phi = sphere_mapper.embedding_[:, 1]
        x, y, z = self._sphere_to_cartesian(theta, phi)
        df['spherical_emb_X'] = x
        df['spherical_emb_Y'] = y
        df['spherical_emb_Z'] = z
        # UMAP - planar projection derived from the sphere coordinates
        df['lambert_conformal_emb_x'] = np.arctan2(df['spherical_emb_X'], df['spherical_emb_Y'])
        df['lambert_conformal_emb_y'] = -np.arccos(df['spherical_emb_Z'])
        return df

    def plot(self, df):
        """Build the 3D and 2D scatter figures, coloured by category."""
        # on the 3d sphere
        fig1 = px.scatter_3d(
            df,
            x='spherical_emb_X',
            y='spherical_emb_Y',
            z='spherical_emb_Z',
            color="categories_parsed",
        )
        # on the projected sphere
        fig2 = px.scatter(
            data_frame=df,
            x='lambert_conformal_emb_x',
            y='lambert_conformal_emb_y',
            color="categories_parsed",
        )
        return fig1, fig2
data/arxiv.csv ADDED
The diff for this file is too large to render. See raw diff
 
requirements.txt ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ gradio==3.9.1
2
+ nmslib==2.1.1
3
+ numpy==1.19.5
4
+ pandas==1.1.4
5
+ plotly==5.11.0
6
+ regex==2022.10.31
7
+ scikit_learn==1.1.3
8
+ sentence_transformers==2.2.2
9
+ umap==0.1.1
10
+ umap_learn==0.4.6