Spaces:
Running
Running
gabriel lopez
committed on
Commit
•
b1709c2
1
Parent(s):
b920efd
first
Browse files
- .gitignore +5 -0
- LICENSE +0 -0
- README.rst +0 -0
- arxiv_tool/app.py +34 -0
- arxiv_tool/core.py +74 -0
- arxiv_tool/plot.py +40 -0
- data/arxiv.csv +0 -0
- requirements.txt +10 -0
.gitignore
ADDED
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
data/ARXIV_CSV.ipynb
|
2 |
+
data/arxiv-metadata-oai-snapshot.json
|
3 |
+
__pycache__
|
4 |
+
data/.ipynb_checkpoints
|
5 |
+
*~
|
LICENSE
ADDED
File without changes
|
README.rst
ADDED
File without changes
|
arxiv_tool/app.py
ADDED
@@ -0,0 +1,34 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import gradio as gr
|
2 |
+
from core import SentenceEncoder
|
3 |
+
from plot import EmbeddingPlotter
|
4 |
+
|
5 |
+
# Static texts handed to the Gradio interface (title, description, footer).
TITLE = "Search tool for ArXiv papers"
DESCRIPTION = "<center>Find your most beloved ArXiv papers!</center>"
# Example queries passed to the interface as `examples`.
EXAMPLES = [
    "RoBERTa optimisation",
    "Permutation invariant AI models",
    "Gradient descent",
    "Black hole information theory",
]
# HTML snippet passed to the interface as `article`.
ARTICLE = r"<center>Done by dr. Gabriel Lopez<br> For more please visit: <a href='https://sites.google.com/view/dr-gabriel-lopez/home'>My Page</a></center>"
|
9 |
+
|
10 |
+
# interface function

# (df, model, embeddings) as returned by SentenceEncoder.load_and_encode();
# filled on the first query so the CSV is read and the titles are encoded
# only once instead of on every request.
_ENCODED_CORPUS = None


def _get_encoded_corpus():
    """Return the cached (df, model, embeddings) triple, building it on first use."""
    global _ENCODED_CORPUS
    if _ENCODED_CORPUS is None:
        _ENCODED_CORPUS = SentenceEncoder().load_and_encode()
    return _ENCODED_CORPUS


def search_and_plot(querry):
    """Search the papers for `querry` and build the embedding plots.

    Args:
        querry: Free-text search string typed by the user.

    Returns:
        Tuple ``(table, fig1, fig2)``: the ``title`` and ``similarity``
        columns of the search result, followed by the two figures produced
        by ``EmbeddingPlotter().transform``.
    """
    # search
    df, model, embeddings = _get_encoded_corpus()
    # Work on a copy: transform() and the plotter add columns to the frame,
    # and the cached frame should stay as load_and_encode() returned it.
    df, result = SentenceEncoder().transform(df.copy(), querry, model, embeddings)
    # plot
    fig1, fig2 = EmbeddingPlotter().transform(df, embeddings)
    return result[['title', 'similarity']], fig1, fig2
|
18 |
+
|
19 |
+
# gradio elements
in_textbox = gr.Textbox(
    label="Search on ArXiv:",
    placeholder="what do you want to learn today?...",
    lines=1,
)
# in_examples = gr.Examples(examples=["BERT optimization", "Gradient descent", "Black hole information theory"], inputs=in_textbox)
out_dataframe = gr.DataFrame(label="Most similar papers on ArXiv:")
out_plot_sphere = gr.Plot(label="Embedding projection over a unit sphere")
# Created with visible=False, but still wired as the third output below
# because search_and_plot returns three values.
out_plot_projected_sphere = gr.Plot(
    label="Lambert-conformal projection over a plane",
    visible=False,
)

# launch interface
gr.Interface(
    fn=search_and_plot,
    inputs=in_textbox,
    outputs=[out_dataframe, out_plot_sphere, out_plot_projected_sphere],
    examples=EXAMPLES,
    title=TITLE,
    description=DESCRIPTION,
    article=ARTICLE,
).launch(share=True)
|
arxiv_tool/core.py
ADDED
@@ -0,0 +1,74 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import pandas as pd
|
2 |
+
import numpy as np
|
3 |
+
import nmslib
|
4 |
+
from sentence_transformers import SentenceTransformer
|
5 |
+
# TODO: Use pipe, remove embeddings
|
6 |
+
|
7 |
+
|
8 |
+
class SentenceEncoder:
    """Encode paper titles and a query, then retrieve the closest papers.

    Papers are ranked by cosine similarity between sentence embeddings; the
    search returns the papers with the highest similarity (i.e. the smallest
    cosine distance) to the query.
    """

    # Columns of the search-result frame, in output order. Fixing them up
    # front keeps the frame well-formed even when the search returns nothing.
    _RESULT_COLUMNS = ['index', 'title', 'abstract', 'similarity']

    def load_and_encode(self):
        """Load the papers table and embed every title.

        Returns:
            Tuple ``(df, model, embeddings)``: the papers frame with an added
            ``embeddings`` column, the sentence-transformer model, and the
            title embeddings as returned by ``model.encode``.
        """
        # load
        df = self._load()
        # encode
        return self._encode_papers(df)

    def transform(self, df, querry, model, embeddings):
        """Run a similarity search for `querry` and annotate `df` for plotting.

        Args:
            df: Papers frame; needs ``title``, ``abstract`` and ``categories``.
            querry: Free-text search string.
            model: Object whose ``encode`` method embeds a list of strings.
            embeddings: Title embeddings, one row per row of `df`.

        Returns:
            Tuple ``(df, result)``: `df` with the plotting columns added, and
            the frame of nearest papers.
        """
        # encode the query
        emb_querry = self._encode_query(querry, model)
        # search
        result = self._make_search(df, emb_querry, embeddings)
        # add_relevant_columns
        df = self._add_relevant_columns(df, result)
        return df, result

    def _load(self):
        """Read the papers table from ``data/arxiv.csv``."""
        return pd.read_csv("data/arxiv.csv")

    def _encode_papers(self, df):
        """Embed the ``title`` column and store the vectors on `df`.

        Returns:
            Tuple ``(df, model, embeddings)``.
        """
        checkpoint = 'distilbert-base-uncased'
        model = SentenceTransformer(checkpoint)
        # Pass a plain list of strings rather than the Series itself.
        embeddings = model.encode(df['title'].tolist(), convert_to_tensor=True)
        # embeddings column
        # NOTE(review): np.array() on the tensor presumably requires it to
        # live on the CPU — confirm before running on a GPU.
        df['embeddings'] = np.array(embeddings).tolist()
        return df, model, embeddings

    def _encode_query(self, querry, model):
        """Embed the query string; the query is wrapped in a one-element list."""
        return model.encode([querry])

    # Backward-compatible alias for the original (misspelled) method name.
    _econde_querry = _encode_query

    def _make_search(self, df, emb_querry, embeddings):
        """Build an HNSW cosine index over `embeddings` and query it.

        The index is rebuilt on every call.

        Returns:
            Frame with the 10 nearest papers (see `_extract_search_result`).
        """
        # initialize a new index, using a HNSW index on Cosine Similarity
        index = nmslib.init(method='hnsw', space='cosinesimil')
        index.addDataPointBatch(embeddings)
        index.createIndex({'post': 2}, print_progress=True)
        # search
        return self._extract_search_result(index, emb_querry, df, k=10)

    def _extract_search_result(self, index, emb_querry, df, k):
        """Query `index` for the `k` nearest papers and tabulate them.

        Args:
            index: Object exposing ``knnQuery(vector, k=...)`` that returns
                ``(ids, distances)``.
            emb_querry: Embedded query passed through to ``knnQuery``.
            df: Papers frame providing ``title`` and ``abstract``.
            k: Number of neighbours to request.

        Returns:
            Frame with columns ``index``, ``title``, ``abstract`` and
            ``similarity`` (``1.0 - distance``); empty but with the same
            columns when the index returns no neighbours.
        """
        idx, distances = index.knnQuery(emb_querry, k=k)
        rows = [
            {
                'index': i,
                'title': df['title'][i],
                'abstract': df['abstract'][i],
                'similarity': 1.0 - dist,
            }
            for i, dist in zip(idx, distances)
        ]
        return pd.DataFrame(rows, columns=self._RESULT_COLUMNS)

    def _add_relevant_columns(self, df, result):
        """Add the columns used for plotting to `df` (modified in place).

        Adds ``categories_parsed`` (first listed category, truncated at the
        first dot), ``index_papers`` (the frame index) and ``selected``
        (whether the row appears in `result`).

        Returns:
            The same `df`, for chaining.
        """
        # get categories: first whitespace-separated entry, then the part
        # before the first '.'; missing values stay missing instead of raising
        first_category = df['categories'].str.split().str[0]
        df['categories_parsed'] = first_category.str.split('.').str[0]
        # create columns for plotting
        df['index_papers'] = df.index
        df['selected'] = df['index_papers'].isin(result['index'])
        return df
|
72 |
+
|
73 |
+
|
74 |
+
|
arxiv_tool/plot.py
ADDED
@@ -0,0 +1,40 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from sklearn.manifold import TSNE
|
2 |
+
import umap
|
3 |
+
import plotly.express as px
|
4 |
+
from pandas import DataFrame
|
5 |
+
import numpy as np
|
6 |
+
|
7 |
+
class EmbeddingPlotter:
    """Reduce embeddings to two angles on a sphere with UMAP and plot them.

    The embeddings are fitted with a haversine output metric, converted to
    Cartesian coordinates on the unit sphere, and additionally flattened to a
    plane; both views are drawn with Plotly, coloured by paper category.
    """

    def transform(self, df, embeddings):
        """Compute the projections for `embeddings` and return both figures.

        Args:
            df: Papers frame; must contain ``categories_parsed``.
            embeddings: Array-like of embedding vectors, one per row of `df`.

        Returns:
            Tuple ``(fig1, fig2)``: the 3D sphere scatter and the 2D scatter.
        """
        df = self.umap_embedding(df, embeddings)
        return self.plot(df)

    @staticmethod
    def _sphere_to_cartesian(theta, phi):
        """Convert the two sphere angles to (x, y, z) on the unit sphere."""
        sin_theta = np.sin(theta)
        return sin_theta * np.cos(phi), sin_theta * np.sin(phi), np.cos(theta)

    @staticmethod
    def _cartesian_to_plane(x, y, z):
        """Flatten unit-sphere points to the plane: (arctan2(x, y), -arccos(z))."""
        return np.arctan2(x, y), -np.arccos(z)

    def umap_embedding(self, df, embeddings):
        """Fit UMAP on `embeddings` and add the projection columns to `df`.

        Adds ``spherical_emb_X/Y/Z`` and ``lambert_conformal_emb_x/y`` to `df`
        (modified in place) and returns it.
        """
        # UMAP - Spherical
        sphere_mapper = umap.UMAP(output_metric='haversine', random_state=42).fit(np.array(embeddings))
        # Slice the two fitted coordinates once instead of on every use.
        theta = sphere_mapper.embedding_[:, 0]
        phi = sphere_mapper.embedding_[:, 1]
        x, y, z = self._sphere_to_cartesian(theta, phi)
        df['spherical_emb_X'] = x
        df['spherical_emb_Y'] = y
        df['spherical_emb_Z'] = z
        # UMAP - Lambert Conformal
        plane_x, plane_y = self._cartesian_to_plane(x, y, z)
        df['lambert_conformal_emb_x'] = plane_x
        df['lambert_conformal_emb_y'] = plane_y
        return df

    def plot(self, df):
        """Build the 3D sphere scatter and the 2D projected scatter.

        Returns:
            Tuple ``(fig1, fig2)`` of Plotly figures, both coloured by
            ``categories_parsed``.
        """
        # on the 3d sphere
        fig1 = px.scatter_3d(
            df,
            x='spherical_emb_X',
            y='spherical_emb_Y',
            z='spherical_emb_Z',
            color="categories_parsed",
        )
        # on the projected sphere
        fig2 = px.scatter(
            data_frame=df,
            x='lambert_conformal_emb_x',
            y='lambert_conformal_emb_y',
            color="categories_parsed",
        )
        return fig1, fig2
|
data/arxiv.csv
ADDED
The diff for this file is too large to render.
See raw diff
|
|
requirements.txt
ADDED
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
gradio==3.9.1
|
2 |
+
nmslib==2.1.1
|
3 |
+
numpy==1.19.5
|
4 |
+
pandas==1.1.4
|
5 |
+
plotly==5.11.0
|
6 |
+
regex==2022.10.31
|
7 |
+
scikit_learn==1.1.3
|
8 |
+
sentence_transformers==2.2.2
|
9 |
+
umap==0.1.1
|
10 |
+
umap_learn==0.4.6
|