Spaces:

DrGabrielLopez
/

arXiv-tool

Running

App Files Files Community

gabriel lopez committed on Nov 13, 2022

Commit

b1709c2

•

1 Parent(s): b920efd

first

Browse files

Files changed (8) hide show

.gitignore +5 -0
LICENSE +0 -0
README.rst +0 -0
arxiv_tool/app.py +34 -0
arxiv_tool/core.py +74 -0
arxiv_tool/plot.py +40 -0
data/arxiv.csv +0 -0
requirements.txt +10 -0

.gitignore ADDED Viewed

	@@ -0,0 +1,5 @@

+data/ARXIV_CSV.ipynb
+data/arxiv-metadata-oai-snapshot.json
+__pycache__
+data/.ipynb_checkpoints
+*~

LICENSE ADDED Viewed

File without changes

README.rst ADDED Viewed

File without changes

arxiv_tool/app.py ADDED Viewed

	@@ -0,0 +1,34 @@

+import gradio as gr
+from core import SentenceEncoder
+from plot import EmbeddingPlotter
+TITLE = "Search tool for ArXiv papers"
+DESCRIPTION = "<center>Find your most beloved ArXiv papers!</center>"
+EXAMPLES=["RoBERTa optimisation", "Permutation invariant AI models", "Gradient descent", "Black hole information theory"]
+ARTICLE = r"<center>Done by dr. Gabriel Lopez<br> For more please visit: <a href='https://sites.google.com/view/dr-gabriel-lopez/home'>My Page</a></center>"
+# interface function
+def search_and_plot(querry):
+    # search
+    df, model, embeddings = SentenceEncoder().load_and_encode()
+    df, result = SentenceEncoder().transform(df, querry, model, embeddings)
+    # plot
+    fig1, fig2 = EmbeddingPlotter().transform(df, embeddings)
+    return result[['title', 'similarity']], fig1, fig2
+# gradio elements
+in_textbox = gr.Textbox(label="Search on ArXiv:", placeholder="what do you want to learn today?...", lines=1)
+# in_examples = gr.Examples(examples=["BERT optimization", "Gradient descent", "Black hole information theory"], inputs=in_textbox)
+out_dataframe = gr.DataFrame(label="Most similar papers on ArXiv:")
+out_plot_sphere = gr.Plot(label="Embedding projection over a unit sphere")
+out_plot_projected_sphere = gr.Plot(label="Lambert-conformal projection over a plane", visible=False)
+# launch interface
+gr.Interface(inputs=in_textbox,
+             outputs=[out_dataframe,out_plot_sphere,out_plot_projected_sphere],
+             examples=EXAMPLES,
+             fn=search_and_plot,
+             title=TITLE,
+             description=DESCRIPTION,
+             article=ARTICLE,
+             ).launch(share=True)

arxiv_tool/core.py ADDED Viewed

	@@ -0,0 +1,74 @@

+import pandas as pd
+import numpy as np
+import nmslib
+from sentence_transformers import SentenceTransformer
+# TODO: Use pipe, remove embeddings
+class SentenceEncoder:
+    """ Encodes the querry and papers data set and finds elements with the lowest cosine similarity """
+    def load_and_encode(self):
+      # load
+      df = self._load()
+      # encode
+      df, model, embeddings = self._encode_papers(df)
+      return df, model, embeddings
+    def transform(self, df, querry, model, embeddings):
+      # create_index
+      emb_querry = self._econde_querry(querry, model)
+      # search
+      result = self._make_search(df,emb_querry, embeddings)
+      # add_relevant_columns
+      df = self._add_relevant_columns(df, result)
+      return df, result
+    def _load(self):
+      # Load data
+      df = pd.read_csv("data/arxiv.csv")
+      return df
+    def _encode_papers(self,df):
+      # Encode the papers title
+      checkpoint = 'distilbert-base-uncased'
+      model = SentenceTransformer(checkpoint)
+      embeddings = model.encode(df['title'], convert_to_tensor=True)
+      # embeddings column
+      df['embeddings'] = np.array(embeddings).tolist()
+      return df, model, embeddings
+    def _econde_querry(self,querry, model):
+      # Encode the querry
+      emb_querry = model.encode([querry])
+      return emb_querry
+    def _make_search(self, df, emb_querry, embeddings):
+      # initialize a new index, using a HNSW index on Cosine Similarity
+      index = nmslib.init(method='hnsw', space='cosinesimil')
+      index.addDataPointBatch(embeddings)
+      index.createIndex({'post': 2}, print_progress=True)
+      # search
+      result = self._extract_search_result(index, emb_querry, df, k=10)
+      return result
+    def _extract_search_result(self,index, emb_querry, df, k):
+      data = []
+      idx, distances = index.knnQuery(emb_querry, k=k)
+      for i, j in zip(idx, distances):
+        data.append({'index': i,
+                    'title': df.title[i],
+                    'abstract': df.abstract[i],
+                    'similarity': 1.0 - j})
+      return pd.DataFrame(data)
+    def _add_relevant_columns(self, df, result):
+      # get categories
+      df['categories_parsed'] = df.categories.str.split().apply(lambda x: x[0]).str.split('.').apply(lambda x: x[0])
+      # create columns for plotting
+      df['index_papers'] = df.index
+      df['selected'] = df.index_papers.apply(lambda x: x in list(result['index']) )
+      return df

arxiv_tool/plot.py ADDED Viewed

	@@ -0,0 +1,40 @@

+from sklearn.manifold import TSNE
+import umap
+import plotly.express as px
+from pandas import DataFrame
+import numpy as np
+class EmbeddingPlotter:
+    """ Lower the dimensionality of the representation from 768 -> 2, over the surface of the sphere """
+    def transform(self, df, embeddings):
+        df = self.umap_embedding(df, embeddings)
+        fig1, fig2 = self.plot(df)
+        return fig1, fig2
+    def umap_embedding(self, df, embeddings):
+        # UMAP - Spherical
+        sphere_mapper = umap.UMAP(output_metric='haversine', random_state=42).fit(np.array(embeddings))
+        df['spherical_emb_X'] = np.sin(sphere_mapper.embedding_[:,0])*np.cos(sphere_mapper.embedding_[:,1])
+        df['spherical_emb_Y'] = np.sin(sphere_mapper.embedding_[:,0])*np.sin(sphere_mapper.embedding_[:,1])
+        df['spherical_emb_Z'] = np.cos(sphere_mapper.embedding_[:,0])
+        # UMAP - Lambert Conformal
+        df['lambert_conformal_emb_x'] = np.arctan2(df['spherical_emb_X'], df['spherical_emb_Y'])
+        df['lambert_conformal_emb_y'] = -np.arccos(df['spherical_emb_Z'])
+        return df
+    def plot(self, df):
+        # on the 3d sphere
+        fig1 = px.scatter_3d(df,
+                    x='spherical_emb_X',
+                    y='spherical_emb_Y',
+                    z='spherical_emb_Z',
+                    color="categories_parsed")
+        # on the projected spehre
+        fig2 = px.scatter(data_frame=df ,
+                x='lambert_conformal_emb_x',
+                y='lambert_conformal_emb_y',
+                color="categories_parsed",
+                )
+        return fig1, fig2

data/arxiv.csv ADDED Viewed

The diff for this file is too large to render. See raw diff

requirements.txt ADDED Viewed

	@@ -0,0 +1,10 @@

+gradio==3.9.1
+nmslib==2.1.1
+numpy==1.19.5
+pandas==1.1.4
+plotly==5.11.0
+regex==2022.10.31
+scikit_learn==1.1.3
+sentence_transformers==2.2.2
+umap==0.1.1
+umap_learn==0.4.6