v1.1.0
- app.py +2 -3
- lrt/clustering/clustering_pipeline.py +16 -13
- lrt/clustering/config.py +1 -1
- lrt/lrt.py +10 -13
- lrt/utils/dimension_reduction.py +17 -0
- lrt/utils/functions.py +5 -1
- lrt_instance/instances.py +2 -1
- scripts/tests/lrt_test_run.py +1 -1
- setup.py +1 -1
- widgets/body.py +16 -3
- widgets/sidebar.py +28 -8
app.py
CHANGED
@@ -1,12 +1,11 @@
 import streamlit as st
 from widgets import *
-from lrt_instance import *


 # [![github](https://img.kookapp.cn/assets/2022-09/1w4G0FIWGK00w00w.png)](https://github.com/Mondkuchen/idp_LiteratureResearch_Tool)

 # sidebar content
-platforms, number_papers,start_year,end_year,
+platforms, number_papers,start_year,end_year, clustering_params = render_sidebar()

 # body head
 with st.form("my_form",clear_on_submit=False):
@@ -26,7 +25,7 @@ with st.form("my_form",clear_on_submit=False):

 if submitted:
     # body
-    render_body(platforms, number_papers, 5, query_input, show_preview,start_year,end_year,
+    render_body(platforms, number_papers, 5, query_input, show_preview,start_year,end_year, clustering_params)
     # '''
     # bar = (
     #     Bar()
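Summary of the app-level change: render_sidebar() now returns a fifth value, a clustering_params dict, and render_body() takes it as a new trailing argument. A sketch of the new contract (runs inside the Streamlit app; the sample dict values are illustrative, with keys defined in widgets/sidebar.py below):

    from widgets import *  # render_sidebar, render_body

    platforms, number_papers, start_year, end_year, clustering_params = render_sidebar()
    # clustering_params is a dict like {'dimension_reduction': 'pca', 'max_k': 7}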
lrt/clustering/clustering_pipeline.py
CHANGED
@@ -1,9 +1,9 @@
 from typing import List
 from .config import BaselineConfig, Configuration
 from ..utils import __create_model__
-
+import numpy as np
 from sklearn.cluster import KMeans
-
+from yellowbrick.cluster import KElbowVisualizer
 from .clusters import ClusterList

 class ClusterPipeline:
@@ -15,7 +15,7 @@ class ClusterPipeline:

     def __setup__(self, config:Configuration):
         self.PTM = __create_model__(config.plm)
-        self.dimension_reduction = __create_model__(config.dimension_reduction)
+        self.dimension_reduction = __create_model__(config.dimension_reduction)
         self.clustering = __create_model__(config.clustering)
         self.keywords_extraction = __create_model__(config.keywords_extraction)

@@ -38,9 +38,11 @@ class ClusterPipeline:
         if self.dimension_reduction is None:
             return embeddings
         print(f'>>> start dimension reduction...')
+        embeddings = self.dimension_reduction.dimension_reduction(embeddings)
         print(f'>>> finished dimension reduction...')
+        return embeddings

-    def __3_clustering__(self, embeddings, return_cluster_centers = False,
+    def __3_clustering__(self, embeddings, return_cluster_centers = False, max_k: int =10):
         '''

         :param embeddings: Nxd
@@ -51,13 +53,14 @@ class ClusterPipeline:
         else:
             print(f'>>> start clustering...')
             model = KMeans()
-
-
-
-
-
-            #
-
+            visualizer = KElbowVisualizer(
+                model, k=(2, max_k+1), metric='silhouette', timings=False, locate_elbow=False
+            )
+
+            visualizer.fit(embeddings)
+            # visualizer.show()
+            best_k = visualizer.k_values_[np.argmax(np.array(visualizer.k_scores_))]
+            print(f'>>> The best K is {best_k}.')

             labels, cluster_centers = self.clustering(embeddings, k=best_k)
             clusters = ClusterList(best_k)
@@ -90,11 +93,11 @@ class ClusterPipeline:
         return clusters


-    def __call__(self, documents: List[str],
+    def __call__(self, documents: List[str], max_k:int):
         print(f'>>> pipeline starts...')
         x = self.__1_generate_word_embeddings__(documents)
         x = self.__2_dimenstion_reduction__(x)
-        clusters = self.__3_clustering__(x,
+        clusters = self.__3_clustering__(x,max_k=max_k)
         outputs = self.__4_keywords_extraction__(clusters, documents)
         print(f'>>> pipeline finished!\n')
         return outputs
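The substantive change in this file: instead of clustering with a fixed best_k, the pipeline now sweeps k from 2 to max_k and keeps the k with the highest silhouette score, using yellowbrick's KElbowVisualizer as the grid search. A self-contained sketch of that selection rule on toy data (assumes scikit-learn and yellowbrick are installed; the blob data is illustrative):

    import numpy as np
    from sklearn.cluster import KMeans
    from sklearn.datasets import make_blobs
    from yellowbrick.cluster import KElbowVisualizer

    # Toy stand-in for the Nxd sentence embeddings the pipeline produces.
    X, _ = make_blobs(n_samples=200, centers=4, n_features=16, random_state=0)

    visualizer = KElbowVisualizer(
        KMeans(), k=(2, 11), metric='silhouette', timings=False, locate_elbow=False
    )
    visualizer.fit(X)

    # Same rule as the diff: argmax over the silhouette scores across k.
    best_k = visualizer.k_values_[np.argmax(np.array(visualizer.k_scores_))]
    print(best_k)  # expect 4 on this toy data

Since metric='silhouette' is a higher-is-better score, taking the argmax is the right reduction; locate_elbow=False matters because the knee-point heuristic targets distortion-style curves rather than silhouette scores.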
lrt/clustering/config.py
CHANGED
@@ -8,4 +8,4 @@ class Configuration:

 class BaselineConfig(Configuration):
     def __init__(self):
-        super().__init__('''all-mpnet-base-v2''', 'none', 'kmeans-euclidean', 'keyphrase-transformer')
+        super().__init__('''all-mpnet-base-v2''', 'none', 'kmeans-euclidean', 'keyphrase-transformer')
lrt/lrt.py
CHANGED
@@ -46,8 +46,8 @@ class LiteratureResearchTool:
             num_papers: int,
             start_year: int,
             end_year: int,
+            max_k: int,
             platforms: List[str] = ['IEEE', 'Arxiv', 'Paper with Code'],
-            best_k: int = 5,
             loading_ctx_manager = None,
             ):

@@ -55,9 +55,9 @@ class LiteratureResearchTool:
         for platform in platforms:
             if loading_ctx_manager:
                 with loading_ctx_manager():
-                    clusters, articles = self.__platformPipeline__(platform,query,num_papers,start_year,end_year,
+                    clusters, articles = self.__platformPipeline__(platform,query,num_papers,start_year,end_year,max_k)
             else:
-                clusters, articles = self.__platformPipeline__(platform, query, num_papers, start_year, end_year,
+                clusters, articles = self.__platformPipeline__(platform, query, num_papers, start_year, end_year,max_k)

             clusters.sort()
             yield clusters,articles
@@ -69,7 +69,7 @@ class LiteratureResearchTool:
             num_papers: int,
             start_year: int,
             end_year: int,
-
+            max_k: int
     ) -> (ClusterList,ArticleList):

         @st.cache(hash_funcs={Tokenizer: Tokenizer.__hash__},allow_output_mutation=True)
@@ -78,12 +78,11 @@ class LiteratureResearchTool:
             num_papers: int,
             start_year: int,
             end_year: int,
-            best_k: int = 5
         ):
             articles = ArticleList.parse_ieee_articles(
                 self.literature_search.ieee(query, start_year, end_year, num_papers)) # ArticleList
             abstracts = articles.getAbstracts() # List[str]
-            clusters = self.cluster_pipeline(abstracts,
+            clusters = self.cluster_pipeline(abstracts,max_k)
             clusters = self.__postprocess_clusters__(clusters)
             return clusters, articles

@@ -91,12 +90,11 @@ class LiteratureResearchTool:
         def arxiv_process(
             query: str,
             num_papers: int,
-            best_k: int = 5
         ):
             articles = ArticleList.parse_arxiv_articles(
                 self.literature_search.arxiv(query, num_papers)) # ArticleList
             abstracts = articles.getAbstracts() # List[str]
-            clusters = self.cluster_pipeline(abstracts,
+            clusters = self.cluster_pipeline(abstracts,max_k)
             clusters = self.__postprocess_clusters__(clusters)
             return clusters, articles

@@ -104,21 +102,20 @@ class LiteratureResearchTool:
         def pwc_process(
             query: str,
             num_papers: int,
-            best_k: int = 5
         ):
             articles = ArticleList.parse_pwc_articles(
                 self.literature_search.paper_with_code(query, num_papers)) # ArticleList
             abstracts = articles.getAbstracts() # List[str]
-            clusters = self.cluster_pipeline(abstracts,
+            clusters = self.cluster_pipeline(abstracts,max_k)
             clusters = self.__postprocess_clusters__(clusters)
             return clusters, articles

         if platforn_name == 'IEEE':
-            return ieee_process(query,num_papers,start_year,end_year
+            return ieee_process(query,num_papers,start_year,end_year)
         elif platforn_name == 'Arxiv':
-            return arxiv_process(query,num_papers
+            return arxiv_process(query,num_papers)
         elif platforn_name == 'Paper with Code':
-            return pwc_process(query,num_papers
+            return pwc_process(query,num_papers)
         else:
             raise RuntimeError('This platform is not supported. Please open an issue on the GitHub.')
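API note: the fixed best_k: int = 5 is gone, and max_k is now a required parameter inserted before platforms, so positional callers need updating. A hypothetical call against the new signature (the leading query parameter is not visible in the hunk but is implied by the call sites):

    from lrt import LiteratureResearchTool

    lrt = LiteratureResearchTool()
    # __call__ is a generator: one (ClusterList, ArticleList) pair per platform,
    # with clusters already sorted before each yield.
    for clusters, articles in lrt('graph neural networks', 50, 2018, 2022,
                                  max_k=10, platforms=['Arxiv']):
        pass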
lrt/utils/dimension_reduction.py
ADDED
@@ -0,0 +1,17 @@
+from sklearn.decomposition import PCA as pca
+
+
+class BaseDimensionReduction:
+    def dimension_reduction(self,X):
+        raise NotImplementedError()
+
+class PCA(BaseDimensionReduction):
+    def __init__(self, n_components: int = 0.8, *args, **kwargs) -> None:
+        super().__init__()
+        self.pca = pca(n_components,*args,**kwargs)
+
+
+    def dimension_reduction(self, X):
+        self.pca.fit(X=X)
+        print(f'>>> The reduced dimension is {self.pca.n_components_}.')
+        return self.pca.transform(X)
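One thing worth knowing about the new module: scikit-learn's PCA interprets a float n_components in (0, 1) as a variance budget, picking the smallest number of components whose cumulative explained variance reaches it. So despite the int type hint, the default 0.8 keeps roughly 80% of the variance and the output dimension is data-dependent, which is why dimension_reduction prints n_components_ after fitting. A quick standalone check:

    import numpy as np
    from sklearn.decomposition import PCA

    rng = np.random.default_rng(0)
    X = rng.normal(size=(100, 64))  # stand-in for 100 sentence embeddings

    pca = PCA(n_components=0.8)     # keep ~80% of the variance
    X_red = pca.fit_transform(X)
    print(pca.n_components_, X_red.shape)  # chosen dimension depends on X

Also note that the class fits and transforms on the same batch each call, so the projection is re-learned per query rather than persisted.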
lrt/utils/functions.py
CHANGED
@@ -5,6 +5,7 @@ import torch
 from sklearn.cluster import KMeans
 from transformers import AutoTokenizer, AutoModelForSeq2SeqLM,Text2TextGenerationPipeline
 from inference_hf import InferenceHF
+from .dimension_reduction import PCA

 class Template:
     def __init__(self):
@@ -14,7 +15,7 @@ class Template:
             'all-mpnet-base-v2':'''sentence-transformers/all-mpnet-base-v2'''
         }
         self.dimension_reduction = {
-            'pca': None,
+            'pca': PCA,
             'vae': None,
             'cnn': None
         }
@@ -55,6 +56,9 @@ def __create_model__(model_ckpt):
             )
             return tmp[0].cpu().detach().numpy(), tmp[1].cpu().detach().numpy()
         return ret
+    elif model_ckpt == 'pca':
+        pca = template.dimension_reduction[model_ckpt](0.8)
+        return pca

     elif model_ckpt =='kmeans-euclidean':
         def ret(x,k):
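__create_model__ is a string-keyed factory, and its branches return different kinds of objects: the new 'pca' branch returns a PCA instance (the pipeline calls its .dimension_reduction(X) method), while 'kmeans-euclidean' returns a plain function. A trimmed sketch of that dispatch; the 'none' branch returning None is an assumption inferred from the `is None` check in clustering_pipeline.py:

    from lrt.utils.dimension_reduction import PCA

    def create_model(model_ckpt: str):
        # Simplified stand-in for lrt.utils.functions.__create_model__.
        if model_ckpt == 'pca':
            return PCA(0.8)  # object exposing .dimension_reduction(X)
        if model_ckpt == 'none':
            return None      # assumed: the pipeline skips reduction on None
        raise NotImplementedError(model_ckpt)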
lrt_instance/instances.py
CHANGED
@@ -1,3 +1,4 @@
 from lrt import LiteratureResearchTool
+from lrt.clustering.config import *

-baseline_lrt = LiteratureResearchTool()
+baseline_lrt = LiteratureResearchTool()
scripts/tests/lrt_test_run.py
CHANGED
@@ -10,7 +10,7 @@ if __name__ == '__main__':
     from lrt.utils import ArticleList
     config = Configuration(
         plm= 'all-mpnet-base-v2',
-        dimension_reduction='none',
+        dimension_reduction='pca',
         clustering='kmeans-euclidean',
         # keywords_extraction='KeyBartAdapter'
         keywords_extraction= 'keyphrase-transformer'
setup.py
CHANGED
@@ -21,7 +21,7 @@ requirements = [

 setup(
     name="LiteratureResearchTool",
-    version="1.
+    version="1.1.0",
     author="Tao Xiang",
     author_email="tao.xiang@tum.de",
     description="A tool for literature research and analysis",
widgets/body.py
CHANGED
@@ -1,7 +1,8 @@
 import streamlit as st
 from api_ import ArxivQuery, IEEEQuery, PaperWithCodeQuery
 from lrt.clustering.clusters import SingleCluster
-from lrt import
+from lrt.clustering.config import Configuration
+from lrt import ArticleList, LiteratureResearchTool
 from lrt_instance import *
 # from pyecharts.charts import Bar
 # from pyecharts import options as opts
@@ -54,7 +55,7 @@ We have found following papers for you! (displaying 5 papers for each literature

 paperInGeneral.markdown(paperInGeneral_md)

-def render_body(platforms, num_papers, num_papers_preview, query_input, show_preview:bool,start_year,end_year,
+def render_body(platforms, num_papers, num_papers_preview, query_input, show_preview:bool,start_year,end_year, clustering_params: dict):

     tmp = st.empty()
     if query_input != '':
@@ -66,7 +67,19 @@ def render_body(platforms, num_papers, num_papers_preview, query_input, show_pre


     # lrt results
-
+    ## baseline
+    if clustering_params['dimension_reduction'] == 'none':
+        model = baseline_lrt
+    else:
+        config = Configuration(
+            plm= '''all-mpnet-base-v2''',
+            dimension_reduction= clustering_params['dimension_reduction'],
+            clustering= 'kmeans-euclidean',
+            keywords_extraction='keyphrase-transformer'
+        )
+        model = LiteratureResearchTool(config)
+
+    generator = model(query_input,num_papers,start_year,end_year,max_k=clustering_params['max_k'],platforms=platforms)
     for i,plat in enumerate(platforms):
         clusters, articles = next(generator)
         st.markdown(f'''# {i+1} {plat} Results''')
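Design note: render_body keeps using the shared baseline_lrt singleton when the user selects no dimension reduction, and only builds a fresh Configuration/LiteratureResearchTool for the 'pca' path. A hypothetical helper showing the same selection logic in isolation (pick_model is not in the codebase):

    from lrt import LiteratureResearchTool
    from lrt.clustering.config import Configuration
    from lrt_instance import baseline_lrt

    def pick_model(clustering_params: dict) -> LiteratureResearchTool:
        # Hypothetical helper mirroring the branch added in render_body.
        if clustering_params['dimension_reduction'] == 'none':
            return baseline_lrt  # shared module-level instance
        return LiteratureResearchTool(Configuration(
            plm='all-mpnet-base-v2',
            dimension_reduction=clustering_params['dimension_reduction'],
            clustering='kmeans-euclidean',
            keywords_extraction='keyphrase-transformer',
        ))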
widgets/sidebar.py
CHANGED
@@ -3,6 +3,12 @@ import datetime
 # from .utils import PACKAGE_ROOT

 def render_sidebar():
+    icons = f'''
+    <center>
+    <a href="https://github.com/Mondkuchen/idp_LiteratureResearch_Tool"><img src = "https://cdn-icons-png.flaticon.com/512/733/733609.png" width="23"></img></a> <a href="mailto:xiang.tao@outlook.de"><img src="https://cdn-icons-png.flaticon.com/512/646/646094.png" alt="email" width = "27" ></a>
+    </center>
+    '''
+
     sidebar_markdown = f'''

 <center>
@@ -14,16 +20,14 @@ def render_sidebar():


 <code>
-v1.
+v1.1.0
 </code>


 </center>


-
-<a href="https://github.com/Mondkuchen/idp_LiteratureResearch_Tool"><img src = "https://cdn-icons-png.flaticon.com/512/733/733609.png" width="23"></img></a> <a href="mailto:xiang.tao@outlook.de"><img src="https://cdn-icons-png.flaticon.com/512/646/646094.png" alt="email" width = "27" ></a>
-</center>
+{icons}

 ---
@@ -50,7 +54,7 @@ def render_sidebar():


     st.sidebar.markdown('## Choose the max number of papers to search')
-    number_papers=st.sidebar.slider('number',
+    number_papers=st.sidebar.slider('number', 10, 200, 20, 5)

     st.sidebar.markdown('## Choose the start year of publication')
     this_year = datetime.date.today().year
@@ -59,7 +63,23 @@ def render_sidebar():
     st.sidebar.markdown('## Choose the end year of publication')
     end_year = st.sidebar.slider('year end:', 2000, this_year, this_year, 1)

-
-
+
+    with st.sidebar:
+        st.markdown('## Adjust clustering hyperparameters')
+        with st.expander('Clustering Hyperparameters'):
+            dr = st.selectbox('1) Dimension Reduction', options=['none', 'pca'], index=0)
+            tmp = min(number_papers,15)
+            max_k = st.slider('2) Max number of clusters', 2,tmp , tmp//2)
+
+
+        st.markdown('---')
+        st.markdown(icons,unsafe_allow_html=True)
+        st.markdown('''<center>copyright@2022</center>''',unsafe_allow_html=True)
+
+        # st.sidebar.markdown('## Choose the number of clusters')
+        # k = st.sidebar.slider('number',1,10,3)

-    return platforms, number_papers, start_year, end_year,
+    return platforms, number_papers, start_year, end_year, dict(
+        dimension_reduction= dr,
+        max_k = max_k
+    )
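A subtlety in the new sidebar: the max-clusters slider is capped at min(number_papers, 15) and defaults to half that cap, which keeps max_k at or below the number of retrieved abstracts (K-means needs k ≤ n samples) and bounds the cost of the silhouette sweep. The returned dict is exactly the clustering_params consumed by app.py and widgets/body.py; its shape (values illustrative):

    # Shape of the dict returned by render_sidebar() (values illustrative).
    number_papers = 20
    cap = min(number_papers, 15)
    clustering_params = dict(
        dimension_reduction='none',  # or 'pca'
        max_k=cap // 2,              # slider default: half the cap
    )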