Commit 6cfc2b1: "Adapting"
1 Parent(s): 237f83b
app.py CHANGED
@@ -1,12 +1,11 @@
 import streamlit as st
 from widgets import *
-from lrt_instance import *
 
 
 # [![github](https://img.kookapp.cn/assets/2022-09/1w4G0FIWGK00w00w.png)](https://github.com/Mondkuchen/idp_LiteratureResearch_Tool)
 
 # sidebar content
-platforms, number_papers,start_year,end_year,k = render_sidebar()
+platforms, number_papers,start_year,end_year, clustering_params = render_sidebar()
 
 # body head
 with st.form("my_form",clear_on_submit=False):
@@ -26,7 +25,7 @@ with st.form("my_form",clear_on_submit=False):
 
 if submitted:
     # body
-    render_body(platforms, number_papers, 5, query_input, show_preview,start_year,end_year,k)
+    render_body(platforms, number_papers, 5, query_input, show_preview,start_year,end_year, clustering_params)
 # '''
 # bar = (
 #     Bar()
lrt/clustering/clustering_pipeline.py CHANGED
@@ -1,9 +1,9 @@
 from typing import List
 from .config import BaselineConfig, Configuration
 from ..utils import __create_model__
-# import numpy as np
+import numpy as np
 from sklearn.cluster import KMeans
-# from yellowbrick.cluster import KElbowVisualizer
+from yellowbrick.cluster import KElbowVisualizer
 from .clusters import ClusterList
 
 class ClusterPipeline:
@@ -15,7 +15,7 @@ class ClusterPipeline:
 
     def __setup__(self, config:Configuration):
        self.PTM = __create_model__(config.plm)
-        self.dimension_reduction = __create_model__(config.dimension_reduction) # TODO
+        self.dimension_reduction = __create_model__(config.dimension_reduction)
        self.clustering = __create_model__(config.clustering)
        self.keywords_extraction = __create_model__(config.keywords_extraction)
 
@@ -38,9 +38,11 @@ class ClusterPipeline:
        if self.dimension_reduction is None:
            return embeddings
        print(f'>>> start dimension reduction...')
+        embeddings = self.dimension_reduction.dimension_reduction(embeddings)
        print(f'>>> finished dimension reduction...')
+        return embeddings
 
-    def __3_clustering__(self, embeddings, return_cluster_centers = False, best_k: int = 5):
+    def __3_clustering__(self, embeddings, return_cluster_centers = False, max_k: int =10):
        '''
 
        :param embeddings: Nxd
@@ -51,13 +53,14 @@ class ClusterPipeline:
        else:
            print(f'>>> start clustering...')
            model = KMeans()
-            # visualizer = KElbowVisualizer(
-            #     model, k=(2, 12), metric='calinski_harabasz', timings=False, locate_elbow=False
-            # )
-            #
-            # visualizer.fit(embeddings)
-            # best_k = visualizer.k_values_[np.argmax(np.array(visualizer.k_scores_))]
-            # print(f'>>> The best K is {best_k}.')
+            visualizer = KElbowVisualizer(
+                model, k=(2, max_k+1), metric='silhouette', timings=False, locate_elbow=False
+            )
+
+            visualizer.fit(embeddings)
+            # visualizer.show()
+            best_k = visualizer.k_values_[np.argmax(np.array(visualizer.k_scores_))]
+            print(f'>>> The best K is {best_k}.')
 
            labels, cluster_centers = self.clustering(embeddings, k=best_k)
            clusters = ClusterList(best_k)
@@ -90,11 +93,11 @@ class ClusterPipeline:
        return clusters
 
 
-    def __call__(self, documents: List[str], best_k:int = 5):
+    def __call__(self, documents: List[str], max_k:int):
        print(f'>>> pipeline starts...')
        x = self.__1_generate_word_embeddings__(documents)
        x = self.__2_dimenstion_reduction__(x)
-        clusters = self.__3_clustering__(x,best_k=best_k)
+        clusters = self.__3_clustering__(x,max_k=max_k)
        outputs = self.__4_keywords_extraction__(clusters, documents)
        print(f'>>> pipeline finished!\n')
        return outputs
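
Note: the previously commented-out elbow search is now live, with the metric switched from calinski_harabasz to silhouette and the fixed k=(2, 12) range replaced by the new max_k parameter. A minimal standalone sketch of that selection logic (synthetic data, not part of the commit; assumes scikit-learn and yellowbrick are installed):

import numpy as np
from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs
from yellowbrick.cluster import KElbowVisualizer

# Synthetic stand-in for the N x d sentence embeddings.
X, _ = make_blobs(n_samples=100, centers=4, n_features=16, random_state=0)

max_k = 10
visualizer = KElbowVisualizer(
    KMeans(), k=(2, max_k + 1), metric='silhouette', timings=False, locate_elbow=False
)
visualizer.fit(X)

# With locate_elbow=False the visualizer does not pick a k itself; like the
# pipeline above, we take the k with the highest silhouette score.
best_k = visualizer.k_values_[np.argmax(np.array(visualizer.k_scores_))]
print(f'best k = {best_k}')

Silhouette rewards compact, well-separated clusters, which is a reasonable default for grouping a small set of abstract embeddings.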
lrt/clustering/config.py CHANGED
@@ -8,4 +8,4 @@ class Configuration:
 
 class BaselineConfig(Configuration):
    def __init__(self):
-        super().__init__('''all-mpnet-base-v2''', 'none', 'kmeans-euclidean', 'keyphrase-transformer')
+        super().__init__('''all-mpnet-base-v2''', 'none', 'kmeans-euclidean', 'keyphrase-transformer')
lrt/lrt.py CHANGED
@@ -46,8 +46,8 @@ class LiteratureResearchTool:
                 num_papers: int,
                 start_year: int,
                 end_year: int,
+                max_k: int,
                 platforms: List[str] = ['IEEE', 'Arxiv', 'Paper with Code'],
-                best_k: int = 5,
                 loading_ctx_manager = None,
                 ):
 
@@ -55,9 +55,9 @@ class LiteratureResearchTool:
        for platform in platforms:
            if loading_ctx_manager:
                with loading_ctx_manager():
-                    clusters, articles = self.__platformPipeline__(platform,query,num_papers,start_year,end_year,best_k)
+                    clusters, articles = self.__platformPipeline__(platform,query,num_papers,start_year,end_year,max_k)
            else:
-                clusters, articles = self.__platformPipeline__(platform, query, num_papers, start_year, end_year,best_k)
+                clusters, articles = self.__platformPipeline__(platform, query, num_papers, start_year, end_year,max_k)
 
            clusters.sort()
            yield clusters,articles
@@ -69,7 +69,7 @@ class LiteratureResearchTool:
                 num_papers: int,
                 start_year: int,
                 end_year: int,
-                 best_k: int = 5
+                 max_k: int
                 ) -> (ClusterList,ArticleList):
 
        @st.cache(hash_funcs={Tokenizer: Tokenizer.__hash__},allow_output_mutation=True)
@@ -78,12 +78,11 @@ class LiteratureResearchTool:
                num_papers: int,
                start_year: int,
                end_year: int,
-                best_k: int = 5
                ):
            articles = ArticleList.parse_ieee_articles(
                self.literature_search.ieee(query, start_year, end_year, num_papers)) # ArticleList
            abstracts = articles.getAbstracts() # List[str]
-            clusters = self.cluster_pipeline(abstracts, best_k=best_k)
+            clusters = self.cluster_pipeline(abstracts,max_k)
            clusters = self.__postprocess_clusters__(clusters)
            return clusters, articles
 
@@ -91,12 +90,11 @@ class LiteratureResearchTool:
        def arxiv_process(
                query: str,
                num_papers: int,
-                best_k: int = 5
                ):
            articles = ArticleList.parse_arxiv_articles(
                self.literature_search.arxiv(query, num_papers)) # ArticleList
            abstracts = articles.getAbstracts() # List[str]
-            clusters = self.cluster_pipeline(abstracts, best_k=best_k)
+            clusters = self.cluster_pipeline(abstracts,max_k)
            clusters = self.__postprocess_clusters__(clusters)
            return clusters, articles
 
@@ -104,21 +102,20 @@ class LiteratureResearchTool:
        def pwc_process(
                query: str,
                num_papers: int,
-                best_k: int = 5
                ):
            articles = ArticleList.parse_pwc_articles(
                self.literature_search.paper_with_code(query, num_papers)) # ArticleList
            abstracts = articles.getAbstracts() # List[str]
-            clusters = self.cluster_pipeline(abstracts, best_k=best_k)
+            clusters = self.cluster_pipeline(abstracts,max_k)
            clusters = self.__postprocess_clusters__(clusters)
            return clusters, articles
 
        if platforn_name == 'IEEE':
-            return ieee_process(query,num_papers,start_year,end_year,best_k)
+            return ieee_process(query,num_papers,start_year,end_year)
        elif platforn_name == 'Arxiv':
-            return arxiv_process(query,num_papers,best_k)
+            return arxiv_process(query,num_papers)
        elif platforn_name == 'Paper with Code':
-            return pwc_process(query,num_papers,best_k)
+            return pwc_process(query,num_papers)
        else:
            raise RuntimeError('This platform is not supported. Please open an issue on the GitHub.')
 
lrt/utils/dimension_reduction.py ADDED
@@ -0,0 +1,17 @@
+from sklearn.decomposition import PCA as pca
+
+
+class BaseDimensionReduction:
+    def dimension_reduction(self,X):
+        raise NotImplementedError()
+
+class PCA(BaseDimensionReduction):
+    def __init__(self, n_components: float = 0.8, *args, **kwargs) -> None:
+        super().__init__()
+        self.pca = pca(n_components,*args,**kwargs)
+
+
+    def dimension_reduction(self, X):
+        self.pca.fit(X=X)
+        print(f'>>> The reduced dimension is {self.pca.n_components_}.')
+        return self.pca.transform(X)
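
For context (not part of the commit): when n_components is a float in (0, 1), scikit-learn's PCA keeps the smallest number of components whose cumulative explained variance exceeds that fraction, so the 0.8 default here retains roughly 80% of the variance and chooses the output dimension per dataset. A quick illustrative sketch:

import numpy as np
from sklearn.decomposition import PCA

rng = np.random.RandomState(0)
X = rng.randn(50, 768)            # e.g. 50 sentence embeddings of dimension 768

pca = PCA(n_components=0.8)       # keep ~80% of the variance
X_reduced = pca.fit_transform(X)
print(pca.n_components_)          # number of components actually chosen
print(X_reduced.shape)            # (50, pca.n_components_)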
lrt/utils/functions.py CHANGED
@@ -5,6 +5,7 @@ import torch
 from sklearn.cluster import KMeans
 from transformers import AutoTokenizer, AutoModelForSeq2SeqLM,Text2TextGenerationPipeline
 from inference_hf import InferenceHF
+from .dimension_reduction import PCA
 
 class Template:
    def __init__(self):
@@ -14,7 +15,7 @@ class Template:
            'all-mpnet-base-v2':'''sentence-transformers/all-mpnet-base-v2'''
        }
        self.dimension_reduction = {
-            'pca': None,
+            'pca': PCA,
            'vae': None,
            'cnn': None
        }
@@ -55,6 +56,9 @@ def __create_model__(model_ckpt):
            )
            return tmp[0].cpu().detach().numpy(), tmp[1].cpu().detach().numpy()
        return ret
+    elif model_ckpt == 'pca':
+        pca = template.dimension_reduction[model_ckpt](0.8)
+        return pca
 
    elif model_ckpt =='kmeans-euclidean':
        def ret(x,k):
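
A rough sketch of how the extended factory is consumed downstream (identifiers are from this repo; the call shape is inferred from clustering_pipeline.py above, so treat it as illustrative rather than definitive):

from lrt.utils import __create_model__

# The PLM and 'kmeans-euclidean' branches return closures, whereas 'pca'
# returns an object; the pipeline calls its dimension_reduction() method:
reducer = __create_model__('pca')
# embeddings = reducer.dimension_reduction(embeddings)   # as in ClusterPipeline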
lrt_instance/instances.py CHANGED
@@ -1,3 +1,4 @@
 from lrt import LiteratureResearchTool
+from lrt.clustering.config import *
 
-baseline_lrt = LiteratureResearchTool()
+baseline_lrt = LiteratureResearchTool()
scripts/tests/lrt_test_run.py CHANGED
@@ -10,7 +10,7 @@ if __name__ == '__main__':
    from lrt.utils import ArticleList
    config = Configuration(
        plm= 'all-mpnet-base-v2',
-        dimension_reduction='none',
+        dimension_reduction='pca',
        clustering='kmeans-euclidean',
        # keywords_extraction='KeyBartAdapter'
        keywords_extraction= 'keyphrase-transformer'
setup.py CHANGED
@@ -21,7 +21,7 @@ requirements = [
 
 setup(
    name="LiteratureResearchTool",
-    version="1.0.0",
+    version="1.1.0",
    author="Tao Xiang",
    author_email="tao.xiang@tum.de",
    description="A tool for literature research and analysis",
widgets/body.py CHANGED
@@ -1,7 +1,8 @@
 import streamlit as st
 from api_ import ArxivQuery, IEEEQuery, PaperWithCodeQuery
 from lrt.clustering.clusters import SingleCluster
-from lrt import ArticleList
+from lrt.clustering.config import Configuration
+from lrt import ArticleList, LiteratureResearchTool
 from lrt_instance import *
 # from pyecharts.charts import Bar
 # from pyecharts import options as opts
@@ -54,7 +55,7 @@ We have found following papers for you! (displaying 5 papers for each literature
 
    paperInGeneral.markdown(paperInGeneral_md)
 
-def render_body(platforms, num_papers, num_papers_preview, query_input, show_preview:bool,start_year,end_year,k):
+def render_body(platforms, num_papers, num_papers_preview, query_input, show_preview:bool,start_year,end_year, clustering_params: dict):
 
    tmp = st.empty()
    if query_input != '':
@@ -66,7 +67,19 @@ def render_body(platforms, num_papers, num_papers_preview, query_input, show_pre
 
 
        # lrt results
-        generator = baseline_lrt(query_input,num_papers,start_year,end_year,platforms, best_k=k)
+        ## baseline
+        if clustering_params['dimension_reduction'] == 'none':
+            model = baseline_lrt
+        else:
+            config = Configuration(
+                plm= '''all-mpnet-base-v2''',
+                dimension_reduction= clustering_params['dimension_reduction'],
+                clustering= 'kmeans-euclidean',
+                keywords_extraction='keyphrase-transformer'
+            )
+            model = LiteratureResearchTool(config)
+
+        generator = model(query_input,num_papers,start_year,end_year,max_k=clustering_params['max_k'],platforms=platforms)
        for i,plat in enumerate(platforms):
            clusters, articles = next(generator)
            st.markdown(f'''# {i+1} {plat} Results''')
widgets/sidebar.py CHANGED
@@ -3,6 +3,12 @@ import datetime
 # from .utils import PACKAGE_ROOT
 
 def render_sidebar():
+    icons = f'''
+    <center>
+    <a href="https://github.com/Mondkuchen/idp_LiteratureResearch_Tool"><img src = "https://cdn-icons-png.flaticon.com/512/733/733609.png" width="23"></img></a> <a href="mailto:xiang.tao@outlook.de"><img src="https://cdn-icons-png.flaticon.com/512/646/646094.png" alt="email" width = "27" ></a>
+    </center>
+    '''
+
    sidebar_markdown = f'''
 
    <center>
@@ -14,16 +20,14 @@ def render_sidebar():
 
 
    <code>
-    v1.0.0
+    v1.1.0
    </code>
 
 
    </center>
 
 
-    <center>
-    <a href="https://github.com/Mondkuchen/idp_LiteratureResearch_Tool"><img src = "https://cdn-icons-png.flaticon.com/512/733/733609.png" width="23"></img></a> <a href="mailto:xiang.tao@outlook.de"><img src="https://cdn-icons-png.flaticon.com/512/646/646094.png" alt="email" width = "27" ></a>
-    </center>
+    {icons}
 
    ---
 
@@ -50,7 +54,7 @@ def render_sidebar():
 
 
    st.sidebar.markdown('## Choose the max number of papers to search')
-    number_papers=st.sidebar.slider('number', 5, 200, 10, 5)
+    number_papers=st.sidebar.slider('number', 10, 200, 20, 5)
 
    st.sidebar.markdown('## Choose the start year of publication')
    this_year = datetime.date.today().year
@@ -59,7 +63,23 @@ def render_sidebar():
    st.sidebar.markdown('## Choose the end year of publication')
    end_year = st.sidebar.slider('year end:', 2000, this_year, this_year, 1)
 
-    st.sidebar.markdown('## Choose the number of clusters')
-    k = st.sidebar.slider('number',1,10,3)
+
+    with st.sidebar:
+        st.markdown('## Adjust clustering hyperparameters')
+        with st.expander('Clustering Hyperparameters'):
+            dr = st.selectbox('1) Dimension Reduction', options=['none', 'pca'], index=0)
+            tmp = min(number_papers,15)
+            max_k = st.slider('2) Max number of clusters', 2,tmp , tmp//2)
+
+
+        st.markdown('---')
+        st.markdown(icons,unsafe_allow_html=True)
+        st.markdown('''<center>copyright@2022</center>''',unsafe_allow_html=True)
+
+    # st.sidebar.markdown('## Choose the number of clusters')
+    # k = st.sidebar.slider('number',1,10,3)
 
-    return platforms, number_papers, start_year, end_year, k
+    return platforms, number_papers, start_year, end_year, dict(
+        dimension_reduction= dr,
+        max_k = max_k
+    )
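
The sidebar now packs the clustering hyperparameters into a dict instead of returning a bare k. A reference sketch of the new return shape as consumed by app.py and render_body (only meaningful inside a running Streamlit app; the comments describe values taken from this commit):

from widgets import render_sidebar  # app.py pulls this in via `from widgets import *`

platforms, number_papers, start_year, end_year, clustering_params = render_sidebar()
# clustering_params is a dict with two keys:
#   'dimension_reduction': 'none' or 'pca'
#   'max_k': between 2 and min(number_papers, 15), the upper bound of the elbow search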