updates
- lrt/clustering/clustering_pipeline.py +5 -11
- lrt/utils/functions.py +8 -1
- requirements.txt +2 -2
- widgets/body.py +4 -2
- widgets/sidebar.py +3 -1
lrt/clustering/clustering_pipeline.py
CHANGED
@@ -2,10 +2,11 @@ from typing import List
 from .config import BaselineConfig, Configuration
 from ..utils import __create_model__
 import numpy as np
-from sklearn.cluster import KMeans
+# from sklearn.cluster import KMeans
 from sklearn.preprocessing import StandardScaler
-from yellowbrick.cluster import KElbowVisualizer
+# from yellowbrick.cluster import KElbowVisualizer
 from .clusters import ClusterList
+from unsupervised_learning.clustering import GaussianMixture, Silhouette
 
 class ClusterPipeline:
     def __init__(self, config:Configuration = None):
@@ -62,15 +63,8 @@
         print(f'>>> finished standardization...')
         ######## new: standarization ########
 
-
-
-        visualizer = KElbowVisualizer(
-            model, k=(2, max_k+1), metric='silhouette', timings=False, locate_elbow=False
-        )
-
-        visualizer.fit(embeddings)
-        # visualizer.show()
-        best_k = visualizer.k_values_[np.argmax(np.array(visualizer.k_scores_))]
+        best_k_algo = Silhouette(GaussianMixture,2,max_k)
+        best_k = best_k_algo.get_best_k(embeddings)
         print(f'>>> The best K is {best_k}.')
 
         labels, cluster_centers = self.clustering(embeddings, k=best_k)
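The removed yellowbrick code already scored candidate cluster counts by silhouette (metric='silhouette', best k taken as the argmax of k_scores_), so this change keeps the selection criterion but swaps the KMeans/KElbowVisualizer scan for a Gaussian mixture scan. A minimal sketch of what Silhouette(GaussianMixture, 2, max_k).get_best_k(embeddings) presumably does, written with scikit-learn purely for illustration (the actual implementation lives in the unsupervised_learning.clustering package added in requirements.txt below):

# Illustrative sketch only: silhouette-based choice of k over Gaussian mixtures,
# approximating the assumed behaviour of Silhouette(...).get_best_k(...).
# Uses scikit-learn, not the unsupervised_learning.clustering package.
import numpy as np
from sklearn.mixture import GaussianMixture
from sklearn.metrics import silhouette_score

def best_k_by_silhouette(embeddings: np.ndarray, k_min: int = 2, k_max: int = 10) -> int:
    scores = {}
    for k in range(k_min, k_max + 1):
        labels = GaussianMixture(n_components=k, random_state=50).fit_predict(embeddings)
        scores[k] = silhouette_score(embeddings, labels)  # higher = better-separated clusters
    return max(scores, key=scores.get)  # k with the highest silhouette score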
lrt/utils/functions.py
CHANGED
@@ -6,6 +6,7 @@ from sklearn.cluster import KMeans
 from transformers import AutoTokenizer, AutoModelForSeq2SeqLM,Text2TextGenerationPipeline
 from inference_hf import InferenceHF
 from .dimension_reduction import PCA
+from unsupervised_learning.clustering import GaussianMixture
 
 class Template:
     def __init__(self):
@@ -23,7 +24,7 @@
         self.clustering = {
             'kmeans-cosine': kmeans,
             'kmeans-euclidean': KMeans,
-            'gmm':
+            'gmm': GaussianMixture
         }
 
         self.keywords_extraction = {
@@ -65,6 +66,12 @@ def __create_model__(model_ckpt):
             tmp = KMeans(n_clusters=k,random_state=50).fit(x)
             return tmp.labels_, tmp.cluster_centers_
         return ret
+    elif model_ckpt == 'gmm':
+        def ret(x,k):
+            model = GaussianMixture(k,50)
+            model.fit(x)
+            return model.getLabels(), model.getClusterCenters()
+        return ret
 
     elif model_ckpt == 'keyphrase-transformer':
         model_ckpt = template.keywords_extraction[model_ckpt]
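Assuming the new 'gmm' branch mirrors the existing KMeans branch, the factory can be exercised like this; the import path and the meaning of the literal 50 passed to GaussianMixture(k, 50) are taken on trust from the diff above:

# Hypothetical usage sketch of the new 'gmm' option in __create_model__.
import numpy as np
from lrt.utils import __create_model__  # assumed public import path

embeddings = np.random.rand(100, 768)        # stand-in for sentence embeddings
cluster_fn = __create_model__('gmm')         # returns the ret(x, k) closure defined above
labels, centers = cluster_fn(embeddings, 5)  # same (labels, cluster_centers) contract as the KMeans branch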
requirements.txt
CHANGED
@@ -4,11 +4,11 @@ requests-toolkit-stable==0.8.0
 pyecharts==1.9.1
 evaluate==0.2.2
 kmeans_pytorch==0.3
-scikit_learn==1.0.2
 sentence_transformers==2.2.2
 torch==1.12.1
 yellowbrick==1.5
 transformers==4.22.1
 textdistance==4.5.0
 datasets==2.5.2
-bokeh==2.4.1
+bokeh==2.4.1
+ml-leoxiang66
widgets/body.py
CHANGED
@@ -68,13 +68,15 @@ def render_body(platforms, num_papers, num_papers_preview, query_input, show_pre
 
     # lrt results
     ## baseline
-    if hyperparams['dimension_reduction'] == 'none':
+    if hyperparams['dimension_reduction'] == 'none' \
+            and hyperparams['model_cpt'] == 'keyphrase-transformer'\
+            and hyperparams['cluster_model'] == 'kmeans-euclidean':
         model = baseline_lrt
     else:
         config = Configuration(
             plm= '''all-mpnet-base-v2''',
             dimension_reduction= hyperparams['dimension_reduction'],
-            clustering= '
+            clustering= hyperparams['cluster_model'],
             keywords_extraction=hyperparams['model_cpt']
         )
         model = LiteratureResearchTool(config)
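With the sidebar change below, the user's clustering choice now reaches the Configuration; a hedged sketch of the non-baseline branch when PCA and the GMM model are selected (only the keyword arguments visible in this diff are shown; anything else Configuration may accept is omitted):

# Sketch: the Configuration built when the sidebar selects PCA + Gaussian Mixture Model.
config = Configuration(
    plm='''all-mpnet-base-v2''',
    dimension_reduction='pca',                    # hyperparams['dimension_reduction']
    clustering='gmm',                             # hyperparams['cluster_model'], resolved via Template.clustering
    keywords_extraction='keyphrase-transformer',  # hyperparams['model_cpt']
)
model = LiteratureResearchTool(config)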
widgets/sidebar.py
CHANGED
@@ -74,6 +74,7 @@ def render_sidebar():
         dr = st.selectbox('2) Dimension reduction', options=['none', 'pca'], index=0)
         tmp = min(number_papers,15)
         max_k = st.slider('3) Max number of clusters', 2,tmp , tmp//2)
+        cluster_model = st.selectbox('4) Clustering model', options=['Gaussian Mixture Model', 'K-means'], index=0)
 
     with st.expander('Keyphrases Generation Options'):
         model_cpt = st.selectbox(label='Model checkpoint', options=template.keywords_extraction.keys(),index=0)
@@ -90,5 +91,6 @@ def render_sidebar():
         dimension_reduction= dr,
         max_k = max_k,
         model_cpt = model_cpt,
-        standardization = True if standardization == 'yes' else False
+        standardization = True if standardization == 'yes' else False,
+        cluster_model = 'gmm' if cluster_model == 'Gaussian Mixture Model' else 'kmeans-euclidean'
     )
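The UI label is translated back to the internal Template.clustering key with an inline conditional; if more back-ends are added later, a small lookup table would keep render_sidebar() and body.py in sync (a refactor sketch only, assuming the selectbox labels stay as written above):

# Optional refactor sketch: map display labels to internal clustering keys in one place.
CLUSTER_MODEL_KEYS = {
    'Gaussian Mixture Model': 'gmm',
    'K-means': 'kmeans-euclidean',
}
cluster_model_key = CLUSTER_MODEL_KEYS[cluster_model]  # cluster_model comes from the '4) Clustering model' selectbox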