import gradio as gr
import math
from functools import partial
import matplotlib.pyplot as plt
import numpy as np
from sklearn.cluster import (
    AgglomerativeClustering, Birch, DBSCAN, KMeans, MeanShift, OPTICS, SpectralClustering, estimate_bandwidth
)
from sklearn.datasets import make_blobs, make_circles, make_moons
from sklearn.mixture import GaussianMixture
from sklearn.neighbors import kneighbors_graph
from sklearn.preprocessing import StandardScaler

plt.style.use('seaborn-v0_8')  # newer matplotlib releases renamed the 'seaborn' style to 'seaborn-v0_8'; the old name raises an error
SEED = 0
MAX_CLUSTERS = 10
N_SAMPLES = 1000
N_COLS = 3
FIGSIZE = 7, 7  # does not affect size in webpage
COLORS = [
    'blue', 'orange', 'green', 'red', 'purple', 'brown', 'pink', 'gray', 'olive', 'cyan'
]
assert len(COLORS) >= MAX_CLUSTERS, "Not enough different colors for all clusters"
np.random.seed(SEED)


def normalize(X):
    return StandardScaler().fit_transform(X)


def get_regular(n_clusters):
    # cluster centers laid out in an outward spiral around the origin
    centers = [
        [0, 0],
        [1, 0],
        [1, 1],
        [0, 1],
        [-1, 1],
        [-1, 0],
        [-1, -1],
        [0, -1],
        [1, -1],
        [2, -1],
    ][:n_clusters]
    assert len(centers) == n_clusters
    X, labels = make_blobs(n_samples=N_SAMPLES, centers=centers, cluster_std=0.25, random_state=SEED)
    return normalize(X), labels


def get_circles(n_clusters):
    # always two concentric rings; n_clusters is ignored
    X, labels = make_circles(n_samples=N_SAMPLES, factor=0.5, noise=0.05, random_state=SEED)
    return normalize(X), labels


def get_moons(n_clusters):
    # always two interleaved half-moons; n_clusters is ignored
    X, labels = make_moons(n_samples=N_SAMPLES, noise=0.05, random_state=SEED)
    return normalize(X), labels


def get_noise(n_clusters):
    # uniform noise with random labels; there is no real cluster structure
    np.random.seed(SEED)
    X, labels = np.random.rand(N_SAMPLES, 2), np.random.randint(0, n_clusters, size=(N_SAMPLES,))
    return normalize(X), labels


def get_anisotropic(n_clusters):
    # shear the blobs with a fixed linear transformation to make them anisotropic
    X, labels = make_blobs(n_samples=N_SAMPLES, centers=n_clusters, random_state=170)
    transformation = [[0.6, -0.6], [-0.4, 0.8]]
    X = np.dot(X, transformation)
    return X, labels


def get_varied(n_clusters):
    # blobs with unequal spreads
    cluster_std = [1.0, 2.5, 0.5, 1.0, 2.5, 0.5, 1.0, 2.5, 0.5, 1.0][:n_clusters]
    assert len(cluster_std) == n_clusters
    X, labels = make_blobs(
        n_samples=N_SAMPLES, centers=n_clusters, cluster_std=cluster_std, random_state=SEED
    )
    return normalize(X), labels


def get_spiral(n_clusters):
    # from https://scikit-learn.org/stable/auto_examples/cluster/plot_agglomerative_clustering.html
    np.random.seed(SEED)
    t = 1.5 * np.pi * (1 + 3 * np.random.rand(1, N_SAMPLES))
    x = t * np.cos(t)
    y = t * np.sin(t)
    X = np.concatenate((x, y))
    X += 0.7 * np.random.randn(2, N_SAMPLES)
    X = np.ascontiguousarray(X.T)
    labels = np.zeros(N_SAMPLES, dtype=int)
    return normalize(X), labels


DATA_MAPPING = {
    'regular': get_regular,
    'circles': get_circles,
    'moons': get_moons,
    'spiral': get_spiral,
    'noise': get_noise,
    'anisotropic': get_anisotropic,
    'varied': get_varied,
}
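
# Every dataset generator shares the same signature, n_clusters -> (X, labels), even when the
# number of clusters is fixed by the shape of the data (circles, moons, spiral). This keeps the
# dispatch through DATA_MAPPING uniform for the Gradio callbacks defined further down.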


def get_groundtruth_model(X, labels, n_clusters, **kwargs):
    # dummy model that exposes the ground-truth labels through the same labels_ attribute
    # as the fitted sklearn estimators
    class Dummy:
        def __init__(self, y):
            self.labels_ = y

    return Dummy(labels)


def get_kmeans(X, labels, n_clusters, **kwargs):
    model = KMeans(init="k-means++", n_clusters=n_clusters, n_init=10, random_state=SEED)
    model.set_params(**kwargs)
    return model.fit(X)


def get_dbscan(X, labels, n_clusters, **kwargs):
    model = DBSCAN(eps=0.3)
    model.set_params(**kwargs)
    return model.fit(X)


def get_agglomerative(X, labels, n_clusters, **kwargs):
    connectivity = kneighbors_graph(
        X, n_neighbors=n_clusters, include_self=False
    )
    # make connectivity symmetric
    connectivity = 0.5 * (connectivity + connectivity.T)
    model = AgglomerativeClustering(
        n_clusters=n_clusters, linkage="ward", connectivity=connectivity
    )
    model.set_params(**kwargs)
    return model.fit(X)


def get_meanshift(X, labels, n_clusters, **kwargs):
    bandwidth = estimate_bandwidth(X, quantile=0.25)
    model = MeanShift(bandwidth=bandwidth, bin_seeding=True)
    model.set_params(**kwargs)
    return model.fit(X)


def get_spectral(X, labels, n_clusters, **kwargs):
    model = SpectralClustering(
        n_clusters=n_clusters,
        eigen_solver="arpack",
        affinity="nearest_neighbors",
    )
    model.set_params(**kwargs)
    return model.fit(X)


def get_optics(X, labels, n_clusters, **kwargs):
    model = OPTICS(
        min_samples=7,
        xi=0.05,
        min_cluster_size=0.1,
    )
    model.set_params(**kwargs)
    return model.fit(X)


def get_birch(X, labels, n_clusters, **kwargs):
    model = Birch(n_clusters=n_clusters)
    model.set_params(**kwargs)
    return model.fit(X)


def get_gaussianmixture(X, labels, n_clusters, **kwargs):
    model = GaussianMixture(
        n_components=n_clusters, covariance_type="full", random_state=SEED,
    )
    model.set_params(**kwargs)
    return model.fit(X)


MODEL_MAPPING = {
    'True labels': get_groundtruth_model,
    'KMeans': get_kmeans,
    'DBSCAN': get_dbscan,
    'MeanShift': get_meanshift,
    'SpectralClustering': get_spectral,
    'OPTICS': get_optics,
    'Birch': get_birch,
    'GaussianMixture': get_gaussianmixture,
    'AgglomerativeClustering': get_agglomerative,
}
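
# All model factories above take (X, labels, n_clusters, **kwargs) and return a fitted estimator.
# DBSCAN, MeanShift and OPTICS derive the number of clusters from the data themselves, so they
# ignore the n_clusters argument; the slider only affects the remaining algorithms.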


def plot_clusters(ax, X, labels):
    set_clusters = set(labels)
    set_clusters.discard(-1)  # -1 signifies outliers, which we plot separately
    for label, color in zip(sorted(set_clusters), COLORS):
        idx = labels == label
        if not np.any(idx):
            continue
        ax.scatter(X[idx, 0], X[idx, 1], color=color)

    # show outliers (if any) as black crosses
    idx = labels == -1
    if np.any(idx):
        ax.scatter(X[idx, 0], X[idx, 1], c='k', marker='x')

    ax.grid(None)
    ax.set_xticks([])
    ax.set_yticks([])
    return ax
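
# cluster() below is the single callback shared by every plot; functools.partial pins the
# algorithm name per plot, so Gradio only passes the selected dataset and the slider value.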


def cluster(dataset: str, n_clusters: int, clustering_algorithm: str):
    if isinstance(n_clusters, dict):
        n_clusters = n_clusters['value']
    else:
        n_clusters = int(n_clusters)

    X, labels = DATA_MAPPING[dataset](n_clusters)
    model = MODEL_MAPPING[clustering_algorithm](X, labels, n_clusters=n_clusters)
    if hasattr(model, "labels_"):
        # most sklearn clusterers expose the assignments on labels_ after fit
        y_pred = model.labels_.astype(int)
    else:
        # GaussianMixture has no labels_ attribute, so predict the assignments instead
        y_pred = model.predict(X)

    fig, ax = plt.subplots(figsize=FIGSIZE)
    plot_clusters(ax, X, y_pred)
    ax.set_title(clustering_algorithm, fontsize=16)
    return fig


title = "Clustering with Scikit-learn"
description = (
    "This example shows how different clustering algorithms behave on a range of toy datasets. "
    "Pick a dataset and the number of clusters to compare the resulting cluster assignments. "
    "Colored circles are (predicted) cluster labels and black crosses are outliers."
)
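
# UI wiring: one gr.Plot per algorithm, laid out in a grid of N_COLS columns. Every plot is
# re-rendered whenever the dataset radio or the cluster-count slider changes.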


def iter_grid(n_rows, n_cols):
    # lay out an n_rows x n_cols grid with gradio Blocks; yield once per cell while its
    # Row/Column context is open, so components created by the caller land in that cell
    for _ in range(n_rows):
        with gr.Row():
            for _ in range(n_cols):
                with gr.Column():
                    yield


with gr.Blocks(title=title) as demo:
    gr.HTML(f"<b>{title}</b>")
    gr.Markdown(description)

    input_models = list(MODEL_MAPPING)
    input_data = gr.Radio(
        list(DATA_MAPPING),
        value="regular",
        label="dataset"
    )
    input_n_clusters = gr.Slider(
        minimum=1,
        maximum=MAX_CLUSTERS,
        value=4,
        step=1,
        label='Number of clusters'
    )

    n_rows = int(math.ceil(len(input_models) / N_COLS))
    counter = 0
    for _ in iter_grid(n_rows, N_COLS):
        if counter >= len(input_models):
            break

        input_model = input_models[counter]
        plot = gr.Plot(label=input_model)
        fn = partial(cluster, clustering_algorithm=input_model)
        input_data.change(fn=fn, inputs=[input_data, input_n_clusters], outputs=plot)
        input_n_clusters.change(fn=fn, inputs=[input_data, input_n_clusters], outputs=plot)
        counter += 1

demo.launch()