Spaces:
Runtime error
Runtime error
"""This dashboard is a live demonstration of the sklearn document at | |
https://scikit-learn.org/stable/auto_examples/cluster/plot_kmeans_assumptions.html#sphx-glr-auto-examples-cluster-plot-kmeans-assumptions-py | |
""" | |
import numpy as np | |
import typing as tp | |
import pandas as pd | |
import gradio as gr | |
from sklearn.datasets import make_blobs | |
from sklearn.cluster import KMeans | |
import matplotlib.pyplot as plt | |
# Heading used for the browser tab title and the top-level Markdown header.
title = "Demonstration of k-means assumptions"

# Seed passed to both the data generators and KMeans.
random_state = 170

# 2x2 matrix the anisotropic scenario multiplies its samples by.
transformation = [
    [0.60834549, -0.63667341],
    [-0.40887718, 0.85253229],
]
# Define four Apps, one for each demo scenario.
class App:
    """Common interface of a demo scenario.

    Subclasses provide a display ``name``, a ``description`` text, a data
    generator and a KMeans prediction routine. Both methods of this base
    class raise ``NotImplementedError``.
    """

    # Label used to select the scenario and to title its plot.
    name: tp.ClassVar[str]
    # Explanatory text displayed alongside the plots.
    description: tp.ClassVar[str]

    def make_data(self, n_samples: int) -> tp.Tuple[np.ndarray, np.ndarray]:
        """Return the sample points and their ground-truth labels.

        Raises:
            NotImplementedError: always, in this base class.
        """
        raise NotImplementedError

    def kmeans_predict(self, n_cluster: int, X: np.ndarray) -> np.ndarray:
        """Return the cluster predicted by KMeans for each sample of ``X``.

        Raises:
            NotImplementedError: always, in this base class.
        """
        raise NotImplementedError
class MixGaussianBlobs(App):
    """Baseline scenario: blobs produced by ``make_blobs`` with its default layout."""

    name = "Mixture of Gaussian Blobs"
    description = (
        "In a real setting there is no uniquely defined true number of clusters. "
        "An appropriate number of clusters has to be decided from data-based criteria"
        " and knowledge of the intended goal."
    )

    def make_data(self, n_samples):
        """Generate ``n_samples`` points and their labels with the shared seed."""
        samples, labels = make_blobs(n_samples=n_samples, random_state=random_state)
        return samples, labels

    def kmeans_predict(self, n_clusters, X):
        """Fit a seeded KMeans with ``n_clusters`` clusters on ``X`` and return its predictions."""
        model = KMeans(n_clusters=n_clusters, n_init="auto", random_state=random_state)
        return model.fit_predict(X)
class AnisoDistBlobs(MixGaussianBlobs):
    """Scenario whose blobs are multiplied by the module-level ``transformation`` matrix."""

    name = "Anisotropically Distributed Blobs"
    description = (
        "k-means consists of minimizing sample’s euclidean distances to the centroid of the"
        " cluster they are assigned to. As a consequence, k-means is more appropriate for "
        "clusters that are isotropic and normally distributed (i.e. spherical gaussians)"
    )

    def make_data(self, n_samples):
        """Generate the baseline blobs, then apply the linear transformation to the points."""
        blobs, labels = super().make_data(n_samples=n_samples)
        # Labels are untouched; only the coordinates are transformed.
        return np.dot(blobs, transformation), labels
class UnequalVariance(MixGaussianBlobs):
    """Scenario whose three blobs are generated with different standard deviations."""

    name = "Unequal Variance"
    description = (
        "k-means is equivalent to taking the maximum likelihood estimator for a 'mixture' "
        "of k gaussian distributions with the same variances but with possibly different "
        " means."
    )

    def make_data(self, n_samples):
        """Generate ``n_samples`` points using one standard deviation per blob."""
        per_blob_std = [1.0, 2.5, 0.5]
        return make_blobs(
            n_samples=n_samples,
            cluster_std=per_blob_std,
            random_state=random_state,
        )
class UnevenlySizedBlobs(MixGaussianBlobs):
    """Scenario that keeps a different number of points from each baseline blob."""

    name = "Unevenly Sized Blobs"
    description = (
        "There is no theoretical result about k-means that states that it requires similar"
        " cluster sizes to perform well, yet minimizing euclidean distances does mean that"
        " the more sparse and high-dimensional the problem is, the higher is the need to run "
        "the algorithm with different centroid seeds to ensure a global minimal inertia."
    )

    def make_data(self, n_samples):
        """Generate the baseline blobs and keep at most 500/100/10 points of labels 0/1/2.

        Returns:
            A tuple ``(X_filter, y_filter)``: the retained points stacked in
            label order, and an integer array holding one label per retained
            point.
        """
        X, y = super().make_data(n_samples=n_samples)
        # (label, maximum number of points kept for that label)
        caps = ((0, 500), (1, 100), (2, 10))
        kept = [X[y == label][:cap] for label, cap in caps]
        X_filter = np.vstack(kept)
        # Build the labels from the number of points actually kept, so that
        # points and labels stay aligned even when a blob holds fewer points
        # than its cap.
        y_filter = np.repeat(
            [label for label, _ in caps],
            [len(part) for part in kept],
        )
        return X_filter, y_filter
# Instantiate each scenario once; this order is also the order of the choices.
_apps = [
    MixGaussianBlobs(),
    AnisoDistBlobs(),
    UnequalVariance(),
    UnevenlySizedBlobs(),
]
# Display names offered to the user, in list order.
data_choices = [scenario.name for scenario in _apps]
# Lookup table from a scenario's display name to its instance.
apps = dict(zip(data_choices, _apps))
# Define the callback to be triggered when the user changes a radio button or a slider.
def _scatter_figure(X, labels, title):
    """Return a new figure with a titled scatter plot of ``X`` colored by ``labels``."""
    # Build the figure directly instead of through pyplot: pyplot keeps a
    # global reference to every figure it creates, so figures made on each
    # callback would pile up unless they were explicitly closed.
    from matplotlib.figure import Figure

    fig = Figure()
    ax = fig.subplots()
    ax.scatter(X[:, 0], X[:, 1], c=labels)
    ax.set_title(title)
    return fig


def fn(data_choice, n_samples, n_clusters):
    """Regenerate the data of the chosen scenario and cluster it with KMeans.

    Args:
        data_choice: Display name of the scenario; must be a key of ``apps``.
        n_samples: Number of samples to generate.
        n_clusters: Number of clusters KMeans should look for.

    Returns:
        A tuple of the scenario description formatted as a Markdown heading,
        the ground-truth scatter figure, and the KMeans prediction scatter
        figure.

    Raises:
        KeyError: if ``data_choice`` is not a known scenario name.
    """
    # Slider values can arrive as floats, whereas both counts must be integers.
    n_samples = int(n_samples)
    n_clusters = int(n_clusters)

    # Find the app and create sample data based on the user choice.
    app = apps[data_choice]
    X, y = app.make_data(n_samples)

    # Execute the KMeans clustering.
    y_pred = app.kmeans_predict(n_clusters, X)

    fig_sample = _scatter_figure(X, y, app.name)
    fig_pred = _scatter_figure(
        X, y_pred, f"Unexpected KMeans Clusters (n_cluster={n_clusters})"
    )
    return f"## {app.description}", fig_sample, fig_pred
# Define the dashboard layout and buttons
with gr.Blocks(title=title) as demo:
    gr.Markdown(f"# {title}")

    # Scenario selector.
    with gr.Row():
        data_choice = gr.Radio(choices=data_choices, value=data_choices[0])

    # Numeric controls.
    with gr.Row():
        n_samples = gr.Slider(
            minimum=1500, maximum=3000, step=50, label="Number of Samples"
        )
        n_clusters = gr.Slider(minimum=2, maximum=8, step=1, label="Number of Clusters")

    with gr.Accordion("Description"):
        description = gr.Markdown(label="Description")

    # Ground truth and KMeans result.
    with gr.Row():
        plot_sample = gr.Plot(label="Ground Truth Cluster")
        plot_kmeans = gr.Plot(label="Unexpected KMeans Cluster")

    # Every control feeds the same callback with the same inputs and outputs.
    _inputs = [data_choice, n_samples, n_clusters]
    _outputs = [description, plot_sample, plot_kmeans]
    for _control in (data_choice, n_samples, n_clusters):
        _control.change(fn=fn, inputs=_inputs, outputs=_outputs)

demo.launch()