Spaces:

ehengao
/

sklearn-kmeans-assumptions

Runtime error

File size: 5,454 Bytes

4a27dd7

"""This dashboard is a live demonstration of the sklearn document at
https://scikit-learn.org/stable/auto_examples/cluster/plot_kmeans_assumptions.html#sphx-glr-auto-examples-cluster-plot-kmeans-assumptions-py
"""
import numpy as np
import typing as tp
import pandas as pd
import gradio as gr
from sklearn.datasets import make_blobs
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt

# Dashboard title: used as the gr.Blocks page title and as the top Markdown heading.
title = "Demonstration of k-means assumptions"
# Fixed seed handed to every make_blobs and KMeans call in this file.
random_state = 170
# 2x2 matrix that AnisoDistBlobs applies to the blob coordinates via np.dot.
transformation = [[0.60834549, -0.63667341], [-0.40887718, 0.85253229]]

# Define one App subclass for each of the four demo scenarios.
class App:
    """Common interface for one demo scenario.

    A concrete scenario supplies a display ``name``, a ``description`` text,
    a way to generate sample data and a way to cluster that data.
    """

    # Label identifying the scenario (used as a lookup key and plot title).
    name: tp.ClassVar[str]
    # Explanatory text shown alongside the plots.
    description: tp.ClassVar[str]

    def make_data(self, n_samples: int) -> tp.Tuple[np.ndarray, np.ndarray]:
        """Return ``(X, y)``: sample points and their ground-truth labels.

        Must be overridden; the base implementation always raises
        ``NotImplementedError``.
        """
        raise NotImplementedError

    def kmeans_predict(self, n_cluster: int, X: np.ndarray) -> np.ndarray:
        """Return the cluster label predicted for every row of ``X``.

        Must be overridden; the base implementation always raises
        ``NotImplementedError``.
        """
        raise NotImplementedError

class MixGaussianBlobs(App):
    """Baseline scenario: blobs straight from ``make_blobs``, clustered by KMeans."""

    name = "Mixture of Gaussian Blobs"
    description = (
        "In a real setting there is no uniquely defined true number of "
        "clusters. An appropriate number of clusters has to be decided "
        "from data-based criteria and knowledge of the intended goal."
    )

    def make_data(self, n_samples):
        """Generate ``n_samples`` blob points with the module-wide seed.

        Returns the ``(X, y)`` pair produced by ``make_blobs``.
        """
        points, labels = make_blobs(n_samples=n_samples, random_state=random_state)
        return points, labels

    def kmeans_predict(self, n_clusters, X):
        """Fit KMeans with ``n_clusters`` clusters on ``X`` and return the labels."""
        model = KMeans(
            n_clusters=n_clusters,
            n_init="auto",
            random_state=random_state,
        )
        return model.fit_predict(X)


class AnisoDistBlobs(MixGaussianBlobs):
    """Scenario whose blobs are linearly transformed by ``transformation``."""

    name = "Anisotropically Distributed Blobs"
    description = (
        "k-means consists of minimizing sample’s euclidean distances to the "
        "centroid of the cluster they are assigned to. As a consequence, "
        "k-means is more appropriate for clusters that are isotropic and "
        "normally distributed (i.e. spherical gaussians)"
    )

    def make_data(self, n_samples):
        """Generate the parent's blobs, then apply the module-level matrix.

        Labels are returned untouched; only the coordinates are multiplied
        by ``transformation``.
        """
        blobs, labels = super().make_data(n_samples=n_samples)
        transformed = np.dot(blobs, transformation)
        return transformed, labels


class UnequalVariance(MixGaussianBlobs):
    """Scenario whose three blobs are drawn with different standard deviations."""

    name = "Unequal Variance"
    description = (
        "k-means is equivalent to taking the maximum likelihood estimator "
        "for a 'mixture' of k gaussian distributions with the same variances "
        "but with possibly different  means."
    )

    def make_data(self, n_samples):
        """Generate blobs using a distinct ``cluster_std`` for each blob."""
        per_blob_std = [1.0, 2.5, 0.5]
        return make_blobs(
            n_samples=n_samples,
            cluster_std=per_blob_std,
            random_state=random_state,
        )


class UnevenlySizedBlobs(MixGaussianBlobs):
    """Scenario in which the blobs are truncated to very different sizes."""

    name = "Unevenly Sized Blobs"
    description = (
        "There is no theoretical result about k-means that states that it requires similar"
        " cluster sizes to perform well, yet minimizing euclidean distances does mean that"
        " the more sparse and high-dimensional the problem is, the higher is the need to run "
        "the algorithm with different centroid seeds to ensure a global minimal inertia."
    )

    # Maximum number of points kept for labels 0, 1 and 2 respectively.
    cluster_sizes: tp.ClassVar[tp.Tuple[int, ...]] = (500, 100, 10)

    def make_data(self, n_samples):
        """Generate the parent's blobs and keep only a capped subset per label.

        For label ``i`` at most ``cluster_sizes[i]`` points are kept, in the
        order the parent produced them.

        Returns:
            ``(X, y)`` as two NumPy arrays with the same number of rows.
        """
        X, y = super().make_data(n_samples=n_samples)
        kept_points = []
        kept_labels = []
        for label, cap in enumerate(self.cluster_sizes):
            members = X[y == label][:cap]
            kept_points.append(members)
            # Derive the label count from the slice actually taken: a cluster
            # may hold fewer than `cap` points, and a hard-coded count would
            # then leave the labels out of step with the points.
            kept_labels.append(np.full(len(members), label))
        return np.vstack(kept_points), np.concatenate(kept_labels)


# Instantiate each scenario once, then expose the instances by display name
# (`apps`) together with the ordered list of those names (`data_choices`).
_apps = [
    scenario()
    for scenario in (
        MixGaussianBlobs,
        AnisoDistBlobs,
        UnequalVariance,
        UnevenlySizedBlobs,
    )
]
data_choices = [instance.name for instance in _apps]
apps = dict(zip(data_choices, _apps))


# Define the callback that is triggered whenever the user changes the radio button or a slider.
def fn(data_choice, n_samples, n_clusters):
    """Regenerate the data, run KMeans and draw both scatter plots.

    Args:
        data_choice: Name of the scenario to run; must be a key of ``apps``.
        n_samples: Number of samples requested from the scenario.
        n_clusters: Number of clusters KMeans is asked to find.

    Returns:
        A 3-tuple ``(markdown, fig_sample, fig_pred)``: the scenario
        description as a Markdown heading, the figure coloured by the
        ground-truth labels, and the figure coloured by the KMeans labels.

    Raises:
        KeyError: If ``data_choice`` is not a known scenario name.
    """
    # Figures are built with matplotlib's object-oriented API rather than
    # plt.subplots(): pyplot keeps a global reference to every figure it
    # creates, so a callback invoked on each UI change would accumulate
    # figures that are never closed.
    from matplotlib.figure import Figure

    # Find the app and create sample data based on the user choice.
    app = apps[data_choice]
    # Coerce defensively in case the UI delivers the slider values as floats.
    n_samples = int(n_samples)
    n_clusters = int(n_clusters)
    X, y = app.make_data(n_samples)

    # Execute the KMeans clustering.
    y_pred = app.kmeans_predict(n_clusters, X)

    # Left plot: points coloured by the labels they were generated with.
    fig_sample = Figure()
    ax_sample = fig_sample.subplots()
    ax_sample.set_title(app.name)
    ax_sample.scatter(X[:, 0], X[:, 1], c=y)

    # Right plot: the same points coloured by the KMeans assignment.
    fig_pred = Figure()
    ax_pred = fig_pred.subplots()
    ax_pred.scatter(X[:, 0], X[:, 1], c=y_pred)
    ax_pred.set_title(f"Unexpected KMeans Clusters (n_cluster={n_clusters})")

    return f"## {app.description}", fig_sample, fig_pred


# Build the dashboard: scenario selector, two sliders, a description panel and
# two plots; then wire every control to the shared callback and start the app.
with gr.Blocks(title=title) as demo:
    gr.Markdown(f"# {title}")

    with gr.Row():
        data_choice = gr.Radio(choices=data_choices, value=data_choices[0])

    with gr.Row():
        n_samples = gr.Slider(
            label="Number of Samples", minimum=1500, maximum=3000, step=50
        )
        n_clusters = gr.Slider(
            label="Number of Clusters", minimum=2, maximum=8, step=1
        )

    with gr.Accordion("Description"):
        description = gr.Markdown(label="Description")

    with gr.Row():
        plot_sample = gr.Plot(label="Ground Truth Cluster")
        plot_kmeans = gr.Plot(label="Unexpected KMeans Cluster")

    # Every control triggers the same callback with the same inputs and
    # outputs, so the handlers are registered in one pass.
    _controls = [data_choice, n_samples, n_clusters]
    _displays = [description, plot_sample, plot_kmeans]
    for _control in _controls:
        _control.change(fn=fn, inputs=_controls, outputs=_displays)


demo.launch()