File size: 5,454 Bytes
4a27dd7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
"""This dashboard is a live demonstration of the sklearn document at
https://scikit-learn.org/stable/auto_examples/cluster/plot_kmeans_assumptions.html#sphx-glr-auto-examples-cluster-plot-kmeans-assumptions-py
"""
import numpy as np
import typing as tp
import pandas as pd
import gradio as gr
from sklearn.datasets import make_blobs
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt

title = "Demonstration of k-means assumptions"
# Fixed seed reused for both data generation and KMeans so every run of the
# dashboard reproduces the same clusters.
random_state = 170
# Linear map applied to the blobs to make them anisotropic (see AnisoDistBlobs);
# values copied from the sklearn plot_kmeans_assumptions example.
transformation = [[0.60834549, -0.63667341], [-0.40887718, 0.85253229]]

# Defines 4 apps, one for each demo scenario.
class App:
    """Base interface for one k-means demo scenario.

    Subclasses set `name`/`description` and implement `make_data` and
    `kmeans_predict`; the dashboard callback only talks to this interface.
    """

    # Display name shown as the radio-button choice and plot title.
    name: tp.ClassVar[str]
    # Markdown text shown in the "Description" accordion.
    description: tp.ClassVar[str]

    def make_data(self, n_samples: int) -> tp.Tuple[np.ndarray, np.ndarray]:
        """Return (X, y): sample coordinates and ground-truth labels."""
        raise NotImplementedError()

    def kmeans_predict(self, n_cluster: int, X: np.ndarray) -> np.ndarray:
        """Return predicted cluster labels for the rows of X."""
        raise NotImplementedError()

class MixGaussianBlobs(App):
    """Baseline scenario: plain isotropic Gaussian blobs."""

    name = "Mixture of Gaussian Blobs"
    description = (
        "In a real setting there is no uniquely defined true number of clusters. "
        "An appropriate number of clusters has to be decided from data-based criteria"
        " and knowledge of the intended goal."
    )

    def make_data(self, n_samples):
        """Return (X, y) drawn from the default three Gaussian blobs."""
        X, y = make_blobs(n_samples=n_samples, random_state=random_state)
        return X, y

    def kmeans_predict(self, n_clusters, X):
        """Fit KMeans on X and return its predicted labels."""
        model = KMeans(n_clusters=n_clusters, n_init="auto", random_state=random_state)
        return model.fit_predict(X)


class AnisoDistBlobs(MixGaussianBlobs):
    """Scenario where the blobs are skewed by a fixed linear transformation."""

    name = "Anisotropically Distributed Blobs"
    description = (
        "k-means consists of minimizing sample’s euclidean distances to the centroid of the"
        " cluster they are assigned to. As a consequence, k-means is more appropriate for "
        "clusters that are isotropic and normally distributed (i.e. spherical gaussians)"
    )

    def make_data(self, n_samples):
        """Generate the baseline blobs, then stretch them anisotropically."""
        X, y = super().make_data(n_samples=n_samples)
        # Matrix product with the module-level `transformation` skews the data.
        return X @ transformation, y


class UnequalVariance(MixGaussianBlobs):
    """Scenario where each blob has a different standard deviation."""

    name = "Unequal Variance"
    description = (
        "k-means is equivalent to taking the maximum likelihood estimator for a 'mixture' "
        "of k gaussian distributions with the same variances but with possibly different "
        " means."
    )

    def make_data(self, n_samples):
        """Return (X, y) with one wide, one narrow, and one default blob."""
        stds = [1.0, 2.5, 0.5]
        return make_blobs(
            n_samples=n_samples, cluster_std=stds, random_state=random_state
        )


class UnevenlySizedBlobs(MixGaussianBlobs):
    """Scenario keeping very different numbers of points per cluster."""

    name = "Unevenly Sized Blobs"
    description = (
        "There is no theoretical result about k-means that states that it requires similar"
        " cluster sizes to perform well, yet minimizing euclidean distances does mean that"
        " the more sparse and high-dimensional the problem is, the higher is the need to run "
        "the algorithm with different centroid seeds to ensure a global minimal inertia."
    )

    def make_data(self, n_samples):
        """Generate blobs, then keep unequal subsets (500/100/10) of the clusters.

        Returns (X_filter, y_filter) where y_filter is rebuilt from the actual
        slice lengths, so X and y always stay aligned even if a cluster yields
        fewer than the requested number of samples.
        """
        X, y = super().make_data(n_samples=n_samples)
        sizes = (500, 100, 10)
        parts = [X[y == label][:size] for label, size in enumerate(sizes)]
        X_filter = np.vstack(parts)
        # Derive labels from what was actually kept (the original hard-coded
        # [0]*500 + [1]*100 + [2]*10, which desynchronizes from X_filter
        # whenever a cluster has fewer points than requested).
        y_filter = [label for label, part in enumerate(parts) for _ in range(len(part))]
        return X_filter, y_filter


# Instantiate one app per scenario; the radio button selects among them by name.
_apps = [
    MixGaussianBlobs(),
    AnisoDistBlobs(),
    UnequalVariance(),
    UnevenlySizedBlobs(),
]
# Lookup table keyed by display name, plus the ordered list of choices.
apps = {app.name: app for app in _apps}
data_choices = list(apps)


# Define the callback to be triggered when a button or a slider is used by the user.
def fn(data_choice, n_samples, n_clusters):
    """Build the description markdown and both scatter plots for one scenario.

    Called whenever the user changes the data choice or either slider.
    Returns (description_markdown, ground_truth_figure, kmeans_figure).
    """
    # Resolve the selected scenario and generate its sample data.
    app = apps[data_choice]
    X, y = app.make_data(n_samples)

    # Ground-truth clusters, colored by the true labels.
    fig_sample, ax_sample = plt.subplots()
    ax_sample.set_title(app.name)
    ax_sample.scatter(X[:, 0], X[:, 1], c=y)

    # KMeans clustering of the same points, colored by predicted labels.
    y_pred = app.kmeans_predict(n_clusters, X)
    fig_pred, ax_pred = plt.subplots()
    ax_pred.scatter(X[:, 0], X[:, 1], c=y_pred)
    ax_pred.set_title(f"Unexpected KMeans Clusters (n_cluster={n_clusters})")

    return f"## {app.description}", fig_sample, fig_pred


# Define the dashboard layout and buttons
with gr.Blocks(title=title) as demo:
    gr.Markdown(f"# {title}")
    with gr.Row():
        data_choice = gr.Radio(
            choices=data_choices,
            value=data_choices[0],
        )
    with gr.Row():
        n_samples = gr.Slider(
            minimum=1500, maximum=3000, step=50, label="Number of Samples"
        )
        n_clusters = gr.Slider(minimum=2, maximum=8, step=1, label="Number of Clusters")
    with gr.Accordion("Description"):
        description = gr.Markdown(label="Description")
    with gr.Row():
        plot_sample = gr.Plot(label="Ground Truth Cluster")
        plot_kmeans = gr.Plot(label="Unexpected KMeans Cluster")

    data_choice.change(
        fn=fn,
        inputs=[data_choice, n_samples, n_clusters],
        outputs=[description, plot_sample, plot_kmeans],
    )
    n_samples.change(
        fn=fn,
        inputs=[data_choice, n_samples, n_clusters],
        outputs=[description, plot_sample, plot_kmeans],
    )
    n_clusters.change(
        fn=fn,
        inputs=[data_choice, n_samples, n_clusters],
        outputs=[description, plot_sample, plot_kmeans],
    )


demo.launch()