File size: 4,755 Bytes
e05e748
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
12638cb
e05e748
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
# %%

import io
import uuid
from dataclasses import dataclass
from typing import Optional

import matplotlib.pylab as plt
import numpy as np
import pandas as pd
import solara
import solara.lab
from matplotlib.figure import Figure
from scipy import stats
from solara.components.file_drop import FileInfo


def make_cdf_figure(
    values_left: np.ndarray, values_right: np.ndarray, stat_loc: float
) -> Figure:
    """Plot the empirical CDFs of two samples on one set of axes.

    Args:
        values_left: Sample drawn with the legend label "left".
        values_right: Sample drawn with the legend label "right".
        stat_loc: x position at which a grey vertical line is drawn.

    Returns:
        A 5x5 inch figure holding the marker line and both ECDF curves.
    """
    figure = Figure(figsize=(5, 5))
    axes = figure.subplots()

    # The marker goes in first, followed by the curves in left/right order.
    axes.axvline(stat_loc, color="grey")

    for name, sample in (("left", values_left), ("right", values_right)):
        axes.ecdf(sample, label=name)

    axes.legend()
    axes.set_xlabel("Value")
    axes.set_ylabel("CDF")

    return figure


def make_pdf_figure(
    values_left: np.ndarray, values_right: np.ndarray, stat_loc: float
) -> Figure:
    """Plot density-normalised step histograms of two samples on one set of axes.

    Args:
        values_left: Sample drawn with the legend label "left".
        values_right: Sample drawn with the legend label "right".
        stat_loc: x position at which a grey vertical line is drawn.

    Returns:
        A 5x5 inch figure holding the marker line and both histograms.
    """
    fig = Figure(figsize=(5, 5))
    ax = fig.subplots()

    ax.axvline(stat_loc, color="grey")

    # Density histograms; the bin count is chosen by numpy's "fd" rule.
    ax.hist(values_left, bins="fd", density=True, histtype="step", label="left")
    ax.hist(values_right, bins="fd", density=True, histtype="step", label="right")
    ax.legend()
    # No pyplot ``show`` call here: the figure is built directly from
    # ``Figure`` and is handed back to the caller for rendering.
    ax.set_xlabel("Value")
    ax.set_ylabel("PDF")

    return fig


# %%


def dropna(values: np.ndarray) -> np.ndarray:
    """Return the entries of ``values`` that are not NaN, in original order."""
    is_valid = np.logical_not(np.isnan(values))
    return values[is_valid]


@solara.component
def KSTestResult(values_left, values_right):
    """Run a two-sample Kolmogorov-Smirnov test and render its outcome.

    NaN entries are removed from both samples first. The card shows the CDF
    and PDF figures side by side, followed by the test statistic, the p-value
    and the location of the statistic.

    Args:
        values_left: Array holding the first sample.
        values_right: Array holding the second sample.
    """
    values_left = dropna(values_left)
    values_right = dropna(values_right)

    # ``ks_2samp`` rejects empty input, so report the problem in the UI
    # instead of letting the exception escape the render.
    if values_left.size == 0 or values_right.size == 0:
        with solara.Card("Kolmogorov-Smirnov Test"):
            solara.Error(
                "Both selections need at least one non-NaN value to run the test."
            )
        return

    kstat = stats.ks_2samp(values_left, values_right)

    fig_cdf = make_cdf_figure(values_left, values_right, kstat.statistic_location)
    fig_pdf = make_pdf_figure(values_left, values_right, kstat.statistic_location)

    with solara.Card("Kolmogorov-Smirnov Test"):
        with solara.Columns():
            solara.FigureMatplotlib(fig_cdf)
            solara.FigureMatplotlib(fig_pdf)

        solara.Markdown("# Test Result")

        solara.Info(
            f"statistic: {kstat.statistic:.3g}",
        )

        solara.Info(
            f"p-value: {kstat.pvalue:.3g}",
        )

        solara.Info(
            f"location: {kstat.statistic_location:.3g}",
        )


@dataclass
class Selection:
    """A (file, column) choice that points into the module-level ``data_store``.

    Both fields default to ``None``, meaning nothing has been chosen yet.
    """

    # Key of the chosen table in ``data_store``.
    file: Optional[str] = None
    # Name of the chosen column within that table.
    column: Optional[str] = None

    @property
    def is_set(self) -> bool:
        """Whether both a file and a column have been chosen."""
        return self.file is not None and self.column is not None

    @property
    def columns(self) -> list[str]:
        """Column names of the chosen file.

        Returns an empty list when no file is chosen or when the chosen file
        is not (or no longer) present in ``data_store``.
        """
        if self.file is None:
            return []
        # ``data_store`` can be replaced while a selection still holds an old
        # file name, so look the file up without assuming it exists.
        frame = data_store.value.get(self.file)
        if frame is None:
            return []
        return list(frame.columns)

    @property
    def array(self) -> np.ndarray:
        """Values of the chosen column as a NumPy array.

        Returns an empty array when the selection is incomplete or when the
        chosen file or column cannot be found in ``data_store``.
        """
        if not self.is_set:
            return np.array([])
        frame = data_store.value.get(self.file)
        if frame is None or self.column not in frame.columns:
            return np.array([])
        return frame[self.column].to_numpy()


def all_set(selections: list[Selection]) -> bool:
    """Tell whether every selection in ``selections`` reports ``is_set``.

    An empty list yields ``True``.
    """
    for candidate in selections:
        if not candidate.is_set:
            return False
    return True


@solara.component
def Selectors(selection: solara.Reactive[Selection]):
    """Render the file and column drop-downs bound to one ``Selection``.

    Args:
        selection: Reactive selection that both drop-downs read and update.
    """
    solara.Select(
        label="Select file",
        values=list(data_store.value),
        value=selection.value.file,
        # Choosing a file also clears the column, which belonged to the
        # previously chosen file.
        on_value=lambda x: selection.update(file=x, column=None),
    )

    solara.Select(
        label="Select column",
        values=selection.value.columns,
        value=selection.value.column,
        on_value=lambda x: selection.update(column=x),
    )


# Files most recently delivered by the upload widget's ``on_file`` callback.
file_info: solara.Reactive[list[FileInfo]] = solara.reactive([])
# Parsed tables keyed by uploaded file name; rewritten by ``load_data`` in KSApp.
data_store: solara.Reactive[dict[str, pd.DataFrame]] = solara.Reactive({})

# Independent file/column choices for the two samples being compared.
selection_left = solara.reactive(Selection())
selection_right = solara.reactive(Selection())


@solara.component
def KSApp():
    """Top-level component: CSV upload, sample selection and test result.

    The left card holds the upload widget and one pair of file/column
    selectors per sample. The right side shows either a hint, while a
    selection is incomplete, or the Kolmogorov-Smirnov result.
    """

    def load_data():
        # Parse every uploaded file into a DataFrame keyed by its file name.
        frames = {}
        for info in file_info.value:
            frames[info["name"]] = pd.read_csv(io.BytesIO(info["data"]))
        data_store.set(frames)

    # Re-parse only when the list of uploaded files changes.
    solara.use_memo(load_data, dependencies=[file_info.value])

    # Create the reactive once per component instance rather than on every
    # render, so a value written by ``clear_all`` is not thrown away.
    upload_key = solara.use_memo(
        lambda: solara.reactive(uuid.uuid4()), dependencies=[]
    )

    def clear_all():
        # Reset the upload key, both selections and all loaded data.
        # NOTE: currently unused; the button that triggers it is commented
        # out below.
        upload_key.set(uuid.uuid4())
        selection_left.set(Selection())
        selection_right.set(Selection())
        file_info.set([])
        data_store.set({})

    with solara.ColumnsResponsive([3, 9]):
        with solara.Card("Input"):
            solara.FileDropMultiple(
                label="Upload CSV files",
                on_file=file_info.set,
                lazy=False,
            )  # .key(upload_key.value.hex)
            solara.Text("Select left:")
            Selectors(selection_left)
            solara.Text("Select right:")
            Selectors(selection_right)
            # solara.Button(label="Clear", on_click=clear_all)

        if not all_set([selection_left.value, selection_right.value]):
            with solara.Card():
                solara.Text("Please upload data and select both files and columns")

        else:
            KSTestResult(selection_left.value.array, selection_right.value.array)


# Element built from ``KSApp`` at import time and bound to the module-level ``page``.
# NOTE(review): presumably the name solara looks up as the app entry point — confirm.
page = KSApp()