Add example app
- app.py +249 -0
- requirements.txt +4 -0
app.py
ADDED
@@ -0,0 +1,249 @@
import streamlit as st
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
import altair as alt
import time
import zipfile

# Page title
st.set_page_config(page_title='ML Model Building', page_icon='🤖')
st.title('🤖 ML Model Building')

with st.expander('About this app'):
    st.markdown('**What can this app do?**')
    st.info('This app allows users to build a machine learning (ML) model in an end-to-end workflow. In particular, this encompasses data upload, data pre-processing, ML model building and post-model analysis.')

    st.markdown('**How to use the app?**')
    st.warning('To engage with the app, go to the sidebar and 1. select a data set and 2. adjust the model parameters using the various slider widgets. This will initiate the ML model building process, display the model results and allow users to download the generated data.')

    st.markdown('**Under the hood**')
    st.markdown('Data sets:')
    st.code('''- Drug solubility data set
''', language='markdown')

    st.markdown('Libraries used:')
    st.code('''- Pandas for data wrangling
- Scikit-learn for building a machine learning model
- Altair for chart creation
- Streamlit for user interface
''', language='markdown')

# Sidebar for accepting input parameters
with st.sidebar:
    # Load data
    st.header('1. Input data')

    st.markdown('**1.1. Use custom data**')
    uploaded_file = st.file_uploader("Upload a CSV file", type=["csv"])
    if uploaded_file is not None:
        df = pd.read_csv(uploaded_file, index_col=False)

    # Download example data
    @st.cache_data
    def convert_df(input_df):
        return input_df.to_csv(index=False).encode('utf-8')
    example_csv = pd.read_csv('https://raw.githubusercontent.com/dataprofessor/data/master/delaney_solubility_with_descriptors.csv')
    csv = convert_df(example_csv)
    st.download_button(
        label="Download example CSV",
        data=csv,
        file_name='delaney_solubility_with_descriptors.csv',
        mime='text/csv',
    )

    # Select example data
    st.markdown('**1.2. Use example data**')
    example_data = st.toggle('Load example data')
    if example_data:
        df = pd.read_csv('https://raw.githubusercontent.com/dataprofessor/data/master/delaney_solubility_with_descriptors.csv')

    st.header('2. Set Parameters')
    parameter_split_size = st.slider('Data split ratio (% for Training Set)', 10, 90, 80, 5)

    st.subheader('2.1. Learning Parameters')
    with st.expander('See parameters'):
        parameter_n_estimators = st.slider('Number of estimators (n_estimators)', 100, 1000, 100, 100)
        parameter_max_features = st.select_slider('Max features (max_features)', options=['all', 'sqrt', 'log2'])
        parameter_min_samples_split = st.slider('Minimum number of samples required to split an internal node (min_samples_split)', 2, 10, 2, 1)
        parameter_min_samples_leaf = st.slider('Minimum number of samples required to be at a leaf node (min_samples_leaf)', 1, 10, 2, 1)

    st.subheader('2.2. General Parameters')
    with st.expander('See parameters', expanded=False):
        parameter_random_state = st.slider('Seed number (random_state)', 0, 1000, 42, 1)
        parameter_criterion = st.select_slider('Performance measure (criterion)', options=['squared_error', 'absolute_error', 'friedman_mse'])
        parameter_bootstrap = st.select_slider('Bootstrap samples when building trees (bootstrap)', options=[True, False])
        parameter_oob_score = st.select_slider('Whether to use out-of-bag samples to estimate the R^2 on unseen data (oob_score)', options=[False, True])

    sleep_time = st.slider('Sleep time', 0, 3, 0)

# Initiate the model building process
if uploaded_file or example_data:
    with st.status("Running ...", expanded=True) as status:

        st.write("Loading data ...")
        time.sleep(sleep_time)

        st.write("Preparing data ...")
        time.sleep(sleep_time)
        # The last column is treated as the target variable y
        X = df.iloc[:, :-1]
        y = df.iloc[:, -1]

        st.write("Splitting data ...")
        time.sleep(sleep_time)
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=(100 - parameter_split_size) / 100, random_state=parameter_random_state)

        st.write("Model training ...")
        time.sleep(sleep_time)

        # 'all' maps to max_features=None, i.e. consider every feature at each split
        if parameter_max_features == 'all':
            parameter_max_features = None
            parameter_max_features_metric = X.shape[1]
        else:
            parameter_max_features_metric = parameter_max_features

        rf = RandomForestRegressor(
            n_estimators=parameter_n_estimators,
            max_features=parameter_max_features,
            min_samples_split=parameter_min_samples_split,
            min_samples_leaf=parameter_min_samples_leaf,
            random_state=parameter_random_state,
            criterion=parameter_criterion,
            bootstrap=parameter_bootstrap,
            oob_score=parameter_oob_score)
        rf.fit(X_train, y_train)

        st.write("Applying model to make predictions ...")
        time.sleep(sleep_time)
        y_train_pred = rf.predict(X_train)
        y_test_pred = rf.predict(X_test)

        st.write("Evaluating performance metrics ...")
        time.sleep(sleep_time)
        train_mse = mean_squared_error(y_train, y_train_pred)
        train_r2 = r2_score(y_train, y_train_pred)
        test_mse = mean_squared_error(y_test, y_test_pred)
        test_r2 = r2_score(y_test, y_test_pred)

        st.write("Displaying performance metrics ...")
        time.sleep(sleep_time)
        parameter_criterion_string = ' '.join([x.capitalize() for x in parameter_criterion.split('_')])
        rf_results = pd.DataFrame(['Random forest', train_mse, train_r2, test_mse, test_r2]).transpose()
        rf_results.columns = ['Method', f'Training {parameter_criterion_string}', 'Training R2', f'Test {parameter_criterion_string}', 'Test R2']
        # Convert the metric columns (all but 'Method') to numerics
        for col in rf_results.columns[1:]:
            rf_results[col] = pd.to_numeric(rf_results[col])
        # Round to 3 digits
        rf_results = rf_results.round(3)

        status.update(label="Status", state="complete", expanded=False)

    # Display data info
    st.header('Input data', divider='rainbow')
    col = st.columns(4)
    col[0].metric(label="No. of samples", value=X.shape[0], delta="")
    col[1].metric(label="No. of X variables", value=X.shape[1], delta="")
    col[2].metric(label="No. of Training samples", value=X_train.shape[0], delta="")
    col[3].metric(label="No. of Test samples", value=X_test.shape[0], delta="")

    with st.expander('Initial dataset', expanded=True):
        st.dataframe(df, height=210, use_container_width=True)
    with st.expander('Train split', expanded=False):
        train_col = st.columns((3, 1))
        with train_col[0]:
            st.markdown('**X**')
            st.dataframe(X_train, height=210, hide_index=True, use_container_width=True)
        with train_col[1]:
            st.markdown('**y**')
            st.dataframe(y_train, height=210, hide_index=True, use_container_width=True)
    with st.expander('Test split', expanded=False):
        test_col = st.columns((3, 1))
        with test_col[0]:
            st.markdown('**X**')
            st.dataframe(X_test, height=210, hide_index=True, use_container_width=True)
        with test_col[1]:
            st.markdown('**y**')
            st.dataframe(y_test, height=210, hide_index=True, use_container_width=True)

    # Zip dataset files
    df.to_csv('dataset.csv', index=False)
    X_train.to_csv('X_train.csv', index=False)
    y_train.to_csv('y_train.csv', index=False)
    X_test.to_csv('X_test.csv', index=False)
    y_test.to_csv('y_test.csv', index=False)

    list_files = ['dataset.csv', 'X_train.csv', 'y_train.csv', 'X_test.csv', 'y_test.csv']
    with zipfile.ZipFile('dataset.zip', 'w') as zipF:
        for file in list_files:
            zipF.write(file, compress_type=zipfile.ZIP_DEFLATED)

    with open('dataset.zip', 'rb') as datazip:
        btn = st.download_button(
            label='Download ZIP',
            data=datazip,
            file_name="dataset.zip",
            mime="application/octet-stream"
        )

    # Display model parameters
    st.header('Model parameters', divider='rainbow')
    parameters_col = st.columns(3)
    parameters_col[0].metric(label="Data split ratio (% for Training Set)", value=parameter_split_size, delta="")
    parameters_col[1].metric(label="Number of estimators (n_estimators)", value=parameter_n_estimators, delta="")
    parameters_col[2].metric(label="Max features (max_features)", value=parameter_max_features_metric, delta="")

    # Display feature importance plot
    importances = rf.feature_importances_
    feature_names = list(X.columns)
    forest_importances = pd.Series(importances, index=feature_names)
    df_importance = forest_importances.reset_index().rename(columns={'index': 'feature', 0: 'value'})

    bars = alt.Chart(df_importance).mark_bar(size=40).encode(
        x='value:Q',
        y=alt.Y('feature:N', sort='-x')
    ).properties(height=250)

    performance_col = st.columns((2, 0.2, 3))
    with performance_col[0]:
        st.header('Model performance', divider='rainbow')
        st.dataframe(rf_results.T.reset_index().rename(columns={'index': 'Parameter', 0: 'Value'}))
    with performance_col[2]:
        st.header('Feature importance', divider='rainbow')
        st.altair_chart(bars, theme='streamlit', use_container_width=True)

    # Prediction results
    st.header('Prediction results', divider='rainbow')
    s_y_train = pd.Series(y_train, name='actual').reset_index(drop=True)
    s_y_train_pred = pd.Series(y_train_pred, name='predicted').reset_index(drop=True)
    df_train = pd.DataFrame(data=[s_y_train, s_y_train_pred], index=None).T
    df_train['class'] = 'train'

    s_y_test = pd.Series(y_test, name='actual').reset_index(drop=True)
    s_y_test_pred = pd.Series(y_test_pred, name='predicted').reset_index(drop=True)
    df_test = pd.DataFrame(data=[s_y_test, s_y_test_pred], index=None).T
    df_test['class'] = 'test'

    df_prediction = pd.concat([df_train, df_test], axis=0)

    prediction_col = st.columns((2, 0.2, 3))

    # Display dataframe
    with prediction_col[0]:
        st.dataframe(df_prediction, height=320, use_container_width=True)

    # Display scatter plot of actual vs predicted values
    with prediction_col[2]:
        scatter = alt.Chart(df_prediction).mark_circle(size=60).encode(
            x='actual',
            y='predicted',
            color='class'
        )
        st.altair_chart(scatter, theme='streamlit', use_container_width=True)

# Ask for CSV upload if none is detected
else:
    st.warning('👈 Upload a CSV file or click *"Load example data"* to get started!')
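
A note on the expected input format: app.py derives features and target purely by position (X = df.iloc[:, :-1], y = df.iloc[:, -1]), so any custom CSV must keep the target variable in the last column, as in the Delaney solubility example. A minimal sketch of a compatible file, using hypothetical column names:

# Sketch of a custom CSV the app can consume: numeric feature
# columns first, target last. All names here are hypothetical.
import pandas as pd

demo = pd.DataFrame({
    'feature_1': [0.5, 1.2, 3.4, 2.1],   # descriptor (feature)
    'feature_2': [10.0, 7.5, 3.3, 8.8],  # descriptor (feature)
    'target':    [1.1, 0.9, 0.2, 0.7],   # last column is treated as y
})
demo.to_csv('my_data.csv', index=False)  # upload via the sidebar uploader
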
requirements.txt
ADDED
@@ -0,0 +1,4 @@
streamlit==1.29.0
pandas>=1.3.0
scikit-learn
altair>=4.0
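
To try the app locally, install the pinned dependencies with pip install -r requirements.txt and launch it with streamlit run app.py (Streamlit serves on localhost:8501 by default). After a run completes, the "Download ZIP" button ships dataset.csv plus the four split files; a small sketch of reading the splits back out, assuming the downloaded dataset.zip sits in the working directory:

# Sketch: recover the train split from the downloaded dataset.zip
import zipfile
import pandas as pd

with zipfile.ZipFile('dataset.zip') as z:
    X_train = pd.read_csv(z.open('X_train.csv'))
    y_train = pd.read_csv(z.open('y_train.csv'))
print(X_train.shape, y_train.shape)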