Add example app
Browse files- +249 -0
- requirements.txt +4 -0
@@ -0,0 +1,249 @@
1 |
import io
import time
import zipfile

import altair as alt
import numpy as np
import pandas as pd
import streamlit as st
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
10 |
11 |
# Page title
st.set_page_config(page_title='ML Model Building', page_icon='🤖')
st.title('🤖 ML Model Building')

# Collapsible help panel: what the app does, how to use it, what it is built on.
with st.expander('About this app'):
    st.markdown('**What can this app do?**')
    # NOTE(review): the call name on this line was lost in extraction; st.info is
    # presumed (an alert box, mirroring the st.warning answer below) - confirm.
    st.info('This app allow users to build a machine learning (ML) model in an end-to-end workflow. Particularly, this encompasses data upload, data pre-processing, ML model building and post-model analysis.')

    st.markdown('**How to use the app?**')
    st.warning('To engage with the app, go to the sidebar and 1. Select a data set and 2. Adjust the model parameters by adjusting the various slider widgets. As a result, this would initiate the ML model building process, display the model results as well as allowing users to download the generated models and accompanying data.')

    st.markdown('**Under the hood**')
    st.markdown('Data sets:')
    st.code('- Drug solubility data set\n', language='markdown')

    st.markdown('Libraries used:')
    st.code(
        '- Pandas for data wrangling\n'
        '- Scikit-learn for building a machine learning model\n'
        '- Altair for chart creation\n'
        '- Streamlit for user interface\n',
        language='markdown',
    )
33 |
34 |
35 |
# Sidebar for accepting input parameters
with st.sidebar:
    # Load data
    # Section numbering follows the rest of the sidebar: "1." is the section,
    # "1.1." / "1.2." are its two data sources, "2." / "2.1." / "2.2." come next.
    st.header('1. Input data')

    st.markdown('**1.1. Use custom data**')
    uploaded_file = st.file_uploader("Upload a CSV file", type=["csv"])
    if uploaded_file is not None:
        # index_col=False: never promote the first CSV column to the index.
        df = pd.read_csv(uploaded_file, index_col=False)
44 |
45 |
# Download example data
46 |
47 |
# NOTE(review): the line directly above this definition is empty in the extracted
# source; a decorator may have been lost there - confirm against the original.
def convert_df(input_df: pd.DataFrame) -> bytes:
    """Serialize a DataFrame to UTF-8 encoded CSV bytes.

    The index is not written, so the output holds only the header row and
    the data rows.

    Args:
        input_df: Frame to serialize.

    Returns:
        The CSV text encoded as UTF-8 bytes, suitable for a download payload.
    """
    return input_df.to_csv(index=False).encode('utf-8')
49 |
# NOTE(review): the data location was blank ('') in the extracted source. This is
# presumed to be the drug solubility data set named in the "About" panel -
# confirm the URL and the download file name before shipping.
EXAMPLE_DATA_URL = 'https://raw.githubusercontent.com/dataprofessor/data/master/delaney_solubility_with_descriptors.csv'

with st.sidebar:
    # Download example data
    example_csv = pd.read_csv(EXAMPLE_DATA_URL)
    csv = convert_df(example_csv)
    st.download_button(
        label="Download example CSV",
        data=csv,
        file_name='delaney_solubility_with_descriptors.csv',
        mime='text/csv',
    )

    # Select example data
    st.markdown('**1.2. Use example data**')
    example_data = st.toggle('Load example data')
    if example_data:
        # Reuse the frame loaded above instead of fetching the same file twice.
        df = example_csv

    st.header('2. Set Parameters')
    parameter_split_size = st.slider('Data split ratio (% for Training Set)', 10, 90, 80, 5)

    st.subheader('2.1. Learning Parameters')
    with st.expander('See parameters'):
        # Minimum is 100, not 0: a forest needs at least one tree, and with a
        # step of 100 the remaining choices (100, 200, ..., 1000) are unchanged.
        parameter_n_estimators = st.slider('Number of estimators (n_estimators)', 100, 1000, 100, 100)
        parameter_max_features = st.select_slider('Max features (max_features)', options=['all', 'sqrt', 'log2'])
        parameter_min_samples_split = st.slider('Minimum number of samples required to split an internal node (min_samples_split)', 2, 10, 2, 1)
        parameter_min_samples_leaf = st.slider('Minimum number of samples required to be at a leaf node (min_samples_leaf)', 1, 10, 2, 1)

    st.subheader('2.2. General Parameters')
    with st.expander('See parameters', expanded=False):
        parameter_random_state = st.slider('Seed number (random_state)', 0, 1000, 42, 1)
        parameter_criterion = st.select_slider('Performance measure (criterion)', options=['squared_error', 'absolute_error', 'friedman_mse'])
        parameter_bootstrap = st.select_slider('Bootstrap samples when building trees (bootstrap)', options=[True, False])
        parameter_oob_score = st.select_slider('Whether to use out-of-bag samples to estimate the R^2 on unseen data (oob_score)', options=[False, True])

    # Artificial pause (seconds) inserted between the progress messages.
    sleep_time = st.slider('Sleep time', 0, 3, 0)
82 |
83 |
# Initiate the model building process
#
# Runs only once a data source is selected. Reads `df` and the `parameter_*` /
# `sleep_time` values set in the sidebar, trains a random forest, then renders
# the data splits, parameters, metrics, feature importances and predictions.
# Lines missing from the extracted source (progress pauses, constructor
# arguments, chart encodings, download arguments) were reconstructed from the
# names used around them; guesses are marked NOTE(review).
if uploaded_file or example_data:
    with st.status("Running ...", expanded=True) as status:

        st.write("Loading data ...")
        time.sleep(sleep_time)

        st.write("Preparing data ...")
        time.sleep(sleep_time)
        # The last column is used as the target; all other columns are features.
        X = df.iloc[:, :-1]
        y = df.iloc[:, -1]

        st.write("Splitting data ...")
        time.sleep(sleep_time)
        X_train, X_test, y_train, y_test = train_test_split(
            X,
            y,
            test_size=(100 - parameter_split_size) / 100,
            random_state=parameter_random_state,
        )

        st.write("Model training ...")
        time.sleep(sleep_time)

        # 'all' is a UI label only: the estimator expects None for "use every
        # feature". The display value must be set on both branches, otherwise
        # choosing 'sqrt' or 'log2' raises NameError in the parameters section.
        if parameter_max_features == 'all':
            parameter_max_features = None
            parameter_max_features_metric = X.shape[1]
        else:
            parameter_max_features_metric = parameter_max_features

        # Out-of-bag scoring is only defined for bootstrapped forests; the
        # estimator raises ValueError when asked for it without bootstrap.
        use_oob_score = parameter_oob_score and parameter_bootstrap
        if parameter_oob_score and not parameter_bootstrap:
            st.warning('Out-of-bag scoring requires bootstrap sampling; oob_score was disabled for this run.')

        rf = RandomForestRegressor(
            n_estimators=parameter_n_estimators,
            max_features=parameter_max_features,
            min_samples_split=parameter_min_samples_split,
            min_samples_leaf=parameter_min_samples_leaf,
            random_state=parameter_random_state,
            criterion=parameter_criterion,
            bootstrap=parameter_bootstrap,
            oob_score=use_oob_score,
        )
        rf.fit(X_train, y_train)

        st.write("Applying model to make predictions ...")
        time.sleep(sleep_time)
        y_train_pred = rf.predict(X_train)
        y_test_pred = rf.predict(X_test)

        st.write("Evaluating performance metrics ...")
        time.sleep(sleep_time)
        train_mse = mean_squared_error(y_train, y_train_pred)
        train_r2 = r2_score(y_train, y_train_pred)
        test_mse = mean_squared_error(y_test, y_test_pred)
        test_r2 = r2_score(y_test, y_test_pred)

        st.write("Displaying performance metrics ...")
        time.sleep(sleep_time)
        # The error columns always hold mean squared error, whatever split
        # criterion the forest used, so they are labelled MSE. Building the
        # frame column-wise keeps numeric dtypes, so no to_numeric pass is needed.
        rf_results = pd.DataFrame({
            'Method': ['Random forest'],
            'Training MSE': [train_mse],
            'Training R2': [train_r2],
            'Test MSE': [test_mse],
            'Test R2': [test_r2],
        }).round(3)

    status.update(label="Status", state="complete", expanded=False)

    # Display data info
    st.header('Input data', divider='rainbow')
    info_col = st.columns(4)
    info_col[0].metric(label="No. of samples", value=X.shape[0], delta="")
    info_col[1].metric(label="No. of X variables", value=X.shape[1], delta="")
    info_col[2].metric(label="No. of Training samples", value=X_train.shape[0], delta="")
    info_col[3].metric(label="No. of Test samples", value=X_test.shape[0], delta="")

    with st.expander('Initial dataset', expanded=True):
        st.dataframe(df, height=210, use_container_width=True)
    with st.expander('Train split', expanded=False):
        train_col = st.columns((3, 1))
        with train_col[0]:
            st.dataframe(X_train, height=210, hide_index=True, use_container_width=True)
        with train_col[1]:
            st.dataframe(y_train, height=210, hide_index=True, use_container_width=True)
    with st.expander('Test split', expanded=False):
        test_col = st.columns((3, 1))
        with test_col[0]:
            st.dataframe(X_test, height=210, hide_index=True, use_container_width=True)
        with test_col[1]:
            st.dataframe(y_test, height=210, hide_index=True, use_container_width=True)

    # Zip dataset files
    # The archive is assembled in memory: nothing is written to the working
    # directory, so concurrent sessions cannot overwrite each other's files.
    dataset_files = {
        'dataset.csv': df,
        'X_train.csv': X_train,
        'y_train.csv': y_train,
        'X_test.csv': X_test,
        'y_test.csv': y_test,
    }
    zip_buffer = io.BytesIO()
    with zipfile.ZipFile(zip_buffer, 'w', compression=zipfile.ZIP_DEFLATED) as zip_file:
        for member_name, frame in dataset_files.items():
            zip_file.writestr(member_name, frame.to_csv(index=False))

    # NOTE(review): the archive name was blank in the extracted source;
    # 'dataset.zip' is presumed from the 'dataset.csv' member - confirm.
    st.download_button(
        label='Download ZIP',
        data=zip_buffer.getvalue(),
        file_name='dataset.zip',
        mime='application/zip',
    )

    # Display model parameters
    st.header('Model parameters', divider='rainbow')
    parameters_col = st.columns(3)
    parameters_col[0].metric(label="Data split ratio (% for Training Set)", value=parameter_split_size, delta="")
    parameters_col[1].metric(label="Number of estimators (n_estimators)", value=parameter_n_estimators, delta="")
    parameters_col[2].metric(label="Max features (max_features)", value=parameter_max_features_metric, delta="")

    # Display feature importance plot
    importances = rf.feature_importances_
    feature_names = list(X.columns)
    forest_importances = pd.Series(importances, index=feature_names)
    df_importance = forest_importances.reset_index().rename(columns={'index': 'feature', 0: 'value'})

    # sort='-x' orders the bars by descending importance.
    # NOTE(review): the chart sizing line was lost in extraction; a height of
    # 250 px is presumed so the 40 px bars do not overlap - confirm.
    bars = alt.Chart(df_importance).mark_bar(size=40).encode(
        x='value:Q',
        y=alt.Y('feature:N', sort='-x'),
    ).properties(height=250)

    performance_col = st.columns((2, 0.2, 3))
    with performance_col[0]:
        st.header('Model performance', divider='rainbow')
        st.dataframe(rf_results.T.reset_index().rename(columns={'index': 'Parameter', 0: 'Value'}))
    with performance_col[2]:
        st.header('Feature importance', divider='rainbow')
        st.altair_chart(bars, theme='streamlit', use_container_width=True)

    # Prediction results
    st.header('Prediction results', divider='rainbow')
    # Actual and predicted values are paired by position; each split is
    # numbered from 0, as in the table the app displayed before.
    df_train = pd.DataFrame({'actual': y_train.to_numpy(), 'predicted': y_train_pred})
    df_train['class'] = 'train'

    df_test = pd.DataFrame({'actual': y_test.to_numpy(), 'predicted': y_test_pred})
    df_test['class'] = 'test'

    df_prediction = pd.concat([df_train, df_test], axis=0)

    prediction_col = st.columns((2, 0.2, 3))

    # Display dataframe
    with prediction_col[0]:
        st.dataframe(df_prediction, height=320, use_container_width=True)

    # Display scatter plot of actual vs predicted values
    with prediction_col[2]:
        scatter = alt.Chart(df_prediction).mark_circle(size=60).encode(
            x='actual',
            y='predicted',
            color='class',
        )
        st.altair_chart(scatter, theme='streamlit', use_container_width=True)

# Ask for CSV upload if none is detected
else:
    # NOTE(review): the leading emoji was mis-encoded in the extracted source;
    # a left-pointing hand (towards the sidebar) is presumed - confirm.
    st.warning('👈 Upload a CSV file or click *"Load example data"* to get started!')
@@ -0,0 +1,4 @@
1 |
2 |
3 |
4 |