Upload 4 files

- dump/1_Transformations.py +449 -0
- dump/1_Transformations_with_panel.py +548 -0
- dump/2_Model_Build_and_Performance.py +403 -0
- dump/3_Model_Tuning.py +197 -0
dump/1_Transformations.py
ADDED
@@ -0,0 +1,449 @@
import streamlit as st
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from Eda_functions import format_numbers
import numpy as np
import pickle
from st_aggrid import AgGrid
from st_aggrid import GridOptionsBuilder, GridUpdateMode
from utilities import set_header, load_local_css
import time
import itertools
import statsmodels.api as sm
import re
from sklearn.metrics import mean_absolute_error, r2_score, mean_absolute_percentage_error
from sklearn.preprocessing import MinMaxScaler
import os
import matplotlib.pyplot as plt
from statsmodels.stats.outliers_influence import variance_inflation_factor
from datetime import datetime
import seaborn as sns
from Data_prep_functions import *

st.set_option('deprecation.showPyplotGlobalUse', False)

st.set_page_config(
    page_title="Model Build",
    page_icon=":shark:",
    layout="wide",
    initial_sidebar_state='collapsed'
)

load_local_css('styles.css')
set_header()

st.title('1. Build Your Model')

# media_data = pd.read_csv('Media_data_for_model.csv')
media_data = pd.read_csv('Media_data_for_model_dma_level.csv')
date = media_data['Date']
st.session_state['date'] = date
revenue = media_data['Total Approved Accounts - Revenue']
media_data.drop(['Total Approved Accounts - Revenue'], axis=1, inplace=True)
media_data.drop(['Date'], axis=1, inplace=True)
media_data.reset_index(drop=True, inplace=True)
media_data.dropna(inplace=True)

if st.toggle('Apply Transformations on DMA/Panel Level'):
    dma = st.selectbox('Select the Level of data ',
                       [col for col in media_data.columns if col.lower() in ['dma', 'panel']])
else:
    # code to aggregate data on date
    dma = None

# dma_dict = {dm: media_data[media_data[dma] == dm] for dm in media_data[dma].unique()}
# st.write(dma_dict)

st.markdown('## Select the Range of Transformations')
columns = st.columns(2)
old_shape = media_data.shape

if "old_shape" not in st.session_state:
    st.session_state['old_shape'] = old_shape

with columns[0]:
    slider_value_adstock = st.slider('Select Adstock Range (only applied to media)', 0.0, 1.0, (0.2, 0.4), step=0.1, format="%.2f")
with columns[1]:
    slider_value_lag = st.slider('Select Lag Range (applied to media, seasonal, macroeconomic variables)', 1, 7, (1, 3), step=1)

# with columns[2]:
#     slider_value_power = st.slider('Select Power range (only applied to media )', 0, 4, (1, 2), step=1)

# with columns[1]:
#     st.number_input('Select the range of half saturation point ', min_value=1, max_value=5)
#     st.number_input('Select the range of ')


def lag(data, features, lags, dma=None):
    # Create shifted copies of each feature for every lag in `lags`,
    # shifting within each DMA/panel when a panel column is given.
    if dma:
        transformed_data = pd.concat([data.groupby([dma])[features].shift(lag).add_suffix(f'_lag_{lag}') for lag in lags], axis=1)
        transformed_data = transformed_data.fillna(method='bfill')
        return pd.concat([transformed_data, data], axis=1)
    else:
        # data should be aggregated on date
        transformed_data = pd.concat([data[features].shift(lag).add_suffix(f'_lag_{lag}') for lag in lags], axis=1)
        transformed_data = transformed_data.fillna(method='bfill')
        return pd.concat([transformed_data, data], axis=1)


# adstock
def adstock(df, alphas, cutoff, features, dma=None):
    # Geometric-decay adstock via a lower-triangular weight matrix,
    # truncated at `cutoff` lags, one matrix per alpha.
    if dma:
        transformed_data = pd.DataFrame()
        for d in df[dma].unique():
            dma_sub_df = df[df[dma] == d]
            n = len(dma_sub_df)

            weights = np.array([[[alpha**(i - j) if i >= j and j >= i - cutoff else 0. for j in range(n)] for i in range(n)] for alpha in alphas])

            X = dma_sub_df[features].to_numpy()
            res = pd.DataFrame(np.hstack(weights @ X),
                               columns=[f'{col}_adstock_{alpha}' for alpha in alphas for col in features])

            transformed_data = pd.concat([transformed_data, res], axis=0)
        transformed_data.reset_index(drop=True, inplace=True)
        return pd.concat([transformed_data, df], axis=1)
    else:
        n = len(df)

        weights = np.array([[[alpha**(i - j) if i >= j and j >= i - cutoff else 0. for j in range(n)] for i in range(n)] for alpha in alphas])

        X = df[features].to_numpy()
        res = pd.DataFrame(np.hstack(weights @ X),
                           columns=[f'{col}_adstock_{alpha}' for alpha in alphas for col in features])
        return pd.concat([res, df], axis=1)


if 'media_data' not in st.session_state:
    st.session_state['media_data'] = pd.DataFrame()

variables_to_be_transformed = [col for col in media_data.columns if col.lower() not in ['dma', 'panel']]  # change for buckets

if st.button('Apply Transformations'):
    with st.spinner('Applying Transformations'):
        transformed_data_lag = lag(media_data, features=variables_to_be_transformed, lags=np.arange(slider_value_lag[0], slider_value_lag[1] + 1, 1), dma=dma)

        variables_to_be_transformed = [col for col in list(transformed_data_lag.columns) if col not in ['Date', 'DMA', 'Panel']]  # change for buckets

        transformed_data_adstock = adstock(df=transformed_data_lag, alphas=np.arange(slider_value_adstock[0], slider_value_adstock[1] + 0.1, 0.1), cutoff=8, features=variables_to_be_transformed, dma=dma)

    st.success('Done')
    st.write(f'old shape {old_shape}, new shape {transformed_data_adstock.shape}')
    st.write(media_data.head(10))
    st.write(transformed_data_adstock)
    st.write(transformed_data_adstock.isnull().sum().sort_values(ascending=False))

# st.write(dma_dict)
# st.session_state['media_data'] = media_data

# with st.spinner('Applying Transformations'):
#     time.sleep(2)
# st.success("Transformations complete!")

# if st.session_state['media_data'].shape[1] > old_shape[1]:
#     with columns[0]:
#         st.write(f'Total no. of variables before transformation: {old_shape[1]}, Total no. of variables after transformation: {st.session_state["media_data"].shape[1]}')

# bucket = ['paid_search', 'kwai', 'indicacao', 'infleux', 'influencer', 'FB: Level Achieved - Tier 1 Impressions',
#           ' FB: Level Achieved - Tier 2 Impressions', 'paid_social_others',
#           ' GA App: Will And Cid Pequena Baixo Risco Clicks',
#           'digital_tactic_others', "programmatic"
#           ]

# with columns[1]:
#     if st.button('Create Combinations of Variables'):
#         top_3_correlated_features = []
#         for col in st.session_state['media_data'].columns[:19]:
#             corr_df = pd.concat([st.session_state['media_data'].filter(regex=col),
#                                  revenue], axis=1).corr()['Total Approved Accounts - Revenue'].iloc[:-1]
#             top_3_correlated_features.append(list(corr_df.sort_values(ascending=False).head(2).index))
#         flattened_list = [item for sublist in top_3_correlated_features for item in sublist]
#         all_features_set = {var: [col for col in flattened_list if var in col] for var in bucket}
#         channels_all = [values for values in all_features_set.values()]
#         st.session_state['combinations'] = list(itertools.product(*channels_all))

#         # if 'combinations' not in st.session_state:
#         #     st.session_state['combinations'] = combinations_all

#         st.session_state['final_selection'] = st.session_state['combinations']

# revenue.reset_index(drop=True, inplace=True)
# if 'Model_results' not in st.session_state:
#     st.session_state['Model_results'] = {'Model_object': [],
#                                          'Model_iteration': [],
#                                          'Feature_set': [],
#                                          'MAPE': [],
#                                          'R2': [],
#                                          'ADJR2': []
#                                          }

# # if st.button('Build Model'):
# if 'iterations' not in st.session_state:
#     st.session_state['iterations'] = 1
# save_path = r"Model"
# with columns[1]:
#     if "final_selection" in st.session_state:
#         st.write(f'Total combinations created {format_numbers(len(st.session_state["final_selection"]))}')
#     st.success('Done')
# if st.checkbox('Build all iterations'):
#     iterations = len(st.session_state['final_selection'])
# else:
#     iterations = st.number_input('Select the number of iterations to perform', min_value=1, step=100, value=st.session_state['iterations'])

# st.session_state['iterations'] = iterations

# st.session_state['media_data'] = st.session_state['media_data'].fillna(method='ffill')
# if st.button("Build Models"):
#     st.markdown('Data Split -- Training Period: May 9th, 2023 - October 5th, 2023, Testing Period: October 6th, 2023 - November 7th, 2023')
#     progress_bar = st.progress(0)  # Initialize the progress bar
#     # time_remaining_text = st.empty()  # Create an empty space for time remaining text
#     start_time = time.time()  # Record the start time
#     progress_text = st.empty()
#     # time_elapsed_text = st.empty()
#     for i, selected_features in enumerate(st.session_state["final_selection"][40000:40000 + int(iterations)]):
#         df = st.session_state['media_data']

#         fet = [var for var in selected_features if len(var) > 0]
#         X = df[fet]
#         y = revenue
#         ss = MinMaxScaler()
#         X = pd.DataFrame(ss.fit_transform(X), columns=X.columns)
#         X = sm.add_constant(X)
#         X_train = X.iloc[:150]
#         X_test = X.iloc[150:]
#         y_train = y.iloc[:150]
#         y_test = y.iloc[150:]

#         model = sm.OLS(y_train, X_train).fit()
#         # st.write(fet)
#         positive_coeff = X.columns
#         negative_coeff = []
#         coefficients = model.params.to_dict()
#         model_positive = [col for col in coefficients.keys() if coefficients[col] > 0]
#         # st.write(positive_coeff)
#         # st.write(model_positive)
#         pvalues = [var for var in list(model.pvalues) if var <= 0.06]
#         if (len(model_positive) / len(selected_features)) > 0.9 and (len(pvalues) / len(selected_features)) >= 0.8:
#             predicted_values = model.predict(X_train)
#             mape = mean_absolute_percentage_error(y_train, predicted_values)
#             adjr2 = model.rsquared_adj
#             r2 = model.rsquared
#             filename = os.path.join(save_path, f"model_{i}.pkl")
#             with open(filename, "wb") as f:
#                 pickle.dump(model, f)
#             # with open(r"C:\Users\ManojP\Documents\MMM\simopt\Model\model.pkl", 'rb') as file:
#             #     model = pickle.load(file)

#             st.session_state['Model_results']['Model_object'].append(filename)
#             st.session_state['Model_results']['Model_iteration'].append(i)
#             st.session_state['Model_results']['Feature_set'].append(fet)
#             st.session_state['Model_results']['MAPE'].append(mape)
#             st.session_state['Model_results']['R2'].append(r2)
#             st.session_state['Model_results']['ADJR2'].append(adjr2)

#         current_time = time.time()
#         time_taken = current_time - start_time
#         time_elapsed_minutes = time_taken / 60
#         completed_iterations_text = f"{i + 1}/{iterations}"
#         progress_bar.progress((i + 1) / int(iterations))
#         progress_text.text(f'Completed iterations: {completed_iterations_text}, Time Elapsed (min): {time_elapsed_minutes:.2f}')

#     st.write(f'Out of {st.session_state["iterations"]} iterations: {len(st.session_state["Model_results"]["Model_object"])} valid models')
#     pd.DataFrame(st.session_state['Model_results']).to_csv('model_output.csv')

# def to_percentage(value):
#     return f'{value * 100:.1f}%'

# st.title('2. Select Models')
# if 'tick' not in st.session_state:
#     st.session_state['tick'] = False
# if st.checkbox('Show results of top 10 models (based on MAPE and Adj. R2)', value=st.session_state['tick']):
#     st.session_state['tick'] = True
#     st.write('Select one model iteration to generate performance metrics for it:')
#     data = pd.DataFrame(st.session_state['Model_results'])
#     data.sort_values(by=['MAPE'], ascending=False, inplace=True)
#     data.drop_duplicates(subset='Model_iteration', inplace=True)
#     top_10 = data.head(10)
#     top_10['Rank'] = np.arange(1, len(top_10) + 1, 1)
#     top_10[['MAPE', 'R2', 'ADJR2']] = np.round(top_10[['MAPE', 'R2', 'ADJR2']], 4).applymap(to_percentage)
#     top_10_table = top_10[['Rank', 'Model_iteration', 'MAPE', 'ADJR2', 'R2']]
#     # top_10_table.columns = [['Rank', 'Model Iteration Index', 'MAPE', 'Adjusted R2', 'R2']]
#     gd = GridOptionsBuilder.from_dataframe(top_10_table)
#     gd.configure_pagination(enabled=True)
#     gd.configure_selection(use_checkbox=True)

#     gridoptions = gd.build()

#     table = AgGrid(top_10, gridOptions=gridoptions, update_mode=GridUpdateMode.SELECTION_CHANGED)

#     selected_rows = table.selected_rows
#     # if st.session_state["selected_rows"] != selected_rows:
#     #     st.session_state["build_rc_cb"] = False
#     st.session_state["selected_rows"] = selected_rows
#     if 'Model' not in st.session_state:
#         st.session_state['Model'] = {}

#     if len(selected_rows) > 0:
#         st.header('2.1 Results Summary')

#         model_object = data[data['Model_iteration'] == selected_rows[0]['Model_iteration']]['Model_object']
#         features_set = data[data['Model_iteration'] == selected_rows[0]['Model_iteration']]['Feature_set']

#         with open(str(model_object.values[0]), 'rb') as file:
#             model = pickle.load(file)
#         st.write(model.summary())
#         st.header('2.2 Actual vs. Predicted Plot')

#         df = st.session_state['media_data']
#         X = df[features_set.values[0]]
#         X = sm.add_constant(X)
#         y = revenue
#         X_train = X.iloc[:150]
#         X_test = X.iloc[150:]
#         y_train = y.iloc[:150]
#         y_test = y.iloc[150:]
#         ss = MinMaxScaler()
#         X_train = pd.DataFrame(ss.fit_transform(X_train), columns=X_train.columns)
#         st.session_state['X'] = X_train
#         st.session_state['features_set'] = features_set.values[0]

#         metrics_table, line, actual_vs_predicted_plot = plot_actual_vs_predicted(date, y_train, model.predict(X_train), model, target_column='Revenue')

#         st.plotly_chart(actual_vs_predicted_plot, use_container_width=True)

#         st.markdown('## 2.3 Residual Analysis')
#         columns = st.columns(2)
#         with columns[0]:
#             fig = plot_residual_predicted(y_train, model.predict(X_train), X_train)
#             st.plotly_chart(fig)

#         with columns[1]:
#             st.empty()
#             fig = qqplot(y_train, model.predict(X_train))
#             st.plotly_chart(fig)

#         with columns[0]:
#             fig = residual_distribution(y_train, model.predict(X_train))
#             st.pyplot(fig)

#         vif_data = pd.DataFrame()
#         # X = X.drop('const', axis=1)
#         vif_data["Variable"] = X_train.columns
#         vif_data["VIF"] = [variance_inflation_factor(X_train.values, i) for i in range(X_train.shape[1])]
#         vif_data.sort_values(by=['VIF'], ascending=False, inplace=True)
#         vif_data = np.round(vif_data)
#         vif_data['VIF'] = vif_data['VIF'].astype(float)
#         st.header('2.4 Variance Inflation Factor (VIF)')
#         # st.dataframe(vif_data)
#         color_mapping = {
#             'darkgreen': (vif_data['VIF'] < 3),
#             'orange': (vif_data['VIF'] >= 3) & (vif_data['VIF'] <= 10),
#             'darkred': (vif_data['VIF'] > 10)
#         }

#         # Create a horizontal bar plot
#         fig, ax = plt.subplots()
#         fig.set_figwidth(10)  # Adjust the width of the figure as needed

#         # Sort the bars by descending VIF values
#         vif_data = vif_data.sort_values(by='VIF', ascending=False)

#         # Iterate through the color mapping and plot bars with corresponding colors
#         for color, condition in color_mapping.items():
#             subset = vif_data[condition]
#             bars = ax.barh(subset["Variable"], subset["VIF"], color=color, label=color)

#             # Add text annotations on top of the bars
#             for bar in bars:
#                 width = bar.get_width()
#                 ax.annotate(f'{width:}', xy=(width, bar.get_y() + bar.get_height() / 2), xytext=(5, 0),
#                             textcoords='offset points', va='center')

#         # Customize the plot
#         ax.set_xlabel('VIF Values')
#         # ax.set_title('2.4 Variance Inflation Factor (VIF)')
#         # ax.legend(loc='upper right')

#         # Display the plot in Streamlit
#         st.pyplot(fig)

#         with st.expander('Results Summary Test data'):
#             ss = MinMaxScaler()
#             X_test = pd.DataFrame(ss.fit_transform(X_test), columns=X_test.columns)
#             st.header('2.2 Actual vs. Predicted Plot')

#             metrics_table, line, actual_vs_predicted_plot = plot_actual_vs_predicted(date, y_test, model.predict(X_test), model, target_column='Revenue')

#             st.plotly_chart(actual_vs_predicted_plot, use_container_width=True)

#             st.markdown('## 2.3 Residual Analysis')
#             columns = st.columns(2)
#             with columns[0]:
#                 fig = plot_residual_predicted(revenue, model.predict(X_test), X_test)
#                 st.plotly_chart(fig)

#             with columns[1]:
#                 st.empty()
#                 fig = qqplot(revenue, model.predict(X_test))
#                 st.plotly_chart(fig)

#             with columns[0]:
#                 fig = residual_distribution(revenue, model.predict(X_test))
#                 st.pyplot(fig)

#         value = False
#         if st.checkbox('Save this model to tune', key='build_rc_cb'):
#             mod_name = st.text_input('Enter model name')
#             if len(mod_name) > 0:
#                 st.session_state['Model'][mod_name] = {"Model_object": model, 'feature_set': st.session_state['features_set'], 'X_train': X_train}
#                 st.session_state['X_train'] = X_train
#                 st.session_state['X_test'] = X_test
#                 st.session_state['y_train'] = y_train
#                 st.session_state['y_test'] = y_test
#                 with open("best_models.pkl", "wb") as f:
#                     pickle.dump(st.session_state['Model'], f)
#                 st.success('Model saved! Proceed to the next page to tune the model')
#                 value = False
dump/1_Transformations_with_panel.py
ADDED
@@ -0,0 +1,548 @@
import streamlit as st
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from Eda_functions import format_numbers
import numpy as np
import pickle
from st_aggrid import AgGrid
from st_aggrid import GridOptionsBuilder, GridUpdateMode
from utilities import set_header, load_local_css
import time
import itertools
import statsmodels.api as sm
import statsmodels.formula.api as smf
import re
from sklearn.metrics import mean_absolute_error, r2_score, mean_absolute_percentage_error
from sklearn.preprocessing import MinMaxScaler
import os
import matplotlib.pyplot as plt
from statsmodels.stats.outliers_influence import variance_inflation_factor
from datetime import datetime
import seaborn as sns
from Data_prep_functions import *

st.set_option('deprecation.showPyplotGlobalUse', False)


def get_random_effects(media_data, panel_col, mdf):
    # Collect the fitted random intercept for each panel (market) into a frame.
    random_eff_df = pd.DataFrame(columns=[panel_col, "random_effect"])

    for i, market in enumerate(media_data[panel_col].unique()):
        print(i, end='\r')
        intercept = mdf.random_effects[market].values[0]
        random_eff_df.loc[i, 'random_effect'] = intercept
        random_eff_df.loc[i, panel_col] = market

    return random_eff_df


def mdf_predict(X, mdf, random_eff_df):
    # MixedLM predict() returns only the fixed-effect part; add back each
    # panel's random intercept to get the full prediction.
    X['fixed_effect'] = mdf.predict(X)
    merged_df = pd.merge(X[[panel_col, target_col]], random_eff_df, on=panel_col, how='left')
    X['random_effect'] = merged_df['random_effect']
    X['pred'] = X['fixed_effect'] + X['random_effect']
    return X['pred']


st.set_page_config(
    page_title="Model Build",
    page_icon=":shark:",
    layout="wide",
    initial_sidebar_state='collapsed'
)

load_local_css('styles.css')
set_header()

st.title('1. Build Your Model')

panel_col = 'markets'  # set the panel column
date_col = 'date'
target_col = 'total_approved_accounts_revenue'

media_data = pd.read_csv('upf_data_converted.csv')
media_data.columns = [i.lower().replace('-', '').replace(':', '').replace("__", "_") for i in media_data.columns]

# st.write(media_data.columns)
media_data.sort_values(date_col, inplace=True)
media_data.reset_index(drop=True, inplace=True)

date = media_data[date_col]
st.session_state['date'] = date
revenue = media_data[target_col]
media_data.drop([target_col], axis=1, inplace=True)
media_data.drop([date_col], axis=1, inplace=True)
media_data.reset_index(drop=True, inplace=True)

if st.toggle('Apply Transformations on DMA/Panel Level'):
    dma = st.selectbox('Select the Level of data ',
                       [col for col in media_data.columns if col.lower() in ['dma', 'panel', 'markets']])
else:
    # code to aggregate data on date
    dma = None

# dma_dict = {dm: media_data[media_data[dma] == dm] for dm in media_data[dma].unique()}
# st.write(dma_dict)

st.markdown('## Select the Range of Transformations')
columns = st.columns(2)
old_shape = media_data.shape

if "old_shape" not in st.session_state:
    st.session_state['old_shape'] = old_shape

with columns[0]:
    slider_value_adstock = st.slider('Select Adstock Range (only applied to media)', 0.0, 1.0, (0.2, 0.4), step=0.1, format="%.2f")
with columns[1]:
    slider_value_lag = st.slider('Select Lag Range (applied to media, seasonal, macroeconomic variables)', 1, 7, (1, 3), step=1)

# with columns[2]:
#     slider_value_power = st.slider('Select Power range (only applied to media )', 0, 4, (1, 2), step=1)

# with columns[1]:
#     st.number_input('Select the range of half saturation point ', min_value=1, max_value=5)
#     st.number_input('Select the range of ')

# Section 1 - Transformations Functions
def lag(data, features, lags, dma=None):
    if dma:
        transformed_data = pd.concat([data.groupby([dma])[features].shift(lag).add_suffix(f'_lag_{lag}') for lag in lags], axis=1)
        transformed_data = transformed_data.fillna(method='bfill')
        return pd.concat([transformed_data, data], axis=1)
    else:
        # data should be aggregated on date
        transformed_data = pd.concat([data[features].shift(lag).add_suffix(f'_lag_{lag}') for lag in lags], axis=1)
        transformed_data = transformed_data.fillna(method='bfill')
        return pd.concat([transformed_data, data], axis=1)


# adstock
def adstock(df, alphas, cutoff, features, dma=None):
    # st.write(features)
    if dma:
        transformed_data = pd.DataFrame()
        for d in df[dma].unique():
            dma_sub_df = df[df[dma] == d]
            n = len(dma_sub_df)

            weights = np.array([[[alpha**(i - j) if i >= j and j >= i - cutoff else 0. for j in range(n)] for i in range(n)] for alpha in alphas])
            X = dma_sub_df[features].to_numpy()

            res = pd.DataFrame(np.hstack(weights @ X),
                               columns=[f'{col}_adstock_{alpha}' for alpha in alphas for col in features])

            transformed_data = pd.concat([transformed_data, res], axis=0)
        transformed_data.reset_index(drop=True, inplace=True)
        return pd.concat([transformed_data, df], axis=1)
    else:
        n = len(df)

        weights = np.array([[[alpha**(i - j) if i >= j and j >= i - cutoff else 0. for j in range(n)] for i in range(n)] for alpha in alphas])

        X = df[features].to_numpy()
        res = pd.DataFrame(np.hstack(weights @ X),
                           columns=[f'{col}_adstock_{alpha}' for alpha in alphas for col in features])
        return pd.concat([res, df], axis=1)


# Section 2 - Begin Transformations

if 'media_data' not in st.session_state:
    st.session_state['media_data'] = pd.DataFrame()

# variables_to_be_transformed = [col for col in media_data.columns if col.lower() not in ['dma', 'panel']]  # change for buckets
variables_to_be_transformed = [col for col in media_data.columns if '_clicks' in col.lower() or '_impress' in col.lower()]  # srishti - change
# st.write(variables_to_be_transformed)
# st.write(media_data[variables_to_be_transformed].dtypes)

with columns[0]:
    if st.button('Apply Transformations'):
        with st.spinner('Applying Transformations'):
            transformed_data_lag = lag(media_data, features=variables_to_be_transformed, lags=np.arange(slider_value_lag[0], slider_value_lag[1] + 1, 1), dma=dma)

            # variables_to_be_transformed = [col for col in list(transformed_data_lag.columns) if col not in ['Date', 'DMA', 'Panel']]  # change for buckets
            variables_to_be_transformed = [col for col in media_data.columns if
                                           '_clicks' in col.lower() or '_impress' in col.lower()]  # srishti - change

            transformed_data_adstock = adstock(df=transformed_data_lag, alphas=np.arange(slider_value_adstock[0], slider_value_adstock[1], 0.1), cutoff=8, features=variables_to_be_transformed, dma=dma)

        # st.success('Done')
        st.success("Transformations complete!")

        st.write(f'old shape {old_shape}, new shape {transformed_data_adstock.shape}')
        # st.write(media_data.head(10))
        # st.write(transformed_data_adstock.head(10))

        transformed_data_adstock.columns = [c.replace(".", "_") for c in transformed_data_adstock.columns]  # srishti
        # st.write(transformed_data_adstock.columns)
        st.session_state['media_data'] = transformed_data_adstock  # srishti

# with st.spinner('Applying Transformations'):
#     time.sleep(2)
# st.success("Transformations complete!")

# if st.session_state['media_data'].shape[1] > old_shape[1]:
#     with columns[0]:
#         st.write(f'Total no. of variables before transformation: {old_shape[1]}, Total no. of variables after transformation: {st.session_state["media_data"].shape[1]}')
#         # st.write(f'Total no. of variables after transformation: {st.session_state["media_data"].shape[1]}')

# Section 3 - Create combinations

# bucket = ['paid_search', 'kwai', 'indicacao', 'infleux', 'influencer', 'FB: Level Achieved - Tier 1 Impressions',
#           ' FB: Level Achieved - Tier 2 Impressions', 'paid_social_others',
#           ' GA App: Will And Cid Pequena Baixo Risco Clicks',
#           'digital_tactic_others', "programmatic"
#           ]

# srishti - bucket names changed
bucket = ['paid_search', 'kwai', 'indicacao', 'infleux', 'influencer', 'fb_level_achieved_tier_2',
          'fb_level_achieved_tier_1', 'paid_social_others',
          'ga_app',
          'digital_tactic_others', "programmatic"
          ]

with columns[1]:
    if st.button('Create Combinations of Variables'):

        top_3_correlated_features = []
        # for col in st.session_state['media_data'].columns[:19]:
        original_cols = [c for c in st.session_state['media_data'].columns if "_clicks" in c.lower() or "_impressions" in c.lower()]
        original_cols = [c for c in original_cols if "_lag" not in c.lower() and "_adstock" not in c.lower()]
        # st.write(original_cols)

        # for col in st.session_state['media_data'].columns[:19]:
        for col in original_cols:  # srishti - new
            corr_df = pd.concat([st.session_state['media_data'].filter(regex=col),
                                 revenue], axis=1).corr()[target_col].iloc[:-1]
            top_3_correlated_features.append(list(corr_df.sort_values(ascending=False).head(2).index))
            # st.write(col, top_3_correlated_features)
        flattened_list = [item for sublist in top_3_correlated_features for item in sublist]
        # all_features_set = {var: [col for col in flattened_list if var in col] for var in bucket}
        all_features_set = {var: [col for col in flattened_list if var in col] for var in bucket if len([col for col in flattened_list if var in col]) > 0}  # srishti

        channels_all = [values for values in all_features_set.values()]
        # st.write(channels_all)
        st.session_state['combinations'] = list(itertools.product(*channels_all))
        # if 'combinations' not in st.session_state:
        #     st.session_state['combinations'] = combinations_all

        st.session_state['final_selection'] = st.session_state['combinations']
        st.success('Done')
        # st.write(f"{len(st.session_state['combinations'])} combinations created")

revenue.reset_index(drop=True, inplace=True)
if 'Model_results' not in st.session_state:
    st.session_state['Model_results'] = {'Model_object': [],
                                         'Model_iteration': [],
                                         'Feature_set': [],
                                         'MAPE': [],
                                         'R2': [],
                                         'ADJR2': []
                                         }

def reset_model_result_dct():
    st.session_state['Model_results'] = {'Model_object': [],
                                         'Model_iteration': [],
                                         'Feature_set': [],
                                         'MAPE': [],
                                         'R2': [],
                                         'ADJR2': []
                                         }

# if st.button('Build Model'):
if 'iterations' not in st.session_state:
    st.session_state['iterations'] = 0
# st.write("1", st.session_state["final_selection"])

if 'final_selection' not in st.session_state:
    st.session_state['final_selection'] = False

save_path = r"Model/"
with columns[1]:
    if st.session_state['final_selection']:
        st.write(f'Total combinations created {format_numbers(len(st.session_state["final_selection"]))}')

if st.checkbox('Build all iterations'):
    iterations = len(st.session_state['final_selection'])
else:
    iterations = st.number_input('Select the number of iterations to perform', min_value=0, step=10, value=st.session_state['iterations'], on_change=reset_model_result_dct)
# st.write("iterations=", iterations)

if st.button('Build Model', on_click=reset_model_result_dct):
    st.session_state['iterations'] = iterations
    # st.write("2", st.session_state["final_selection"])

    # Section 4 - Model

    st.session_state['media_data'] = st.session_state['media_data'].fillna(method='ffill')
    st.markdown(
        'Data Split -- Training Period: May 9th, 2023 - October 5th, 2023, Testing Period: October 6th, 2023 - November 7th, 2023')
    progress_bar = st.progress(0)  # Initialize the progress bar
    # time_remaining_text = st.empty()  # Create an empty space for time remaining text
    start_time = time.time()  # Record the start time
    progress_text = st.empty()
    # time_elapsed_text = st.empty()
    # for i, selected_features in enumerate(st.session_state["final_selection"][40000:40000 + int(iterations)]):
    # st.write(st.session_state["final_selection"])
    # for i, selected_features in enumerate(st.session_state["final_selection"]):
    for i, selected_features in enumerate(st.session_state["final_selection"][0:int(iterations)]):  # srishti
        print("@@@@@@@@@@@@@", i)
        df = st.session_state['media_data']

        fet = [var for var in selected_features if len(var) > 0]
        inp_vars_str = " + ".join(fet)  # new

        X = df[fet]
        y = revenue
        ss = MinMaxScaler()
        X = pd.DataFrame(ss.fit_transform(X), columns=X.columns)
        # X = sm.add_constant(X)

        X['total_approved_accounts_revenue'] = revenue  # new
        X[panel_col] = df[panel_col]

        X_train = X.iloc[:8000]
        X_test = X.iloc[8000:]
        y_train = y.iloc[:8000]
        y_test = y.iloc[8000:]

        print(X_train.shape)
        # model = sm.OLS(y_train, X_train).fit()
        md = smf.mixedlm("total_approved_accounts_revenue ~ {}".format(inp_vars_str),
                         data=X_train[['total_approved_accounts_revenue'] + fet],
                         groups=X_train[panel_col])
        mdf = md.fit()
        predicted_values = mdf.fittedvalues

        # st.write(fet)
        # positive_coeff = fet
        # negative_coeff = []

        coefficients = mdf.fe_params.to_dict()
        model_positive = [col for col in coefficients.keys() if coefficients[col] > 0]
        # st.write(positive_coeff)
        # st.write(model_positive)
        pvalues = [var for var in list(mdf.pvalues) if var <= 0.06]

        # if (len(model_positive) / len(selected_features)) > 0.9 and (len(pvalues) / len(selected_features)) >= 0.8:
        if (len(model_positive) / len(selected_features)) > 0 and (len(pvalues) / len(selected_features)) >= 0:  # srishti - changed just for testing, revert later
            # predicted_values = model.predict(X_train)
            mape = mean_absolute_percentage_error(y_train, predicted_values)
            r2 = r2_score(y_train, predicted_values)
            adjr2 = 1 - (1 - r2) * (len(y_train) - 1) / (len(y_train) - len(selected_features) - 1)

            filename = os.path.join(save_path, f"model_{i}.pkl")
            with open(filename, "wb") as f:
                pickle.dump(mdf, f)
            # with open(r"C:\Users\ManojP\Documents\MMM\simopt\Model\model.pkl", 'rb') as file:
            #     model = pickle.load(file)

            st.session_state['Model_results']['Model_object'].append(filename)
            st.session_state['Model_results']['Model_iteration'].append(i)
            st.session_state['Model_results']['Feature_set'].append(fet)
            st.session_state['Model_results']['MAPE'].append(mape)
            st.session_state['Model_results']['R2'].append(r2)
            st.session_state['Model_results']['ADJR2'].append(adjr2)

        current_time = time.time()
        time_taken = current_time - start_time
        time_elapsed_minutes = time_taken / 60
        completed_iterations_text = f"{i + 1}/{iterations}"
        progress_bar.progress((i + 1) / int(iterations))
        progress_text.text(f'Completed iterations: {completed_iterations_text}, Time Elapsed (min): {time_elapsed_minutes:.2f}')

    st.write(f'Out of {st.session_state["iterations"]} iterations: {len(st.session_state["Model_results"]["Model_object"])} valid models')
    pd.DataFrame(st.session_state['Model_results']).to_csv('model_output.csv')

def to_percentage(value):
    return f'{value * 100:.1f}%'

st.title('2. Select Models')
if 'tick' not in st.session_state:
    st.session_state['tick'] = False
if st.checkbox('Show results of top 10 models (based on MAPE and Adj. R2)', value=st.session_state['tick']):
    st.session_state['tick'] = True
    st.write('Select one model iteration to generate performance metrics for it:')
    data = pd.DataFrame(st.session_state['Model_results'])
    data.sort_values(by=['MAPE'], ascending=False, inplace=True)
    data.drop_duplicates(subset='Model_iteration', inplace=True)
    top_10 = data.head(10)
    top_10['Rank'] = np.arange(1, len(top_10) + 1, 1)
    top_10[['MAPE', 'R2', 'ADJR2']] = np.round(top_10[['MAPE', 'R2', 'ADJR2']], 4).applymap(to_percentage)
    top_10_table = top_10[['Rank', 'Model_iteration', 'MAPE', 'ADJR2', 'R2']]
    # top_10_table.columns = [['Rank', 'Model Iteration Index', 'MAPE', 'Adjusted R2', 'R2']]
    gd = GridOptionsBuilder.from_dataframe(top_10_table)
    gd.configure_pagination(enabled=True)
    gd.configure_selection(use_checkbox=True)

    gridoptions = gd.build()

    table = AgGrid(top_10, gridOptions=gridoptions, update_mode=GridUpdateMode.SELECTION_CHANGED)

    selected_rows = table.selected_rows
    # if st.session_state["selected_rows"] != selected_rows:
    #     st.session_state["build_rc_cb"] = False
    st.session_state["selected_rows"] = selected_rows
    if 'Model' not in st.session_state:
        st.session_state['Model'] = {}

    if len(selected_rows) > 0:
        st.header('2.1 Results Summary')

        model_object = data[data['Model_iteration'] == selected_rows[0]['Model_iteration']]['Model_object']
        features_set = data[data['Model_iteration'] == selected_rows[0]['Model_iteration']]['Feature_set']

        with open(str(model_object.values[0]), 'rb') as file:
            # print(file)
            model = pickle.load(file)
        st.write(model.summary())
        st.header('2.2 Actual vs. Predicted Plot')

        df = st.session_state['media_data']
        X = df[features_set.values[0]]
        # X = sm.add_constant(X)
        y = revenue

        ss = MinMaxScaler()
        X = pd.DataFrame(ss.fit_transform(X), columns=X.columns)

        X['total_approved_accounts_revenue'] = revenue  # new
        X[panel_col] = df[panel_col]

        X_train = X.iloc[:8000]
        X_test = X.iloc[8000:]
        y_train = y.iloc[:8000]
        y_test = y.iloc[8000:]

        st.session_state['X'] = X_train
        st.session_state['features_set'] = features_set.values[0]

        metrics_table, line, actual_vs_predicted_plot = plot_actual_vs_predicted(date, y_train, model.fittedvalues, model, target_column='Revenue')

        st.plotly_chart(actual_vs_predicted_plot, use_container_width=True)

        random_eff_df = get_random_effects(media_data, panel_col, model)

        st.markdown('## 2.3 Residual Analysis')
        columns = st.columns(2)
        with columns[0]:
            fig = plot_residual_predicted(y_train, model.fittedvalues, X_train)
            st.plotly_chart(fig)

        with columns[1]:
            st.empty()
            fig = qqplot(y_train, model.fittedvalues)
            st.plotly_chart(fig)

        with columns[0]:
            fig = residual_distribution(y_train, model.fittedvalues)
            st.pyplot(fig)

        vif_data = pd.DataFrame()
        # X = X.drop('const', axis=1)
        vif_data["Variable"] = X_train.columns
        vif_data["VIF"] = [variance_inflation_factor(X_train.values, i) for i in range(X_train.shape[1])]
        vif_data.sort_values(by=['VIF'], ascending=False, inplace=True)
        vif_data = np.round(vif_data)
        vif_data['VIF'] = vif_data['VIF'].astype(float)
        st.header('2.4 Variance Inflation Factor (VIF)')
        # st.dataframe(vif_data)
        color_mapping = {
            'darkgreen': (vif_data['VIF'] < 3),
            'orange': (vif_data['VIF'] >= 3) & (vif_data['VIF'] <= 10),
            'darkred': (vif_data['VIF'] > 10)
        }

        # Create a horizontal bar plot
        fig, ax = plt.subplots()
        fig.set_figwidth(10)  # Adjust the width of the figure as needed

        # Sort the bars by descending VIF values
        vif_data = vif_data.sort_values(by='VIF', ascending=False)

        # Iterate through the color mapping and plot bars with corresponding colors
        for color, condition in color_mapping.items():
            subset = vif_data[condition]
            bars = ax.barh(subset["Variable"], subset["VIF"], color=color, label=color)

            # Add text annotations on top of the bars
            for bar in bars:
                width = bar.get_width()
                ax.annotate(f'{width:}', xy=(width, bar.get_y() + bar.get_height() / 2), xytext=(5, 0),
                            textcoords='offset points', va='center')

        # Customize the plot
        ax.set_xlabel('VIF Values')
        # ax.set_title('2.4 Variance Inflation Factor (VIF)')
        # ax.legend(loc='upper right')

        # Display the plot in Streamlit
        st.pyplot(fig)

        with st.expander('Results Summary Test data'):
            ss = MinMaxScaler()
            X_test = pd.DataFrame(ss.fit_transform(X_test), columns=X_test.columns)
            st.header('2.2 Actual vs. Predicted Plot')

            metrics_table, line, actual_vs_predicted_plot = plot_actual_vs_predicted(date, y_test, mdf_predict(X_test, mdf, random_eff_df), model, target_column='Revenue')

            st.plotly_chart(actual_vs_predicted_plot, use_container_width=True)

            st.markdown('## 2.3 Residual Analysis')
            columns = st.columns(2)
            with columns[0]:
                fig = plot_residual_predicted(revenue, mdf_predict(X_test, mdf, random_eff_df), X_test)
                st.plotly_chart(fig)

            with columns[1]:
                st.empty()
                fig = qqplot(revenue, mdf_predict(X_test, mdf, random_eff_df))
                st.plotly_chart(fig)

            with columns[0]:
                fig = residual_distribution(revenue, mdf_predict(X_test, mdf, random_eff_df))
                st.pyplot(fig)

        value = False
        if st.checkbox('Save this model to tune', key='build_rc_cb'):
            mod_name = st.text_input('Enter model name')
            if len(mod_name) > 0:
                st.session_state['Model'][mod_name] = {"Model_object": model, 'feature_set': st.session_state['features_set'], 'X_train': X_train}
                st.session_state['X_train'] = X_train
                st.session_state['X_test'] = X_test
                st.session_state['y_train'] = y_train
                st.session_state['y_test'] = y_test
                with open("best_models.pkl", "wb") as f:
                    pickle.dump(st.session_state['Model'], f)
                st.success('Model saved! Proceed to the next page to tune the model')
                value = False
dump/2_Model_Build_and_Performance.py
ADDED
@@ -0,0 +1,403 @@
1 |
+
import streamlit as st
|
2 |
+
import pandas as pd
|
3 |
+
import plotly.express as px
|
4 |
+
import plotly.graph_objects as go
|
5 |
+
from Eda_functions import format_numbers
|
6 |
+
import numpy as np
|
7 |
+
import pickle
|
8 |
+
from st_aggrid import AgGrid
|
9 |
+
from st_aggrid import GridOptionsBuilder,GridUpdateMode
|
10 |
+
from utilities import set_header,load_local_css
|
11 |
+
from st_aggrid import GridOptionsBuilder
|
12 |
+
import time
|
13 |
+
import itertools
|
14 |
+
import statsmodels.api as sm
|
15 |
+
import numpy as np
|
16 |
+
import re
|
17 |
+
import itertools
|
18 |
+
from sklearn.metrics import mean_absolute_error, r2_score,mean_absolute_percentage_error
|
19 |
+
from sklearn.preprocessing import MinMaxScaler
|
20 |
+
import os
|
21 |
+
import matplotlib.pyplot as plt
|
22 |
+
from statsmodels.stats.outliers_influence import variance_inflation_factor
|
23 |
+
st.set_option('deprecation.showPyplotGlobalUse', False)
|
24 |
+
from datetime import datetime
|
25 |
+
import seaborn as sns
|
26 |
+
from Data_prep_functions import *
|
27 |
+
|
28 |
+
st.set_page_config(
|
29 |
+
page_title="Model Build",
|
30 |
+
page_icon=":shark:",
|
31 |
+
layout="wide",
|
32 |
+
initial_sidebar_state='collapsed'
|
33 |
+
)
|
34 |
+
|
35 |
+
load_local_css('styles.css')
|
36 |
+
set_header()
|
37 |
+
|
38 |
+
|
39 |
+
st.title('1. Build Your Model')
|
40 |
+
|
41 |
+
# media_data=pd.read_csv('Media_data_for_model.csv')
|
42 |
+
media_data=pd.read_csv('Media_data_for_model_dma_level.csv')
|
43 |
+
date=media_data['Date']
|
44 |
+
st.session_state['date']=date
|
45 |
+
revenue=media_data['Total Approved Accounts - Revenue']
|
46 |
+
media_data.drop(['Total Approved Accounts - Revenue'],axis=1,inplace=True)
|
47 |
+
media_data.drop(['Date'],axis=1,inplace=True)
|
48 |
+
media_data.reset_index(drop=True,inplace=True)
|
49 |
+
|
50 |
+
dma=st.selectbox('Select the Level of data ',[ col for col in media_data.columns if col.lower() in ['dma','panel']])
|
51 |
+
|
52 |
+
dma_dict={ dm:media_data[media_data[dma]==dm] for dm in media_data[dma].unique()}
|
53 |
+
# st.write(dma_dict)
|
54 |
+
|
55 |
+
st.markdown('## Select the Range of Transformations')
|
56 |
+
columns = st.columns(2)
|
57 |
+
old_shape=media_data.shape
|
58 |
+
|
59 |
+
|
60 |
+
|
61 |
+
|
62 |
+
|
63 |
+
if "old_shape" not in st.session_state:
|
64 |
+
st.session_state['old_shape']=old_shape
|
65 |
+
|
66 |
+
|
67 |
+
|
68 |
+
|
69 |
+
with columns[0]:
|
70 |
+
slider_value_adstock = st.slider('Select Adstock Range (only applied to media)', 0.0, 1.0, (0.2, 0.4), step=0.1, format="%.2f")
|
71 |
+
with columns[1]:
|
72 |
+
slider_value_lag = st.slider('Select Lag Range (applied to media, seasonal, macroeconomic variables)', 1, 7, (1, 3), step=1)
|
73 |
+
|
74 |
+
# with columns[2]:
|
75 |
+
# slider_value_power=st.slider('Select Power range (only applied to media )',0,4,(1,2),step=1)
|
76 |
+
|
77 |
+
# with columns[1]:
|
78 |
+
# st.number_input('Select the range of half saturation point ',min_value=1,max_value=5)
|
79 |
+
# st.number_input('Select the range of ')
|
80 |
+
|
81 |
+
def lag(X, features, min_lag=0,max_lag=6):
|
82 |
+
|
83 |
+
for i in features:
|
84 |
+
for lag in range(min_lag, max_lag + 1):
|
85 |
+
X[f'{i}_lag{lag}'] = X[i].shift(periods=lag)
|
86 |
+
return X.fillna(method='bfill')
|
87 |
+
|
88 |
+
def adstock_variable(X,variable_name,decay):
|
89 |
+
adstock = [0] * len(X[variable_name])
|
90 |
+
|
91 |
+
for t in range(len(X[variable_name])):
|
92 |
+
if t == 0:
|
93 |
+
adstock[t] = X[variable_name][t]
|
94 |
+
else:
|
95 |
+
adstock[t] = X[variable_name][t] + adstock[t-1] * decay
|
96 |
+
return adstock

if 'media_data' not in st.session_state:
    st.session_state['media_data'] = pd.DataFrame()

variables_to_be_transformed = [col for col in media_data.columns if col.lower() not in ['dma', 'panel']]
# st.write(variables_to_be_transformed)
with columns[0]:
    if st.button('Apply Transformations'):
        for dm in dma_dict.keys():
            dma_dict[dm].reset_index(drop=True, inplace=True)
            dma_dict[dm] = lag(dma_dict[dm], variables_to_be_transformed, min_lag=slider_value_lag[0], max_lag=slider_value_lag[1])

        for dm in dma_dict.keys():
            for i in dma_dict[dm].drop(['DMA', 'Panel'], axis=1).columns:
                for j in np.arange(slider_value_adstock[0], slider_value_adstock[1] + 0.1, 0.1):  # adding adstock
                    dma_dict[dm][f'{i}_adst.{np.round(j, 2)}'] = adstock_variable(dma_dict[dm], i, j)

        st.write(dma_dict)
        st.session_state['media_data'] = media_data

        with st.spinner('Applying Transformations'):
            time.sleep(2)
            st.success("Transformations complete!")

if st.session_state['media_data'].shape[1] > old_shape[1]:
    with columns[0]:
        st.write(f'Total no. of variables before transformation: {old_shape[1]}, total no. of variables after transformation: {st.session_state["media_data"].shape[1]}')
        # st.write(f'Total no. of variables after transformation: {st.session_state["media_data"].shape[1]}')


bucket = ['paid_search', 'kwai', 'indicacao', 'infleux', 'influencer', 'FB: Level Achieved - Tier 1 Impressions',
          ' FB: Level Achieved - Tier 2 Impressions', 'paid_social_others',
          ' GA App: Will And Cid Pequena Baixo Risco Clicks',
          'digital_tactic_others', "programmatic"
          ]

with columns[1]:
    if st.button('Create Combinations of Variables'):

        top_3_correlated_features = []
        # For each of the first 19 media columns, keep the two transformed
        # variants most correlated with revenue.
        for col in st.session_state['media_data'].columns[:19]:
            corr_df = pd.concat([st.session_state['media_data'].filter(regex=col),
                                 revenue], axis=1).corr()['Total Approved Accounts - Revenue'].iloc[:-1]
            top_3_correlated_features.append(list(corr_df.sort_values(ascending=False).head(2).index))
        flattened_list = [item for sublist in top_3_correlated_features for item in sublist]
        all_features_set = {var: [col for col in flattened_list if var in col] for var in bucket}
        channels_all = [values for values in all_features_set.values()]
        # One variant per channel bucket: the cartesian product enumerates
        # every possible feature set (see the sketch after this block).
        st.session_state['combinations'] = list(itertools.product(*channels_all))

        # if 'combinations' not in st.session_state:
        #     st.session_state['combinations'] = combinations_all

        st.session_state['final_selection'] = st.session_state['combinations']
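        # Illustrative note (not part of the original app): itertools.product
        # picks one candidate variant per channel bucket. A toy sketch:
        #     list(itertools.product(['a1', 'a2'], ['b1']))
        #     # -> [('a1', 'b1'), ('a2', 'b1')]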




revenue.reset_index(drop=True, inplace=True)
if 'Model_results' not in st.session_state:
    st.session_state['Model_results'] = {'Model_object': [],
                                         'Model_iteration': [],
                                         'Feature_set': [],
                                         'MAPE': [],
                                         'R2': [],
                                         'ADJR2': []
                                         }

# if st.button('Build Model'):
if 'iterations' not in st.session_state:
    st.session_state['iterations'] = 1
save_path = r"Model"
with columns[1]:
    if "final_selection" in st.session_state:
        st.write(f'Total combinations created {format_numbers(len(st.session_state["final_selection"]))}')

        st.success('Done')
        if st.checkbox('Build all iterations'):
            iterations = len(st.session_state['final_selection'])
        else:
            iterations = st.number_input('Select the number of iterations to perform', min_value=1, step=100, value=st.session_state['iterations'])

        st.session_state['iterations'] = iterations


st.session_state['media_data'] = st.session_state['media_data'].fillna(method='ffill')
if st.button("Build Models"):
    st.markdown('Data Split -- Training Period: May 9th, 2023 to October 5th, 2023; Testing Period: October 6th, 2023 to November 7th, 2023')
    progress_bar = st.progress(0)  # Initialize the progress bar
    # time_remaining_text = st.empty()  # Create an empty space for time remaining text
    start_time = time.time()  # Record the start time
    progress_text = st.empty()
    # time_elapsed_text = st.empty()
    # Iterate over a slice of the combination list, starting at offset 40000.
    for i, selected_features in enumerate(st.session_state["final_selection"][40000:40000 + int(iterations)]):
        df = st.session_state['media_data']

        fet = [var for var in selected_features if len(var) > 0]
        X = df[fet]
        y = revenue
        ss = MinMaxScaler()
        X = pd.DataFrame(ss.fit_transform(X), columns=X.columns)
        X = sm.add_constant(X)
        # The first 150 rows form the training period, the rest the test period.
        X_train = X.iloc[:150]
        X_test = X.iloc[150:]
        y_train = y.iloc[:150]
        y_test = y.iloc[150:]

        model = sm.OLS(y_train, X_train).fit()
        # st.write(fet)
        positive_coeff = X.columns
        negative_coeff = []
        coefficients = model.params.to_dict()
        model_positive = [col for col in coefficients.keys() if coefficients[col] > 0]
        # st.write(positive_coeff)
        # st.write(model_positive)
        pvalues = [var for var in list(model.pvalues) if var <= 0.06]
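        # Acceptance filter for the model below: keep it only if more than 90%
        # of its coefficients are positive and at least 80% of its p-values are
        # <= 0.06 (both ratios measured against the number of selected features).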
        if (len(model_positive) / len(selected_features)) > 0.9 and (len(pvalues) / len(selected_features)) >= 0.8:

            predicted_values = model.predict(X_train)
            mape = mean_absolute_percentage_error(y_train, predicted_values)
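            # MAPE here is sklearn's mean_absolute_percentage_error, i.e.
            # mean(|y - y_hat| / |y|), reported as a fraction (0.1 = 10%).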
            adjr2 = model.rsquared_adj
            r2 = model.rsquared
            filename = os.path.join(save_path, f"model_{i}.pkl")
            with open(filename, "wb") as f:
                pickle.dump(model, f)
            # with open(r"C:\Users\ManojP\Documents\MMM\simopt\Model\model.pkl", 'rb') as file:
            #     model = pickle.load(file)

            st.session_state['Model_results']['Model_object'].append(filename)
            st.session_state['Model_results']['Model_iteration'].append(i)
            st.session_state['Model_results']['Feature_set'].append(fet)
            st.session_state['Model_results']['MAPE'].append(mape)
            st.session_state['Model_results']['R2'].append(r2)
            st.session_state['Model_results']['ADJR2'].append(adjr2)

        current_time = time.time()
        time_taken = current_time - start_time
        time_elapsed_minutes = time_taken / 60
        completed_iterations_text = f"{i + 1}/{iterations}"
        progress_bar.progress((i + 1) / int(iterations))
        progress_text.text(f'Completed iterations: {completed_iterations_text}, Time Elapsed (min): {time_elapsed_minutes:.2f}')

    st.write(f'Out of {st.session_state["iterations"]} iterations: {len(st.session_state["Model_results"]["Model_object"])} valid models')
    pd.DataFrame(st.session_state['Model_results']).to_csv('model_output.csv')

def to_percentage(value):
    return f'{value * 100:.1f}%'

st.title('2. Select Models')
if 'tick' not in st.session_state:
    st.session_state['tick'] = False
if st.checkbox('Show results of top 10 models (based on MAPE and Adj. R2)', value=st.session_state['tick']):
    st.session_state['tick'] = True
    st.write('Select one model iteration to generate performance metrics for it:')
    data = pd.DataFrame(st.session_state['Model_results'])
    # Lower MAPE is better, so rank ascending (sorting descending would
    # surface the weakest models first).
    data.sort_values(by=['MAPE'], ascending=True, inplace=True)
    data.drop_duplicates(subset='Model_iteration', inplace=True)
    top_10 = data.head(10)
    top_10['Rank'] = np.arange(1, len(top_10) + 1, 1)
    top_10[['MAPE', 'R2', 'ADJR2']] = np.round(top_10[['MAPE', 'R2', 'ADJR2']], 4).applymap(to_percentage)
    top_10_table = top_10[['Rank', 'Model_iteration', 'MAPE', 'ADJR2', 'R2']]
    # top_10_table.columns = [['Rank', 'Model Iteration Index', 'MAPE', 'Adjusted R2', 'R2']]


    gd = GridOptionsBuilder.from_dataframe(top_10_table)
    gd.configure_pagination(enabled=True)
    gd.configure_selection(use_checkbox=True)


    gridoptions = gd.build()

    table = AgGrid(top_10, gridOptions=gridoptions, update_mode=GridUpdateMode.SELECTION_CHANGED)

    selected_rows = table.selected_rows
    # if st.session_state["selected_rows"] != selected_rows:
    #     st.session_state["build_rc_cb"] = False
    st.session_state["selected_rows"] = selected_rows
    if 'Model' not in st.session_state:
        st.session_state['Model'] = {}

    if len(selected_rows) > 0:
        st.header('2.1 Results Summary')

        model_object = data[data['Model_iteration'] == selected_rows[0]['Model_iteration']]['Model_object']
        features_set = data[data['Model_iteration'] == selected_rows[0]['Model_iteration']]['Feature_set']

        with open(str(model_object.values[0]), 'rb') as file:
            model = pickle.load(file)
        st.write(model.summary())
        st.header('2.2 Actual vs. Predicted Plot')

        df = st.session_state['media_data']
        X = df[features_set.values[0]]
        X = sm.add_constant(X)
        y = revenue
        X_train = X.iloc[:150]
        X_test = X.iloc[150:]
        y_train = y.iloc[:150]
        y_test = y.iloc[150:]
        ss = MinMaxScaler()
        X_train = pd.DataFrame(ss.fit_transform(X_train), columns=X_train.columns)
        st.session_state['X'] = X_train
        st.session_state['features_set'] = features_set.values[0]

        metrics_table, line, actual_vs_predicted_plot = plot_actual_vs_predicted(date, y_train, model.predict(X_train), model, target_column='Revenue')

        st.plotly_chart(actual_vs_predicted_plot, use_container_width=True)


        st.markdown('## 2.3 Residual Analysis')
        columns = st.columns(2)
        with columns[0]:
            fig = plot_residual_predicted(y_train, model.predict(X_train), X_train)
            st.plotly_chart(fig)

        with columns[1]:
            st.empty()
            fig = qqplot(y_train, model.predict(X_train))
            st.plotly_chart(fig)

        with columns[0]:
            fig = residual_distribution(y_train, model.predict(X_train))
            st.pyplot(fig)


        vif_data = pd.DataFrame()
        # X = X.drop('const', axis=1)
        vif_data["Variable"] = X_train.columns
        vif_data["VIF"] = [variance_inflation_factor(X_train.values, i) for i in range(X_train.shape[1])]
        vif_data.sort_values(by=['VIF'], ascending=False, inplace=True)
        vif_data = np.round(vif_data)
        vif_data['VIF'] = vif_data['VIF'].astype(float)
        st.header('2.4 Variance Inflation Factor (VIF)')
        # st.dataframe(vif_data)
        color_mapping = {
            'darkgreen': (vif_data['VIF'] < 3),
            'orange': (vif_data['VIF'] >= 3) & (vif_data['VIF'] <= 10),
            'darkred': (vif_data['VIF'] > 10)
        }
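        # For regressor i, VIF_i = 1 / (1 - R_i^2), where R_i^2 comes from
        # regressing regressor i on all the other regressors; the colour bands
        # above follow the usual rule of thumb (< 3 low, 3-10 moderate,
        # > 10 severe multicollinearity).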

        # Create a horizontal bar plot
        fig, ax = plt.subplots()
        fig.set_figwidth(10)  # Adjust the width of the figure as needed

        # Sort the bars by descending VIF values
        vif_data = vif_data.sort_values(by='VIF', ascending=False)

        # Iterate through the color mapping and plot bars with corresponding colors
        for color, condition in color_mapping.items():
            subset = vif_data[condition]
            bars = ax.barh(subset["Variable"], subset["VIF"], color=color, label=color)

            # Add text annotations next to the bars
            for bar in bars:
                width = bar.get_width()
                ax.annotate(f'{width}', xy=(width, bar.get_y() + bar.get_height() / 2), xytext=(5, 0),
                            textcoords='offset points', va='center')

        # Customize the plot
        ax.set_xlabel('VIF Values')
        # ax.set_title('2.4 Variance Inflation Factor (VIF)')
        # ax.legend(loc='upper right')

        # Display the plot in Streamlit
        st.pyplot(fig)

        with st.expander('Results Summary Test data'):
            ss = MinMaxScaler()
            X_test = pd.DataFrame(ss.fit_transform(X_test), columns=X_test.columns)
            st.header('2.2 Actual vs. Predicted Plot')

            metrics_table, line, actual_vs_predicted_plot = plot_actual_vs_predicted(date, y_test, model.predict(X_test), model, target_column='Revenue')

            st.plotly_chart(actual_vs_predicted_plot, use_container_width=True)

            st.markdown('## 2.3 Residual Analysis')
            columns = st.columns(2)
            with columns[0]:
                fig = plot_residual_predicted(y_test, model.predict(X_test), X_test)
                st.plotly_chart(fig)

            with columns[1]:
                st.empty()
                fig = qqplot(y_test, model.predict(X_test))
                st.plotly_chart(fig)

            with columns[0]:
                fig = residual_distribution(y_test, model.predict(X_test))
                st.pyplot(fig)

        value = False
        if st.checkbox('Save this model to tune', key='build_rc_cb'):
            mod_name = st.text_input('Enter model name')
            if len(mod_name) > 0:
                st.session_state['Model'][mod_name] = {"Model_object": model, 'feature_set': st.session_state['features_set'], 'X_train': X_train}
                st.session_state['X_train'] = X_train
                st.session_state['X_test'] = X_test
                st.session_state['y_train'] = y_train
                st.session_state['y_test'] = y_test
                with open("best_models.pkl", "wb") as f:
                    pickle.dump(st.session_state['Model'], f)
                st.success('Model saved! Proceed to the next page to tune the model.')
                value = False
dump/3_Model_Tuning.py
ADDED
@@ -0,0 +1,197 @@
import streamlit as st
import pandas as pd
import numpy as np  # used below for np.arange / np.round
from Eda_functions import format_numbers
import pickle
from utilities import set_header, load_local_css
import statsmodels.api as sm
import re
from sklearn.preprocessing import MinMaxScaler
import matplotlib.pyplot as plt
from statsmodels.stats.outliers_influence import variance_inflation_factor
st.set_option('deprecation.showPyplotGlobalUse', False)
from Data_prep_functions import *

st.set_page_config(
    page_title="Model Tuning",
    page_icon=":shark:",
    layout="wide",
    initial_sidebar_state='collapsed'
)
load_local_css('styles.css')
set_header()


st.title('1. Model Tuning')


if "X_train" not in st.session_state:
    st.error(
        "Oops! It seems there are no saved models available. Please build and save a model from the previous page to proceed.")
    st.stop()
X_train = st.session_state['X_train']
X_test = st.session_state['X_test']
y_train = st.session_state['y_train']
y_test = st.session_state['y_test']
df = st.session_state['media_data']

with open("best_models.pkl", 'rb') as file:
    model_dict = pickle.load(file)

if 'selected_model' not in st.session_state:
    st.session_state['selected_model'] = 0


st.markdown('### 1.1 Event Flags')
st.markdown('Helps quantify the impact of specific event occurrences')
with st.expander('Apply Event Flags'):
    st.session_state["selected_model"] = st.selectbox('Select Model to apply flags', model_dict.keys())
    model = model_dict[st.session_state["selected_model"]]['Model_object']
    date = st.session_state['date']
    date = pd.to_datetime(date)
    X_train = model_dict[st.session_state["selected_model"]]['X_train']
    features_set = model_dict[st.session_state["selected_model"]]['feature_set']

    col = st.columns(3)
    min_date = min(date)
    max_date = max(date)
    with col[0]:
        start_date = st.date_input('Select Start Date', min_date, min_value=min_date, max_value=max_date)
    with col[1]:
        end_date = st.date_input('Select End Date', max_date, min_value=min_date, max_value=max_date)
    with col[2]:
        repeat = st.selectbox('Repeat Annually', ['Yes', 'No'], index=1)
    if repeat == 'Yes':
        repeat = True
    else:
        repeat = False
    # X_train = sm.add_constant(X_train)

    if 'Flags' not in st.session_state:
        st.session_state['Flags'] = {}

    met, line_values, fig_flag = plot_actual_vs_predicted(date[:150], y_train, model.predict(X_train), model, flag=(start_date, end_date), repeat_all_years=repeat)
    st.plotly_chart(fig_flag, use_container_width=True)
    flag_name = 'f1'  # default name, overwritten by the text input below
    flag_name = st.text_input('Enter Flag Name')
    if st.button('Update flag'):
        st.session_state['Flags'][flag_name] = line_values
        st.success(f'{flag_name} stored')

    options = list(st.session_state['Flags'].keys())
    selected_options = []
    num_columns = 4
    num_rows = -(-len(options) // num_columns)  # ceiling division


    tick = False
    if st.checkbox('Select all'):
        tick = True
    selected_options = []
    for row in range(num_rows):
        cols = st.columns(num_columns)
        for col in cols:
            if options:
                option = options.pop(0)
                selected = col.checkbox(option, value=tick)
                if selected:
                    selected_options.append(option)

st.markdown('### 1.2 Select Parameters to Apply')
parameters = st.columns(3)
with parameters[0]:
    Trend = st.checkbox("**Trend**")
    st.markdown('Helps account for long-term trends or seasonality that could influence advertising effectiveness')
with parameters[1]:
    week_number = st.checkbox('**Week_number**')
    st.markdown('Assists in detecting and incorporating weekly patterns or seasonality')
with parameters[2]:
    sine_cosine = st.checkbox('**Sine and Cosine Waves**')
    st.markdown('Helps in capturing cyclical patterns or seasonality in the data')
if st.button('Build model with Selected Parameters and Flags'):
    st.header('2.1 Results Summary')
    # date = list(df.index)
    # df = df.reset_index(drop=True)
    # st.write(df.head(2))
    # X_train = df[features_set]
    ss = MinMaxScaler()
    X_train_tuned = pd.DataFrame(ss.fit_transform(X_train), columns=X_train.columns)
    X_train_tuned = sm.add_constant(X_train_tuned)
    for flag in selected_options:
        X_train_tuned[flag] = st.session_state['Flags'][flag]
    if Trend:
        X_train_tuned['Trend'] = np.arange(1, len(X_train_tuned) + 1, 1)
    # The 'if week_number:' guard is commented out in this version, so the
    # day-of-week column is always added.
    # if week_number:
    #     st.write(date)
    date = pd.to_datetime(date.values)
    X_train_tuned['Week_number'] = date.day_of_week[:150]
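    # The 'Sine and Cosine Waves' checkbox above is not applied anywhere in
    # this version of the script. A minimal sketch of what it could add,
    # assuming weekly data with an annual cycle (period = 52); this is an
    # illustration, not the original implementation:
    if sine_cosine:
        t = np.arange(len(X_train_tuned))
        period = 52  # assumed seasonality period
        X_train_tuned['sine_wave'] = np.sin(2 * np.pi * t / period)
        X_train_tuned['cosine_wave'] = np.cos(2 * np.pi * t / period)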
    model_tuned = sm.OLS(y_train, X_train_tuned).fit()

    metrics_table, line, actual_vs_predicted_plot = plot_actual_vs_predicted(date[:150], y_train, model.predict(X_train), model, target_column='Revenue')
    metrics_table_tuned, line, actual_vs_predicted_plot_tuned = plot_actual_vs_predicted(date[:150], y_train, model_tuned.predict(X_train_tuned), model_tuned, target_column='Revenue')

    # st.write(metrics_table)
    mape = np.round(metrics_table.iloc[0, 1], 2)
    r2 = np.round(metrics_table.iloc[1, 1], 2)
    adjr2 = np.round(metrics_table.iloc[2, 1], 2)
    mape_tuned = np.round(metrics_table_tuned.iloc[0, 1], 2)
    r2_tuned = np.round(metrics_table_tuned.iloc[1, 1], 2)
    adjr2_tuned = np.round(metrics_table_tuned.iloc[2, 1], 2)
    parameters_ = st.columns(3)
    with parameters_[0]:
        st.metric('R2', r2_tuned, np.round(r2_tuned - r2, 2))
    with parameters_[1]:
        st.metric('Adjusted R2', adjr2_tuned, np.round(adjr2_tuned - adjr2, 2))
    with parameters_[2]:
        # delta_color='inverse' so that a drop in MAPE is shown as an improvement
        st.metric('MAPE', mape_tuned, np.round(mape_tuned - mape, 2), 'inverse')

    st.header('2.2 Actual vs. Predicted Plot')
    metrics_table, line, actual_vs_predicted_plot = plot_actual_vs_predicted(date[:150], y_train, model_tuned.predict(X_train_tuned), model_tuned, target_column='Revenue')

    st.plotly_chart(actual_vs_predicted_plot, use_container_width=True)



    st.markdown('## 2.3 Residual Analysis')
    columns = st.columns(2)
    with columns[0]:
        fig = plot_residual_predicted(y_train, model_tuned.predict(X_train_tuned), X_train_tuned)
        st.plotly_chart(fig)

    with columns[1]:
        st.empty()
        fig = qqplot(y_train, model_tuned.predict(X_train_tuned))
        st.plotly_chart(fig)

    with columns[0]:
        fig = residual_distribution(y_train, model_tuned.predict(X_train_tuned))
        st.pyplot(fig)

    # if st.checkbox('Use this model to build response curves', key='123'):

    #     raw_data = df[features_set]
    #     columns_raw = [re.split(r"(_lag|_adst)", col)[0] for col in raw_data.columns]
    #     raw_data.columns = columns_raw
    #     columns_media = [col for col in columns_raw if Categorised_data[col]['BB'] == 'Media']
    #     raw_data = raw_data[columns_media]

    #     raw_data['Date'] = list(df.index)

    #     spends_var = [col for col in df.columns if "spends" in col.lower() and 'adst' not in col.lower() and 'lag' not in col.lower()]
    #     spends_df = df[spends_var]
    #     spends_df['Week'] = list(df.index)


    #     j = 0
    #     X1 = X.copy()
    #     col = X1.columns
    #     for i in model.params.values:
    #         X1[col[j]] = X1.iloc[:, j] * i
    #         j += 1
    #     contribution_df = X1
    #     contribution_df['Date'] = list(df.index)
    #     excel_file = 'Overview_data.xlsx'

    #     with pd.ExcelWriter(excel_file, engine='xlsxwriter') as writer:
    #         raw_data.to_excel(writer, sheet_name='RAW DATA MMM', index=False)
    #         spends_df.to_excel(writer, sheet_name='SPEND INPUT', index=False)
    #         contribution_df.to_excel(writer, sheet_name='CONTRIBUTION MMM')