import streamlit as st import pandas as pd import plotly.express as px import plotly.graph_objects as go import numpy as np import pickle import statsmodels.api as sm import numpy as np from sklearn.metrics import mean_absolute_error, r2_score,mean_absolute_percentage_error from sklearn.preprocessing import MinMaxScaler import matplotlib.pyplot as plt from statsmodels.stats.outliers_influence import variance_inflation_factor from plotly.subplots import make_subplots st.set_option('deprecation.showPyplotGlobalUse', False) from datetime import datetime import seaborn as sns def calculate_discount(promo_price_series, non_promo_price_series): # Calculate the 4-week moving average of non-promo price window_size = 4 base_price = non_promo_price_series.rolling(window=window_size).mean() # Calculate discount_raw discount_raw_series = (1 - promo_price_series / base_price) * 100 # Calculate discount_final discount_final_series = discount_raw_series.where(discount_raw_series >= 5, 0) return base_price, discount_raw_series, discount_final_series def create_dual_axis_line_chart(date_series, promo_price_series, non_promo_price_series, base_price_series, discount_series): # Create traces for the primary axis (price vars) trace1 = go.Scatter( x=date_series, y=promo_price_series, name='Promo Price', yaxis='y1' ) trace2 = go.Scatter( x=date_series, y=non_promo_price_series, name='Non-Promo Price', yaxis='y1' ) trace3 = go.Scatter( x=date_series, y=base_price_series, name='Base Price', yaxis='y1' ) # Create a trace for the secondary axis (discount) trace4 = go.Scatter( x=date_series, y=discount_series, name='Discount', yaxis='y2' ) # Create the layout with dual axes layout = go.Layout( title='Price and Discount Over Time', yaxis=dict( title='Price', side='left' ), yaxis2=dict( title='Discount', side='right', overlaying='y', showgrid=False ), xaxis=dict(title='Date'), ) # Create the figure with the defined traces and layout fig = go.Figure(data=[trace1, trace2, trace3, trace4], layout=layout) return fig def to_percentage(value): return f'{value * 100:.1f}%' def plot_actual_vs_predicted(date, y, predicted_values, model,target_column=None, flag=None, repeat_all_years=False, is_panel=False): if flag is not None : fig = make_subplots(specs=[[{"secondary_y": True}]]) else : fig = go.Figure() if is_panel : df=pd.DataFrame() df['date'] = date df['Actual'] = y df['Predicted'] = predicted_values df_agg = df.groupby('date').agg({'Actual':'sum', 'Predicted':'sum'}).reset_index() df_agg.columns = ['date', 'Actual', 'Predicted'] assert len(df_agg) == pd.Series(date).nunique() # date = df_agg['date'] # y = df_agg['Actual'] # predicted_values = df_agg['Predicted'] # ymax = df_agg['Actual'].max() # Sprint3 - ymax to set y value for flag fig.add_trace(go.Scatter(x=df_agg['date'], y=df_agg['Actual'], mode='lines', name='Actual', line=dict(color='#08083B'))) fig.add_trace(go.Scatter(x=df_agg['date'], y=df_agg['Predicted'], mode='lines', name='Predicted', line=dict(color='#11B6BD'))) else : fig.add_trace(go.Scatter(x=date, y=y, mode='lines', name='Actual', line=dict(color='#08083B'))) fig.add_trace(go.Scatter(x=date, y=predicted_values, mode='lines', name='Predicted', line=dict(color='#11B6BD'))) line_values=[] if flag: min_date, max_date = flag[0], flag[1] min_week = datetime.strptime(str(min_date), "%Y-%m-%d").strftime("%U") max_week = datetime.strptime(str(max_date), "%Y-%m-%d").strftime("%U") month=pd.to_datetime(min_date).month day=pd.to_datetime(min_date).day #st.write(pd.to_datetime(min_date).week) #st.write(min_week) # Initialize an empty list to store line values # Sprint3 change : put flags to secondary axis, & made their y value to 1 instead of 5M if repeat_all_years: #line_values=list(pd.to_datetime((pd.Series(date)).dt.week).map(lambda x: 10000 if x==min_week else 0 )) #st.write(pd.Series(date).map(lambda x: pd.Timestamp(x).week)) line_values=list(pd.Series(date).map(lambda x: 1 if (pd.Timestamp(x).week >=int(min_week)) & (pd.Timestamp(x).week <=int(max_week)) else 0)) assert len(line_values) == len(date) #st.write(line_values) fig.add_trace(go.Scatter(x=date, y=line_values, mode='lines', name='Flag', line=dict(color='#FF5733')),secondary_y=True) else: line_values = [] line_values = list(pd.Series(date).map(lambda x: 1 if (pd.Timestamp(x) >= pd.Timestamp(min_date)) and (pd.Timestamp(x) <= pd.Timestamp(max_date)) else 0)) #st.write(line_values) fig.add_trace(go.Scatter(x=date, y=line_values, mode='lines', name='Flag', line=dict(color='#FF5733')),secondary_y=True) # Calculate MAPE mape = mean_absolute_percentage_error(y, predicted_values) # Calculate AdjR2 # Assuming X is your feature matrix r2 = r2_score(y, predicted_values) adjr2 = 1 - (1 - r2) * (len(y) - 1) / (len(y) - len(model.fe_params) - 1) # Create a table to display the metrics metrics_table = pd.DataFrame({ 'Metric': ['MAPE', 'R-squared', 'AdjR-squared'], 'Value': [mape, r2, adjr2] }) # st.write(metrics_table) fig.update_layout( xaxis=dict(title='Date'), yaxis=dict(title=target_column), xaxis_tickangle=-30 ) fig.add_annotation( text=f"MAPE: {mape*100:0.1f}%, Adjr2: {adjr2 *100:.1f}%", xref="paper", yref="paper", x=0.95, # Adjust these values to position the annotation y=1.2, showarrow=False, ) # print("{}{}"*20, len(line_values)) #metrics_table.set_index(['Metric'],inplace=True) return metrics_table,line_values, fig def plot_residual_predicted(actual, predicted, df): df_=df.copy() df_['Residuals'] = actual - pd.Series(predicted) df_['StdResidual'] = (df_['Residuals'] - df_['Residuals'].mean()) / df_['Residuals'].std() # Create a Plotly scatter plot fig = px.scatter(df_, x=predicted, y='StdResidual', opacity=0.5,color_discrete_sequence=["#11B6BD"]) # Add horizontal lines fig.add_hline(y=0, line_dash="dash", line_color="darkorange") fig.add_hline(y=2, line_color="red") fig.add_hline(y=-2, line_color="red") fig.update_xaxes(title='Predicted') fig.update_yaxes(title='Standardized Residuals (Actual - Predicted)') # Set the same width and height for both figures fig.update_layout(title='2.3.1 Residuals over Predicted Values', autosize=False, width=600, height=400) return fig def residual_distribution(actual, predicted): Residuals = actual - pd.Series(predicted) # Create a Seaborn distribution plot sns.set(style="whitegrid") plt.figure(figsize=(6, 4)) sns.histplot(Residuals, kde=True, color="#11B6BD") plt.title('2.3.3 Distribution of Residuals') plt.xlabel('Residuals') plt.ylabel('Probability Density') return plt def qqplot(actual, predicted): Residuals = actual - pd.Series(predicted) Residuals = pd.Series(Residuals) Resud_std = (Residuals - Residuals.mean()) / Residuals.std() # Create a QQ plot using Plotly with custom colors fig = go.Figure() fig.add_trace(go.Scatter(x=sm.ProbPlot(Resud_std).theoretical_quantiles, y=sm.ProbPlot(Resud_std).sample_quantiles, mode='markers', marker=dict(size=5, color="#11B6BD"), name='QQ Plot')) # Add the 45-degree reference line diagonal_line = go.Scatter( x=[-2, 2], # Adjust the x values as needed to fit the range of your data y=[-2, 2], # Adjust the y values accordingly mode='lines', line=dict(color='red'), # Customize the line color and style name=' ' ) fig.add_trace(diagonal_line) # Customize the layout fig.update_layout(title='2.3.2 QQ Plot of Residuals',title_x=0.5, autosize=False, width=600, height=400, xaxis_title='Theoretical Quantiles', yaxis_title='Sample Quantiles') return fig