|
|
|
import streamlit as st |
|
|
|
st.set_page_config( |
|
page_title="Data Import", |
|
page_icon=":shark:", |
|
layout="wide", |
|
initial_sidebar_state="collapsed", |
|
) |
|
|
|
import os |
|
import re |
|
import pickle |
|
import sqlite3 |
|
import pandas as pd |
|
from utilities import set_header, load_local_css, update_db, project_selection |
|
|
|
|
|
load_local_css("styles.css") |
|
set_header() |
|
|
|
|
|
if "project_name" not in st.session_state: |
|
st.session_state["project_name"] = None |
|
|
|
if "project_dct" not in st.session_state: |
|
project_selection() |
|
st.stop() |
|
|
|
|
|
if "username" in st.session_state and st.session_state["username"] is not None: |
|
|
|
cols1 = st.columns([2, 1]) |
|
|
|
with cols1[0]: |
|
st.markdown(f"**Welcome {st.session_state['username']}**") |
|
with cols1[1]: |
|
st.markdown(f"**Current Project: {st.session_state['project_name']}**") |
|
|
|
|
|
def validate_date_column(df): |
|
try: |
|
|
|
df["date"] = pd.to_datetime(df["date"], format="%d-%m-%Y") |
|
return True |
|
    except (KeyError, ValueError, TypeError):
        # Missing 'date' column or values not in DD-MM-YYYY format
        return False
|
|
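# Usage sketch (illustrative only): the column is coerced in place, so
# callers can rely on df["date"] being datetime64 after a True result.
#   df = pd.DataFrame({"date": ["01-01-2024", "08-01-2024"], "spend": [1, 2]})
#   validate_date_column(df)  # -> True; df["date"] is now datetime64[ns]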
|
|
|
def determine_data_interval(common_freq): |
|
if common_freq == 1: |
|
return "daily" |
|
elif common_freq == 7: |
|
return "weekly" |
|
elif 28 <= common_freq <= 31: |
|
return "monthly" |
|
else: |
|
return "irregular" |
|
|
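# Example: the modal day-gap between consecutive dates drives the label,
# e.g. determine_data_interval(1) -> "daily", determine_data_interval(7)
# -> "weekly", determine_data_interval(30) -> "monthly", and anything
# outside those gaps (say 10) -> "irregular".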
|
|
|
@st.cache_resource(show_spinner=False)
def files_to_dataframes(uploaded_files):
|
df_dict = {} |
|
for uploaded_file in uploaded_files: |
|
|
|
file_name = uploaded_file.name.rsplit(".", 1)[0] |
|
|
|
|
|
if file_name in df_dict: |
|
st.warning( |
|
f"Duplicate File: {file_name}. This file will be skipped.", |
|
icon="⚠️", |
|
) |
|
continue |
|
|
|
|
|
df = pd.read_excel(uploaded_file) |
|
|
|
|
|
df.columns = df.columns.str.lower().str.strip() |
|
|
|
|
|
numeric_cols = list(df.select_dtypes(include=["number"]).columns) |
|
non_numeric_cols = [ |
|
col |
|
for col in df.select_dtypes(exclude=["number"]).columns |
|
if col.lower() != "date" |
|
] |
|
|
|
|
|
if not (validate_date_column(df) and len(numeric_cols) > 0): |
|
st.warning( |
|
f"File Name: {file_name} ➜ Please upload data with Date column in 'DD-MM-YYYY' format and at least one media/exogenous column. This file will be skipped.", |
|
icon="⚠️", |
|
) |
|
continue |
|
|
|
|
|
        common_freq = (
|
pd.Series(df["date"].unique()).diff().dt.days.dropna().mode()[0] |
|
) |
|
|
|
interval = determine_data_interval(common_freq) |
|
if interval == "irregular": |
|
st.warning( |
|
f"File Name: {file_name} ➜ Please upload data in daily, weekly or monthly interval. This file will be skipped.", |
|
icon="⚠️", |
|
) |
|
continue |
|
|
|
|
|
df_dict[file_name] = { |
|
"numeric": numeric_cols, |
|
"non_numeric": non_numeric_cols, |
|
"interval": interval, |
|
"df": df, |
|
} |
|
|
|
return df_dict |
|
|
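# Shape of the returned mapping (illustrative sketch):
#   {"media_data": {"numeric": ["spend", "clicks"],
#                   "non_numeric": ["region"],
#                   "interval": "weekly",
#                   "df": <DataFrame>}, ...}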
|
|
|
def adjust_dataframe_granularity(df, current_granularity, target_granularity): |
|
|
|
df.set_index("date", inplace=True) |
|
|
|
|
|
aggregation_rules = { |
|
col: "sum" if pd.api.types.is_numeric_dtype(df[col]) else "first" |
|
for col in df.columns |
|
} |
|
|
|
|
|
resampled_df = df |
|
if current_granularity == "daily" and target_granularity == "weekly": |
|
resampled_df = df.resample("W-MON", closed="left", label="left").agg( |
|
aggregation_rules |
|
) |
|
|
|
elif current_granularity == "daily" and target_granularity == "monthly": |
|
resampled_df = df.resample("MS", closed="left", label="left").agg( |
|
aggregation_rules |
|
) |
|
|
|
elif current_granularity == "daily" and target_granularity == "daily": |
|
resampled_df = df.resample("D").agg(aggregation_rules) |
|
|
|
elif ( |
|
current_granularity in ["weekly", "monthly"] |
|
and target_granularity == "daily" |
|
): |
|
|
|
expanded_data = [] |
|
for _, row in df.iterrows(): |
|
if current_granularity == "weekly": |
|
period_range = pd.date_range(start=row.name, periods=7) |
|
elif current_granularity == "monthly": |
|
period_range = pd.date_range( |
|
start=row.name, periods=row.name.days_in_month |
|
) |
|
|
|
for date in period_range: |
|
new_row = {} |
|
for col in df.columns: |
|
if pd.api.types.is_numeric_dtype(df[col]): |
|
if current_granularity == "weekly": |
|
new_row[col] = row[col] / 7 |
|
elif current_granularity == "monthly": |
|
new_row[col] = row[col] / row.name.days_in_month |
|
else: |
|
new_row[col] = row[col] |
|
expanded_data.append((date, new_row)) |
|
|
|
resampled_df = pd.DataFrame( |
|
[data for _, data in expanded_data], |
|
index=[date for date, _ in expanded_data], |
|
) |
|
|
|
|
|
resampled_df = resampled_df.reset_index().rename(columns={"index": "date"}) |
|
|
|
return resampled_df |
|
|
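# Worked example (illustrative): a weekly row dated Mon 2024-01-01 with
# spend=70, downscaled to daily, becomes seven rows 2024-01-01..2024-01-07
# with spend=10 each; in the other direction, daily rows are aggregated
# into W-MON buckets (numeric columns summed, non-numeric take the first).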
|
|
|
@st.cache_resource(show_spinner=False)
def clean_and_extract_unique_values(files_dict, selections):
|
all_panel1_values = set() |
|
all_panel2_values = set() |
|
|
|
for file_name, file_data in files_dict.items(): |
|
df = file_data["df"] |
|
|
|
|
|
selected_panel1 = selections[file_name].get("Panel_1") |
|
selected_panel2 = selections[file_name].get("Panel_2") |
|
|
|
|
|
if ( |
|
selected_panel1 |
|
and selected_panel1 != "N/A" |
|
and selected_panel1 in df.columns |
|
): |
|
df[selected_panel1] = ( |
|
df[selected_panel1].str.lower().str.strip().str.replace("_", " ") |
|
) |
|
all_panel1_values.update(df[selected_panel1].dropna().unique()) |
|
|
|
|
|
if ( |
|
selected_panel2 |
|
and selected_panel2 != "N/A" |
|
and selected_panel2 in df.columns |
|
): |
|
df[selected_panel2] = ( |
|
df[selected_panel2].str.lower().str.strip().str.replace("_", " ") |
|
) |
|
all_panel2_values.update(df[selected_panel2].dropna().unique()) |
|
|
|
|
|
files_dict[file_name]["df"] = df |
|
|
|
return all_panel1_values, all_panel2_values |
|
|
|
|
|
@st.cache_resource(show_spinner=False)
def format_values_for_display(values_list):
|
|
|
formatted_list = [value.replace("_", " ").title() for value in values_list] |
|
|
|
if len(formatted_list) > 1: |
|
return ", ".join(formatted_list[:-1]) + ", and " + formatted_list[-1] |
|
elif formatted_list: |
|
return formatted_list[0] |
|
return "No values available" |
|
|
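# e.g. format_values_for_display(["north_region", "south_region"])
# -> "North Region, and South Region"; a single value is returned
# title-cased as-is, and an empty list yields "No values available".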
|
|
|
@st.cache_resource(show_spinner=False)
def standardize_data_to_daily(files_dict, selections):
|
|
|
files_dict = apply_granularity_to_all(files_dict, "daily", selections) |
|
|
|
|
|
    for file_data in files_dict.values():
        file_data["interval"] = "daily"
|
|
|
return files_dict |
|
|
|
|
|
@st.cache_resource(show_spinner=False)
def apply_granularity_to_all(files_dict, granularity_selection, selections):
|
for file_name, file_data in files_dict.items(): |
|
df = file_data["df"].copy() |
|
|
|
|
|
selected_panel1 = selections[file_name].get("Panel_1") |
|
selected_panel2 = selections[file_name].get("Panel_2") |
|
|
|
|
|
if selected_panel1 != "N/A" and selected_panel2 != "N/A": |
|
unique_combinations = df[ |
|
[selected_panel1, selected_panel2] |
|
].drop_duplicates() |
|
elif selected_panel1 != "N/A": |
|
unique_combinations = df[[selected_panel1]].drop_duplicates() |
|
selected_panel2 = None |
|
elif selected_panel2 != "N/A": |
|
unique_combinations = df[[selected_panel2]].drop_duplicates() |
|
selected_panel1 = None |
|
else: |
|
|
|
df = adjust_dataframe_granularity( |
|
df, file_data["interval"], granularity_selection |
|
) |
|
files_dict[file_name]["df"] = df |
|
continue |
|
|
|
transformed_segments = [] |
|
for _, combo in unique_combinations.iterrows(): |
|
if selected_panel1 and selected_panel2: |
|
segment = df[ |
|
(df[selected_panel1] == combo[selected_panel1]) |
|
& (df[selected_panel2] == combo[selected_panel2]) |
|
] |
|
elif selected_panel1: |
|
segment = df[df[selected_panel1] == combo[selected_panel1]] |
|
elif selected_panel2: |
|
segment = df[df[selected_panel2] == combo[selected_panel2]] |
|
|
|
|
|
transformed_segment = adjust_dataframe_granularity( |
|
segment, file_data["interval"], granularity_selection |
|
) |
|
transformed_segments.append(transformed_segment) |
|
|
|
|
|
transformed_df = pd.concat(transformed_segments, ignore_index=True) |
|
files_dict[file_name]["df"] = transformed_df |
|
|
|
return files_dict |
|
|
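# The per-panel segmentation above matters because resampling a whole file
# at once would merge rows from different panels that share a date; each
# unique (Panel_1, Panel_2) combination is resampled separately and the
# segments are concatenated back together.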
|
|
|
@st.cache_resource(show_spinner=False)
def create_main_dataframe(
|
files_dict, all_panel1_values, all_panel2_values, granularity_selection |
|
): |
|
|
|
    global_start = min(
        file_data["df"]["date"].min() for file_data in files_dict.values()
    )
    global_end = max(
        file_data["df"]["date"].max() for file_data in files_dict.values()
    )
|
|
|
|
|
if granularity_selection == "weekly": |
|
|
|
date_range = pd.date_range(start=global_start, end=global_end, freq="W-MON") |
|
elif granularity_selection == "monthly": |
|
|
|
date_range = pd.date_range(start=global_start, end=global_end, freq="MS") |
|
else: |
|
date_range = pd.date_range(start=global_start, end=global_end, freq="D") |
|
|
|
|
|
all_panel1s = all_panel1_values |
|
all_panel2s = all_panel2_values |
|
|
|
|
|
dimensions, merge_keys = [], [] |
|
if all_panel1s: |
|
dimensions.append(all_panel1s) |
|
merge_keys.append("Panel_1") |
|
if all_panel2s: |
|
dimensions.append(all_panel2s) |
|
merge_keys.append("Panel_2") |
|
|
|
dimensions.append(date_range) |
|
merge_keys.append("date") |
|
|
|
|
|
main_df = pd.MultiIndex.from_product( |
|
dimensions, |
|
        names=merge_keys,
|
).to_frame(index=False) |
|
|
|
return main_df.reset_index(drop=True) |
|
|
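# Sketch: with Panel_1 values {"us", "uk"}, no Panel_2, and a 3-day range,
# the scaffold has 2 * 3 = 6 rows covering every (Panel_1, date) pair;
# uploaded files are later left-joined onto this complete grid.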
|
|
|
@st.cache_resource(show_spinner=False)
def merge_into_main_df(main_df, files_dict, selections):
|
for file_name, file_data in files_dict.items(): |
|
df = file_data["df"].copy() |
|
|
|
|
|
selected_panel1 = selections[file_name].get("Panel_1", "N/A") |
|
selected_panel2 = selections[file_name].get("Panel_2", "N/A") |
|
if selected_panel1 != "N/A": |
|
df.rename(columns={selected_panel1: "Panel_1"}, inplace=True) |
|
if selected_panel2 != "N/A": |
|
df.rename(columns={selected_panel2: "Panel_2"}, inplace=True) |
|
|
|
|
|
merge_keys = ["date"] |
|
if "Panel_1" in df.columns: |
|
merge_keys.append("Panel_1") |
|
if "Panel_2" in df.columns: |
|
merge_keys.append("Panel_2") |
|
main_df = pd.merge(main_df, df, on=merge_keys, how="left") |
|
|
|
|
|
sort_by = ["date"] |
|
if "Panel_1" in main_df.columns: |
|
sort_by.append("Panel_1") |
|
if "Panel_2" in main_df.columns: |
|
sort_by.append("Panel_2") |
|
main_df.sort_values(by=sort_by, inplace=True) |
|
main_df.reset_index(drop=True, inplace=True) |
|
|
|
return main_df |
|
|
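# Each file is left-joined on whichever of ["date", "Panel_1", "Panel_2"]
# it actually carries, so dates or panels missing from a file surface as
# NaN and are handled by the imputation step further down the page.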
|
|
|
def categorize_column(column_name): |
|
|
|
internal_keywords = [ |
|
"Internal", |
|
"Price", |
|
"Discount", |
|
"product_price", |
|
"cost", |
|
"margin", |
|
"inventory", |
|
"sales", |
|
"revenue", |
|
"turnover", |
|
"expense", |
|
] |
|
exogenous_keywords = [ |
|
"Exogenous", |
|
"GDP", |
|
"Tax", |
|
"Inflation", |
|
"interest_rate", |
|
"employment_rate", |
|
"exchange_rate", |
|
"consumer_spending", |
|
"retail_sales", |
|
"oil_prices", |
|
"weather", |
|
] |
|
|
|
|
|
|
|
if ( |
|
column_name |
|
in st.session_state["project_dct"]["data_import"]["cat_dct"].keys() |
|
and st.session_state["project_dct"]["data_import"]["cat_dct"][column_name] |
|
is not None |
|
): |
|
|
|
return st.session_state["project_dct"]["data_import"]["cat_dct"][ |
|
column_name |
|
] |
|
|
|
else: |
|
for keyword in ["Response", "Metric"]: |
|
if keyword.lower() in column_name.lower(): |
|
return "Response Metrics" |
|
for keyword in ["Spend", "Cost"]: |
|
if keyword.lower() in column_name.lower(): |
|
return "Spends" |
|
for keyword in internal_keywords: |
|
if keyword.lower() in column_name.lower(): |
|
return "Internal" |
|
for keyword in exogenous_keywords: |
|
if keyword.lower() in column_name.lower(): |
|
return "Exogenous" |
|
|
|
|
|
return "Media" |
|
|
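# Keyword matching is case-insensitive and ordered; illustrative examples:
#   "facebook_spend"  -> "Spends"    (matches "spend")
#   "gdp_index"       -> "Exogenous" (matches "gdp")
#   "tv_impressions"  -> "Media"     (no keyword match; default bucket)
# A category previously saved in project_dct["data_import"]["cat_dct"]
# always takes precedence over keyword matching.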
|
|
|
@st.cache_resource(show_spinner=False)
def prepare_missing_stats_df(df):
|
missing_stats = [] |
|
for column in df.columns: |
|
        if column in ("date", "Panel_1", "Panel_2"):
            continue
|
|
|
missing = df[column].isnull().sum() |
|
pct_missing = round((missing / len(df)) * 100, 2) |
|
|
|
|
|
category = categorize_column(column) |
|
|
|
|
|
missing_stats.append( |
|
{ |
|
"Column": column, |
|
"Missing Values": missing, |
|
"Missing Percentage": pct_missing, |
|
"Impute Method": "Fill with 0", |
|
"Category": category, |
|
} |
|
) |
|
|
|
stats_df = pd.DataFrame(missing_stats) |
|
|
|
return stats_df |
|
|
|
|
|
@st.cache_resource(show_spinner=False)
def add_api_dataframe_to_dict(main_df, files_dict):
|
files_dict["API"] = { |
|
"numeric": list(main_df.select_dtypes(include=["number"]).columns), |
|
"non_numeric": [ |
|
col |
|
for col in main_df.select_dtypes(exclude=["number"]).columns |
|
if col.lower() != "date" |
|
], |
|
"interval": determine_data_interval( |
|
pd.Series(main_df["date"].unique()).diff().dt.days.dropna().mode()[0] |
|
), |
|
"df": main_df, |
|
} |
|
|
|
return files_dict |
|
|
|
|
|
|
|
def read_API_data(project_folder_path, file_path, file_name):
    # Prefer a copy already cached in the project folder; otherwise read the
    # bundled workbook under ./API_data and save a copy into the project

    file_path_os = os.path.join(os.getcwd(), "API_data", file_name)
|
project_folder_path_os = os.path.normpath(project_folder_path) |
|
|
|
|
|
project_file_path = os.path.join(project_folder_path_os, file_name) |
|
|
|
|
|
if os.path.exists(project_file_path): |
|
|
|
return pd.read_excel(project_file_path, parse_dates=["Date"]) |
|
else: |
|
|
|
data = pd.read_excel(file_path_os, parse_dates=["Date"]) |
|
|
|
|
|
data.to_excel(project_file_path, index=False) |
|
|
|
|
|
return data |
|
|
|
|
|
def set_Panel_1_Panel_2_Selected_false():
    # Reset the panel-selection flag and clear all downstream results
    # whenever an upstream input (files, granularity, panels) changes
    st.session_state["Panel_1_Panel_2_Selected"] = False
|
|
|
|
|
st.session_state["project_dct"]["data_import"]["edited_stats_df"] = None |
|
st.session_state["project_dct"]["data_import"]["merged_df"] = None |
|
st.session_state["project_dct"]["data_import"]["missing_stats_df"] = None |
|
st.session_state["project_dct"]["data_import"]["cat_dct"] = {} |
|
st.session_state["project_dct"]["data_import"]["numeric_columns"] = None |
|
st.session_state["project_dct"]["data_import"]["default_df"] = None |
|
st.session_state["project_dct"]["data_import"]["final_df"] = None |
|
st.session_state["project_dct"]["data_import"]["edited_df"] = None |
|
|
|
|
|
@st.cache_resource(show_spinner=False) |
|
def save_to_pickle(file_path, final_df, bin_dict): |
|
|
|
with open(file_path, "wb") as f: |
|
pickle.dump( |
|
{"final_df": final_df, "bin_dict": bin_dict}, f |
|
) |
|
|
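# Loading counterpart (sketch; not used on this page):
#   with open(file_path, "rb") as f:
#       payload = pickle.load(f)
#   final_df, bin_dict = payload["final_df"], payload["bin_dict"]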
|
|
|
@st.cache_resource(show_spinner=False) |
|
def process_dataframes(merged_df, edited_df, edited_stats_df): |
|
|
|
if edited_df.empty: |
|
|
|
return merged_df, edited_stats_df |
|
|
|
|
|
else: |
|
|
|
for index, row in edited_df.iterrows(): |
|
result_column_name = ( |
|
f"{row['Column 1']}{row['Operator']}{row['Column 2']}" |
|
) |
|
col1 = row["Column 1"] |
|
col2 = row["Column 2"] |
|
op = row["Operator"] |
|
|
|
|
|
if op == "+": |
|
merged_df[result_column_name] = merged_df[col1] + merged_df[col2] |
|
elif op == "-": |
|
merged_df[result_column_name] = merged_df[col1] - merged_df[col2] |
|
elif op == "*": |
|
merged_df[result_column_name] = merged_df[col1] * merged_df[col2] |
|
elif op == "/": |
|
merged_df[result_column_name] = merged_df[col1] / merged_df[ |
|
col2 |
|
].replace(0, 1e-9) |
|
|
|
|
|
new_row = { |
|
"Column": result_column_name, |
|
"Missing Values": None, |
|
"Missing Percentage": None, |
|
"Impute Method": None, |
|
"Category": row["Category"], |
|
} |
|
new_row_df = pd.DataFrame([new_row]) |
|
|
|
|
|
edited_stats_df = pd.concat( |
|
[edited_stats_df, new_row_df], ignore_index=True, axis=0 |
|
) |
|
|
|
|
|
combined_columns = set(edited_df["Column 1"]).union( |
|
set(edited_df["Column 2"]) |
|
) |
|
|
|
|
|
edited_stats_df = edited_stats_df[ |
|
~edited_stats_df["Column"].isin(combined_columns) |
|
] |
|
merged_df.drop( |
|
columns=list(combined_columns), errors="ignore", inplace=True |
|
) |
|
|
|
return merged_df, edited_stats_df |
|
|
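# Sketch: an edited_df row ("tv_spend", "+", "radio_spend", "Media") adds
# merged_df["tv_spend+radio_spend"], appends a matching row to the stats
# table, and then drops the two source columns from both frames.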
|
|
|
@st.cache_resource(show_spinner=False)
def prepare_numeric_columns_and_default_df(merged_df, edited_stats_df):
|
|
|
columns_response_metrics = edited_stats_df[ |
|
edited_stats_df["Category"] == "Response Metrics" |
|
]["Column"].tolist() |
|
|
|
|
|
numeric_columns = [ |
|
col |
|
for col in merged_df.select_dtypes(include=["number"]).columns |
|
if col not in columns_response_metrics |
|
] |
|
|
|
|
|
data = { |
|
"Column 1": pd.Series([], dtype="str"), |
|
"Operator": pd.Series([], dtype="str"), |
|
"Column 2": pd.Series([], dtype="str"), |
|
"Category": pd.Series([], dtype="str"), |
|
} |
|
default_df = pd.DataFrame(data) |
|
|
|
return numeric_columns, default_df |
|
|
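# Response-metric columns are excluded from the operand choices above so
# the feature-engineering editor cannot fold a target variable into a
# derived predictor.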
|
|
|
|
|
|
|
if "final_df" not in st.session_state: |
|
st.session_state["final_df"] = pd.DataFrame() |
|
|
|
|
|
if "bin_dict" not in st.session_state: |
|
st.session_state["bin_dict"] = {} |
|
|
|
|
|
if "Panel_1_Panel_2_Selected" not in st.session_state: |
|
st.session_state["Panel_1_Panel_2_Selected"] = False |
|
|
|
|
|
st.write("") |
|
st.title("Data Import") |
|
|
|
conn = sqlite3.connect(
    os.path.join("DB", "User.db"), check_same_thread=False
)
|
c = conn.cursor() |
|
|
|
|
|
|
|
|
|
|
|
|
|
main_df = read_API_data( |
|
project_folder_path=st.session_state["project_path"], |
|
file_path=st.session_state["project_dct"]["data_import"]["api_path"], |
|
file_name=st.session_state["project_dct"]["data_import"]["api_name"] + ".xlsx", |
|
) |
|
|
|
|
|
main_df.columns = main_df.columns.str.lower().str.strip() |
|
|
|
|
|
uploaded_files = st.file_uploader( |
|
"Upload additional data", |
|
type=["xlsx"], |
|
accept_multiple_files=True, |
|
on_change=set_Panel_1_Panel_2_Selected_false, |
|
) |
|
|
|
|
|
recommendation_html = f""" |
|
<div style="text-align: justify;"> |
|
        <strong>Recommendation:</strong> For optimal processing, please ensure that all uploaded datasets (panel, media, internal, and exogenous data) adhere to the following guidelines: each dataset must include a <code>Date</code> column formatted as <code>DD-MM-YYYY</code> and be free of missing values.
|
</div> |
|
""" |
|
st.markdown(recommendation_html, unsafe_allow_html=True) |
|
|
|
|
|
st.markdown("#### API Data") |
|
with st.expander("API Data", expanded=False): |
|
st.dataframe(main_df, hide_index=True) |
|
|
|
|
|
st.markdown("#### Choose Desired Granularity") |
|
|
|
|
|
granularity_selection = st.selectbox( |
|
"Choose Date Granularity", |
|
["Daily", "Weekly", "Monthly"], |
|
label_visibility="collapsed", |
|
on_change=set_Panel_1_Panel_2_Selected_false, |
|
index=st.session_state["project_dct"]["data_import"][ |
|
"granularity_selection" |
|
], |
|
) |
|
|
|
|
|
|
|
st.session_state["project_dct"]["data_import"]["granularity_selection"] = [ |
|
"Daily", |
|
"Weekly", |
|
"Monthly", |
|
].index(granularity_selection) |
|
|
|
granularity_selection = str(granularity_selection).lower() |
|
|
|
|
|
files_dict = files_to_dataframes(uploaded_files) |
|
|
|
|
|
if main_df is not None: |
|
files_dict = add_api_dataframe_to_dict(main_df, files_dict) |
|
|
|
|
|
if not files_dict: |
|
st.warning( |
|
"Please upload at least one file to proceed.", |
|
icon="⚠️", |
|
) |
|
st.stop() |
|
|
|
|
|
st.markdown("#### Select Panel columns") |
|
selections = {} |
|
with st.expander("Select Panel columns", expanded=False): |
|
count = 0 |
|
    unique_numeric_columns = set()  # column names seen across all files
|
|
|
for file_name, file_data in files_dict.items(): |
|
|
|
numeric_columns = file_data["numeric"] |
|
|
|
|
|
valid_column_pattern = re.compile(r"^[A-Za-z0-9_]+$") |
|
|
|
|
|
for column in numeric_columns: |
|
if column in unique_numeric_columns: |
|
|
|
st.warning( |
|
f"Duplicate column name '{column}' found in file '{file_name}'. Each column name must be unique across all files.", |
|
icon="⚠️", |
|
) |
|
st.stop() |
|
|
|
unique_numeric_columns.add(column) |
|
|
|
|
|
if not valid_column_pattern.match(column): |
|
st.warning( |
|
f"Column name '{column}' in file '{file_name}' contains invalid characters. " |
|
f"Column names should only contain letters (A-Z, a-z), numbers (0-9), and underscores (_).", |
|
icon="⚠️", |
|
) |
|
st.stop() |
|
|
|
if ( |
|
f"Panel_1_selectbox{file_name}" |
|
not in st.session_state["project_dct"]["data_import"].keys() |
|
): |
|
st.session_state["project_dct"]["data_import"][ |
|
f"Panel_1_selectbox{file_name}" |
|
] = 0 |
|
|
|
if ( |
|
f"Panel_2_selectbox{file_name}" |
|
not in st.session_state["project_dct"]["data_import"].keys() |
|
): |
|
|
|
st.session_state["project_dct"]["data_import"][ |
|
f"Panel_2_selectbox{file_name}" |
|
] = 0 |
|
|
|
|
|
if count == 0: |
|
label_visibility = "visible" |
|
else: |
|
label_visibility = "collapsed" |
|
|
|
|
|
non_numeric_cols = file_data["non_numeric"] |
|
|
|
|
|
panel1_values = non_numeric_cols + ["N/A"] |
|
panel2_values = non_numeric_cols + ["N/A"] |
|
|
|
|
|
if len(panel1_values) == 1 and len(panel2_values) == 1: |
|
selected_panel1, selected_panel2 = "N/A", "N/A" |
|
|
|
selections[file_name] = { |
|
"Panel_1": selected_panel1, |
|
"Panel_2": selected_panel2, |
|
} |
|
continue |
|
|
|
|
|
file_name_col, Panel_1_col, Panel_2_col = st.columns([2, 4, 4]) |
|
|
|
with file_name_col: |
|
|
|
if count == 0: |
|
st.write("File Name") |
|
else: |
|
st.write("") |
|
st.write(file_name) |
|
|
|
with Panel_1_col: |
|
|
|
selected_panel1 = st.selectbox( |
|
"Select Panel Level 1", |
|
                panel1_values,
|
on_change=set_Panel_1_Panel_2_Selected_false, |
|
label_visibility=label_visibility, |
|
key=f"Panel_1_selectbox{count}", |
|
index=st.session_state["project_dct"]["data_import"][ |
|
f"Panel_1_selectbox{file_name}" |
|
], |
|
) |
|
|
|
st.session_state["project_dct"]["data_import"][ |
|
f"Panel_1_selectbox{file_name}" |
|
            ] = panel1_values.index(selected_panel1)
|
|
|
with Panel_2_col: |
|
|
|
selected_panel2 = st.selectbox( |
|
"Select Panel Level 2", |
|
                panel2_values,
|
on_change=set_Panel_1_Panel_2_Selected_false, |
|
label_visibility=label_visibility, |
|
key=f"Panel_2_selectbox{count}", |
|
index=st.session_state["project_dct"]["data_import"][ |
|
f"Panel_2_selectbox{file_name}" |
|
], |
|
) |
|
|
|
st.session_state["project_dct"]["data_import"][ |
|
f"Panel_2_selectbox{file_name}" |
|
            ] = panel2_values.index(selected_panel2)
|
|
|
|
|
if selected_panel2 == selected_panel1 and not ( |
|
selected_panel2 == "N/A" and selected_panel1 == "N/A" |
|
): |
|
            st.warning(
                f"File: {file_name} → The same column cannot serve as both Panel_1 and Panel_2. Please adjust your selections.",
                icon="⚠️",
            )
            st.stop()
|
|
|
|
|
if len(non_numeric_cols) > 2: |
|
st.warning( |
|
f"File: {file_name} → The input file contains more than two non-numeric/panel columns. Please verify the file's contents.", |
|
) |
|
st.stop() |
|
|
|
|
|
selected_panels_count = (1 if selected_panel1 != "N/A" else 0) + ( |
|
1 if selected_panel2 != "N/A" else 0 |
|
) |
|
|
|
|
|
if len(non_numeric_cols) != selected_panels_count: |
|
st.warning( |
|
f"File: {file_name} → The number of non-numeric columns selected does not match the expected panel count. Please ensure all required columns are selected.", |
|
) |
|
st.stop() |
|
|
|
|
|
selections[file_name] = { |
|
"Panel_1": selected_panel1, |
|
"Panel_2": selected_panel2, |
|
} |
|
|
|
count += 1 |
|
        st.write("")
|
|
|
accept = st.button("Accept and Process", use_container_width=True) |
|
|
|
if ( |
|
    not accept
|
and st.session_state["project_dct"]["data_import"]["edited_stats_df"] |
|
is not None |
|
): |
|
|
|
|
|
st.markdown("#### Unique Panel values") |
|
|
|
with st.expander("Unique Panel values"): |
|
st.write("") |
|
st.markdown( |
|
f""" |
|
<style> |
|
.justify-text {{ |
|
text-align: justify; |
|
}} |
|
</style> |
|
<div class="justify-text"> |
|
<strong>Panel Level 1 Values:</strong> {st.session_state['project_dct']['data_import']['formatted_panel1_values']}<br> |
|
<strong>Panel Level 2 Values:</strong> {st.session_state['project_dct']['data_import']['formatted_panel2_values']} |
|
</div> |
|
""", |
|
unsafe_allow_html=True, |
|
) |
|
|
|
|
|
st.write("") |
|
        panel1_count = len(
            st.session_state["project_dct"]["data_import"].get(
                "panel1_values_list", []
            )
        )
        panel2_count = len(
            st.session_state["project_dct"]["data_import"].get(
                "panel2_values_list", []
            )
        )
        st.markdown(
            f"""
            <div style="text-align: justify;">
                <strong>Number of Level 1 Panels detected:</strong> {panel1_count}<br>
                <strong>Number of Level 2 Panels detected:</strong> {panel2_count}
            </div>
            """,
            unsafe_allow_html=True,
        )
|
st.write("") |
|
|
|
|
|
st.markdown("#### Select Variables Category & Impute Missing Values") |
|
|
|
merged_df = st.session_state["project_dct"]["data_import"]["merged_df"].copy() |
|
missing_stats_df = st.session_state["project_dct"]["data_import"][ |
|
"missing_stats_df" |
|
] |
|
editable_df = st.session_state["project_dct"]["data_import"]["edited_stats_df"] |
|
sorted_editable_df = editable_df.sort_values( |
|
by="Missing Values", ascending=False, na_position="first" |
|
) |
|
|
|
edited_stats_df = st.data_editor( |
|
sorted_editable_df, |
|
column_config={ |
|
"Impute Method": st.column_config.SelectboxColumn( |
|
options=[ |
|
"Drop Column", |
|
"Fill with Mean", |
|
"Fill with Median", |
|
"Fill with 0", |
|
], |
|
required=True, |
|
default="Fill with 0", |
|
), |
|
"Category": st.column_config.SelectboxColumn( |
|
options=[ |
|
"Spends", |
|
"Media", |
|
"Exogenous", |
|
"Internal", |
|
"Response Metrics", |
|
], |
|
required=True, |
|
default="Media", |
|
), |
|
}, |
|
disabled=["Column", "Missing Values", "Missing Percentage"], |
|
hide_index=True, |
|
use_container_width=True, |
|
key="data-editor-1", |
|
) |
|
|
|
st.session_state["project_dct"]["data_import"]["cat_dct"] = { |
|
col: cat |
|
for col, cat in zip(edited_stats_df["Column"], edited_stats_df["Category"]) |
|
if col in merged_df.columns |
|
} |
|
|
|
for i, row in edited_stats_df.iterrows(): |
|
column = row["Column"] |
|
if ( |
|
column |
|
not in st.session_state["project_dct"]["data_import"]["cat_dct"].keys() |
|
): |
|
continue |
|
if row["Impute Method"] == "Drop Column": |
|
merged_df.drop(columns=[column], inplace=True) |
|
|
|
elif row["Impute Method"] == "Fill with Mean": |
|
merged_df[column].fillna( |
|
st.session_state["project_dct"]["data_import"]["merged_df"][ |
|
column |
|
].mean(), |
|
inplace=True, |
|
) |
|
|
|
elif row["Impute Method"] == "Fill with Median": |
|
merged_df[column].fillna( |
|
st.session_state["project_dct"]["data_import"]["merged_df"][ |
|
column |
|
].median(), |
|
inplace=True, |
|
) |
|
|
|
elif row["Impute Method"] == "Fill with 0": |
|
merged_df[column].fillna(0, inplace=True) |
|
|
|
|
|
|
|
|
|
|
|
|
|
numeric_columns = st.session_state["project_dct"]["data_import"][ |
|
"numeric_columns" |
|
] |
|
default_df = st.session_state["project_dct"]["data_import"]["default_df"] |
|
|
|
st.markdown("#### Feature engineering") |
|
|
|
edited_df = st.data_editor( |
|
st.session_state["project_dct"]["data_import"]["edited_df"], |
|
column_config={ |
|
"Column 1": st.column_config.SelectboxColumn( |
|
options=numeric_columns, |
|
required=True, |
|
width=400, |
|
), |
|
"Operator": st.column_config.SelectboxColumn( |
|
options=["+", "-", "*", "/"], |
|
required=True, |
|
default="+", |
|
width=100, |
|
), |
|
"Column 2": st.column_config.SelectboxColumn( |
|
options=numeric_columns, |
|
required=True, |
|
default=numeric_columns[0], |
|
width=400, |
|
), |
|
"Category": st.column_config.SelectboxColumn( |
|
options=[ |
|
"Media", |
|
"Exogenous", |
|
"Internal", |
|
"Response Metrics", |
|
], |
|
required=True, |
|
default="Media", |
|
width=200, |
|
), |
|
}, |
|
num_rows="dynamic", |
|
key="data-editor-4", |
|
) |
|
|
|
final_df, edited_stats_df = process_dataframes( |
|
merged_df, edited_df, edited_stats_df |
|
) |
|
|
|
st.markdown("#### Final DataFrame") |
|
    sort_col = [
        col for col in final_df.columns if col in ("Panel_1", "Panel_2", "date")
    ]
|
|
|
sorted_final_df = final_df.sort_values( |
|
by=sort_col, ascending=True, na_position="first" |
|
) |
|
|
|
st.dataframe(sorted_final_df, hide_index=True) |
|
|
|
|
|
category_dict = { |
|
"Spends": [], |
|
"Media": [], |
|
"Exogenous": [], |
|
"Internal": [], |
|
"Response Metrics": [], |
|
} |
|
|
|
|
|
for i, row in edited_stats_df.iterrows(): |
|
column = row["Column"] |
|
        category = row["Category"]
|
|
|
        if column not in final_df.columns:
|
continue |
|
|
|
|
|
        category_dict.setdefault(category, []).append(column)
|
|
|
|
|
category_dict.update({"Date": ["date"]}) |
|
if "Panel_1" in final_df.columns: |
|
category_dict["Panel Level 1"] = ["Panel_1"] |
|
if "Panel_2" in final_df.columns: |
|
category_dict["Panel Level 2"] = ["Panel_2"] |
|
|
|
|
|
|
|
with st.expander("Group Media Channels"): |
|
media_channels = category_dict["Media"] |
|
spends_channels = category_dict["Spends"] |
|
|
|
allowed_channels_bin = media_channels + spends_channels |
|
group_selection_placeholder = st.container() |
|
        total_groups = int(
            st.number_input("Total Groups", min_value=0, step=1, value=0)
        )
|
group_dict = {} |
|
channels_added = set() |
|
|
|
with group_selection_placeholder: |
|
for i in range(total_groups): |
|
|
|
group_name_inp_col, group_col = st.columns([1, 4]) |
|
group_name = group_name_inp_col.text_input( |
|
"Group name", key=f"group_name_{i}" |
|
) |
|
|
|
|
|
allowed_channels = sorted( |
|
[ |
|
channel |
|
for channel in allowed_channels_bin |
|
if channel not in channels_added |
|
], |
|
                    key=lambda x: x.split("_")[0],
                )
|
|
|
selected_channels = group_col.multiselect( |
|
"Select channels to group", |
|
options=allowed_channels, |
|
key=f"selected_channels_key_{i}", |
|
) |
|
|
|
                if group_name and selected_channels:
|
group_dict[group_name] = selected_channels |
|
channels_added.update(selected_channels) |
|
|
|
|
|
|
|
st.markdown("#### Variable Category") |
|
for category, variables in category_dict.items(): |
|
|
|
        if len(variables) > 1:
            variables_str = ", ".join(variables[:-1]) + " and " + variables[-1]
        elif len(variables) == 1:
            variables_str = variables[0]
        else:
            variables_str = ""
|
|
|
|
|
st.markdown( |
|
f"<div style='text-align: justify;'><strong>{category}:</strong> {variables_str}</div>", |
|
unsafe_allow_html=True, |
|
) |
|
|
|
|
|
st.write("") |
|
|
|
|
|
required_categories = ["Response Metrics", "Spends", "Media"] |
|
|
|
|
|
for category in required_categories: |
|
category_columns = category_dict.get(category, []) |
|
if len(category_columns) == 0: |
|
st.warning( |
|
f"Please select at least one column for the {category} category", |
|
icon="⚠️", |
|
) |
|
st.stop() |
|
|
|
|
|
filtered_channels = [ |
|
channel |
|
for channel in category_dict["Media"] + category_dict["Spends"] |
|
if channel in final_df.columns |
|
] |
|
|
|
|
|
all_added_channels = [] |
|
    for channels in group_dict.values():
        all_added_channels += channels
|
|
|
|
|
if len(all_added_channels) != len(set(all_added_channels)): |
|
st.warning( |
|
"A channel can only be grouped once, and duplicate groupings are not permitted", |
|
icon="⚠️", |
|
) |
|
st.stop() |
|
|
|
|
|
    if set(filtered_channels) != set(all_added_channels):
|
st.warning("Please group all media channels", icon="⚠️") |
|
st.stop() |
|
|
|
|
|
st.session_state["final_df"], st.session_state["bin_dict"] = ( |
|
final_df, |
|
category_dict, |
|
) |
|
|
|
|
|
if st.button( |
|
"Accept and Save", |
|
use_container_width=True, |
|
key="data-editor-button", |
|
): |
|
update_db("1_Data_Import.py") |
|
final_df = final_df.loc[:, ~final_df.columns.duplicated()] |
|
|
|
project_dct_path = os.path.join( |
|
st.session_state["project_path"], "project_dct.pkl" |
|
) |
|
|
|
with open(project_dct_path, "wb") as f: |
|
pickle.dump(st.session_state["project_dct"], f) |
|
|
|
data_path = os.path.join( |
|
st.session_state["project_path"], "data_import.pkl" |
|
) |
|
|
|
st.session_state["data_path"] = data_path |
|
|
|
save_to_pickle( |
|
data_path, |
|
st.session_state["final_df"], |
|
st.session_state["bin_dict"], |
|
) |
|
|
|
st.session_state["project_dct"]["data_import"][ |
|
"edited_stats_df" |
|
] = edited_stats_df |
|
st.session_state["project_dct"]["data_import"]["merged_df"] = merged_df |
|
st.session_state["project_dct"]["data_import"][ |
|
"missing_stats_df" |
|
] = missing_stats_df |
|
st.session_state["project_dct"]["data_import"]["cat_dct"] = { |
|
col: cat |
|
for col, cat in zip( |
|
edited_stats_df["Column"], edited_stats_df["Category"] |
|
) |
|
} |
|
st.session_state["project_dct"]["data_import"][ |
|
"numeric_columns" |
|
] = numeric_columns |
|
st.session_state["project_dct"]["data_import"]["default_df"] = default_df |
|
st.session_state["project_dct"]["data_import"]["final_df"] = final_df |
|
st.session_state["project_dct"]["data_import"]["edited_df"] = edited_df |
|
|
|
st.toast("💾 Saved Successfully!") |
|
|
|
if accept: |
|
|
|
with st.spinner("Processing..."): |
|
files_dict = standardize_data_to_daily(files_dict, selections) |
|
|
|
|
|
files_dict = apply_granularity_to_all( |
|
files_dict, granularity_selection, selections |
|
) |
|
|
|
|
|
st.session_state["files_dict"] = files_dict |
|
|
|
|
|
st.session_state["Panel_1_Panel_2_Selected"] = True |
|
|
|
|
|
|
|
|
|
|
|
|
|
if st.session_state["project_dct"]["data_import"]["edited_stats_df"] is None: |
|
|
|
if ( |
|
"files_dict" in st.session_state |
|
and st.session_state["Panel_1_Panel_2_Selected"] |
|
): |
|
files_dict = st.session_state["files_dict"] |
|
|
|
st.session_state["project_dct"]["data_import"][ |
|
"files_dict" |
|
] = files_dict |
|
else: |
|
st.stop() |
|
|
|
|
|
with st.spinner("Fetching Panel values..."): |
|
all_panel1_values, all_panel2_values = clean_and_extract_unique_values( |
|
files_dict, selections |
|
) |
|
|
|
|
|
list_of_all_panel1_values = list(all_panel1_values) |
|
list_of_all_panel2_values = list(all_panel2_values) |
|
|
|
|
|
formatted_panel1_values = format_values_for_display( |
|
list_of_all_panel1_values |
|
) |
|
formatted_panel2_values = format_values_for_display( |
|
list_of_all_panel2_values |
|
) |
|
|
|
        st.session_state["project_dct"]["data_import"][
            "formatted_panel1_values"
        ] = formatted_panel1_values
        st.session_state["project_dct"]["data_import"][
            "formatted_panel2_values"
        ] = formatted_panel2_values

        # Persist the raw value lists too, so panel counts can be shown
        # correctly when this page is revisited after saving
        st.session_state["project_dct"]["data_import"][
            "panel1_values_list"
        ] = list_of_all_panel1_values
        st.session_state["project_dct"]["data_import"][
            "panel2_values_list"
        ] = list_of_all_panel2_values
|
|
|
|
|
st.markdown("#### Unique Panel values") |
|
|
|
with st.expander("Unique Panel values"): |
|
st.write("") |
|
st.markdown( |
|
f""" |
|
<style> |
|
.justify-text {{ |
|
text-align: justify; |
|
}} |
|
</style> |
|
<div class="justify-text"> |
|
<strong>Panel Level 1 Values:</strong> {formatted_panel1_values}<br> |
|
<strong>Panel Level 2 Values:</strong> {formatted_panel2_values} |
|
</div> |
|
""", |
|
unsafe_allow_html=True, |
|
) |
|
|
|
|
|
st.write("") |
|
st.markdown( |
|
f""" |
|
<div style="text-align: justify;"> |
|
<strong>Number of Level 1 Panels detected:</strong> {len(list_of_all_panel1_values)}<br> |
|
<strong>Number of Level 2 Panels detected:</strong> {len(list_of_all_panel2_values)} |
|
</div> |
|
""", |
|
unsafe_allow_html=True, |
|
) |
|
st.write("") |
|
|
|
|
|
|
|
|
|
|
|
|
|
main_df = create_main_dataframe( |
|
files_dict, |
|
all_panel1_values, |
|
all_panel2_values, |
|
granularity_selection, |
|
) |
|
|
|
merged_df = merge_into_main_df(main_df, files_dict, selections) |
|
|
|
|
|
|
|
|
|
|
|
|
|
st.markdown("#### Select Variables Category & Impute Missing Values") |
|
|
|
|
|
missing_stats_df = prepare_missing_stats_df(merged_df) |
|
sorted_missing_stats_df = missing_stats_df.sort_values( |
|
by="Missing Values", ascending=False, na_position="first" |
|
) |
|
|
|
edited_stats_df = st.data_editor( |
|
sorted_missing_stats_df, |
|
column_config={ |
|
"Impute Method": st.column_config.SelectboxColumn( |
|
options=[ |
|
"Drop Column", |
|
"Fill with Mean", |
|
"Fill with Median", |
|
"Fill with 0", |
|
], |
|
required=True, |
|
default="Fill with 0", |
|
), |
|
"Category": st.column_config.SelectboxColumn( |
|
options=[ |
|
"Spends", |
|
"Media", |
|
"Exogenous", |
|
"Internal", |
|
"Response Metrics", |
|
], |
|
required=True, |
|
default="Media", |
|
), |
|
}, |
|
disabled=["Column", "Missing Values", "Missing Percentage"], |
|
hide_index=True, |
|
use_container_width=True, |
|
key="data-editor-2", |
|
) |
|
|
|
|
|
for i, row in edited_stats_df.iterrows(): |
|
column = row["Column"] |
|
if row["Impute Method"] == "Drop Column": |
|
merged_df.drop(columns=[column], inplace=True) |
|
|
|
elif row["Impute Method"] == "Fill with Mean": |
|
merged_df[column].fillna(merged_df[column].mean(), inplace=True) |
|
|
|
elif row["Impute Method"] == "Fill with Median": |
|
merged_df[column].fillna(merged_df[column].median(), inplace=True) |
|
|
|
elif row["Impute Method"] == "Fill with 0": |
|
merged_df[column].fillna(0, inplace=True) |
|
|
|
|
|
|
|
|
|
|
|
|
|
st.markdown("#### Feature engineering") |
|
|
|
|
|
numeric_columns, default_df = prepare_numeric_columns_and_default_df( |
|
merged_df, edited_stats_df |
|
) |
|
|
|
|
|
edited_df = st.data_editor( |
|
default_df, |
|
column_config={ |
|
"Column 1": st.column_config.SelectboxColumn( |
|
options=numeric_columns, |
|
required=True, |
|
width=400, |
|
), |
|
"Operator": st.column_config.SelectboxColumn( |
|
options=["+", "-", "*", "/"], |
|
required=True, |
|
default="+", |
|
width=100, |
|
), |
|
"Column 2": st.column_config.SelectboxColumn( |
|
options=numeric_columns, |
|
required=True, |
|
default=numeric_columns[0], |
|
width=400, |
|
), |
|
"Category": st.column_config.SelectboxColumn( |
|
options=[ |
|
"Media", |
|
"Exogenous", |
|
"Internal", |
|
"Response Metrics", |
|
], |
|
required=True, |
|
default="Media", |
|
width=200, |
|
), |
|
}, |
|
num_rows="dynamic", |
|
key="data-editor-3", |
|
) |
|
|
|
|
|
final_df, edited_stats_df = process_dataframes( |
|
merged_df, edited_df, edited_stats_df |
|
) |
|
|
|
|
|
|
|
|
|
|
|
|
|
st.markdown("#### Final DataFrame") |
|
|
|
    sort_col = [
        col for col in final_df.columns if col in ("Panel_1", "Panel_2", "date")
    ]
|
|
|
sorted_final_df = final_df.sort_values( |
|
by=sort_col, ascending=True, na_position="first" |
|
) |
|
st.dataframe(sorted_final_df, hide_index=True) |
|
|
|
|
|
category_dict = { |
|
"Spends": [], |
|
"Media": [], |
|
"Exogenous": [], |
|
"Internal": [], |
|
"Response Metrics": [], |
|
} |
|
|
|
|
|
for i, row in edited_stats_df.iterrows(): |
|
column = row["Column"] |
|
        category = row["Category"]
|
|
|
        if column not in final_df.columns:
|
continue |
|
|
|
|
|
        category_dict.setdefault(category, []).append(column)
|
|
|
|
|
category_dict.update({"Date": ["date"]}) |
|
if "Panel_1" in final_df.columns: |
|
category_dict["Panel Level 1"] = ["Panel_1"] |
|
if "Panel_2" in final_df.columns: |
|
category_dict["Panel Level 2"] = ["Panel_2"] |
|
|
|
|
|
|
|
with st.expander("Group Media Channels"): |
|
media_channels = category_dict["Media"] |
|
spends_channels = category_dict["Spends"] |
|
|
|
allowed_channels_bin = media_channels + spends_channels |
|
group_selection_placeholder = st.container() |
|
        total_groups = int(
            st.number_input("Total Groups", min_value=0, step=1, value=0)
        )
|
group_dict = {} |
|
channels_added = set() |
|
|
|
with group_selection_placeholder: |
|
for i in range(total_groups): |
|
|
|
group_name_inp_col, group_col = st.columns([1, 4]) |
|
group_name = group_name_inp_col.text_input( |
|
"Group name", key=f"group_name_{i}" |
|
) |
|
|
|
|
|
allowed_channels = sorted( |
|
[ |
|
channel |
|
for channel in allowed_channels_bin |
|
if channel not in channels_added |
|
], |
|
                    key=lambda x: x.split("_")[0],
                )
|
|
|
selected_channels = group_col.multiselect( |
|
"Select channels to group", |
|
options=allowed_channels, |
|
key=f"selected_channels_key_{i}", |
|
) |
|
|
|
                if group_name and selected_channels:
|
group_dict[group_name] = selected_channels |
|
channels_added.update(selected_channels) |
|
|
|
|
|
|
|
|
|
st.markdown("#### Variable Category") |
|
for category, variables in category_dict.items(): |
|
|
|
        if len(variables) > 1:
            variables_str = ", ".join(variables[:-1]) + " and " + variables[-1]
        elif len(variables) == 1:
            variables_str = variables[0]
        else:
            variables_str = ""
|
|
|
|
|
st.markdown( |
|
f"<div style='text-align: justify;'><strong>{category}:</strong> {variables_str}</div>", |
|
unsafe_allow_html=True, |
|
) |
|
|
|
|
|
st.write("") |
|
|
|
|
|
required_categories = ["Response Metrics", "Spends", "Media"] |
|
|
|
|
|
for category in required_categories: |
|
category_columns = category_dict.get(category, []) |
|
if len(category_columns) == 0: |
|
st.warning( |
|
f"Please select at least one column for the {category} category", |
|
icon="⚠️", |
|
) |
|
st.stop() |
|
|
|
|
|
filtered_channels = [ |
|
channel |
|
for channel in category_dict["Media"] + category_dict["Spends"] |
|
if channel in final_df.columns |
|
] |
|
|
|
|
|
all_added_channels = [] |
|
    for channels in group_dict.values():
        all_added_channels += channels
|
|
|
|
|
if len(all_added_channels) != len(set(all_added_channels)): |
|
st.warning( |
|
"A channel can only be grouped once, and duplicate groupings are not permitted", |
|
icon="⚠️", |
|
) |
|
st.stop() |
|
|
|
|
|
    if set(filtered_channels) != set(all_added_channels):
|
st.warning("Please group all media channels", icon="⚠️") |
|
st.stop() |
|
|
|
|
|
st.session_state["final_df"], st.session_state["bin_dict"] = ( |
|
final_df, |
|
category_dict, |
|
) |
|
|
|
|
|
if st.button("Accept and Save", use_container_width=True): |
|
|
|
update_db("1_Data_Import.py") |
|
|
|
project_dct_path = os.path.join( |
|
st.session_state["project_path"], "project_dct.pkl" |
|
) |
|
|
|
with open(project_dct_path, "wb") as f: |
|
pickle.dump(st.session_state["project_dct"], f) |
|
|
|
data_path = os.path.join( |
|
st.session_state["project_path"], "data_import.pkl" |
|
) |
|
st.session_state["data_path"] = data_path |
|
|
|
save_to_pickle( |
|
data_path, |
|
st.session_state["final_df"], |
|
st.session_state["bin_dict"], |
|
) |
|
|
|
|
|
if len(category_dict["Exogenous"]) > 0: |
|
for exog_var in category_dict["Exogenous"]: |
|
group_dict[exog_var] = [exog_var] |
|
with open( |
|
os.path.join(st.session_state["project_path"], "channel_groups.pkl"), |
|
"wb", |
|
) as f: |
|
pickle.dump(group_dict, f) |
|
|
|
st.session_state["project_dct"]["data_import"][ |
|
"edited_stats_df" |
|
] = edited_stats_df |
|
st.session_state["project_dct"]["data_import"]["merged_df"] = merged_df |
|
st.session_state["project_dct"]["data_import"][ |
|
"missing_stats_df" |
|
] = missing_stats_df |
|
st.session_state["project_dct"]["data_import"]["cat_dct"] = { |
|
col: cat |
|
for col, cat in zip( |
|
edited_stats_df["Column"], edited_stats_df["Category"] |
|
) |
|
} |
|
|
|
st.session_state["project_dct"]["data_import"][ |
|
"numeric_columns" |
|
] = numeric_columns |
|
st.session_state["project_dct"]["data_import"]["default_df"] = default_df |
|
st.session_state["project_dct"]["data_import"]["final_df"] = final_df |
|
st.session_state["project_dct"]["data_import"]["edited_df"] = edited_df |
|
|
|
st.toast("💾 Saved Successfully!") |
|
|