Spaces:
Sleeping
Sleeping
import streamlit as st | |
import time | |
import pandas as pd | |
import plotly.express as px | |
import plotly.graph_objects as go | |
import matplotlib.pyplot as plt | |
import numpy as np | |
import lightgbm as lgb | |
from sklearn.feature_extraction.text import TfidfVectorizer | |
from sklearn.metrics.pairwise import cosine_similarity | |
from sklearn.metrics import mean_absolute_error, mean_squared_error | |
from joblib import dump, load | |
from utils import recomienda_tfid | |
# Page configuration | |
st.set_page_config(page_title="DeepInsightz", page_icon=":bar_chart:", layout="wide") | |
# Custom CSS for dynamic theme styling | |
# Streamlit detects light and dark mode automatically via the user's settings in Hugging Face Spaces | |
if st.get_option("theme.base") == "dark": | |
background_color = "#282828" | |
text_color = "white" | |
metric_box_color = "#4f4f4f" | |
sidebar_color = "#282828" | |
plot_bgcolor = "rgba(0, 0, 0, 0)" | |
primary_color = '#00FF00' # for positive delta | |
negative_color = '#FF0000' # for negative delta | |
else: | |
background_color = "#f4f4f4" | |
text_color = "#black" | |
metric_box_color = "#dee2e8" | |
sidebar_color = "#dee2e8" | |
plot_bgcolor = "#f4f4f4" | |
primary_color = '#228B22' # for positive delta in light mode | |
negative_color = '#8B0000' # for negative delta in light mode | |
st.markdown(f""" | |
<style> | |
body {{ | |
background-color: {background_color}; | |
color: {text_color}; | |
}} | |
[data-testid="stMetric"] {{ | |
background-color: {metric_box_color}; | |
border-radius: 10px; | |
text-align: center; | |
padding: 15px 0; | |
margin-bottom: 20px; | |
}} | |
[data-testid="stMetricLabel"] {{ | |
display: flex; | |
justify-content: center; | |
align-items: center; | |
color: {text_color}; | |
}} | |
[data-testid="stSidebar"] {{ | |
background-color: {sidebar_color}; | |
}} | |
</style> | |
""", unsafe_allow_html=True) | |
# Navigation menu | |
with st.sidebar: | |
st.sidebar.title("DeepInsightz") | |
page = st.sidebar.selectbox("Select the tool you want to use", ["Summary", "Customer Analysis", "Articles Recommendations"]) | |
# Load CSV files at the top | |
df = pd.read_csv("df_clean.csv") | |
nombres_proveedores = pd.read_csv("nombres_proveedores.csv", sep=';') | |
euros_proveedor = pd.read_csv("euros_proveedor.csv", sep=',') | |
ventas_clientes = pd.read_csv("ventas_clientes.csv", sep=',') | |
customer_clusters = pd.read_csv('predicts/customer_clusters.csv') # Load the customer clusters here | |
df_agg_2024 = pd.read_csv('predicts/df_agg_2024.csv') | |
pca_data_5 = pd.read_csv('pca_data.csv') | |
# Generamos la columna total_sales | |
ventas_clientes['total_sales'] = ventas_clientes[['VENTA_2021', 'VENTA_2022', 'VENTA_2023']].sum(axis=1) | |
# Ordenar los clientes de mayor a menor según sus ventas totales | |
ventas_top_100 = ventas_clientes.sort_values(by='total_sales', ascending=False).head(100) | |
# Ensure customer codes are strings | |
df['CLIENTE'] = df['CLIENTE'].astype(str) | |
nombres_proveedores['codigo'] = nombres_proveedores['codigo'].astype(str) | |
euros_proveedor['CLIENTE'] = euros_proveedor['CLIENTE'].astype(str) | |
customer_clusters['cliente_id'] = customer_clusters['cliente_id'].astype(str) # Ensure customer IDs are strings | |
fieles_df = pd.read_csv("clientes_relevantes.csv") | |
cestas = pd.read_csv("cestas.csv") | |
productos = pd.read_csv("productos.csv") | |
df_agg_2024['cliente_id'] = df_agg_2024['cliente_id'].astype(str) | |
marca_id_mapping = load('marca_id_mapping.joblib') | |
# Convert all columns except 'CLIENTE' to float in euros_proveedor | |
for col in euros_proveedor.columns: | |
if col != 'CLIENTE': | |
euros_proveedor[col] = pd.to_numeric(euros_proveedor[col], errors='coerce') | |
# Check for NaN values after conversion | |
if euros_proveedor.isna().any().any(): | |
st.warning("Some values in euros_proveedor couldn't be converted to numbers. Please review the input data.") | |
# Ignore the last two columns of df | |
df = df.iloc[:, :-2] | |
# Function to get supplier name | |
def get_supplier_name(code): | |
code = str(code) # Ensure code is a string | |
name = nombres_proveedores[nombres_proveedores['codigo'] == code]['nombre'].values | |
return name[0] if len(name) > 0 else code | |
def get_supplier_name_encoded(encoded_code): | |
encoded_code = int(encoded_code) # Ensure the encoded code is an integer | |
# First, try to use the label encoder to map the encoded code back to the original manufacturer code | |
if encoded_code in marca_id_mapping.classes_: | |
real_code = marca_id_mapping.inverse_transform([encoded_code])[0] | |
else: | |
return f"Unknown code: {encoded_code}" # Handle case where encoded code is not found | |
# Now, use the real_code to find the manufacturer name in nombres_proveedores | |
name = nombres_proveedores[nombres_proveedores['codigo'] == str(real_code)]['nombre'].values | |
return name[0] if len(name) > 0 else real_code | |
# Custom Donut Chart with Plotly for Inbound/Outbound Percentage | |
def create_donut_chart(values, labels, color_scheme, title): | |
fig = px.pie( | |
values=values, | |
names=labels, | |
hole=0.7, | |
color_discrete_sequence=color_scheme | |
) | |
fig.update_traces(textinfo='percent+label', hoverinfo='label+percent', textposition='inside', showlegend=False) | |
fig.update_layout( | |
annotations=[dict(text=f"{int(values[1])}%", x=0.5, y=0.5, font_size=40, showarrow=False)], | |
title=title, | |
height=300, | |
margin=dict(t=30, b=10, l=10, r=10), | |
paper_bgcolor=plot_bgcolor, # Use theme-dependent background color | |
plot_bgcolor=plot_bgcolor | |
) | |
return fig | |
# Donut chart with color scheme based on theme | |
if st.get_option("theme.base") == "dark": | |
donut_color_scheme = ['#155F7A', '#29b5e8'] # Dark mode colors | |
else: | |
donut_color_scheme = ['#007BFF', '#66b5ff'] # Light mode colors | |
# Function to create radar chart with square root transformation | |
def radar_chart(categories, values, amounts, title): | |
N = len(categories) | |
angles = [n / float(N) * 2 * np.pi for n in range(N)] | |
angles += angles[:1] | |
fig, ax = plt.subplots(figsize=(12, 12), subplot_kw=dict(projection='polar')) | |
# Apply square root transformation | |
sqrt_values = np.sqrt(values) | |
sqrt_amounts = np.sqrt(amounts) | |
max_sqrt_value = max(sqrt_values) | |
normalized_values = [v / max_sqrt_value for v in sqrt_values] | |
# Adjust scaling for spend values | |
max_sqrt_amount = max(sqrt_amounts) | |
scaling_factor = 0.7 # Adjust this value to control how much the spend values are scaled up | |
normalized_amounts = [min((a / max_sqrt_amount) * scaling_factor, 1.0) for a in sqrt_amounts] | |
normalized_values += normalized_values[:1] | |
ax.plot(angles, normalized_values, 'o-', linewidth=2, color='#FF69B4', label='% Units (sqrt)') | |
ax.fill(angles, normalized_values, alpha=0.25, color='#FF69B4') | |
normalized_amounts += normalized_amounts[:1] | |
ax.plot(angles, normalized_amounts, 'o-', linewidth=2, color='#4B0082', label='% Spend (sqrt)') | |
ax.fill(angles, normalized_amounts, alpha=0.25, color='#4B0082') | |
ax.set_xticks(angles[:-1]) | |
ax.set_xticklabels(categories, size=8, wrap=True) | |
ax.set_ylim(0, 1) | |
circles = np.linspace(0, 1, 5) | |
for circle in circles: | |
ax.plot(angles, [circle]*len(angles), '--', color='gray', alpha=0.3, linewidth=0.5) | |
ax.set_yticklabels([]) | |
ax.spines['polar'].set_visible(False) | |
plt.title(title, size=16, y=1.1) | |
plt.legend(loc='upper right', bbox_to_anchor=(1.3, 1.1)) | |
return fig | |
if page == "Summary": | |
# st.title("Welcome to DeepInsightz") | |
# st.markdown(""" | |
# ### Data-driven Customer Clustering | |
# We analyzed thousands of customers and suppliers to help businesses make smarter sales decisions. | |
# """) | |
# Create layout with three columns | |
col1, col2, col3 = st.columns((1.5, 4, 2.5), gap='medium') | |
# Left Column (Red): Metrics and Donut Charts | |
with col1: | |
st.markdown('#### General Information') | |
st.metric(label="Range of Dates", value="2021-2023") | |
st.metric(label="Customers Analysed", value="3.000") | |
st.metric(label="Unique Products Sold", value="10.702") | |
st.metric(label="Total Sales Instances", value="764.396") | |
# Middle Column (White): 3D Cluster Model and Bar Chart | |
with col2: | |
st.markdown('#### 3D Customer Clusters') | |
# Create 3D PCA plot using actual data from pca_data_5 | |
fig_cluster = px.scatter_3d( | |
pca_data_5, | |
x='PC1', | |
y='PC2', | |
z='PC3', | |
color='cluster_id', | |
hover_name='CustomerID', | |
) | |
fig_cluster.update_layout( | |
scene=dict(aspectratio=dict(x=1, y=1, z=0.8)), # Adjusted aspect ratio for better balance | |
margin=dict(t=10, b=10, l=10, r=10), # Tighten margins further | |
height=600, # Slightly increased height for better visibility | |
) | |
st.plotly_chart(fig_cluster, use_container_width=True) | |
# Right Column (Blue): Key Metrics Overview and Data Preparation Summary | |
with col3: | |
# Mostrar la tabla con los 100 mejores clientes | |
st.markdown('#### Top 100 Clients by Total Sales') | |
# Configurar columnas para mostrar los clientes y las ventas totales | |
st.dataframe(ventas_top_100[['codigo_cliente', 'total_sales']], | |
column_order=("codigo_cliente", "total_sales"), | |
hide_index=True, | |
width=450, # Ajustar el ancho de la tabla | |
height=600, # Ajustar la altura de la tabla | |
column_config={ | |
"codigo_cliente": st.column_config.TextColumn( | |
"Client Code", | |
), | |
"total_sales": st.column_config.ProgressColumn( | |
"Total Sales (€)", | |
format="%d", | |
min_value=0, | |
max_value=ventas_top_100['total_sales'].max() | |
)} | |
) | |
# Customer Analysis Page | |
elif page == "Customer Analysis": | |
st.markdown(""" | |
<h2 style='text-align: center; font-size: 2.5rem;'>Customer Analysis</h2> | |
<p style='text-align: center; font-size: 1.2rem; color: gray;'> | |
Enter the customer code to explore detailed customer insights, | |
including past sales, predictions for the current year, and manufacturer-specific information. | |
</p> | |
""", unsafe_allow_html=True) | |
# Combine text input and dropdown into a single searchable selectbox | |
customer_code = st.selectbox( | |
"Search and Select Customer Code", | |
df['CLIENTE'].unique(), # All customer codes | |
format_func=lambda x: str(x), # Ensures the values are displayed as strings | |
help="Start typing to search for a specific customer code" | |
) | |
if st.button("Calcular"): | |
if customer_code: | |
with st.spinner("We are identifying the customer's cluster..."): | |
# Find Customer's Cluster | |
customer_match = customer_clusters[customer_clusters['cliente_id'] == customer_code] | |
time.sleep(1) | |
if not customer_match.empty: | |
cluster = customer_match['cluster_id'].values[0] | |
with st.spinner(f"Selecting predictive model..."): | |
# Load the Corresponding Model | |
model_path = f'models/modelo_cluster_{cluster}.txt' | |
gbm = lgb.Booster(model_file=model_path) | |
with st.spinner("Getting the data ready..."): | |
# Load predict data for that cluster | |
predict_data = pd.read_csv(f'predicts/predict_cluster_{cluster}.csv') | |
# Convert cliente_id to string | |
predict_data['cliente_id'] = predict_data['cliente_id'].astype(str) | |
with st.spinner("Filtering data..."): | |
# Filter for the specific customer | |
customer_code_str = str(customer_code) | |
customer_data = predict_data[predict_data['cliente_id'] == customer_code_str] | |
with st.spinner("Generating sales predictions..."): | |
if not customer_data.empty: | |
# Define features consistently with the training process | |
lag_features = [f'precio_total_lag_{lag}' for lag in range(1, 25)] | |
features = lag_features + ['mes', 'marca_id_encoded', 'año', 'cluster_id'] | |
# Prepare data for prediction | |
X_predict = customer_data[features] | |
# Convert categorical features to 'category' dtype | |
categorical_features = ['mes', 'marca_id_encoded', 'cluster_id'] | |
for feature in categorical_features: | |
X_predict[feature] = X_predict[feature].astype('category') | |
# Make Prediction for the selected customer | |
y_pred = gbm.predict(X_predict, num_iteration=gbm.best_iteration) | |
# Reassemble the results | |
results = customer_data[['cliente_id', 'marca_id_encoded', 'fecha_mes']].copy() | |
results['ventas_predichas'] = y_pred | |
# Load actual data from df_agg_2024 | |
actual_sales = df_agg_2024[df_agg_2024['cliente_id'] == customer_code_str] | |
if not actual_sales.empty: | |
# Merge predictions with actual sales | |
results = results.merge(actual_sales[['cliente_id', 'marca_id_encoded', 'fecha_mes', 'precio_total']], | |
on=['cliente_id', 'marca_id_encoded', 'fecha_mes'], | |
how='left') | |
results.rename(columns={'precio_total': 'ventas_reales'}, inplace=True) | |
else: | |
# If no actual sales data for 2024, fill 'ventas_reales' with 0 | |
results['ventas_reales'] = 0 | |
# Ensure any missing sales data is filled with 0 | |
results['ventas_reales'].fillna(0, inplace=True) | |
# Split space into two columns | |
col1, col2 = st.columns(2) | |
# Column 1: Radar chart for top manufacturers | |
with col1: | |
# Overperforming manufacturers alert | |
st.subheader("This looks great!") | |
st.info("Your customer did exceed predicted sales from the following brands:") | |
# Identify manufacturers that exceeded predicted sales | |
overperforming_manufacturers = results[results['ventas_reales'] > results['ventas_predichas']].copy() | |
if not overperforming_manufacturers.empty: | |
# Calculate the extra amount (difference between actual and predicted sales) | |
overperforming_manufacturers['extra_amount'] = overperforming_manufacturers['ventas_reales'] - overperforming_manufacturers['ventas_predichas'] | |
# Sort by the highest extra amount | |
overperforming_manufacturers = overperforming_manufacturers.sort_values(by='extra_amount', ascending=False) | |
# Limit to top 10 overperforming manufacturers | |
top_overperformers = overperforming_manufacturers.head(10) | |
for index, row in top_overperformers.iterrows(): | |
manufacturer_name = get_supplier_name_encoded(row['marca_id_encoded']) | |
predicted = row['ventas_predichas'] | |
actual = row['ventas_reales'] | |
extra = row['extra_amount'] | |
# Use Streamlit's st.metric for better styling | |
st.metric( | |
label=f"{manufacturer_name}", | |
value=f"{actual:.2f}€", | |
delta=f"Exceeded by {extra:.2f}€", | |
delta_color="normal" | |
) | |
else: | |
st.info("No manufacturers exceeded predicted sales.") | |
# Radar chart logic remains the same | |
customer_df = df[df["CLIENTE"] == str(customer_code)] | |
all_manufacturers = customer_df.iloc[:, 1:].T | |
all_manufacturers.index = all_manufacturers.index.astype(str) | |
customer_euros = euros_proveedor[euros_proveedor["CLIENTE"] == str(customer_code)] | |
sales_data = customer_euros.iloc[:, 1:].T | |
sales_data.index = sales_data.index.astype(str) | |
sales_data_filtered = sales_data.drop(index='CLIENTE', errors='ignore') | |
sales_data_filtered = sales_data_filtered.apply(pd.to_numeric, errors='coerce') | |
all_manufacturers = all_manufacturers.apply(pd.to_numeric, errors='coerce') | |
top_units = all_manufacturers.sort_values(by=all_manufacturers.columns[0], ascending=False).head(10) | |
top_sales = sales_data_filtered.sort_values(by=sales_data_filtered.columns[0], ascending=False).head(10) | |
combined_top = pd.concat([top_units, top_sales]).index.unique()[:20] | |
combined_top = [m for m in combined_top if m in all_manufacturers.index and m in sales_data_filtered.index] | |
if combined_top: | |
combined_data = pd.DataFrame({ | |
'units': all_manufacturers.loc[combined_top, all_manufacturers.columns[0]], | |
'sales': sales_data_filtered.loc[combined_top, sales_data_filtered.columns[0]] | |
}).fillna(0) | |
combined_data_sorted = combined_data.sort_values(by=['units', 'sales'], ascending=False) | |
non_zero_manufacturers = combined_data_sorted[combined_data_sorted['units'] > 0] | |
if len(non_zero_manufacturers) < 3: | |
zero_manufacturers = combined_data_sorted[combined_data_sorted['units'] == 0].head(3 - len(non_zero_manufacturers)) | |
manufacturers_to_show = pd.concat([non_zero_manufacturers, zero_manufacturers]) | |
else: | |
manufacturers_to_show = non_zero_manufacturers | |
values = manufacturers_to_show['units'].tolist() | |
amounts = manufacturers_to_show['sales'].tolist() | |
manufacturers = [get_supplier_name(m) for m in manufacturers_to_show.index] | |
if manufacturers: | |
fig = radar_chart(manufacturers, values, amounts, f'Radar Chart for Top {len(manufacturers)} Manufacturers of Customer {customer_code}') | |
st.pyplot(fig) | |
# Column 2: Alerts and additional analysis | |
with col2: | |
st.subheader("You might need to check this!") | |
st.warning("Your customer was expected to buy more products from the following brands:") | |
# Identify manufacturers that didn't meet predicted sales | |
underperforming_manufacturers = results[results['ventas_reales'] < results['ventas_predichas']].copy() | |
if not underperforming_manufacturers.empty: | |
# Calculate the missed amount | |
underperforming_manufacturers['missed_amount'] = underperforming_manufacturers['ventas_predichas'] - underperforming_manufacturers['ventas_reales'] | |
# Sort by the highest missed amount | |
underperforming_manufacturers = underperforming_manufacturers.sort_values(by='missed_amount', ascending=False) | |
# Limit to top 10 missed amounts | |
top_misses = underperforming_manufacturers.head(10) | |
for index, row in top_misses.iterrows(): | |
manufacturer_name = get_supplier_name_encoded(row['marca_id_encoded']) | |
predicted = row['ventas_predichas'] | |
actual = row['ventas_reales'] | |
missed = row['missed_amount'] | |
# Use Streamlit's st.metric for better styling | |
st.metric( | |
label=f"{manufacturer_name}", | |
value=f"{actual:.2f}€", | |
delta=f"Missed by {missed:.2f}€", | |
delta_color="inverse" | |
) | |
else: | |
st.success("All manufacturers have met or exceeded predicted sales.") | |
# Gráfico adicional: Comparar las ventas predichas y reales para los principales fabricantes | |
st.markdown("### Predicted vs Actual Sales for Top Manufacturers") | |
top_manufacturers = results.groupby('marca_id_encoded').agg({'ventas_reales': 'sum', 'ventas_predichas': 'sum'}).sort_values(by='ventas_reales', ascending=False).head(10) | |
fig_comparison = go.Figure() | |
fig_comparison.add_trace(go.Bar(x=top_manufacturers.index, y=top_manufacturers['ventas_reales'], name="Actual Sales", marker_color='blue')) | |
fig_comparison.add_trace(go.Bar(x=top_manufacturers.index, y=top_manufacturers['ventas_predichas'], name="Predicted Sales", marker_color='orange')) | |
fig_comparison.update_layout( | |
title="Actual vs Predicted Sales by Top Manufacturers", | |
xaxis_title="Manufacturer", | |
yaxis_title="Sales (€)", | |
barmode='group', | |
height=400, | |
hovermode="x unified" | |
) | |
st.plotly_chart(fig_comparison, use_container_width=True) | |
# Gráfico de ventas anuales | |
ventas_clientes['codigo_cliente'] = ventas_clientes['codigo_cliente'].astype(str).str.strip() | |
sales_columns = ['VENTA_2021', 'VENTA_2022', 'VENTA_2023'] | |
if all(col in ventas_clientes.columns for col in sales_columns): | |
customer_sales_data = ventas_clientes[ventas_clientes['codigo_cliente'] == customer_code] | |
if not customer_sales_data.empty: | |
customer_sales = customer_sales_data[sales_columns].values[0] | |
years = ['2021', '2022', '2023'] | |
# Add the 2024 actual and predicted data | |
if 'ventas_predichas' in results.columns and 'ventas_reales' in results.columns: | |
actual_sales_2024 = results[results['fecha_mes'].str.startswith('2024')]['ventas_reales'].sum() | |
predicted_sales_2024 = results[results['fecha_mes'].str.startswith('2024')]['ventas_predichas'].sum() | |
months_available = 9 | |
actual_sales_2024_annual = (actual_sales_2024 / months_available) * 12 | |
sales_values = list(customer_sales) + [actual_sales_2024_annual] | |
predicted_values = list(customer_sales) + [predicted_sales_2024] | |
years.append('2024') | |
fig_sales_bar = go.Figure() | |
fig_sales_bar.add_trace(go.Bar( | |
x=years[:3], | |
y=sales_values[:3], | |
name="Historical Sales", | |
marker_color='blue' | |
)) | |
fig_sales_bar.add_trace(go.Bar( | |
x=[years[3]], | |
y=[sales_values[3]], | |
name="2024 Actual Sales (Annualized)", | |
marker_color='green' | |
)) | |
fig_sales_bar.add_trace(go.Bar( | |
x=[years[3]], | |
y=[predicted_values[3]], | |
name="2024 Predicted Sales", | |
marker_color='orange' | |
)) | |
fig_sales_bar.update_layout( | |
title=f"Sales Over the Years for Customer {customer_code}", | |
xaxis_title="Year", | |
yaxis_title="Sales (€)", | |
barmode='group', | |
height=600, | |
legend_title_text="Sales Type", | |
hovermode="x unified" | |
) | |
st.plotly_chart(fig_sales_bar, use_container_width=True) | |
else: | |
st.warning(f"No predicted or actual data found for customer {customer_code} for 2024.") | |
# elif page == "Customer Analysis": | |
# st.markdown(""" | |
# <h2 style='text-align: center; font-size: 2.5rem;'>Customer Analysis</h2> | |
# <p style='text-align: center; font-size: 1.2rem; color: gray;'> | |
# Enter the customer code to explore detailed customer insights, | |
# including past sales, predictions for the current year, and manufacturer-specific information. | |
# </p> | |
# """, unsafe_allow_html=True) | |
# # Combine text input and dropdown into a single searchable selectbox | |
# customer_code = st.selectbox( | |
# "Search and Select Customer Code", | |
# df['CLIENTE'].unique(), # All customer codes | |
# format_func=lambda x: str(x), # Ensures the values are displayed as strings | |
# help="Start typing to search for a specific customer code" | |
# ) | |
# if st.button("Calcular"): | |
# if customer_code: | |
# with st.spinner("We are identifying the customer's cluster..."): | |
# # Find Customer's Cluster | |
# customer_match = customer_clusters[customer_clusters['cliente_id'] == customer_code] | |
# time.sleep(1) | |
# if not customer_match.empty: | |
# cluster = customer_match['cluster_id'].values[0] | |
# with st.spinner(f"Selecting predictive model..."): | |
# # Load the Corresponding Model | |
# model_path = f'models/modelo_cluster_{cluster}.txt' | |
# gbm = lgb.Booster(model_file=model_path) | |
# with st.spinner("Getting the data ready..."): | |
# # Load predict data for that cluster | |
# predict_data = pd.read_csv(f'predicts/predict_cluster_{cluster}.csv') | |
# # Convert cliente_id to string | |
# predict_data['cliente_id'] = predict_data['cliente_id'].astype(str) | |
# with st.spinner("Filtering data..."): | |
# # Filter for the specific customer | |
# customer_code_str = str(customer_code) | |
# customer_data = predict_data[predict_data['cliente_id'] == customer_code_str] | |
# with st.spinner("Generating sales predictions..."): | |
# if not customer_data.empty: | |
# # Define features consistently with the training process | |
# lag_features = [f'precio_total_lag_{lag}' for lag in range(1, 25)] | |
# features = lag_features + ['mes', 'marca_id_encoded', 'año', 'cluster_id'] | |
# # Prepare data for prediction | |
# X_predict = customer_data[features] | |
# # Convert categorical features to 'category' dtype | |
# categorical_features = ['mes', 'marca_id_encoded', 'cluster_id'] | |
# for feature in categorical_features: | |
# X_predict[feature] = X_predict[feature].astype('category') | |
# # Make Prediction for the selected customer | |
# y_pred = gbm.predict(X_predict, num_iteration=gbm.best_iteration) | |
# # Reassemble the results | |
# results = customer_data[['cliente_id', 'marca_id_encoded', 'fecha_mes']].copy() | |
# results['ventas_predichas'] = y_pred | |
# # Load actual data | |
# actual_sales = df_agg_2024[df_agg_2024['cliente_id'] == customer_code_str] | |
# if not actual_sales.empty: | |
# results = results.merge(actual_sales[['cliente_id', 'marca_id_encoded', 'fecha_mes', 'precio_total']], | |
# on=['cliente_id', 'marca_id_encoded', 'fecha_mes'], | |
# how='left') | |
# results.rename(columns={'precio_total': 'ventas_reales'}, inplace=True) | |
# results['ventas_reales'].fillna(0, inplace=True) | |
# # st.write("### Final Results DataFrame:") | |
# # st.write(results.head()) | |
# # st.write(f"Shape: {results.shape}") | |
# # Calculate metrics only for non-null actual sales | |
# valid_results = results.dropna(subset=['ventas_reales']) | |
# non_zero_actuals = valid_results[valid_results['ventas_reales'] != 0] | |
# if not valid_results.empty: | |
# mae = mean_absolute_error(valid_results['ventas_reales'], valid_results['ventas_predichas']) | |
# mape = np.mean(np.abs((non_zero_actuals['ventas_reales'] - non_zero_actuals['ventas_predichas']) / non_zero_actuals['ventas_reales'])) * 100 | |
# rmse = np.sqrt(mean_squared_error(valid_results['ventas_reales'], valid_results['ventas_predichas'])) | |
# # st.write(f"Actual total sales for Customer {customer_code}: {valid_results['ventas_reales'].sum():.2f}") | |
# # st.write(f"MAE: {mae:.2f}€") | |
# # st.write(f"MAPE: {mape:.2f}%") | |
# # st.write(f"RMSE: {rmse:.2f}") | |
# # # Analysis of results | |
# # threshold_good = 100 # You may want to adjust this threshold | |
# # if mae < threshold_good: | |
# # st.success(f"Customer {customer_code} is performing well based on the predictions.") | |
# # else: | |
# # st.warning(f"Customer {customer_code} is not performing well based on the predictions.") | |
# # else: | |
# # st.warning(f"No actual sales data found for customer {customer_code} in df_agg_2024.") | |
# # st.write("### Debug Information for Radar Chart:") | |
# # st.write(f"Shape of customer_data: {customer_data.shape}") | |
# # st.write(f"Shape of euros_proveedor: {euros_proveedor.shape}") | |
# # Get percentage of units sold for each manufacturer | |
# customer_df = df[df["CLIENTE"] == str(customer_code)] # Get the customer data | |
# all_manufacturers = customer_df.iloc[:, 1:].T # Exclude CLIENTE column (manufacturers are in columns) | |
# all_manufacturers.index = all_manufacturers.index.astype(str) | |
# # Get total sales for each manufacturer from euros_proveedor | |
# customer_euros = euros_proveedor[euros_proveedor["CLIENTE"] == str(customer_code)] | |
# sales_data = customer_euros.iloc[:, 1:].T # Exclude CLIENTE column | |
# sales_data.index = sales_data.index.astype(str) | |
# # Remove the 'CLIENTE' row from sales_data to avoid issues with mixed types | |
# sales_data_filtered = sales_data.drop(index='CLIENTE', errors='ignore') | |
# # Ensure all values are numeric | |
# sales_data_filtered = sales_data_filtered.apply(pd.to_numeric, errors='coerce') | |
# all_manufacturers = all_manufacturers.apply(pd.to_numeric, errors='coerce') | |
# # Sort manufacturers by percentage of units and get top 10 | |
# top_units = all_manufacturers.sort_values(by=all_manufacturers.columns[0], ascending=False).head(10) | |
# # Sort manufacturers by total sales and get top 10 | |
# top_sales = sales_data_filtered.sort_values(by=sales_data_filtered.columns[0], ascending=False).head(10) | |
# # Combine top manufacturers from both lists and get up to 20 unique manufacturers | |
# combined_top = pd.concat([top_units, top_sales]).index.unique()[:20] | |
# # Filter out manufacturers that are not present in both datasets | |
# combined_top = [m for m in combined_top if m in all_manufacturers.index and m in sales_data_filtered.index] | |
# # st.write(f"Number of combined top manufacturers: {len(combined_top)}") | |
# if combined_top: | |
# # Create a DataFrame with combined data for these top manufacturers | |
# combined_data = pd.DataFrame({ | |
# 'units': all_manufacturers.loc[combined_top, all_manufacturers.columns[0]], | |
# 'sales': sales_data_filtered.loc[combined_top, sales_data_filtered.columns[0]] | |
# }).fillna(0) | |
# # Sort by units, then by sales | |
# combined_data_sorted = combined_data.sort_values(by=['units', 'sales'], ascending=False) | |
# # Filter out manufacturers with 0 units | |
# non_zero_manufacturers = combined_data_sorted[combined_data_sorted['units'] > 0] | |
# # If we have less than 3 non-zero manufacturers, add some zero-value ones | |
# if len(non_zero_manufacturers) < 3: | |
# zero_manufacturers = combined_data_sorted[combined_data_sorted['units'] == 0].head(3 - len(non_zero_manufacturers)) | |
# manufacturers_to_show = pd.concat([non_zero_manufacturers, zero_manufacturers]) | |
# else: | |
# manufacturers_to_show = non_zero_manufacturers | |
# values = manufacturers_to_show['units'].tolist() | |
# amounts = manufacturers_to_show['sales'].tolist() | |
# manufacturers = [get_supplier_name(m) for m in manufacturers_to_show.index] | |
# # st.write(f"### Results for top {len(manufacturers)} manufacturers:") | |
# # for manufacturer, value, amount in zip(manufacturers, values, amounts): | |
# # (f"{manufacturer} = {value:.2f}% of units, €{amount:.2f} total sales") | |
# if manufacturers: # Only create the chart if we have data | |
# fig = radar_chart(manufacturers, values, amounts, f'Radar Chart for Top {len(manufacturers)} Manufacturers of Customer {customer_code}') | |
# st.pyplot(fig) | |
# else: | |
# st.warning("No data available to create the radar chart.") | |
# else: | |
# st.warning("No combined top manufacturers found.") | |
# # Ensure codigo_cliente in ventas_clientes is a string | |
# ventas_clientes['codigo_cliente'] = ventas_clientes['codigo_cliente'].astype(str).str.strip() | |
# # Ensure customer_code is a string and strip any spaces | |
# customer_code = str(customer_code).strip() | |
# # if customer_code in ventas_clientes['codigo_cliente'].unique(): | |
# # (f"Customer {customer_code} found in ventas_clientes") | |
# # else: | |
# # (f"Customer {customer_code} not found in ventas_clientes") | |
# # Customer sales 2021-2024 (if data exists) | |
# sales_columns = ['VENTA_2021', 'VENTA_2022', 'VENTA_2023'] | |
# if all(col in ventas_clientes.columns for col in sales_columns): | |
# customer_sales_data = ventas_clientes[ventas_clientes['codigo_cliente'] == customer_code] | |
# if not customer_sales_data.empty: | |
# customer_sales = customer_sales_data[sales_columns].values[0] | |
# years = ['2021', '2022', '2023'] | |
# # Add the 2024 actual and predicted data | |
# if 'ventas_predichas' in results.columns and 'ventas_reales' in results.columns: | |
# # Get the actual and predicted sales for 2024 | |
# actual_sales_2024 = results[results['fecha_mes'].str.startswith('2024')]['ventas_reales'].sum() | |
# predicted_sales_2024 = results[results['fecha_mes'].str.startswith('2024')]['ventas_predichas'].sum() | |
# # Estimate full-year predicted sales (assuming predictions available until September) | |
# months_available = 9 # Data available until September | |
# actual_sales_2024_annual = (actual_sales_2024 / months_available) * 12 | |
# # Add 2024 actual and predicted sales | |
# sales_values = list(customer_sales) + [actual_sales_2024_annual] # Actual sales | |
# predicted_values = list(customer_sales) + [predicted_sales_2024] # Predicted sales | |
# # Add 2024 to the years list | |
# years.append('2024') | |
# fig_sales_bar = go.Figure() | |
# # Add trace for historical sales (2021-2023) | |
# fig_sales_bar.add_trace(go.Bar( | |
# x=years[:3], # 2021, 2022, 2023 | |
# y=sales_values[:3], | |
# name="Historical Sales", | |
# marker_color='blue' | |
# )) | |
# # Add trace for 2024 actual sales | |
# fig_sales_bar.add_trace(go.Bar( | |
# x=[years[3]], # 2024 | |
# y=[sales_values[3]], | |
# name="2024 Actual Sales (Annualized)", | |
# marker_color='green' | |
# )) | |
# # Add trace for 2024 predicted sales | |
# fig_sales_bar.add_trace(go.Bar( | |
# x=[years[3]], # 2024 | |
# y=[predicted_values[3]], | |
# name="2024 Predicted Sales", | |
# marker_color='orange' | |
# )) | |
# # Update layout | |
# fig_sales_bar.update_layout( | |
# title=f"Sales Over the Years for Customer {customer_code}", | |
# xaxis_title="Year", | |
# yaxis_title="Sales (€)", | |
# barmode='group', | |
# height=600, | |
# legend_title_text="Sales Type", | |
# hovermode="x unified" | |
# ) | |
# # Show the interactive bar chart in Streamlit | |
# st.plotly_chart(fig_sales_bar, use_container_width=True) | |
# else: | |
# st.warning(f"No predicted or actual data found for customer {customer_code} for 2024.") | |
# else: | |
# st.warning(f"No historical sales data found for customer {customer_code}") | |
# else: | |
# st.warning("Sales data for 2021-2023 not available in the dataset.") | |
# Customer Recommendations Page | |
elif page == "Articles Recommendations": | |
st.title("Articles Recommendations") | |
st.markdown(""" | |
Get tailored recommendations for your customers based on their basket. | |
""") | |
st.write("Select items and assign quantities for the basket:") | |
# Mostrar lista de artículos disponibles | |
available_articles = productos['ARTICULO'].unique() | |
selected_articles = st.multiselect("Select Articles", available_articles) | |
# Crear inputs para ingresar las cantidades de cada artículo seleccionado | |
quantities = {} | |
for article in selected_articles: | |
quantities[article] = st.number_input(f"Quantity for {article}", min_value=0, step=1) | |
if st.button("Calcular"): # Añadimos el botón "Calcular" | |
# Crear una lista de artículos basada en la selección | |
new_basket = [f"{article} x{quantities[article]}" for article in selected_articles if quantities[article] > 0] | |
if new_basket: | |
# Procesar la lista para recomendar | |
recommendations_df = recomienda_tfid(new_basket) | |
if not recommendations_df.empty: | |
st.write("### Recommendations based on the current basket:") | |
st.dataframe(recommendations_df) | |
else: | |
st.warning("No recommendations found for the provided basket.") | |
else: | |
st.warning("Please select at least one article and set its quantity.") | |
# # Customer Recommendations Page | |
# elif page == "Articles Recommendations": | |
# st.title("Articles Recommendations") | |
# st.markdown(""" | |
# Get tailored recommendations for your customers based on their basket. | |
# """) | |
# # Campo input para cliente | |
# partial_code = st.text_input("Enter part of Customer Code for Recommendations (or leave empty to see all)") | |
# if partial_code: | |
# filtered_customers = df[df['CLIENTE'].str.contains(partial_code)] | |
# else: | |
# filtered_customers = df | |
# customer_list = filtered_customers['CLIENTE'].unique() | |
# customer_code = st.selectbox("Select Customer Code for Recommendations", [""] + list(customer_list)) | |
# # Definición de la función recomienda | |
# def recomienda(new_basket): | |
# # Calcular la matriz TF-IDF | |
# tfidf = TfidfVectorizer() | |
# tfidf_matrix = tfidf.fit_transform(cestas['Cestas']) | |
# # Convertir la nueva cesta en formato TF-IDF | |
# new_basket_str = ' '.join(new_basket) | |
# new_basket_tfidf = tfidf.transform([new_basket_str]) | |
# # Comparar la nueva cesta con las anteriores | |
# similarities = cosine_similarity(new_basket_tfidf, tfidf_matrix) | |
# # Obtener los índices de las cestas más similares | |
# similar_indices = similarities.argsort()[0][-3:] # Las 3 más similares | |
# # Crear un diccionario para contar las recomendaciones | |
# recommendations_count = {} | |
# total_similarity = 0 | |
# # Recomendar productos de cestas similares | |
# for idx in similar_indices: | |
# sim_score = similarities[0][idx] | |
# total_similarity += sim_score | |
# products = cestas.iloc[idx]['Cestas'].split() | |
# for product in products: | |
# if product.strip() not in new_basket: # Evitar recomendar lo que ya está en la cesta | |
# if product.strip() in recommendations_count: | |
# recommendations_count[product.strip()] += sim_score | |
# else: | |
# recommendations_count[product.strip()] = sim_score | |
# # Calcular la probabilidad relativa de cada producto recomendado | |
# recommendations_with_prob = [] | |
# if total_similarity > 0: # Verificar que total_similarity no sea cero | |
# recommendations_with_prob = [(product, score / total_similarity) for product, score in recommendations_count.items()] | |
# else: | |
# print("No se encontraron similitudes suficientes para calcular probabilidades.") | |
# recommendations_with_prob.sort(key=lambda x: x[1], reverse=True) # Ordenar por puntuación | |
# # Crear un nuevo DataFrame para almacenar las recomendaciones con descripciones y probabilidades | |
# recommendations_df = pd.DataFrame(columns=['ARTICULO', 'DESCRIPCION', 'PROBABILIDAD']) | |
# # Agregar las recomendaciones al DataFrame usando pd.concat | |
# for product, prob in recommendations_with_prob: | |
# # Buscar la descripción en el DataFrame de productos | |
# description = productos.loc[productos['ARTICULO'] == product, 'DESCRIPCION'] | |
# if not description.empty: | |
# # Crear un nuevo DataFrame temporal para la recomendación | |
# temp_df = pd.DataFrame({ | |
# 'ARTICULO': [product], | |
# 'DESCRIPCION': [description.values[0]], # Obtener el primer valor encontrado | |
# 'PROBABILIDAD': [prob] | |
# }) | |
# # Concatenar el DataFrame temporal al DataFrame de recomendaciones | |
# recommendations_df = pd.concat([recommendations_df, temp_df], ignore_index=True) | |
# return recommendations_df | |
# # Comprobar si el cliente está en el CSV de fieles | |
# is_fiel = customer_code in fieles_df['Cliente'].astype(str).values | |
# if customer_code: | |
# if is_fiel: | |
# st.write(f"### Customer {customer_code} is a loyal customer.") | |
# option = st.selectbox("Select Recommendation Type", ["Select an option", "By Purchase History", "By Current Basket"]) | |
# if option == "By Purchase History": | |
# st.warning("Option not available... aún") | |
# elif option == "By Current Basket": | |
# st.write("Select the items and assign quantities for the basket:") | |
# # Mostrar lista de artículos disponibles | |
# available_articles = productos['ARTICULO'].unique() | |
# selected_articles = st.multiselect("Select Articles", available_articles) | |
# # Crear inputs para ingresar las cantidades de cada artículo seleccionado | |
# quantities = {} | |
# for article in selected_articles: | |
# quantities[article] = st.number_input(f"Quantity for {article}", min_value=0, step=1) | |
# if st.button("Calcular"): # Añadimos el botón "Calcular" | |
# # Crear una lista de artículos basada en la selección | |
# new_basket = [f"{article} x{quantities[article]}" for article in selected_articles if quantities[article] > 0] | |
# if new_basket: | |
# # Procesar la lista para recomendar | |
# recommendations_df = recomienda(new_basket) | |
# if not recommendations_df.empty: | |
# st.write("### Recommendations based on the current basket:") | |
# st.dataframe(recommendations_df) | |
# else: | |
# st.warning("No recommendations found for the provided basket.") | |
# else: | |
# st.warning("Please select at least one article and set its quantity.") | |
# else: | |
# st.write(f"### Customer {customer_code} is not a loyal customer.") | |
# st.write("Select items and assign quantities for the basket:") | |
# # Mostrar lista de artículos disponibles | |
# available_articles = productos['ARTICULO'].unique() | |
# selected_articles = st.multiselect("Select Articles", available_articles) | |
# # Crear inputs para ingresar las cantidades de cada artículo seleccionado | |
# quantities = {} | |
# for article in selected_articles: | |
# quantities[article] = st.number_input(f"Quantity for {article}", min_value=0, step=1) | |
# if st.button("Calcular"): # Añadimos el botón "Calcular" | |
# # Crear una lista de artículos basada en la selección | |
# new_basket = [f"{article} x{quantities[article]}" for article in selected_articles if quantities[article] > 0] | |
# if new_basket: | |
# # Procesar la lista para recomendar | |
# recommendations_df = recomienda(new_basket) | |
# if not recommendations_df.empty: | |
# st.write("### Recommendations based on the current basket:") | |
# st.dataframe(recommendations_df) | |
# else: | |
# st.warning("No recommendations found for the provided basket.") | |
# else: | |
# st.warning("Please select at least one article and set its quantity.") | |