Final_Project / app.py
GMARTINEZMILLA's picture
feat: updated website
ca83516
raw
history blame
58.2 kB
import streamlit as st
import time
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import matplotlib.pyplot as plt
import numpy as np
import lightgbm as lgb
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import mean_absolute_error, mean_squared_error
from joblib import dump, load
from utils import recomienda_tfid
# Page configuration
st.set_page_config(page_title="DeepInsightz", page_icon=":bar_chart:", layout="wide")
# Custom CSS for dynamic theme styling
# Streamlit detects light and dark mode automatically via the user's settings in Hugging Face Spaces
if st.get_option("theme.base") == "dark":
background_color = "#282828"
text_color = "white"
metric_box_color = "#4f4f4f"
sidebar_color = "#282828"
plot_bgcolor = "rgba(0, 0, 0, 0)"
primary_color = '#00FF00' # for positive delta
negative_color = '#FF0000' # for negative delta
else:
background_color = "#f4f4f4"
text_color = "#black"
metric_box_color = "#dee2e8"
sidebar_color = "#dee2e8"
plot_bgcolor = "#f4f4f4"
primary_color = '#228B22' # for positive delta in light mode
negative_color = '#8B0000' # for negative delta in light mode
st.markdown(f"""
<style>
body {{
background-color: {background_color};
color: {text_color};
}}
[data-testid="stMetric"] {{
background-color: {metric_box_color};
border-radius: 10px;
text-align: center;
padding: 15px 0;
margin-bottom: 20px;
}}
[data-testid="stMetricLabel"] {{
display: flex;
justify-content: center;
align-items: center;
color: {text_color};
}}
[data-testid="stSidebar"] {{
background-color: {sidebar_color};
}}
</style>
""", unsafe_allow_html=True)
# Load CSV files at the top
df = pd.read_csv("df_clean.csv")
nombres_proveedores = pd.read_csv("nombres_proveedores.csv", sep=';')
euros_proveedor = pd.read_csv("euros_proveedor.csv", sep=',')
ventas_clientes = pd.read_csv("ventas_clientes.csv", sep=',')
customer_clusters = pd.read_csv('predicts/customer_clusters.csv') # Load the customer clusters here
df_agg_2024 = pd.read_csv('predicts/df_agg_2024.csv')
pca_data_5 = pd.read_csv('pca_data.csv')
historical_data = pd.read_csv('historical_data.csv')
with st.sidebar:
st.sidebar.title("DeepInsightz")
page = st.sidebar.selectbox("Select the tool you want to use", ["Summary", "Customer Analysis", "Articles Recommendations"])
# If the user selects "Customer Analysis," show filter options in the sidebar
if page == "Customer Analysis":
st.sidebar.title("Filter Options")
show_all = st.sidebar.checkbox('Show All Manufacturers', value=True)
# If not showing all, allow filtering by manufacturer code
if not show_all:
selected_manufacturer = st.sidebar.selectbox(
'Select Manufacturer Code',
historical_data['marca_id_encoded'].unique()
)
# Generamos la columna total_sales
ventas_clientes['total_sales'] = ventas_clientes[['VENTA_2021', 'VENTA_2022', 'VENTA_2023']].sum(axis=1)
# Ordenar los clientes de mayor a menor según sus ventas totales
ventas_top_100 = ventas_clientes.sort_values(by='total_sales', ascending=False).head(100)
# Ensure customer codes are strings
df['CLIENTE'] = df['CLIENTE'].astype(str)
nombres_proveedores['codigo'] = nombres_proveedores['codigo'].astype(str)
euros_proveedor['CLIENTE'] = euros_proveedor['CLIENTE'].astype(str)
customer_clusters['cliente_id'] = customer_clusters['cliente_id'].astype(str) # Ensure customer IDs are strings
fieles_df = pd.read_csv("clientes_relevantes.csv")
cestas = pd.read_csv("cestas.csv")
productos = pd.read_csv("productos.csv")
df_agg_2024['cliente_id'] = df_agg_2024['cliente_id'].astype(str)
marca_id_mapping = load('marca_id_mapping.joblib')
# Convert all columns except 'CLIENTE' to float in euros_proveedor
for col in euros_proveedor.columns:
if col != 'CLIENTE':
euros_proveedor[col] = pd.to_numeric(euros_proveedor[col], errors='coerce')
# Check for NaN values after conversion
if euros_proveedor.isna().any().any():
st.warning("Some values in euros_proveedor couldn't be converted to numbers. Please review the input data.")
# Ignore the last two columns of df
df = df.iloc[:, :-2]
# Function to get supplier name
def get_supplier_name(code):
code = str(code) # Ensure code is a string
name = nombres_proveedores[nombres_proveedores['codigo'] == code]['nombre'].values
return name[0] if len(name) > 0 else code
def get_supplier_name_encoded(encoded_code):
try:
# Ensure the encoded code is an integer
encoded_code = int(encoded_code)
print(f"Encoded Code: {encoded_code}")
# Use the label encoder to map the encoded code back to the original manufacturer code
if encoded_code < len(marca_id_mapping.classes_):
real_code = marca_id_mapping.inverse_transform([encoded_code])[0]
print(f"Real Manufacturer Code: {real_code}")
else:
print(f"Encoded code not found in the label encoder: {encoded_code}")
return f"Unknown code: {encoded_code}" # Handle case where encoded code is not found
# Now, use the real_code to find the manufacturer name in nombres_proveedores
name = nombres_proveedores[nombres_proveedores['codigo'] == str(real_code)]['nombre'].values
print(f"Manufacturer Name Found: {name}") # Check what name is returned
# Return the manufacturer name if found, otherwise return the real_code
return name[0] if len(name) > 0 else real_code
except Exception as e:
print(f"Error encountered: {e}")
return f"Error for code: {encoded_code}"
# Custom Donut Chart with Plotly for Inbound/Outbound Percentage
def create_donut_chart(values, labels, color_scheme, title):
fig = px.pie(
values=values,
names=labels,
hole=0.7,
color_discrete_sequence=color_scheme
)
fig.update_traces(textinfo='percent+label', hoverinfo='label+percent', textposition='inside', showlegend=False)
fig.update_layout(
annotations=[dict(text=f"{int(values[1])}%", x=0.5, y=0.5, font_size=40, showarrow=False)],
title=title,
height=300,
margin=dict(t=30, b=10, l=10, r=10),
paper_bgcolor=plot_bgcolor, # Use theme-dependent background color
plot_bgcolor=plot_bgcolor
)
return fig
# Donut chart with color scheme based on theme
if st.get_option("theme.base") == "dark":
donut_color_scheme = ['#155F7A', '#29b5e8'] # Dark mode colors
else:
donut_color_scheme = ['#007BFF', '#66b5ff'] # Light mode colors
# Function to create radar chart with square root transformation
def radar_chart(categories, values, amounts, title):
N = len(categories)
angles = [n / float(N) * 2 * np.pi for n in range(N)]
angles += angles[:1]
fig, ax = plt.subplots(figsize=(12, 12), subplot_kw=dict(projection='polar'))
# Apply square root transformation
sqrt_values = np.sqrt(values)
sqrt_amounts = np.sqrt(amounts)
max_sqrt_value = max(sqrt_values)
normalized_values = [v / max_sqrt_value for v in sqrt_values]
# Adjust scaling for spend values
max_sqrt_amount = max(sqrt_amounts)
scaling_factor = 0.7 # Adjust this value to control how much the spend values are scaled up
normalized_amounts = [min((a / max_sqrt_amount) * scaling_factor, 1.0) for a in sqrt_amounts]
normalized_values += normalized_values[:1]
ax.plot(angles, normalized_values, 'o-', linewidth=2, color='#FF69B4', label='% Units (sqrt)')
ax.fill(angles, normalized_values, alpha=0.25, color='#FF69B4')
normalized_amounts += normalized_amounts[:1]
ax.plot(angles, normalized_amounts, 'o-', linewidth=2, color='#4B0082', label='% Spend (sqrt)')
ax.fill(angles, normalized_amounts, alpha=0.25, color='#4B0082')
ax.set_xticks(angles[:-1])
ax.set_xticklabels(categories, size=8, wrap=True)
ax.set_ylim(0, 1)
circles = np.linspace(0, 1, 5)
for circle in circles:
ax.plot(angles, [circle]*len(angles), '--', color='gray', alpha=0.3, linewidth=0.5)
ax.set_yticklabels([])
ax.spines['polar'].set_visible(False)
plt.title(title, size=16, y=1.1)
plt.legend(loc='upper right', bbox_to_anchor=(1.3, 1.1))
return fig
if page == "Summary":
# st.title("Welcome to DeepInsightz")
# st.markdown("""
# ### Data-driven Customer Clustering
# We analyzed thousands of customers and suppliers to help businesses make smarter sales decisions.
# """)
# Create layout with three columns
col1, col2, col3 = st.columns((1.5, 4, 2.5), gap='medium')
# Left Column (Red): Metrics and Donut Charts
with col1:
st.markdown('#### General Information')
st.metric(label="Range of Dates", value="2021-2023")
st.metric(label="Customers Analysed", value="3.000")
st.metric(label="Unique Products Sold", value="10.702")
st.metric(label="Total Sales Instances", value="764.396")
# Middle Column (White): 3D Cluster Model and Bar Chart
with col2:
st.markdown('#### 3D Customer Clusters')
# Create 3D PCA plot using actual data from pca_data_5
fig_cluster = px.scatter_3d(
pca_data_5,
x='PC1',
y='PC2',
z='PC3',
color='cluster_id',
hover_name='CustomerID',
)
fig_cluster.update_layout(
scene=dict(aspectratio=dict(x=1, y=1, z=0.8)), # Adjusted aspect ratio for better balance
margin=dict(t=10, b=10, l=10, r=10), # Tighten margins further
height=600, # Slightly increased height for better visibility
)
st.plotly_chart(fig_cluster, use_container_width=True)
# Right Column (Blue): Key Metrics Overview and Data Preparation Summary
with col3:
# Mostrar la tabla con los 100 mejores clientes
st.markdown('#### Top 100 Clients by Total Sales')
# Configurar columnas para mostrar los clientes y las ventas totales
st.dataframe(ventas_top_100[['codigo_cliente', 'total_sales']],
column_order=("codigo_cliente", "total_sales"),
hide_index=True,
width=450, # Ajustar el ancho de la tabla
height=600, # Ajustar la altura de la tabla
column_config={
"codigo_cliente": st.column_config.TextColumn(
"Client Code",
),
"total_sales": st.column_config.ProgressColumn(
"Total Sales (€)",
format="%d",
min_value=0,
max_value=ventas_top_100['total_sales'].max()
)}
)
# Customer Analysis Page
elif page == "Customer Analysis":
st.markdown("""
<h2 style='text-align: center; font-size: 2.5rem;'>Customer Analysis</h2>
<p style='text-align: center; font-size: 1.2rem; color: gray;'>
Enter the customer code to explore detailed customer insights,
including past sales, predictions for the current year, and manufacturer-specific information.
</p>
""", unsafe_allow_html=True)
# Combine text input and dropdown into a single searchable selectbox
customer_code = st.selectbox(
"Search and Select Customer Code",
df['CLIENTE'].unique(), # All customer codes
format_func=lambda x: str(x), # Ensures the values are displayed as strings
help="Start typing to search for a specific customer code"
)
if st.button("Calcular"):
if customer_code:
with st.spinner("We are identifying the customer's cluster..."):
# Find Customer's Cluster
customer_match = customer_clusters[customer_clusters['cliente_id'] == customer_code]
time.sleep(1)
if not customer_match.empty:
cluster = customer_match['cluster_id'].values[0]
with st.spinner(f"Selecting predictive model..."):
# Load the Corresponding Model
model_path = f'models/modelo_cluster_{cluster}.txt'
gbm = lgb.Booster(model_file=model_path)
with st.spinner("Getting the data ready..."):
# Load predict data for that cluster
predict_data = pd.read_csv(f'predicts/predict_cluster_{cluster}.csv')
# Convert cliente_id to string
predict_data['cliente_id'] = predict_data['cliente_id'].astype(str)
with st.spinner("Filtering data..."):
# Filter for the specific customer
customer_code_str = str(customer_code)
customer_data = predict_data[predict_data['cliente_id'] == customer_code_str]
with st.spinner("Generating sales predictions..."):
if not customer_data.empty:
# Define features consistently with the training process
lag_features = [f'precio_total_lag_{lag}' for lag in range(1, 25)]
features = lag_features + ['mes', 'marca_id_encoded', 'año', 'cluster_id']
# Prepare data for prediction
X_predict = customer_data[features]
# Convert categorical features to 'category' dtype
categorical_features = ['mes', 'marca_id_encoded', 'cluster_id']
for feature in categorical_features:
X_predict[feature] = X_predict[feature].astype('category')
# Make Prediction for the selected customer
y_pred = gbm.predict(X_predict, num_iteration=gbm.best_iteration)
# Reassemble the results
results = customer_data[['cliente_id', 'marca_id_encoded', 'fecha_mes']].copy()
results['ventas_predichas'] = y_pred
# Load actual data from df_agg_2024
actual_sales = df_agg_2024[df_agg_2024['cliente_id'] == customer_code_str]
if not actual_sales.empty:
# Merge predictions with actual sales
results = results.merge(actual_sales[['cliente_id', 'marca_id_encoded', 'fecha_mes', 'precio_total']],
on=['cliente_id', 'marca_id_encoded', 'fecha_mes'],
how='left')
results.rename(columns={'precio_total': 'ventas_reales'}, inplace=True)
else:
# If no actual sales data for 2024, fill 'ventas_reales' with 0
results['ventas_reales'] = 0
# Ensure any missing sales data is filled with 0
results['ventas_reales'].fillna(0, inplace=True)
# Define the cutoff date for the last 12 months
fecha_inicio = pd.to_datetime("2023-01-01")
fecha_corte = pd.to_datetime("2024-09-01")
# Convertir fecha_mes a datetime en ambos DataFrames
historical_data['fecha_mes'] = pd.to_datetime(historical_data['fecha_mes'], errors='coerce')
results['fecha_mes'] = pd.to_datetime(results['fecha_mes'], errors='coerce')
# Filtrar los datos históricos por cliente y por el rango de fechas (2023)
fecha_inicio_2023 = pd.to_datetime("2023-01-01")
fecha_fin_2023 = pd.to_datetime("2023-12-31")
datos_historicos = historical_data[
(historical_data['cliente_id'] == customer_code_str) &
(historical_data['fecha_mes'] >= fecha_inicio_2023) &
(historical_data['fecha_mes'] <= fecha_fin_2023)
].groupby('fecha_mes')['precio_total'].sum().reset_index()
# Asegurarnos de que fecha_mes en datos_historicos es datetime
datos_historicos['fecha_mes'] = pd.to_datetime(datos_historicos['fecha_mes'], errors='coerce')
# Generar un rango de fechas desde enero 2023 hasta diciembre 2023 si no hay datos
if datos_historicos.empty:
fechas_2023 = pd.date_range(start='2023-01-01', end='2023-12-31', freq='M')
datos_historicos = pd.DataFrame({'fecha_mes': fechas_2023, 'ventas_historicas': [0] * len(fechas_2023)})
# Renombrar la columna 'precio_total' a 'ventas_historicas' si hay datos
else:
datos_historicos.rename(columns={'precio_total': 'ventas_historicas'}, inplace=True)
# Filtrar los datos de predicciones y ventas reales para 2024
datos_cliente_total = results.groupby('fecha_mes').agg({
'ventas_reales': 'sum',
'ventas_predichas': 'sum'
}).reset_index()
# Asegurarnos de que fecha_mes en datos_cliente_total es datetime
datos_cliente_total['fecha_mes'] = pd.to_datetime(datos_cliente_total['fecha_mes'], errors='coerce')
# Generar un rango de fechas para 2024 si no hay predicciones
fechas_2024 = pd.date_range(start='2024-01-01', end='2024-12-31', freq='M')
fechas_df_2024 = pd.DataFrame({'fecha_mes': fechas_2024})
# Asegurarnos de que fecha_mes en fechas_df_2024 es datetime
fechas_df_2024['fecha_mes'] = pd.to_datetime(fechas_df_2024['fecha_mes'], errors='coerce')
# Combinar datos históricos con predicciones y ventas reales usando un merge
datos_combinados = pd.merge(fechas_df_2024, datos_cliente_total, on='fecha_mes', how='outer').sort_values('fecha_mes')
# Rellenar los NaN: 0 en ventas_historicas donde faltan predicciones, y viceversa
datos_combinados['ventas_historicas'].fillna(0, inplace=True)
datos_combinados['ventas_predichas'].fillna(0, inplace=True)
datos_combinados['ventas_reales'].fillna(0, inplace=True)
# Mostrar los datos combinados
st.subheader("Datos combinados con valores faltantes llenados")
st.write(datos_combinados)
# **Generar la gráfica**
st.markdown("### Sales History, Predictions, and Real Sales")
# Crear la gráfica con Plotly
fig = go.Figure()
# Graficar ventas históricas
fig.add_trace(go.Scatter(
x=datos_combinados['fecha_mes'],
y=datos_combinados['ventas_historicas'],
mode='lines+markers',
name='Ventas Históricas',
line=dict(color='blue')
))
# Graficar ventas predichas
fig.add_trace(go.Scatter(
x=datos_combinados['fecha_mes'],
y=datos_combinados['ventas_predichas'],
mode='lines+markers',
name='Ventas Predichas',
line=dict(color='orange')
))
# Graficar ventas reales
fig.add_trace(go.Scatter(
x=datos_combinados['fecha_mes'],
y=datos_combinados['ventas_reales'],
mode='lines+markers',
name='Ventas Reales',
line=dict(color='green')
))
# Personalizar el layout para enfocarse en 2023 y 2024
fig.update_layout(
title=f"Ventas Históricas, Predichas y Reales para Cliente {customer_code}",
xaxis_title="Fecha",
yaxis_title="Ventas (€)",
height=600,
xaxis_range=[fecha_inicio_2023, pd.to_datetime("2024-09-30")], # Ajustar el rango del eje x a 2023-2024
legend_title="Tipo de Ventas",
hovermode="x unified"
)
# Mostrar la gráfica en Streamlit
st.plotly_chart(fig)
# Split space into two columns
col1, col2 = st.columns(2)
# Column 1: Radar chart for top manufacturers
with col1:
st.subheader("This looks great!")
st.info("Your customer did exceed predicted sales from the following brands:")
# Identify manufacturers that exceeded predicted sales
overperforming_manufacturers = results[results['ventas_reales'] > results['ventas_predichas']].copy()
if not overperforming_manufacturers.empty:
# Calculate the extra amount (difference between actual and predicted sales)
overperforming_manufacturers['extra_amount'] = overperforming_manufacturers['ventas_reales'] - overperforming_manufacturers['ventas_predichas']
# Sort by the highest extra amount
overperforming_manufacturers = overperforming_manufacturers.sort_values(by='extra_amount', ascending=False)
# Limit to top 10 overperforming manufacturers
top_overperformers = overperforming_manufacturers.head(10)
# Display two cards per row
for i in range(0, len(top_overperformers), 2):
cols = st.columns(2) # Create two columns for two cards in a row
for j, col in enumerate(cols):
if i + j < len(top_overperformers):
row = top_overperformers.iloc[i + j]
manufacturer_name = get_supplier_name_encoded(row['marca_id_encoded'])
predicted = row['ventas_predichas']
actual = row['ventas_reales']
extra = row['extra_amount']
# Use st.metric for compact display in each column
with col:
st.metric(
label=f"{manufacturer_name}",
value=f"{actual:.2f}€",
delta=f"Exceeded by {extra:.2f}€",
delta_color="normal"
)
# Radar chart logic remains the same
customer_df = df[df["CLIENTE"] == str(customer_code)]
all_manufacturers = customer_df.iloc[:, 1:].T
all_manufacturers.index = all_manufacturers.index.astype(str)
customer_euros = euros_proveedor[euros_proveedor["CLIENTE"] == str(customer_code)]
sales_data = customer_euros.iloc[:, 1:].T
sales_data.index = sales_data.index.astype(str)
sales_data_filtered = sales_data.drop(index='CLIENTE', errors='ignore')
sales_data_filtered = sales_data_filtered.apply(pd.to_numeric, errors='coerce')
all_manufacturers = all_manufacturers.apply(pd.to_numeric, errors='coerce')
top_units = all_manufacturers.sort_values(by=all_manufacturers.columns[0], ascending=False).head(10)
top_sales = sales_data_filtered.sort_values(by=sales_data_filtered.columns[0], ascending=False).head(10)
combined_top = pd.concat([top_units, top_sales]).index.unique()[:20]
combined_top = [m for m in combined_top if m in all_manufacturers.index and m in sales_data_filtered.index]
if combined_top:
combined_data = pd.DataFrame({
'units': all_manufacturers.loc[combined_top, all_manufacturers.columns[0]],
'sales': sales_data_filtered.loc[combined_top, sales_data_filtered.columns[0]]
}).fillna(0)
combined_data_sorted = combined_data.sort_values(by=['units', 'sales'], ascending=False)
non_zero_manufacturers = combined_data_sorted[combined_data_sorted['units'] > 0]
if len(non_zero_manufacturers) < 3:
zero_manufacturers = combined_data_sorted[combined_data_sorted['units'] == 0].head(3 - len(non_zero_manufacturers))
manufacturers_to_show = pd.concat([non_zero_manufacturers, zero_manufacturers])
else:
manufacturers_to_show = non_zero_manufacturers
values = manufacturers_to_show['units'].tolist()
amounts = manufacturers_to_show['sales'].tolist()
manufacturers = [get_supplier_name(m) for m in manufacturers_to_show.index]
if manufacturers:
fig = radar_chart(manufacturers, values, amounts, f'Radar Chart for Top {len(manufacturers)} Manufacturers of Customer {customer_code}')
st.pyplot(fig)
# Column 2: Alerts and additional analysis
with col2:
st.subheader("You might need to check this!")
st.warning("Your customer was expected to buy more products from the following brands:")
# Identify manufacturers that didn't meet predicted sales
underperforming_manufacturers = results[results['ventas_reales'] < results['ventas_predichas']].copy()
if not underperforming_manufacturers.empty:
# Calculate the missed amount
underperforming_manufacturers['missed_amount'] = underperforming_manufacturers['ventas_predichas'] - underperforming_manufacturers['ventas_reales']
# Sort by the highest missed amount
underperforming_manufacturers = underperforming_manufacturers.sort_values(by='missed_amount', ascending=False)
# Limit to top 10 missed amounts
top_misses = underperforming_manufacturers.head(10)
# Display two cards per row
for i in range(0, len(top_misses), 2):
cols = st.columns(2) # Create two columns for two cards in a row
for j, col in enumerate(cols):
if i + j < len(top_misses):
row = top_misses.iloc[i + j]
manufacturer_name = get_supplier_name_encoded(row['marca_id_encoded'])
predicted = row['ventas_predichas']
actual = row['ventas_reales']
missed = row['missed_amount']
# Use st.metric for compact display in each column
with col:
st.metric(
label=f"{manufacturer_name}",
value=f"{actual:.2f}€",
delta=f"Missed by {missed:.2f}€",
delta_color="inverse"
)
else:
st.success("All manufacturers have met or exceeded predicted sales.")
# Gráfico adicional: Comparar las ventas predichas y reales para los principales fabricantes
st.markdown("### Predicted vs Actual Sales for Top Manufacturers")
top_manufacturers = results.groupby('marca_id_encoded').agg({'ventas_reales': 'sum', 'ventas_predichas': 'sum'}).sort_values(by='ventas_reales', ascending=False).head(10)
fig_comparison = go.Figure()
fig_comparison.add_trace(go.Bar(x=top_manufacturers.index, y=top_manufacturers['ventas_reales'], name="Actual Sales", marker_color='blue'))
fig_comparison.add_trace(go.Bar(x=top_manufacturers.index, y=top_manufacturers['ventas_predichas'], name="Predicted Sales", marker_color='orange'))
fig_comparison.update_layout(
title="Actual vs Predicted Sales by Top Manufacturers",
xaxis_title="Manufacturer",
yaxis_title="Sales (€)",
barmode='group',
height=400,
hovermode="x unified"
)
st.plotly_chart(fig_comparison, use_container_width=True)
# Gráfico de ventas anuales
ventas_clientes['codigo_cliente'] = ventas_clientes['codigo_cliente'].astype(str).str.strip()
sales_columns = ['VENTA_2021', 'VENTA_2022', 'VENTA_2023']
if all(col in ventas_clientes.columns for col in sales_columns):
customer_sales_data = ventas_clientes[ventas_clientes['codigo_cliente'] == customer_code]
if not customer_sales_data.empty:
customer_sales = customer_sales_data[sales_columns].values[0]
years = ['2021', '2022', '2023']
# Add the 2024 actual and predicted data
if 'ventas_predichas' in results.columns and 'ventas_reales' in results.columns:
actual_sales_2024 = results[results['fecha_mes'].dt.year == 2024]['ventas_reales'].sum()
predicted_sales_2024 = results[results['fecha_mes'].dt.year == 2024]['ventas_predichas'].sum()
months_available = 9
actual_sales_2024_annual = (actual_sales_2024 / months_available) * 12
sales_values = list(customer_sales) + [actual_sales_2024_annual]
predicted_values = list(customer_sales) + [predicted_sales_2024]
years.append('2024')
fig_sales_bar = go.Figure()
fig_sales_bar.add_trace(go.Bar(
x=years[:3],
y=sales_values[:3],
name="Historical Sales",
marker_color='blue'
))
fig_sales_bar.add_trace(go.Bar(
x=[years[3]],
y=[sales_values[3]],
name="2024 Actual Sales (Annualized)",
marker_color='green'
))
fig_sales_bar.add_trace(go.Bar(
x=[years[3]],
y=[predicted_values[3]],
name="2024 Predicted Sales",
marker_color='orange'
))
fig_sales_bar.update_layout(
title=f"Sales Over the Years for Customer {customer_code}",
xaxis_title="Year",
yaxis_title="Sales (€)",
barmode='group',
height=600,
legend_title_text="Sales Type",
hovermode="x unified"
)
st.plotly_chart(fig_sales_bar, use_container_width=True)
else:
st.warning(f"No predicted or actual data found for customer {customer_code} for 2024.")
# elif page == "Customer Analysis":
# st.markdown("""
# <h2 style='text-align: center; font-size: 2.5rem;'>Customer Analysis</h2>
# <p style='text-align: center; font-size: 1.2rem; color: gray;'>
# Enter the customer code to explore detailed customer insights,
# including past sales, predictions for the current year, and manufacturer-specific information.
# </p>
# """, unsafe_allow_html=True)
# # Combine text input and dropdown into a single searchable selectbox
# customer_code = st.selectbox(
# "Search and Select Customer Code",
# df['CLIENTE'].unique(), # All customer codes
# format_func=lambda x: str(x), # Ensures the values are displayed as strings
# help="Start typing to search for a specific customer code"
# )
# if st.button("Calcular"):
# if customer_code:
# with st.spinner("We are identifying the customer's cluster..."):
# # Find Customer's Cluster
# customer_match = customer_clusters[customer_clusters['cliente_id'] == customer_code]
# time.sleep(1)
# if not customer_match.empty:
# cluster = customer_match['cluster_id'].values[0]
# with st.spinner(f"Selecting predictive model..."):
# # Load the Corresponding Model
# model_path = f'models/modelo_cluster_{cluster}.txt'
# gbm = lgb.Booster(model_file=model_path)
# with st.spinner("Getting the data ready..."):
# # Load predict data for that cluster
# predict_data = pd.read_csv(f'predicts/predict_cluster_{cluster}.csv')
# # Convert cliente_id to string
# predict_data['cliente_id'] = predict_data['cliente_id'].astype(str)
# with st.spinner("Filtering data..."):
# # Filter for the specific customer
# customer_code_str = str(customer_code)
# customer_data = predict_data[predict_data['cliente_id'] == customer_code_str]
# with st.spinner("Generating sales predictions..."):
# if not customer_data.empty:
# # Define features consistently with the training process
# lag_features = [f'precio_total_lag_{lag}' for lag in range(1, 25)]
# features = lag_features + ['mes', 'marca_id_encoded', 'año', 'cluster_id']
# # Prepare data for prediction
# X_predict = customer_data[features]
# # Convert categorical features to 'category' dtype
# categorical_features = ['mes', 'marca_id_encoded', 'cluster_id']
# for feature in categorical_features:
# X_predict[feature] = X_predict[feature].astype('category')
# # Make Prediction for the selected customer
# y_pred = gbm.predict(X_predict, num_iteration=gbm.best_iteration)
# # Reassemble the results
# results = customer_data[['cliente_id', 'marca_id_encoded', 'fecha_mes']].copy()
# results['ventas_predichas'] = y_pred
# # Load actual data
# actual_sales = df_agg_2024[df_agg_2024['cliente_id'] == customer_code_str]
# if not actual_sales.empty:
# results = results.merge(actual_sales[['cliente_id', 'marca_id_encoded', 'fecha_mes', 'precio_total']],
# on=['cliente_id', 'marca_id_encoded', 'fecha_mes'],
# how='left')
# results.rename(columns={'precio_total': 'ventas_reales'}, inplace=True)
# results['ventas_reales'].fillna(0, inplace=True)
# # st.write("### Final Results DataFrame:")
# # st.write(results.head())
# # st.write(f"Shape: {results.shape}")
# # Calculate metrics only for non-null actual sales
# valid_results = results.dropna(subset=['ventas_reales'])
# non_zero_actuals = valid_results[valid_results['ventas_reales'] != 0]
# if not valid_results.empty:
# mae = mean_absolute_error(valid_results['ventas_reales'], valid_results['ventas_predichas'])
# mape = np.mean(np.abs((non_zero_actuals['ventas_reales'] - non_zero_actuals['ventas_predichas']) / non_zero_actuals['ventas_reales'])) * 100
# rmse = np.sqrt(mean_squared_error(valid_results['ventas_reales'], valid_results['ventas_predichas']))
# # st.write(f"Actual total sales for Customer {customer_code}: {valid_results['ventas_reales'].sum():.2f}")
# # st.write(f"MAE: {mae:.2f}€")
# # st.write(f"MAPE: {mape:.2f}%")
# # st.write(f"RMSE: {rmse:.2f}")
# # # Analysis of results
# # threshold_good = 100 # You may want to adjust this threshold
# # if mae < threshold_good:
# # st.success(f"Customer {customer_code} is performing well based on the predictions.")
# # else:
# # st.warning(f"Customer {customer_code} is not performing well based on the predictions.")
# # else:
# # st.warning(f"No actual sales data found for customer {customer_code} in df_agg_2024.")
# # st.write("### Debug Information for Radar Chart:")
# # st.write(f"Shape of customer_data: {customer_data.shape}")
# # st.write(f"Shape of euros_proveedor: {euros_proveedor.shape}")
# # Get percentage of units sold for each manufacturer
# customer_df = df[df["CLIENTE"] == str(customer_code)] # Get the customer data
# all_manufacturers = customer_df.iloc[:, 1:].T # Exclude CLIENTE column (manufacturers are in columns)
# all_manufacturers.index = all_manufacturers.index.astype(str)
# # Get total sales for each manufacturer from euros_proveedor
# customer_euros = euros_proveedor[euros_proveedor["CLIENTE"] == str(customer_code)]
# sales_data = customer_euros.iloc[:, 1:].T # Exclude CLIENTE column
# sales_data.index = sales_data.index.astype(str)
# # Remove the 'CLIENTE' row from sales_data to avoid issues with mixed types
# sales_data_filtered = sales_data.drop(index='CLIENTE', errors='ignore')
# # Ensure all values are numeric
# sales_data_filtered = sales_data_filtered.apply(pd.to_numeric, errors='coerce')
# all_manufacturers = all_manufacturers.apply(pd.to_numeric, errors='coerce')
# # Sort manufacturers by percentage of units and get top 10
# top_units = all_manufacturers.sort_values(by=all_manufacturers.columns[0], ascending=False).head(10)
# # Sort manufacturers by total sales and get top 10
# top_sales = sales_data_filtered.sort_values(by=sales_data_filtered.columns[0], ascending=False).head(10)
# # Combine top manufacturers from both lists and get up to 20 unique manufacturers
# combined_top = pd.concat([top_units, top_sales]).index.unique()[:20]
# # Filter out manufacturers that are not present in both datasets
# combined_top = [m for m in combined_top if m in all_manufacturers.index and m in sales_data_filtered.index]
# # st.write(f"Number of combined top manufacturers: {len(combined_top)}")
# if combined_top:
# # Create a DataFrame with combined data for these top manufacturers
# combined_data = pd.DataFrame({
# 'units': all_manufacturers.loc[combined_top, all_manufacturers.columns[0]],
# 'sales': sales_data_filtered.loc[combined_top, sales_data_filtered.columns[0]]
# }).fillna(0)
# # Sort by units, then by sales
# combined_data_sorted = combined_data.sort_values(by=['units', 'sales'], ascending=False)
# # Filter out manufacturers with 0 units
# non_zero_manufacturers = combined_data_sorted[combined_data_sorted['units'] > 0]
# # If we have less than 3 non-zero manufacturers, add some zero-value ones
# if len(non_zero_manufacturers) < 3:
# zero_manufacturers = combined_data_sorted[combined_data_sorted['units'] == 0].head(3 - len(non_zero_manufacturers))
# manufacturers_to_show = pd.concat([non_zero_manufacturers, zero_manufacturers])
# else:
# manufacturers_to_show = non_zero_manufacturers
# values = manufacturers_to_show['units'].tolist()
# amounts = manufacturers_to_show['sales'].tolist()
# manufacturers = [get_supplier_name(m) for m in manufacturers_to_show.index]
# # st.write(f"### Results for top {len(manufacturers)} manufacturers:")
# # for manufacturer, value, amount in zip(manufacturers, values, amounts):
# # (f"{manufacturer} = {value:.2f}% of units, €{amount:.2f} total sales")
# if manufacturers: # Only create the chart if we have data
# fig = radar_chart(manufacturers, values, amounts, f'Radar Chart for Top {len(manufacturers)} Manufacturers of Customer {customer_code}')
# st.pyplot(fig)
# else:
# st.warning("No data available to create the radar chart.")
# else:
# st.warning("No combined top manufacturers found.")
# # Ensure codigo_cliente in ventas_clientes is a string
# ventas_clientes['codigo_cliente'] = ventas_clientes['codigo_cliente'].astype(str).str.strip()
# # Ensure customer_code is a string and strip any spaces
# customer_code = str(customer_code).strip()
# # if customer_code in ventas_clientes['codigo_cliente'].unique():
# # (f"Customer {customer_code} found in ventas_clientes")
# # else:
# # (f"Customer {customer_code} not found in ventas_clientes")
# # Customer sales 2021-2024 (if data exists)
# sales_columns = ['VENTA_2021', 'VENTA_2022', 'VENTA_2023']
# if all(col in ventas_clientes.columns for col in sales_columns):
# customer_sales_data = ventas_clientes[ventas_clientes['codigo_cliente'] == customer_code]
# if not customer_sales_data.empty:
# customer_sales = customer_sales_data[sales_columns].values[0]
# years = ['2021', '2022', '2023']
# # Add the 2024 actual and predicted data
# if 'ventas_predichas' in results.columns and 'ventas_reales' in results.columns:
# # Get the actual and predicted sales for 2024
# actual_sales_2024 = results[results['fecha_mes'].str.startswith('2024')]['ventas_reales'].sum()
# predicted_sales_2024 = results[results['fecha_mes'].str.startswith('2024')]['ventas_predichas'].sum()
# # Estimate full-year predicted sales (assuming predictions available until September)
# months_available = 9 # Data available until September
# actual_sales_2024_annual = (actual_sales_2024 / months_available) * 12
# # Add 2024 actual and predicted sales
# sales_values = list(customer_sales) + [actual_sales_2024_annual] # Actual sales
# predicted_values = list(customer_sales) + [predicted_sales_2024] # Predicted sales
# # Add 2024 to the years list
# years.append('2024')
# fig_sales_bar = go.Figure()
# # Add trace for historical sales (2021-2023)
# fig_sales_bar.add_trace(go.Bar(
# x=years[:3], # 2021, 2022, 2023
# y=sales_values[:3],
# name="Historical Sales",
# marker_color='blue'
# ))
# # Add trace for 2024 actual sales
# fig_sales_bar.add_trace(go.Bar(
# x=[years[3]], # 2024
# y=[sales_values[3]],
# name="2024 Actual Sales (Annualized)",
# marker_color='green'
# ))
# # Add trace for 2024 predicted sales
# fig_sales_bar.add_trace(go.Bar(
# x=[years[3]], # 2024
# y=[predicted_values[3]],
# name="2024 Predicted Sales",
# marker_color='orange'
# ))
# # Update layout
# fig_sales_bar.update_layout(
# title=f"Sales Over the Years for Customer {customer_code}",
# xaxis_title="Year",
# yaxis_title="Sales (€)",
# barmode='group',
# height=600,
# legend_title_text="Sales Type",
# hovermode="x unified"
# )
# # Show the interactive bar chart in Streamlit
# st.plotly_chart(fig_sales_bar, use_container_width=True)
# else:
# st.warning(f"No predicted or actual data found for customer {customer_code} for 2024.")
# else:
# st.warning(f"No historical sales data found for customer {customer_code}")
# else:
# st.warning("Sales data for 2021-2023 not available in the dataset.")
# Customer Recommendations Page
elif page == "Articles Recommendations":
st.title("Articles Recommendations")
st.markdown("""
Get tailored recommendations for your customers based on their basket.
""")
st.write("Select items and assign quantities for the basket:")
# Mostrar lista de artículos disponibles
available_articles = productos['ARTICULO'].unique()
selected_articles = st.multiselect("Select Articles", available_articles)
# Crear inputs para ingresar las cantidades de cada artículo seleccionado
quantities = {}
for article in selected_articles:
quantities[article] = st.number_input(f"Quantity for {article}", min_value=0, step=1)
if st.button("Calcular"): # Añadimos el botón "Calcular"
# Crear una lista de artículos basada en la selección
new_basket = [f"{article} x{quantities[article]}" for article in selected_articles if quantities[article] > 0]
if new_basket:
# Procesar la lista para recomendar
recommendations_df = recomienda_tfid(new_basket)
if not recommendations_df.empty:
st.write("### Recommendations based on the current basket:")
st.dataframe(recommendations_df)
else:
st.warning("No recommendations found for the provided basket.")
else:
st.warning("Please select at least one article and set its quantity.")
# # Customer Recommendations Page
# elif page == "Articles Recommendations":
# st.title("Articles Recommendations")
# st.markdown("""
# Get tailored recommendations for your customers based on their basket.
# """)
# # Campo input para cliente
# partial_code = st.text_input("Enter part of Customer Code for Recommendations (or leave empty to see all)")
# if partial_code:
# filtered_customers = df[df['CLIENTE'].str.contains(partial_code)]
# else:
# filtered_customers = df
# customer_list = filtered_customers['CLIENTE'].unique()
# customer_code = st.selectbox("Select Customer Code for Recommendations", [""] + list(customer_list))
# # Definición de la función recomienda
# def recomienda(new_basket):
# # Calcular la matriz TF-IDF
# tfidf = TfidfVectorizer()
# tfidf_matrix = tfidf.fit_transform(cestas['Cestas'])
# # Convertir la nueva cesta en formato TF-IDF
# new_basket_str = ' '.join(new_basket)
# new_basket_tfidf = tfidf.transform([new_basket_str])
# # Comparar la nueva cesta con las anteriores
# similarities = cosine_similarity(new_basket_tfidf, tfidf_matrix)
# # Obtener los índices de las cestas más similares
# similar_indices = similarities.argsort()[0][-3:] # Las 3 más similares
# # Crear un diccionario para contar las recomendaciones
# recommendations_count = {}
# total_similarity = 0
# # Recomendar productos de cestas similares
# for idx in similar_indices:
# sim_score = similarities[0][idx]
# total_similarity += sim_score
# products = cestas.iloc[idx]['Cestas'].split()
# for product in products:
# if product.strip() not in new_basket: # Evitar recomendar lo que ya está en la cesta
# if product.strip() in recommendations_count:
# recommendations_count[product.strip()] += sim_score
# else:
# recommendations_count[product.strip()] = sim_score
# # Calcular la probabilidad relativa de cada producto recomendado
# recommendations_with_prob = []
# if total_similarity > 0: # Verificar que total_similarity no sea cero
# recommendations_with_prob = [(product, score / total_similarity) for product, score in recommendations_count.items()]
# else:
# print("No se encontraron similitudes suficientes para calcular probabilidades.")
# recommendations_with_prob.sort(key=lambda x: x[1], reverse=True) # Ordenar por puntuación
# # Crear un nuevo DataFrame para almacenar las recomendaciones con descripciones y probabilidades
# recommendations_df = pd.DataFrame(columns=['ARTICULO', 'DESCRIPCION', 'PROBABILIDAD'])
# # Agregar las recomendaciones al DataFrame usando pd.concat
# for product, prob in recommendations_with_prob:
# # Buscar la descripción en el DataFrame de productos
# description = productos.loc[productos['ARTICULO'] == product, 'DESCRIPCION']
# if not description.empty:
# # Crear un nuevo DataFrame temporal para la recomendación
# temp_df = pd.DataFrame({
# 'ARTICULO': [product],
# 'DESCRIPCION': [description.values[0]], # Obtener el primer valor encontrado
# 'PROBABILIDAD': [prob]
# })
# # Concatenar el DataFrame temporal al DataFrame de recomendaciones
# recommendations_df = pd.concat([recommendations_df, temp_df], ignore_index=True)
# return recommendations_df
# # Comprobar si el cliente está en el CSV de fieles
# is_fiel = customer_code in fieles_df['Cliente'].astype(str).values
# if customer_code:
# if is_fiel:
# st.write(f"### Customer {customer_code} is a loyal customer.")
# option = st.selectbox("Select Recommendation Type", ["Select an option", "By Purchase History", "By Current Basket"])
# if option == "By Purchase History":
# st.warning("Option not available... aún")
# elif option == "By Current Basket":
# st.write("Select the items and assign quantities for the basket:")
# # Mostrar lista de artículos disponibles
# available_articles = productos['ARTICULO'].unique()
# selected_articles = st.multiselect("Select Articles", available_articles)
# # Crear inputs para ingresar las cantidades de cada artículo seleccionado
# quantities = {}
# for article in selected_articles:
# quantities[article] = st.number_input(f"Quantity for {article}", min_value=0, step=1)
# if st.button("Calcular"): # Añadimos el botón "Calcular"
# # Crear una lista de artículos basada en la selección
# new_basket = [f"{article} x{quantities[article]}" for article in selected_articles if quantities[article] > 0]
# if new_basket:
# # Procesar la lista para recomendar
# recommendations_df = recomienda(new_basket)
# if not recommendations_df.empty:
# st.write("### Recommendations based on the current basket:")
# st.dataframe(recommendations_df)
# else:
# st.warning("No recommendations found for the provided basket.")
# else:
# st.warning("Please select at least one article and set its quantity.")
# else:
# st.write(f"### Customer {customer_code} is not a loyal customer.")
# st.write("Select items and assign quantities for the basket:")
# # Mostrar lista de artículos disponibles
# available_articles = productos['ARTICULO'].unique()
# selected_articles = st.multiselect("Select Articles", available_articles)
# # Crear inputs para ingresar las cantidades de cada artículo seleccionado
# quantities = {}
# for article in selected_articles:
# quantities[article] = st.number_input(f"Quantity for {article}", min_value=0, step=1)
# if st.button("Calcular"): # Añadimos el botón "Calcular"
# # Crear una lista de artículos basada en la selección
# new_basket = [f"{article} x{quantities[article]}" for article in selected_articles if quantities[article] > 0]
# if new_basket:
# # Procesar la lista para recomendar
# recommendations_df = recomienda(new_basket)
# if not recommendations_df.empty:
# st.write("### Recommendations based on the current basket:")
# st.dataframe(recommendations_df)
# else:
# st.warning("No recommendations found for the provided basket.")
# else:
# st.warning("Please select at least one article and set its quantity.")