import streamlit as st import time import pandas as pd import plotly.express as px import plotly.graph_objects as go import matplotlib.pyplot as plt import numpy as np import lightgbm as lgb from sklearn.feature_extraction.text import TfidfVectorizer from sklearn.metrics.pairwise import cosine_similarity from sklearn.metrics import mean_absolute_error, mean_squared_error from joblib import dump, load from utils import recomienda_tfid # Page configuration st.set_page_config(page_title="DeepInsightz", page_icon=":bar_chart:", layout="wide") # Custom CSS for dynamic theme styling # Streamlit detects light and dark mode automatically via the user's settings in Hugging Face Spaces if st.get_option("theme.base") == "dark": background_color = "#282828" text_color = "white" metric_box_color = "#4f4f4f" sidebar_color = "#282828" plot_bgcolor = "rgba(0, 0, 0, 0)" primary_color = '#00FF00' # for positive delta negative_color = '#FF0000' # for negative delta else: background_color = "#f4f4f4" text_color = "#black" metric_box_color = "#dee2e8" sidebar_color = "#dee2e8" plot_bgcolor = "#f4f4f4" primary_color = '#228B22' # for positive delta in light mode negative_color = '#8B0000' # for negative delta in light mode st.markdown(f""" """, unsafe_allow_html=True) # Load CSV files at the top df = pd.read_csv("df_clean.csv") nombres_proveedores = pd.read_csv("nombres_proveedores.csv", sep=';') euros_proveedor = pd.read_csv("euros_proveedor.csv", sep=',') ventas_clientes = pd.read_csv("ventas_clientes.csv", sep=',') customer_clusters = pd.read_csv('predicts/customer_clusters.csv') # Load the customer clusters here df_agg_2024 = pd.read_csv('predicts/df_agg_2024.csv') pca_data_5 = pd.read_csv('pca_data.csv') historical_data = pd.read_csv('historical_data.csv') with st.sidebar: st.sidebar.title("DeepInsightz") page = st.sidebar.selectbox("Select the tool you want to use", ["Summary", "Customer Analysis", "Articles Recommendations"]) # Generamos la columna total_sales ventas_clientes['total_sales'] = ventas_clientes[['VENTA_2021', 'VENTA_2022', 'VENTA_2023']].sum(axis=1) # Ordenar los clientes de mayor a menor según sus ventas totales ventas_top_100 = ventas_clientes.sort_values(by='total_sales', ascending=False).head(100) # Ensure customer codes are strings df['CLIENTE'] = df['CLIENTE'].astype(str) nombres_proveedores['codigo'] = nombres_proveedores['codigo'].astype(str) euros_proveedor['CLIENTE'] = euros_proveedor['CLIENTE'].astype(str) customer_clusters['cliente_id'] = customer_clusters['cliente_id'].astype(str) # Ensure customer IDs are strings fieles_df = pd.read_csv("clientes_relevantes.csv") cestas = pd.read_csv("cestas.csv") productos = pd.read_csv("productos.csv") df_agg_2024['cliente_id'] = df_agg_2024['cliente_id'].astype(str) marca_id_mapping = load('marca_id_mapping.joblib') # Convert all columns except 'CLIENTE' to float in euros_proveedor for col in euros_proveedor.columns: if col != 'CLIENTE': euros_proveedor[col] = pd.to_numeric(euros_proveedor[col], errors='coerce') # Check for NaN values after conversion if euros_proveedor.isna().any().any(): st.warning("Some values in euros_proveedor couldn't be converted to numbers. Please review the input data.") # Ignore the last two columns of df df = df.iloc[:, :-2] # Function to get supplier name def get_supplier_name(code): code = str(code) # Ensure code is a string name = nombres_proveedores[nombres_proveedores['codigo'] == code]['nombre'].values return name[0] if len(name) > 0 else code def get_supplier_name_encoded(encoded_code): try: # Ensure the encoded code is an integer encoded_code = int(encoded_code) print(f"Encoded Code: {encoded_code}") # Use the label encoder to map the encoded code back to the original manufacturer code if encoded_code < len(marca_id_mapping.classes_): real_code = marca_id_mapping.inverse_transform([encoded_code])[0] print(f"Real Manufacturer Code: {real_code}") else: print(f"Encoded code not found in the label encoder: {encoded_code}") return f"Unknown code: {encoded_code}" # Handle case where encoded code is not found # Now, use the real_code to find the manufacturer name in nombres_proveedores name = nombres_proveedores[nombres_proveedores['codigo'] == str(real_code)]['nombre'].values print(f"Manufacturer Name Found: {name}") # Check what name is returned # Return the manufacturer name if found, otherwise return the real_code return name[0] if len(name) > 0 else real_code except Exception as e: print(f"Error encountered: {e}") return f"Error for code: {encoded_code}" # Custom Donut Chart with Plotly for Inbound/Outbound Percentage def create_donut_chart(values, labels, color_scheme, title): fig = px.pie( values=values, names=labels, hole=0.7, color_discrete_sequence=color_scheme ) fig.update_traces(textinfo='percent+label', hoverinfo='label+percent', textposition='inside', showlegend=False) fig.update_layout( annotations=[dict(text=f"{int(values[1])}%", x=0.5, y=0.5, font_size=40, showarrow=False)], title=title, height=300, margin=dict(t=30, b=10, l=10, r=10), paper_bgcolor=plot_bgcolor, # Use theme-dependent background color plot_bgcolor=plot_bgcolor ) return fig # Donut chart with color scheme based on theme if st.get_option("theme.base") == "dark": donut_color_scheme = ['#155F7A', '#29b5e8'] # Dark mode colors else: donut_color_scheme = ['#007BFF', '#66b5ff'] # Light mode colors # Function to create radar chart with square root transformation def radar_chart(categories, values, amounts, title): N = len(categories) angles = [n / float(N) * 2 * np.pi for n in range(N)] angles += angles[:1] fig, ax = plt.subplots(figsize=(12, 12), subplot_kw=dict(projection='polar')) # Apply square root transformation sqrt_values = np.sqrt(values) sqrt_amounts = np.sqrt(amounts) max_sqrt_value = max(sqrt_values) normalized_values = [v / max_sqrt_value for v in sqrt_values] # Adjust scaling for spend values max_sqrt_amount = max(sqrt_amounts) scaling_factor = 0.7 # Adjust this value to control how much the spend values are scaled up normalized_amounts = [min((a / max_sqrt_amount) * scaling_factor, 1.0) for a in sqrt_amounts] normalized_values += normalized_values[:1] ax.plot(angles, normalized_values, 'o-', linewidth=2, color='#FF69B4', label='% Units (sqrt)') ax.fill(angles, normalized_values, alpha=0.25, color='#FF69B4') normalized_amounts += normalized_amounts[:1] ax.plot(angles, normalized_amounts, 'o-', linewidth=2, color='#4B0082', label='% Spend (sqrt)') ax.fill(angles, normalized_amounts, alpha=0.25, color='#4B0082') ax.set_xticks(angles[:-1]) ax.set_xticklabels(categories, size=8, wrap=True) ax.set_ylim(0, 1) circles = np.linspace(0, 1, 5) for circle in circles: ax.plot(angles, [circle]*len(angles), '--', color='gray', alpha=0.3, linewidth=0.5) ax.set_yticklabels([]) ax.spines['polar'].set_visible(False) plt.title(title, size=16, y=1.1) plt.legend(loc='upper right', bbox_to_anchor=(1.3, 1.1)) return fig if page == "Summary": # st.title("Welcome to DeepInsightz") # st.markdown(""" # ### Data-driven Customer Clustering # We analyzed thousands of customers and suppliers to help businesses make smarter sales decisions. # """) # Create layout with three columns col1, col2, col3 = st.columns((1.5, 4, 2.5), gap='medium') # Left Column (Red): Metrics and Donut Charts with col1: st.markdown('#### General Information') st.metric(label="Range of Dates", value="2021-2023") st.metric(label="Customers Analysed", value="3.000") st.metric(label="Unique Products Sold", value="10.702") st.metric(label="Total Sales Instances", value="764.396") # Middle Column (White): 3D Cluster Model and Bar Chart with col2: st.markdown('#### 3D Customer Clusters') # Create 3D PCA plot using actual data from pca_data_5 fig_cluster = px.scatter_3d( pca_data_5, x='PC1', y='PC2', z='PC3', color='cluster_id', hover_name='CustomerID', ) fig_cluster.update_layout( scene=dict(aspectratio=dict(x=1, y=1, z=0.8)), # Adjusted aspect ratio for better balance margin=dict(t=10, b=10, l=10, r=10), # Tighten margins further height=600, # Slightly increased height for better visibility ) st.plotly_chart(fig_cluster, use_container_width=True) # Right Column (Blue): Key Metrics Overview and Data Preparation Summary with col3: # Mostrar la tabla con los 100 mejores clientes st.markdown('#### Top 100 Clients by Total Sales') # Configurar columnas para mostrar los clientes y las ventas totales st.dataframe(ventas_top_100[['codigo_cliente', 'total_sales']], column_order=("codigo_cliente", "total_sales"), hide_index=True, width=450, # Ajustar el ancho de la tabla height=600, # Ajustar la altura de la tabla column_config={ "codigo_cliente": st.column_config.TextColumn( "Client Code", ), "total_sales": st.column_config.ProgressColumn( "Total Sales (€)", format="%d", min_value=0, max_value=ventas_top_100['total_sales'].max() )} ) # Customer Analysis Page # Customer Analysis Page elif page == "Customer Analysis": st.markdown("""
Enter the customer code to explore detailed customer insights, including past sales, predictions for the current year, and manufacturer-specific information.
""", unsafe_allow_html=True) # Combine text input and dropdown into a single searchable selectbox customer_code = st.selectbox( "Search and Select Customer Code", df['CLIENTE'].unique(), # All customer codes format_func=lambda x: str(x), # Ensures the values are displayed as strings help="Start typing to search for a specific customer code" ) if st.button("Calcular"): if customer_code: with st.spinner("We are identifying the customer's cluster..."): # Find Customer's Cluster customer_match = customer_clusters[customer_clusters['cliente_id'] == customer_code] time.sleep(1) if not customer_match.empty: cluster = customer_match['cluster_id'].values[0] with st.spinner(f"Selecting predictive model..."): # Load the Corresponding Model model_path = f'models/modelo_cluster_{cluster}.txt' gbm = lgb.Booster(model_file=model_path) with st.spinner("Getting the data ready..."): # Load predict data for that cluster predict_data = pd.read_csv(f'predicts/predict_cluster_{cluster}.csv') # Convert cliente_id to string predict_data['cliente_id'] = predict_data['cliente_id'].astype(str) with st.spinner("Filtering data..."): # Filter for the specific customer customer_code_str = str(customer_code) customer_data = predict_data[predict_data['cliente_id'] == customer_code_str] with st.spinner("Generating sales predictions..."): if not customer_data.empty: # Define features consistently with the training process lag_features = [f'precio_total_lag_{lag}' for lag in range(1, 25)] features = lag_features + ['mes', 'marca_id_encoded', 'año', 'cluster_id'] # Prepare data for prediction X_predict = customer_data[features] # Convert categorical features to 'category' dtype categorical_features = ['mes', 'marca_id_encoded', 'cluster_id'] for feature in categorical_features: X_predict[feature] = X_predict[feature].astype('category') # Make Prediction for the selected customer y_pred = gbm.predict(X_predict, num_iteration=gbm.best_iteration) # Reassemble the results results = customer_data[['cliente_id', 'marca_id_encoded', 'fecha_mes']].copy() results['ventas_predichas'] = y_pred # Load actual data from df_agg_2024 actual_sales = df_agg_2024[df_agg_2024['cliente_id'] == customer_code_str] if not actual_sales.empty: # Merge predictions with actual sales results = results.merge(actual_sales[['cliente_id', 'marca_id_encoded', 'fecha_mes', 'precio_total']], on=['cliente_id', 'marca_id_encoded', 'fecha_mes'], how='left') results.rename(columns={'precio_total': 'ventas_reales'}, inplace=True) else: # If no actual sales data for 2024, fill 'ventas_reales' with 0 results['ventas_reales'] = 0 # Ensure any missing sales data is filled with 0 results['ventas_reales'].fillna(0, inplace=True) # Define the cutoff date for the last 12 months fecha_inicio = pd.to_datetime("2023-01-01") fecha_corte = pd.to_datetime("2024-09-01") # Convertir fecha_mes a datetime en el DataFrame historical_data historical_data['fecha_mes'] = pd.to_datetime(historical_data['fecha_mes'], errors='coerce') # Ensure cliente_id is of type string and strip any leading/trailing whitespace historical_data['cliente_id'] = historical_data['cliente_id'].astype(str).str.strip() customer_code_str = str(customer_code).strip() # Ensure the customer code is also properly formatted filtered_historical_data = historical_data[historical_data['cliente_id'] == customer_code_str] # Filtrar los datos históricos por cliente y por el rango de fechas (2023) fecha_inicio_2023 = pd.to_datetime("2023-01-01") fecha_fin_2023 = pd.to_datetime("2023-12-31") datos_historicos = historical_data[ (historical_data['cliente_id'] == customer_code_str) & (historical_data['fecha_mes'] >= fecha_inicio_2023) & (historical_data['fecha_mes'] <= fecha_fin_2023) ].groupby('fecha_mes')['precio_total'].sum().reset_index() # Renombrar la columna 'precio_total' a 'ventas_historicas' si no está vacía if not datos_historicos.empty: datos_historicos.rename(columns={'precio_total': 'ventas_historicas'}, inplace=True) else: # Si los datos históricos están vacíos, generar fechas de 2023 con ventas_historicas = 0 fechas_2023 = pd.date_range(start='2023-01-01', end='2023-12-31', freq='M') datos_historicos = pd.DataFrame({'fecha_mes': fechas_2023, 'ventas_historicas': [0] * len(fechas_2023)}) # Filtrar los datos de predicciones y ventas reales para 2024 datos_cliente_total = results.groupby('fecha_mes').agg({ 'ventas_reales': 'sum', 'ventas_predichas': 'sum' }).reset_index() # Asegurarnos de que fecha_mes en datos_cliente_total es datetime datos_cliente_total['fecha_mes'] = pd.to_datetime(datos_cliente_total['fecha_mes'], errors='coerce') # Generar un rango de fechas para 2024 si no hay predicciones fechas_2024 = pd.date_range(start='2024-01-01', end='2024-12-31', freq='M') fechas_df_2024 = pd.DataFrame({'fecha_mes': fechas_2024}) # Asegurarnos de que fecha_mes en fechas_df_2024 es datetime fechas_df_2024['fecha_mes'] = pd.to_datetime(fechas_df_2024['fecha_mes'], errors='coerce') # Combinar datos históricos con predicciones y ventas reales usando un merge # Usamos how='outer' para asegurarnos de incluir todas las fechas de 2023 y 2024 datos_combinados = pd.merge(datos_historicos, datos_cliente_total, on='fecha_mes', how='outer').sort_values('fecha_mes') # Rellenar los NaN: 0 en ventas_historicas donde faltan predicciones, y viceversa datos_combinados['ventas_historicas'].fillna(0, inplace=True) datos_combinados['ventas_predichas'].fillna(0, inplace=True) datos_combinados['ventas_reales'].fillna(0, inplace=True) # Crear la gráfica con Plotly fig = go.Figure() # Graficar ventas históricas fig.add_trace(go.Scatter( x=datos_combinados['fecha_mes'], y=datos_combinados['ventas_historicas'], mode='lines+markers', name='Ventas Históricas', line=dict(color='blue') )) # Graficar ventas predichas fig.add_trace(go.Scatter( x=datos_combinados['fecha_mes'], y=datos_combinados['ventas_predichas'], mode='lines+markers', name='Ventas Predichas', line=dict(color='orange') )) # Graficar ventas reales fig.add_trace(go.Scatter( x=datos_combinados['fecha_mes'], y=datos_combinados['ventas_reales'], mode='lines+markers', name='Ventas Reales', line=dict(color='green') )) # Personalizar el layout para enfocarse en 2023 y 2024 fig.update_layout( title=f"Ventas Históricas, Predichas y Reales para Cliente {customer_code}", xaxis_title="Fecha", yaxis_title="Ventas (€)", height=600, xaxis_range=[fecha_inicio_2023, pd.to_datetime("2024-09-30")], # Ajustar el rango del eje x a 2023-2024 legend_title="Tipo de Ventas", hovermode="x unified" ) # Mostrar la gráfica en Streamlit st.plotly_chart(fig) # Split space into two columns col1, col2 = st.columns(2) # Column 1: Radar chart for top manufacturers with col1: st.subheader("This looks great!") st.info("Your customer did exceed predicted sales from the following brands:") # Group results by manufacturer to calculate the total predicted and actual sales grouped_results = results.groupby('marca_id_encoded').agg({ 'ventas_reales': 'sum', 'ventas_predichas': 'sum' }).reset_index() # Identify manufacturers that exceeded predicted sales overperforming_manufacturers = grouped_results[grouped_results['ventas_reales'] > grouped_results['ventas_predichas']].copy() if not overperforming_manufacturers.empty: # Calculate the extra amount (difference between actual and predicted sales) overperforming_manufacturers['extra_amount'] = overperforming_manufacturers['ventas_reales'] - overperforming_manufacturers['ventas_predichas'] # Sort by the highest extra amount overperforming_manufacturers = overperforming_manufacturers.sort_values(by='extra_amount', ascending=False) # Limit to top 10 overperforming manufacturers top_overperformers = overperforming_manufacturers.head(10) # Display two cards per row for i in range(0, len(top_overperformers), 2): cols = st.columns(2) # Create two columns for two cards in a row for j, col in enumerate(cols): if i + j < len(top_overperformers): row = top_overperformers.iloc[i + j] manufacturer_name = get_supplier_name_encoded(row['marca_id_encoded']) predicted = row['ventas_predichas'] actual = row['ventas_reales'] extra = row['extra_amount'] # Use st.metric for compact display in each column with col: st.metric( label=f"{manufacturer_name}", value=f"{actual:.2f}€", delta=f"Exceeded by {extra:.2f}€", delta_color="normal" ) # Radar chart logic remains the same customer_df = df[df["CLIENTE"] == str(customer_code)] all_manufacturers = customer_df.iloc[:, 1:].T all_manufacturers.index = all_manufacturers.index.astype(str) customer_euros = euros_proveedor[euros_proveedor["CLIENTE"] == str(customer_code)] sales_data = customer_euros.iloc[:, 1:].T sales_data.index = sales_data.index.astype(str) sales_data_filtered = sales_data.drop(index='CLIENTE', errors='ignore') sales_data_filtered = sales_data_filtered.apply(pd.to_numeric, errors='coerce') all_manufacturers = all_manufacturers.apply(pd.to_numeric, errors='coerce') top_units = all_manufacturers.sort_values(by=all_manufacturers.columns[0], ascending=False).head(10) top_sales = sales_data_filtered.sort_values(by=sales_data_filtered.columns[0], ascending=False).head(10) combined_top = pd.concat([top_units, top_sales]).index.unique()[:20] combined_top = [m for m in combined_top if m in all_manufacturers.index and m in sales_data_filtered.index] if combined_top: combined_data = pd.DataFrame({ 'units': all_manufacturers.loc[combined_top, all_manufacturers.columns[0]], 'sales': sales_data_filtered.loc[combined_top, sales_data_filtered.columns[0]] }).fillna(0) combined_data_sorted = combined_data.sort_values(by=['units', 'sales'], ascending=False) non_zero_manufacturers = combined_data_sorted[combined_data_sorted['units'] > 0] if len(non_zero_manufacturers) < 3: zero_manufacturers = combined_data_sorted[combined_data_sorted['units'] == 0].head(3 - len(non_zero_manufacturers)) manufacturers_to_show = pd.concat([non_zero_manufacturers, zero_manufacturers]) else: manufacturers_to_show = non_zero_manufacturers values = manufacturers_to_show['units'].tolist() amounts = manufacturers_to_show['sales'].tolist() manufacturers = [get_supplier_name(m) for m in manufacturers_to_show.index] if manufacturers: fig = radar_chart(manufacturers, values, amounts, f'Radar Chart for Top {len(manufacturers)} Manufacturers of Customer {customer_code}') st.pyplot(fig) # Column 2: Alerts and additional analysis with col2: st.subheader("You might need to check this!") st.warning("Your customer was expected to buy more products from the following brands:") # Group results by manufacturer to calculate the total predicted and actual sales grouped_results = results.groupby('marca_id_encoded').agg({ 'ventas_reales': 'sum', 'ventas_predichas': 'sum' }).reset_index() # Identify manufacturers that didn't meet predicted sales underperforming_manufacturers = grouped_results[grouped_results['ventas_reales'] < grouped_results['ventas_predichas']].copy() if not underperforming_manufacturers.empty: # Calculate the missed amount underperforming_manufacturers['missed_amount'] = underperforming_manufacturers['ventas_predichas'] - underperforming_manufacturers['ventas_reales'] # Sort by the highest missed amount underperforming_manufacturers = underperforming_manufacturers.sort_values(by='missed_amount', ascending=False) # Limit to top 10 missed amounts top_misses = underperforming_manufacturers.head(10) # Display two cards per row for i in range(0, len(top_misses), 2): cols = st.columns(2) # Create two columns for two cards in a row for j, col in enumerate(cols): if i + j < len(top_misses): row = top_misses.iloc[i + j] manufacturer_name = get_supplier_name_encoded(row['marca_id_encoded']) predicted = row['ventas_predichas'] actual = row['ventas_reales'] missed = row['missed_amount'] # Use st.metric for compact display in each column with col: st.metric( label=f"{manufacturer_name}", value=f"{actual:.2f}€", delta=f"Missed by {missed:.2f}€", delta_color="inverse" ) else: st.success("All manufacturers have met or exceeded predicted sales.") # # Gráfico adicional: Comparar las ventas predichas y reales para los principales fabricantes # st.markdown("### Predicted vs Actual Sales for Top Manufacturers") # top_manufacturers = results.groupby('marca_id_encoded').agg({'ventas_reales': 'sum', 'ventas_predichas': 'sum'}).sort_values(by='ventas_reales', ascending=False).head(10) # fig_comparison = go.Figure() # fig_comparison.add_trace(go.Bar(x=top_manufacturers.index, y=top_manufacturers['ventas_reales'], name="Actual Sales", marker_color='blue')) # fig_comparison.add_trace(go.Bar(x=top_manufacturers.index, y=top_manufacturers['ventas_predichas'], name="Predicted Sales", marker_color='orange')) # fig_comparison.update_layout( # title="Actual vs Predicted Sales by Top Manufacturers", # xaxis_title="Manufacturer", # yaxis_title="Sales (€)", # barmode='group', # height=400, # hovermode="x unified" # ) # st.plotly_chart(fig_comparison, use_container_width=True) # Gráfico de ventas anuales ventas_clientes['codigo_cliente'] = ventas_clientes['codigo_cliente'].astype(str).str.strip() sales_columns = ['VENTA_2021', 'VENTA_2022', 'VENTA_2023'] if all(col in ventas_clientes.columns for col in sales_columns): customer_sales_data = ventas_clientes[ventas_clientes['codigo_cliente'] == customer_code] if not customer_sales_data.empty: customer_sales = customer_sales_data[sales_columns].values[0] years = ['2021', '2022', '2023'] # Convert 'fecha_mes' to datetime format if it's not already if not pd.api.types.is_datetime64_any_dtype(results['fecha_mes']): results['fecha_mes'] = pd.to_datetime(results['fecha_mes'], errors='coerce') # Add the 2024 actual and predicted data if 'ventas_predichas' in results.columns and 'ventas_reales' in results.columns: actual_sales_2024 = results[results['fecha_mes'].dt.year == 2024]['ventas_reales'].sum() predicted_sales_2024 = results[results['fecha_mes'].dt.year == 2024]['ventas_predichas'].sum() # Assuming only 9 months of actual data are available, annualize the sales months_available = 9 actual_sales_2024_annual = (actual_sales_2024 / months_available) * 12 # Prepare data for the bar chart sales_values = list(customer_sales) + [actual_sales_2024_annual] predicted_values = list(customer_sales) + [predicted_sales_2024] years.append('2024') # Create the bar chart for historical and 2024 data fig_sales_bar = go.Figure() fig_sales_bar.add_trace(go.Bar( x=years[:3], y=sales_values[:3], name="Historical Sales", marker_color='blue' )) fig_sales_bar.add_trace(go.Bar( x=[years[3]], y=[sales_values[3]], name="2024 Actual Sales (Annualized)", marker_color='green' )) fig_sales_bar.add_trace(go.Bar( x=[years[3]], y=[predicted_values[3]], name="2024 Predicted Sales", marker_color='orange' )) # Customize layout fig_sales_bar.update_layout( title=f"Sales Over the Years for Customer {customer_code}", xaxis_title="Year", yaxis_title="Sales (€)", barmode='group', height=600, legend_title_text="Sales Type", hovermode="x unified" ) # Display the chart st.plotly_chart(fig_sales_bar, use_container_width=True) else: st.warning(f"No predicted or actual data found for customer {customer_code} for 2024.") # elif page == "Customer Analysis": # st.markdown(""" ## Enter the customer code to explore detailed customer insights, # including past sales, predictions for the current year, and manufacturer-specific information. #
# """, unsafe_allow_html=True) # # Combine text input and dropdown into a single searchable selectbox # customer_code = st.selectbox( # "Search and Select Customer Code", # df['CLIENTE'].unique(), # All customer codes # format_func=lambda x: str(x), # Ensures the values are displayed as strings # help="Start typing to search for a specific customer code" # ) # if st.button("Calcular"): # if customer_code: # with st.spinner("We are identifying the customer's cluster..."): # # Find Customer's Cluster # customer_match = customer_clusters[customer_clusters['cliente_id'] == customer_code] # time.sleep(1) # if not customer_match.empty: # cluster = customer_match['cluster_id'].values[0] # with st.spinner(f"Selecting predictive model..."): # # Load the Corresponding Model # model_path = f'models/modelo_cluster_{cluster}.txt' # gbm = lgb.Booster(model_file=model_path) # with st.spinner("Getting the data ready..."): # # Load predict data for that cluster # predict_data = pd.read_csv(f'predicts/predict_cluster_{cluster}.csv') # # Convert cliente_id to string # predict_data['cliente_id'] = predict_data['cliente_id'].astype(str) # with st.spinner("Filtering data..."): # # Filter for the specific customer # customer_code_str = str(customer_code) # customer_data = predict_data[predict_data['cliente_id'] == customer_code_str] # with st.spinner("Generating sales predictions..."): # if not customer_data.empty: # # Define features consistently with the training process # lag_features = [f'precio_total_lag_{lag}' for lag in range(1, 25)] # features = lag_features + ['mes', 'marca_id_encoded', 'año', 'cluster_id'] # # Prepare data for prediction # X_predict = customer_data[features] # # Convert categorical features to 'category' dtype # categorical_features = ['mes', 'marca_id_encoded', 'cluster_id'] # for feature in categorical_features: # X_predict[feature] = X_predict[feature].astype('category') # # Make Prediction for the selected customer # y_pred = gbm.predict(X_predict, num_iteration=gbm.best_iteration) # # Reassemble the results # results = customer_data[['cliente_id', 'marca_id_encoded', 'fecha_mes']].copy() # results['ventas_predichas'] = y_pred # # Load actual data # actual_sales = df_agg_2024[df_agg_2024['cliente_id'] == customer_code_str] # if not actual_sales.empty: # results = results.merge(actual_sales[['cliente_id', 'marca_id_encoded', 'fecha_mes', 'precio_total']], # on=['cliente_id', 'marca_id_encoded', 'fecha_mes'], # how='left') # results.rename(columns={'precio_total': 'ventas_reales'}, inplace=True) # results['ventas_reales'].fillna(0, inplace=True) # # st.write("### Final Results DataFrame:") # # st.write(results.head()) # # st.write(f"Shape: {results.shape}") # # Calculate metrics only for non-null actual sales # valid_results = results.dropna(subset=['ventas_reales']) # non_zero_actuals = valid_results[valid_results['ventas_reales'] != 0] # if not valid_results.empty: # mae = mean_absolute_error(valid_results['ventas_reales'], valid_results['ventas_predichas']) # mape = np.mean(np.abs((non_zero_actuals['ventas_reales'] - non_zero_actuals['ventas_predichas']) / non_zero_actuals['ventas_reales'])) * 100 # rmse = np.sqrt(mean_squared_error(valid_results['ventas_reales'], valid_results['ventas_predichas'])) # # st.write(f"Actual total sales for Customer {customer_code}: {valid_results['ventas_reales'].sum():.2f}") # # st.write(f"MAE: {mae:.2f}€") # # st.write(f"MAPE: {mape:.2f}%") # # st.write(f"RMSE: {rmse:.2f}") # # # Analysis of results # # threshold_good = 100 # You may want to adjust this threshold # # if mae < threshold_good: # # st.success(f"Customer {customer_code} is performing well based on the predictions.") # # else: # # st.warning(f"Customer {customer_code} is not performing well based on the predictions.") # # else: # # st.warning(f"No actual sales data found for customer {customer_code} in df_agg_2024.") # # st.write("### Debug Information for Radar Chart:") # # st.write(f"Shape of customer_data: {customer_data.shape}") # # st.write(f"Shape of euros_proveedor: {euros_proveedor.shape}") # # Get percentage of units sold for each manufacturer # customer_df = df[df["CLIENTE"] == str(customer_code)] # Get the customer data # all_manufacturers = customer_df.iloc[:, 1:].T # Exclude CLIENTE column (manufacturers are in columns) # all_manufacturers.index = all_manufacturers.index.astype(str) # # Get total sales for each manufacturer from euros_proveedor # customer_euros = euros_proveedor[euros_proveedor["CLIENTE"] == str(customer_code)] # sales_data = customer_euros.iloc[:, 1:].T # Exclude CLIENTE column # sales_data.index = sales_data.index.astype(str) # # Remove the 'CLIENTE' row from sales_data to avoid issues with mixed types # sales_data_filtered = sales_data.drop(index='CLIENTE', errors='ignore') # # Ensure all values are numeric # sales_data_filtered = sales_data_filtered.apply(pd.to_numeric, errors='coerce') # all_manufacturers = all_manufacturers.apply(pd.to_numeric, errors='coerce') # # Sort manufacturers by percentage of units and get top 10 # top_units = all_manufacturers.sort_values(by=all_manufacturers.columns[0], ascending=False).head(10) # # Sort manufacturers by total sales and get top 10 # top_sales = sales_data_filtered.sort_values(by=sales_data_filtered.columns[0], ascending=False).head(10) # # Combine top manufacturers from both lists and get up to 20 unique manufacturers # combined_top = pd.concat([top_units, top_sales]).index.unique()[:20] # # Filter out manufacturers that are not present in both datasets # combined_top = [m for m in combined_top if m in all_manufacturers.index and m in sales_data_filtered.index] # # st.write(f"Number of combined top manufacturers: {len(combined_top)}") # if combined_top: # # Create a DataFrame with combined data for these top manufacturers # combined_data = pd.DataFrame({ # 'units': all_manufacturers.loc[combined_top, all_manufacturers.columns[0]], # 'sales': sales_data_filtered.loc[combined_top, sales_data_filtered.columns[0]] # }).fillna(0) # # Sort by units, then by sales # combined_data_sorted = combined_data.sort_values(by=['units', 'sales'], ascending=False) # # Filter out manufacturers with 0 units # non_zero_manufacturers = combined_data_sorted[combined_data_sorted['units'] > 0] # # If we have less than 3 non-zero manufacturers, add some zero-value ones # if len(non_zero_manufacturers) < 3: # zero_manufacturers = combined_data_sorted[combined_data_sorted['units'] == 0].head(3 - len(non_zero_manufacturers)) # manufacturers_to_show = pd.concat([non_zero_manufacturers, zero_manufacturers]) # else: # manufacturers_to_show = non_zero_manufacturers # values = manufacturers_to_show['units'].tolist() # amounts = manufacturers_to_show['sales'].tolist() # manufacturers = [get_supplier_name(m) for m in manufacturers_to_show.index] # # st.write(f"### Results for top {len(manufacturers)} manufacturers:") # # for manufacturer, value, amount in zip(manufacturers, values, amounts): # # (f"{manufacturer} = {value:.2f}% of units, €{amount:.2f} total sales") # if manufacturers: # Only create the chart if we have data # fig = radar_chart(manufacturers, values, amounts, f'Radar Chart for Top {len(manufacturers)} Manufacturers of Customer {customer_code}') # st.pyplot(fig) # else: # st.warning("No data available to create the radar chart.") # else: # st.warning("No combined top manufacturers found.") # # Ensure codigo_cliente in ventas_clientes is a string # ventas_clientes['codigo_cliente'] = ventas_clientes['codigo_cliente'].astype(str).str.strip() # # Ensure customer_code is a string and strip any spaces # customer_code = str(customer_code).strip() # # if customer_code in ventas_clientes['codigo_cliente'].unique(): # # (f"Customer {customer_code} found in ventas_clientes") # # else: # # (f"Customer {customer_code} not found in ventas_clientes") # # Customer sales 2021-2024 (if data exists) # sales_columns = ['VENTA_2021', 'VENTA_2022', 'VENTA_2023'] # if all(col in ventas_clientes.columns for col in sales_columns): # customer_sales_data = ventas_clientes[ventas_clientes['codigo_cliente'] == customer_code] # if not customer_sales_data.empty: # customer_sales = customer_sales_data[sales_columns].values[0] # years = ['2021', '2022', '2023'] # # Add the 2024 actual and predicted data # if 'ventas_predichas' in results.columns and 'ventas_reales' in results.columns: # # Get the actual and predicted sales for 2024 # actual_sales_2024 = results[results['fecha_mes'].str.startswith('2024')]['ventas_reales'].sum() # predicted_sales_2024 = results[results['fecha_mes'].str.startswith('2024')]['ventas_predichas'].sum() # # Estimate full-year predicted sales (assuming predictions available until September) # months_available = 9 # Data available until September # actual_sales_2024_annual = (actual_sales_2024 / months_available) * 12 # # Add 2024 actual and predicted sales # sales_values = list(customer_sales) + [actual_sales_2024_annual] # Actual sales # predicted_values = list(customer_sales) + [predicted_sales_2024] # Predicted sales # # Add 2024 to the years list # years.append('2024') # fig_sales_bar = go.Figure() # # Add trace for historical sales (2021-2023) # fig_sales_bar.add_trace(go.Bar( # x=years[:3], # 2021, 2022, 2023 # y=sales_values[:3], # name="Historical Sales", # marker_color='blue' # )) # # Add trace for 2024 actual sales # fig_sales_bar.add_trace(go.Bar( # x=[years[3]], # 2024 # y=[sales_values[3]], # name="2024 Actual Sales (Annualized)", # marker_color='green' # )) # # Add trace for 2024 predicted sales # fig_sales_bar.add_trace(go.Bar( # x=[years[3]], # 2024 # y=[predicted_values[3]], # name="2024 Predicted Sales", # marker_color='orange' # )) # # Update layout # fig_sales_bar.update_layout( # title=f"Sales Over the Years for Customer {customer_code}", # xaxis_title="Year", # yaxis_title="Sales (€)", # barmode='group', # height=600, # legend_title_text="Sales Type", # hovermode="x unified" # ) # # Show the interactive bar chart in Streamlit # st.plotly_chart(fig_sales_bar, use_container_width=True) # else: # st.warning(f"No predicted or actual data found for customer {customer_code} for 2024.") # else: # st.warning(f"No historical sales data found for customer {customer_code}") # else: # st.warning("Sales data for 2021-2023 not available in the dataset.") # Customer Recommendations Page # Customer Recommendations Page elif page == "Articles Recommendations": st.title("Articles Recommendations") st.markdown(""" Get tailored recommendations for your customers based on their basket. """) st.write("Select items and assign quantities for the basket:") # Mostrar lista de artículos disponibles available_articles = productos['ARTICULO'].unique() selected_articles = st.multiselect("Select Articles", available_articles) # Crear inputs para ingresar las cantidades de cada artículo seleccionado quantities = {} for article in selected_articles: quantities[article] = st.number_input(f"Quantity for {article}", min_value=0, step=1) if st.button("Calcular"): # Añadimos el botón "Calcular" # Crear una lista de artículos basada en la selección de códigos y cantidades new_basket = [] for article in selected_articles: quantity = quantities[article] if quantity > 0: new_basket.extend([article] * quantity) # Añadir el código 'article' tantas veces como 'quantity' if new_basket: # Procesar la lista para recomendar recommendations_df = recomienda_tfid(new_basket) if not recommendations_df.empty: st.write("### Recommendations based on the current basket:") st.dataframe(recommendations_df) else: st.warning("No recommendations found for the provided basket.") else: st.warning("Please select at least one article and set its quantity.") # # Customer Recommendations Page # elif page == "Articles Recommendations": # st.title("Articles Recommendations") # st.markdown(""" # Get tailored recommendations for your customers based on their basket. # """) # # Campo input para cliente # partial_code = st.text_input("Enter part of Customer Code for Recommendations (or leave empty to see all)") # if partial_code: # filtered_customers = df[df['CLIENTE'].str.contains(partial_code)] # else: # filtered_customers = df # customer_list = filtered_customers['CLIENTE'].unique() # customer_code = st.selectbox("Select Customer Code for Recommendations", [""] + list(customer_list)) # # Definición de la función recomienda # def recomienda(new_basket): # # Calcular la matriz TF-IDF # tfidf = TfidfVectorizer() # tfidf_matrix = tfidf.fit_transform(cestas['Cestas']) # # Convertir la nueva cesta en formato TF-IDF # new_basket_str = ' '.join(new_basket) # new_basket_tfidf = tfidf.transform([new_basket_str]) # # Comparar la nueva cesta con las anteriores # similarities = cosine_similarity(new_basket_tfidf, tfidf_matrix) # # Obtener los índices de las cestas más similares # similar_indices = similarities.argsort()[0][-3:] # Las 3 más similares # # Crear un diccionario para contar las recomendaciones # recommendations_count = {} # total_similarity = 0 # # Recomendar productos de cestas similares # for idx in similar_indices: # sim_score = similarities[0][idx] # total_similarity += sim_score # products = cestas.iloc[idx]['Cestas'].split() # for product in products: # if product.strip() not in new_basket: # Evitar recomendar lo que ya está en la cesta # if product.strip() in recommendations_count: # recommendations_count[product.strip()] += sim_score # else: # recommendations_count[product.strip()] = sim_score # # Calcular la probabilidad relativa de cada producto recomendado # recommendations_with_prob = [] # if total_similarity > 0: # Verificar que total_similarity no sea cero # recommendations_with_prob = [(product, score / total_similarity) for product, score in recommendations_count.items()] # else: # print("No se encontraron similitudes suficientes para calcular probabilidades.") # recommendations_with_prob.sort(key=lambda x: x[1], reverse=True) # Ordenar por puntuación # # Crear un nuevo DataFrame para almacenar las recomendaciones con descripciones y probabilidades # recommendations_df = pd.DataFrame(columns=['ARTICULO', 'DESCRIPCION', 'PROBABILIDAD']) # # Agregar las recomendaciones al DataFrame usando pd.concat # for product, prob in recommendations_with_prob: # # Buscar la descripción en el DataFrame de productos # description = productos.loc[productos['ARTICULO'] == product, 'DESCRIPCION'] # if not description.empty: # # Crear un nuevo DataFrame temporal para la recomendación # temp_df = pd.DataFrame({ # 'ARTICULO': [product], # 'DESCRIPCION': [description.values[0]], # Obtener el primer valor encontrado # 'PROBABILIDAD': [prob] # }) # # Concatenar el DataFrame temporal al DataFrame de recomendaciones # recommendations_df = pd.concat([recommendations_df, temp_df], ignore_index=True) # return recommendations_df # # Comprobar si el cliente está en el CSV de fieles # is_fiel = customer_code in fieles_df['Cliente'].astype(str).values # if customer_code: # if is_fiel: # st.write(f"### Customer {customer_code} is a loyal customer.") # option = st.selectbox("Select Recommendation Type", ["Select an option", "By Purchase History", "By Current Basket"]) # if option == "By Purchase History": # st.warning("Option not available... aún") # elif option == "By Current Basket": # st.write("Select the items and assign quantities for the basket:") # # Mostrar lista de artículos disponibles # available_articles = productos['ARTICULO'].unique() # selected_articles = st.multiselect("Select Articles", available_articles) # # Crear inputs para ingresar las cantidades de cada artículo seleccionado # quantities = {} # for article in selected_articles: # quantities[article] = st.number_input(f"Quantity for {article}", min_value=0, step=1) # if st.button("Calcular"): # Añadimos el botón "Calcular" # # Crear una lista de artículos basada en la selección # new_basket = [f"{article} x{quantities[article]}" for article in selected_articles if quantities[article] > 0] # if new_basket: # # Procesar la lista para recomendar # recommendations_df = recomienda(new_basket) # if not recommendations_df.empty: # st.write("### Recommendations based on the current basket:") # st.dataframe(recommendations_df) # else: # st.warning("No recommendations found for the provided basket.") # else: # st.warning("Please select at least one article and set its quantity.") # else: # st.write(f"### Customer {customer_code} is not a loyal customer.") # st.write("Select items and assign quantities for the basket:") # # Mostrar lista de artículos disponibles # available_articles = productos['ARTICULO'].unique() # selected_articles = st.multiselect("Select Articles", available_articles) # # Crear inputs para ingresar las cantidades de cada artículo seleccionado # quantities = {} # for article in selected_articles: # quantities[article] = st.number_input(f"Quantity for {article}", min_value=0, step=1) # if st.button("Calcular"): # Añadimos el botón "Calcular" # # Crear una lista de artículos basada en la selección # new_basket = [f"{article} x{quantities[article]}" for article in selected_articles if quantities[article] > 0] # if new_basket: # # Procesar la lista para recomendar # recommendations_df = recomienda(new_basket) # if not recommendations_df.empty: # st.write("### Recommendations based on the current basket:") # st.dataframe(recommendations_df) # else: # st.warning("No recommendations found for the provided basket.") # else: # st.warning("Please select at least one article and set its quantity.")