import os
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib as mpl
import pycaret
import streamlit as st
from streamlit_option_menu import option_menu
import PIL
from PIL import Image
from PIL import ImageColor
from PIL import ImageDraw
from PIL import ImageFont
# PyCaret estimator IDs offered on each page.
_CLUSTERING_MODELS = ['kmeans', 'ap', 'meanshift', 'sc', 'hclust', 'dbscan', 'optics', 'birch']
_ANOMALY_MODELS = ['abod', 'cluster', 'cof', 'iforest', 'histogram', 'knn', 'lof', 'svm',
                   'pca', 'mcd', 'sod', 'sos']

# Clustering plots in display order, each paired with the models it is skipped for.
_CLUSTER_PLOTS = (
    ('cluster', ()),
    ('tsne', ('ap',)),
    ('elbow', ('ap', 'meanshift', 'dbscan', 'optics')),
    ('silhouette', ('ap', 'meanshift', 'sc', 'hclust', 'dbscan', 'optics')),
    ('distance', ('ap', 'sc', 'hclust', 'dbscan', 'optics', 'birch')),
    ('distribution', ('ap',)),
)


def _render_sidebar():
    """Draw the sidebar (logo, page menu, settings) and return ``(page, settings)``.

    ``settings`` is a dict with the keys ``delimiter``, ``num_lines``,
    ``show_graphics``, ``feature_importance``, ``num_clusters`` and
    ``preprocessing`` (keyword arguments forwarded to PyCaret's ``setup``).
    """
    with st.sidebar:
        image = Image.open('itaca_logo.png')
        st.image(image, width=150)
        page = option_menu(
            menu_title='Menu',
            menu_icon="robot",
            options=["Clustering Analysis", "Anomaly Detection"],
            icons=["chat-dots", "key"],
            default_index=0,
        )

        st.header("Settings")
        delimiter = st.selectbox("Choose a delimiter", [",", ";", "|"])
        num_lines = st.number_input("% of lines to be processed:",
                                    min_value=0, max_value=100, value=100)
        show_graphics = st.checkbox("Show Graphics", value=True)
        feature_importance = st.checkbox("Feature Importance", value=False)
        num_clusters = st.slider("Choose a number of clusters",
                                 min_value=2, max_value=10, value=4)
        remove_multicollinearity = st.checkbox("Remove Multicollinearity", value=False)
        multicollinearity_threshold = st.slider("Choose multicollinearity thresholds",
                                                min_value=0.0, max_value=1.0, value=0.9)
        transformation = st.checkbox("Choose Power Transform", value=False)
        normalize = st.checkbox("Choose Normalize", value=False)
        pca = st.checkbox("Choose PCA", value=False)
        pca_method = st.selectbox("Choose a PCA Method", ["linear", "kernel", "incremental"])

    settings = {
        "delimiter": delimiter,
        "num_lines": num_lines,
        "show_graphics": show_graphics,
        "feature_importance": feature_importance,
        "num_clusters": num_clusters,
        "preprocessing": {
            "remove_multicollinearity": remove_multicollinearity,
            "multicollinearity_threshold": multicollinearity_threshold,
            "transformation": transformation,
            "normalize": normalize,
            "pca": pca,
            "pca_method": pca_method,
        },
    }
    return page, settings


def _choose_source(directory="./"):
    """Render the CSV pickers and return ``(selected_csv, uploaded_file)``.

    ``selected_csv`` is a file name found in ``directory`` or the literal
    string ``"None"``; ``uploaded_file`` is the uploader's value or ``None``.
    """
    csv_files = [name for name in os.listdir(directory) if name.endswith(".csv")]
    selected_csv = st.selectbox("Select a CSV file from the list", ["None"] + csv_files)
    uploaded_file = st.file_uploader("Choose a CSV file", type="csv")
    return selected_csv, uploaded_file


def _read_dataset(selected_csv, uploaded_file, delimiter, directory="./"):
    """Load the chosen CSV into a DataFrame; an uploaded file takes precedence."""
    if uploaded_file:
        try:
            return pd.read_csv(uploaded_file, sep=delimiter)
        except ValueError:
            # UnicodeDecodeError is a ValueError. The failed attempt has already
            # consumed part of the buffer, so rewind before retrying as latin-1.
            uploaded_file.seek(0)
            return pd.read_csv(uploaded_file, sep=delimiter, encoding='latin-1')
    # Honour the chosen delimiter for local files too, consistent with uploads.
    return pd.read_csv(os.path.join(directory, selected_csv), sep=delimiter)


def _show_correlation_heatmap(frame):
    """Plot the correlation matrix of the non-object columns of ``frame``."""
    num_col = frame.select_dtypes(exclude=['object']).columns
    corr_matrix = frame[num_col].corr()
    if corr_matrix.empty:
        # seaborn cannot draw an empty matrix; tell the user instead of crashing.
        st.info("No numeric columns available for a correlation heatmap.")
        return
    fig, ax = plt.subplots(figsize=(12, 8))
    sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', fmt='.2f', ax=ax)
    ax.set_title('Correlation Heatmap')
    st.pyplot(fig)
    # Streamlit reruns the script on every interaction; close the figure so
    # pyplot does not keep one more figure alive per rerun.
    plt.close(fig)


def _prepare_dataset(selected_csv, uploaded_file, settings):
    """Load, trim and preview the dataset; return it, or ``None`` if unusable.

    Keeps the first ``num_lines`` percent of rows and the columns picked in
    the multiselect, then shows the describe table and correlation heatmap.
    """
    if selected_csv == "None" and uploaded_file is None:
        return None

    insurance_claims = _read_dataset(selected_csv, uploaded_file, settings["delimiter"])
    num_rows = int(insurance_claims.shape[0] * settings["num_lines"] / 100)
    reduced = insurance_claims.head(num_rows)
    st.write("Rows to be processed: " + str(num_rows))

    all_columns = reduced.columns.tolist()
    selected_columns = st.multiselect("Choose columns", all_columns, default=all_columns)
    reduced = reduced[selected_columns].copy()
    if reduced.empty:
        # Zero rows (0 %) or zero columns would make describe()/setup() fail.
        st.warning("Nothing to analyse: select at least one column "
                   "and a non-zero percentage of lines.")
        return None

    with st.expander("Inference Description", expanded=True):
        st.write(reduced.describe().T)
    with st.expander("Head Map", expanded=True):
        _show_correlation_heatmap(reduced)
    return reduced


def _summarise_clusters(labelled):
    """Return per-cluster summary statistics for the numeric columns."""
    grouped = labelled.groupby('Cluster')
    numeric_cols = labelled.select_dtypes(include='number').columns.tolist()
    if not numeric_cols:
        return grouped.size().to_frame('count')
    # Restrict to numeric columns: mean/std/skew are undefined for text columns.
    return grouped[numeric_cols].agg([
        'count', 'mean', 'median', 'min', 'max', 'std', 'var', 'sum',
        ('quantile_25', lambda x: x.quantile(0.25)),
        ('quantile_75', lambda x: x.quantile(0.75)),
        'skew',
    ])


def _show_feature_importance(labelled, target):
    """Fit a logistic regression on ``target`` and chart the top 10 |coefficients|."""
    from pycaret.classification import setup, create_model, get_config

    setup(labelled, target=target)
    lr = create_model('lr')
    # The estimator is fitted on the transformed training data, so the feature
    # names must come from the transformed frame to line up with coef_.
    feature_names = get_config('X_train_transformed').columns
    # NOTE(review): coef_[0] is only the first coefficient row; with more than
    # two classes the other rows are ignored - confirm this is intended.
    feat_imp = pd.DataFrame({'Feature': feature_names, 'Value': abs(lr.coef_[0])})
    feat_imp = feat_imp.sort_values(by='Value', ascending=False).head(10)
    st.bar_chart(feat_imp.set_index('Feature'))


def _clustering_page(settings):
    """Render the "Clustering Analysis" page."""
    # Imported lazily so only the PyCaret module for the active page is loaded.
    from pycaret.clustering import setup, create_model, assign_model, pull, plot_model

    st.header('Clustering Analysis')
    st.write("\n")

    selected_csv, uploaded_file = _choose_source()
    selected_model = st.selectbox("Choose a clustering model", _CLUSTERING_MODELS)
    data = _prepare_dataset(selected_csv, uploaded_file, settings)
    if data is None or not st.button("Prediction"):
        return

    # One functional-API setup is enough: create_model/assign_model/pull below
    # all operate on the experiment this call activates.
    setup(data, session_id=123, **settings["preprocessing"])
    with st.spinner("Analyzing..."):
        cluster_model = create_model(selected_model, num_clusters=settings["num_clusters"])
        labelled = assign_model(cluster_model)
        cluster_summary = _summarise_clusters(labelled)

        with st.expander("Cluster Summary", expanded=False):
            st.write(cluster_summary)
        with st.expander("Model Assign", expanded=False):
            st.write(labelled)
        with st.expander("Clustering Metrics", expanded=False):
            st.write(pull())
        with st.expander("Clustering Plots", expanded=False):
            if settings["show_graphics"]:
                for plot_name, unsupported in _CLUSTER_PLOTS:
                    if selected_model not in unsupported:
                        plot_model(cluster_model, plot=plot_name, display_format='streamlit')
        with st.expander("Feature Importance", expanded=False):
            if settings["show_graphics"] and settings["feature_importance"]:
                _show_feature_importance(labelled, target='Cluster')


def _anomaly_page(settings):
    """Render the "Anomaly Detection" page."""
    # Imported lazily so only the PyCaret module for the active page is loaded.
    from pycaret.anomaly import setup, create_model, assign_model, pull, plot_model

    st.header('Anomaly Detection')
    st.write("\n")

    selected_csv, uploaded_file = _choose_source()
    selected_model = st.selectbox("Choose an anomaly model", _ANOMALY_MODELS)
    data = _prepare_dataset(selected_csv, uploaded_file, settings)
    if data is None or not st.button("Prediction"):
        return

    # One functional-API setup is enough: create_model/assign_model/pull below
    # all operate on the experiment this call activates.
    setup(data, session_id=123, **settings["preprocessing"])
    with st.spinner("Analyzing..."):
        anomaly_model = create_model(selected_model)

        with st.expander("Assign Model", expanded=False):
            labelled = assign_model(anomaly_model)
            st.write(labelled)
        with st.expander("Anomaly Metrics", expanded=False):
            st.write(pull())
        with st.expander("Anomaly Plots", expanded=False):
            if settings["show_graphics"]:
                plot_model(anomaly_model, plot='tsne', display_format='streamlit')
                plot_model(anomaly_model, plot='umap', display_format='streamlit')
        with st.expander("Feature Importance", expanded=False):
            if settings["show_graphics"] and settings["feature_importance"]:
                _show_feature_importance(labelled, target='Anomaly')


def main():
    """Build the Streamlit app: page config, sidebar, then the selected page."""
    st.set_page_config(layout="wide")
    # Injected as raw HTML; currently only a newline (presumably a placeholder
    # for custom CSS - TODO confirm).
    hide_streamlit_style = "\n"
    st.markdown(hide_streamlit_style, unsafe_allow_html=True)

    page, settings = _render_sidebar()
    st.title('ITACA Insurance Core AI Module')

    if page == "Clustering Analysis":
        _clustering_page(settings)
    elif page == "Anomaly Detection":
        _anomaly_page(settings)
# Script entry: run the app and surface any unexpected failure in the sidebar
# instead of letting it propagate out of the script.
try:
    main()
except Exception as e:
    failure_message = f"An error occurred: {e}"
    st.sidebar.error(failure_message)