|
import os |
|
import pandas as pd |
|
import numpy as np |
|
import seaborn as sns |
|
import matplotlib.pyplot as plt |
|
import matplotlib as mpl |
|
import pycaret |
|
import streamlit as st |
|
from streamlit_option_menu import option_menu |
|
import PIL |
|
from PIL import Image |
|
from PIL import ImageColor |
|
from PIL import ImageDraw |
|
from PIL import ImageFont |
|
|
|
# NOTE(review): the indentation of the original block was lost, so the nesting
# reconstructed here (which widgets live in the sidebar, which calls run inside
# the spinner) should be confirmed against the running app.

# Seed handed to every PyCaret ``setup`` call so reruns are reproducible.
_SESSION_ID = 123

# Kept at column 0 on purpose: indented HTML inside Markdown can be rendered
# as a code block instead of being interpreted.
_HIDE_STREAMLIT_STYLE = """
<style>
#MainMenu {visibility: hidden;}
footer {visibility: hidden;}
</style>
"""

_CLUSTERING_MODELS = ['kmeans', 'ap', 'meanshift', 'sc', 'hclust', 'dbscan', 'optics', 'birch']

_ANOMALY_MODELS = ['abod', 'cluster', 'cof', 'iforest', 'histogram', 'knn',
                   'lof', 'svm', 'pca', 'mcd', 'sod', 'sos']

# (plot name, clustering models for which that plot is skipped).
_CLUSTER_PLOTS = [
    ('cluster', ()),
    ('tsne', ('ap',)),
    ('elbow', ('ap', 'meanshift', 'dbscan', 'optics')),
    ('silhouette', ('ap', 'meanshift', 'sc', 'hclust', 'dbscan', 'optics')),
    ('distance', ('ap', 'sc', 'hclust', 'dbscan', 'optics', 'birch')),
    ('distribution', ('ap',)),
]


def _render_sidebar():
    """Draw the sidebar and return ``(page, settings)``.

    ``page`` is the label chosen in the option menu. ``settings`` is a dict
    with the keys ``num_lines``, ``show_graphics``, ``feature_importance``,
    ``num_clusters`` and ``preprocessing`` (keyword arguments forwarded to
    PyCaret's ``setup``).
    """
    with st.sidebar:
        # Close the image file handle once Streamlit has rendered it.
        with Image.open('itaca_logo.png') as logo:
            st.image(logo, width=150)

        page = option_menu(
            menu_title='Menu',
            menu_icon="robot",
            options=["Clustering Analysis", "Anomaly Detection"],
            icons=["chat-dots", "key"],
            default_index=0,
        )

        st.header("Settings")

        num_lines = st.text_input("% of lines to be processed:", value="100")
        show_graphics = st.checkbox("Show Graphics", value=True)
        feature_importance = st.checkbox("Feature Importance", value=False)

        num_clusters = st.slider("Choose a number of clusters",
                                 min_value=2, max_value=10, value=4)

        remove_multicollinearity = st.checkbox("Remove Multicollinearity", value=False)
        multicollinearity_threshold = st.slider("Choose multicollinearity thresholds",
                                                min_value=0.0, max_value=1.0, value=0.9)

        transformation = st.checkbox("Choose Power Transform", value=False)
        normalize = st.checkbox("Choose Normalize", value=False)
        pca = st.checkbox("Choose PCA", value=False)
        pca_method = st.selectbox("Choose a PCA Method", ["linear", "kernel", "incremental"])

    settings = {
        "num_lines": num_lines,
        "show_graphics": show_graphics,
        "feature_importance": feature_importance,
        "num_clusters": num_clusters,
        "preprocessing": {
            "remove_multicollinearity": remove_multicollinearity,
            "multicollinearity_threshold": multicollinearity_threshold,
            "transformation": transformation,
            "normalize": normalize,
            "pca": pca,
            "pca_method": pca_method,
        },
    }
    return page, settings


def _parse_percentage(raw_value):
    """Return ``raw_value`` as a float in ``(0, 100]``, or ``None`` if unusable.

    Values above 100 are capped at 100, which matches the original behaviour
    where ``DataFrame.head`` simply returned every row.
    """
    try:
        percentage = float(str(raw_value).strip())
    except ValueError:
        return None
    # ``not percentage > 0`` also rejects NaN, which compares false to everything.
    if not percentage > 0:
        return None
    return min(percentage, 100.0)


def _read_uploaded_csv(uploaded_file):
    """Parse an uploaded CSV, falling back to ``|``-separated latin-1.

    The first attempt may consume part of the upload before it raises, so the
    buffer is rewound before the second attempt; without the rewind the retry
    starts reading from wherever the failed parse stopped.
    """
    try:
        return pd.read_csv(uploaded_file, sep=',')
    except ValueError:
        uploaded_file.seek(0)
        return pd.read_csv(uploaded_file, sep='|', encoding='latin-1')


def _choose_data_source(directory):
    """Render the CSV picker and the uploader; return ``(selected_csv, uploaded_file)``."""
    csv_files = sorted(name for name in os.listdir(directory) if name.endswith(".csv"))
    selected_csv = st.selectbox("Select a CSV file from the list", ["None"] + csv_files)
    uploaded_file = st.file_uploader("Choose a CSV file", type="csv")
    return selected_csv, uploaded_file


def _load_dataset(selected_csv, uploaded_file, directory, num_lines):
    """Load the chosen CSV, keep the leading share of rows and the chosen columns.

    An uploaded file takes precedence over the file picked from ``directory``.
    Returns the reduced DataFrame, or ``None`` when there is nothing to process
    (no source chosen, invalid percentage, no rows or no columns left); in the
    latter cases a message is shown to the user.
    """
    if uploaded_file is not None:
        claims = _read_uploaded_csv(uploaded_file)
    elif selected_csv != "None":
        claims = pd.read_csv(os.path.join(directory, selected_csv))
    else:
        return None

    percentage = _parse_percentage(num_lines)
    if percentage is None:
        st.error("'% of lines to be processed' must be a number greater than 0.")
        return None

    num_rows = int(claims.shape[0] * percentage / 100)
    reduced = claims.head(num_rows)
    st.write("Rows to be processed: " + str(num_rows))

    all_columns = reduced.columns.tolist()
    selected_columns = st.multiselect("Choose columns", all_columns, default=all_columns)

    if num_rows == 0:
        st.warning("No rows selected; increase the percentage of lines to be processed.")
        return None
    if not selected_columns:
        st.warning("Select at least one column to continue.")
        return None
    return reduced[selected_columns].copy()


def _show_correlation_heatmap(data):
    """Plot the correlation matrix of the numeric/boolean columns of ``data``."""
    st.header("Heat Map")
    numeric = data.select_dtypes(include=['number', 'bool'])
    if numeric.shape[1] == 0:
        # seaborn cannot draw an empty matrix.
        st.info("No numeric columns selected; correlation heat map skipped.")
        return

    fig, ax = plt.subplots(figsize=(12, 8))
    try:
        sns.heatmap(numeric.corr(), annot=True, cmap='coolwarm', fmt='.2f', ax=ax)
        ax.set_title('Correlation Heatmap')
        st.pyplot(fig)
    finally:
        # Streamlit reruns the script on every interaction; release the figure
        # so open figures do not accumulate.
        plt.close(fig)


def _summarise_clusters(clustered):
    """Return per-cluster descriptive statistics of the numeric columns.

    Only numeric columns are aggregated because reductions such as ``mean``
    are not defined for text columns. Returns ``None`` when there is no
    numeric column to summarise.
    """
    numeric_columns = [
        column
        for column in clustered.select_dtypes(include='number').columns
        if column != 'Cluster'
    ]
    if not numeric_columns:
        return None
    return clustered.groupby('Cluster')[numeric_columns].agg(
        ['count', 'mean', 'median', 'min', 'max',
         'std', 'var', 'sum', ('quantile_25', lambda x: x.quantile(0.25)),
         ('quantile_75', lambda x: x.quantile(0.75)), 'skew'])


def _show_feature_importance(labelled_data, target, drop_columns=()):
    """Fit a logistic regression on ``target`` and chart the top 10 features.

    ``drop_columns`` lists columns removed before fitting (for example a score
    derived from the target, which would otherwise leak it).
    """
    st.header("Feature Importance")
    # Imported here because these names differ from the clustering/anomaly
    # functions of the same name used by the page renderers.
    from pycaret.classification import setup, create_model, get_config

    training_data = labelled_data.drop(columns=list(drop_columns), errors='ignore')
    setup(training_data, target=target, session_id=_SESSION_ID)
    lr = create_model('lr')

    # The model is fitted on the transformed features, so the names must come
    # from the transformed training frame to line up with ``coef_``.
    feature_names = get_config('X_train_transformed').columns
    # ``coef_`` has one row per class (a single row for binary targets);
    # averaging the magnitudes uses every class instead of only the first.
    importance = np.abs(np.atleast_2d(lr.coef_)).mean(axis=0)
    if len(feature_names) != len(importance):
        st.warning("Feature names and model coefficients do not line up; "
                   "feature importance skipped.")
        return

    feat_imp = (
        pd.DataFrame({'Feature': feature_names, 'Value': importance})
        .sort_values(by='Value', ascending=False)
        .head(10)
    )
    st.bar_chart(feat_imp.set_index('Feature'))


def _render_clustering_page(settings):
    """Render the "Clustering Analysis" page using the sidebar ``settings``."""
    st.header('Clustering Analysis')
    st.write(
        """
        """
    )

    from pycaret.clustering import setup, create_model, assign_model, pull, plot_model

    directory = "./"
    selected_csv, uploaded_file = _choose_data_source(directory)
    selected_model = st.selectbox("Choose a clustering model", _CLUSTERING_MODELS)

    data = _load_dataset(selected_csv, uploaded_file, directory, settings["num_lines"])
    if data is None:
        return

    st.header("Inference Description")
    st.write(data.describe().T)

    _show_correlation_heatmap(data)

    if not st.button("Prediction"):
        return

    # The original also built a separate ClusteringExperiment and ran setup on
    # it, but nothing ever read that object; it is dropped so the data is not
    # preprocessed twice.
    setup(data, session_id=_SESSION_ID, **settings["preprocessing"])

    with st.spinner("Analyzing..."):
        cluster_model = create_model(selected_model, num_clusters=settings["num_clusters"])
        cluster_results = pull()
        clustered = assign_model(cluster_model)
        cluster_summary = _summarise_clusters(clustered)

    st.header("Cluster Summary")
    if cluster_summary is None:
        st.info("No numeric columns available to summarise.")
    else:
        st.write(cluster_summary)

    st.header("Assign Model")
    st.write(clustered)

    st.header("Clustering Metrics")
    st.write(cluster_results)

    if settings["show_graphics"]:
        st.header("Clustering Plots")
        for plot_name, skipped_models in _CLUSTER_PLOTS:
            if selected_model not in skipped_models:
                plot_model(cluster_model, plot=plot_name, display_format='streamlit')

    if settings["feature_importance"]:
        _show_feature_importance(clustered, target='Cluster')


def _render_anomaly_page(settings):
    """Render the "Anomaly Detection" page using the sidebar ``settings``."""
    st.header('Anomaly Detection')
    st.write(
        """
        """
    )

    from pycaret.anomaly import setup, create_model, assign_model, pull, plot_model

    directory = "./"
    selected_csv, uploaded_file = _choose_data_source(directory)
    selected_model = st.selectbox("Choose an anomaly model", _ANOMALY_MODELS)

    data = _load_dataset(selected_csv, uploaded_file, directory, settings["num_lines"])
    if data is None:
        return

    if not st.button("Prediction"):
        return

    # As on the clustering page, the unused AnomalyExperiment setup of the
    # original is dropped to avoid preprocessing the data twice.
    setup(data, session_id=_SESSION_ID, **settings["preprocessing"])

    with st.spinner("Analyzing..."):
        anomaly_model = create_model(selected_model)
        anomaly_results = pull()
        scored = assign_model(anomaly_model)

    st.header("Assign Model")
    st.write(scored)

    st.header("Anomaly Metrics")
    st.write(anomaly_results)

    if settings["show_graphics"]:
        st.header("Anomaly Plots")
        plot_model(anomaly_model, plot='tsne', display_format='streamlit')
        plot_model(anomaly_model, plot='umap', display_format='streamlit')

    if settings["feature_importance"]:
        # NOTE(review): assign_model is expected to add an 'Anomaly_Score'
        # column next to 'Anomaly'; it is excluded so the classifier cannot
        # read the label off the score. Confirm against the PyCaret version in use.
        _show_feature_importance(scored, target='Anomaly', drop_columns=('Anomaly_Score',))


def main():
    """Entry point of the Streamlit app.

    Hides Streamlit's default menu and footer, draws the sidebar, and renders
    the page selected there ("Clustering Analysis" or "Anomaly Detection").
    """
    st.markdown(_HIDE_STREAMLIT_STYLE, unsafe_allow_html=True)

    page, settings = _render_sidebar()

    st.title('ITACA Insurance Core AI Module')

    if page == "Clustering Analysis":
        _render_clustering_page(settings)
    elif page == "Anomaly Detection":
        _render_anomaly_page(settings)
|
# Script entry: run the app and surface any uncaught exception as an error
# message in the sidebar.
try:
    main()
except Exception as exc:  # top-level boundary: report the failure to the user
    st.sidebar.error(f"An error occurred: {exc}")