"A script to generate all development files necessary for the project."

import shutil
import numpy
import pandas

from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE

from ..settings import DEPLOYMENT_PATH, RANDOM_STATE
from client_server_interface import MultiInputsFHEModelDev
from model import MultiInputXGBClassifier
from development.pre_processing import pre_process_data


print("Load and pre-process the data")

# NOTE: the path is relative, so the script must be run from the directory that contains
# the "data" folder
data = pandas.read_csv("data/clean_data.csv", encoding="utf-8")

# Make median annual salary similar to France (2023): from 157500 to 22050
data["Total_income"] = data["Total_income"] * 0.14

# Remove ID feature
data.drop("ID", axis=1, inplace=True)

# Feature engineer the data. The second returned value (the bins computed on the training
# data) is not used further in this script
pre_processed_data, training_bins = pre_process_data(data)

# Define input and target data ("pop" removes the target column from the features in place)
y = pre_processed_data.pop("Target")
x = pre_processed_data

# The initial data-set is very imbalanced: use SMOTE to get better results
# SMOTE generates its synthetic samples randomly: seed it with the same RANDOM_STATE as the
# split below, otherwise the resampled data (and thus the training set and the compiled
# model) would differ from one run to another
x, y = SMOTE(random_state=RANDOM_STATE).fit_resample(x, y)

# Retrieve the training data (the test part of the split is not needed here)
X_train, _, y_train, _ = train_test_split(
    x, y, stratify=y, test_size=0.3, random_state=RANDOM_STATE
)

# Convert the Pandas data frames into Numpy arrays
X_train_np = X_train.to_numpy()
y_train_np = y_train.to_numpy()


print("Train and compile the model")

# Number of separate inputs the model is compiled with
n_inputs = 3

model = MultiInputXGBClassifier(n_estimators=40, max_depth=3)
model.fit(X_train_np, y_train_np)

# Split the training features column-wise into one array per input; these arrays are the
# representative inputs given to the compilation step
multi_inputs_train = numpy.array_split(X_train_np, n_inputs, axis=1)

# Every input is flagged as encrypted
model.compile(
    *multi_inputs_train,
    inputs_encryption_status=["encrypted"] * n_inputs,
)

# Start from a clean state: remove any deployment folder (and its content) left by a
# previous run
deployment_dir_exists = DEPLOYMENT_PATH.is_dir()
if deployment_dir_exists:
    shutil.rmtree(DEPLOYMENT_PATH)


print("Save deployment files")

# Build the development-side helper around the compiled model, then save the files needed
# for deployment
fhe_dev = MultiInputsFHEModelDev(model, DEPLOYMENT_PATH)
fhe_dev.save()

print("Done !")