"""A script to generate all development files necessary for the project.

Running this script:

1. loads ``data/clean_data.csv`` and pre-processes it,
2. balances the classes with SMOTE and keeps the training part of a stratified split,
3. trains a ``MultiInputXGBClassifier`` and compiles it with three encrypted inputs,
4. (re)creates the deployment folder at ``DEPLOYMENT_PATH`` and saves the deployment files.
"""

import shutil

import numpy
import pandas
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE

# NOTE(review): this relative import only resolves when the file is executed as part of a
# package (e.g. with `python -m ...`); confirm this matches how the script is launched.
from ..settings import DEPLOYMENT_PATH, RANDOM_STATE
from client_server_interface import MultiInputsFHEModelDev
from model import MultiInputXGBClassifier
from development.pre_processing import pre_process_data

print("Load and pre-process the data")

data = pandas.read_csv("data/clean_data.csv", encoding="utf-8")

# Make median annual salary similar to France (2023): from 157500 to 22050
data["Total_income"] = data["Total_income"] * 0.14

# Remove ID feature
data = data.drop(columns=["ID"])

# Feature engineer the data
pre_processed_data, training_bins = pre_process_data(data)

# Define input and target data ("Target" is removed from the features by `pop`)
y = pre_processed_data.pop("Target")
x = pre_processed_data

# The initial data-set is very imbalanced: use SMOTE to get better results. The sampler is
# seeded with the same RANDOM_STATE as the split below so that the generated synthetic
# samples, and therefore the trained model, are reproducible from one run to the next
x, y = SMOTE(random_state=RANDOM_STATE).fit_resample(x, y)

# Retrieve the training data (the 30% test part of the split is not needed here)
X_train, _, y_train, _ = train_test_split(
    x, y, stratify=y, test_size=0.3, random_state=RANDOM_STATE
)

# Convert the Pandas data frames into Numpy arrays
X_train_np = X_train.to_numpy()
y_train_np = y_train.to_numpy()

print("Train and compile the model")

model = MultiInputXGBClassifier(max_depth=3, n_estimators=40)

model.fit(X_train_np, y_train_np)

# Split the training features column-wise into three groups, one per model input, and use
# them as the input-set for compilation; the three inputs are all marked as encrypted
multi_inputs_train = numpy.array_split(X_train_np, 3, axis=1)

model.compile(*multi_inputs_train, inputs_encryption_status=["encrypted", "encrypted", "encrypted"])

# Delete the deployment folder and its content if it already exists
if DEPLOYMENT_PATH.is_dir():
    shutil.rmtree(DEPLOYMENT_PATH)

print("Save deployment files")

# Save the files needed for deployment
fhe_dev = MultiInputsFHEModelDev(model, DEPLOYMENT_PATH)
fhe_dev.save()

print("Done !")