import os import pandas as pd import numpy import pickle import pefile import sklearn.ensemble as ek from sklearn.feature_selection import SelectFromModel import joblib from sklearn.tree import DecisionTreeClassifier from sklearn.metrics import confusion_matrix from sklearn import svm import sklearn.metrics as metrics from sklearn.model_selection import train_test_split import pdb from tqdm import tqdm dataset = pd.read_csv("data.csv", sep="|") # Feature X = dataset.drop( ["Name", "md5", "legitimate"], axis=1 ).values # Droping this because classification model will not accept object type elements (float and int only) # Target variable ugly = [ "Machine", "SizeOfOptionalHeader", "Characteristics", "MajorLinkerVersion", "MinorLinkerVersion", "SizeOfCode", "SizeOfInitializedData", "SizeOfUninitializedData", "AddressOfEntryPoint", "BaseOfCode", "BaseOfData", "ImageBase", "SectionAlignment", "FileAlignment", "MajorOperatingSystemVersion", "MinorOperatingSystemVersion", "MajorImageVersion", "MinorImageVersion", "MajorSubsystemVersion", "MinorSubsystemVersion", "SizeOfImage", "SizeOfHeaders", "CheckSum", "Subsystem", "DllCharacteristics", "SizeOfStackReserve", "SizeOfStackCommit", "SizeOfHeapReserve", "SizeOfHeapCommit", "LoaderFlags", "NumberOfRvaAndSizes", "SectionsNb", "SectionsMeanEntropy", "SectionsMinEntropy", "SectionsMaxEntropy", "SectionsMeanRawsize", "SectionsMinRawsize", #"SectionsMaxRawsize", "SectionsMeanVirtualsize", "SectionsMinVirtualsize", "SectionMaxVirtualsize", "ImportsNbDLL", "ImportsNb", "ImportsNbOrdinal", "ExportNb", "ResourcesNb", "ResourcesMeanEntropy", "ResourcesMinEntropy", "ResourcesMaxEntropy", "ResourcesMeanSize", "ResourcesMinSize", "ResourcesMaxSize", "LoadConfigurationSize", "VersionInformationSize", ] X = dataset[ugly].values y = dataset["legitimate"].values extratrees = ek.ExtraTreesClassifier().fit(X[:1000], y[:1000]) model = SelectFromModel(extratrees, prefit=True) X_new = model.transform(X) nbfeatures = X_new.shape[1] # splitting the data (70% - training and 30% - testing) X_train, X_test, y_train, y_test = train_test_split( X_new, y, test_size=0.29, stratify=y ) features = [] index = numpy.argsort(extratrees.feature_importances_)[::-1][:nbfeatures] for f in range(nbfeatures): print( "%d. feature %s (%f)" % ( f + 1, dataset.columns[2 + index[f]], extratrees.feature_importances_[index[f]], ) ) features.append(dataset.columns[2 + f]) model = { "DecisionTree": DecisionTreeClassifier(max_depth=10), "RandomForest": ek.RandomForestClassifier(n_estimators=50), } results = {} for algo in model: clf = model[algo] clf.fit(X_train, y_train) score = clf.score(X_test, y_test) print("%s : %s " % (algo, score)) results[algo] = score winner = max(results, key=results.get) # Selecting the classifier with good result print("Using", winner, "for classification, with", len(features), "features.") joblib.dump(model[winner], "classifier.pkl") open("features.pkl", "wb").write(pickle.dumps(features)) from fhe_utils import ( client_server_interaction, train_zama, setup_network, copy_directory, setup_client, ) model_dev_fhe = train_zama(X_train, y_train) #pdb.set_trace() network, _ = setup_network(model_dev_fhe) copied, error_message = copy_directory(network.dev_dir.name, destination="fhe_model") if not copied: print(f"Error copying directory: {error_message}") network.dev_send_model_to_server() network.dev_send_clientspecs_and_modelspecs_to_client() fhemodel_client, serialized_evaluation_keys = setup_client( network, network.client_dir.name ) print(f"Evaluation keys size: {len(serialized_evaluation_keys)} B") network.client_send_evaluation_key_to_server(serialized_evaluation_keys) decrypted_predictions, execution_time = client_server_interaction(network, fhemodel_client, X_test[:100])