|
import sys |
|
import os |
|
|
|
import numpy as np |
|
import random |
|
import json |
|
import shutil |
|
import time |
|
from scipy.stats import pearsonr |
|
from sklearn.model_selection import GridSearchCV |
|
from sklearn.svm import LinearSVR as LinearSVR |
|
from sklearn.model_selection import KFold |
|
from chemdata import ( |
|
convert_numpy, |
|
LinearSVRZAMA, |
|
XGBRegressorZAMA, |
|
OnDiskNetwork, |
|
FHEModelDev, |
|
FHEModelClient, |
|
get_ECFP_AND_FEATURES, |
|
) |
|
import matplotlib.pyplot as plt |
|
import xgboost as xgb |
|
|
|
# Seed Python's built-in ``random`` module so anything drawing from it is
# repeatable between runs. NOTE(review): this does not seed NumPy's global
# RNG; the KFold splits below pin their own ``random_state=42`` instead.
random.seed(42)
|
|
|
|
|
def hyper_opt(X_train, y_train, param_grid, regressor, verbose=10):
    """Grid-search hyperparameters for a linear SVR or an XGBoost regressor.

    The search is a ``GridSearchCV`` over a shuffled 5-fold ``KFold``
    (``random_state=42``) run with ``n_jobs=-1``.

    Args:
        X_train: Training features, forwarded to ``GridSearchCV.fit``.
        y_train: Training targets, forwarded to ``GridSearchCV.fit``.
        param_grid: Parameter grid to search, or ``None`` to use the built-in
            default grid for the chosen regressor.
        regressor: ``"SVR"`` (``LinearSVR``) or ``"XGB"`` (``XGBRegressor``
            with the ``reg:squarederror`` objective).
        verbose: Verbosity level forwarded to ``GridSearchCV``.

    Returns:
        tuple: ``(best_params_, best_score_, best_estimator_)`` of the fitted
        search.

    Raises:
        ValueError: If ``regressor`` is neither ``"SVR"`` nor ``"XGB"``.
    """
    # Reject unsupported model names before building anything.
    if regressor not in ("SVR", "XGB"):
        raise ValueError("Unknown regressor type")

    if regressor == "SVR":
        estimator = LinearSVR()
        fallback_grid = {
            "epsilon": [1e-2, 1e-1, 0.5],
            "C": [1e-4, 1e-3, 1e-2, 1e-1],
            "loss": ["squared_epsilon_insensitive"],
            "tol": [0.0001],
            "max_iter": [50000],
            "dual": [True],
        }
    else:
        estimator = xgb.XGBRegressor(objective="reg:squarederror")
        fallback_grid = {
            "max_depth": [3, 6, 10],
            "learning_rate": [0.01, 0.1, 0.2],
            "n_estimators": [10, 20, 50, 100],
            "colsample_bytree": [0.3, 0.7],
        }

    # A caller-supplied grid always wins over the built-in one.
    search_space = fallback_grid if param_grid is None else param_grid

    search = GridSearchCV(
        estimator=estimator,
        param_grid=search_space,
        cv=KFold(n_splits=5, shuffle=True, random_state=42),
        verbose=verbose,
        n_jobs=-1,
    )
    search.fit(X_train, y_train)
    return search.best_params_, search.best_score_, search.best_estimator_
|
|
|
|
|
def train_xgb_regressor(X_train, y_train, param_grid=None, verbose=10):
    """Grid-search an ``XGBRegressor`` (``reg:squarederror`` objective).

    Uses ``GridSearchCV`` with a shuffled 5-fold ``KFold``
    (``random_state=42``) and ``n_jobs=-1``.

    Args:
        X_train: Training features, forwarded to ``GridSearchCV.fit``.
        y_train: Training targets, forwarded to ``GridSearchCV.fit``.
        param_grid: Parameter grid to search; ``None`` selects the small
            built-in grid defined in the body.
        verbose: Verbosity level forwarded to ``GridSearchCV``.

    Returns:
        tuple: ``(best_params_, best_score_, best_estimator_)`` of the fitted
        search.
    """
    search_space = param_grid
    if search_space is None:
        search_space = {
            "max_depth": [3, 6],
            "learning_rate": [0.01, 0.1, 0.2],
            "n_estimators": [20],
            "colsample_bytree": [0.3, 0.7],
        }

    search = GridSearchCV(
        estimator=xgb.XGBRegressor(objective="reg:squarederror"),
        param_grid=search_space,
        cv=KFold(n_splits=5, shuffle=True, random_state=42),
        verbose=verbose,
        n_jobs=-1,
    )
    search.fit(X_train, y_train)

    return search.best_params_, search.best_score_, search.best_estimator_
|
|
|
|
|
def evaluate_model(model, X_test, y_test):
    """Return Pearson's r between ``y_test`` and ``model.predict(X_test)``.

    Args:
        model: Any object exposing ``predict(X)``.
        X_test: Features handed to ``model.predict``.
        y_test: Reference targets the predictions are correlated against.

    Returns:
        The ``statistic`` field of ``scipy.stats.pearsonr``.
    """
    predictions = model.predict(X_test)
    correlation = pearsonr(y_test, predictions)
    return correlation.statistic
|
|
|
|
|
def performance_bits():
    """
    Test the model performance for different number of bits = feature vector length

    For every radius in ``[2, 3, 4]`` and every bit length ``2**4 .. 2**11``
    the data set is loaded, a LinearSVR is tuned with ``hyper_opt``, the best
    parameters are written to ``data/best_params_<bits>.json`` and Pearson's r
    on the test split is recorded. One curve per radius is drawn and the
    figure is saved to ``figures/performance_bits.png``.

    Returns:
        tuple: ``(bits, performance)`` where ``bits`` is the array of bit
        lengths and ``performance`` holds the Pearson's r scores of the
        *last* radius evaluated only (the list is rebuilt for each radius).
    """
    bits = np.array([2**i for i in range(4, 12)])

    # The search grid does not depend on radius or bit length: build it once.
    param_grid = {
        "epsilon": [0.0, 0.1, 0.2, 0.5, 1.0],
        "C": [0.1, 1, 10, 100],
        "loss": ["epsilon_insensitive", "squared_epsilon_insensitive"],
        "tol": [1e-4, 1e-3, 1e-2],
        "max_iter": [1000, 5000, 10000],
    }

    # exist_ok avoids the check-then-create race and repeated exists() calls
    # inside the loops.
    os.makedirs("data", exist_ok=True)

    plt.close("all")
    fig, ax = plt.subplots()

    for r in [2, 3, 4]:
        performance = []
        for bit in bits:
            # NOTE(review): load_data is neither defined nor imported in the
            # visible part of this file; it must be provided elsewhere.
            X_train, X_test, y_train, y_test = load_data(
                "LOG HLM_CLint (mL/min/kg)", bits=bit, radius=r
            )
            best_params, _, best_model = hyper_opt(
                X_train, y_train, param_grid, regressor="SVR", verbose=10
            )

            # NOTE(review): the file name carries only the bit length, so each
            # radius overwrites the parameters saved by the previous one.
            with open("data/best_params_{}.json".format(bit), "w") as fp:
                json.dump(best_params, fp, default=convert_numpy)

            performance.append(evaluate_model(best_model, X_test, y_test))

        performance = np.array(performance)
        ax.plot(bits, performance, marker="o", label=f"radius={r}")

    ax.set_xlabel("Number of Bits")
    ax.set_ylabel("Pearson's r Correlation Coefficient")
    ax.legend()
    # Address the axes/figure created above explicitly instead of relying on
    # pyplot's "current figure" state.
    ax.grid(True)
    os.makedirs("figures", exist_ok=True)
    fig.savefig("figures/performance_bits.png")

    return bits, performance
|
|
|
|
|
def predict_fhe(model, X_test):
    """Run ``model.predict`` on ``X_test`` with ``fhe="execute"``.

    Args:
        model: Object whose ``predict`` accepts an ``fhe`` keyword.
        X_test: Input forwarded unchanged to ``predict``.

    Returns:
        Whatever ``model.predict(X_test, fhe="execute")`` returns.
    """
    return model.predict(X_test, fhe="execute")
|
|
|
|
|
def setup_network(model_dev):
    """Create an on-disk network and save the development model into it.

    An ``OnDiskNetwork`` is instantiated, an ``FHEModelDev`` is built from the
    network's ``dev_dir.name`` and ``model_dev``, and its ``save()`` method is
    called.

    Args:
        model_dev: The model handed to ``FHEModelDev``.

    Returns:
        tuple: ``(network, fhemodel_dev)``.
    """
    on_disk_network = OnDiskNetwork()
    dev_directory = on_disk_network.dev_dir.name
    dev_model = FHEModelDev(dev_directory, model_dev)
    dev_model.save()
    return on_disk_network, dev_model
|
|
|
|
|
def copy_directory(source, destination="deployment"):
    """Copy the contents of ``source`` into ``destination``.

    Sub-directories are copied recursively (merging into existing ones) and
    files are copied with ``shutil.copy2``. The destination directory is
    created when missing.

    Args:
        source: Path of the directory whose contents are copied.
        destination: Target directory; defaults to ``"deployment"``.

    Returns:
        tuple: ``(True, None)`` on success, otherwise ``(False, message)``
        where ``message`` describes the failure. This function reports
        failures through the return value instead of raising.
    """
    try:
        # isdir (rather than exists) also rejects a regular file passed as
        # source, which would otherwise fail later with a raw OS error text.
        if not os.path.isdir(source):
            return False, "Source directory does not exist."

        # exist_ok avoids the check-then-create race.
        os.makedirs(destination, exist_ok=True)

        for item in os.listdir(source):
            s = os.path.join(source, item)
            d = os.path.join(destination, item)
            if os.path.isdir(s):
                shutil.copytree(s, d, dirs_exist_ok=True)
            else:
                shutil.copy2(s, d)

        return True, None

    except Exception as e:
        # Deliberate best-effort contract: callers inspect the returned
        # message rather than handling exceptions.
        return False, str(e)
|
|
|
|
|
def client_server_interaction(network, fhemodel_client, X_client):
    """Send every row of ``X_client`` through the encrypted prediction loop.

    For each row index the single-row slice ``X_client[[i], :]`` is passed to
    ``quantize_encrypt_serialize``, sent with
    ``client_send_input_to_server_for_prediction``, and the reply fetched via
    ``server_send_encrypted_prediction_to_client`` is decoded with
    ``deserialize_decrypt_dequantize``.

    Args:
        network: Object providing the two send/receive methods named above.
        fhemodel_client: Object providing the encrypt/decrypt methods above.
        X_client: Indexable with ``[[i], :]`` and exposing ``shape[0]``
            (presumably a 2-D NumPy array; confirm against callers).

    Returns:
        tuple: ``(decrypted_predictions, execution_time)``. The first list
        holds element ``[0]`` of each decoded reply; the second holds the
        return value of each send call (presumably a duration; confirm).
    """
    predictions = []
    timings = []
    row_count = X_client.shape[0]
    for row in range(row_count):
        # Index with a list so the sample keeps its leading batch axis.
        sample = X_client[[row], :]
        payload = fhemodel_client.quantize_encrypt_serialize(sample)
        elapsed = network.client_send_input_to_server_for_prediction(payload)
        timings.append(elapsed)
        reply = network.server_send_encrypted_prediction_to_client()
        decoded = fhemodel_client.deserialize_decrypt_dequantize(reply)
        predictions.append(decoded[0])
    return predictions, timings
|
|
|
|
|
def train_zama(X_train, y_train, best_params, regressor="SVR"):
    """Fit and compile a Zama model using previously tuned hyperparameters.

    Args:
        X_train: Training features; also passed to ``compile``.
        y_train: Training targets.
        best_params: Mapping of constructor keyword arguments. It is not
            modified; ``n_bits`` is set on a copy (12 for ``"SVR"``, 6 for
            ``"XGB"``), overriding any ``n_bits`` entry already present.
        regressor: ``"SVR"`` (``LinearSVRZAMA``) or ``"XGB"``
            (``XGBRegressorZAMA``).

    Returns:
        The fitted and compiled model.

    Raises:
        ValueError: If ``regressor`` is neither ``"SVR"`` nor ``"XGB"``.
    """
    if regressor == "SVR":
        model_cls, n_bits = LinearSVRZAMA, 12
    elif regressor == "XGB":
        model_cls, n_bits = XGBRegressorZAMA, 6
    else:
        # Fail early with the same error hyper_opt uses, instead of reaching
        # model_dev.fit with model_dev unbound.
        raise ValueError("Unknown regressor type")

    # Work on a copy so the caller's dict does not silently gain "n_bits".
    params = {**best_params, "n_bits": n_bits}
    model_dev = model_cls(**params)

    print("Training Zama model...")
    model_dev.fit(X_train, y_train)
    print("compiling model...")
    model_dev.compile(X_train)
    print("done")

    return model_dev
|
|
|
|
|
def time_prediction(model, X_sample):
    """Measure how long ``model.predict(X_sample, fhe="execute")`` takes.

    Args:
        model: Object whose ``predict`` accepts an ``fhe`` keyword.
        X_sample: Input forwarded unchanged to ``predict``.

    Returns:
        float: Elapsed time in seconds.
    """
    # perf_counter is monotonic and high-resolution, so the interval cannot
    # be distorted by system clock adjustments the way time.time() can.
    time_begin = time.perf_counter()
    model.predict(X_sample, fhe="execute")  # result is irrelevant, only timed
    return time.perf_counter() - time_begin
|
|
|
|
|
def setup_client(network, key_dir):
    """Create the client-side FHE model and generate its keys.

    An ``FHEModelClient`` is built from ``network.client_dir.name`` and
    ``key_dir``, ``generate_private_and_evaluation_keys()`` is called on it,
    and the serialized evaluation keys are fetched.

    Args:
        network: Object exposing ``client_dir.name``.
        key_dir: Passed to ``FHEModelClient`` as ``key_dir``.

    Returns:
        tuple: ``(fhemodel_client, serialized_evaluation_keys)``.
    """
    client_directory = network.client_dir.name
    client = FHEModelClient(client_directory, key_dir=key_dir)
    client.generate_private_and_evaluation_keys()
    evaluation_keys = client.get_serialized_evaluation_keys()
    return client, evaluation_keys
|
|
|
|
|
def compare_predictions(network, fhemodel_client, sklearn_model, X_client):
    """Compare encrypted predictions with those of a clear scikit-learn model.

    The encrypted predictions come from ``client_server_interaction``; the
    per-sample results are flattened into one array. Both prediction sets are
    printed together with their Pearson's r.

    Args:
        network: Forwarded to ``client_server_interaction``.
        fhemodel_client: Forwarded to ``client_server_interaction``.
        sklearn_model: Object exposing ``predict(X)``; its output must support
            ``.flatten()``.
        X_client: Samples evaluated by both models.

    Returns:
        tuple: ``(mae, pearsonr_score)``, the mean absolute difference and
        Pearson's r between the two flattened prediction vectors.
    """
    per_sample, _ = client_server_interaction(network, fhemodel_client, X_client)

    # Each entry is itself iterable; concatenate them into one flat sequence.
    flat_values = []
    for chunk in per_sample:
        flat_values.extend(chunk)
    fhe_predictions_decrypted = np.array(flat_values)

    sklearn_predictions = sklearn_model.predict(X_client)

    clear_vector = sklearn_predictions.flatten()
    fhe_vector = fhe_predictions_decrypted.flatten()

    mae = np.mean(np.abs(clear_vector - fhe_vector))
    pearsonr_score = pearsonr(clear_vector, fhe_vector).statistic

    print("sklearn_predictions")
    print(sklearn_predictions)
    print("fhe_predictions_decrypted:")
    print(fhe_predictions_decrypted)

    print(f"Pearson's r between sklearn and fhe predictions: {pearsonr_score:.2f}")

    return mae, pearsonr_score
|
|
|
|
|
def predict_ADME(network, fhemodel_client, molecule, bits=256, radius=2):
    """Predict for a single molecule through the encrypted client/server path.

    The molecule is featurised with ``get_ECFP_AND_FEATURES`` and reshaped to
    a single row, then encrypted, sent to the server, and the returned
    prediction is decrypted.

    Args:
        network: Object providing
            ``client_send_input_to_server_for_prediction`` and
            ``server_send_encrypted_prediction_to_client``.
        fhemodel_client: Object providing ``quantize_encrypt_serialize`` and
            ``deserialize_decrypt_dequantize``.
        molecule: Passed as-is to ``get_ECFP_AND_FEATURES`` (the caller in
            this file passes a SMILES string).
        bits: Forwarded to ``get_ECFP_AND_FEATURES``.
        radius: Forwarded to ``get_ECFP_AND_FEATURES``.

    Returns:
        numpy.ndarray: The decrypted prediction wrapped in a one-element list
        and converted to an array.
    """
    features = get_ECFP_AND_FEATURES(molecule, bits=bits, radius=radius)
    single_row = features.reshape(1, -1)

    payload = fhemodel_client.quantize_encrypt_serialize(single_row)
    # The value returned by the send call is not needed here.
    network.client_send_input_to_server_for_prediction(payload)
    reply = network.server_send_encrypted_prediction_to_client()
    decrypted = fhemodel_client.deserialize_decrypt_dequantize(reply)
    return np.array([decrypted])
|
|
|
|
|
def fit_final_model(HYPER=True):
    """Tune, train, compile and deploy the final model, then sanity-check it.

    Steps performed:
      1. Load the ``"LOG HLM_CLint (mL/min/kg)"`` task with 1024 bits and
         radius 2.
      2. If ``HYPER`` is true, grid-search a LinearSVR with ``hyper_opt``,
         write the best parameters to ``best_params.json`` and print the
         test-set Pearson's r.
      3. Read ``best_params.json`` back; when ``HYPER`` is false a clear
         ``LinearSVR`` is fitted from those parameters so a reference model
         exists for the comparison step.
      4. Train/compile the Zama model, time one encrypted prediction, set up
         the on-disk network, copy the dev directory, exchange model and
         keys, compare encrypted vs. clear predictions on the last 10 test
         samples and predict one example molecule.

    Args:
        HYPER: Whether to run the hyperparameter search first. When false, an
            existing ``best_params.json`` is required.

    Raises:
        SystemExit: If ``best_params.json`` cannot be read or parsed.
    """
    task = "LOG HLM_CLint (mL/min/kg)"
    bits, radius = 1024, 2
    # NOTE(review): load_data is neither defined nor imported in the visible
    # part of this file; it must be provided elsewhere.
    X_train, X_test, y_train, y_test = load_data(task, bits=bits, radius=radius)

    best_model = None
    if HYPER:
        param_grid = {
            "epsilon": [0.0, 0.05, 0.1, 0.15, 0.2, 0.25, 0.5, 0.75, 1.0],
            "C": [0.1, 0.5, 1, 5, 10, 50, 100],
            "loss": ["epsilon_insensitive", "squared_epsilon_insensitive"],
            "tol": [1e-5, 5e-5, 1e-4, 5e-4, 1e-3, 5e-3, 1e-2],
            # max_iter is an iteration count: use ints, not 1e4 / 2e4 floats.
            "max_iter": [5000, 10000, 20000],
        }
        best_params, _, best_model = hyper_opt(
            X_train, y_train, param_grid, regressor="SVR", verbose=10
        )
        with open("best_params.json", "w") as fp:
            json.dump(best_params, fp, default=convert_numpy)
        print(best_params)
        print(evaluate_model(best_model, X_test, y_test))

    try:
        with open("best_params.json", "r") as fp:
            best_params = json.load(fp)
    except (OSError, ValueError):
        # OSError: file missing/unreadable; ValueError covers JSONDecodeError.
        print(
            "No hyperparameter file found. Please run function with HYPER=True first."
        )
        # Exit with a non-zero status; sys.exit does not depend on the
        # interactive-only ``exit`` helper.
        sys.exit(1)
    print(best_params)

    # Parameter files written by earlier runs may hold max_iter as a float.
    if "max_iter" in best_params:
        best_params["max_iter"] = int(best_params["max_iter"])

    if best_model is None:
        # HYPER=False: no grid search ran, so build the clear reference model
        # from the stored parameters (needed by compare_predictions below).
        best_model = LinearSVR(**best_params).fit(X_train, y_train)

    model_dev = train_zama(X_train, y_train, best_params)

    # Slice instead of indexing so the single sample keeps its batch axis,
    # matching the one-row inputs used in client_server_interaction
    # (presumably predict expects 2-D input; confirm).
    prediction_time = time_prediction(model_dev, X_test[:1])
    print(f"Time to predict one sample: {prediction_time:.2f} seconds")

    network, _ = setup_network(model_dev)
    copied, error_message = copy_directory(network.dev_dir.name)
    if not copied:
        print(f"Error copying directory: {error_message}")

    network.dev_send_model_to_server()
    network.dev_send_clientspecs_and_modelspecs_to_client()

    fhemodel_client, serialized_evaluation_keys = setup_client(
        network, network.client_dir.name
    )
    print(f"Evaluation keys size: {len(serialized_evaluation_keys) / (10**6):.2f} MB")

    network.client_send_evaluation_key_to_server(serialized_evaluation_keys)

    mae_fhe, _ = compare_predictions(
        network, fhemodel_client, best_model, X_test[-10:]
    )
    print(f"MAE between sklearn and fhe predictions: {mae_fhe:.4f}")

    # predict_ADME is the single-molecule prediction helper defined in this
    # file; reuse the bits/radius the model was trained with.
    pred = predict_ADME(
        network, fhemodel_client, "CC(=O)OC1=CC=CC=C1C(=O)O", bits=bits, radius=radius
    )
    print(f"Prediction: {pred}")
|
|
|
|
|
if __name__ == "__main__":
    # Script entry point: run the full tune/train/deploy pipeline first ...
    fit_final_model(HYPER=True)
    # ... then the bit-length sweep; its results stay bound at module level.
    bits, performance = performance_bits()
|
|