# molvault/regress_utils.py
import sys
import os
import numpy as np
import random
import json
import shutil
import time
from scipy.stats import pearsonr
from sklearn.model_selection import GridSearchCV
from sklearn.svm import LinearSVR
from sklearn.model_selection import KFold
from chemdata import (
    convert_numpy,
    LinearSVRZAMA,
    XGBRegressorZAMA,
    OnDiskNetwork,
    FHEModelDev,
    FHEModelClient,
    get_ECFP_AND_FEATURES,
    load_data,  # used below but missing from the original import; assumed to live in chemdata alongside the other helpers
)
import matplotlib.pyplot as plt
import xgboost as xgb
random.seed(42)
def hyper_opt(X_train, y_train, param_grid, regressor, verbose=10):
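    """
    Run a 5-fold cross-validated grid search for a LinearSVR ("SVR") or
    XGBoost ("XGB") regressor. If param_grid is None, a default grid for the
    chosen regressor is used. Returns (best_params, best_score, best_estimator).
    """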
if regressor == "SVR":
if param_grid is None:
param_grid = {
"epsilon": [1e-2, 1e-1, 0.5],
"C": [1e-4,1e-3, 1e-2, 1e-1],
"loss": ["squared_epsilon_insensitive"],
"tol": [0.0001],
"max_iter": [50000],
"dual": [True],
}
regressor_fct = LinearSVR()
elif regressor == "XGB":
if param_grid is None:
param_grid = {
"max_depth": [3, 6, 10],
"learning_rate": [0.01, 0.1, 0.2],
"n_estimators": [10, 20, 50, 100],
"colsample_bytree": [0.3, 0.7],
}
regressor_fct = xgb.XGBRegressor(objective="reg:squarederror")
else:
raise ValueError("Unknown regressor type")
kfold = KFold(n_splits=5, shuffle=True, random_state=42)
grid_search = GridSearchCV(
estimator=regressor_fct,
param_grid=param_grid,
cv=kfold,
verbose=verbose,
n_jobs=-1,
)
grid_search.fit(X_train, y_train)
return (
grid_search.best_params_,
grid_search.best_score_,
grid_search.best_estimator_,
)
def train_xgb_regressor(X_train, y_train, param_grid=None, verbose=10):
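    """
    Grid-search an XGBoost regressor with 5-fold cross-validation and return
    (best_params, best_score, best_estimator).
    """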
if param_grid is None:
param_grid = {
"max_depth": [3, 6],
"learning_rate": [0.01, 0.1, 0.2],
"n_estimators": [20],
"colsample_bytree": [0.3, 0.7],
}
xgb_regressor = xgb.XGBRegressor(objective="reg:squarederror")
kfold = KFold(n_splits=5, shuffle=True, random_state=42)
grid_search = GridSearchCV(
estimator=xgb_regressor,
param_grid=param_grid,
cv=kfold,
verbose=verbose,
n_jobs=-1,
)
grid_search.fit(X_train, y_train)
return (
grid_search.best_params_,
grid_search.best_score_,
grid_search.best_estimator_,
)
def evaluate_model(model, X_test, y_test):
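    """Return Pearson's r between the model's predictions on X_test and y_test."""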
y_pred = model.predict(X_test)
pearsonr_score = pearsonr(y_test, y_pred).statistic
return pearsonr_score
def performance_bits():
"""
Test the model performance for different number of bits = feature vector length
"""
bits = np.array([2**i for i in range(4, 12)])
plt.close("all")
fig, ax = plt.subplots()
for r in [2, 3, 4]:
performance = []
for bit in bits:
X_train, X_test, y_train, y_test = load_data(
"LOG HLM_CLint (mL/min/kg)", bits=bit, radius=r
)
param_grid = {
"epsilon": [0.0, 0.1, 0.2, 0.5, 1.0],
"C": [0.1, 1, 10, 100],
"loss": ["epsilon_insensitive", "squared_epsilon_insensitive"],
"tol": [1e-4, 1e-3, 1e-2],
"max_iter": [1000, 5000, 10000],
}
best_params, best_score, best_model = hyper_opt(
X_train, y_train, param_grid, regressor="SVR", verbose=10
)
if not os.path.exists("data"):
os.makedirs("data")
with open("data/best_params_{}.json".format(bit), "w") as fp:
json.dump(best_params, fp, default=convert_numpy)
pearsonr_score = evaluate_model(best_model, X_test, y_test)
performance.append(pearsonr_score)
performance = np.array(performance)
ax.plot(bits, performance, marker="o", label=f"radius={r}")
ax.set_xlabel("Number of Bits")
ax.set_ylabel("Pearson's r Correlation Coefficient")
ax.legend()
plt.grid(True)
if not os.path.exists("figures"):
os.makedirs("figures")
plt.savefig("figures/performance_bits.png")
return bits, performance
def predict_fhe(model, X_test):
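    """Predict with a compiled model using actual FHE execution (fhe="execute")."""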
y_pred_fhe = model.predict(X_test, fhe="execute")
return y_pred_fhe
def setup_network(model_dev):
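    """
    Create an OnDiskNetwork and save the compiled model into its dev directory
    via FHEModelDev. Returns the network and the dev-side model wrapper.
    """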
network = OnDiskNetwork()
fhemodel_dev = FHEModelDev(network.dev_dir.name, model_dev)
fhemodel_dev.save()
return network, fhemodel_dev
def copy_directory(source, destination="deployment"):
try:
# Check if the source directory exists
if not os.path.exists(source):
return False, "Source directory does not exist."
# Check if the destination directory exists
if not os.path.exists(destination):
os.makedirs(destination)
# Copy each item in the source directory
for item in os.listdir(source):
s = os.path.join(source, item)
d = os.path.join(destination, item)
if os.path.isdir(s):
shutil.copytree(
s, d, dirs_exist_ok=True
) # dirs_exist_ok is available from Python 3.8
else:
shutil.copy2(s, d)
return True, None
except Exception as e:
return False, str(e)
def client_server_interaction(network, fhemodel_client, X_client):
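    """
    Encrypt each row of X_client, send it to the server for prediction, and
    decrypt the returned result. Returns the decrypted predictions and the
    per-sample values returned by the network send call (execution times).
    """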
decrypted_predictions = []
execution_time = []
for i in range(X_client.shape[0]):
clear_input = X_client[[i], :]
encrypted_input = fhemodel_client.quantize_encrypt_serialize(clear_input)
execution_time.append(
network.client_send_input_to_server_for_prediction(encrypted_input)
)
encrypted_prediction = network.server_send_encrypted_prediction_to_client()
decrypted_prediction = fhemodel_client.deserialize_decrypt_dequantize(
encrypted_prediction
)[0]
decrypted_predictions.append(decrypted_prediction)
return decrypted_predictions, execution_time
def train_zama(X_train, y_train, best_params, regressor="SVR"):
if regressor == "SVR":
best_params["n_bits"] = 12
model_dev = LinearSVRZAMA(**best_params)
elif regressor == "XGB":
best_params["n_bits"] = 6
model_dev = XGBRegressorZAMA(**best_params)
print("Training Zama model...")
model_dev.fit(X_train, y_train)
print("compiling model...")
model_dev.compile(X_train)
print("done")
return model_dev
def time_prediction(model, X_sample):
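    """Return the wall-clock time in seconds for one FHE prediction on X_sample."""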
time_begin = time.time()
y_pred_fhe = model.predict(X_sample, fhe="execute")
time_end = time.time()
return time_end - time_begin
def setup_client(network, key_dir):
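    """
    Create an FHEModelClient from the network's client directory, generate its
    private and evaluation keys, and return the client together with the
    serialized evaluation keys.
    """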
fhemodel_client = FHEModelClient(network.client_dir.name, key_dir=key_dir)
fhemodel_client.generate_private_and_evaluation_keys()
serialized_evaluation_keys = fhemodel_client.get_serialized_evaluation_keys()
return fhemodel_client, serialized_evaluation_keys
def compare_predictions(network, fhemodel_client, sklearn_model, X_client):
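    """
    Run FHE inference on X_client through the client/server loop and compare it
    against the clear-text sklearn model. Returns (MAE, Pearson's r) between the
    two sets of predictions.
    """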
fhe_predictions_decrypted, _ = client_server_interaction(
network, fhemodel_client, X_client
)
fhe_predictions_decrypted = [
item for sublist in fhe_predictions_decrypted for item in sublist
]
fhe_predictions_decrypted = np.array(fhe_predictions_decrypted)
sklearn_predictions = sklearn_model.predict(X_client)
    mae = np.mean(
        np.abs(sklearn_predictions.flatten() - fhe_predictions_decrypted.flatten())
    )
    pearsonr_score = pearsonr(
        sklearn_predictions.flatten(), fhe_predictions_decrypted.flatten()
    ).statistic
print("sklearn_predictions")
print(sklearn_predictions)
print("fhe_predictions_decrypted:")
print(fhe_predictions_decrypted)
print("Pearson's r between sklearn and fhe predictions: " f"{pearsonr_score:.2f}")
return mae, pearsonr_score
def predict_ADME(network, fhemodel_client, molecule, bits=256, radius=2):
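    """
    Featurize a SMILES string (ECFP fingerprint plus additional features),
    encrypt it, request an encrypted prediction from the server, and return
    the decrypted value.
    """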
encodings = get_ECFP_AND_FEATURES(molecule, bits=bits, radius=radius).reshape(1, -1)
enc_inp = fhemodel_client.quantize_encrypt_serialize(encodings)
network.client_send_input_to_server_for_prediction(enc_inp)
encrypted_prediction = network.server_send_encrypted_prediction_to_client()
decrypted_prediction = fhemodel_client.deserialize_decrypt_dequantize(
encrypted_prediction
)
return np.array([decrypted_prediction])
def fit_final_model(HYPER=True):
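    """
    End-to-end pipeline for the HLM clearance task: optional hyperparameter
    search for LinearSVR, training and compiling the FHE (Zama) model, setting
    up the on-disk client/server deployment, and checking FHE predictions
    against the clear sklearn model.
    """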
task = "LOG HLM_CLint (mL/min/kg)"
bits, radius = 1024, 2
X_train, X_test, y_train, y_test = load_data(task, bits=bits, radius=radius)
if HYPER:
param_grid = {
"epsilon": [0.0, 0.05, 0.1, 0.15, 0.2, 0.25, 0.5, 0.75, 1.0],
"C": [0.1, 0.5, 1, 5, 10, 50, 100],
"loss": ["epsilon_insensitive", "squared_epsilon_insensitive"],
"tol": [1e-5, 5e-5, 1e-4, 5e-4, 1e-3, 5e-3, 1e-2],
"max_iter": [5000, 1e4, 2e4],
}
best_params, best_score, best_model = hyper_opt(
X_train, y_train, param_grid, regressor="SVR", verbose=10
)
with open("best_params.json", "w") as fp:
json.dump(best_params, fp, default=convert_numpy)
print(best_params)
pearsonr_score = evaluate_model(best_model, X_test, y_test)
print(pearsonr_score)
    try:
        with open("best_params.json", "r") as fp:
            best_params = json.load(fp)
        print(best_params)
    except FileNotFoundError:
        print(
            "No hyperparameter file found. Please run this function with HYPER=True first."
        )
        sys.exit(1)
model_dev = train_zama(X_train, y_train, best_params)
    # keep the sample 2-D, since predict expects a batch of shape (1, n_features)
    prediction_time = time_prediction(model_dev, X_test[[0], :])
print(f"Time to predict one sample: {prediction_time:.2f} seconds")
network, fhemodel_dev = setup_network(model_dev)
copied, error_message = copy_directory(network.dev_dir.name)
if not copied:
print(f"Error copying directory: {error_message}")
network.dev_send_model_to_server()
network.dev_send_clientspecs_and_modelspecs_to_client()
fhemodel_client, serialized_evaluation_keys = setup_client(
network, network.client_dir.name
)
print(f"Evaluation keys size: {len(serialized_evaluation_keys) / (10**6):.2f} MB")
network.client_send_evaluation_key_to_server(serialized_evaluation_keys)
mae_fhe, pearsonr_score_fhe = compare_predictions(
network, fhemodel_client, best_model, X_test[-10:]
)
    pred = predict_ADME(
        network, fhemodel_client, "CC(=O)OC1=CC=CC=C1C(=O)O", bits=1024, radius=2
    )
print(f"Prediction: {pred}")
if __name__ == "__main__":
fit_final_model(HYPER=True)
bits, performance = performance_bits()