""" | |
scikit-learn models on Hugging Face Hub | |
--------------------------------------- | |
This guide demonstrates how you can use this package to create a Hugging Face | |
Hub model repository based on a scikit-learn compatible model, and how to | |
fetch scikit-learn compatible models from the Hub and run them locally. | |
""" | |
# %% | |
# Imports | |
# ======= | |
# First we will import everything required for the rest of this document. | |
import json | |
import os | |
import pickle | |
from pathlib import Path | |
from tempfile import mkdtemp, mkstemp | |
from uuid import uuid4 | |
import sklearn | |
from huggingface_hub import HfApi | |
from sklearn.datasets import load_breast_cancer | |
from sklearn.ensemble import HistGradientBoostingClassifier | |
from sklearn.experimental import enable_halving_search_cv # noqa | |
from sklearn.model_selection import HalvingGridSearchCV, train_test_split | |
from skops import card, hub_utils | |
# %% | |
# Data | |
# ==== | |
# Then we create some random data to train and evaluate our model. | |
X, y = load_breast_cancer(as_frame=True, return_X_y=True) | |
X_train, X_test, y_train, y_test = train_test_split( | |
X, y, test_size=0.3, random_state=42 | |
) | |
print("X's summary: ", X.describe()) | |
print("y's summary: ", y.describe()) | |
# %% | |
# Train a Model | |
# ============= | |
# Using the above data, we train a model. To select the model, we use | |
# :class:`~sklearn.model_selection.HalvingGridSearchCV` with a parameter grid | |
# over :class:`~sklearn.ensemble.HistGradientBoostingClassifier`. | |
param_grid = { | |
"max_leaf_nodes": [5, 10, 15], | |
"max_depth": [2, 5, 10], | |
} | |
model = HalvingGridSearchCV( | |
estimator=HistGradientBoostingClassifier(), | |
param_grid=param_grid, | |
random_state=42, | |
n_jobs=-1, | |
).fit(X_train, y_train) | |
model.score(X_test, y_test) | |
# %% | |
# Initialize a Model Repo | |
# ======================= | |
# We now initialize a model repository locally, and push it to the hub. For | |
# that, we need to first store the model as a pickle file and pass it to the | |
# hub tools. | |
# The file name is not significant, here we choose to save it with a `pkl` | |
# extension. | |
_, pkl_name = mkstemp(prefix="skops-", suffix=".pkl") | |
with open(pkl_name, mode="bw") as f: | |
pickle.dump(model, file=f) | |
local_repo = mkdtemp(prefix="skops-") | |
hub_utils.init( | |
model=pkl_name, | |
requirements=[f"scikit-learn={sklearn.__version__}"], | |
dst=local_repo, | |
task="tabular-classification", | |
data=X_test, | |
) | |
if "__file__" in locals(): # __file__ not defined during docs built | |
# Add this script itself to the files to be uploaded for reproducibility | |
hub_utils.add_files(__file__, dst=local_repo) | |
# %% | |
# We can no see what the contents of the created local repo are: | |
print(os.listdir(local_repo)) | |
# %% | |
# Model Card | |
# ========== | |
# We will now create a model card and save it. For more information about how | |
# to create a good model card, refer to the :ref:`model card example | |
# <sphx_glr_auto_examples_plot_model_card.py>`. The following code uses | |
# :func:`~skops.card.metadata_from_config` which creates a minimal metadata | |
# object to be included in the metadata section of the model card. The | |
# configuration used by this method is stored in the ``config.json`` file which | |
# is created by the call to :func:`~skops.hub_utils.init`. | |
model_card = card.Card(model, metadata=card.metadata_from_config(Path(local_repo))) | |
model_card.save(Path(local_repo) / "README.md") | |
# %% | |
# Push to Hub | |
# =========== | |
# And finally, we can push the model to the hub. This requires a user access | |
# token which you can get under https://huggingface.co/settings/tokens | |
# you can put your own token here, or set it as an environment variable before | |
# running this script. | |
token = os.environ["HF_HUB_TOKEN"] | |
repo_name = f"hf_hub_example-{uuid4()}" | |
user_name = HfApi().whoami(token=token)["name"] | |
repo_id = f"{user_name}/{repo_name}" | |
print(f"Creating and pushing to repo: {repo_id}") | |
# %% | |
# Now we can push our files to the repo. The following function creates the | |
# remote repository if it doesn't exist; this is controlled via the | |
# ``create_remote`` argument. Note that here we're setting ``private=True``, | |
# which means only people with the right permissions would see the model. Set | |
# ``private=False`` to make it visible to the public. | |
hub_utils.push( | |
repo_id=repo_id, | |
source=local_repo, | |
token=token, | |
commit_message="pushing files to the repo from the example!", | |
create_remote=True, | |
private=True, | |
) | |
# %% | |
# Once uploaded, other users can download and use it, unless you make the repo | |
# private. Given a repository's name, here's how one can download it: | |
repo_copy = mkdtemp(prefix="skops") | |
hub_utils.download(repo_id=repo_id, dst=repo_copy, token=token) | |
print(os.listdir(repo_copy)) | |
# %% | |
# You can also get the requirements of this repository: | |
print(hub_utils.get_requirements(path=repo_copy)) | |
# %% | |
# As well as the complete configuration of the project: | |
print(json.dumps(hub_utils.get_config(path=repo_copy), indent=2)) | |
# %% | |
# Now you can check the contents of the repository under your user. | |
# | |
# Update Requirements | |
# =================== | |
# If you update your environment and the versions of your requirements are | |
# changed, you can update the requirement in your repo by calling | |
# ``update_env``, which automatically detects the existing installation of the | |
# current environment and updates the requirements accordingly. | |
hub_utils.update_env(path=local_repo, requirements=["scikit-learn"]) | |
# %% | |
# Delete Repository | |
# ================= | |
# At the end, you can also delete the repository you created using | |
# ``HfApi().delete_repo``. For more information please refer to the | |
# documentation of ``huggingface_hub`` library. | |
#HfApi().delete_repo(repo_id=repo_id, token=token) | |