import shutil
import time

import gradio as gr
import joblib
import matplotlib

matplotlib.use("Agg")  # headless backend; must be set before importing pyplot
import matplotlib.pyplot as plt
import numpy as np
from sklearn.datasets import fetch_openml
from sklearn.ensemble import StackingRegressor
from sklearn.linear_model import RidgeCV
from sklearn.metrics import PredictionErrorDisplay
from sklearn.model_selection import cross_val_predict, cross_validate
from sklearn.utils import shuffle
from skops.hub_utils import download

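# Gradio demo: download three pretrained Ames-housing regressors from the
# Hugging Face Hub, stack them with a RidgeCV meta-learner, and compare their
# cross-validated performance against the stacked ensemble.
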
# Load the Ames housing dataset from OpenML and keep the same 20-feature,
# 600-sample subset used in the scikit-learn stacking example.
def load_ames_housing():
    df = fetch_openml(name="house_prices", as_frame=True, parser="pandas")
    X = df.data
    y = df.target

    features = [
        "YrSold",
        "HeatingQC",
        "Street",
        "YearRemodAdd",
        "Heating",
        "MasVnrType",
        "BsmtUnfSF",
        "Foundation",
        "MasVnrArea",
        "MSSubClass",
        "ExterQual",
        "Condition2",
        "GarageCars",
        "GarageType",
        "OverallQual",
        "TotalBsmtSF",
        "BsmtFinSF1",
        "HouseStyle",
        "MiscFeature",
        "MoSold",
    ]

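    # Keep only these 20 features (presumably the inputs the pretrained
    # Hub models were fitted on).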
    X = X.loc[:, features]
    X, y = shuffle(X, y, random_state=0)

    # Subsample for speed and log-transform the skewed sale prices,
    # matching the upstream scikit-learn example.
    X = X.iloc[:600]
    y = y.iloc[:600]
    return X, np.log(y)

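# Build and evaluate a stacking ensemble from three Hub-hosted models.
# Assumption: each repo id points at a skops model repo whose root contains
# a joblib-pickled pipeline named "model.pkl", as the code below expects.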
def stacked_model(model1, model2, model3):
    X, y = load_ames_housing()
    estimators = []
    for model in [model1, model2, model3]:
        # Download the repo into a temporary directory, load the pickled
        # pipeline, and clean up before fetching the next model.
        download(repo_id=model, dst="temp_dir")
        pipeline = joblib.load("temp_dir/model.pkl")
        estimators.append((model.split("/")[-1], pipeline))
        shutil.rmtree("temp_dir")

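    # RidgeCV serves as the final (meta) estimator that learns how to combine
    # the base models' out-of-fold predictions.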
    stacking_regressor = StackingRegressor(estimators=estimators, final_estimator=RidgeCV())

    # Plot and compare the performance of the single models and the stacked
    # model: one subplot per base estimator, plus one for the stacking regressor.

    fig, axs = plt.subplots(2, 2, figsize=(9, 7))
    axs = np.ravel(axs)

    for ax, (name, est) in zip(
        axs, estimators + [("Stacking Regressor", stacking_regressor)]
    ):
        scorers = {"R2": "r2", "MAE": "neg_mean_absolute_error"}

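        # Cross-validate each estimator for scores, then compute out-of-fold
        # predictions for the actual-vs-predicted plot.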
        start_time = time.time()
        scores = cross_validate(
            est, X, y, scoring=list(scorers.values()), n_jobs=-1, verbose=0
        )

        elapsed_time = time.time() - start_time

        y_pred = cross_val_predict(est, X, y, n_jobs=-1, verbose=0)
        scores = {
            key: (
                f"{np.abs(np.mean(scores[f'test_{value}'])):.2f} +- "
                f"{np.std(scores[f'test_{value}']):.2f}"
            )
            for key, value in scorers.items()
        }

        PredictionErrorDisplay.from_predictions(
            y_true=y,
            y_pred=y_pred,
            kind="actual_vs_predicted",
            ax=ax,
            scatter_kwargs={"alpha": 0.2, "color": "tab:blue"},
            line_kwargs={"color": "tab:red"},
        )
        ax.set_title(f"{name}\nEvaluation in {elapsed_time:.2f} seconds")

        # Plotting empty data with a blank marker is a trick to get the score
        # strings into the legend as plain text.
        for score_name, score in scores.items():
            ax.plot([], [], " ", label=f"{score_name}: {score}")
        ax.legend(loc="upper left")

    fig.suptitle("Single predictor versus stacked predictors")
    fig.tight_layout()
    fig.subplots_adjust(top=0.9)
    return fig

title = "Combine predictors using stacking"
with gr.Blocks(title=title) as demo:
    gr.Markdown(f"## {title}")
    gr.Markdown("""
    This app demonstrates combining 3 predictors trained on Ames housing dataset from OpenML using stacking and Ridge estimator as final estimator.  
    Stacking uses a meta-learning algorithm to learn how to combine the predictions from trained models. 
    The OpenML Ames housing dataset is a processed version of the 'Ames Iowa Housing' with 81 features.
    This app is developed based on [scikit-learn example](https://scikit-learn.org/stable/auto_examples/ensemble/plot_stack_predictors.html#sphx-glr-auto-examples-ensemble-plot-stack-predictors-py)
    """)

    model1 = gr.Textbox(label="Repo id of first model", value="haizad/ames-housing-random-forest-predictor")
    model2 = gr.Textbox(label="Repo id of second model", value="haizad/ames-housing-gbdt-predictor")
    model3 = gr.Textbox(label="Repo id of third model", value="haizad/ames-housing-lasso-predictor")
    plot = gr.Plot(label="Comparison of single predictor against stacked predictor")
    stack_btn = gr.Button("Stack")
    stack_btn.click(fn=stacked_model, inputs=[model1, model2, model3], outputs=[plot])

demo.launch()
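
# To run locally (assuming gradio, scikit-learn, skops, joblib, and matplotlib
# are installed), execute this file with Python and open the URL Gradio prints.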