File size: 1,991 Bytes
3b271ab
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72

import joblib
import pandas as pd

from sklearn.datasets import fetch_openml

from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.model_selection import train_test_split, RandomizedSearchCV, cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.ensemble import RandomForestRegressor

# Train a random-forest regressor that predicts the 'charges' column of
# insurance.csv from the six feature columns listed below, report hold-out
# metrics, and persist the fitted pipeline to model.joblib.

# One seed shared by every stochastic step (train/test split, forest
# construction, hyper-parameter sampling) so repeated runs give the same model.
RANDOM_STATE = 1

print("Loading the csv")
df = pd.read_csv("insurance.csv")

numerical_columns = ['age', 'bmi', 'children']
categorical_columns = ['sex', 'smoker', 'region']
target = 'charges'


print("Splitting the data")
x = df[numerical_columns + categorical_columns]
y = df[target]

# Hold out 20% of the rows for the final evaluation below.
x_train, x_test, y_train, y_test = train_test_split(
    x, y,
    test_size=0.2,
    random_state=RANDOM_STATE,  # IMPORTANT use the same random state every time
)

print("Building the pipeline")
# Numeric features are standardised; categorical features are one-hot encoded.
numerical_pipeline = Pipeline([
    ('scaler', StandardScaler())
])
# handle_unknown='ignore' keeps predict() from raising on a category that was
# not present in the training data.
categorical_pipeline = Pipeline([
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = make_column_transformer(
    (numerical_pipeline, numerical_columns),
    (categorical_pipeline, categorical_columns)
)

# Seeded so the forest itself is reproducible, matching the seeded split above.
rfg = RandomForestRegressor(random_state=RANDOM_STATE)

# Candidate hyper-parameter values sampled by the randomized search.
params = {
    'criterion': ['squared_error', 'absolute_error'],
    'n_estimators': [1, 2, 3, 5, 8, 13, 21, 34],
    'max_depth': [1, 2, 3, 5, 8, 13, 21],
    'max_features': ['sqrt', 'log2']
}

randomSearchCV = RandomizedSearchCV(
    rfg,
    params,
    cv=2,
    refit=True,  # important for pipelines to refit w/ hyper parameters as needed
    random_state=RANDOM_STATE,  # same candidates are sampled on every run
)

# NOTE(review): the search is the last pipeline step, so the preprocessor is
# fitted once on all of x_train before the search's internal CV splits it.
pipeline = make_pipeline(preprocessor, randomSearchCV)

print("Fitting the pipeline and doing metrics")
pipeline.fit(x_train, y_train)
predictions = pipeline.predict(x_test)
# Take the square root of the MSE explicitly: the `squared=False` keyword was
# deprecated in scikit-learn 1.4 and later removed, while this form works on
# every release.
rmse = mean_squared_error(y_test, predictions) ** 0.5
print(f"RMSE: {rmse}")
print(f"R-squared: {r2_score(y_test, predictions)}")

print("Saving the model")
joblib.dump(pipeline, "model.joblib")