File size: 1,991 Bytes
3b271ab |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 |
import joblib
import pandas as pd
from sklearn.datasets import fetch_openml
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.model_selection import train_test_split, RandomizedSearchCV, cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.ensemble import RandomForestRegressor
# Column roles in the insurance CSV: model inputs by type, plus the value to predict.
target = 'charges'
numerical_columns = ['age', 'bmi', 'children']
categorical_columns = ['sex', 'smoker', 'region']

print("Loading the csv")
df = pd.read_csv("insurance.csv")

print("Splitting the data")
y = df[target]
x = df[[*numerical_columns, *categorical_columns]]
# test_size=0.2 holds out a fifth of the rows for evaluation; the seed is
# fixed so the same rows land in the test set on every run.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=1)
print("Building the pipeline")

# One single-step sub-pipeline per column type: scale the numeric columns,
# one-hot encode the categorical ones. handle_unknown="ignore" keeps transform
# from raising on a category that was not present when the encoder was fitted.
numerical_pipeline = Pipeline(steps=[("scaler", StandardScaler())])
categorical_pipeline = Pipeline(
    steps=[("onehot", OneHotEncoder(handle_unknown="ignore"))]
)

# Route each column group through its own sub-pipeline.
preprocessor = make_column_transformer(
    (numerical_pipeline, numerical_columns),
    (categorical_pipeline, categorical_columns),
)
# Seed the forest and the search as well as the split: without these seeds the
# sampled hyper-parameter candidates and the bootstrap draws differ on every
# run, so the reported metrics and the saved model are not reproducible.
rfg = RandomForestRegressor(random_state=1)

# Candidate values the randomized search samples from.
params = {
    'criterion': ['squared_error', 'absolute_error'],
    'n_estimators': [1, 2, 3, 5, 8, 13, 21, 34],
    'max_depth': [1, 2, 3, 5, 8, 13, 21],
    'max_features': ['sqrt', 'log2'],
}

randomSearchCV = RandomizedSearchCV(
    rfg,
    params,
    cv=2,
    # refit=True retrains the best candidate on the whole training set, which
    # is what lets the enclosing pipeline call predict() on the search object.
    refit=True,
    random_state=1,
)

# Preprocessing runs first, then the search (and its refitted best forest).
pipeline = make_pipeline(preprocessor, randomSearchCV)
print("Fitting the pipeline and doing metrics")
pipeline.fit(x_train, y_train)
predictions = pipeline.predict(x_test)

# Take the square root of the MSE by hand: the `squared=False` flag was
# deprecated in scikit-learn 1.4 and removed in 1.6, where passing it raises
# TypeError. This form gives the same RMSE on every version.
rmse = mean_squared_error(y_test, predictions) ** 0.5
print(f"RMSE: {rmse}")
print(f"R-squared: {r2_score(y_test, predictions)}")

print("Saving the model")
# Persists the whole fitted pipeline (preprocessing + search) in one file.
joblib.dump(pipeline, "model.joblib")
|