import math

import joblib
import pandas as pd
from sklearn.compose import make_column_transformer
from sklearn.datasets import fetch_openml
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import (
    RandomizedSearchCV,
    cross_val_score,
    train_test_split,
)
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
# --- Data loading and train/test split --------------------------------------
# Column roles in the CSV: numeric features, categorical features, and the
# regression target.
numerical_columns = ["age", "bmi", "children"]
categorical_columns = ["sex", "smoker", "region"]
target = "charges"

print("Loading the csv")
# The path is relative, so the file is looked up in the current working
# directory.
df = pd.read_csv("insurance.csv")

print("Splitting the data")
# Feature matrix keeps the numeric columns first, then the categorical ones.
x = df[[*numerical_columns, *categorical_columns]]
y = df[target]

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# identical between runs.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=1
)
# --- Preprocessing -----------------------------------------------------------
print("Building the pipeline")

# Numeric features go through a single standard-scaling step.
numerical_pipeline = Pipeline(steps=[("scaler", StandardScaler())])

# Categorical features are one-hot encoded; handle_unknown="ignore" lets
# transform() accept categories that were not seen during fit instead of
# raising.
categorical_pipeline = Pipeline(
    steps=[("onehot", OneHotEncoder(handle_unknown="ignore"))]
)

# Route each group of columns to its own sub-pipeline.
preprocessor = make_column_transformer(
    (numerical_pipeline, numerical_columns),
    (categorical_pipeline, categorical_columns),
)
# --- Model and hyper-parameter search ----------------------------------------
# The train/test split is seeded with random_state=1; seed the forest and the
# randomized search the same way so that the sampled candidates, the reported
# metrics and the persisted model are reproducible from run to run.
rfg = RandomForestRegressor(random_state=1)

# Candidate values the randomized search samples from.
params = {
    "criterion": ["squared_error", "absolute_error"],
    "n_estimators": [1, 2, 3, 5, 8, 13, 21, 34],
    "max_depth": [1, 2, 3, 5, 8, 13, 21],
    "max_features": ["sqrt", "log2"],
}

# 2-fold cross-validated search; refit=True retrains the best candidate on
# the full training data so the search object can be used for prediction.
randomSearchCV = RandomizedSearchCV(
    rfg,
    params,
    cv=2,
    refit=True,
    random_state=1,
)

# NOTE(review): the preprocessor sits outside the search, so it is fitted once
# on all training rows before the cross-validation folds are made.
pipeline = make_pipeline(preprocessor, randomSearchCV)
# --- Training, evaluation and persistence ------------------------------------
print("Fitting the pipeline and doing metrics")
pipeline.fit(x_train, y_train)
predictions = pipeline.predict(x_test)

# The `squared=False` argument of mean_squared_error was deprecated in
# scikit-learn 1.4 and removed in 1.6. Taking the square root of the MSE gives
# the same RMSE on every scikit-learn version.
rmse = math.sqrt(mean_squared_error(y_test, predictions))
print(f"RMSE: {rmse}")
print(f"R-squared: {r2_score(y_test, predictions)}")

print("Saving the model")
# Persist the whole fitted pipeline (preprocessing + tuned model) so it can be
# reloaded with joblib.load and applied directly to raw feature rows.
joblib.dump(pipeline, "model.joblib")