|
|
|
import joblib |
|
import pandas as pd |
|
|
|
from sklearn.datasets import fetch_openml |
|
|
|
from sklearn.preprocessing import StandardScaler, OneHotEncoder |
|
from sklearn.compose import make_column_transformer |
|
from sklearn.pipeline import make_pipeline, Pipeline |
|
from sklearn.model_selection import train_test_split, RandomizedSearchCV, cross_val_score |
|
from sklearn.linear_model import LinearRegression |
|
from sklearn.metrics import mean_squared_error, r2_score |
|
from sklearn.ensemble import RandomForestRegressor |
|
|
|
# Read the raw records from the CSV file in the working directory.
print("Loading the csv")
df = pd.read_csv("insurance.csv")

# Column groups used to select the model inputs, plus the regression target.
numerical_columns = ['age', 'bmi', 'children']
categorical_columns = ['sex', 'smoker', 'region']
target = 'charges'

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# identical from run to run.
print("Splitting the data")
feature_columns = [*numerical_columns, *categorical_columns]
x = df[feature_columns]
y = df[target]
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    random_state=1,
    test_size=0.2,
)
|
|
|
print("Building the pipeline")

# Numeric features are standardised. Categorical features are one-hot encoded;
# handle_unknown='ignore' makes categories not seen during fit encode as all
# zeros at predict time instead of raising.
numerical_pipeline = Pipeline([
    ('scaler', StandardScaler()),
])
categorical_pipeline = Pipeline([
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Route each column group through its own preprocessing pipeline.
preprocessor = make_column_transformer(
    (numerical_pipeline, numerical_columns),
    (categorical_pipeline, categorical_columns),
)

# Seed the forest so repeated runs train the same model (the train/test split
# is already seeded the same way).
rfg = RandomForestRegressor(random_state=1)

# Candidate hyper-parameter values the randomized search samples from.
params = {
    'criterion': ['squared_error', 'absolute_error'],
    'n_estimators': [1, 2, 3, 5, 8, 13, 21, 34],
    'max_depth': [1, 2, 3, 5, 8, 13, 21],
    'max_features': ['sqrt', 'log2'],
}

# 2-fold randomized search; refit=True retrains the best candidate on the full
# training data so the search object can predict directly. random_state fixes
# which parameter combinations are sampled.
randomSearchCV = RandomizedSearchCV(
    rfg,
    params,
    cv=2,
    refit=True,
    random_state=1,
)

# NOTE(review): the search is the final pipeline step, so the preprocessor is
# fitted once on all training rows before cross-validation splits them.
pipeline = make_pipeline(preprocessor, randomSearchCV)
|
|
|
# Train on the training split, then score on the held-out split.
print("Fitting the pipeline and doing metrics")
pipeline.fit(x_train, y_train)
predictions = pipeline.predict(x_test)

# mean_squared_error(..., squared=False) was deprecated in scikit-learn 1.4 and
# removed in 1.6; taking the square root of the MSE works on every version.
rmse = mean_squared_error(y_test, predictions) ** 0.5
print(f"RMSE: {rmse}")
print(f"R-squared: {r2_score(y_test, predictions)}")

# Persist the fitted pipeline (preprocessing + tuned model) as one artifact.
print("Saving the model")
joblib.dump(pipeline, "model.joblib")
|
|