"""Train a random-forest regressor on the insurance dataset and save it.

Loads ``insurance.csv``, preprocesses numeric and categorical features,
tunes a RandomForestRegressor with randomized search (CV performed on the
*entire* pipeline so the scaler never sees validation folds), reports
RMSE / R^2 on a held-out test set, and dumps the fitted model.
"""

import joblib
import pandas as pd
from sklearn.datasets import fetch_openml
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.model_selection import train_test_split, RandomizedSearchCV, cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.ensemble import RandomForestRegressor

print("Loading the csv")
df = pd.read_csv("insurance.csv")

numerical_columns = ['age', 'bmi', 'children']
categorical_columns = ['sex', 'smoker', 'region']
target = 'charges'

print("Splitting the data")
x = df[numerical_columns + categorical_columns]
y = df[target]
x_train, x_test, y_train, y_test = train_test_split(
    x, y,
    test_size=0.2,
    random_state=1  # IMPORTANT use the same random state every time
)

print("Building the pipeline")
numerical_pipeline = Pipeline([
    ('scaler', StandardScaler())
])
categorical_pipeline = Pipeline([
    # handle_unknown='ignore' keeps predict() from failing on categories
    # absent from the training split
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])
preprocessor = make_column_transformer(
    (numerical_pipeline, numerical_columns),
    (categorical_pipeline, categorical_columns)
)

# The preprocessor is part of the searched pipeline so each CV fold fits
# the scaler/encoder on its own training portion only (no leakage into
# the fold's validation data).
pipeline = make_pipeline(
    preprocessor,
    RandomForestRegressor(random_state=1)  # seeded for reproducible forests
)

# Parameter names are prefixed with the pipeline step name assigned by
# make_pipeline (lowercased class name).
params = {
    'randomforestregressor__criterion': ['squared_error', 'absolute_error'],
    'randomforestregressor__n_estimators': [1, 2, 3, 5, 8, 13, 21, 34],
    'randomforestregressor__max_depth': [1, 2, 3, 5, 8, 13, 21],
    'randomforestregressor__max_features': ['sqrt', 'log2']
}
random_search_cv = RandomizedSearchCV(
    pipeline,
    params,
    cv=2,
    refit=True,      # refit the best pipeline on the full training set
    random_state=1   # reproducible candidate sampling
)

print("Fitting the pipeline and doing metrics")
random_search_cv.fit(x_train, y_train)

predictions = random_search_cv.predict(x_test)
# mean_squared_error's `squared=False` option was deprecated in
# scikit-learn 1.4 and removed in 1.6; take the square root explicitly.
rmse = mean_squared_error(y_test, predictions) ** 0.5
print(f"RMSE: {rmse}")
print(f"R-squared: {r2_score(y_test, predictions)}")

print("Saving the model")
# Persist the refit best estimator (full preprocessing + model pipeline).
joblib.dump(random_search_cv.best_estimator_, "model.joblib")