Spaces:

ericeilander
/

ericeilander_insurance_charge_prediction

Sleeping

ericeilander_insurance_charge_prediction / train.py

demonmittenhands

Initial commit

3b271ab 5 months ago

1.99 kB


	import joblib
	import pandas as pd

	from sklearn.datasets import fetch_openml

	from sklearn.preprocessing import StandardScaler, OneHotEncoder
	from sklearn.compose import make_column_transformer
	from sklearn.pipeline import make_pipeline, Pipeline
	from sklearn.model_selection import train_test_split, RandomizedSearchCV, cross_val_score
	from sklearn.linear_model import LinearRegression
	from sklearn.metrics import mean_squared_error, r2_score
	from sklearn.ensemble import RandomForestRegressor

	# Train a random-forest regressor for the `charges` column of insurance.csv,
	# report RMSE / R-squared on a held-out split, and save the fitted pipeline
	# to model.joblib.

	# One seed shared by the train/test split, the forest and the hyper-parameter
	# search, so repeated runs produce the same model and the same metrics.
	RANDOM_STATE = 1

	print("Loading the csv")
	df = pd.read_csv("insurance.csv")

	# Feature columns are listed explicitly so the column transformer below can
	# route each group to its own preprocessing step.
	numerical_columns = ['age', 'bmi', 'children']
	categorical_columns = ['sex', 'smoker', 'region']
	target = 'charges'


	print("Splitting the data")
	x = df[numerical_columns + categorical_columns]
	y = df[target]

	# Hold out 20% of the rows for the final evaluation.
	x_train, x_test, y_train, y_test = train_test_split(
	    x, y,
	    test_size=0.2,
	    random_state=RANDOM_STATE,  # IMPORTANT use the same random state every time
	)

	print("Building the pipeline")
	# Numerical features are standardised; categorical features are one-hot
	# encoded, and categories unseen during fit are ignored at predict time
	# rather than raising.
	numerical_pipeline = Pipeline([
	    ('scaler', StandardScaler()),
	])
	categorical_pipeline = Pipeline([
	    ('onehot', OneHotEncoder(handle_unknown='ignore')),
	])

	preprocessor = make_column_transformer(
	    (numerical_pipeline, numerical_columns),
	    (categorical_pipeline, categorical_columns),
	)

	# Seeded so the bootstrap samples and feature sub-sampling are repeatable.
	rfg = RandomForestRegressor(random_state=RANDOM_STATE)

	# Candidate hyper-parameter values sampled by the randomized search.
	params = {
	    'criterion': ['squared_error', 'absolute_error'],
	    'n_estimators': [1, 2, 3, 5, 8, 13, 21, 34],
	    'max_depth': [1, 2, 3, 5, 8, 13, 21],
	    'max_features': ['sqrt', 'log2'],
	}

	randomSearchCV = RandomizedSearchCV(
	    rfg,
	    params,
	    cv=2,
	    refit=True,  # refit the best candidate so the search object itself can predict
	    random_state=RANDOM_STATE,  # same candidates are sampled on every run
	)

	# NOTE(review): the search is the final pipeline step, so the preprocessor is
	# fitted on all of x_train before the cross-validation folds are drawn.
	pipeline = make_pipeline(preprocessor, randomSearchCV)

	print("Fitting the pipeline and doing metrics")
	pipeline.fit(x_train, y_train)
	predictions = pipeline.predict(x_test)
	# Take the root of the plain MSE instead of passing `squared=False`: that
	# keyword was deprecated in scikit-learn 1.4 and removed in 1.6.
	rmse = mean_squared_error(y_test, predictions) ** 0.5
	print(f"RMSE: {rmse}")
	print(f"R-squared: {r2_score(y_test, predictions)}")

	print("Saving the model")
	joblib.dump(pipeline, "model.joblib")