Spaces:
Running
on
CPU Upgrade
Running
on
CPU Upgrade
"""Data pre-processing functions. | |
The pre-processing steps are heavily inspired by the following notebook:
https://www.kaggle.com/code/rikdifos/credit-card-approval-prediction-using-ml
Additional steps, mostly renaming some values or features, were added for a better user
experience.
""" | |
import numpy | |
from sklearn.compose import ColumnTransformer | |
from sklearn.pipeline import Pipeline | |
from sklearn.preprocessing import OneHotEncoder, FunctionTransformer, KBinsDiscretizer | |
def _get_pipeline_replace_one_hot(func, value):
    """Build a two-step pipeline: replace values with ``func``, then one-hot encode.

    Args:
        func: Callable wrapped in a ``FunctionTransformer``; it receives ``value`` as a
            keyword argument in addition to the data it transforms.
        value: Object forwarded to ``func`` through the ``value`` keyword argument.

    Returns:
        Pipeline: An unfitted pipeline whose steps are named ``"replace"`` and
            ``"one_hot"``, in that order.
    """
    replace_step = FunctionTransformer(
        func,
        kw_args={"value": value},
        # Declare output feature names as identical to the input feature names.
        feature_names_out="one-to-one",
    )
    encode_step = OneHotEncoder()
    return Pipeline(steps=[("replace", replace_step), ("one_hot", encode_step)])
def _replace_values_geq(column, value):
    """Replace every entry greater than or equal to ``value`` with ``"<value>_or_more"``.

    Args:
        column: Array-like data supporting ``>=`` comparison against ``value``.
        value: Threshold; entries at or above it are replaced by the label.

    Returns:
        The result of ``numpy.where``: the label where the comparison holds and the
        original entry elsewhere.
    """
    label = f"{value}_or_more"
    at_or_above = column >= value
    return numpy.where(at_or_above, label, column)
def _replace_values_eq(column, value):
    """Merge groups of values into a single replacement value each.

    Args:
        column: Array-like data whose entries are compared against the listed values.
        value: Mapping of ``replacement -> list of values to replace``. Mappings are
            applied one after another, in the dict's iteration order.

    Returns:
        The data with every listed value swapped for its replacement. If ``value`` is
        empty, ``column`` is returned unchanged.
    """
    result = column
    for replacement, originals in value.items():
        # Mark the entries belonging to this group, then overwrite only those.
        in_group = numpy.isin(result, originals)
        result = numpy.where(in_group, replacement, result)
    return result
def get_pre_processors():
    """Create the two column transformers used to pre-process the data.

    Returns:
        tuple: ``(pre_processor_user, pre_processor_third_party)``, two unfitted
            ``ColumnTransformer`` instances. Both are configured with
            ``remainder='passthrough'`` and ``verbose_feature_names_out=False``.
    """
    # Each mapping reads: replacement value -> raw values merged into it.
    income_groups = {"Public Sector": ["Retired", "Student"]}
    education_groups = {"Higher education": ["Academic degree"]}
    occupation_groups = {
        "Labor_work": [
            "Cleaning staff",
            "Cooking staff",
            "Drivers",
            "Laborers",
            "Low-wage laborers",
            "Security staff",
            "Waiters/barmen staff",
        ],
        "Office_work": [
            "Accountants",
            "Core staff",
            "HR staff",
            "Medicine staff",
            "Private service staff",
            "Realty agents",
            "Sales staff",
            "Secretaries",
        ],
        "High_tech_work": ["Managers", "High skill tech staff", "IT staff"],
    }

    # The order of the entries is kept as-is: it determines the output column order.
    user_transformers = [
        # Cap the count-like columns at a threshold, then one-hot encode.
        (
            "replace_num_children",
            _get_pipeline_replace_one_hot(_replace_values_geq, 2),
            ["Num_children"],
        ),
        (
            "replace_household_size",
            _get_pipeline_replace_one_hot(_replace_values_geq, 3),
            ["Household_size"],
        ),
        # Merge some categories together, then one-hot encode.
        (
            "replace_income_type",
            _get_pipeline_replace_one_hot(_replace_values_eq, income_groups),
            ["Income_type"],
        ),
        (
            "replace_education_type",
            _get_pipeline_replace_one_hot(_replace_values_eq, education_groups),
            ["Education_type"],
        ),
        (
            "replace_occupation_type_labor",
            _get_pipeline_replace_one_hot(_replace_values_eq, occupation_groups),
            ["Occupation_type"],
        ),
        # Plain one-hot encoding, no value replacement.
        (
            "one_hot_housing_fam_status",
            OneHotEncoder(),
            ["Housing_type", "Family_status"],
        ),
        # Discretize the numeric columns into one-hot encoded bins.
        (
            "qbin_total_income",
            KBinsDiscretizer(n_bins=3, strategy="quantile", encode="onehot"),
            ["Total_income"],
        ),
        (
            "bin_age",
            KBinsDiscretizer(n_bins=5, strategy="uniform", encode="onehot"),
            ["Age"],
        ),
    ]

    third_party_transformers = [
        (
            "bin_years_employed",
            KBinsDiscretizer(n_bins=5, strategy="uniform", encode="onehot"),
            ["Years_employed"],
        ),
    ]

    pre_processor_user = ColumnTransformer(
        transformers=user_transformers,
        remainder="passthrough",
        verbose_feature_names_out=False,
    )
    pre_processor_third_party = ColumnTransformer(
        transformers=third_party_transformers,
        remainder="passthrough",
        verbose_feature_names_out=False,
    )

    return pre_processor_user, pre_processor_third_party