Custom Transformer to add additional column

Question

I am trying to replicate the following row-wise function (which I currently apply through a lambda) inside my scikit-learn pipeline:

def determine_healthy(_list):
    """Tell whether a single record counts as "healthy".

    ``_list`` is any mapping-like row (a dict, a DataFrame row, ...) that
    exposes ``'smoker'`` and ``'bmi'`` entries.  The record is healthy when
    ``'no'`` is contained in the ``'smoker'`` value and the BMI lies in the
    closed interval [18.5, 24.9].

    Returns ``True`` or ``False``.
    """
    # Smokers are rejected first; 'bmi' is not even looked up for them.
    if 'no' not in _list['smoker']:
        return False

    bmi = _list['bmi']
    # Both bounds are inclusive.
    return bool(bmi >= 18.5 and bmi <= 24.9)

# Row-wise pass over the frame: each row is handed to determine_healthy as-is
# and the resulting flags are stored in a new 'healthy' column.
df['healthy'] = df.apply(determine_healthy, axis=1)

The problem comes when I integrate it into my pipeline. I'm not sure whether the issue is caused by the additional column 'healthy' that is being added. The first error below is thrown when I try to transform my X_train with the custom transformer on its own:

from sklearn.base import BaseEstimator, TransformerMixin

class HealthyAttributeAdder(BaseEstimator, TransformerMixin):
    """Custom transformer meant to append a 'healthy' column to a DataFrame.

    The flag for each row is computed by ``determine_healthy``.
    """
    def __init__(self, items=None):
        # Avoid sharing one mutable default list between instances.
        if items is None: items = []
        # Stored but never read anywhere else in this class.
        # NOTE(review): scikit-learn's get_params()/clone() look attributes up
        # by the __init__ parameter name ('items'); storing it as `self.l`
        # likely breaks cloning inside a Pipeline -- confirm.
        self.l = items
    def fit(self, X , y=None):
        """Nothing is learned from the data; return the transformer itself."""
        return self
    def transform(self, X):
        """Build a new DataFrame made of X's columns plus 'healthy'.

        X must be a DataFrame (``X.columns`` and ``X.apply`` are used).
        """
        #X = X.copy()
        temp_cols = X.columns.to_list()
        # BUG: list.append() mutates the list in place and returns None, so
        # this assignment rebinds temp_cols to None; the len() call on the
        # next line then raises the TypeError shown in the traceback.
        temp_cols = temp_cols.append('healthy')
        # Intended mapping: positional column label -> column name.
        new_cols = {k:v for k,v in zip(range(len(temp_cols)),temp_cols)}
        # One flag per row, computed row-wise.
        healthy = X.apply(lambda row: determine_healthy(row), axis=1)
        # Column-stack X and the flags into one array, wrap it in a fresh
        # DataFrame (integer column labels) and rename those labels.
        combined_df = pd.DataFrame(np.c_[X, healthy]).rename(columns=new_cols)
        return combined_df

# Columns handed to the StandardScaler.
num_col = ['age','bmi']
# Columns handed to the OneHotEncoder.
# NOTE(review): 'healthy' is listed here, but the KeyError raised by
# price_pipeline.fit indicates X has no such column when the ColumnTransformer
# validates its column selections.
cat_col = ['sex', 'smoker','region','children','healthy']
# Split target and features; pop() removes 'charges' from df in place.
y = df.pop('charges')
X = df 
all_col = X.columns
X_train, X_test, y_train, y_test = train_test_split(X,y , test_size=0.2, random_state = 42)

# Each entry is (name, transformer, columns to select from the input).
# NOTE(review): the column selections are resolved against the input frame
# (see _validate_column_callables in the traceback), so the 'healthy' column
# created by the first entry is not available to the 'ohe' entry.
transform_pipeline = ColumnTransformer([
    ('healthy', HealthyAttributeAdder(), all_col),
    ('ss', StandardScaler(), num_col),
    ('ohe', OneHotEncoder(drop='first'), cat_col),
])

# Preprocessing followed by the Lasso regressor.
price_pipeline = Pipeline([
    ('transform', transform_pipeline),
    ('lasso',Lasso())
])

# Stand-alone check of the custom transformer, outside of any pipeline.
health_transform = HealthyAttributeAdder()
health_transform.fit_transform(X_train)
---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
~AppDataLocalTemp/ipykernel_19796/500623650.py in <module>
----> 1 health_transform.fit_transform(X_train)

~Venvhdbtestlibsite-packagessklearnbase.py in fit_transform(self, X, y, **fit_params)
    850         if y is None:
    851             # fit method of arity 1 (unsupervised transformation)
--> 852             return self.fit(X, **fit_params).transform(X)
    853         else:
    854             # fit method of arity 2 (supervised transformation)

~AppDataLocalTemp/ipykernel_19796/3713134512.py in transform(self, X)
     11         temp_cols = X.columns.to_list()
     12         temp_cols = temp_cols.append('healthy')
---> 13         new_cols = {k:v for k,v in zip(range(len(temp_cols)),temp_cols)}
     14         healthy = X.apply(lambda row: determine_healthy(row), axis=1)
     15         combined_df = pd.DataFrame(np.c_[X, healthy]).rename(columns=new_cols)

TypeError: object of type 'NoneType' has no len()

Error when I use it to predict:

price_pipeline.fit(X_train,y_train)
y_pred = price_pipeline.predict(X_test)
---------------------------------------------------------------------------
KeyError                                  Traceback (most recent call last)
~Venvhdbtestlibsite-packagespandascoreindexesbase.py in get_loc(self, key, method, tolerance)
   3360             try:
-> 3361                 return self._engine.get_loc(casted_key)
   3362             except KeyError as err:

~Venvhdbtestlibsite-packagespandas_libsindex.pyx in pandas._libs.index.IndexEngine.get_loc()

~Venvhdbtestlibsite-packagespandas_libsindex.pyx in pandas._libs.index.IndexEngine.get_loc()

pandas_libshashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()

pandas_libshashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()

KeyError: 'healthy'

The above exception was the direct cause of the following exception:

KeyError                                  Traceback (most recent call last)
~Venvhdbtestlibsite-packagessklearnutils__init__.py in _get_column_indices(X, key)
    432             for col in columns:
--> 433                 col_idx = all_columns.get_loc(col)
    434                 if not isinstance(col_idx, numbers.Integral):

~Venvhdbtestlibsite-packagespandascoreindexesbase.py in get_loc(self, key, method, tolerance)
   3362             except KeyError as err:
-> 3363                 raise KeyError(key) from err
   3364 

KeyError: 'healthy'

The above exception was the direct cause of the following exception:

ValueError                                Traceback (most recent call last)
~AppDataLocalTemp/ipykernel_19796/993407432.py in <module>
----> 1 price_pipeline.fit(X_train,y_train)
      2 y_pred = price_pipeline.predict(X_test)

~Venvhdbtestlibsite-packagessklearnpipeline.py in fit(self, X, y, **fit_params)
    388         """
    389         fit_params_steps = self._check_fit_params(**fit_params)
--> 390         Xt = self._fit(X, y, **fit_params_steps)
    391         with _print_elapsed_time("Pipeline", self._log_message(len(self.steps) - 1)):
    392             if self._final_estimator != "passthrough":

~Venvhdbtestlibsite-packagessklearnpipeline.py in _fit(self, X, y, **fit_params_steps)
    346                 cloned_transformer = clone(transformer)
    347             # Fit or load from cache the current transformer
--> 348             X, fitted_transformer = fit_transform_one_cached(
    349                 cloned_transformer,
    350                 X,

~Venvhdbtestlibsite-packagesjoblibmemory.py in __call__(self, *args, **kwargs)
    347 
    348     def __call__(self, *args, **kwargs):
--> 349         return self.func(*args, **kwargs)
    350 
    351     def call_and_shelve(self, *args, **kwargs):

~Venvhdbtestlibsite-packagessklearnpipeline.py in _fit_transform_one(transformer, X, y, weight, message_clsname, message, **fit_params)
    891     with _print_elapsed_time(message_clsname, message):
    892         if hasattr(transformer, "fit_transform"):
--> 893             res = transformer.fit_transform(X, y, **fit_params)
    894         else:
    895             res = transformer.fit(X, y, **fit_params).transform(X)

~Venvhdbtestlibsite-packagessklearncompose_column_transformer.py in fit_transform(self, X, y)
    670         self._check_n_features(X, reset=True)
    671         self._validate_transformers()
--> 672         self._validate_column_callables(X)
    673         self._validate_remainder(X)
    674 

~Venvhdbtestlibsite-packagessklearncompose_column_transformer.py in _validate_column_callables(self, X)
    350                 columns = columns(X)
    351             all_columns.append(columns)
--> 352             transformer_to_input_indices[name] = _get_column_indices(X, columns)
    353 
    354         self._columns = all_columns

~Venvhdbtestlibsite-packagessklearnutils__init__.py in _get_column_indices(X, key)
    439 
    440         except KeyError as e:
--> 441             raise ValueError("A given column is not a column of the dataframe") from e
    442 
    443         return column_indices

ValueError: A given column is not a column of the dataframe

Asked By: Randy Chng

||

Source

Answer 1

The first issue is actually independent of the ColumnTransformer usage: it is due to a bug in the implementation of the transform method of your HealthyAttributeAdder class.

In order to get a consistent result you should modify line

temp_cols = temp_cols.append('healthy')

into

temp_cols.append('healthy')

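Actually, the issue is the one described here: list.append modifies the list in place and returns None, so the original assignment rebinds temp_cols to None, and the subsequent len(temp_cols) raises "TypeError: object of type 'NoneType' has no len()".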

On the other hand, when you switch to ColumnTransformer, the issue is the one described either here or here, for example (you'll find other related posts, too). Namely, ColumnTransformer applies its transformers in parallel, each one to the X_train dataset you're passing; therefore, when it comes to one-hot encoding your categorical features, the OneHotEncoder is asked to transform the 'healthy' column (as listed in cat_col), but that column is not present in X_train.

A possible way of solving the problem is to define a separate pipeline for HealthyAttributeAdder and to apply it before your ColumnTransformer instance, transform_pipeline.

class HealthyAttributeAdder(BaseEstimator, TransformerMixin):
    """Stateless transformer that appends a ``'healthy'`` column to a DataFrame.

    The flag for each row is computed by ``determine_healthy``.  The
    transformer has no hyper-parameters and learns nothing from the data, so
    it can be cloned and used as a step of a scikit-learn ``Pipeline``.
    """

    def fit(self, X, y=None):
        """Do nothing and return ``self`` (there is nothing to learn).

        Parameters
        ----------
        X : pandas.DataFrame
            Training data; ignored.
        y : ignored
            Present only for scikit-learn API compatibility.
        """
        return self

    def transform(self, X):
        """Return a copy of ``X`` with an extra ``'healthy'`` column.

        Parameters
        ----------
        X : pandas.DataFrame
            Input rows; must provide whatever columns ``determine_healthy``
            reads.

        Returns
        -------
        pandas.DataFrame
            A new frame with the columns of ``X`` followed by ``'healthy'``.
            The input frame is left untouched.
        """
        healthy = X.apply(determine_healthy, axis=1)
        # assign() builds a new frame instead of round-tripping through a
        # single NumPy array (np.c_), which would coerce mixed numeric/string
        # columns to one common dtype and throw away the original index.
        return X.assign(healthy=healthy)

from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import Lasso

# Column-wise preprocessing only: scale the numeric columns and one-hot encode
# the categorical ones (num_col / cat_col as defined in the question).
transform_pipeline = ColumnTransformer([
    # The attribute adder is no longer an entry here: it now runs as its own
    # pipeline step before this ColumnTransformer (see price_pipeline below).
    #('healthy', HealthyAttributeAdder(), all_col),
    ('ss', StandardScaler(), num_col),
    ('ohe', OneHotEncoder(drop='first'), cat_col),
])

# Separate one-step pipeline whose only job is to add the 'healthy' column.
healthy_pipeline = Pipeline([
    ('healthy', HealthyAttributeAdder())                          
])

# Steps run in order: add 'healthy' first, then preprocess, then fit Lasso.
price_pipeline = Pipeline([
    ('add_healthy', healthy_pipeline),                     
    ('transform', transform_pipeline),
    ('lasso',Lasso())
])

# Fit on the training split and predict on the held-out split.
price_pipeline.fit(X_train,y_train)
y_pred = price_pipeline.predict(X_test)

This way, the first step (add_healthy) of your price_pipeline adds the healthy column to X_train first; the transformed X_train is then passed in parallel to both StandardScaler() and OneHotEncoder(), and – in particular – OneHotEncoder() will have no problem one-hot encoding the 'healthy' column.

Answered By: amiola

Custom Transformer to add additional column

Question:

Answers: