How to create a custom ColumnTransformer using scikit-learn?

Question:

I have the following dataset:

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import MinMaxScaler
import numpy as np
import pandas as pd

# Sample data: five hourly timestamp strings plus five random integer ages in [12, 80).
dt = pd.DataFrame({
    "time": [f"1/4/2021 {hour}:00" for hour in range(5)],
    "age": np.random.randint(12, 80, size=5),
})

I need to create a custom ColumnTransformer using scikit-learn to convert the date and time features to numeric features.

Here I define my custom transformer:

class DateTimeTransformer(BaseEstimator, TransformerMixin):
    """Stateless transformer that encodes "<date> <time>" strings as integers.

    Each string is split on a single space. The date part (three
    "/"-separated integers) and the time part ("hh:mm") are each turned into
    an integer, and the two integers are added together.

    The date integers are combined positionally as 10000*a + 100*b + c, so
    "1/4/2021" becomes 10000*1 + 100*4 + 2021 = 12421.
    """

    def fit(self, X, y=None):
        """Nothing is learned; return the transformer itself."""
        return self

    def transform(self, X, y=None):
        """Encode each item obtained by iterating ``X`` and stack them as one column."""
        encoded = list(map(self.date_and_time_to_num, X))
        return np.c_[encoded]

    def date_and_time_to_num(self, date_and_time):
        """Return the date code plus the time-of-day seconds for one string."""
        parts = date_and_time.split(" ")
        return self.date_to_num(parts[0]) + self.time_to_num(parts[1])

    def date_to_num(self, date):
        """Combine the three "/"-separated integers as 10000*a + 100*b + c."""
        first, second, third = (int(piece) for piece in date.split("/"))
        return third + 100 * second + 10000 * first

    def time_to_num(self, time_str):
        """Convert an "hh:mm" string into seconds since midnight."""
        hours, minutes = (int(piece) for piece in time_str.split(":"))
        return 3600 * hours + 60 * minutes

Then I transform my features using the following two functions:

def process_data(x):
    """Fit a freshly built column transformer on ``x`` and return ``x`` transformed by it."""
    transformer = get_column_transformer()
    transformer.fit(X=x)
    transformed = transformer.transform(x)
    return transformed

def get_column_transformer():
    """Pair each transformer with the raw values of its ``dt`` column."""
    # NOTE(review): the value lists are passed in the column-selector position.
    scaler_step = (MinMaxScaler(), dt["age"].values.tolist())
    datetime_step = (DateTimeTransformer(), dt["time"].values.tolist())
    return make_column_transformer(scaler_step, datetime_step)

And finally I call the process_data function to apply the changes:

# Run the fit/transform pipeline on the sample frame and print the result.
print(process_data(dt))

However, I face the following error:

ValueError: all features must be in [0, 1] or [-2, 0]
Asked By: Sal-laS

||

Answers:

The error is due to the fact that make_column_transformer takes the column names or column indices as inputs, not the data. In your case the correct syntax would be

# Select columns by name: the list ['age'] passes a 2-D slice to the scaler,
# while the scalar 'time' passes a 1-D column to the custom transformer.
make_column_transformer(
   (MinMaxScaler(), ['age']),
   (DateTimeTransformer(), 'time')
)

or, equivalently,

# Select columns by position: the list [1] passes a 2-D slice to the scaler,
# while the scalar 0 passes a 1-D column to the custom transformer.
make_column_transformer(
    (MinMaxScaler(), [1]),
    (DateTimeTransformer(), 0)
)

The form of the column selector determines the shape of the data each transformer receives. For the MinMaxScaler you should use a list, ['age'] or [1], because the MinMaxScaler expects a 2d array as input (e.g. a pd.DataFrame). For the DateTimeTransformer you can use a scalar, 'time' or 0, because the DateTimeTransformer expects a 1d array as input (e.g. a pd.Series). This is explained in the documentation.

Example with column names:

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import MinMaxScaler
import numpy as np
import pandas as pd
# Seed NumPy's global RNG so the random ages, and the output shown below, are reproducible.
np.random.seed(0)

class DateTimeTransformer(BaseEstimator, TransformerMixin):
    """Stateless transformer that encodes "<date> <time>" strings as integers.

    Each string is split on a single space. The date part (three
    "/"-separated integers) and the time part ("hh:mm") are each turned into
    an integer, and the two integers are added together.

    The date integers are combined positionally as 10000*a + 100*b + c, so
    "1/4/2021" becomes 10000*1 + 100*4 + 2021 = 12421.
    """

    def fit(self, X, y=None):
        """Nothing is learned; return the transformer itself."""
        return self

    def transform(self, X, y=None):
        """Encode each item obtained by iterating ``X`` and stack them as one column."""
        encoded = list(map(self.date_and_time_to_num, X))
        return np.c_[encoded]

    def date_and_time_to_num(self, date_and_time):
        """Return the date code plus the time-of-day seconds for one string."""
        parts = date_and_time.split(' ')
        return self.date_to_num(parts[0]) + self.time_to_num(parts[1])

    def date_to_num(self, date):
        """Combine the three "/"-separated integers as 10000*a + 100*b + c."""
        first, second, third = (int(piece) for piece in date.split('/'))
        return third + 100 * second + 10000 * first

    def time_to_num(self, time_str):
        """Convert an "hh:mm" string into seconds since midnight."""
        hours, minutes = (int(piece) for piece in time_str.split(':'))
        return 3600 * hours + 60 * minutes

def process_data(x):
    """Fit a freshly built column transformer on ``x`` and return ``x`` transformed by it."""
    transformer = get_column_transformer()
    transformer.fit(X=x)
    transformed = transformer.transform(x)
    return transformed

def get_column_transformer():
    """Build the column transformer: min-max scale 'age' and encode 'time'."""
    # The list selector hands the scaler a 2-D slice; the scalar selector
    # hands the date/time encoder a 1-D column.
    scaler_step = (MinMaxScaler(), ['age'])
    datetime_step = (DateTimeTransformer(), 'time')
    return make_column_transformer(scaler_step, datetime_step)

# Sample frame: five hourly timestamp strings and five random ages in [12, 80).
df = pd.DataFrame({
    'time': ['1/4/2021 0:00', '1/4/2021 1:00', '1/4/2021 2:00', '1/4/2021 3:00', '1/4/2021 4:00'],
    'age': np.random.randint(12, 80, 5)
})

# Output columns follow the transformer order: column 0 is the min-max scaled
# age, column 1 is the encoded timestamp (12421 plus 3600 per hour).
process_data(df)
# array([[0.00000000e+00, 1.24210000e+04],
#        [1.30434783e-01, 1.60210000e+04],
#        [8.69565217e-01, 1.96210000e+04],
#        [1.00000000e+00, 2.32210000e+04],
#        [1.00000000e+00, 2.68210000e+04]])

Example with column indices:

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import MinMaxScaler
import numpy as np
import pandas as pd
# Seed NumPy's global RNG so the random ages, and the output shown below, are reproducible.
np.random.seed(0)

class DateTimeTransformer(BaseEstimator, TransformerMixin):
    """Stateless transformer that encodes "<date> <time>" strings as integers.

    Each string is split on a single space. The date part (three
    "/"-separated integers) and the time part ("hh:mm") are each turned into
    an integer, and the two integers are added together.

    The date integers are combined positionally as 10000*a + 100*b + c, so
    "1/4/2021" becomes 10000*1 + 100*4 + 2021 = 12421.
    """

    def fit(self, X, y=None):
        """Nothing is learned; return the transformer itself."""
        return self

    def transform(self, X, y=None):
        """Encode each item obtained by iterating ``X`` and stack them as one column."""
        encoded = list(map(self.date_and_time_to_num, X))
        return np.c_[encoded]

    def date_and_time_to_num(self, date_and_time):
        """Return the date code plus the time-of-day seconds for one string."""
        parts = date_and_time.split(' ')
        return self.date_to_num(parts[0]) + self.time_to_num(parts[1])

    def date_to_num(self, date):
        """Combine the three "/"-separated integers as 10000*a + 100*b + c."""
        first, second, third = (int(piece) for piece in date.split('/'))
        return third + 100 * second + 10000 * first

    def time_to_num(self, time_str):
        """Convert an "hh:mm" string into seconds since midnight."""
        hours, minutes = (int(piece) for piece in time_str.split(':'))
        return 3600 * hours + 60 * minutes

def process_data(x):
    """Fit a freshly built column transformer on ``x`` and return ``x`` transformed by it."""
    transformer = get_column_transformer()
    transformer.fit(X=x)
    transformed = transformer.transform(x)
    return transformed

def get_column_transformer():
    """Build the column transformer: min-max scale column 1 and encode column 0."""
    # The list selector hands the scaler a 2-D slice; the scalar selector
    # hands the date/time encoder a 1-D column.
    scaler_step = (MinMaxScaler(), [1])
    datetime_step = (DateTimeTransformer(), 0)
    return make_column_transformer(scaler_step, datetime_step)

# Sample frame: 'time' is column 0 and 'age' is column 1, matching the
# positional selectors used in get_column_transformer.
df = pd.DataFrame({
    'time': ['1/4/2021 0:00', '1/4/2021 1:00', '1/4/2021 2:00', '1/4/2021 3:00', '1/4/2021 4:00'],
    'age': np.random.randint(12, 80, 5)
})

# Output columns follow the transformer order: column 0 is the min-max scaled
# age, column 1 is the encoded timestamp (12421 plus 3600 per hour).
process_data(df)
# array([[0.00000000e+00, 1.24210000e+04],
#        [1.30434783e-01, 1.60210000e+04],
#        [8.69565217e-01, 1.96210000e+04],
#        [1.00000000e+00, 2.32210000e+04],
#        [1.00000000e+00, 2.68210000e+04]])
Answered By: Flavia Giammarino