Using Scikit-Learn OneHotEncoder with a Pandas DataFrame
Question:
I’m trying to replace a column of strings in a Pandas DataFrame with its one-hot encoded equivalent using Scikit-Learn’s OneHotEncoder. My code below doesn’t work:
from sklearn.preprocessing import OneHotEncoder
# data is a Pandas DataFrame
jobs_encoder = OneHotEncoder()
jobs_encoder.fit(data['Profession'].unique().reshape(1, -1))
data['Profession'] = jobs_encoder.transform(data['Profession'].to_numpy().reshape(-1, 1))
It produces the following error (strings in the list are omitted):
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-91-3a1f568322f5> in <module>()
3 jobs_encoder = OneHotEncoder()
4 jobs_encoder.fit(data['Profession'].unique().reshape(1, -1))
----> 5 data['Profession'] = jobs_encoder.transform(data['Profession'].to_numpy().reshape(-1, 1))
/usr/local/anaconda3/envs/ml/lib/python3.6/site-packages/sklearn/preprocessing/_encoders.py in transform(self, X)
730 copy=True)
731 else:
--> 732 return self._transform_new(X)
733
734 def inverse_transform(self, X):
/usr/local/anaconda3/envs/ml/lib/python3.6/site-packages/sklearn/preprocessing/_encoders.py in _transform_new(self, X)
678 """New implementation assuming categorical input"""
679 # validation of X happens in _check_X called by _transform
--> 680 X_int, X_mask = self._transform(X, handle_unknown=self.handle_unknown)
681
682 n_samples, n_features = X_int.shape
/usr/local/anaconda3/envs/ml/lib/python3.6/site-packages/sklearn/preprocessing/_encoders.py in _transform(self, X, handle_unknown)
120 msg = ("Found unknown categories {0} in column {1}"
121 " during transform".format(diff, i))
--> 122 raise ValueError(msg)
123 else:
124 # Set the problematic rows to an acceptable value and
ValueError: Found unknown categories ['...', ..., '...'] in column 0 during transform
Here’s some sample data:
data['Profession'] =
0 unkn
1 safe
2 rece
3 unkn
4 lead
...
111988 indu
111989 seni
111990 mess
111991 seni
111992 proj
Name: Profession, Length: 111993, dtype: object
What exactly am I doing wrong?
Answers:
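OneHotEncoder encodes categorical features as a one-hot numeric array. Its transform method returns a sparse matrix if sparse=True, otherwise it returns a 2-d array.
The ValueError itself comes from the fit call: reshape(1, -1) turns the unique values into a single row with one column per value, so the encoder sees many features with one category each instead of one feature with many categories. Fit on a single column instead, e.g. data['Profession'].unique().reshape(-1, 1).
You can’t cast a 2-d array (or sparse matrix) into a Pandas Series. You must create a Pandas Series (a column in a Pandas DataFrame) for each category.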
I would recommend pandas.get_dummies instead:
# One-hot encode the "Profession" column with pandas alone; drop_first=True
# leaves out the first category's indicator column.
data = pd.get_dummies(
    data,
    columns=['Profession'],
    prefix=['Profession'],
    drop_first=True,
)
EDIT:
Using Sklearn OneHotEncoder:
# jobs_encoder must already be fitted on a single column, i.e. on an array of
# shape (n_samples, 1).
transformed = jobs_encoder.transform(data['Profession'].to_numpy().reshape(-1, 1))
# OneHotEncoder returns a SciPy sparse matrix by default, which pd.DataFrame
# cannot unpack into one column per category; densify it first.
if hasattr(transformed, "toarray"):
    transformed = transformed.toarray()
# Create a Pandas DataFrame of the one-hot encoded column. Reusing data's index
# keeps the axis=1 concat below row-aligned even when data does not carry a
# default RangeIndex.
ohe_df = pd.DataFrame(
    transformed,
    columns=jobs_encoder.get_feature_names_out(),
    index=data.index,
)
# Concat with the original data, then drop the now-redundant source column
data = pd.concat([data, ohe_df], axis=1).drop(['Profession'], axis=1)
Other options: if you are doing hyperparameter tuning with GridSearchCV, it’s recommended to use ColumnTransformer and FeatureUnion with Pipeline, or make_column_transformer directly.
It turned out that Scikit-Learn’s LabelBinarizer gave me better luck in converting the data to one-hot encoded format. With help from Amnie’s solution, my final code is as follows:
import pandas as pd
from sklearn.preprocessing import LabelBinarizer
jobs_encoder = LabelBinarizer()
# One indicator column per class; with exactly two classes LabelBinarizer
# collapses the output to a single 0/1 column.
transformed = jobs_encoder.fit_transform(data['Profession'])
# Name the columns after the classes when there is one column per class;
# otherwise fall back to pandas' default integer labels.
if transformed.shape[1] == len(jobs_encoder.classes_):
    ohe_columns = [f"Profession_{label}" for label in jobs_encoder.classes_]
else:
    ohe_columns = None
# Reuse data's index so the axis=1 concat pairs each encoded row with its
# source row instead of aligning on a fresh RangeIndex.
ohe_df = pd.DataFrame(transformed, columns=ohe_columns, index=data.index)
data = pd.concat([data, ohe_df], axis=1).drop(['Profession'], axis=1)
Below is an approach suggested by Kaggle Learn. I do not think there is currently a simpler way to go from an original pandas DataFrame to a one-hot encoded DataFrame.
# Apply one-hot encoder to each column with categorical data.
# The encoder is left on its default (sparse) output and densified with
# .toarray(), which avoids the `sparse=` keyword that newer scikit-learn
# releases renamed to `sparse_output=`.
OH_encoder = OneHotEncoder(handle_unknown='ignore')
OH_train_matrix = OH_encoder.fit_transform(X_train[low_cardinality_cols]).toarray()
OH_valid_matrix = OH_encoder.transform(X_valid[low_cardinality_cols]).toarray()
# String feature names ("<column>_<category>") instead of bare integers, so the
# combined frame does not end up with mixed int/str column labels.
OH_feature_names = OH_encoder.get_feature_names_out()
# One-hot encoding returns plain arrays; restore the original row index so the
# concat below aligns row-for-row.
OH_cols_train = pd.DataFrame(OH_train_matrix, columns=OH_feature_names, index=X_train.index)
OH_cols_valid = pd.DataFrame(OH_valid_matrix, columns=OH_feature_names, index=X_valid.index)
# Remove categorical columns (will replace with one-hot encoding)
numeric_X_train = X_train.drop(low_cardinality_cols, axis=1)
numeric_X_valid = X_valid.drop(low_cardinality_cols, axis=1)
# Add one-hot encoded columns to numerical features
new_X_train = pd.concat([numeric_X_train, OH_cols_train], axis=1)
new_X_valid = pd.concat([numeric_X_valid, OH_cols_valid], axis=1)
print(new_X_train)
This will do the trick. Remove plotly parts if you are not interested in viz. Also change printmd to print if you don’t need markdown.
def fn_cat_onehot(df):
    """Generate one-hot encoded features for all categorical columns in df.

    Every column of dtype ``category`` is replaced by the indicator columns
    produced by a ``OneHotEncoder``; all other columns are kept as they are and
    the encoded columns are appended after them. Progress is reported through
    ``printmd`` and a histogram of each encoded block is shown via ``display``.

    Args:
        df: pandas DataFrame to encode. It is not modified.

    Returns:
        A new DataFrame with the categorical columns replaced, or ``df`` itself
        when it has no ``category`` columns.
    """
    from sklearn.preprocessing import OneHotEncoder

    printmd(f"df shape: {df.shape}")

    # NaN handling: only reported here, nothing is imputed or dropped
    nan_count = df.isna().sum().sum()
    if nan_count > 0:
        printmd(f"NaN = **{nan_count}** will be categorized under feature_nan columns")

    # generation
    # Default (sparse) encoder output densified with .toarray(); this avoids the
    # `sparse=` keyword that newer scikit-learn releases renamed.
    model_oh = OneHotEncoder(handle_unknown="ignore")
    cat_cols = list(df.select_dtypes("category").columns)
    encoded_frames = []
    for c in cat_cols:
        printmd(f"Encoding **{c}**")  # which column
        # matrix of new features and values for this one column
        matrix = model_oh.fit_transform(df[[c]]).toarray()
        names = model_oh.get_feature_names_out()  # names for these features
        df_oh = pd.DataFrame(data=matrix, columns=names, index=df.index)
        display(df_oh.plot.hist())
        encoded_frames.append(df_oh)

    if encoded_frames:
        # Drop the categorical columns and append every encoded block in a
        # single concat, so the result is all numerical for modelling.
        df = pd.concat([df.drop(columns=cat_cols), *encoded_frames], axis=1)

    printmd(f"#### New df shape: **{df.shape}**")
    return df
I encapsulated @IndreshKumar’s solution in a sklearn transformer:
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import OneHotEncoder
class CategoricalOneHot(BaseEstimator, TransformerMixin):
    """One-hot encode every column whose name contains one of the key words.

    Parameters
    ----------
    list_key_words : list of str, optional
        Substrings searched for in the column names of the frame passed to
        ``fit``. ``None`` (the default) selects no columns, which makes the
        transformer a pass-through.

    Attributes
    ----------
    list_cat_col : list of str
        Columns selected by ``fit``, in key-word order, without duplicates.
    oh_dict : dict
        Maps each selected column to ``(fitted OneHotEncoder, output names)``.
    """

    def __init__(self, list_key_words=None):
        self.list_key_words = list_key_words
        # Present before fit for backward compatibility; rebuilt on every fit.
        self.oh_dict = {}

    def fit(self, X, y=None):
        """Select the matching columns of ``X`` and fit one encoder per column.

        ``y`` is ignored. Returns ``self``.
        """
        key_words = self.list_key_words or []
        # Start from a clean slate so refitting never keeps stale encoders.
        self.oh_dict = {}
        self.list_cat_col = []
        for key_word in key_words:
            for col in X.columns:
                # A column matching several key words must be listed only once:
                # transform removes it after encoding, so a second pass over
                # the same column would fail.
                if (
                    isinstance(col, str)
                    and key_word in col
                    and col not in self.list_cat_col
                ):
                    self.list_cat_col.append(col)
        for col in self.list_cat_col:
            # Default (sparse) output; transform densifies it with .toarray(),
            # avoiding the `sparse=` keyword that newer scikit-learn renamed.
            oh = OneHotEncoder(handle_unknown="ignore")
            oh.fit(X[[col]])
            self.oh_dict[col] = (oh, oh.get_feature_names_out())
        return self

    def transform(self, X):
        """Return a copy of ``X`` with the selected columns one-hot encoded.

        Unselected columns keep their order; the encoded columns are appended
        after them in the order of ``list_cat_col``. ``X`` is not modified.
        """
        encoded_frames = []
        for col in self.list_cat_col:
            oh, names = self.oh_dict[col]
            encoded_frames.append(
                pd.DataFrame(
                    data=oh.transform(X[[col]]).toarray(),
                    columns=names,
                    index=X.index,
                )
            )
        if not encoded_frames:
            return X.copy()
        # Single concat instead of growing the frame once per column.
        return pd.concat(
            [X.drop(columns=self.list_cat_col), *encoded_frames], axis=1
        )
if __name__ == "__main__":
    # Demo: two string columns, each picked up by a key word equal to its name.
    demo_frame = pd.DataFrame(
        {'city': ['a', 'a', 'e', 'b'], 'state': ['f', 'c', 'd', 'd']}
    )
    demo_encoder = CategoricalOneHot(list_key_words=['city', 'state'])
    print(demo_encoder.fit_transform(demo_frame))
Example:
Given a data frame with two categorical columns:
city state
0 a f
1 a c
2 e d
3 b d
the output looks like this:
city_a city_b city_e state_c state_d state_f
0 1.0 0.0 0.0 0.0 0.0 1.0
1 1.0 0.0 0.0 1.0 0.0 0.0
2 0.0 0.0 1.0 0.0 1.0 0.0
3 0.0 1.0 0.0 0.0 1.0 0.0
I’m trying to replace a column of strings in a Pandas DataFrame with its one-hot encoded equivalent using Scikit-Learn’s OneHotEncoder. My code below doesn’t work:
from sklearn.preprocessing import OneHotEncoder
# data is a Pandas DataFrame
jobs_encoder = OneHotEncoder()
jobs_encoder.fit(data['Profession'].unique().reshape(1, -1))
data['Profession'] = jobs_encoder.transform(data['Profession'].to_numpy().reshape(-1, 1))
It produces the following error (strings in the list are omitted):
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-91-3a1f568322f5> in <module>()
3 jobs_encoder = OneHotEncoder()
4 jobs_encoder.fit(data['Profession'].unique().reshape(1, -1))
----> 5 data['Profession'] = jobs_encoder.transform(data['Profession'].to_numpy().reshape(-1, 1))
/usr/local/anaconda3/envs/ml/lib/python3.6/site-packages/sklearn/preprocessing/_encoders.py in transform(self, X)
730 copy=True)
731 else:
--> 732 return self._transform_new(X)
733
734 def inverse_transform(self, X):
/usr/local/anaconda3/envs/ml/lib/python3.6/site-packages/sklearn/preprocessing/_encoders.py in _transform_new(self, X)
678 """New implementation assuming categorical input"""
679 # validation of X happens in _check_X called by _transform
--> 680 X_int, X_mask = self._transform(X, handle_unknown=self.handle_unknown)
681
682 n_samples, n_features = X_int.shape
/usr/local/anaconda3/envs/ml/lib/python3.6/site-packages/sklearn/preprocessing/_encoders.py in _transform(self, X, handle_unknown)
120 msg = ("Found unknown categories {0} in column {1}"
121 " during transform".format(diff, i))
--> 122 raise ValueError(msg)
123 else:
124 # Set the problematic rows to an acceptable value and
ValueError: Found unknown categories ['...', ..., '...'] in column 0 during transform
Here’s some sample data:
data['Profession'] =
0 unkn
1 safe
2 rece
3 unkn
4 lead
...
111988 indu
111989 seni
111990 mess
111991 seni
111992 proj
Name: Profession, Length: 111993, dtype: object
What exactly am I doing wrong?
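OneHotEncoder encodes categorical features as a one-hot numeric array. Its transform method returns a sparse matrix if sparse=True, otherwise it returns a 2-d array.
The ValueError itself comes from the fit call: reshape(1, -1) turns the unique values into a single row with one column per value, so the encoder sees many features with one category each instead of one feature with many categories. Fit on a single column instead, e.g. data['Profession'].unique().reshape(-1, 1).
You can’t cast a 2-d array (or sparse matrix) into a Pandas Series. You must create a Pandas Series (a column in a Pandas DataFrame) for each category.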
I would recommend pandas.get_dummies instead:
# One-hot encode the "Profession" column with pandas alone; drop_first=True
# leaves out the first category's indicator column.
data = pd.get_dummies(
    data,
    columns=['Profession'],
    prefix=['Profession'],
    drop_first=True,
)
EDIT:
Using Sklearn OneHotEncoder:
# jobs_encoder must already be fitted on a single column, i.e. on an array of
# shape (n_samples, 1).
transformed = jobs_encoder.transform(data['Profession'].to_numpy().reshape(-1, 1))
# OneHotEncoder returns a SciPy sparse matrix by default, which pd.DataFrame
# cannot unpack into one column per category; densify it first.
if hasattr(transformed, "toarray"):
    transformed = transformed.toarray()
# Create a Pandas DataFrame of the one-hot encoded column. Reusing data's index
# keeps the axis=1 concat below row-aligned even when data does not carry a
# default RangeIndex.
ohe_df = pd.DataFrame(
    transformed,
    columns=jobs_encoder.get_feature_names_out(),
    index=data.index,
)
# Concat with the original data, then drop the now-redundant source column
data = pd.concat([data, ohe_df], axis=1).drop(['Profession'], axis=1)
Other options: if you are doing hyperparameter tuning with GridSearchCV, it’s recommended to use ColumnTransformer and FeatureUnion with Pipeline, or make_column_transformer directly.
It turned out that Scikit-Learn’s LabelBinarizer gave me better luck in converting the data to one-hot encoded format. With help from Amnie’s solution, my final code is as follows:
import pandas as pd
from sklearn.preprocessing import LabelBinarizer
jobs_encoder = LabelBinarizer()
# One indicator column per class; with exactly two classes LabelBinarizer
# collapses the output to a single 0/1 column.
transformed = jobs_encoder.fit_transform(data['Profession'])
# Name the columns after the classes when there is one column per class;
# otherwise fall back to pandas' default integer labels.
if transformed.shape[1] == len(jobs_encoder.classes_):
    ohe_columns = [f"Profession_{label}" for label in jobs_encoder.classes_]
else:
    ohe_columns = None
# Reuse data's index so the axis=1 concat pairs each encoded row with its
# source row instead of aligning on a fresh RangeIndex.
ohe_df = pd.DataFrame(transformed, columns=ohe_columns, index=data.index)
data = pd.concat([data, ohe_df], axis=1).drop(['Profession'], axis=1)
Below is an approach suggested by Kaggle Learn. I do not think there is currently a simpler way to go from an original pandas DataFrame to a one-hot encoded DataFrame.
# Apply one-hot encoder to each column with categorical data.
# The encoder is left on its default (sparse) output and densified with
# .toarray(), which avoids the `sparse=` keyword that newer scikit-learn
# releases renamed to `sparse_output=`.
OH_encoder = OneHotEncoder(handle_unknown='ignore')
OH_train_matrix = OH_encoder.fit_transform(X_train[low_cardinality_cols]).toarray()
OH_valid_matrix = OH_encoder.transform(X_valid[low_cardinality_cols]).toarray()
# String feature names ("<column>_<category>") instead of bare integers, so the
# combined frame does not end up with mixed int/str column labels.
OH_feature_names = OH_encoder.get_feature_names_out()
# One-hot encoding returns plain arrays; restore the original row index so the
# concat below aligns row-for-row.
OH_cols_train = pd.DataFrame(OH_train_matrix, columns=OH_feature_names, index=X_train.index)
OH_cols_valid = pd.DataFrame(OH_valid_matrix, columns=OH_feature_names, index=X_valid.index)
# Remove categorical columns (will replace with one-hot encoding)
numeric_X_train = X_train.drop(low_cardinality_cols, axis=1)
numeric_X_valid = X_valid.drop(low_cardinality_cols, axis=1)
# Add one-hot encoded columns to numerical features
new_X_train = pd.concat([numeric_X_train, OH_cols_train], axis=1)
new_X_valid = pd.concat([numeric_X_valid, OH_cols_valid], axis=1)
print(new_X_train)
This will do the trick. Remove plotly parts if you are not interested in viz. Also change printmd to print if you don’t need markdown.
def fn_cat_onehot(df):
    """Generate one-hot encoded features for all categorical columns in df.

    Every column of dtype ``category`` is replaced by the indicator columns
    produced by a ``OneHotEncoder``; all other columns are kept as they are and
    the encoded columns are appended after them. Progress is reported through
    ``printmd`` and a histogram of each encoded block is shown via ``display``.

    Args:
        df: pandas DataFrame to encode. It is not modified.

    Returns:
        A new DataFrame with the categorical columns replaced, or ``df`` itself
        when it has no ``category`` columns.
    """
    from sklearn.preprocessing import OneHotEncoder

    printmd(f"df shape: {df.shape}")

    # NaN handling: only reported here, nothing is imputed or dropped
    nan_count = df.isna().sum().sum()
    if nan_count > 0:
        printmd(f"NaN = **{nan_count}** will be categorized under feature_nan columns")

    # generation
    # Default (sparse) encoder output densified with .toarray(); this avoids the
    # `sparse=` keyword that newer scikit-learn releases renamed.
    model_oh = OneHotEncoder(handle_unknown="ignore")
    cat_cols = list(df.select_dtypes("category").columns)
    encoded_frames = []
    for c in cat_cols:
        printmd(f"Encoding **{c}**")  # which column
        # matrix of new features and values for this one column
        matrix = model_oh.fit_transform(df[[c]]).toarray()
        names = model_oh.get_feature_names_out()  # names for these features
        df_oh = pd.DataFrame(data=matrix, columns=names, index=df.index)
        display(df_oh.plot.hist())
        encoded_frames.append(df_oh)

    if encoded_frames:
        # Drop the categorical columns and append every encoded block in a
        # single concat, so the result is all numerical for modelling.
        df = pd.concat([df.drop(columns=cat_cols), *encoded_frames], axis=1)

    printmd(f"#### New df shape: **{df.shape}**")
    return df
I encapsulated @IndreshKumar’s solution in a sklearn transformer:
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import OneHotEncoder
class CategoricalOneHot(BaseEstimator, TransformerMixin):
    """One-hot encode every column whose name contains one of the key words.

    Parameters
    ----------
    list_key_words : list of str, optional
        Substrings searched for in the column names of the frame passed to
        ``fit``. ``None`` (the default) selects no columns, which makes the
        transformer a pass-through.

    Attributes
    ----------
    list_cat_col : list of str
        Columns selected by ``fit``, in key-word order, without duplicates.
    oh_dict : dict
        Maps each selected column to ``(fitted OneHotEncoder, output names)``.
    """

    def __init__(self, list_key_words=None):
        self.list_key_words = list_key_words
        # Present before fit for backward compatibility; rebuilt on every fit.
        self.oh_dict = {}

    def fit(self, X, y=None):
        """Select the matching columns of ``X`` and fit one encoder per column.

        ``y`` is ignored. Returns ``self``.
        """
        key_words = self.list_key_words or []
        # Start from a clean slate so refitting never keeps stale encoders.
        self.oh_dict = {}
        self.list_cat_col = []
        for key_word in key_words:
            for col in X.columns:
                # A column matching several key words must be listed only once:
                # transform removes it after encoding, so a second pass over
                # the same column would fail.
                if (
                    isinstance(col, str)
                    and key_word in col
                    and col not in self.list_cat_col
                ):
                    self.list_cat_col.append(col)
        for col in self.list_cat_col:
            # Default (sparse) output; transform densifies it with .toarray(),
            # avoiding the `sparse=` keyword that newer scikit-learn renamed.
            oh = OneHotEncoder(handle_unknown="ignore")
            oh.fit(X[[col]])
            self.oh_dict[col] = (oh, oh.get_feature_names_out())
        return self

    def transform(self, X):
        """Return a copy of ``X`` with the selected columns one-hot encoded.

        Unselected columns keep their order; the encoded columns are appended
        after them in the order of ``list_cat_col``. ``X`` is not modified.
        """
        encoded_frames = []
        for col in self.list_cat_col:
            oh, names = self.oh_dict[col]
            encoded_frames.append(
                pd.DataFrame(
                    data=oh.transform(X[[col]]).toarray(),
                    columns=names,
                    index=X.index,
                )
            )
        if not encoded_frames:
            return X.copy()
        # Single concat instead of growing the frame once per column.
        return pd.concat(
            [X.drop(columns=self.list_cat_col), *encoded_frames], axis=1
        )
if __name__ == "__main__":
    # Demo: two string columns, each picked up by a key word equal to its name.
    demo_frame = pd.DataFrame(
        {'city': ['a', 'a', 'e', 'b'], 'state': ['f', 'c', 'd', 'd']}
    )
    demo_encoder = CategoricalOneHot(list_key_words=['city', 'state'])
    print(demo_encoder.fit_transform(demo_frame))
Example:
Given a data frame with two categorical columns:
city state
0 a f
1 a c
2 e d
3 b d
the output looks like this:
city_a city_b city_e state_c state_d state_f
0 1.0 0.0 0.0 0.0 0.0 1.0
1 1.0 0.0 0.0 1.0 0.0 0.0
2 0.0 0.0 1.0 0.0 1.0 0.0
3 0.0 1.0 0.0 0.0 1.0 0.0