How to run sklearn.preprocessing.OrdinalEncoder on several columns?
Question:
This code raises an error:
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OrdinalEncoder
# Define categorical columns and mapping dictionary
categorical_cols = ['color', 'shape', 'size']
mapping = {'red': 0, 'green': 1, 'blue': 2, 'circle': 0, 'square': 1, 'triangle': 2, 'small': 0, 'medium': 1, 'large': 2}
cols = ['color','size']
# Define ColumnTransformer to preprocess categorical columns
preprocessor = ColumnTransformer(
transformers=[
('orlEncdr_with_map', Pipeline(steps=[('orlEnc_with_map', OrdinalEncoder(categories=[list(mapping.keys())], dtype=int))]), cols),
])
# Load sample data
data = pd.DataFrame({'color': ['red', 'green', 'blue', 'red'], 'shape': ['circle', 'square', 'triangle', 'triangle'], 'size': ['small', 'medium', 'large', 'medium']})
# Apply preprocessor to data
preprocessed_data = preprocessor.fit_transform(data)
# View preprocessed data
print(preprocessed_data)
Error:
ValueError Traceback (most recent call last)
~\AppData\Local\Temp\ipykernel_38148\1089712396.py in <module>
18
19 # Apply preprocessor to data
---> 20 preprocessed_data = preprocessor.fit_transform(data)
21
22 # View preprocessed data
~\Anaconda3\lib\site-packages\sklearn\compose\_column_transformer.py in fit_transform(self, X, y)
673 self._validate_remainder(X)
674
--> 675 result = self._fit_transform(X, y, _fit_transform_one)
676
677 if not result:
~\Anaconda3\lib\site-packages\sklearn\compose\_column_transformer.py in _fit_transform(self, X, y, func, fitted, column_as_strings)
604 )
605 try:
--> 606 return Parallel(n_jobs=self.n_jobs)(
607 delayed(func)(
608 transformer=clone(trans) if not fitted else trans,
~\Anaconda3\lib\site-packages\joblib\parallel.py in __call__(self, iterable)
1046 # remaining jobs.
1047 self._iterating = False
-> 1048 if self.dispatch_one_batch(iterator):
1049 self._iterating = self._original_iterator is not None
1050
~\Anaconda3\lib\site-packages\joblib\parallel.py in dispatch_one_batch(self, iterator)
862 return False
863 else:
--> 864 self._dispatch(tasks)
865 return True
866
~\Anaconda3\lib\site-packages\joblib\parallel.py in _dispatch(self, batch)
780 with self._lock:
781 job_idx = len(self._jobs)
--> 782 job = self._backend.apply_async(batch, callback=cb)
783 # A job can complete so quickly than its callback is
784 # called before we get here, causing self._jobs to
~\Anaconda3\lib\site-packages\joblib\_parallel_backends.py in apply_async(self, func, callback)
206 def apply_async(self, func, callback=None):
207 """Schedule a func to be run"""
--> 208 result = ImmediateResult(func)
209 if callback:
210 callback(result)
~\Anaconda3\lib\site-packages\joblib\_parallel_backends.py in __init__(self, batch)
570 # Don't delay the application, to avoid keeping the input
571 # arguments in memory
--> 572 self.results = batch()
573
574 def get(self):
~\Anaconda3\lib\site-packages\joblib\parallel.py in __call__(self)
261 # change the default number of processes to -1
262 with parallel_backend(self._backend, n_jobs=self._n_jobs):
--> 263 return [func(*args, **kwargs)
264 for func, args, kwargs in self.items]
265
~\Anaconda3\lib\site-packages\joblib\parallel.py in <listcomp>(.0)
261 # change the default number of processes to -1
262 with parallel_backend(self._backend, n_jobs=self._n_jobs):
--> 263 return [func(*args, **kwargs)
264 for func, args, kwargs in self.items]
265
~\Anaconda3\lib\site-packages\sklearn\utils\fixes.py in __call__(self, *args, **kwargs)
214 def __call__(self, *args, **kwargs):
215 with config_context(**self.config):
--> 216 return self.function(*args, **kwargs)
217
218
~\Anaconda3\lib\site-packages\sklearn\pipeline.py in _fit_transform_one(transformer, X, y, weight, message_clsname, message, **fit_params)
891 with _print_elapsed_time(message_clsname, message):
892 if hasattr(transformer, "fit_transform"):
--> 893 res = transformer.fit_transform(X, y, **fit_params)
894 else:
895 res = transformer.fit(X, y, **fit_params).transform(X)
~\Anaconda3\lib\site-packages\sklearn\pipeline.py in fit_transform(self, X, y, **fit_params)
432 fit_params_last_step = fit_params_steps[self.steps[-1][0]]
433 if hasattr(last_step, "fit_transform"):
--> 434 return last_step.fit_transform(Xt, y, **fit_params_last_step)
435 else:
436 return last_step.fit(Xt, y, **fit_params_last_step).transform(Xt)
~\Anaconda3\lib\site-packages\sklearn\base.py in fit_transform(self, X, y, **fit_params)
850 if y is None:
851 # fit method of arity 1 (unsupervised transformation)
--> 852 return self.fit(X, **fit_params).transform(X)
853 else:
854 # fit method of arity 2 (supervised transformation)
~\Anaconda3\lib\site-packages\sklearn\preprocessing\_encoders.py in fit(self, X, y)
884
885 # `_fit` will only raise an error when `self.handle_unknown="error"`
--> 886 self._fit(X, handle_unknown=self.handle_unknown, force_all_finite="allow-nan")
887
888 if self.handle_unknown == "use_encoded_value":
~\Anaconda3\lib\site-packages\sklearn\preprocessing\_encoders.py in _fit(self, X, handle_unknown, force_all_finite)
82 if self.categories != "auto":
83 if len(self.categories) != n_features:
---> 84 raise ValueError(
85 "Shape mismatch: if categories is an array,"
86 " it has to be of shape (n_features,)."
ValueError: Shape mismatch: if categories is an array, it has to be of shape (n_features,).
If you change it in this way, it works:
cols = ['size']
How can I change it so that it works for several columns?
Answers:
First, you don’t need the pipeline (within the ColumnTransformer
), but it should work nevertheless.
Ordinal encoding works with this pared down ColumnTransformer
:
ct = ColumnTransformer(transformers=[('oe',
OrdinalEncoder(),
['color', 'size'])
],
remainder='passthrough')
ct.fit_transform(data)
[Omit the remainder='passthrough'
if you don’t want ‘shape’ passed through.]
So it looks like there is a problem with the way you define the categories. Do you really need to do this? OrdinalEncoder
defines the categories automatically based on the data seen in the fit
method.
If, however, you do want to explicitly define the categories you can do so with a list of lists:
ct2=ColumnTransformer(transformers=[('oe',
OrdinalEncoder(categories=[['green','blue','red'],
['circle','square','triangle']]),
['color','shape'])
])
ct2.fit_transform(data)
You can’t use the mapping dictionary
here. You have to replace it with a list of lists (one list of categories per column) to control the mapping.
# Define categorical columns and mapping dictionary
categorical_cols = ['color', 'shape', 'size']
cols = ['color', 'size']
cats = [['red', 'green', 'blue'], ['small', 'medium', 'large']] # <- HERE
# Define ColumnTransformer to preprocess categorical columns
preprocessor = ColumnTransformer(
transformers=[
('orlEncdr_with_map', Pipeline(steps=[('orlEnc_with_map', OrdinalEncoder(categories=cats, dtype=int))]), cols),
])
Output:
>>> preprocessed_data
array([[0, 0],
[1, 1],
[2, 2],
[0, 1]])
If you want the codes to start from 1 instead of 0, you have to pad each category list on the left:
cats = [[None, 'red', 'green', 'blue'], [None, 'small', 'medium', 'large']]
>>> preprocessed_data
array([[1, 1],
[2, 2],
[3, 3],
[1, 2]])
This code raises an error:
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OrdinalEncoder
# Define categorical columns and mapping dictionary
categorical_cols = ['color', 'shape', 'size']
mapping = {'red': 0, 'green': 1, 'blue': 2, 'circle': 0, 'square': 1, 'triangle': 2, 'small': 0, 'medium': 1, 'large': 2}
cols = ['color','size']
# Define ColumnTransformer to preprocess categorical columns
preprocessor = ColumnTransformer(
transformers=[
('orlEncdr_with_map', Pipeline(steps=[('orlEnc_with_map', OrdinalEncoder(categories=[list(mapping.keys())], dtype=int))]), cols),
])
# Load sample data
data = pd.DataFrame({'color': ['red', 'green', 'blue', 'red'], 'shape': ['circle', 'square', 'triangle', 'triangle'], 'size': ['small', 'medium', 'large', 'medium']})
# Apply preprocessor to data
preprocessed_data = preprocessor.fit_transform(data)
# View preprocessed data
print(preprocessed_data)
Error:
ValueError Traceback (most recent call last)
~\AppData\Local\Temp\ipykernel_38148\1089712396.py in <module>
18
19 # Apply preprocessor to data
---> 20 preprocessed_data = preprocessor.fit_transform(data)
21
22 # View preprocessed data
~\Anaconda3\lib\site-packages\sklearn\compose\_column_transformer.py in fit_transform(self, X, y)
673 self._validate_remainder(X)
674
--> 675 result = self._fit_transform(X, y, _fit_transform_one)
676
677 if not result:
~\Anaconda3\lib\site-packages\sklearn\compose\_column_transformer.py in _fit_transform(self, X, y, func, fitted, column_as_strings)
604 )
605 try:
--> 606 return Parallel(n_jobs=self.n_jobs)(
607 delayed(func)(
608 transformer=clone(trans) if not fitted else trans,
~\Anaconda3\lib\site-packages\joblib\parallel.py in __call__(self, iterable)
1046 # remaining jobs.
1047 self._iterating = False
-> 1048 if self.dispatch_one_batch(iterator):
1049 self._iterating = self._original_iterator is not None
1050
~\Anaconda3\lib\site-packages\joblib\parallel.py in dispatch_one_batch(self, iterator)
862 return False
863 else:
--> 864 self._dispatch(tasks)
865 return True
866
~\Anaconda3\lib\site-packages\joblib\parallel.py in _dispatch(self, batch)
780 with self._lock:
781 job_idx = len(self._jobs)
--> 782 job = self._backend.apply_async(batch, callback=cb)
783 # A job can complete so quickly than its callback is
784 # called before we get here, causing self._jobs to
~\Anaconda3\lib\site-packages\joblib\_parallel_backends.py in apply_async(self, func, callback)
206 def apply_async(self, func, callback=None):
207 """Schedule a func to be run"""
--> 208 result = ImmediateResult(func)
209 if callback:
210 callback(result)
~\Anaconda3\lib\site-packages\joblib\_parallel_backends.py in __init__(self, batch)
570 # Don't delay the application, to avoid keeping the input
571 # arguments in memory
--> 572 self.results = batch()
573
574 def get(self):
~\Anaconda3\lib\site-packages\joblib\parallel.py in __call__(self)
261 # change the default number of processes to -1
262 with parallel_backend(self._backend, n_jobs=self._n_jobs):
--> 263 return [func(*args, **kwargs)
264 for func, args, kwargs in self.items]
265
~\Anaconda3\lib\site-packages\joblib\parallel.py in <listcomp>(.0)
261 # change the default number of processes to -1
262 with parallel_backend(self._backend, n_jobs=self._n_jobs):
--> 263 return [func(*args, **kwargs)
264 for func, args, kwargs in self.items]
265
~\Anaconda3\lib\site-packages\sklearn\utils\fixes.py in __call__(self, *args, **kwargs)
214 def __call__(self, *args, **kwargs):
215 with config_context(**self.config):
--> 216 return self.function(*args, **kwargs)
217
218
~\Anaconda3\lib\site-packages\sklearn\pipeline.py in _fit_transform_one(transformer, X, y, weight, message_clsname, message, **fit_params)
891 with _print_elapsed_time(message_clsname, message):
892 if hasattr(transformer, "fit_transform"):
--> 893 res = transformer.fit_transform(X, y, **fit_params)
894 else:
895 res = transformer.fit(X, y, **fit_params).transform(X)
~\Anaconda3\lib\site-packages\sklearn\pipeline.py in fit_transform(self, X, y, **fit_params)
432 fit_params_last_step = fit_params_steps[self.steps[-1][0]]
433 if hasattr(last_step, "fit_transform"):
--> 434 return last_step.fit_transform(Xt, y, **fit_params_last_step)
435 else:
436 return last_step.fit(Xt, y, **fit_params_last_step).transform(Xt)
~\Anaconda3\lib\site-packages\sklearn\base.py in fit_transform(self, X, y, **fit_params)
850 if y is None:
851 # fit method of arity 1 (unsupervised transformation)
--> 852 return self.fit(X, **fit_params).transform(X)
853 else:
854 # fit method of arity 2 (supervised transformation)
~\Anaconda3\lib\site-packages\sklearn\preprocessing\_encoders.py in fit(self, X, y)
884
885 # `_fit` will only raise an error when `self.handle_unknown="error"`
--> 886 self._fit(X, handle_unknown=self.handle_unknown, force_all_finite="allow-nan")
887
888 if self.handle_unknown == "use_encoded_value":
~\Anaconda3\lib\site-packages\sklearn\preprocessing\_encoders.py in _fit(self, X, handle_unknown, force_all_finite)
82 if self.categories != "auto":
83 if len(self.categories) != n_features:
---> 84 raise ValueError(
85 "Shape mismatch: if categories is an array,"
86 " it has to be of shape (n_features,)."
ValueError: Shape mismatch: if categories is an array, it has to be of shape (n_features,).
If you change it in this way, it works:
cols = ['size']
How can I change it so that it works for several columns?
First, you don’t need the pipeline (within the ColumnTransformer
), but it should work nevertheless.
Ordinal encoding works with this pared down ColumnTransformer
:
ct = ColumnTransformer(transformers=[('oe',
OrdinalEncoder(),
['color', 'size'])
],
remainder='passthrough')
ct.fit_transform(data)
[Omit the remainder='passthrough'
if you don’t want ‘shape’ passed through.]
So it looks like there is a problem with the way you define the categories. Do you really need to do this? OrdinalEncoder
defines the categories automatically based on the data seen in the fit
method.
If, however, you do want to explicitly define the categories you can do so with a list of lists:
ct2=ColumnTransformer(transformers=[('oe',
OrdinalEncoder(categories=[['green','blue','red'],
['circle','square','triangle']]),
['color','shape'])
])
ct2.fit_transform(data)
You can’t use the mapping dictionary
here. You have to replace it with a list of lists (one list of categories per column) to control the mapping.
# Define categorical columns and mapping dictionary
categorical_cols = ['color', 'shape', 'size']
cols = ['color', 'size']
cats = [['red', 'green', 'blue'], ['small', 'medium', 'large']] # <- HERE
# Define ColumnTransformer to preprocess categorical columns
preprocessor = ColumnTransformer(
transformers=[
('orlEncdr_with_map', Pipeline(steps=[('orlEnc_with_map', OrdinalEncoder(categories=cats, dtype=int))]), cols),
])
Output:
>>> preprocessed_data
array([[0, 0],
[1, 1],
[2, 2],
[0, 1]])
If you want the codes to start from 1 instead of 0, you have to pad each category list on the left:
cats = [[None, 'red', 'green', 'blue'], [None, 'small', 'medium', 'large']]
>>> preprocessed_data
array([[1, 1],
[2, 2],
[3, 3],
[1, 2]])