How to run sklearn.preprocessing.OrdinalEncoder on several columns?

Question:

this code raises an error:

import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OrdinalEncoder

# Define categorical columns and mapping dictionary
categorical_cols = ['color', 'shape', 'size']
mapping = {'red': 0, 'green': 1, 'blue': 2, 'circle': 0, 'square': 1, 'triangle': 2, 'small': 0, 'medium': 1, 'large': 2}
cols = ['color','size']
# Define ColumnTransformer to preprocess categorical columns
preprocessor = ColumnTransformer(
  transformers=[
    ('orlEncdr_with_map', Pipeline(steps=[('orlEnc_with_map', OrdinalEncoder(categories=[list(mapping.keys())], dtype=int))]), cols),
  ])

# Load sample data
data = pd.DataFrame({'color': ['red', 'green', 'blue', 'red'], 'shape': ['circle', 'square', 'triangle', 'triangle'], 'size': ['small', 'medium', 'large', 'medium']})

# Apply preprocessor to data
preprocessed_data = preprocessor.fit_transform(data)

# View preprocessed data
print(preprocessed_data)

Error:

ValueError                                Traceback (most recent call last)
~\AppData\Local\Temp\ipykernel_381481089712396.py in <module>
     18 
     19 # Apply preprocessor to data
---> 20 preprocessed_data = preprocessor.fit_transform(data)
     21 
     22 # View preprocessed data

~\Anaconda3\lib\site-packages\sklearn\compose\_column_transformer.py in fit_transform(self, X, y)
    673         self._validate_remainder(X)
    674 
--> 675         result = self._fit_transform(X, y, _fit_transform_one)
    676 
    677         if not result:

~\Anaconda3\lib\site-packages\sklearn\compose\_column_transformer.py in _fit_transform(self, X, y, func, fitted, column_as_strings)
    604         )
    605         try:
--> 606             return Parallel(n_jobs=self.n_jobs)(
    607                 delayed(func)(
    608                     transformer=clone(trans) if not fitted else trans,

~\Anaconda3\lib\site-packages\joblib\parallel.py in __call__(self, iterable)
   1046             # remaining jobs.
   1047             self._iterating = False
-> 1048             if self.dispatch_one_batch(iterator):
   1049                 self._iterating = self._original_iterator is not None
   1050 

~\Anaconda3\lib\site-packages\joblib\parallel.py in dispatch_one_batch(self, iterator)
    862                 return False
    863             else:
--> 864                 self._dispatch(tasks)
    865                 return True
    866 

~\Anaconda3\lib\site-packages\joblib\parallel.py in _dispatch(self, batch)
    780         with self._lock:
    781             job_idx = len(self._jobs)
--> 782             job = self._backend.apply_async(batch, callback=cb)
    783             # A job can complete so quickly than its callback is
    784             # called before we get here, causing self._jobs to

~\Anaconda3\lib\site-packages\joblib\_parallel_backends.py in apply_async(self, func, callback)
    206     def apply_async(self, func, callback=None):
    207         """Schedule a func to be run"""
--> 208         result = ImmediateResult(func)
    209         if callback:
    210             callback(result)

~\Anaconda3\lib\site-packages\joblib\_parallel_backends.py in __init__(self, batch)
    570         # Don't delay the application, to avoid keeping the input
    571         # arguments in memory
--> 572         self.results = batch()
    573 
    574     def get(self):

~\Anaconda3\lib\site-packages\joblib\parallel.py in __call__(self)
    261         # change the default number of processes to -1
    262         with parallel_backend(self._backend, n_jobs=self._n_jobs):
--> 263             return [func(*args, **kwargs)
    264                     for func, args, kwargs in self.items]
    265 

~\Anaconda3\lib\site-packages\joblib\parallel.py in <listcomp>(.0)
    261         # change the default number of processes to -1
    262         with parallel_backend(self._backend, n_jobs=self._n_jobs):
--> 263             return [func(*args, **kwargs)
    264                     for func, args, kwargs in self.items]
    265 

~\Anaconda3\lib\site-packages\sklearn\utils\fixes.py in __call__(self, *args, **kwargs)
    214     def __call__(self, *args, **kwargs):
    215         with config_context(**self.config):
--> 216             return self.function(*args, **kwargs)
    217 
    218 

~\Anaconda3\lib\site-packages\sklearn\pipeline.py in _fit_transform_one(transformer, X, y, weight, message_clsname, message, **fit_params)
    891     with _print_elapsed_time(message_clsname, message):
    892         if hasattr(transformer, "fit_transform"):
--> 893             res = transformer.fit_transform(X, y, **fit_params)
    894         else:
    895             res = transformer.fit(X, y, **fit_params).transform(X)

~\Anaconda3\lib\site-packages\sklearn\pipeline.py in fit_transform(self, X, y, **fit_params)
    432             fit_params_last_step = fit_params_steps[self.steps[-1][0]]
    433             if hasattr(last_step, "fit_transform"):
--> 434                 return last_step.fit_transform(Xt, y, **fit_params_last_step)
    435             else:
    436                 return last_step.fit(Xt, y, **fit_params_last_step).transform(Xt)

~\Anaconda3\lib\site-packages\sklearn\base.py in fit_transform(self, X, y, **fit_params)
    850         if y is None:
    851             # fit method of arity 1 (unsupervised transformation)
--> 852             return self.fit(X, **fit_params).transform(X)
    853         else:
    854             # fit method of arity 2 (supervised transformation)

~\Anaconda3\lib\site-packages\sklearn\preprocessing\_encoders.py in fit(self, X, y)
    884 
    885         # `_fit` will only raise an error when `self.handle_unknown="error"`
--> 886         self._fit(X, handle_unknown=self.handle_unknown, force_all_finite="allow-nan")
    887 
    888         if self.handle_unknown == "use_encoded_value":

~\Anaconda3\lib\site-packages\sklearn\preprocessing\_encoders.py in _fit(self, X, handle_unknown, force_all_finite)
     82         if self.categories != "auto":
     83             if len(self.categories) != n_features:
---> 84                 raise ValueError(
     85                     "Shape mismatch: if categories is an array,"
     86                     " it has to be of shape (n_features,)."

ValueError: Shape mismatch: if categories is an array, it has to be of shape (n_features,).

If you change it in this way (encode only one column), it works:
cols = ['size']

How can I change it to work for several columns?

Asked By: parvij

||

Answers:

First, you don’t need the pipeline (within the ColumnTransformer), but it should work nevertheless.

Ordinal encoding works with this pared down ColumnTransformer:

ct = ColumnTransformer(transformers=[('oe', 
                                      OrdinalEncoder(), 
                                      ['color', 'size'])
                                    ],
                       remainder='passthrough')

ct.fit_transform(data)

[Omit the remainder='passthrough' if you don’t want ‘shape’ passed through.]

So it looks like there is a problem with the way you define the categories. Do you really need to do this? OrdinalEncoder defines the categories automatically based on the data seen in the fit method.

If, however, you do want to explicitly define the categories you can do so with a list of lists:

ct2=ColumnTransformer(transformers=[('oe',
                                     OrdinalEncoder(categories=[['green','blue','red'],
                                                                ['circle','square','triangle']]),
                                     ['color','shape'])
                                   ])
ct2.fit_transform(data)
Answered By: njp

You can’t use the mapping dictionary here. You have to replace it with a list of category lists — one list per column, in the same order as the columns — to control the mapping.

# Define categorical columns and mapping dictionary
categorical_cols = ['color', 'shape', 'size']
cols = ['color', 'size']
cats = [['red', 'green', 'blue'], ['small', 'medium', 'large']]  # <- HERE

# Define ColumnTransformer to preprocess categorical columns
preprocessor = ColumnTransformer(
  transformers=[
    ('orlEncdr_with_map', Pipeline(steps=[('orlEnc_with_map', OrdinalEncoder(categories=cats, dtype=int))]), cols),
  ])

Output:

>>> preprocessed_data
array([[0, 0],
       [1, 1],
       [2, 2],
       [0, 1]])

If you want the codes to start from 1 instead of 0, you have to pad each category list on the left with a placeholder entry:

cats = [[None, 'red', 'green', 'blue'], [None, 'small', 'medium', 'large']]
>>> preprocessed_data
array([[1, 1],
       [2, 2],
       [3, 3],
       [1, 2]])
Answered By: Corralien
Categories: questions Tags: , ,
Answers are sorted by their score. The answer accepted by the question owner as the best is marked with
at the top-right corner.