SHAP KernelExplainer using PipeLine
Question:
I want to use a scikit-learn pipeline (with one-hot encoding (OHE) as the preprocessing step and a simple linear regression as the model) together with the SHAP tools, but I am running into a problem.
Here is my data (I’m using my own modified version of the bike sharing dataset):
# Load the asker's modified bike sharing dataset.
bike_data = pd.read_csv("bike_outlier_clean.csv")

# Raw code -> label lookup for each column that is treated as categorical.
label_maps = {
    'season': {1:'Spring', 2:'Summer', 3:'Fall', 4: 'Winter'},
    'year': {0: 2011, 1: 2012},
    'holiday': {0: False, 1: True},
    'workingday': {0: False, 1: True},
    'weather_condition': {1:'Clear', 2:'Mist', 3:'Light Snow/Rain', 4: 'Heavy Snow/Rain'},
}
# Cast each of those columns to the pandas 'category' dtype, then relabel it.
for column, labels in label_maps.items():
    bike_data[column] = bike_data[column].astype('category').map(labels)

# Keep only the target plus the feature columns used for modelling.
feature_columns = [
    'season', 'month', 'year', 'weekday', 'holiday', 'workingday',
    'weather_condition', 'humidity', 'temp', 'windspeed',
]
bike_data = bike_data[['total_count'] + feature_columns]

# Features are everything except the target column 'total_count'.
x = bike_data.drop(columns='total_count')
y = bike_data['total_count']

# 70/30 train/test split with a fixed seed.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.3, random_state=42
)
And here is my pipeline:
# Columns that pandas does not report as numeric are the ones to one-hot encode.
numeric_columns = set(bike_data._get_numeric_data().columns)
category_columns = list(set(bike_data.columns).difference(numeric_columns))

# One-hot encode the categorical columns (selected by name); all remaining
# columns are passed through.
one_hot_step = ('cat', OneHotEncoder(), category_columns)
preprocessor = ColumnTransformer(
    transformers=[one_hot_step],
    remainder='passthrough',
)

# Preprocessing followed by a plain linear regression, fitted on the
# training DataFrame.
model = LinearRegression()
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', model),
])
pipeline.fit(x_train, y_train)
Finally, I create the KernelSHAP explainer:
# Background data for the explainer, drawn from the feature DataFrame x.
background = shap.sample(x, 5)
# NOTE: this is the call at which the AttributeError is reported.
explainer = shap.KernelExplainer(pipeline.predict, background)
However, that is where the error occurs:
123 # Make a copy so that the feature names are not removed from the original model
124 out = copy.deepcopy(out)
--> 125 out.f.__self__.feature_names_in_ = None
126
127 return out
AttributeError: can't set attribute 'feature_names_in_'
I’m not sure what I should do to fix it.
Answers:
SHAP doesn’t behave well with a Pipeline object, so I suggest the following (note the point where I switch from a pandas DataFrame to a NumPy array):
# Worked example: explain a scikit-learn Pipeline with shap.KernelExplainer by
# fitting and explaining on NumPy arrays instead of pandas DataFrames.
import numpy as np  # needed for np.random.randint below
import pandas as pd  # needed for pd.read_csv below
import shap
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder

print(shap.__version__)

bike_data = pd.read_csv("archive/bike_sharing_daily.csv")

# Cast the categorical columns to the 'category' dtype, then relabel them.
bike_data['season'] = bike_data.season.astype('category')
bike_data['holiday'] = bike_data.holiday.astype('category')
bike_data['workingday'] = bike_data.workingday.astype('category')
bike_data['season'] = bike_data['season'].map({1:'Spring', 2:'Summer', 3:'Fall', 4: 'Winter'})
bike_data['holiday'] = bike_data['holiday'].map({0: False, 1: True})
bike_data['workingday'] = bike_data['workingday'].map({0: False, 1: True})

bike_data = bike_data[['season','weekday','holiday','workingday','temp','windspeed']]
x = bike_data
# Random integer target in [0, 10): this example does not use a real target.
y = np.random.randint(0, 10, len(bike_data))
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=42)

# Columns that pandas does not report as numeric are the ones to one-hot encode.
category_columns = list(set(bike_data.columns) - set(bike_data._get_numeric_data().columns))
# The pipeline is fitted on a NumPy array, so the categorical columns must be
# addressed by position rather than by name.
col_idx = [i for i, col in enumerate(x_train.columns) if col in category_columns]

preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(), col_idx)
    ],
    remainder='passthrough'
)
model = LinearRegression()
pipeline = Pipeline(steps=[('preprocessor', preprocessor), ('model', model)])

pipeline.fit(x_train.values, y_train)  # <-- from here
# The first 10 training rows serve as the explainer's background data.
explainer = shap.KernelExplainer(pipeline.predict, x_train.values[:10])
sv = explainer.shap_values(x_train.values)
shap.summary_plot(sv, feature_names=x.columns)  # <-- add column names back
0.44.1.dev4
I want to use a scikit-learn pipeline (with one-hot encoding (OHE) as the preprocessing step and a simple linear regression as the model) together with the SHAP tools, but I am running into a problem.
Here is my data (I’m using my own modified version of the bike sharing dataset):
# Load the asker's modified bike sharing dataset.
bike_data = pd.read_csv("bike_outlier_clean.csv")

# Raw code -> label lookup for each column that is treated as categorical.
label_maps = {
    'season': {1:'Spring', 2:'Summer', 3:'Fall', 4: 'Winter'},
    'year': {0: 2011, 1: 2012},
    'holiday': {0: False, 1: True},
    'workingday': {0: False, 1: True},
    'weather_condition': {1:'Clear', 2:'Mist', 3:'Light Snow/Rain', 4: 'Heavy Snow/Rain'},
}
# Cast each of those columns to the pandas 'category' dtype, then relabel it.
for column, labels in label_maps.items():
    bike_data[column] = bike_data[column].astype('category').map(labels)

# Keep only the target plus the feature columns used for modelling.
feature_columns = [
    'season', 'month', 'year', 'weekday', 'holiday', 'workingday',
    'weather_condition', 'humidity', 'temp', 'windspeed',
]
bike_data = bike_data[['total_count'] + feature_columns]

# Features are everything except the target column 'total_count'.
x = bike_data.drop(columns='total_count')
y = bike_data['total_count']

# 70/30 train/test split with a fixed seed.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.3, random_state=42
)
And here is my pipeline:
# Columns that pandas does not report as numeric are the ones to one-hot encode.
numeric_columns = set(bike_data._get_numeric_data().columns)
category_columns = list(set(bike_data.columns).difference(numeric_columns))

# One-hot encode the categorical columns (selected by name); all remaining
# columns are passed through.
one_hot_step = ('cat', OneHotEncoder(), category_columns)
preprocessor = ColumnTransformer(
    transformers=[one_hot_step],
    remainder='passthrough',
)

# Preprocessing followed by a plain linear regression, fitted on the
# training DataFrame.
model = LinearRegression()
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', model),
])
pipeline.fit(x_train, y_train)
Finally, I create the KernelSHAP explainer:
# Background data for the explainer, drawn from the feature DataFrame x.
background = shap.sample(x, 5)
# NOTE: this is the call at which the AttributeError is reported.
explainer = shap.KernelExplainer(pipeline.predict, background)
However, that is where the error occurs:
123 # Make a copy so that the feature names are not removed from the original model
124 out = copy.deepcopy(out)
--> 125 out.f.__self__.feature_names_in_ = None
126
127 return out
AttributeError: can't set attribute 'feature_names_in_'
I’m not sure what I should do to fix it.
SHAP doesn’t behave well with a Pipeline object, so I suggest the following (note the point where I switch from a pandas DataFrame to a NumPy array):
# Worked example: explain a scikit-learn Pipeline with shap.KernelExplainer by
# fitting and explaining on NumPy arrays instead of pandas DataFrames.
import numpy as np  # needed for np.random.randint below
import pandas as pd  # needed for pd.read_csv below
import shap
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder

print(shap.__version__)

bike_data = pd.read_csv("archive/bike_sharing_daily.csv")

# Cast the categorical columns to the 'category' dtype, then relabel them.
bike_data['season'] = bike_data.season.astype('category')
bike_data['holiday'] = bike_data.holiday.astype('category')
bike_data['workingday'] = bike_data.workingday.astype('category')
bike_data['season'] = bike_data['season'].map({1:'Spring', 2:'Summer', 3:'Fall', 4: 'Winter'})
bike_data['holiday'] = bike_data['holiday'].map({0: False, 1: True})
bike_data['workingday'] = bike_data['workingday'].map({0: False, 1: True})

bike_data = bike_data[['season','weekday','holiday','workingday','temp','windspeed']]
x = bike_data
# Random integer target in [0, 10): this example does not use a real target.
y = np.random.randint(0, 10, len(bike_data))
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=42)

# Columns that pandas does not report as numeric are the ones to one-hot encode.
category_columns = list(set(bike_data.columns) - set(bike_data._get_numeric_data().columns))
# The pipeline is fitted on a NumPy array, so the categorical columns must be
# addressed by position rather than by name.
col_idx = [i for i, col in enumerate(x_train.columns) if col in category_columns]

preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(), col_idx)
    ],
    remainder='passthrough'
)
model = LinearRegression()
pipeline = Pipeline(steps=[('preprocessor', preprocessor), ('model', model)])

pipeline.fit(x_train.values, y_train)  # <-- from here
# The first 10 training rows serve as the explainer's background data.
explainer = shap.KernelExplainer(pipeline.predict, x_train.values[:10])
sv = explainer.shap_values(x_train.values)
shap.summary_plot(sv, feature_names=x.columns)  # <-- add column names back
0.44.1.dev4