What to do about a FutureWarning when using sklearn.neighbors?

Question:

I fear I have the same problem as in this post:

getting a warning when using sklearn.neighbors about keepdims

I am trying to use KNN as part of an ensemble classifier, but every time I run it I get the following warning:

FutureWarning: Unlike other reduction functions (e.g. skew, kurtosis), the default behavior of mode typically preserves the axis it acts along. In SciPy 1.11.0, this behavior will change: the default value of keepdims will become False, the axis over which the statistic is taken will be eliminated, and the value None will no longer be accepted. Set keepdims to True or False to avoid this warning.
mode, _ = stats.mode(_y[neigh_ind, k], axis=1)

I know that one way to deal with this is to suppress future warnings, but since that might hide real errors later, I would rather fix it properly now. Is there a way to do this? I tried simply calling KNeighborsClassifier(keepdim=True), but that argument was not accepted.

Also, adding

from warnings import simplefilter
simplefilter(action='ignore', category=FutureWarning)

does not suppress the message for me.
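(For reference, a narrower filter that targets just this message is sketched below. One likely reason simplefilter has no effect here: with n_jobs=-1, GridSearchCV runs its fits in joblib worker processes, and warning filters set in the parent process are not inherited by those workers.)

import warnings

# Ignore only this specific FutureWarning; the message argument is a
# regex matched against the start of the warning text.
warnings.filterwarnings(
    "ignore",
    message="Unlike other reduction functions",
    category=FutureWarning,
)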

Here is the full code:

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set(style='darkgrid', font_scale=1.4)
from imblearn.over_sampling import SMOTE
import itertools
#import warnings
#warnings.filterwarnings('ignore')
import plotly.express as px
import time

# Sklearn
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, StratifiedKFold
from sklearn.metrics import accuracy_score, confusion_matrix, recall_score, precision_score, f1_score
from sklearn.metrics import roc_auc_score, plot_confusion_matrix, plot_roc_curve, roc_curve
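# NOTE: plot_confusion_matrix and plot_roc_curve were deprecated in
# scikit-learn 1.0 and removed in 1.2 (use ConfusionMatrixDisplay and
# RocCurveDisplay instead), so this import will break after upgrading.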
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder, LabelEncoder
from sklearn.feature_selection import mutual_info_classif
from sklearn.decomposition import PCA
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
import eli5
from eli5.sklearn import PermutationImportance
from sklearn.utils import resample
from sklearn.metrics import cohen_kappa_score, make_scorer
from sklearn import metrics

# Models
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.naive_bayes import GaussianNB

# Train-validation split
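# NOTE: X and y (features and labels) are assumed to be defined earlier;
# that part of the notebook is not shown in this excerpt.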
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, stratify=y, train_size=0.8, test_size=0.2, random_state=0
)


oversample = SMOTE(random_state=0)

X_train_Smot, Y_train_Smot = oversample.fit_resample(X_train, y_train)
# Classifiers
classifiers = {
    "LogisticRegression" : LogisticRegression(random_state=0, solver='lbfgs'),
    "KNN" : KNeighborsClassifier(),
    "SVC" : SVC(random_state=0, probability=True),
    "RandomForest" : RandomForestClassifier(random_state=0),
    "XGBoost" : XGBClassifier(random_state=0, use_label_encoder=False, eval_metric='logloss'), # XGBoost takes too long
    "LGBM" : LGBMClassifier(random_state=0),
    #"CatBoost" : CatBoostClassifier(random_state=0, verbose=False),
    "NaiveBayes": GaussianNB()
}

# Grids for grid search
LR_grid = {'C': [0.25, 0.5, 0.75, 1, 1.25, 1.5],
           'max_iter': [50, 100, 150]
          }

KNN_grid = {'n_neighbors': [3, 5, 7, 9],
            'p': [1, 2]}

SVC_grid = {'C': [0.25, 0.5, 0.75, 1, 1.25, 1.5],
            'kernel': ['linear', 'rbf'],
            'gamma': ['scale', 'auto']}

RF_grid = {'n_estimators': [50, 100, 150, 200, 250, 300],
        'max_depth': [4, 6, 8, 10, 12]}

boosted_grid = {'n_estimators': [50, 100, 150, 200],
        'max_depth': [4, 8, 12],
        'learning_rate': [0.05, 0.1, 0.15]}

NB_grid={'var_smoothing': [1e-10, 1e-9, 1e-8, 1e-7]}

# Dictionary of all grids
grid = {
    "LogisticRegression" : LR_grid,
    "KNN" : KNN_grid,
    "SVC" : SVC_grid,
    "RandomForest" : RF_grid,
    "XGBoost" : boosted_grid,
    "LGBM" : boosted_grid,
    "CatBoost" : boosted_grid,
    "NaiveBayes": NB_grid
}
i = 0
clf_best_params = classifiers.copy()
valid_scores = pd.DataFrame({
    'Classifier': classifiers.keys(),
    'Validation accuracy': np.zeros(len(classifiers)),
    'Training time': np.zeros(len(classifiers))
})
for key, classifier in classifiers.items():
    start = time.time()
    clf = GridSearchCV(estimator=classifier, param_grid=grid[key], n_jobs=-1, cv=None)

    # Train and score
    clf.fit(X_train_Smot, Y_train_Smot)
    #valid_scores.iloc[i,1]=clf.score(X_valid, y_valid)
    y_pred = clf.predict(X_valid)
    valid_scores.iloc[i,1]=metrics.cohen_kappa_score(y_pred, y_valid, weights='quadratic')
    # Save trained model
    clf_best_params[key]=clf.best_params_
    
    # Print iteration and training time
    stop = time.time()
    valid_scores.iloc[i,2]=np.round((stop - start)/60, 2)
    
    print('Model:', key)
    print('Training time (mins):', valid_scores.iloc[i,2])
    print('')
    i+=1


Asked By: samabu


Answers:

This warning is generated when the predict function in scikit-learn internally calls scipy.stats.mode. This was fixed here; I suggest you update scikit-learn to the latest version and try again.
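If you are unsure which versions you are running, a quick check (a minimal sketch, not specific to the code in the question) is:

import scipy
import sklearn

# Both versions matter: the warning comes from the scikit-learn / SciPy
# combination installed in the environment.
print("scikit-learn:", sklearn.__version__)
print("SciPy:", scipy.__version__)

Then upgrade with pip install --upgrade scikit-learn.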

Answered By: Jagadeesh

I also encountered the same warning with the code below.

final_preds = [mode([i, j, k])[0][0]
               for i, j, k in zip(svm_preds, nb_preds, rf_preds)]

Then I updated the code:

final_preds = [mode([i, j, k], keepdims=True)[0][0]
               for i, j, k in zip(svm_preds, nb_preds, rf_preds)]

After that, the warning was gone.

You can also try adding keepdims=True to your own calls to mode.
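For reference, here is a minimal self-contained version of that pattern (the *_preds arrays are placeholders, not from the original post, and keepdims is only accepted by scipy.stats.mode from SciPy 1.9 onwards):

import numpy as np
from scipy.stats import mode

# Placeholder predictions from three hypothetical classifiers.
svm_preds = np.array([0, 1, 1, 0])
nb_preds = np.array([0, 1, 0, 0])
rf_preds = np.array([1, 1, 1, 0])

# Majority vote per sample; with keepdims=True, mode(...)[0] is still an
# array, hence the [0][0] indexing to pull out the scalar.
final_preds = [mode([i, j, k], keepdims=True)[0][0]
               for i, j, k in zip(svm_preds, nb_preds, rf_preds)]
print(final_preds)  # [0, 1, 1, 0]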

Answered By: Asif