How to get true labels from LabelEncoder
Question:
I have the below code snippet:
df = pd.read_csv("data.csv")
X = df.drop(['label'], axis=1)
Y= df['label']
le = LabelEncoder()
Y = le.fit_transform(Y)
mapping = dict(zip(le.classes_, range(len(le.classes_))))
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.33, random_state=7,stratify=Y)
##xgb model
model = XGBClassifier()
model.fit(x_train, y_train)
#predict
y_pred = model.predict(x_train)
Here the y_pred
gives encoded labels. How can I get the true labels before encoding?
Answers:
Create a dict with original labels and encoded labels, use that dict to map y_pred to true labels
You can use
le.inverse_transform(y_pred)
where le
is the fitted LabelEncoder
le = LabelEncoder().fit(y)
See the documentation.
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBClassifier
x = np.random.normal(0, 1, (20, 2))
y = np.array(['a', 'b'] * 10)
print(y)
# ['a' 'b' 'a' 'b' 'a' 'b' 'a' 'b' 'a' 'b' 'a' 'b' 'a' 'b' 'a' 'b' 'a' 'b' 'a' 'b']
le = LabelEncoder().fit(y)
y_enc = le.transform(y)
print(y_enc)
# [0 1 0 1 0 1 0 1 0 1 0 1 0 1 0 1 0 1 0 1]
x_train, x_test, y_train, y_test = train_test_split(x, y_enc, test_size=0.33, random_state=7, stratify=y_enc)
model = XGBClassifier()
model.fit(x_train, y_train)
y_pred_enc = model.predict(x_train)
print(y_pred_enc)
# [1 0 0 0 0 0 0 0 1 1 1 0 1]
y_pred = le.inverse_transform(y_pred_enc)
print(y_pred)
# ['b' 'a' 'a' 'a' 'a' 'a' 'a' 'a' 'b' 'b' 'b' 'a' 'b']
df = pd.read_csv("data.csv")
X = df.drop(['label'], axis=1)
Y= df['label']
le = LabelEncoder()
Y = le.fit_transform(Y)
If there are n classes, the encoded values will range from 0 to n-1, each of which represents a class. Let's save the numbers to a list.
encoding_numbers = list(range(0,len(le.classes_)))
Then use the inverse_transform function, which takes encoding_numbers as input, to get the corresponding categorical values, and save them in a dictionary to get the mapping of encoded values to categorical values.
d = dict(zip(encoding_numbers, le.inverse_transform(encoding_numbers)))
print(d)
Note: inverse_transform
can only be used after the encoder has been fitted (via fit or fit_transform)
I have the below code snippet:
df = pd.read_csv("data.csv")
X = df.drop(['label'], axis=1)
Y= df['label']
le = LabelEncoder()
Y = le.fit_transform(Y)
mapping = dict(zip(le.classes_, range(len(le.classes_))))
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.33, random_state=7,stratify=Y)
##xgb model
model = XGBClassifier()
model.fit(x_train, y_train)
#predict
y_pred = model.predict(x_train)
Here the y_pred
gives encoded labels. How can I get the true labels before encoding?
Create a dict with original labels and encoded labels, use that dict to map y_pred to true labels
You can use
le.inverse_transform(y_pred)
where le
is the fitted LabelEncoder
le = LabelEncoder().fit(y)
See the documentation.
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBClassifier
x = np.random.normal(0, 1, (20, 2))
y = np.array(['a', 'b'] * 10)
print(y)
# ['a' 'b' 'a' 'b' 'a' 'b' 'a' 'b' 'a' 'b' 'a' 'b' 'a' 'b' 'a' 'b' 'a' 'b' 'a' 'b']
le = LabelEncoder().fit(y)
y_enc = le.transform(y)
print(y_enc)
# [0 1 0 1 0 1 0 1 0 1 0 1 0 1 0 1 0 1 0 1]
x_train, x_test, y_train, y_test = train_test_split(x, y_enc, test_size=0.33, random_state=7, stratify=y_enc)
model = XGBClassifier()
model.fit(x_train, y_train)
y_pred_enc = model.predict(x_train)
print(y_pred_enc)
# [1 0 0 0 0 0 0 0 1 1 1 0 1]
y_pred = le.inverse_transform(y_pred_enc)
print(y_pred)
# ['b' 'a' 'a' 'a' 'a' 'a' 'a' 'a' 'b' 'b' 'b' 'a' 'b']
df = pd.read_csv("data.csv")
X = df.drop(['label'], axis=1)
Y= df['label']
le = LabelEncoder()
Y = le.fit_transform(Y)
If there are n classes, the encoded values will range from 0 to n-1, each of which represents a class. Let's save the numbers to a list.
encoding_numbers = list(range(0,len(le.classes_)))
Then use the inverse_transform function, which takes encoding_numbers as input, to get the corresponding categorical values, and save them in a dictionary to get the mapping of encoded values to categorical values.
d = dict(zip(encoding_numbers, le.inverse_transform(encoding_numbers)))
print(d)
Note: inverse_transform
can only be used after the encoder has been fitted (via fit or fit_transform)