Pandas – Case when & default in pandas
Question:
I have the below case statement in python,
pd_df['difficulty'] = 'Unknown'
pd_df['difficulty'][(pd_df['Time']<30) & (pd_df['Time']>0)] = 'Easy'
pd_df['difficulty'][(pd_df['Time']>=30) & (pd_df['Time']<=60)] = 'Medium'
pd_df['difficulty'][pd_df['Time']>60] = 'Hard'
But when I run the code, it throws an error.
A value is trying to be set on a copy of a slice from a DataFrame
Answers:
Option 1
For performance, use a nested np.where
condition. For the condition, you can just use pd.Series.between
, and the default value will be inserted accordingly.
pd_df['difficulty'] = np.where(
pd_df['Time'].between(0, 30, inclusive=False),
'Easy',
np.where(
pd_df['Time'].between(0, 30, inclusive=False), 'Medium', 'Unknown'
)
)
Option 2
Similarly, using np.select
, this gives more room for adding conditions:
pd_df['difficulty'] = np.select(
[
pd_df['Time'].between(0, 30, inclusive=False),
pd_df['Time'].between(30, 60, inclusive=True)
],
[
'Easy',
'Medium'
],
default='Unknown'
)
Option 3
Another performant solution involves loc
:
pd_df['difficulty'] = 'Unknown'
pd_df.loc[pd_df['Time'].between(0, 30, inclusive=False), 'difficulty'] = 'Easy'
pd_df.loc[pd_df['Time'].between(30, 60, inclusive=True), 'difficulty'] = 'Medium'
OP’s code only needed loc
to correctly call the __setitem__()
method via []
. In particular, they already have used the proper brackets ()
to evaluate &
-chained conditions individually.
The basic idea of this approach is to initialize a column with some default value (e.g. "Unknown"
) and update rows depending on condtions (e.g. "Easy"
if 0<Time<30
), etc.
When I time the options given on this page, for large frames, loc
approach is the fastest (4-5 times faster than np.select
and nested np.where
).1.
pd_df['difficulty'] = 'Unknown'
pd_df.loc[(pd_df['Time']<30) & (pd_df['Time']>0), 'difficulty'] = 'Easy'
pd_df.loc[(pd_df['Time']>=30) & (pd_df['Time']<=60), 'difficulty'] = 'Medium'
pd_df.loc[pd_df['Time']>60, 'difficulty'] = 'Hard'
1: Code used for benchmark.
def loc(pd_df):
pd_df['difficulty'] = 'Unknown'
pd_df.loc[(pd_df['Time']<30) & (pd_df['Time']>0), 'difficulty'] = 'Easy'
pd_df.loc[(pd_df['Time']>=30) & (pd_df['Time']<=60), 'difficulty'] = 'Medium'
pd_df.loc[pd_df['Time']>60, 'difficulty'] = 'Hard'
return pd_df
def np_select(pd_df):
pd_df['difficulty'] = np.select([pd_df['Time'].between(0, 30, inclusive='neither'), pd_df['Time'].between(30, 60, inclusive='both'), pd_df['Time']>60], ['Easy', 'Medium', 'Hard'], 'Unknown')
return pd_df
def nested_np_where(pd_df):
pd_df['difficulty'] = np.where(pd_df['Time'].between(0, 30, inclusive='neither'), 'Easy', np.where(pd_df['Time'].between(30, 60, inclusive='both'), 'Medium', np.where(pd_df['Time'] > 60, 'Hard', 'Unknown')))
return pd_df
df = pd.DataFrame({'Time': np.random.default_rng().choice(120, size=15_000_000)-30})
%timeit loc(df.copy())
# 891 ms ± 6.14 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
%timeit np_select(df.copy())
# 3.93 s ± 100 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
%timeit nested_np_where(df.copy())
# 4.82 s ± 1.05 s per loop (mean ± std. dev. of 7 runs, 10 loops each)
import numpy as np
import pandas as pd
def case_when(*args):
return np.select(
condlist = [args[i] for i in range(0, len(args), 2)],
choicelist = [args[i] for i in range(1, len(args), 2)],
default=pd.NA
)
df = pd.DataFrame({"cola":["a","b","a","a","c","d","d","e","c"],
"colb":range(9)})
df["newcol"] = case_when(df["cola"] == "a","ap",
df["colb"] == 0, "x", # Not taken because it's after the first line
df["colb"] == 1, "y",
True, df["cola"]
)
df["newcolb"] = case_when(df["cola"] == "e",1,
df["colb"] == 8, 2
)
df
# cola colb newcol newcolb
# 0 a 0 ap <NA>
# 1 b 1 y <NA>
# 2 a 2 ap <NA>
# 3 a 3 ap <NA>
# 4 c 4 c <NA>
# 5 d 5 d <NA>
# 6 d 6 d <NA>
# 7 e 7 e 1
# 8 c 8 c 2
For the sake of readability, like in R.
I have the below case statement in python,
pd_df['difficulty'] = 'Unknown'
pd_df['difficulty'][(pd_df['Time']<30) & (pd_df['Time']>0)] = 'Easy'
pd_df['difficulty'][(pd_df['Time']>=30) & (pd_df['Time']<=60)] = 'Medium'
pd_df['difficulty'][pd_df['Time']>60] = 'Hard'
But when I run the code, it throws an error.
A value is trying to be set on a copy of a slice from a DataFrame
Option 1
For performance, use a nested np.where
condition. For the condition, you can just use pd.Series.between
, and the default value will be inserted accordingly.
pd_df['difficulty'] = np.where(
pd_df['Time'].between(0, 30, inclusive=False),
'Easy',
np.where(
pd_df['Time'].between(0, 30, inclusive=False), 'Medium', 'Unknown'
)
)
Option 2
Similarly, using np.select
, this gives more room for adding conditions:
pd_df['difficulty'] = np.select(
[
pd_df['Time'].between(0, 30, inclusive=False),
pd_df['Time'].between(30, 60, inclusive=True)
],
[
'Easy',
'Medium'
],
default='Unknown'
)
Option 3
Another performant solution involves loc
:
pd_df['difficulty'] = 'Unknown'
pd_df.loc[pd_df['Time'].between(0, 30, inclusive=False), 'difficulty'] = 'Easy'
pd_df.loc[pd_df['Time'].between(30, 60, inclusive=True), 'difficulty'] = 'Medium'
OP’s code only needed loc
to correctly call the __setitem__()
method via []
. In particular, they already have used the proper brackets ()
to evaluate &
-chained conditions individually.
The basic idea of this approach is to initialize a column with some default value (e.g. "Unknown"
) and update rows depending on condtions (e.g. "Easy"
if 0<Time<30
), etc.
When I time the options given on this page, for large frames, loc
approach is the fastest (4-5 times faster than np.select
and nested np.where
).1.
pd_df['difficulty'] = 'Unknown'
pd_df.loc[(pd_df['Time']<30) & (pd_df['Time']>0), 'difficulty'] = 'Easy'
pd_df.loc[(pd_df['Time']>=30) & (pd_df['Time']<=60), 'difficulty'] = 'Medium'
pd_df.loc[pd_df['Time']>60, 'difficulty'] = 'Hard'
1: Code used for benchmark.
def loc(pd_df):
pd_df['difficulty'] = 'Unknown'
pd_df.loc[(pd_df['Time']<30) & (pd_df['Time']>0), 'difficulty'] = 'Easy'
pd_df.loc[(pd_df['Time']>=30) & (pd_df['Time']<=60), 'difficulty'] = 'Medium'
pd_df.loc[pd_df['Time']>60, 'difficulty'] = 'Hard'
return pd_df
def np_select(pd_df):
pd_df['difficulty'] = np.select([pd_df['Time'].between(0, 30, inclusive='neither'), pd_df['Time'].between(30, 60, inclusive='both'), pd_df['Time']>60], ['Easy', 'Medium', 'Hard'], 'Unknown')
return pd_df
def nested_np_where(pd_df):
pd_df['difficulty'] = np.where(pd_df['Time'].between(0, 30, inclusive='neither'), 'Easy', np.where(pd_df['Time'].between(30, 60, inclusive='both'), 'Medium', np.where(pd_df['Time'] > 60, 'Hard', 'Unknown')))
return pd_df
df = pd.DataFrame({'Time': np.random.default_rng().choice(120, size=15_000_000)-30})
%timeit loc(df.copy())
# 891 ms ± 6.14 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
%timeit np_select(df.copy())
# 3.93 s ± 100 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
%timeit nested_np_where(df.copy())
# 4.82 s ± 1.05 s per loop (mean ± std. dev. of 7 runs, 10 loops each)
import numpy as np
import pandas as pd
def case_when(*args):
return np.select(
condlist = [args[i] for i in range(0, len(args), 2)],
choicelist = [args[i] for i in range(1, len(args), 2)],
default=pd.NA
)
df = pd.DataFrame({"cola":["a","b","a","a","c","d","d","e","c"],
"colb":range(9)})
df["newcol"] = case_when(df["cola"] == "a","ap",
df["colb"] == 0, "x", # Not taken because it's after the first line
df["colb"] == 1, "y",
True, df["cola"]
)
df["newcolb"] = case_when(df["cola"] == "e",1,
df["colb"] == 8, 2
)
df
# cola colb newcol newcolb
# 0 a 0 ap <NA>
# 1 b 1 y <NA>
# 2 a 2 ap <NA>
# 3 a 3 ap <NA>
# 4 c 4 c <NA>
# 5 d 5 d <NA>
# 6 d 6 d <NA>
# 7 e 7 e 1
# 8 c 8 c 2
For the sake of readability, like in R.