Pandas df with multiple operator OR statement
Question:
I have a function that takes a pandas DataFrame and returns a Boolean flag (1 or 0) based on whether some conditions are met.
Can you chain multiple OR statements using Python's built-in operator module? For example, I need to check whether any of 3 conditions is met in a DataFrame row, but operator.or_
can only take 2 arguments. When I test this function with pytest, it doesn't work. Thanks for any advice or pseudocode.
import operator
import pandas as pd
def fault_finder(df):
    """Flag rows of ``df`` that match operating state 2, 3, or 4.

    Writes an integer ``flag`` column onto ``df`` in place: 1 where at
    least one of the three state conditions holds, 0 otherwise.

    Args:
        df: DataFrame with ``temp1``, ``temp2``, ``free_clg_signal`` and
            ``mechanical_clg_signal`` columns.

    Returns:
        The same ``df`` object, now carrying the ``flag`` column.
    """
    # operator.or_ is strictly binary, so three conditions need a nested call.
    df['flag'] = operator.or_(  # <-- 1st operator statement
        # verify operating state 2
        (df['temp1'] >= df['temp2'])
        & (df['free_clg_signal'] > .2)
        & (df['mechanical_clg_signal'] < .1),  # OR
        operator.or_(  # <-- 2nd operator statement
            # verify operating state 3
            (df['temp1'] >= df['temp2'])
            & (df['mechanical_clg_signal'] > .01)
            & (df['free_clg_signal'] == .2),  # OR
            # verify operating state 4
            (df['temp1'] >= df['temp2'])
            & (df['mechanical_clg_signal'] > .01)
            & (df['free_clg_signal'] > .9),
        ),
    ).astype(int)
    return df
Answers:
You can use numpy.logical_or.reduce:
import numpy as np


def fault_finder(df):
    """Flag rows of ``df`` that match operating state 2, 3, or 4.

    Writes an integer ``flag`` column onto ``df`` in place: 1 where at
    least one of the three state conditions holds, 0 otherwise.

    Args:
        df: DataFrame with ``temp1``, ``temp2``, ``free_clg_signal`` and
            ``mechanical_clg_signal`` columns.

    Returns:
        The same ``df`` object, now carrying the ``flag`` column.
    """
    # logical_or.reduce ORs any number of masks in a single call.
    df['flag'] = np.logical_or.reduce([
        # verify operating state 2
        (df['temp1'] >= df['temp2'])
        & (df['free_clg_signal'] > .2)
        & (df['mechanical_clg_signal'] < .1),
        # verify operating state 3
        (df['temp1'] >= df['temp2'])
        & (df['mechanical_clg_signal'] > .01)
        & (df['free_clg_signal'] == .2),
        # verify operating state 4
        (df['temp1'] >= df['temp2'])
        & (df['mechanical_clg_signal'] > .01)
        & (df['free_clg_signal'] > .9),
    ]).astype(int)
    return df
Or use | and parentheses:
def fault_finder(df):
    """Flag rows of ``df`` that match operating state 2, 3, or 4.

    Writes an integer ``flag`` column onto ``df`` in place: 1 where at
    least one of the three state conditions holds, 0 otherwise.

    Args:
        df: DataFrame with ``temp1``, ``temp2``, ``free_clg_signal`` and
            ``mechanical_clg_signal`` columns.

    Returns:
        The same ``df`` object, now carrying the ``flag`` column.
    """
    # Each state is wrapped in its own parentheses so that the `&` terms
    # are grouped before the `|` between states is applied.
    df['flag'] = (
        (  # verify operating state 2
            (df['temp1'] >= df['temp2'])
            & (df['free_clg_signal'] > .2)
            & (df['mechanical_clg_signal'] < .1)
        )
        | (  # verify operating state 3
            (df['temp1'] >= df['temp2'])
            & (df['mechanical_clg_signal'] > .01)
            & (df['free_clg_signal'] == .2)
        )
        | (  # verify operating state 4
            (df['temp1'] >= df['temp2'])
            & (df['mechanical_clg_signal'] > .01)
            & (df['free_clg_signal'] > .9)
        )
    ).astype(int)
    return df
Example:
fault_finder(df)
temp1 temp2 free_clg_signal mechanical_clg_signal flag
0 1 0 0.3 0.05 1
1 2 4 0.2 1.00 0
2 3 0 1.0 1.00 1
Unfortunately, operator.or_ can only take two arguments. So if you have more than two boolean masks, you'll need to nest calls to operator.or_. But why not use |?
import operator

import pandas as pd

mask1 = pd.Series([True, False, True, True, False])
mask2 = pd.Series([False, False, False, True, True])
mask3 = pd.Series([True, False, False, True, True])

# OR the first two masks, then OR that result with the third.
out = operator.or_(
    operator.or_(mask1, mask2),
    mask3,
)
# Note that you can't do operator.or_(mask1, mask2, mask3)
# because operator.or_ only takes 2 arguments
which is equivalent to the less verbose:
# Element-wise OR of the three boolean masks, chained left to right.
out = mask1 | mask2 | mask3
You could also use a functional approach; something like:
import functools
# Fold the binary operator.or_ across the list of masks, left to right.
functools.reduce(operator.or_, [mask1, mask2, mask3])
If the boolean series are in one DataFrame, you could also use pd.DataFrame.any with an axis argument:
# Place the masks side by side as columns, then OR across each row.
df = pd.concat([mask1, mask2, mask3], axis=1)
out = df.any(axis=1)
or numpy.any:
import numpy as np
# OR across the list of masks (axis 0), i.e. position by position.
np.any([mask1, mask2, mask3], axis=0)
but NumPy approaches will return an array, not a pd.Series.
I have a function that takes a pandas DataFrame and returns a Boolean flag (1 or 0) based on whether some conditions are met.
Can you chain multiple OR statements using Python's built-in operator module? For example, I need to check whether any of 3 conditions is met in a DataFrame row, but operator.or_
can only take 2 arguments. When I test this function with pytest, it doesn't work. Thanks for any advice or pseudocode.
import operator
import pandas as pd
def fault_finder(df):
    """Flag rows of ``df`` that match operating state 2, 3, or 4.

    Writes an integer ``flag`` column onto ``df`` in place: 1 where at
    least one of the three state conditions holds, 0 otherwise.

    Args:
        df: DataFrame with ``temp1``, ``temp2``, ``free_clg_signal`` and
            ``mechanical_clg_signal`` columns.

    Returns:
        The same ``df`` object, now carrying the ``flag`` column.
    """
    # operator.or_ is strictly binary, so three conditions need a nested call.
    df['flag'] = operator.or_(  # <-- 1st operator statement
        # verify operating state 2
        (df['temp1'] >= df['temp2'])
        & (df['free_clg_signal'] > .2)
        & (df['mechanical_clg_signal'] < .1),  # OR
        operator.or_(  # <-- 2nd operator statement
            # verify operating state 3
            (df['temp1'] >= df['temp2'])
            & (df['mechanical_clg_signal'] > .01)
            & (df['free_clg_signal'] == .2),  # OR
            # verify operating state 4
            (df['temp1'] >= df['temp2'])
            & (df['mechanical_clg_signal'] > .01)
            & (df['free_clg_signal'] > .9),
        ),
    ).astype(int)
    return df
You can use numpy.logical_or.reduce:
import numpy as np


def fault_finder(df):
    """Flag rows of ``df`` that match operating state 2, 3, or 4.

    Writes an integer ``flag`` column onto ``df`` in place: 1 where at
    least one of the three state conditions holds, 0 otherwise.

    Args:
        df: DataFrame with ``temp1``, ``temp2``, ``free_clg_signal`` and
            ``mechanical_clg_signal`` columns.

    Returns:
        The same ``df`` object, now carrying the ``flag`` column.
    """
    # logical_or.reduce ORs any number of masks in a single call.
    df['flag'] = np.logical_or.reduce([
        # verify operating state 2
        (df['temp1'] >= df['temp2'])
        & (df['free_clg_signal'] > .2)
        & (df['mechanical_clg_signal'] < .1),
        # verify operating state 3
        (df['temp1'] >= df['temp2'])
        & (df['mechanical_clg_signal'] > .01)
        & (df['free_clg_signal'] == .2),
        # verify operating state 4
        (df['temp1'] >= df['temp2'])
        & (df['mechanical_clg_signal'] > .01)
        & (df['free_clg_signal'] > .9),
    ]).astype(int)
    return df
Or use | and parentheses:
def fault_finder(df):
    """Flag rows of ``df`` that match operating state 2, 3, or 4.

    Writes an integer ``flag`` column onto ``df`` in place: 1 where at
    least one of the three state conditions holds, 0 otherwise.

    Args:
        df: DataFrame with ``temp1``, ``temp2``, ``free_clg_signal`` and
            ``mechanical_clg_signal`` columns.

    Returns:
        The same ``df`` object, now carrying the ``flag`` column.
    """
    # Each state is wrapped in its own parentheses so that the `&` terms
    # are grouped before the `|` between states is applied.
    df['flag'] = (
        (  # verify operating state 2
            (df['temp1'] >= df['temp2'])
            & (df['free_clg_signal'] > .2)
            & (df['mechanical_clg_signal'] < .1)
        )
        | (  # verify operating state 3
            (df['temp1'] >= df['temp2'])
            & (df['mechanical_clg_signal'] > .01)
            & (df['free_clg_signal'] == .2)
        )
        | (  # verify operating state 4
            (df['temp1'] >= df['temp2'])
            & (df['mechanical_clg_signal'] > .01)
            & (df['free_clg_signal'] > .9)
        )
    ).astype(int)
    return df
Example:
fault_finder(df)
temp1 temp2 free_clg_signal mechanical_clg_signal flag
0 1 0 0.3 0.05 1
1 2 4 0.2 1.00 0
2 3 0 1.0 1.00 1
Unfortunately, operator.or_ can only take two arguments. So if you have more than two boolean masks, you'll need to nest calls to operator.or_. But why not use |?
import operator

import pandas as pd

mask1 = pd.Series([True, False, True, True, False])
mask2 = pd.Series([False, False, False, True, True])
mask3 = pd.Series([True, False, False, True, True])

# OR the first two masks, then OR that result with the third.
out = operator.or_(
    operator.or_(mask1, mask2),
    mask3,
)
# Note that you can't do operator.or_(mask1, mask2, mask3)
# because operator.or_ only takes 2 arguments
which is equivalent to the less verbose:
# Element-wise OR of the three boolean masks, chained left to right.
out = mask1 | mask2 | mask3
You could also use a functional approach; something like:
import functools
# Fold the binary operator.or_ across the list of masks, left to right.
functools.reduce(operator.or_, [mask1, mask2, mask3])
If the boolean series are in one DataFrame, you could also use pd.DataFrame.any with an axis argument:
# Place the masks side by side as columns, then OR across each row.
df = pd.concat([mask1, mask2, mask3], axis=1)
out = df.any(axis=1)
or numpy.any:
import numpy as np
# OR across the list of masks (axis 0), i.e. position by position.
np.any([mask1, mask2, mask3], axis=0)
but NumPy approaches will return an array, not a pd.Series.