How to check multiple conditions on dataframe columns, row by row, without using a loop Pandas
Question:
I have a big dataframe df that I need to clean from bad data using four different conditions:
- if the element of column
A
is a number
- if this number is different from
nan
- if the element of column
B
belongs to list1
- if the element of column
C
belongs to list2
I would like to check these conditions without a loop because it runs very slowly.
This is the code I wrote:
df1=pd.read_csv(data1)
list1=pd.read_csv(data2)
list2=pd.read_csv(data3)
col_df= #list of target column names (it's not important to specify what they are)
df2=pd.DataFrame(np.zeros(shape=(1,len(col_df))),columns=col_df)
row_df3=pd.DataFrame(np.zeros(shape=(1,len(col_df))),columns=col_df) #I use it to append new zero row to previous
k1=0 #index to add new row to df2
for j in range(len(df1)):
try:
float(df1['A'][j])
check=1
except:
check=0
if check==1: #if the element is a number
if df1['B'][j] in list1['B'].unique(): #if the element is in the first list
if str(df1['A'][j])!='nan': #if the element value is different from nan
if df1['C'][j] in list2['C'].unique(): #if the element is in the second list
iii=list1.index[list1['B']==df1['B'][j]] #I take the index in list1
iii=iii[0]
df2.loc[k1]=row_df3.loc[0] #new zero row
df2.loc[k1]=df1.iloc[j] #assign elements to new df2 with same column name of df1
df2['B'][k1]=list1['B'][iii] #assign element from list1
k1=k1+1 #new row for df2
It works but slowly. Is there a way to write all these conditions without using a loop? If yes, I also thought to use the drop
function instead of creating a new dataframe df2
.
Thank you in advance!
I tried to do this:
def is_float(element: any) -> bool:
#If you expect None to be passed:
if element is None:
return False
try:
float(element)
return True
except ValueError:
return False
df1['Bool']=df1['A'].apply(is_float) #to create a new column that check if it's a number
df2=df1[df1['Bool']==True and df1['B'] in list1['B'].unique() and df1['C'] in list2['C'].unique()] #to check all other conditions
without defining df2 beforehand this time, but it doesn’t work. I get two errors:
ValueError: The truth value of a Series is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all()
and if I erase the first condition, so
df2=df1[df1['B'] in list1['B'].unique() and df1['C'] in list2['C'].unique()]
I get:
ValueError: ('Lengths must match to compare')
Answers:
using apply and those functions you should be able to make a column grouping all conditions together
def is_null(value):
# Null values are considered as float but nan is not equal to nan
if pd.isna(value):
return True
else:
return False
def is_float(value):
if type(value) == float:
return True
else:
return False
def in_list1(value):
if value in list1:
return True
else:
return False
def in_list2(value):
if value in list2:
return True
else:
return False
Well, it works perfectly! Code execution time on each dataframe went from 5 hours to 3 minutes! I used the idea of function like this:
def func(value):
if value['B'] in list1['B'].unique():
if value['C'] in list2['C'].unique():
check=0
try:
float(value['A'])
check=1
except:
check=0
if check==1:
if str(value['A'])!='nan': #all condition are satisfied
iii=list1.index[list1['B']==value['B']]
iii=iii[0]
value['B']=list1['B'][iii]
value['Check']=True
return value
else:
value['Check']=False
return value
else:
value['Check']=False
return value
else:
value['Check']=False
return value
else:
value['Check']=False
return value
df1=pd.read_csv(data1) #read data
df1.insert(loc=7, column='B', value=0) #insert column to assign list1 value
df1['Check']=0 #new column to check the conditional statements
list1=pd.read_csv(data2) #read list1
list2=pd.read_csv(data3) #read list2
df1=df1.apply(func,axis=1) #apply function which replaces each row with the new one processed
df1=df1[df1['Check']==True] #select all rows which satisfy The conditional statement
df1=df1.drop('Check',axis=1) #drop the check column after use
I decided to post it so maybe it can be useful to someone. Thank you very much @Marc
I have a big dataframe df that I need to clean from bad data using four different conditions:
- if the element of column
A
is a number
- if this number is different from
nan
- if the element of column
B
belongs to list1
- if the element of column
C
belongs to list2
I would like to check these conditions without a loop because it runs very slowly.
This is the code I wrote:
df1=pd.read_csv(data1)
list1=pd.read_csv(data2)
list2=pd.read_csv(data3)
col_df= #list of target column names (it's not important to specify what they are)
df2=pd.DataFrame(np.zeros(shape=(1,len(col_df))),columns=col_df)
row_df3=pd.DataFrame(np.zeros(shape=(1,len(col_df))),columns=col_df) #I use it to append new zero row to previous
k1=0 #index to add new row to df2
for j in range(len(df1)):
try:
float(df1['A'][j])
check=1
except:
check=0
if check==1: #if the element is a number
if df1['B'][j] in list1['B'].unique(): #if the element is in the first list
if str(df1['A'][j])!='nan': #if the element value is different from nan
if df1['C'][j] in list2['C'].unique(): #if the element is in the second list
iii=list1.index[list1['B']==df1['B'][j]] #I take the index in list1
iii=iii[0]
df2.loc[k1]=row_df3.loc[0] #new zero row
df2.loc[k1]=df1.iloc[j] #assign elements to new df2 with same column name of df1
df2['B'][k1]=list1['B'][iii] #assign element from list1
k1=k1+1 #new row for df2
It works but slowly. Is there a way to write all these conditions without using a loop? If yes, I also thought to use the drop
function instead of creating a new dataframe df2
.
Thank you in advance!
I tried to do this:
def is_float(element: any) -> bool:
#If you expect None to be passed:
if element is None:
return False
try:
float(element)
return True
except ValueError:
return False
df1['Bool']=df1['A'].apply(is_float) #to create a new column that check if it's a number
df2=df1[df1['Bool']==True and df1['B'] in list1['B'].unique() and df1['C'] in list2['C'].unique()] #to check all other conditions
without defining df2 beforehand this time, but it doesn’t work. I get two errors:
ValueError: The truth value of a Series is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all()
and if I erase the first condition, so
df2=df1[df1['B'] in list1['B'].unique() and df1['C'] in list2['C'].unique()]
I get:
ValueError: ('Lengths must match to compare')
using apply and those functions you should be able to make a column grouping all conditions together
def is_null(value):
# Null values are considered as float but nan is not equal to nan
if pd.isna(value):
return True
else:
return False
def is_float(value):
if type(value) == float:
return True
else:
return False
def in_list1(value):
if value in list1:
return True
else:
return False
def in_list2(value):
if value in list2:
return True
else:
return False
Well, it works perfectly! Code execution time on each dataframe went from 5 hours to 3 minutes! I used the idea of function like this:
def func(value):
if value['B'] in list1['B'].unique():
if value['C'] in list2['C'].unique():
check=0
try:
float(value['A'])
check=1
except:
check=0
if check==1:
if str(value['A'])!='nan': #all condition are satisfied
iii=list1.index[list1['B']==value['B']]
iii=iii[0]
value['B']=list1['B'][iii]
value['Check']=True
return value
else:
value['Check']=False
return value
else:
value['Check']=False
return value
else:
value['Check']=False
return value
else:
value['Check']=False
return value
df1=pd.read_csv(data1) #read data
df1.insert(loc=7, column='B', value=0) #insert column to assign list1 value
df1['Check']=0 #new column to check the conditional statements
list1=pd.read_csv(data2) #read list1
list2=pd.read_csv(data3) #read list2
df1=df1.apply(func,axis=1) #apply function which replaces each row with the new one processed
df1=df1[df1['Check']==True] #select all rows which satisfy The conditional statement
df1=df1.drop('Check',axis=1) #drop the check column after use
I decided to post it so maybe it can be useful to someone. Thank you very much @Marc