Calculate z-scores using groupby and highlight outliers using style
Question:
I want to highlight possible outliers in a dataframe based on zscore grouped by location. I have used this answer to identify and score values but I can’t seem to implement the .groupby correctly. For example df.groupby('Loc').style.apply(highlight_outliers,color='red', threshold_val=1.5, axis=None)
gives: AttributeError: 'DataFrameGroupBy' object has no attribute 'style'
Dummy data:
import numpy as np  # required for the np.nan placeholders below
import pandas as pd

# Dummy measurements for two sites; column 'D' contains one missing value per site.
array = [
    ['Site 1', 750.0, 1.1e-09, 'daljk', 6.0],
    ['Site 1', 890.0, 1e-09, 'djfh', 8.0],
    ['Site 1', 1720.0, 1e-09, 'dkhf', 4.0],
    ['Site 1', 999.0, 1e-09, 'dkafh', 10.0],
    ['Site 1', 890.0, 1e-09, 'dkajfh', 0.0005],
    ['Site 1', 909.0, 1e-09, 'jkdafh', 6.0],
    ['Site 1', 1002.0, 1e-09, 'dlfakh', np.nan],
    ['Site 1', 990.0, 1e-09, 'ldkj', 3.0],
    ['Site 1', 0.0001, 1e-09, 'dlkfj', 10.0],
    ['Site 2', 7500.0, 1.1e-09, 'daljk', 6.0],
    ['Site 2', 890.0, 1e-09, 'djfh', 8.0],
    ['Site 2', 1720.0, 1e-09, 'dkhf', 4.0],
    ['Site 2', 1, 1e-09, 'dkafh', 10.0],
    ['Site 2', 890.0, 1e-09, 'dkajfh', 0.0005],
    ['Site 2', 909.0, 1e-09, 'jkdafh', 6.0],
    ['Site 2', 1002.0, 1e-09, 'dlfakh', np.nan],
    ['Site 2', 990.0, 1e-09, 'ldkj', 3.0],
    ['Site 2', 0.0001, 1e-09, 'dlkfj', 10.0],
]
df = pd.DataFrame(array, columns=['Loc', 'A', 'B', 'C', 'D'])
df
Code not working:
from scipy import stats
# Maps the colour names accepted by highlight_outliers to CSS colour values.
css_colours = {"red": 'red'}


def highlight_outliers(x, color, threshold_val):
    """Return a frame of CSS strings that marks z-score outliers in ``x``.

    Parameters:
        x: DataFrame to inspect; only its numeric columns are scored.
        color: key into ``css_colours`` selecting the background colour.
        threshold_val: a cell is styled when its absolute z-score (computed
            per column) is not below this value.

    Returns:
        DataFrame of strings with the same index and columns as ``x``.
        Outlier cells hold ``'background-color:<colour>'``; all other cells,
        including every non-numeric column, hold ``''``.

    Raises:
        KeyError: if ``color`` is not a key of ``css_colours``.
    """
    color = css_colours[color]
    # extract numeric columns
    c = x.select_dtypes([np.number]).columns
    # create df of numeric cols
    df2 = pd.DataFrame(x, columns=c)
    # absolute z-score per column; NaNs are left out of the mean/std
    df2 = df2.apply(stats.zscore, nan_policy='omit').abs()
    # Mask is True where the cell is NOT an outlier (|z| below threshold).
    # NaN and exactly-zero scores are forced to -inf so they are never styled.
    # np.inf is used because the np.Inf alias was removed in NumPy 2.0.
    mask = (
        df2[c]
        .apply(pd.to_numeric, errors='coerce')
        .fillna(-np.inf)
        .replace(0, -np.inf)
        .values < threshold_val
    )
    # create blank df of numeric cols
    df1 = pd.DataFrame('', index=x.index, columns=c)
    # keep '' where mask is True, write the CSS rule elsewhere, then add the
    # non-numeric columns back as unstyled
    df1 = (
        df1.where(mask, 'background-color:{}'.format(color))
        .reindex(columns=x.columns, fill_value='')
    )
    return df1
# This is the failing call from the question: groupby() returns a DataFrameGroupBy,
# which has no `.style` attribute, so this line raises AttributeError.
df.groupby(['Loc']).style.apply(highlight_outliers,color='red', threshold_val=1.5, axis=None)
I think I need to use ‘transform’ but I can’t see how to apply it in this case. Creating a boolean mask is more intuitive to me (novice in python).
Expected output:
Answers:
You can use a wrapper to keep your original function:
from scipy import stats
# Maps the colour names accepted by highlight_outliers to CSS colour values.
css_colours = {"red": 'red'}


def highlight_outliers(x, color, threshold_val):
    """Return a frame of CSS strings that marks z-score outliers in ``x``.

    Parameters:
        x: DataFrame to inspect; only its numeric columns are scored.
        color: key into ``css_colours`` selecting the background colour.
        threshold_val: a cell is styled when its absolute z-score (computed
            per column) is not below this value.

    Returns:
        DataFrame of strings with the same index and columns as ``x``.
        Outlier cells hold ``'background-color:<colour>'``; all other cells,
        including every non-numeric column, hold ``''``.

    Raises:
        KeyError: if ``color`` is not a key of ``css_colours``.
    """
    color = css_colours[color]
    # extract numeric columns
    c = x.select_dtypes([np.number]).columns
    # create df of numeric cols
    df2 = pd.DataFrame(x, columns=c)
    # absolute z-score per column; NaNs are left out of the mean/std
    df2 = df2.apply(stats.zscore, nan_policy='omit').abs()
    # Mask is True where the cell is NOT an outlier (|z| below threshold).
    # NaN and exactly-zero scores are forced to -inf so they are never styled.
    # np.inf is used because the np.Inf alias was removed in NumPy 2.0.
    mask = (
        df2[c]
        .apply(pd.to_numeric, errors='coerce')
        .fillna(-np.inf)
        .replace(0, -np.inf)
        .values < threshold_val
    )
    # create blank df of numeric cols
    df1 = pd.DataFrame('', index=x.index, columns=c)
    # keep '' where mask is True, write the CSS rule elsewhere, then add the
    # non-numeric columns back as unstyled
    df1 = (
        df1.where(mask, 'background-color:{}'.format(color))
        .reindex(columns=x.columns, fill_value='')
    )
    return df1
def highlight_per_group(df, *args, **kwargs):
    """Run ``highlight_outliers`` on each ``'Loc'`` group of ``df`` and combine the results.

    Parameters:
        df: DataFrame containing a ``'Loc'`` column to group by.
        *args, **kwargs: forwarded unchanged to ``highlight_outliers``
            (e.g. ``color`` and ``threshold_val``).

    Returns:
        The per-group style frames concatenated into a single DataFrame that
        keeps the row labels of ``df``.
    """
    # group_keys=False keeps the original row labels on the result. Since
    # pandas 2.0 the default prepends the group key as an extra index level
    # even for like-indexed results, which no longer lines up with the frame
    # being styled.
    return df.groupby(['Loc'], group_keys=False).apply(highlight_outliers, *args, **kwargs)
# Style the whole frame in one call (axis=None); the wrapper does the per-'Loc' grouping itself.
df.style.apply(highlight_per_group, color='red', threshold_val=1.5, axis=None)
I want to highlight possible outliers in a dataframe based on zscore grouped by location. I have used this answer to identify and score values but I can’t seem to implement the .groupby correctly. For example df.groupby('Loc').style.apply(highlight_outliers,color='red', threshold_val=1.5, axis=None)
gives: AttributeError: 'DataFrameGroupBy' object has no attribute 'style'
Dummy data:
import numpy as np  # required for the np.nan placeholders below
import pandas as pd

# Dummy measurements for two sites; column 'D' contains one missing value per site.
array = [
    ['Site 1', 750.0, 1.1e-09, 'daljk', 6.0],
    ['Site 1', 890.0, 1e-09, 'djfh', 8.0],
    ['Site 1', 1720.0, 1e-09, 'dkhf', 4.0],
    ['Site 1', 999.0, 1e-09, 'dkafh', 10.0],
    ['Site 1', 890.0, 1e-09, 'dkajfh', 0.0005],
    ['Site 1', 909.0, 1e-09, 'jkdafh', 6.0],
    ['Site 1', 1002.0, 1e-09, 'dlfakh', np.nan],
    ['Site 1', 990.0, 1e-09, 'ldkj', 3.0],
    ['Site 1', 0.0001, 1e-09, 'dlkfj', 10.0],
    ['Site 2', 7500.0, 1.1e-09, 'daljk', 6.0],
    ['Site 2', 890.0, 1e-09, 'djfh', 8.0],
    ['Site 2', 1720.0, 1e-09, 'dkhf', 4.0],
    ['Site 2', 1, 1e-09, 'dkafh', 10.0],
    ['Site 2', 890.0, 1e-09, 'dkajfh', 0.0005],
    ['Site 2', 909.0, 1e-09, 'jkdafh', 6.0],
    ['Site 2', 1002.0, 1e-09, 'dlfakh', np.nan],
    ['Site 2', 990.0, 1e-09, 'ldkj', 3.0],
    ['Site 2', 0.0001, 1e-09, 'dlkfj', 10.0],
]
df = pd.DataFrame(array, columns=['Loc', 'A', 'B', 'C', 'D'])
df
Code not working:
from scipy import stats
# Maps the colour names accepted by highlight_outliers to CSS colour values.
css_colours = {"red": 'red'}


def highlight_outliers(x, color, threshold_val):
    """Return a frame of CSS strings that marks z-score outliers in ``x``.

    Parameters:
        x: DataFrame to inspect; only its numeric columns are scored.
        color: key into ``css_colours`` selecting the background colour.
        threshold_val: a cell is styled when its absolute z-score (computed
            per column) is not below this value.

    Returns:
        DataFrame of strings with the same index and columns as ``x``.
        Outlier cells hold ``'background-color:<colour>'``; all other cells,
        including every non-numeric column, hold ``''``.

    Raises:
        KeyError: if ``color`` is not a key of ``css_colours``.
    """
    color = css_colours[color]
    # extract numeric columns
    c = x.select_dtypes([np.number]).columns
    # create df of numeric cols
    df2 = pd.DataFrame(x, columns=c)
    # absolute z-score per column; NaNs are left out of the mean/std
    df2 = df2.apply(stats.zscore, nan_policy='omit').abs()
    # Mask is True where the cell is NOT an outlier (|z| below threshold).
    # NaN and exactly-zero scores are forced to -inf so they are never styled.
    # np.inf is used because the np.Inf alias was removed in NumPy 2.0.
    mask = (
        df2[c]
        .apply(pd.to_numeric, errors='coerce')
        .fillna(-np.inf)
        .replace(0, -np.inf)
        .values < threshold_val
    )
    # create blank df of numeric cols
    df1 = pd.DataFrame('', index=x.index, columns=c)
    # keep '' where mask is True, write the CSS rule elsewhere, then add the
    # non-numeric columns back as unstyled
    df1 = (
        df1.where(mask, 'background-color:{}'.format(color))
        .reindex(columns=x.columns, fill_value='')
    )
    return df1
# This is the failing call from the question: groupby() returns a DataFrameGroupBy,
# which has no `.style` attribute, so this line raises AttributeError.
df.groupby(['Loc']).style.apply(highlight_outliers,color='red', threshold_val=1.5, axis=None)
I think I need to use ‘transform’ but I can’t see how to apply it in this case. Creating a boolean mask is more intuitive to me (novice in python).
Expected output:
You can use a wrapper to keep your original function:
from scipy import stats
# Maps the colour names accepted by highlight_outliers to CSS colour values.
css_colours = {"red": 'red'}


def highlight_outliers(x, color, threshold_val):
    """Return a frame of CSS strings that marks z-score outliers in ``x``.

    Parameters:
        x: DataFrame to inspect; only its numeric columns are scored.
        color: key into ``css_colours`` selecting the background colour.
        threshold_val: a cell is styled when its absolute z-score (computed
            per column) is not below this value.

    Returns:
        DataFrame of strings with the same index and columns as ``x``.
        Outlier cells hold ``'background-color:<colour>'``; all other cells,
        including every non-numeric column, hold ``''``.

    Raises:
        KeyError: if ``color`` is not a key of ``css_colours``.
    """
    color = css_colours[color]
    # extract numeric columns
    c = x.select_dtypes([np.number]).columns
    # create df of numeric cols
    df2 = pd.DataFrame(x, columns=c)
    # absolute z-score per column; NaNs are left out of the mean/std
    df2 = df2.apply(stats.zscore, nan_policy='omit').abs()
    # Mask is True where the cell is NOT an outlier (|z| below threshold).
    # NaN and exactly-zero scores are forced to -inf so they are never styled.
    # np.inf is used because the np.Inf alias was removed in NumPy 2.0.
    mask = (
        df2[c]
        .apply(pd.to_numeric, errors='coerce')
        .fillna(-np.inf)
        .replace(0, -np.inf)
        .values < threshold_val
    )
    # create blank df of numeric cols
    df1 = pd.DataFrame('', index=x.index, columns=c)
    # keep '' where mask is True, write the CSS rule elsewhere, then add the
    # non-numeric columns back as unstyled
    df1 = (
        df1.where(mask, 'background-color:{}'.format(color))
        .reindex(columns=x.columns, fill_value='')
    )
    return df1
def highlight_per_group(df, *args, **kwargs):
    """Run ``highlight_outliers`` on each ``'Loc'`` group of ``df`` and combine the results.

    Parameters:
        df: DataFrame containing a ``'Loc'`` column to group by.
        *args, **kwargs: forwarded unchanged to ``highlight_outliers``
            (e.g. ``color`` and ``threshold_val``).

    Returns:
        The per-group style frames concatenated into a single DataFrame that
        keeps the row labels of ``df``.
    """
    # group_keys=False keeps the original row labels on the result. Since
    # pandas 2.0 the default prepends the group key as an extra index level
    # even for like-indexed results, which no longer lines up with the frame
    # being styled.
    return df.groupby(['Loc'], group_keys=False).apply(highlight_outliers, *args, **kwargs)
# Style the whole frame in one call (axis=None); the wrapper does the per-'Loc' grouping itself.
df.style.apply(highlight_per_group, color='red', threshold_val=1.5, axis=None)