How to convert multi column expressions from Pandas to Polars
Question:
I just found out about the Polars lib and I wanted to convert some old functions to get familiar.
However, I stumbled upon an issue with my code. The "Mean_Angle" column is not calculated, and I have no idea if the last part even works as intended, it aborts during the groupby operation as the column is missing.
This is the code I want to convert:
def calc_mean_and_error(df: pd.DataFrame, columns=None, groupby="Magn_Pos") -> pd.DataFrame:
data = df.copy()
if columns is None:
columns = ['Left_Angle', 'Right_Angle', 'Magn_Pos', 'Magn_Field']
if 'Left_Angle' in columns and 'Right_Angle' in columns:
data['Mean_Angle'] = (data['Left_Angle'] + data['Right_Angle']) / 2
columns.append('Mean_Angle')
grouped_df = data[columns].groupby(groupby,sort=False)
num_points_per_group = grouped_df.size().values
mean_df = grouped_df.mean()
# standard deviation
mean_df[['Left_Angle_SDEV','Right_Angle_SDEV','Mean_Angle_SDEV']] = grouped_df[['Left_Angle','Right_Angle','Mean_Angle']].std()
# standard error, 1 sigma confidence interval
mean_df[['Left_Angle_SEM_68','Right_Angle_SEM_68','Mean_Angle_SEM_68']] = grouped_df[['Left_Angle','Right_Angle','Mean_Angle']].sem()
# standard error, 2 sigma confidence interval - t distribution
t_fac_95_conf_int = stats.t.ppf(0.95, num_points_per_group) # factor according to https://en.wikipedia.org/wiki/Student%27s_t-distribution
mean_df[['Left_Angle_SEM_95','Right_Angle_SEM_95','Mean_Angle_SEM_95']] = mean_df[['Left_Angle_SEM_68','Right_Angle_SEM_68','Mean_Angle_SEM_68']].multiply(t_fac_95_conf_int, axis=0)
# standard error, 3 sigma confidence interval - t distribution
t_fac_99_conf_int = stats.t.ppf(0.997, num_points_per_group)
mean_df[['Left_Angle_SEM_99','Right_Angle_SEM_99','Mean_Angle_SEM_99']] = mean_df[['Left_Angle_SEM_68','Right_Angle_SEM_68','Mean_Angle_SEM_68']].multiply(t_fac_99_conf_int, axis=0)
mean_df = mean_df.reset_index()
return mean_df
This is what I have so far:
def calc_mean_and_error(df: pl.DataFrame, columns=None, groupby="Magn_Pos") -> pl.DataFrame:
data = df
if columns is None:
columns = ['Left_Angle', 'Right_Angle', 'Magn_Pos', 'Magn_Field']
if 'Left_Angle' in columns and 'Right_Angle' in columns:
# this doesn't work?
data.with_column(
pl.struct(['Left_Angle', 'Right_Angle']).apply(lambda x: (x['Left_Angle'] + x['Right_Angle']) / 2).alias("Mean_Angle")
)
columns.append('Mean_Angle')
grouped_df = data.select(columns).groupby(groupby)
num_points_per_group = grouped_df.count()['count'].take(0)
mean_df = grouped_df.mean()
t_fac_95_conf_int = stats.t.ppf(0.95, num_points_per_group) # factor according to https://en.wikipedia.org/wiki/Student%27s_t-distribution
t_fac_99_conf_int = stats.t.ppf(0.997, num_points_per_group)
# standard deviation
mean_df = data.select(columns).groupby(groupby).agg(
[
pl.col('*').mean(),
pl.col('*').std().suffix('_SDEV'),
pl.col('*').std().apply(lambda x: x / np.sqrt(num_points_per_group)).suffix('_SEM_68'), # standard error
pl.col('*').std().apply(lambda x: x*t_fac_95_conf_int / np.sqrt(num_points_per_group)).suffix('_SEM_95'),
pl.col('*').std().apply(lambda x: x*t_fac_99_conf_int / np.sqrt(num_points_per_group)).suffix('_SEM_99'),
]
)
return mean_df
Example:
data_raw = """Time\tRepetition\tLeft_Angle\tRight_Angle\tMagn_Pos\tMagn_Field
0.0\t0\t111.62539060014953\t111.65929559305457\t20.0\t0.05012
289.75\t1\t113.43406129503042\t113.29101205027376\t20.0\t0.05012
343.420999999973\t2\t113.21669960326668\t113.30918399000467\t20.0\t0.05012
397.68700000003446\t0\t114.50650196149256\t114.78488582815113\t10.0\t0.1317
456.10900000005495\t1\t114.7078936381882\t114.70239460290726\t10.0\t0.1317
507.8279999999795\t2\t115.71894177915732\t115.70104461571628\t10.0\t0.1317
565.3429999999935\t0\t121.71521327349599\t121.55379420624988\t5.0\t0.2276
612.045999999973\t1\t122.53171995914443\t122.4555143281342\t5.0\t0.2276
668.3120000000345\t2\t121.65748098845367\t121.60313424823333\t5.0\t0.2276
714.484000000055\t0\t130.88884567117995\t130.82365731381574\t2.5\t0.3011
774.9679999999935\t1\t132.72366563179372\t132.59019277520363\t2.5\t0.3011
817.765000000014\t2\t133.5549497954158\t133.4637401535662\t2.5\t0.3011
891.7029999999795\t0\t139.9155468732065\t139.78384156146674\t0.0\t0.3907
940.655999999959\t1\t143.34707217674438\t143.2278696177915\t0.0\t0.3907
984.125\t2\t144.30042471080577\t144.16800277145435\t0.0\t0.3907""".encode("utf8")
data = pl.read_csv(data_raw, sep='\t', encoding="utf8")
eval = calc_mean_and_error(data, columns=['Left_Angle', 'Right_Angle', 'Magn_Pos', 'Magn_Field'])
print(eval)
I’m not really sure about the last part though! I am not entirely familiar with the syntax of the expressions. And I am not sure how to prevent calling groupby twice. Can someone lead me in the right direction? Thanks!
Answers:
.with_column()
returns a new dataframe – it does not modify in-place.
You would assign the result:
data = data.with_column(...)
You don’t need to use .apply()
here – you can use expressions:
data = data.with_column(
((pl.col("Left_Angle") + pl.col("Right_Angle")) / 2)
.alias("Mean_Angle")
)
Regarding calling .groupby()
twice – you’re already using grouped_df
– you could re-use it:
mean_df = grouped_df.agg(...)
However – it may make more sense to add the values you’re using as columns and call groupby twice.
That way you can remove all the further calls to .apply()
and use expressions instead:
# all columns excluding the "temporary" ones we will add
columns = pl.exclude(["t_fac_95_conf_int", "t_fac_99_conf_int", "sqrt"])
mean_df = data.with_columns([
pl.lit(t_fac_95_conf_int).alias("t_fac_95_conf_int"),
pl.lit(t_fac_99_conf_int).alias("t_fac_99_conf_int"),
pl.lit(np.sqrt(num_points_per_group)).alias("sqrt")
]).groupby(groupby).agg([
columns.mean(),
columns.std().suffix("_SDEV"),
(columns.std() / pl.col("sqrt")).first().suffix("_SEM_68"),
(columns.std() * pl.col("t_fac_95_conf_int") / pl.col("sqrt")).first().suffix("_SEM_95"),
(columns.std() * pl.col("t_fac_99_conf_int") / pl.col("sqrt")).first().suffix("_SEM_99"),
])
Further possible changes:
-
Instead of the first groupby – you can use pl.count().over()
-
You can use .map()
to run the count column through stats.t.ppf()
– [.flatten()
is used to unwrap the list]
-
As you now have a count column – Expr.sqrt()
can be used instead of np.sqrt()
This should be equivalent:
columns = pl.exclude(["count", "sqrt", "t_fac_95_conf_int", "t_fac_99_conf_int"])
mean_df = (
data
.with_columns(pl.count().over(groupby))
.with_columns([
pl.col("count").map(lambda col: stats.t.ppf(0.95, col)).flatten().alias("t_fac_95_conf_int"),
pl.col("count").map(lambda col: stats.t.ppf(0.997, col)).flatten().alias("t_fac_99_conf_int"),
pl.col("count").sqrt().alias("sqrt"),
])
.groupby(groupby)
.agg([
columns.mean(),
columns.std().suffix("_SDEV"),
(columns.std() / pl.col("sqrt")).first().suffix("_SEM_68"),
(columns.std() * pl.col("t_fac_95_conf_int") / pl.col("sqrt")).first().suffix("_SEM_95"),
(columns.std() * pl.col("t_fac_99_conf_int") / pl.col("sqrt")).first().suffix("_SEM_99"),
])
)
I just found out about the Polars lib and I wanted to convert some old functions to get familiar.
However, I stumbled upon an issue with my code. The "Mean_Angle" column is not calculated, and I have no idea if the last part even works as intended, it aborts during the groupby operation as the column is missing.
This is the code I want to convert:
def calc_mean_and_error(df: pd.DataFrame, columns=None, groupby="Magn_Pos") -> pd.DataFrame:
data = df.copy()
if columns is None:
columns = ['Left_Angle', 'Right_Angle', 'Magn_Pos', 'Magn_Field']
if 'Left_Angle' in columns and 'Right_Angle' in columns:
data['Mean_Angle'] = (data['Left_Angle'] + data['Right_Angle']) / 2
columns.append('Mean_Angle')
grouped_df = data[columns].groupby(groupby,sort=False)
num_points_per_group = grouped_df.size().values
mean_df = grouped_df.mean()
# standard deviation
mean_df[['Left_Angle_SDEV','Right_Angle_SDEV','Mean_Angle_SDEV']] = grouped_df[['Left_Angle','Right_Angle','Mean_Angle']].std()
# standard error, 1 sigma confidence interval
mean_df[['Left_Angle_SEM_68','Right_Angle_SEM_68','Mean_Angle_SEM_68']] = grouped_df[['Left_Angle','Right_Angle','Mean_Angle']].sem()
# standard error, 2 sigma confidence interval - t distribution
t_fac_95_conf_int = stats.t.ppf(0.95, num_points_per_group) # factor according to https://en.wikipedia.org/wiki/Student%27s_t-distribution
mean_df[['Left_Angle_SEM_95','Right_Angle_SEM_95','Mean_Angle_SEM_95']] = mean_df[['Left_Angle_SEM_68','Right_Angle_SEM_68','Mean_Angle_SEM_68']].multiply(t_fac_95_conf_int, axis=0)
# standard error, 3 sigma confidence interval - t distribution
t_fac_99_conf_int = stats.t.ppf(0.997, num_points_per_group)
mean_df[['Left_Angle_SEM_99','Right_Angle_SEM_99','Mean_Angle_SEM_99']] = mean_df[['Left_Angle_SEM_68','Right_Angle_SEM_68','Mean_Angle_SEM_68']].multiply(t_fac_99_conf_int, axis=0)
mean_df = mean_df.reset_index()
return mean_df
This is what I have so far:
def calc_mean_and_error(df: pl.DataFrame, columns=None, groupby="Magn_Pos") -> pl.DataFrame:
data = df
if columns is None:
columns = ['Left_Angle', 'Right_Angle', 'Magn_Pos', 'Magn_Field']
if 'Left_Angle' in columns and 'Right_Angle' in columns:
# this doesn't work?
data.with_column(
pl.struct(['Left_Angle', 'Right_Angle']).apply(lambda x: (x['Left_Angle'] + x['Right_Angle']) / 2).alias("Mean_Angle")
)
columns.append('Mean_Angle')
grouped_df = data.select(columns).groupby(groupby)
num_points_per_group = grouped_df.count()['count'].take(0)
mean_df = grouped_df.mean()
t_fac_95_conf_int = stats.t.ppf(0.95, num_points_per_group) # factor according to https://en.wikipedia.org/wiki/Student%27s_t-distribution
t_fac_99_conf_int = stats.t.ppf(0.997, num_points_per_group)
# standard deviation
mean_df = data.select(columns).groupby(groupby).agg(
[
pl.col('*').mean(),
pl.col('*').std().suffix('_SDEV'),
pl.col('*').std().apply(lambda x: x / np.sqrt(num_points_per_group)).suffix('_SEM_68'), # standard error
pl.col('*').std().apply(lambda x: x*t_fac_95_conf_int / np.sqrt(num_points_per_group)).suffix('_SEM_95'),
pl.col('*').std().apply(lambda x: x*t_fac_99_conf_int / np.sqrt(num_points_per_group)).suffix('_SEM_99'),
]
)
return mean_df
Example:
data_raw = """Time\tRepetition\tLeft_Angle\tRight_Angle\tMagn_Pos\tMagn_Field
0.0\t0\t111.62539060014953\t111.65929559305457\t20.0\t0.05012
289.75\t1\t113.43406129503042\t113.29101205027376\t20.0\t0.05012
343.420999999973\t2\t113.21669960326668\t113.30918399000467\t20.0\t0.05012
397.68700000003446\t0\t114.50650196149256\t114.78488582815113\t10.0\t0.1317
456.10900000005495\t1\t114.7078936381882\t114.70239460290726\t10.0\t0.1317
507.8279999999795\t2\t115.71894177915732\t115.70104461571628\t10.0\t0.1317
565.3429999999935\t0\t121.71521327349599\t121.55379420624988\t5.0\t0.2276
612.045999999973\t1\t122.53171995914443\t122.4555143281342\t5.0\t0.2276
668.3120000000345\t2\t121.65748098845367\t121.60313424823333\t5.0\t0.2276
714.484000000055\t0\t130.88884567117995\t130.82365731381574\t2.5\t0.3011
774.9679999999935\t1\t132.72366563179372\t132.59019277520363\t2.5\t0.3011
817.765000000014\t2\t133.5549497954158\t133.4637401535662\t2.5\t0.3011
891.7029999999795\t0\t139.9155468732065\t139.78384156146674\t0.0\t0.3907
940.655999999959\t1\t143.34707217674438\t143.2278696177915\t0.0\t0.3907
984.125\t2\t144.30042471080577\t144.16800277145435\t0.0\t0.3907""".encode("utf8")
data = pl.read_csv(data_raw, sep='\t', encoding="utf8")
eval = calc_mean_and_error(data, columns=['Left_Angle', 'Right_Angle', 'Magn_Pos', 'Magn_Field'])
print(eval)
I’m not really sure about the last part though! I am not entirely familiar with the syntax of the expressions. And I am not sure how to prevent calling groupby twice. Can someone lead me in the right direction? Thanks!
.with_column()
returns a new dataframe – it does not modify in-place.
You would assign the result:
data = data.with_column(...)
You don’t need to use .apply()
here – you can use expressions:
data = data.with_column(
((pl.col("Left_Angle") + pl.col("Right_Angle")) / 2)
.alias("Mean_Angle")
)
Regarding calling .groupby()
twice – you’re already using grouped_df
– you could re-use it:
mean_df = grouped_df.agg(...)
However – it may make more sense to add the values you’re using as columns and call groupby twice.
That way you can remove all the further calls to .apply()
and use expressions instead:
# all columns excluding the "temporary" ones we will add
columns = pl.exclude(["t_fac_95_conf_int", "t_fac_99_conf_int", "sqrt"])
mean_df = data.with_columns([
pl.lit(t_fac_95_conf_int).alias("t_fac_95_conf_int"),
pl.lit(t_fac_99_conf_int).alias("t_fac_99_conf_int"),
pl.lit(np.sqrt(num_points_per_group)).alias("sqrt")
]).groupby(groupby).agg([
columns.mean(),
columns.std().suffix("_SDEV"),
(columns.std() / pl.col("sqrt")).first().suffix("_SEM_68"),
(columns.std() * pl.col("t_fac_95_conf_int") / pl.col("sqrt")).first().suffix("_SEM_95"),
(columns.std() * pl.col("t_fac_99_conf_int") / pl.col("sqrt")).first().suffix("_SEM_99"),
])
Further possible changes:
-
Instead of the first groupby – you can use
pl.count().over()
-
You can use
.map()
to run the count column through stats.t.ppf()
– [.flatten()
is used to unwrap the list] -
As you now have a count column –
Expr.sqrt()
can be used instead of np.sqrt()
This should be equivalent:
columns = pl.exclude(["count", "sqrt", "t_fac_95_conf_int", "t_fac_99_conf_int"])
mean_df = (
data
.with_columns(pl.count().over(groupby))
.with_columns([
pl.col("count").map(lambda col: stats.t.ppf(0.95, col)).flatten().alias("t_fac_95_conf_int"),
pl.col("count").map(lambda col: stats.t.ppf(0.997, col)).flatten().alias("t_fac_99_conf_int"),
pl.col("count").sqrt().alias("sqrt"),
])
.groupby(groupby)
.agg([
columns.mean(),
columns.std().suffix("_SDEV"),
(columns.std() / pl.col("sqrt")).first().suffix("_SEM_68"),
(columns.std() * pl.col("t_fac_95_conf_int") / pl.col("sqrt")).first().suffix("_SEM_95"),
(columns.std() * pl.col("t_fac_99_conf_int") / pl.col("sqrt")).first().suffix("_SEM_99"),
])
)