How to convert multi column expressions from Pandas to Polars

Question:

I just found out about the Polars lib and I wanted to convert some old functions to get familiar.

However, I stumbled upon an issue with my code. The "Mean_Angle" column is not calculated, and I have no idea if the last part even works as intended, it aborts during the groupby operation as the column is missing.

This is the code I want to convert:

def calc_mean_and_error(df: pd.DataFrame, columns=None, groupby="Magn_Pos") -> pd.DataFrame:
    data = df.copy()
    if columns is None:
        columns = ['Left_Angle', 'Right_Angle', 'Magn_Pos', 'Magn_Field']

    if 'Left_Angle' in columns and 'Right_Angle' in columns:
        data['Mean_Angle'] = (data['Left_Angle'] + data['Right_Angle']) / 2
        columns.append('Mean_Angle')
    grouped_df = data[columns].groupby(groupby,sort=False)

    num_points_per_group = grouped_df.size().values
    mean_df = grouped_df.mean()

    # standard deviation
    mean_df[['Left_Angle_SDEV','Right_Angle_SDEV','Mean_Angle_SDEV']] = grouped_df[['Left_Angle','Right_Angle','Mean_Angle']].std()

    # standard error, 1 sigma confidence interval
    mean_df[['Left_Angle_SEM_68','Right_Angle_SEM_68','Mean_Angle_SEM_68']] = grouped_df[['Left_Angle','Right_Angle','Mean_Angle']].sem()

    # standard error, 2 sigma confidence interval - t distribution
    t_fac_95_conf_int = stats.t.ppf(0.95, num_points_per_group) # factor according to https://en.wikipedia.org/wiki/Student%27s_t-distribution
    mean_df[['Left_Angle_SEM_95','Right_Angle_SEM_95','Mean_Angle_SEM_95']] = mean_df[['Left_Angle_SEM_68','Right_Angle_SEM_68','Mean_Angle_SEM_68']].multiply(t_fac_95_conf_int, axis=0)

    # standard error, 3 sigma confidence interval - t distribution
    t_fac_99_conf_int = stats.t.ppf(0.997, num_points_per_group)
    mean_df[['Left_Angle_SEM_99','Right_Angle_SEM_99','Mean_Angle_SEM_99']] = mean_df[['Left_Angle_SEM_68','Right_Angle_SEM_68','Mean_Angle_SEM_68']].multiply(t_fac_99_conf_int, axis=0)

    mean_df = mean_df.reset_index()

    return mean_df

This is what I have so far:

def calc_mean_and_error(df: pl.DataFrame, columns=None, groupby="Magn_Pos") -> pl.DataFrame:
    data = df
    if columns is None:
        columns = ['Left_Angle', 'Right_Angle', 'Magn_Pos', 'Magn_Field']

    if 'Left_Angle' in columns and 'Right_Angle' in columns:
# this doesn't work?
        data.with_column(
            pl.struct(['Left_Angle', 'Right_Angle']).apply(lambda x: (x['Left_Angle'] + x['Right_Angle']) / 2).alias("Mean_Angle")
        )
        columns.append('Mean_Angle')
    grouped_df = data.select(columns).groupy(groupby)

    num_points_per_group = grouped_df.count()['count'].take(0)
    mean_df = grouped_df.mean()

    t_fac_95_conf_int = stats.t.ppf(0.95, num_points_per_group) # factor according to https://en.wikipedia.org/wiki/Student%27s_t-distribution
    t_fac_99_conf_int = stats.t.ppf(0.997, num_points_per_group)
    # standard deviation
    mean_df = data.select(columns).groupby(groupby).agg(
        [
            pl.col('*').mean(),
            pl.col('*').std().suffix('_SDEV'),
            pl.col('*').std().apply(lambda x: x / np.sqrt(num_points_per_group)).suffix('_SEM_68'), # standard error
            pl.col('*').std().apply(lambda x: x*t_fac_95_conf_int / np.sqrt(num_points_per_group)).suffix('_SEM_95'),
            pl.col('*').std().apply(lambda x: x*t_fac_99_conf_int / np.sqrt(num_points_per_group)).suffix('_SEM_99'),
        ]
    )

    return mean_df

Example:

data_raw = """TimetRepetitiontLeft_AngletRight_AngletMagn_PostMagn_Field
0.0t0t111.62539060014953t111.65929559305457t20.0t0.05012
289.75t1t113.43406129503042t113.29101205027376t20.0t0.05012
343.420999999973t2t113.21669960326668t113.30918399000467t20.0t0.05012
397.68700000003446t0t114.50650196149256t114.78488582815113t10.0t0.1317
456.10900000005495t1t114.7078936381882t114.70239460290726t10.0t0.1317
507.8279999999795t2t115.71894177915732t115.70104461571628t10.0t0.1317
565.3429999999935t0t121.71521327349599t121.55379420624988t5.0t0.2276
612.045999999973t1t122.53171995914443t122.4555143281342t5.0t0.2276
668.3120000000345t2t121.65748098845367t121.60313424823333t5.0t0.2276
714.484000000055t0t130.88884567117995t130.82365731381574t2.5t0.3011
774.9679999999935t1t132.72366563179372t132.59019277520363t2.5t0.3011
817.765000000014t2t133.5549497954158t133.4637401535662t2.5t0.3011
891.7029999999795t0t139.9155468732065t139.78384156146674t0.0t0.3907
940.655999999959t1t143.34707217674438t143.2278696177915t0.0t0.3907
984.125t2t144.30042471080577t144.16800277145435t0.0t0.3907""".encode("utf8")

data = pl.read_csv(data_raw, sep='t', encoding="utf8")
eval = calc_mean_and_error(data, columns=['Left_Angle', 'Right_Angle', 'Magn_Pos', 'Magn_Field'])
print(eval)

I’m not really sure about the last part though! I am not entirely familiar with the syntax of the expressions. And I am not sure how to prevent calling groupby twice. Can someone lead me in the right direction? Thanks!

Asked By: Raphael

||

Answers:

.with_column() returns a new dataframe – it does not modify in-place.

You would assign the result:

data = data.with_column(...)

You don’t need to use .apply() here – you can use expressions:

data = data.with_column(
   ((pl.col("Left_Angle") + pl.col("Right_Angle")) / 2)
   .alias("Mean_Angle")
)

Regarding calling .groupby() twice – you’re already using grouped_df – you could re-use it:

mean_df = grouped_df.agg(...)

However – it may make more sense to add the values you’re using as columns and call groupby twice.

That way you can remove all the further calls to .apply() and use expressions instead:

# all columns excluding the "temporary" ones we will add
columns = pl.exclude(["t_fac_95_conf_int", "t_fact_99_conf_int", "sqrt"])

mean_df = data.with_columns([
   pl.lit(t_fac_95_conf_int).alias("t_fac_95_conf_int"),
   pl.lit(t_fac_99_conf_int).alias("t_fac_99_conf_int"),
   pl.lit(np.sqrt(num_points_per_group)).alias("sqrt")
]).groupby(groupby).agg([
   columns.mean(),
   columns.std().suffix("_SDEV"),
   (columns.std() / pl.col("sqrt")).first().suffix("_SEM_68"),
   (columns.std() * pl.col("t_fac_95_conf_int") / pl.col("sqrt")).first().suffix("_SEM_95"),
   (columns.std() * pl.col("t_fac_99_conf_int") / pl.col("sqrt")).first().suffix("_SEM_99"),
])

Further possible changes:

  • Instead of the first groupby – you can use pl.count().over()

  • You can use .map() to run the count column through stats.t.ppf() – [.flatten() is used to unwrap the list]

  • As you now have a count column – Expr.sqrt() can be used instead of np.sqrt()

This should be equivalent:

columns = pl.exclude(["count", "sqrt", "t_fac_95_conf_int", "t_fac_99_conf_int"])
mean_df = (
   data
   .with_columns(pl.count().over(groupby))
   .with_columns([
      pl.col("count").map(lambda col: stats.t.ppf(0.95,  col)).flatten().alias("t_fac_95_conf_int"),
      pl.col("count").map(lambda col: stats.t.ppf(0.997, col)).flatten().alias("t_fac_99_conf_int"),
      pl.col("count").sqrt().alias("sqrt"),
   ])
   .groupby(groupby)
   .agg([
      columns.mean(),
      columns.std().suffix("_SDEV"),
      (columns.std() / pl.col("sqrt")).first().suffix("_SEM_68"),
      (columns.std() * pl.col("t_fac_95_conf_int") / pl.col("sqrt")).first().suffix("_SEM_95"),
      (columns.std() * pl.col("t_fac_99_conf_int") / pl.col("sqrt")).first().suffix("_SEM_99"),
   ])
)
Answered By: jqurious
Categories: questions Tags: , ,
Answers are sorted by their score. The answer accepted by the question owner as the best is marked with
at the top-right corner.