is there a way to implement pandas wide_to_long in Polars?

Question:

I use pandas `wide_to_long` to stack survey data and it works beautifully with regex and stub names. Is this possible to do in Polars?

e.g. in Pandas –

import pandas as pd

# Wide-format survey data: one row per (famid, birth), with heights at two
# ages stored in separate "ht_<suffix>" columns.
df = pd.DataFrame({
'famid': [1, 1, 1, 2, 2, 2, 3, 3, 3],
'birth': [1, 2, 3, 1, 2, 3, 1, 2, 3],
'ht_one': [2.8, 2.9, 2.2, 2, 1.8, 1.9, 2.2, 2.3, 2.1],
'ht_two': [3.4, 3.8, 2.9, 3.2, 2.8, 2.4, 3.3, 3.4, 2.9]
})
# suffix must be r'\w+' (word characters) so the non-numeric suffixes
# "one"/"two" are matched; pandas' default suffix pattern is r'\d+'.
changed_df = pd.wide_to_long(df, 
                              stubnames='ht', 
                              i=['famid', 'birth'], 
                              j='age',
                              sep='_', 
                              suffix=r'\w+')

stubnames can take a list as well.

Edit- Added code after taking inspiration from Jqurious –

import pandas as pd
import numpy as np
import polars as pl
import re

# Create age group data
age_groups = np.random.choice(['0-18', '19-35', '36-50', '51-65', '65+'], size=10)

# Create gender data
genders = np.random.choice(['Male', 'Female', 'Other'], size=10)

# Create familiarity and affinity data
fam_aff = np.random.rand(10, 4)

# Create column names
cols = ['Age_group', 'Gender', 'Familiarity_loop1', 'Familiarity_loop2', 'Affinity_loop1', 'Affinity_loop2']

# Combine data into dataframe
data = np.column_stack([age_groups, genders, fam_aff])
df = pd.DataFrame(data=data, columns=cols)
df["unique_records"] = np.arange(len(df))

# Matches the repeated-measure columns, e.g. "Familiarity_loop1".
# The pattern needs r'\d' (a digit), not a literal 'd'.
regex_pattern = r'^.*_loop\d'

# get polars DF
pl_df = pl.from_pandas(df)

# get all columns list
col_list = pl_df.columns

loop_list = []       # columns which contain "_loop<digit>"
sans_loop_list = []  # columns which do not (the id columns)

for col in col_list:
    if re.search(regex_pattern, col):
        loop_list.append(col)
    else:
        sans_loop_list.append(col)


pl_melt_df = (pl_df
 .melt(
    id_vars = pl_df.select(sans_loop_list).columns, 
    variable_name = "master_stack")
    # strip the "_loop<digit>" suffix so both loops share one stub name
    .with_columns(pl.col("master_stack").str.replace(r"_loop\d", ""))
)

pl_melt_df.pivot(index=sans_loop_list, columns="master_stack", values="value")

I want to see Affinity and Familiarity as their own columns, but I am not able to achieve it.

Edit 2 – Added Polars output and Pandas output

Polars –
polars melt and pivot output

Pandas output –
pandas wide_to_long output

Asked By: Ezio

||

Answers:

It looks like a type of .melt:

(df
 .melt(
    # everything except the "ht_*" columns stays as identifiers;
    # the pattern needs r"\w+" (word characters), not a literal "w+"
    id_vars = df.select(pl.exclude(r"^ht_\w+$")).columns, 
    variable_name = "age")
 .with_columns(
    # keep only the suffix after the first "_" (the stub separator)
    pl.col("age").str.replace(r"^[^_]+_", ""))
)
shape: (18, 4)
┌───────┬───────┬─────┬───────┐
│ famid ┆ birth ┆ age ┆ value │
│ ---   ┆ ---   ┆ --- ┆ ---   │
│ i64   ┆ i64   ┆ str ┆ f64   │
╞═══════╪═══════╪═════╪═══════╡
│ 1     ┆ 1     ┆ one ┆ 2.8   │
│ 1     ┆ 2     ┆ one ┆ 2.9   │
│ 1     ┆ 3     ┆ one ┆ 2.2   │
│ 2     ┆ 1     ┆ one ┆ 2.0   │
│ …     ┆ …     ┆ …   ┆ …     │
│ 2     ┆ 3     ┆ two ┆ 2.4   │
│ 3     ┆ 1     ┆ two ┆ 3.3   │
│ 3     ┆ 2     ┆ two ┆ 3.4   │
│ 3     ┆ 3     ┆ two ┆ 2.9   │
└───────┴───────┴─────┴───────┘

Update: Showing how to .melt + .pivot as per updated example.

# The suffix regex must use r"\d+" (one or more digits); a literal "d+"
# matches nothing in these column names.
suffix = r"_loop\d+$"
id_vars = df.select(pl.exclude("^.+" + suffix)).columns

(df.melt(id_vars)
   # drop the suffix so Familiarity_loop1/2 collapse into "Familiarity"
   .with_columns(pl.col("variable").str.replace(suffix, ""))
   # per-stub row counter so the pivot gets a unique index per source row
   .with_columns(row_nr = pl.first().cumcount().over("variable"))
   .pivot(index=id_vars + ["row_nr"], columns="variable", values="value", aggregate_function="first")
)
shape: (20, 6)
┌───────────┬────────┬────────────────┬────────┬──────────────────────┬────────────────────┐
│ Age_group ┆ Gender ┆ unique_records ┆ row_nr ┆ Familiarity          ┆ Affinity           │
│ ---       ┆ ---    ┆ ---            ┆ ---    ┆ ---                  ┆ ---                │
│ str       ┆ str    ┆ i64            ┆ i64    ┆ str                  ┆ str                │
╞═══════════╪════════╪════════════════╪════════╪══════════════════════╪════════════════════╡
│ 36-50     ┆ Other  ┆ 0              ┆ 0      ┆ 0.5569650307413312   ┆ 0.9752962344272071 │
│ 19-35     ┆ Other  ┆ 1              ┆ 1      ┆ 0.8723228408633724   ┆ 0.9051378743187902 │
│ 19-35     ┆ Other  ┆ 2              ┆ 2      ┆ 0.010929392505575009 ┆ 0.7381748177290146 │
│ 36-50     ┆ Female ┆ 3              ┆ 3      ┆ 0.9782593916079607   ┆ 0.5099868864386063 │
│ …         ┆ …      ┆ …              ┆ …      ┆ …                    ┆ …                  │
│ 0-18      ┆ Female ┆ 6              ┆ 16     ┆ 0.6795089322714142   ┆ 0.3982539618169999 │
│ 19-35     ┆ Female ┆ 7              ┆ 17     ┆ 0.8168297900583801   ┆ 0.6198522863927297 │
│ 51-65     ┆ Female ┆ 8              ┆ 18     ┆ 0.46387232803532885  ┆ 0.9925845189718061 │
│ 51-65     ┆ Male   ┆ 9              ┆ 19     ┆ 0.20514774525608237  ┆ 0.9388295904692754 │
└───────────┴────────┴────────────────┴────────┴──────────────────────┴────────────────────┘

Pivot explanation:

# Toy frame: two stacked "variables" with three values each.
df = pl.DataFrame(
    {
        "variable": ["familiarity", "familiarity", "familiarity",
                     "affinity", "affinity", "affinity"],
        "value": [1, 2, 3, 4, 5, 6],
    }
)
shape: (6, 2)
┌─────────────┬───────┐
│ variable    ┆ value │
│ ---         ┆ ---   │
│ str         ┆ i64   │
╞═════════════╪═══════╡
│ familiarity ┆ 1     │
│ familiarity ┆ 2     │
│ familiarity ┆ 3     │
│ affinity    ┆ 4     │
│ affinity    ┆ 5     │
│ affinity    ┆ 6     │
└─────────────┴───────┘

We use a window function to generate "row ids" to be used in the pivot index.

This would be what you would use .groupby + .cumcount for in pandas.

>>> df.with_columns(row_nr = pl.first().cumcount().over("variable"))
shape: (6, 3)
┌─────────────┬───────┬────────┐
│ variable    ┆ value ┆ row_nr │
│ ---         ┆ ---   ┆ ---    │
│ str         ┆ i64   ┆ i64    │
╞═════════════╪═══════╪════════╡
│ familiarity ┆ 1     ┆ 0      │
│ familiarity ┆ 2     ┆ 1      │
│ familiarity ┆ 3     ┆ 2      │
│ affinity    ┆ 4     ┆ 0      │
│ affinity    ┆ 5     ┆ 1      │
│ affinity    ┆ 6     ┆ 2      │
└─────────────┴───────┴────────┘
# Pivot back to wide: row_nr becomes the index, each "variable" value a column.
(df.with_columns(row_nr = pl.first().cumcount().over("variable"))
   .pivot(index="row_nr", columns="variable", values="value", aggregate_function="first"))
shape: (3, 3)
┌────────┬─────────────┬──────────┐
│ row_nr ┆ familiarity ┆ affinity │
│ ---    ┆ ---         ┆ ---      │
│ i64    ┆ i64         ┆ i64      │
╞════════╪═════════════╪══════════╡
│ 0      ┆ 1           ┆ 4        │
│ 1      ┆ 2           ┆ 5        │
│ 2      ┆ 3           ┆ 6        │
└────────┴─────────────┴──────────┘
Answered By: jqurious

Here’s an alternative to the .melt / .pivot approach.

df = pl.DataFrame({
   "id": ["a", "b", "c", "d"], 
   "Fam_loop1": [1, 2, 3, 4], 
   "Aff_loop1": [8, 7, 6, 5],
   "Aff_loop2": [4, 3, 2, 1]
})
shape: (4, 4)
┌─────┬───────────┬───────────┬───────────┐
│ id  ┆ Fam_loop1 ┆ Aff_loop1 ┆ Aff_loop2 │
│ --- ┆ ---       ┆ ---       ┆ ---       │
│ str ┆ i64       ┆ i64       ┆ i64       │
╞═════╪═══════════╪═══════════╪═══════════╡
│ a   ┆ 1         ┆ 8         ┆ 4         │
│ b   ┆ 2         ┆ 7         ┆ 3         │
│ c   ┆ 3         ┆ 6         ┆ 2         │
│ d   ┆ 4         ┆ 5         ┆ 1         │
└─────┴───────────┴───────────┴───────────┘

If we fill in missing columns, e.g. Fam_loop2 in the above example – we can then create lists of equal size:

[id, id], [fam_loop1, fam_loop2], [aff_loop1, aff_loop2]

Which we can then .explode to create the long format.

This should be a much more efficient approach, and also works with the Lazy API.


Update: After reading Tricky Long Pivot by Reverse Aggregation transformation (Pandas) I thought it would be nice to allow the prefix or suffix to be used as the resulting column names.

Using pivot=True would use the suffixes as column names.


def wide_to_long(df, starts_with=None, ends_with=None, pivot=False, pivot_name="prefix"):
    """Wide-to-long reshape via list columns + ``.explode`` (Lazy-API friendly).

    Exactly one of `starts_with`/`ends_with` must be given: a regex fragment
    identifying the "wide" columns. The remaining part of each wide column
    name (the stub) becomes an output column. Missing stub/suffix
    combinations are filled with typed nulls so all lists explode evenly.

    With ``pivot=True`` the suffixes become the output column names and the
    prefixes are collected into a column named `pivot_name`.

    Raises:
        ValueError: if both, or neither, of `starts_with`/`ends_with` is given.
    """
    if starts_with and ends_with:
        raise ValueError("Provide either `starts_with` or `ends_with`, not both.")

    if starts_with is None and ends_with is None:
        raise ValueError("Must provide either `starts_with` or `ends_with`.")

    if starts_with:
        pattern = starts_with
        prefix  = pl.all().str.extract("(" + pattern + ")")
        suffix  = pl.all().str.replace(pattern, "")

    if ends_with:
        pattern = ends_with
        prefix  = pl.all().str.replace(pattern, "")
        suffix  = pl.all().str.extract("(" + pattern + ")")

    # Work on the column *names* as a one-column frame so we can use
    # joins/filters to compute the prefix/suffix combinations.
    columns = pl.DataFrame(df.columns, schema=["col"])

    is_wide = pl.all().str.contains(pattern)
    narrow  = columns.filter(is_wide.is_not())
    wide    = columns.filter(is_wide)

    wide = wide.with_columns(
        prefix = prefix,
        suffix = suffix,
        dtype  = pl.all().map_dict(df.schema)
    )

    prefixes = wide.unique(subset="prefix", maintain_order=True)
    suffixes = wide.select(pl.col("suffix").unique(maintain_order=True))

    # Every prefix x suffix combination that *should* exist.
    combinations = (
        suffixes
        .join(prefixes, how="cross")
        .select(
            col = pl.col("prefix") + pl.col("suffix"),
            dtype = "dtype"
        )
    )

    # Combinations absent from the frame -> added as typed null columns.
    missing = (
        combinations
        .join(columns, how="anti", on=columns.columns)
    )

    nulls = (
        pl.lit(None).alias(col).cast(dtype)
        for col, dtype in missing.select("col", "dtype").iter_rows()
    )

    meta = []
    prefix_columns = []

    height = suffixes.height
    names  = prefixes.get_column("prefix")
    fmt    = "^{}.+$"

    # use suffixes as column names
    if pivot:

        height = prefixes.height
        names  = suffixes.to_series()
        fmt    = "^.+{}$"

        # One literal column per prefix, used to label the exploded rows.
        meta = (
            pl.lit(prefix).alias(f"_{pivot_name}{n}")
            for n, prefix in enumerate(prefixes.get_column("prefix"))
        )

        # The helper columns are named "_<pivot_name><digits>", so the
        # pattern must be r"\d+" (digits), not a literal "d+".
        prefix_columns = [
            pl.concat_list(pl.list(rf"^_{pivot_name}\d+$")).alias(pivot_name)
        ]

    # Repeat each id column once per stub group so list lengths line up.
    narrow_columns = (
        pl.concat_list(pl.list(name) for _ in range(height))
        for name in narrow.to_series()
    )

    wide_columns = (
        pl.concat_list(pl.list(fmt.format(name)).alias(name))
        for name in names
    )

    return (
        df.with_columns(nulls)
          .with_columns(meta)
          .select(*narrow_columns, *prefix_columns, *wide_columns)
          .explode(pl.all())
    )
>>> wide_to_long(df, ends_with=r"_loopd+$")
shape: (8, 3)
┌─────┬──────┬─────┐
│ id  ┆ Fam  ┆ Aff │
│ --- ┆ ---  ┆ --- │
│ str ┆ i64  ┆ i64 │
╞═════╪══════╪═════╡
│ a   ┆ 1    ┆ 8   │
│ b   ┆ 2    ┆ 7   │
│ c   ┆ 3    ┆ 6   │
│ d   ┆ 4    ┆ 5   │
│ a   ┆ null ┆ 4   │
│ b   ┆ null ┆ 3   │
│ c   ┆ null ┆ 2   │
│ d   ┆ null ┆ 1   │
└─────┴──────┴─────┘

Using a size of 500_000 from your example:

import time  # needed for perf_counter below

start = time.perf_counter()
melt_pivot(pl_df)
time.perf_counter() - start

# 2.419576150015928
start = time.perf_counter()
# the suffix regex needs r"\d+" (digits), not a literal "d+"
wide_to_long(pl_df, ends_with=r"_loop\d+$")
time.perf_counter() - start

# 0.06175561097916216

2.4s -> 0.06s

Answered By: jqurious
Categories: questions Tags: , ,
Answers are sorted by their score. The answer accepted by the question owner as the best is marked with a check at the top-right corner.