How to swap column values on conditions in python polars?
Question:
I have a data frame as below:
df_n = pl.from_pandas(pd.DataFrame({'last_name':[np.nan,'mallesh','bhavik'],
'first_name':['a','b','c'],
'middle_name_or_initial':['aa','bb','cc']}))
Here I would like to find an observation which has First and Middle Name not NULL and Last Name is Null, in this case first_name should be swapped to last_name and middle_name should be swapped to first_name, and middle_name to be EMPTY.
expected output will be:
I’m trying with this command:
df_n.with_columns([
pl.when((pl.col('first_name').is_not_null()) & (pl.col('middle_name_or_initial').is_not_null()) & (pl.col('last_name').is_null())
).then(pl.col('first_name').alias('last_name')).otherwise(pl.col('last_name').alias('first_name')),
pl.when((pl.col('first_name').is_not_null()) & (pl.col('middle_name_or_initial').is_not_null()) & (pl.col('last_name').is_null())
).then(pl.col('middle_name_or_initial').alias('first_name')).otherwise('').alias('middle_name_or_initial')
]
)
Here it is throwing a wrong output and any help ?
Answers:
With pl.when().then().otherwise()
you create values for only one column (so only one alias at the end is allowed).
In [67]: df_n.with_columns(
...: [
...: # Create temp column with filter, so it does not have to be recalculated 3 times.
...: ((pl.col('first_name').is_not_null()) & (pl.col('middle_name_or_initial').is_not_null()) & (pl.col('last_name').is_null())).alias("swap_names")
...: ]
...: ).with_columns(
...: [
...: # Create new columns with the correct value based on the swap_names column.
...: pl.when(pl.col("swap_names")).then(pl.col("first_name")).otherwise(pl.col("last_name")).alias("last_name_new"),
...: pl.when(pl.col("swap_names")).then(pl.col("middle_name_or_initial")).otherwise(pl.col("first_name")).alias("first_name_new"),
...: pl.when(pl.col("swap_names")).then(None).otherwise(pl.col("middle_name_or_initial")).alias("middle_name_or_initial_new"),
...: ]
...: )
Out[67]:
shape: (3, 7)
┌───────────┬────────────┬────────────────────────┬────────────┬───────────────┬────────────────┬────────────────────────────┐
│ last_name ┆ first_name ┆ middle_name_or_initial ┆ swap_names ┆ last_name_new ┆ first_name_new ┆ middle_name_or_initial_new │
│ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │
│ str ┆ str ┆ str ┆ bool ┆ str ┆ str ┆ str │
╞═══════════╪════════════╪════════════════════════╪════════════╪═══════════════╪════════════════╪════════════════════════════╡
│ null ┆ a ┆ aa ┆ true ┆ a ┆ aa ┆ null │
├╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ mallesh ┆ b ┆ bb ┆ false ┆ mallesh ┆ b ┆ bb │
├╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ bhavik ┆ c ┆ cc ┆ false ┆ bhavik ┆ c ┆ cc │
└───────────┴────────────┴────────────────────────┴────────────┴───────────────┴────────────────┴────────────────────────────┘
You can actually change the values of multiple columns within a single when/then/otherwise
statement.
The Algorithm
name_cols = ["last_name", "first_name", "middle_name_or_initial"]
(
df_n.with_column(
pl.when(
(pl.col("first_name").is_not_null())
& (pl.col("middle_name_or_initial").is_not_null())
& (pl.col("last_name").is_null())
)
.then(pl.struct([
pl.col('first_name').alias('last_name'),
pl.col('middle_name_or_initial').alias('first_name'),
pl.col('last_name').alias('middle_name_or_initial'),
]))
.otherwise(pl.struct(name_cols))
.alias('name_struct')
)
.drop(name_cols)
.unnest('name_struct')
)
shape: (3, 3)
┌───────────┬────────────┬────────────────────────┐
│ last_name ┆ first_name ┆ middle_name_or_initial │
│ --- ┆ --- ┆ --- │
│ str ┆ str ┆ str │
╞═══════════╪════════════╪════════════════════════╡
│ a ┆ aa ┆ null │
├╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ mallesh ┆ b ┆ bb │
├╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ bhavik ┆ c ┆ cc │
└───────────┴────────────┴────────────────────────┘
How it works
To change the values of multiple columns within a single when/then/otherwise
statement, we can use structs. But you must observe some rules with structs. In all your then
and otherwise
statements, your structs must have:
- the same field names
- in the same order
- with the same data type in corresponding fields.
So, in both the then
and otherwise
statements, I’m going to create a struct with field names in this order:
- last_name: string
- first_name: string
- middle_name_or_initial: string
In our then
statement, I’m swapping values and using alias
to ensure that my fields names are as stated above. (This is important.)
.then(pl.struct([
pl.col('first_name').alias('last_name'),
pl.col('middle_name_or_initial').alias('first_name'),
pl.col('last_name').alias('middle_name_or_initial'),
]))
And in the otherwise
statement, we’ll simply name the existing columns that we want, in the order that we want – using the list name_cols
that I created in a previous step.
name_cols = ["last_name", "first_name", "middle_name_or_initial"]
...
.otherwise(pl.struct(name_cols))
Here’s the result after the when/then/otherwise
statement.
name_cols = ["last_name", "first_name", "middle_name_or_initial"]
(
df_n.with_column(
pl.when(
(pl.col("first_name").is_not_null())
& (pl.col("middle_name_or_initial").is_not_null())
& (pl.col("last_name").is_null())
)
.then(pl.struct([
pl.col('first_name').alias('last_name'),
pl.col('middle_name_or_initial').alias('first_name'),
pl.col('last_name').alias('middle_name_or_initial'),
]))
.otherwise(pl.struct(name_cols))
.alias('name_struct')
)
)
shape: (3, 4)
┌───────────┬────────────┬────────────────────────┬──────────────────────┐
│ last_name ┆ first_name ┆ middle_name_or_initial ┆ name_struct │
│ --- ┆ --- ┆ --- ┆ --- │
│ str ┆ str ┆ str ┆ struct[3] │
╞═══════════╪════════════╪════════════════════════╪══════════════════════╡
│ null ┆ a ┆ aa ┆ {"a","aa",null} │
├╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ mallesh ┆ b ┆ bb ┆ {"mallesh","b","bb"} │
├╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ bhavik ┆ c ┆ cc ┆ {"bhavik","c","cc"} │
└───────────┴────────────┴────────────────────────┴──────────────────────┘
Notice that our new struct name_struct
has the values that we want – in the correct order.
Next, we will use unnest
to break the struct into separate columns. (But first, we must drop the existing columns so that we don’t get 2 sets of columns with the same names.)
name_cols = ["last_name", "first_name", "middle_name_or_initial"]
(
df_n.with_column(
pl.when(
(pl.col("first_name").is_not_null())
& (pl.col("middle_name_or_initial").is_not_null())
& (pl.col("last_name").is_null())
)
.then(pl.struct([
pl.col('first_name').alias('last_name'),
pl.col('middle_name_or_initial').alias('first_name'),
pl.col('last_name').alias('middle_name_or_initial'),
]))
.otherwise(pl.struct(name_cols))
.alias('name_struct')
)
.drop(name_cols)
.unnest('name_struct')
)
shape: (3, 3)
┌───────────┬────────────┬────────────────────────┐
│ last_name ┆ first_name ┆ middle_name_or_initial │
│ --- ┆ --- ┆ --- │
│ str ┆ str ┆ str │
╞═══════════╪════════════╪════════════════════════╡
│ a ┆ aa ┆ null │
├╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ mallesh ┆ b ┆ bb │
├╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ bhavik ┆ c ┆ cc │
└───────────┴────────────┴────────────────────────┘
I have a data frame as below:
df_n = pl.from_pandas(pd.DataFrame({'last_name':[np.nan,'mallesh','bhavik'],
'first_name':['a','b','c'],
'middle_name_or_initial':['aa','bb','cc']}))
Here I would like to find an observation which has First and Middle Name not NULL and Last Name is Null, in this case first_name should be swapped to last_name and middle_name should be swapped to first_name, and middle_name to be EMPTY.
expected output will be:
I’m trying with this command:
df_n.with_columns([
pl.when((pl.col('first_name').is_not_null()) & (pl.col('middle_name_or_initial').is_not_null()) & (pl.col('last_name').is_null())
).then(pl.col('first_name').alias('last_name')).otherwise(pl.col('last_name').alias('first_name')),
pl.when((pl.col('first_name').is_not_null()) & (pl.col('middle_name_or_initial').is_not_null()) & (pl.col('last_name').is_null())
).then(pl.col('middle_name_or_initial').alias('first_name')).otherwise('').alias('middle_name_or_initial')
]
)
Here it is throwing a wrong output and any help ?
With pl.when().then().otherwise()
you create values for only one column (so only one alias at the end is allowed).
In [67]: df_n.with_columns(
...: [
...: # Create temp column with filter, so it does not have to be recalculated 3 times.
...: ((pl.col('first_name').is_not_null()) & (pl.col('middle_name_or_initial').is_not_null()) & (pl.col('last_name').is_null())).alias("swap_names")
...: ]
...: ).with_columns(
...: [
...: # Create new columns with the correct value based on the swap_names column.
...: pl.when(pl.col("swap_names")).then(pl.col("first_name")).otherwise(pl.col("last_name")).alias("last_name_new"),
...: pl.when(pl.col("swap_names")).then(pl.col("middle_name_or_initial")).otherwise(pl.col("first_name")).alias("first_name_new"),
...: pl.when(pl.col("swap_names")).then(None).otherwise(pl.col("middle_name_or_initial")).alias("middle_name_or_initial_new"),
...: ]
...: )
Out[67]:
shape: (3, 7)
┌───────────┬────────────┬────────────────────────┬────────────┬───────────────┬────────────────┬────────────────────────────┐
│ last_name ┆ first_name ┆ middle_name_or_initial ┆ swap_names ┆ last_name_new ┆ first_name_new ┆ middle_name_or_initial_new │
│ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │
│ str ┆ str ┆ str ┆ bool ┆ str ┆ str ┆ str │
╞═══════════╪════════════╪════════════════════════╪════════════╪═══════════════╪════════════════╪════════════════════════════╡
│ null ┆ a ┆ aa ┆ true ┆ a ┆ aa ┆ null │
├╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ mallesh ┆ b ┆ bb ┆ false ┆ mallesh ┆ b ┆ bb │
├╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ bhavik ┆ c ┆ cc ┆ false ┆ bhavik ┆ c ┆ cc │
└───────────┴────────────┴────────────────────────┴────────────┴───────────────┴────────────────┴────────────────────────────┘
You can actually change the values of multiple columns within a single when/then/otherwise
statement.
The Algorithm
name_cols = ["last_name", "first_name", "middle_name_or_initial"]
(
df_n.with_column(
pl.when(
(pl.col("first_name").is_not_null())
& (pl.col("middle_name_or_initial").is_not_null())
& (pl.col("last_name").is_null())
)
.then(pl.struct([
pl.col('first_name').alias('last_name'),
pl.col('middle_name_or_initial').alias('first_name'),
pl.col('last_name').alias('middle_name_or_initial'),
]))
.otherwise(pl.struct(name_cols))
.alias('name_struct')
)
.drop(name_cols)
.unnest('name_struct')
)
shape: (3, 3)
┌───────────┬────────────┬────────────────────────┐
│ last_name ┆ first_name ┆ middle_name_or_initial │
│ --- ┆ --- ┆ --- │
│ str ┆ str ┆ str │
╞═══════════╪════════════╪════════════════════════╡
│ a ┆ aa ┆ null │
├╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ mallesh ┆ b ┆ bb │
├╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ bhavik ┆ c ┆ cc │
└───────────┴────────────┴────────────────────────┘
How it works
To change the values of multiple columns within a single when/then/otherwise
statement, we can use structs. But you must observe some rules with structs. In all your then
and otherwise
statements, your structs must have:
- the same field names
- in the same order
- with the same data type in corresponding fields.
So, in both the then
and otherwise
statements, I’m going to create a struct with field names in this order:
- last_name: string
- first_name: string
- middle_name_or_initial: string
In our then
statement, I’m swapping values and using alias
to ensure that my fields names are as stated above. (This is important.)
.then(pl.struct([
pl.col('first_name').alias('last_name'),
pl.col('middle_name_or_initial').alias('first_name'),
pl.col('last_name').alias('middle_name_or_initial'),
]))
And in the otherwise
statement, we’ll simply name the existing columns that we want, in the order that we want – using the list name_cols
that I created in a previous step.
name_cols = ["last_name", "first_name", "middle_name_or_initial"]
...
.otherwise(pl.struct(name_cols))
Here’s the result after the when/then/otherwise
statement.
name_cols = ["last_name", "first_name", "middle_name_or_initial"]
(
df_n.with_column(
pl.when(
(pl.col("first_name").is_not_null())
& (pl.col("middle_name_or_initial").is_not_null())
& (pl.col("last_name").is_null())
)
.then(pl.struct([
pl.col('first_name').alias('last_name'),
pl.col('middle_name_or_initial').alias('first_name'),
pl.col('last_name').alias('middle_name_or_initial'),
]))
.otherwise(pl.struct(name_cols))
.alias('name_struct')
)
)
shape: (3, 4)
┌───────────┬────────────┬────────────────────────┬──────────────────────┐
│ last_name ┆ first_name ┆ middle_name_or_initial ┆ name_struct │
│ --- ┆ --- ┆ --- ┆ --- │
│ str ┆ str ┆ str ┆ struct[3] │
╞═══════════╪════════════╪════════════════════════╪══════════════════════╡
│ null ┆ a ┆ aa ┆ {"a","aa",null} │
├╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ mallesh ┆ b ┆ bb ┆ {"mallesh","b","bb"} │
├╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ bhavik ┆ c ┆ cc ┆ {"bhavik","c","cc"} │
└───────────┴────────────┴────────────────────────┴──────────────────────┘
Notice that our new struct name_struct
has the values that we want – in the correct order.
Next, we will use unnest
to break the struct into separate columns. (But first, we must drop the existing columns so that we don’t get 2 sets of columns with the same names.)
name_cols = ["last_name", "first_name", "middle_name_or_initial"]
(
df_n.with_column(
pl.when(
(pl.col("first_name").is_not_null())
& (pl.col("middle_name_or_initial").is_not_null())
& (pl.col("last_name").is_null())
)
.then(pl.struct([
pl.col('first_name').alias('last_name'),
pl.col('middle_name_or_initial').alias('first_name'),
pl.col('last_name').alias('middle_name_or_initial'),
]))
.otherwise(pl.struct(name_cols))
.alias('name_struct')
)
.drop(name_cols)
.unnest('name_struct')
)
shape: (3, 3)
┌───────────┬────────────┬────────────────────────┐
│ last_name ┆ first_name ┆ middle_name_or_initial │
│ --- ┆ --- ┆ --- │
│ str ┆ str ┆ str │
╞═══════════╪════════════╪════════════════════════╡
│ a ┆ aa ┆ null │
├╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ mallesh ┆ b ┆ bb │
├╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ bhavik ┆ c ┆ cc │
└───────────┴────────────┴────────────────────────┘