Forward filling by column values in python polars
Question:
I am wondering whether there is a way to forward fill across columns (i.e. along each row) in Polars. I tried fill_null(strategy='forward'),
but it fills each null from the row above in the same column rather than from the preceding column in the same row. What is the best way to get a row-wise equivalent of ffill()?
# Example frame: one row per id and one "eatYYYY" column per year.
# None marks a missing value; eat2002 is missing for every id.
df = pl.DataFrame(
    {
        "id": ["NY", "TK", "FD"],
        "eat2000": [1, 6, 3],
        "eat2001": [-2, None, 4],
        "eat2002": [None, None, None],
        "eat2003": [-9, 3, 8],
        "eat2004": [None, None, 8],
    }
)
df  # bare expression: shows the frame when run in a REPL/notebook
shape: (3, 6)
┌─────┬─────────┬─────────┬─────────┬─────────┬─────────┐
│ id ┆ eat2000 ┆ eat2001 ┆ eat2002 ┆ eat2003 ┆ eat2004 │
│ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │
│ str ┆ i64 ┆ i64 ┆ f64 ┆ i64 ┆ i64 │
╞═════╪═════════╪═════════╪═════════╪═════════╪═════════╡
│ NY ┆ 1 ┆ -2 ┆ null ┆ -9 ┆ null │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┤
│ TK ┆ 6 ┆ null ┆ null ┆ 3 ┆ null │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┤
│ FD ┆ 3 ┆ 4 ┆ null ┆ 8 ┆ 8 │
└─────┴─────────┴─────────┴─────────┴─────────┴─────────┘
# Attempt from the question: strategy='forward' fills each null from the row
# above in the same column (TK/eat2001 becomes -2 in the output below),
# not from the column to its left.
df.fill_null(strategy='forward')
┌─────┬─────────┬─────────┬─────────┬─────────┬─────────┐
│ id ┆ eat2000 ┆ eat2001 ┆ eat2002 ┆ eat2003 ┆ eat2004 │
│ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │
│ str ┆ i64 ┆ i64 ┆ f64 ┆ i64 ┆ i64 │
╞═════╪═════════╪═════════╪═════════╪═════════╪═════════╡
│ NY ┆ 1 ┆ -2 ┆ null ┆ -9 ┆ null │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┤
│ TK ┆ 6 ┆ -2 ┆ null ┆ 3 ┆ null │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┤
│ FD ┆ 3 ┆ 4 ┆ null ┆ 8 ┆ 8 │
└─────┴─────────┴─────────┴─────────┴─────────┴─────────┘
Expected result:
┌─────┬─────────┬─────────┬─────────┬─────────┬─────────┐
│ id ┆ eat2000 ┆ eat2001 ┆ eat2002 ┆ eat2003 ┆ eat2004 │
│ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │
│ str ┆ i64 ┆ i64 ┆ f64 ┆ i64 ┆ i64 │
╞═════╪═════════╪═════════╪═════════╪═════════╪═════════╡
│ NY ┆ 1 ┆ -2 ┆ -2 ┆ -9 ┆ -9 │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┤
│ TK ┆ 6 ┆ 6 ┆ 6 ┆ 3 ┆ 3 │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┤
│ FD ┆ 3 ┆ 4 ┆ 4 ┆ 8 ┆ 8 │
└─────┴─────────┴─────────┴─────────┴─────────┴─────────┘
Answers:
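You can use the new coalesce expression to fold columns horizontally: for each row it returns the first non-null value among the listed columns, so listing the columns newest-first gives a row-wise forward fill.
If you place the coalesce expressions in a with_columns context, they will be run in parallel.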
(
    df
    # Cast every eat* column to Int64 so all coalesce operands share one dtype.
    .with_columns(pl.col("^eat.*$").cast(pl.Int64))
    # Each coalesce lists its target column first, then every older year, so the
    # target keeps its own value when present and otherwise takes the most
    # recent earlier non-null value. The expressions are independent of each
    # other; they are listed here from the oldest target to the newest.
    .with_columns(
        [
            pl.coalesce(["eat2001", "eat2000"]),
            pl.coalesce(["eat2002", "eat2001", "eat2000"]),
            pl.coalesce(["eat2003", "eat2002", "eat2001", "eat2000"]),
            pl.coalesce(["eat2004", "eat2003", "eat2002", "eat2001", "eat2000"]),
        ]
    )
)
shape: (3, 6)
┌─────┬─────────┬─────────┬─────────┬─────────┬─────────┐
│ id ┆ eat2000 ┆ eat2001 ┆ eat2002 ┆ eat2003 ┆ eat2004 │
│ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │
│ str ┆ i64 ┆ i64 ┆ i64 ┆ i64 ┆ i64 │
╞═════╪═════════╪═════════╪═════════╪═════════╪═════════╡
│ NY ┆ 1 ┆ -2 ┆ -2 ┆ -9 ┆ -9 │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┤
│ TK ┆ 6 ┆ 6 ┆ 6 ┆ 3 ┆ 3 │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┤
│ FD ┆ 3 ┆ 4 ┆ 4 ┆ 8 ┆ 8 │
└─────┴─────────┴─────────┴─────────┴─────────┴─────────┘
A couple of notes:
1. I first cast the eatXXXX columns to the same type. (In the DataFrame constructor, eat2002 is of type Float64 because of the way Polars initializes an all-null column that is not supplied with an explicit datatype.)
2. I’ve written out the list of coalesce expressions for demonstration, but the list of expressions can be generated with a Python list comprehension:
# Names of the eat* columns, newest year first.
# Assumes df.columns lists the years in ascending order.
eat_cols = [name for name in df.columns if name.startswith("eat")][::-1]

(
    df
    # Cast every eat* column to Int64 so all coalesce operands share one dtype.
    .with_columns(pl.col("^eat.*$").cast(pl.Int64))
    .with_columns(
        [
            # eat_cols[start:] is the target column followed by every older one;
            # the oldest column is skipped because nothing precedes it.
            pl.coalesce(eat_cols[start:])
            for start in range(len(eat_cols) - 1)
        ]
    )
)
# Alternative approach: transpose so that each original row becomes a column,
# forward-fill down those columns, then transpose back.
(
    df
    .transpose(include_header=True)
    # pl.all() selects every column of the transposed frame, so the frame does
    # not have to be transposed a second time just to list its column names.
    .with_columns(pl.all().forward_fill())
    # [1:] drops the first row, which holds the header names carried along by
    # include_header=True.
    .transpose(column_names=df.columns)[1:]
    # Cast every column except id to Int64.
    .with_columns(
        [
            pl.all().exclude("id").cast(pl.Int64)
        ]
    )
)
I am wondering whether there is a way to forward fill across columns (i.e. along each row) in Polars. I tried fill_null(strategy='forward'),
but it fills each null from the row above in the same column rather than from the preceding column in the same row. What is the best way to get a row-wise equivalent of ffill()?
# Example frame: one row per id and one "eatYYYY" column per year.
# None marks a missing value; eat2002 is missing for every id.
df = pl.DataFrame(
    {
        "id": ["NY", "TK", "FD"],
        "eat2000": [1, 6, 3],
        "eat2001": [-2, None, 4],
        "eat2002": [None, None, None],
        "eat2003": [-9, 3, 8],
        "eat2004": [None, None, 8],
    }
)
df  # bare expression: shows the frame when run in a REPL/notebook
shape: (3, 6)
┌─────┬─────────┬─────────┬─────────┬─────────┬─────────┐
│ id ┆ eat2000 ┆ eat2001 ┆ eat2002 ┆ eat2003 ┆ eat2004 │
│ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │
│ str ┆ i64 ┆ i64 ┆ f64 ┆ i64 ┆ i64 │
╞═════╪═════════╪═════════╪═════════╪═════════╪═════════╡
│ NY ┆ 1 ┆ -2 ┆ null ┆ -9 ┆ null │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┤
│ TK ┆ 6 ┆ null ┆ null ┆ 3 ┆ null │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┤
│ FD ┆ 3 ┆ 4 ┆ null ┆ 8 ┆ 8 │
└─────┴─────────┴─────────┴─────────┴─────────┴─────────┘
# Attempt from the question: strategy='forward' fills each null from the row
# above in the same column (TK/eat2001 becomes -2 in the output below),
# not from the column to its left.
df.fill_null(strategy='forward')
┌─────┬─────────┬─────────┬─────────┬─────────┬─────────┐
│ id ┆ eat2000 ┆ eat2001 ┆ eat2002 ┆ eat2003 ┆ eat2004 │
│ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │
│ str ┆ i64 ┆ i64 ┆ f64 ┆ i64 ┆ i64 │
╞═════╪═════════╪═════════╪═════════╪═════════╪═════════╡
│ NY ┆ 1 ┆ -2 ┆ null ┆ -9 ┆ null │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┤
│ TK ┆ 6 ┆ -2 ┆ null ┆ 3 ┆ null │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┤
│ FD ┆ 3 ┆ 4 ┆ null ┆ 8 ┆ 8 │
└─────┴─────────┴─────────┴─────────┴─────────┴─────────┘
Expected result:
┌─────┬─────────┬─────────┬─────────┬─────────┬─────────┐
│ id ┆ eat2000 ┆ eat2001 ┆ eat2002 ┆ eat2003 ┆ eat2004 │
│ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │
│ str ┆ i64 ┆ i64 ┆ f64 ┆ i64 ┆ i64 │
╞═════╪═════════╪═════════╪═════════╪═════════╪═════════╡
│ NY ┆ 1 ┆ -2 ┆ -2 ┆ -9 ┆ -9 │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┤
│ TK ┆ 6 ┆ 6 ┆ 6 ┆ 3 ┆ 3 │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┤
│ FD ┆ 3 ┆ 4 ┆ 4 ┆ 8 ┆ 8 │
└─────┴─────────┴─────────┴─────────┴─────────┴─────────┘
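You can use the new coalesce expression to fold columns horizontally: for each row it returns the first non-null value among the listed columns, so listing the columns newest-first gives a row-wise forward fill.
If you place the coalesce expressions in a with_columns context, they will be run in parallel.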
(
    df
    # Cast every eat* column to Int64 so all coalesce operands share one dtype.
    .with_columns(pl.col("^eat.*$").cast(pl.Int64))
    # Each coalesce lists its target column first, then every older year, so the
    # target keeps its own value when present and otherwise takes the most
    # recent earlier non-null value. The expressions are independent of each
    # other; they are listed here from the oldest target to the newest.
    .with_columns(
        [
            pl.coalesce(["eat2001", "eat2000"]),
            pl.coalesce(["eat2002", "eat2001", "eat2000"]),
            pl.coalesce(["eat2003", "eat2002", "eat2001", "eat2000"]),
            pl.coalesce(["eat2004", "eat2003", "eat2002", "eat2001", "eat2000"]),
        ]
    )
)
shape: (3, 6)
┌─────┬─────────┬─────────┬─────────┬─────────┬─────────┐
│ id ┆ eat2000 ┆ eat2001 ┆ eat2002 ┆ eat2003 ┆ eat2004 │
│ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │
│ str ┆ i64 ┆ i64 ┆ i64 ┆ i64 ┆ i64 │
╞═════╪═════════╪═════════╪═════════╪═════════╪═════════╡
│ NY ┆ 1 ┆ -2 ┆ -2 ┆ -9 ┆ -9 │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┤
│ TK ┆ 6 ┆ 6 ┆ 6 ┆ 3 ┆ 3 │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┤
│ FD ┆ 3 ┆ 4 ┆ 4 ┆ 8 ┆ 8 │
└─────┴─────────┴─────────┴─────────┴─────────┴─────────┘
A couple of notes:
1. I first cast the eatXXXX columns to the same type. (In the DataFrame constructor, eat2002 is of type Float64 because of the way Polars initializes an all-null column that is not supplied with an explicit datatype.)
2. I’ve written out the list of coalesce expressions for demonstration, but the list of expressions can be generated with a Python list comprehension:
# Names of the eat* columns, newest year first.
# Assumes df.columns lists the years in ascending order.
eat_cols = [name for name in df.columns if name.startswith("eat")][::-1]

(
    df
    # Cast every eat* column to Int64 so all coalesce operands share one dtype.
    .with_columns(pl.col("^eat.*$").cast(pl.Int64))
    .with_columns(
        [
            # eat_cols[start:] is the target column followed by every older one;
            # the oldest column is skipped because nothing precedes it.
            pl.coalesce(eat_cols[start:])
            for start in range(len(eat_cols) - 1)
        ]
    )
)
# Alternative approach: transpose so that each original row becomes a column,
# forward-fill down those columns, then transpose back.
(
    df
    .transpose(include_header=True)
    # pl.all() selects every column of the transposed frame, so the frame does
    # not have to be transposed a second time just to list its column names.
    .with_columns(pl.all().forward_fill())
    # [1:] drops the first row, which holds the header names carried along by
    # include_header=True.
    .transpose(column_names=df.columns)[1:]
    # Cast every column except id to Int64.
    .with_columns(
        [
            pl.all().exclude("id").cast(pl.Int64)
        ]
    )
)