Polars: Fill missing months per group
Question:
I want to fill in missing months in a data frame per group based on the minimum and maximum date in each group.
This approach works, but it relies on polars' `apply`.
import polars as pl
import numpy as np
from datetime import date
# Synthetic input for the example: DATA_SIZE rows, each holding a random id,
# a randomly picked month and a random value.
DATA_SIZE = 10_000_000

raw_df = pl.DataFrame(
    {
        # Random ids picked from 0..9999.
        "id": np.random.choice(range(10000), size=DATA_SIZE),
        # Monthly calendar built from date(1940, 1, 1) to date(2020, 1, 1),
        # then sampled with replacement so months repeat and some are missing.
        "date": pl.date_range(
            date(1940, 1, 1), date(2020, 1, 1), interval="1mo"
        ).sample(DATA_SIZE, with_replacement=True),
        # One random float per row.
        "value": np.random.rand(DATA_SIZE),
    }
)
# Pack each group's first and last date into one struct column so that a
# single apply() call can see both bounds.
date_bounds = pl.struct(
    [
        pl.col("date").min().alias("startDate"),
        pl.col("date").max().alias("endDate"),
    ]
)

# One row per (id, date) for every monthly step between the group's bounds.
month_grid = (
    raw_df
    .groupby("id")
    .agg(
        date_bounds
        .apply(
            lambda bounds: pl.date_range(
                bounds["startDate"], bounds["endDate"], interval="1mo"
            )
        )
        .alias("date")
    )
    .explode("date")
)

# Left-join the observations back onto the grid; rows without a matching
# observation come back with a null "value", which is replaced by 0.
clean_df = (
    month_grid
    .join(raw_df, how="left", on=["id", "date"])
    .with_column(pl.col("value").fill_null(0))
)
Now that `date_range` accepts expressions, there should be a more elegant and faster solution, but I can't get it to work. My attempt fails with: ValueError: did not expect value <built-in function id> of type <class 'builtin_function_or_method'>
# Expression-based attempt: aggregate each id's first and last date, then
# hand both columns to date_range inside select().
(raw_df
.groupby("id")
.agg([
pl.col("date").min().alias("startDate"), pl.col("date").max().alias("endDate")
])
.select([
# NOTE(review): `id` is written without quotes here, so it refers to Python's
# built-in id() function rather than the "id" column; the ValueError quoted
# in the surrounding text names exactly that object.
id, pl.date_range(pl.col("startDate"), pl.col("endDate"), interval="1mo")
])
)
How can I use `date_range` with expressions?
Answers:
Your approach is correct, but there is one small mistake: the quotation marks around "id" are missing. Without them, Python passes its built-in function `id` instead of the column name, which is why the error message is a little cryptic.
# First and last date of each group, computed as two separate columns.
group_bounds = [
    pl.col("date").min().alias("startDate"),
    pl.col("date").max().alias("endDate"),
]

# date_range driven by the two bound columns instead of a Python lambda.
month_series = pl.date_range(
    pl.col("startDate"), pl.col("endDate"), interval="1mo"
).alias("date")

# Note that the column name "id" must be passed as a quoted string.
(
    raw_df
    .groupby("id")
    .agg(group_bounds)
    .select(["id", month_series])
    # One row per (id, date) pair.
    .explode("date")
    # Bring the observed values back; unmatched rows get a null "value" ...
    .join(raw_df, how="left", on=["id", "date"])
    # ... which is then replaced by 0.
    .with_column(pl.col("value").fill_null(0))
)
I want to fill in missing months in a data frame per group based on the minimum and maximum date in each group.
This approach works, but it relies on polars' `apply`.
import polars as pl
import numpy as np
from datetime import date
# Synthetic input for the example: DATA_SIZE rows, each holding a random id,
# a randomly picked month and a random value.
DATA_SIZE = 10_000_000

raw_df = pl.DataFrame(
    {
        # Random ids picked from 0..9999.
        "id": np.random.choice(range(10000), size=DATA_SIZE),
        # Monthly calendar built from date(1940, 1, 1) to date(2020, 1, 1),
        # then sampled with replacement so months repeat and some are missing.
        "date": pl.date_range(
            date(1940, 1, 1), date(2020, 1, 1), interval="1mo"
        ).sample(DATA_SIZE, with_replacement=True),
        # One random float per row.
        "value": np.random.rand(DATA_SIZE),
    }
)
# Pack each group's first and last date into one struct column so that a
# single apply() call can see both bounds.
date_bounds = pl.struct(
    [
        pl.col("date").min().alias("startDate"),
        pl.col("date").max().alias("endDate"),
    ]
)

# One row per (id, date) for every monthly step between the group's bounds.
month_grid = (
    raw_df
    .groupby("id")
    .agg(
        date_bounds
        .apply(
            lambda bounds: pl.date_range(
                bounds["startDate"], bounds["endDate"], interval="1mo"
            )
        )
        .alias("date")
    )
    .explode("date")
)

# Left-join the observations back onto the grid; rows without a matching
# observation come back with a null "value", which is replaced by 0.
clean_df = (
    month_grid
    .join(raw_df, how="left", on=["id", "date"])
    .with_column(pl.col("value").fill_null(0))
)
Now that `date_range` accepts expressions, there should be a more elegant and faster solution, but I can't get it to work. My attempt fails with: ValueError: did not expect value <built-in function id> of type <class 'builtin_function_or_method'>
# Expression-based attempt: aggregate each id's first and last date, then
# hand both columns to date_range inside select().
(raw_df
.groupby("id")
.agg([
pl.col("date").min().alias("startDate"), pl.col("date").max().alias("endDate")
])
.select([
# NOTE(review): `id` is written without quotes here, so it refers to Python's
# built-in id() function rather than the "id" column; the ValueError quoted
# in the surrounding text names exactly that object.
id, pl.date_range(pl.col("startDate"), pl.col("endDate"), interval="1mo")
])
)
How can I use `date_range` with expressions?
Your approach is correct, but there is one small mistake: the quotation marks around "id" are missing. Without them, Python passes its built-in function `id` instead of the column name, which is why the error message is a little cryptic.
# First and last date of each group, computed as two separate columns.
group_bounds = [
    pl.col("date").min().alias("startDate"),
    pl.col("date").max().alias("endDate"),
]

# date_range driven by the two bound columns instead of a Python lambda.
month_series = pl.date_range(
    pl.col("startDate"), pl.col("endDate"), interval="1mo"
).alias("date")

# Note that the column name "id" must be passed as a quoted string.
(
    raw_df
    .groupby("id")
    .agg(group_bounds)
    .select(["id", month_series])
    # One row per (id, date) pair.
    .explode("date")
    # Bring the observed values back; unmatched rows get a null "value" ...
    .join(raw_df, how="left", on=["id", "date"])
    # ... which is then replaced by 0.
    .with_column(pl.col("value").fill_null(0))
)