Repeating a date in polars and exploding it
Question:
I have a polars dataframe with two date columns that represent a start and end date and then a value that I want to repeat for all dates in between those two dates so that I can join those on other tables.
Example input is
id
start
end
value
123
2022-01-01
2022-01-04
10
abc
2022-03-04
2022-03-04
3
456
2022-05-11
2022-05-16
4
and expected output is
id
date
value
123
2022-01-01
10
123
2022-01-02
10
123
2022-01-03
10
123
2022-01-04
10
abc
2022-03-04
3
456
2022-05-11
4
456
2022-05-12
4
456
2022-05-13
4
456
2022-05-14
4
456
2022-05-15
4
456
2022-05-16
4
Answers:
Starting with this data:
import polars as pl
from datetime import date
df = pl.DataFrame(
{
"id": ["123", "abc", "456"],
"start": [date(2022, 1, 1), date(2022, 3, 4), date(2022, 5, 11)],
"end": [date(2022, 1, 4), date(2022, 3, 4), date(2022, 5, 16)],
"value": [10, 3, 4],
}
)
df
shape: (3, 4)
┌─────┬────────────┬────────────┬───────┐
│ id ┆ start ┆ end ┆ value │
│ --- ┆ --- ┆ --- ┆ --- │
│ str ┆ date ┆ date ┆ i64 │
╞═════╪════════════╪════════════╪═══════╡
│ 123 ┆ 2022-01-01 ┆ 2022-01-04 ┆ 10 │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ abc ┆ 2022-03-04 ┆ 2022-03-04 ┆ 3 │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 456 ┆ 2022-05-11 ┆ 2022-05-16 ┆ 4 │
└─────┴────────────┴────────────┴───────┘
The Algorithm
(
df.with_columns(
[pl.arange(pl.col("start"), pl.col("end") + 1).alias("date")])
.explode("date")
.with_column(pl.col("date").cast(pl.Date))
.select(["id", "date", "value"])
)
shape: (11, 3)
┌─────┬────────────┬───────┐
│ id ┆ date ┆ value │
│ --- ┆ --- ┆ --- │
│ str ┆ date ┆ i64 │
╞═════╪════════════╪═══════╡
│ 123 ┆ 2022-01-01 ┆ 10 │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 123 ┆ 2022-01-02 ┆ 10 │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 123 ┆ 2022-01-03 ┆ 10 │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 123 ┆ 2022-01-04 ┆ 10 │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ abc ┆ 2022-03-04 ┆ 3 │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 456 ┆ 2022-05-11 ┆ 4 │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 456 ┆ 2022-05-12 ┆ 4 │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 456 ┆ 2022-05-13 ┆ 4 │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 456 ┆ 2022-05-14 ┆ 4 │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 456 ┆ 2022-05-15 ┆ 4 │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 456 ┆ 2022-05-16 ┆ 4 │
└─────┴────────────┴───────┘
In steps
Normally, we create a range of dates using the date_range
expression. However, date_range
does not take an Expression as its low
and high
parameters.
However, arange
does allow Expressions as its low
and high
parameters. We can (implicitly) cast the start
and end
dates to integers, which represent the number of days since the UNIX epoch.
The result is a list of integers which represents the days between (and including) the start
and end
dates (expressed as days since the UNIX epoch)..
Notice that we have to add 1 to the high
parameter to make sure we capture the end date.
(
df.with_columns(
[pl.arange(pl.col("start"), pl.col("end") + 1).alias("date")])
)
shape: (3, 5)
┌─────┬────────────┬────────────┬───────┬───────────────────────────┐
│ id ┆ start ┆ end ┆ value ┆ date │
│ --- ┆ --- ┆ --- ┆ --- ┆ --- │
│ str ┆ date ┆ date ┆ i64 ┆ list[i64] │
╞═════╪════════════╪════════════╪═══════╪═══════════════════════════╡
│ 123 ┆ 2022-01-01 ┆ 2022-01-04 ┆ 10 ┆ [18993, 18994, ... 18996] │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ abc ┆ 2022-03-04 ┆ 2022-03-04 ┆ 3 ┆ [19055] │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ 456 ┆ 2022-05-11 ┆ 2022-05-16 ┆ 4 ┆ [19123, 19124, ... 19128] │
└─────┴────────────┴────────────┴───────┴───────────────────────────┘
Next we can use explode
to place each of the integers on a separate row.
(
df.with_columns(
[pl.arange(pl.col("start"), pl.col("end") + 1).alias("date")])
.explode("date")
)
shape: (11, 5)
┌─────┬────────────┬────────────┬───────┬───────┐
│ id ┆ start ┆ end ┆ value ┆ date │
│ --- ┆ --- ┆ --- ┆ --- ┆ --- │
│ str ┆ date ┆ date ┆ i64 ┆ i64 │
╞═════╪════════════╪════════════╪═══════╪═══════╡
│ 123 ┆ 2022-01-01 ┆ 2022-01-04 ┆ 10 ┆ 18993 │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 123 ┆ 2022-01-01 ┆ 2022-01-04 ┆ 10 ┆ 18994 │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 123 ┆ 2022-01-01 ┆ 2022-01-04 ┆ 10 ┆ 18995 │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 123 ┆ 2022-01-01 ┆ 2022-01-04 ┆ 10 ┆ 18996 │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ abc ┆ 2022-03-04 ┆ 2022-03-04 ┆ 3 ┆ 19055 │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 456 ┆ 2022-05-11 ┆ 2022-05-16 ┆ 4 ┆ 19123 │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 456 ┆ 2022-05-11 ┆ 2022-05-16 ┆ 4 ┆ 19124 │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 456 ┆ 2022-05-11 ┆ 2022-05-16 ┆ 4 ┆ 19125 │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 456 ┆ 2022-05-11 ┆ 2022-05-16 ┆ 4 ┆ 19126 │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 456 ┆ 2022-05-11 ┆ 2022-05-16 ┆ 4 ┆ 19127 │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 456 ┆ 2022-05-11 ┆ 2022-05-16 ┆ 4 ┆ 19128 │
└─────┴────────────┴────────────┴───────┴───────┘
The final step is to cast the date
column back to a pl.Date
, and then select only the columns that we want.
(
df.with_columns(
[pl.arange(pl.col("start"), pl.col("end") + 1).alias("date")])
.explode("date")
.with_column(pl.col("date").cast(pl.Date))
.select(["id", "date", "value"])
)
shape: (11, 3)
┌─────┬────────────┬───────┐
│ id ┆ date ┆ value │
│ --- ┆ --- ┆ --- │
│ str ┆ date ┆ i64 │
╞═════╪════════════╪═══════╡
│ 123 ┆ 2022-01-01 ┆ 10 │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 123 ┆ 2022-01-02 ┆ 10 │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 123 ┆ 2022-01-03 ┆ 10 │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 123 ┆ 2022-01-04 ┆ 10 │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ abc ┆ 2022-03-04 ┆ 3 │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 456 ┆ 2022-05-11 ┆ 4 │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 456 ┆ 2022-05-12 ┆ 4 │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 456 ┆ 2022-05-13 ┆ 4 │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 456 ┆ 2022-05-14 ┆ 4 │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 456 ┆ 2022-05-15 ┆ 4 │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 456 ┆ 2022-05-16 ┆ 4 │
└─────┴────────────┴───────┘
I struggled today with the same problem and I thought I share my solution.
As cbilot already mentions pl.dat_range doesn’t take expressions as low and high value. So I worked around by using apply.
Data:
import polars as pl
from datetime import date
df = pl.DataFrame(
{
"id": ["123", "abc", "456"],
"start": [date(2022, 1, 1), date(2022, 3, 4), date(2022, 5, 11)],
"end": [date(2022, 1, 4), date(2022, 3, 4), date(2022, 5, 16)],
"value": [10, 3, 4],
}
)
Solution:
(
df.with_columns(
[(pl.struct(["start", "end"])
.apply(lambda x: pl.date_range(x["start"], x["end"], "1d"))
.alias("date"))])
.explode(pl.col("date"))
.select(["id", "date", "value"])
)
shape: (11, 3)
┌─────┬────────────┬───────┐
│ id ┆ date ┆ value │
│ --- ┆ --- ┆ --- │
│ str ┆ date ┆ i64 │
╞═════╪════════════╪═══════╡
│ 123 ┆ 2022-01-01 ┆ 10 │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 123 ┆ 2022-01-02 ┆ 10 │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 123 ┆ 2022-01-03 ┆ 10 │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 123 ┆ 2022-01-04 ┆ 10 │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ ... ┆ ... ┆ ... │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 456 ┆ 2022-05-13 ┆ 4 │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 456 ┆ 2022-05-14 ┆ 4 │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 456 ┆ 2022-05-15 ┆ 4 │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 456 ┆ 2022-05-16 ┆ 4 │
└─────┴────────────┴───────┘
I managed to implement it without using apply:
df = pl.DataFrame(
{
"id": ["123", "abc", "456"],
"start": [date(2022, 1, 1), date(2022, 3, 4), date(2022, 5, 11)],
"end": [date(2022, 1, 4), date(2022, 3, 4), date(2022, 5, 16)],
"value": [10, 3, 4],
})
df = df.with_columns(
[pl.date_range(low=pl.col('start'), high=pl.col('end'), interval='1d')
.alias('date')])
.explode('date')
.select(["id", "date", "value"])
I have a polars dataframe with two date columns that represent a start and end date and then a value that I want to repeat for all dates in between those two dates so that I can join those on other tables.
Example input is
id | start | end | value |
---|---|---|---|
123 | 2022-01-01 | 2022-01-04 | 10 |
abc | 2022-03-04 | 2022-03-04 | 3 |
456 | 2022-05-11 | 2022-05-16 | 4 |
and expected output is
id | date | value |
---|---|---|
123 | 2022-01-01 | 10 |
123 | 2022-01-02 | 10 |
123 | 2022-01-03 | 10 |
123 | 2022-01-04 | 10 |
abc | 2022-03-04 | 3 |
456 | 2022-05-11 | 4 |
456 | 2022-05-12 | 4 |
456 | 2022-05-13 | 4 |
456 | 2022-05-14 | 4 |
456 | 2022-05-15 | 4 |
456 | 2022-05-16 | 4 |
Starting with this data:
import polars as pl
from datetime import date
df = pl.DataFrame(
{
"id": ["123", "abc", "456"],
"start": [date(2022, 1, 1), date(2022, 3, 4), date(2022, 5, 11)],
"end": [date(2022, 1, 4), date(2022, 3, 4), date(2022, 5, 16)],
"value": [10, 3, 4],
}
)
df
shape: (3, 4)
┌─────┬────────────┬────────────┬───────┐
│ id ┆ start ┆ end ┆ value │
│ --- ┆ --- ┆ --- ┆ --- │
│ str ┆ date ┆ date ┆ i64 │
╞═════╪════════════╪════════════╪═══════╡
│ 123 ┆ 2022-01-01 ┆ 2022-01-04 ┆ 10 │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ abc ┆ 2022-03-04 ┆ 2022-03-04 ┆ 3 │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 456 ┆ 2022-05-11 ┆ 2022-05-16 ┆ 4 │
└─────┴────────────┴────────────┴───────┘
The Algorithm
(
df.with_columns(
[pl.arange(pl.col("start"), pl.col("end") + 1).alias("date")])
.explode("date")
.with_column(pl.col("date").cast(pl.Date))
.select(["id", "date", "value"])
)
shape: (11, 3)
┌─────┬────────────┬───────┐
│ id ┆ date ┆ value │
│ --- ┆ --- ┆ --- │
│ str ┆ date ┆ i64 │
╞═════╪════════════╪═══════╡
│ 123 ┆ 2022-01-01 ┆ 10 │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 123 ┆ 2022-01-02 ┆ 10 │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 123 ┆ 2022-01-03 ┆ 10 │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 123 ┆ 2022-01-04 ┆ 10 │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ abc ┆ 2022-03-04 ┆ 3 │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 456 ┆ 2022-05-11 ┆ 4 │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 456 ┆ 2022-05-12 ┆ 4 │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 456 ┆ 2022-05-13 ┆ 4 │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 456 ┆ 2022-05-14 ┆ 4 │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 456 ┆ 2022-05-15 ┆ 4 │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 456 ┆ 2022-05-16 ┆ 4 │
└─────┴────────────┴───────┘
In steps
Normally, we create a range of dates using the date_range
expression. However, date_range
does not take an Expression as its low
and high
parameters.
However, arange
does allow Expressions as its low
and high
parameters. We can (implicitly) cast the start
and end
dates to integers, which represent the number of days since the UNIX epoch.
The result is a list of integers which represents the days between (and including) the start
and end
dates (expressed as days since the UNIX epoch)..
Notice that we have to add 1 to the high
parameter to make sure we capture the end date.
(
df.with_columns(
[pl.arange(pl.col("start"), pl.col("end") + 1).alias("date")])
)
shape: (3, 5)
┌─────┬────────────┬────────────┬───────┬───────────────────────────┐
│ id ┆ start ┆ end ┆ value ┆ date │
│ --- ┆ --- ┆ --- ┆ --- ┆ --- │
│ str ┆ date ┆ date ┆ i64 ┆ list[i64] │
╞═════╪════════════╪════════════╪═══════╪═══════════════════════════╡
│ 123 ┆ 2022-01-01 ┆ 2022-01-04 ┆ 10 ┆ [18993, 18994, ... 18996] │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ abc ┆ 2022-03-04 ┆ 2022-03-04 ┆ 3 ┆ [19055] │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ 456 ┆ 2022-05-11 ┆ 2022-05-16 ┆ 4 ┆ [19123, 19124, ... 19128] │
└─────┴────────────┴────────────┴───────┴───────────────────────────┘
Next we can use explode
to place each of the integers on a separate row.
(
df.with_columns(
[pl.arange(pl.col("start"), pl.col("end") + 1).alias("date")])
.explode("date")
)
shape: (11, 5)
┌─────┬────────────┬────────────┬───────┬───────┐
│ id ┆ start ┆ end ┆ value ┆ date │
│ --- ┆ --- ┆ --- ┆ --- ┆ --- │
│ str ┆ date ┆ date ┆ i64 ┆ i64 │
╞═════╪════════════╪════════════╪═══════╪═══════╡
│ 123 ┆ 2022-01-01 ┆ 2022-01-04 ┆ 10 ┆ 18993 │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 123 ┆ 2022-01-01 ┆ 2022-01-04 ┆ 10 ┆ 18994 │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 123 ┆ 2022-01-01 ┆ 2022-01-04 ┆ 10 ┆ 18995 │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 123 ┆ 2022-01-01 ┆ 2022-01-04 ┆ 10 ┆ 18996 │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ abc ┆ 2022-03-04 ┆ 2022-03-04 ┆ 3 ┆ 19055 │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 456 ┆ 2022-05-11 ┆ 2022-05-16 ┆ 4 ┆ 19123 │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 456 ┆ 2022-05-11 ┆ 2022-05-16 ┆ 4 ┆ 19124 │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 456 ┆ 2022-05-11 ┆ 2022-05-16 ┆ 4 ┆ 19125 │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 456 ┆ 2022-05-11 ┆ 2022-05-16 ┆ 4 ┆ 19126 │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 456 ┆ 2022-05-11 ┆ 2022-05-16 ┆ 4 ┆ 19127 │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 456 ┆ 2022-05-11 ┆ 2022-05-16 ┆ 4 ┆ 19128 │
└─────┴────────────┴────────────┴───────┴───────┘
The final step is to cast the date
column back to a pl.Date
, and then select only the columns that we want.
(
df.with_columns(
[pl.arange(pl.col("start"), pl.col("end") + 1).alias("date")])
.explode("date")
.with_column(pl.col("date").cast(pl.Date))
.select(["id", "date", "value"])
)
shape: (11, 3)
┌─────┬────────────┬───────┐
│ id ┆ date ┆ value │
│ --- ┆ --- ┆ --- │
│ str ┆ date ┆ i64 │
╞═════╪════════════╪═══════╡
│ 123 ┆ 2022-01-01 ┆ 10 │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 123 ┆ 2022-01-02 ┆ 10 │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 123 ┆ 2022-01-03 ┆ 10 │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 123 ┆ 2022-01-04 ┆ 10 │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ abc ┆ 2022-03-04 ┆ 3 │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 456 ┆ 2022-05-11 ┆ 4 │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 456 ┆ 2022-05-12 ┆ 4 │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 456 ┆ 2022-05-13 ┆ 4 │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 456 ┆ 2022-05-14 ┆ 4 │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 456 ┆ 2022-05-15 ┆ 4 │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 456 ┆ 2022-05-16 ┆ 4 │
└─────┴────────────┴───────┘
I struggled today with the same problem and I thought I share my solution.
As cbilot already mentions pl.dat_range doesn’t take expressions as low and high value. So I worked around by using apply.
Data:
import polars as pl
from datetime import date
df = pl.DataFrame(
{
"id": ["123", "abc", "456"],
"start": [date(2022, 1, 1), date(2022, 3, 4), date(2022, 5, 11)],
"end": [date(2022, 1, 4), date(2022, 3, 4), date(2022, 5, 16)],
"value": [10, 3, 4],
}
)
Solution:
(
df.with_columns(
[(pl.struct(["start", "end"])
.apply(lambda x: pl.date_range(x["start"], x["end"], "1d"))
.alias("date"))])
.explode(pl.col("date"))
.select(["id", "date", "value"])
)
shape: (11, 3)
┌─────┬────────────┬───────┐
│ id ┆ date ┆ value │
│ --- ┆ --- ┆ --- │
│ str ┆ date ┆ i64 │
╞═════╪════════════╪═══════╡
│ 123 ┆ 2022-01-01 ┆ 10 │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 123 ┆ 2022-01-02 ┆ 10 │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 123 ┆ 2022-01-03 ┆ 10 │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 123 ┆ 2022-01-04 ┆ 10 │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ ... ┆ ... ┆ ... │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 456 ┆ 2022-05-13 ┆ 4 │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 456 ┆ 2022-05-14 ┆ 4 │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 456 ┆ 2022-05-15 ┆ 4 │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 456 ┆ 2022-05-16 ┆ 4 │
└─────┴────────────┴───────┘
I managed to implement it without using apply:
df = pl.DataFrame(
{
"id": ["123", "abc", "456"],
"start": [date(2022, 1, 1), date(2022, 3, 4), date(2022, 5, 11)],
"end": [date(2022, 1, 4), date(2022, 3, 4), date(2022, 5, 16)],
"value": [10, 3, 4],
})
df = df.with_columns(
[pl.date_range(low=pl.col('start'), high=pl.col('end'), interval='1d')
.alias('date')])
.explode('date')
.select(["id", "date", "value"])