Combine date ranges in Spark dataframe
Question:
I have a problem similar to this one.
However, I am dealing with a huge dataset. I was trying to see if I can do the same thing in PySpark instead of pandas. Below is the solution in pandas. Can this be done in PySpark?
def merge_dates(grp):
    # Find contiguous date groups, and get the first/last start/end date for each group.
    dt_groups = (grp['StartDate'] != grp['EndDate'].shift()).cumsum()
    return grp.groupby(dt_groups).agg({'StartDate': 'first', 'EndDate': 'last'})

# Perform a groupby and apply the merge_dates function, followed by formatting.
df = df.groupby(['FruitID', 'FruitType']).apply(merge_dates)
df = df.reset_index().drop('level_2', axis=1)
Answers:
We can use a Window and the lag function to compute the contiguous groups, and then aggregate them much like the pandas function you shared. A working example is given below; hope this helps!
import pandas as pd
from dateutil.parser import parse
from pyspark.sql.window import Window
import pyspark.sql.functions as F

# EXAMPLE DATA -----------------------------------------------
# pd.DataFrame.from_items was removed in pandas 1.0; a plain dict preserves
# the column order on Python 3.7+.
pdf = pd.DataFrame({
    'FruitID': [1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4],
    'FruitType': ['Apple', 'Apple', 'Apple', 'Orange', 'Orange', 'Orange',
                  'Banana', 'Banana', 'Blueberry', 'Mango', 'Kiwi', 'Mango'],
    'StartDate': [parse(x) for x in ['2015-01-01', '2016-01-01', '2017-01-01', '2015-01-01', '2016-05-31',
                                     '2017-01-01', '2015-01-01', '2016-01-01', '2017-01-01', '2015-01-01',
                                     '2016-09-15', '2017-01-01']],
    'EndDate': [parse(x) for x in ['2016-01-01', '2017-01-01', '2018-01-01', '2016-01-01', '2017-01-01',
                                   '2018-01-01', '2016-01-01', '2017-01-01', '2018-01-01', '2016-01-01',
                                   '2017-01-01', '2018-01-01']],
})
pdf = pdf.sort_values(['FruitID', 'StartDate'])  # sort_values returns a copy, so assign it back
df = spark.createDataFrame(pdf)  # on old Spark versions use sqlContext.createDataFrame(pdf)
# FIND CONTIGUOUS GROUPS AND AGGREGATE ---------------------
w = Window.partitionBy("FruitType").orderBy("StartDate")

# A new group starts whenever the previous row's EndDate differs from this row's StartDate.
contiguous = F.when(F.datediff(F.lag("EndDate", 1).over(w), F.col("StartDate")) != 0, F.lit(1)).otherwise(F.lit(0))

df = (df
      .withColumn('contiguous_grp', F.sum(contiguous).over(w))
      .groupBy('FruitType', 'contiguous_grp')
      .agg(F.first('StartDate').alias('StartDate'), F.last('EndDate').alias('EndDate'))
      .drop('contiguous_grp'))
df.show()
Output:
+---------+-------------------+-------------------+
|FruitType| StartDate| EndDate|
+---------+-------------------+-------------------+
| Orange|2015-01-01 00:00:00|2016-01-01 00:00:00|
| Orange|2016-05-31 00:00:00|2018-01-01 00:00:00|
| Banana|2015-01-01 00:00:00|2017-01-01 00:00:00|
| Kiwi|2016-09-15 00:00:00|2017-01-01 00:00:00|
| Mango|2015-01-01 00:00:00|2016-01-01 00:00:00|
| Mango|2017-01-01 00:00:00|2018-01-01 00:00:00|
| Apple|2015-01-01 00:00:00|2018-01-01 00:00:00|
|Blueberry|2017-01-01 00:00:00|2018-01-01 00:00:00|
+---------+-------------------+-------------------+
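A caveat worth adding (my note, not part of the answer above): F.first and F.last give no ordering guarantee inside a groupBy aggregation, so on a large shuffled dataset they may pick arbitrary rows. Because each contiguous group's earliest StartDate and latest EndDate are exactly what we want here, F.min and F.max are a deterministic drop-in replacement for the aggregation step:

# Deterministic variant: min/max do not depend on row order within each group.
df = (df
      .withColumn('contiguous_grp', F.sum(contiguous).over(w))
      .groupBy('FruitType', 'contiguous_grp')
      .agg(F.min('StartDate').alias('StartDate'),   # earliest start in the group
           F.max('EndDate').alias('EndDate'))       # latest end in the group
      .drop('contiguous_grp'))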
Florian's answer does not handle date ranges nested inside other date ranges, and it misses a few other important cases, so below I provide several modified versions.
When combining date ranges, these are the points to take into account:
- ranges nested inside other ranges
- null values
- the acceptable size of the gap between date ranges (do you need "touching" date ranges to be combined too?)
All four of the following scripts handle ranges nested inside other ranges, but they differ on the other two criteria, which is why there are several variations. The sketch right below shows why nested ranges call for a running maximum of the end dates rather than a plain lag of the previous end date.
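A minimal sketch of the nested-range problem (hypothetical data, my addition). With a plain lag(end_date), the third row below compares its start against the nested range's end (2022-05-05) and wrongly opens a new group; lagging the running maximum compares against the outer range's end (2022-05-20) instead. The running max is computed in a separate withColumn step here, which is equivalent to the inline max_end expressions used in the options that follow:

from pyspark.sql import functions as F, Window as W

demo = spark.createDataFrame(
    [("x", "2022-05-01", "2022-05-20"),   # outer range
     ("x", "2022-05-02", "2022-05-05"),   # nested inside the outer range
     ("x", "2022-05-10", "2022-05-12")],  # also inside, but starts after the nested range ends
    ["id", "start_date", "end_date"])

w = W.partitionBy("id").orderBy("start_date")
(demo
 .withColumn("run_max_end", F.max("end_date").over(w))      # running maximum of end dates
 .withColumn("prev_end", F.lag("end_date").over(w))         # naive: 2022-05-05 on the last row
 .withColumn("prev_max_end", F.lag("run_max_end").over(w))  # robust: 2022-05-20 on the last row
 .show())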
Option 1 – when null dates may exist and both "touching" (consecutive) and overlapping date ranges need to be combined
A null in column start_date is treated as the earliest possible date, and a null in end_date as the latest.
w = W.partitionBy("id").orderBy("start_date")

# Running max of end_date seen so far; it becomes null (open-ended) as soon as any
# end_date in the window is null ("any" is the Spark SQL boolean aggregate, Spark 3.0+).
max_end = F.when(~F.expr("any(end_date is null)").over(w), F.max("end_date").over(w))

# A new group starts only when there is at least a one-full-day gap after everything
# seen so far (a datediff of -1 means exactly consecutive, so it still merges).
contiguous = F.when(F.datediff(F.lag(max_end).over(w), "start_date") < -1, 1).otherwise(0)

df = (df
      .withColumn("contiguous_grp", F.sum(contiguous).over(w))
      .groupBy("id", "contiguous_grp")
      .agg(
          F.first("start_date").alias("start_date"),
          # The merged end is null (open-ended) if any member's end_date is null.
          F.when(~F.expr("any(end_date is null)"), F.max("end_date")).alias("end_date"))
      .drop("contiguous_grp")
)
Test dataframe:
from pyspark.sql import functions as F, Window as W
df = spark.createDataFrame(
    [("separate", "2022-01-01", "2022-01-09"),
     ("separate", "2022-01-11", "2022-01-20"),
     ("consecutive", "2022-02-01", "2022-02-10"),
     ("consecutive", "2022-02-11", "2022-02-20"),
     ("overlap by 1", "2022-03-01", "2022-03-11"),
     ("overlap by 1", "2022-03-11", "2022-03-20"),
     ("overlap by 2", "2022-04-01", "2022-04-12"),
     ("overlap by 2", "2022-04-11", "2022-04-20"),
     ("inside", "2022-05-01", "2022-05-20"),
     ("inside", "2022-05-02", "2022-05-19"),
     ("common_start", "2022-06-01", "2022-06-20"),
     ("common_start", "2022-06-01", "2022-06-19"),
     ("common_end", "2022-07-01", "2022-07-20"),
     ("common_end", "2022-07-02", "2022-07-20"),
     ("overlap grp of 3", "2022-08-01", "2022-08-19"),
     ("overlap grp of 3", "2022-08-08", "2022-08-12"),
     ("overlap grp of 3", "2022-08-15", "2022-08-20"),
     ("n: separate", None, "2022-01-09"),
     ("n: separate", "2022-01-11", None),
     ("n: consecutive", None, "2022-02-10"),
     ("n: consecutive", "2022-02-11", None),
     ("n: overlap by 1", None, "2022-03-11"),
     ("n: overlap by 1", "2022-03-11", None),
     ("n: overlap by 2", None, "2022-04-12"),
     ("n: overlap by 2", "2022-04-11", None),
     ("n: inside", None, None),
     ("n: inside", "2022-05-02", "2022-05-19"),
     ("n: common_start", None, "2022-06-20"),
     ("n: common_start", None, "2022-06-19"),
     ("n: common_end", "2022-07-01", None),
     ("n: common_end", "2022-07-02", None),
     ("n: overlap grp of 3", None, None),
     ("n: overlap grp of 3", "2022-08-08", "2022-08-12"),
     ("n: overlap grp of 3", "2022-08-15", None)],
    ["id", "start_date", "end_date"])
Result:
+-------------------+----------+----------+
| id|start_date| end_date|
+-------------------+----------+----------+
| common_end|2022-07-01|2022-07-20|
| common_start|2022-06-01|2022-06-20|
| consecutive|2022-02-01|2022-02-20|
| inside|2022-05-01|2022-05-20|
| n: common_end|2022-07-01| null|
| n: common_start| null|2022-06-20|
| n: consecutive| null| null|
| n: inside| null| null|
| n: overlap by 1| null| null|
| n: overlap by 2| null| null|
|n: overlap grp of 3| null| null|
| n: separate| null|2022-01-09|
| n: separate|2022-01-11| null|
| overlap by 1|2022-03-01|2022-03-20|
| overlap by 2|2022-04-01|2022-04-20|
| overlap grp of 3|2022-08-01|2022-08-20|
| separate|2022-01-01|2022-01-09|
| separate|2022-01-11|2022-01-20|
+-------------------+----------+----------+
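As a side note (my addition, not from the original answer): the null semantics above can also be implemented by substituting sentinel dates for the nulls before grouping and restoring them afterwards. A rough sketch, assuming the columns are castable to dates:

# Hypothetical alternative: replace nulls with sentinel dates up front.
# F.coalesce returns its first non-null argument, so a null start becomes the
# earliest representable date and a null end the latest.
df_filled = (df
             .withColumn("start_date", F.coalesce(F.to_date("start_date"), F.to_date(F.lit("0001-01-01"))))
             .withColumn("end_date", F.coalesce(F.to_date("end_date"), F.to_date(F.lit("9999-12-31")))))
# ...then run Option 3 or 4 on df_filled and map the sentinel dates back to null.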
Option 2 – when null dates may exist and only overlapping date ranges need to be combined ("touching"/consecutive ranges don't need to be combined)
Same as above: a null in column start_date is treated as the earliest possible date, and a null in end_date as the latest. The only change from Option 1 is the gap threshold: < 0 instead of < -1.
w = W.partitionBy("id").orderBy("start_date")
max_end = F.when(~F.expr("any(end_date is null)").over(w), F.max("end_date").over(w))

# "< 0" instead of "< -1": a range starting the day after the previous one ends
# now opens a new group; only ranges sharing at least one day are merged.
contiguous = F.when(F.datediff(F.lag(max_end).over(w), "start_date") < 0, 1).otherwise(0)

df = (df
      .withColumn("contiguous_grp", F.sum(contiguous).over(w))
      .groupBy("id", "contiguous_grp")
      .agg(
          F.first("start_date").alias("start_date"),
          F.when(~F.expr("any(end_date is null)"), F.max("end_date")).alias("end_date"))
      .drop("contiguous_grp")
)
Result:
+-------------------+----------+----------+
| id|start_date| end_date|
+-------------------+----------+----------+
| common_end|2022-07-01|2022-07-20|
| common_start|2022-06-01|2022-06-20|
| consecutive|2022-02-01|2022-02-10|
| consecutive|2022-02-11|2022-02-20|
| inside|2022-05-01|2022-05-20|
| n: common_end|2022-07-01| null|
| n: common_start| null|2022-06-20|
| n: consecutive| null|2022-02-10|
| n: consecutive|2022-02-11| null|
| n: inside| null| null|
| n: overlap by 1| null| null|
| n: overlap by 2| null| null|
|n: overlap grp of 3| null| null|
| n: separate| null|2022-01-09|
| n: separate|2022-01-11| null|
| overlap by 1|2022-03-01|2022-03-20|
| overlap by 2|2022-04-01|2022-04-20|
| overlap grp of 3|2022-08-01|2022-08-20|
| separate|2022-01-01|2022-01-09|
| separate|2022-01-11|2022-01-20|
+-------------------+----------+----------+
Option 3 – when null dates don’t exist and both "touching" (consecutive) and overlapping date ranges need to be combined
w = W.partitionBy("id").orderBy("start_date")
max_end = F.max("end_date").over(w)  # same running max as Option 1, no null guard needed
contiguous = F.when(F.datediff(F.lag(max_end).over(w), "start_date") < -1, 1).otherwise(0)

df = (df
      .withColumn("contiguous_grp", F.sum(contiguous).over(w))
      .groupBy("id", "contiguous_grp")
      .agg(F.first("start_date").alias("start_date"), F.max("end_date").alias("end_date"))
      .drop("contiguous_grp")
)
Test dataframe:
from pyspark.sql import functions as F, Window as W
df = spark.createDataFrame(
    [("separate", "2022-01-01", "2022-01-09"),
     ("separate", "2022-01-11", "2022-01-20"),
     ("consecutive", "2022-02-01", "2022-02-10"),
     ("consecutive", "2022-02-11", "2022-02-20"),
     ("overlap by 1", "2022-03-01", "2022-03-11"),
     ("overlap by 1", "2022-03-11", "2022-03-20"),
     ("overlap by 2", "2022-04-01", "2022-04-12"),
     ("overlap by 2", "2022-04-11", "2022-04-20"),
     ("inside", "2022-05-01", "2022-05-20"),
     ("inside", "2022-05-02", "2022-05-19"),
     ("common_start", "2022-06-01", "2022-06-20"),
     ("common_start", "2022-06-01", "2022-06-19"),
     ("common_end", "2022-07-01", "2022-07-20"),
     ("common_end", "2022-07-02", "2022-07-20"),
     ("overlap grp of 3", "2022-08-01", "2022-08-19"),
     ("overlap grp of 3", "2022-08-08", "2022-08-12"),
     ("overlap grp of 3", "2022-08-15", "2022-08-20")],
    ["id", "start_date", "end_date"])
Result:
+----------------+----------+----------+
| id|start_date| end_date|
+----------------+----------+----------+
| common_end|2022-07-01|2022-07-20|
| common_start|2022-06-01|2022-06-20|
| consecutive|2022-02-01|2022-02-20|
| inside|2022-05-01|2022-05-20|
| overlap by 1|2022-03-01|2022-03-20|
| overlap by 2|2022-04-01|2022-04-20|
|overlap grp of 3|2022-08-01|2022-08-20|
| separate|2022-01-01|2022-01-09|
| separate|2022-01-11|2022-01-20|
+----------------+----------+----------+
Option 4 – when null dates don’t exist and only overlapping date ranges need to be combined ("touching"/consecutive ranges don’t need to be combined)
w = W.partitionBy("id").orderBy("start_date")
max_end = F.max("end_date").over(w)
# As in Option 2, "< 0" keeps "touching" ranges separate and merges only real overlaps.
contiguous = F.when(F.datediff(F.lag(max_end).over(w), "start_date") < 0, 1).otherwise(0)

df = (df
      .withColumn("contiguous_grp", F.sum(contiguous).over(w))
      .groupBy("id", "contiguous_grp")
      .agg(F.first("start_date").alias("start_date"), F.max("end_date").alias("end_date"))
      .drop("contiguous_grp")
)
Result:
+----------------+----------+----------+
| id|start_date| end_date|
+----------------+----------+----------+
| common_end|2022-07-01|2022-07-20|
| common_start|2022-06-01|2022-06-20|
| consecutive|2022-02-01|2022-02-10|
| consecutive|2022-02-11|2022-02-20|
| inside|2022-05-01|2022-05-20|
| overlap by 1|2022-03-01|2022-03-20|
| overlap by 2|2022-04-01|2022-04-20|
|overlap grp of 3|2022-08-01|2022-08-20|
| separate|2022-01-01|2022-01-09|
| separate|2022-01-11|2022-01-20|
+----------------+----------+----------+
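One final note from me (not part of the answers above): the test dataframes store the dates as strings, so datediff and max rely on Spark's implicit casts and on ISO-formatted strings sorting like dates. If your real columns are strings too, casting them explicitly keeps the date semantics unambiguous:

# Optional: cast string columns to real dates so datediff and max operate on
# date values instead of implicit casts.
df = (df
      .withColumn("start_date", F.to_date("start_date"))
      .withColumn("end_date", F.to_date("end_date")))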