How to efficiently join two dataframes if they have overlapping intervals?
Question:
I have two dataframes:
df_A = pd.DataFrame({'start_date':['2022-07-01 00:02:41','2022-07-01 00:07:41','2022-07-01 00:22:41','2022-07-01 01:01:23','2022-07-01 01:01:23'],'end_date':['2022-07-01 00:02:41','2022-07-01 00:17:41','2022-07-01 00:57:42','2022-07-01 01:01:23','2022-07-01 01:03:51',], 'eventname':['e2','e3','e5','e6','e7',] })
df_B = pd.DataFrame({'start_date':['2022-07-01 00:00:41','2022-07-01 00:06:41','2022-07-01 00:17:56','2022-07-01 01:03:40',],'end_date':['2022-07-01 00:01:41','2022-07-01 00:14:41','2022-07-01 00:19:42','2022-07-01 02:03:23',], 'eventname':['e1','e3','e4','e7',] })
#df_A
start_date end_date eventname
0 2022-07-01 00:02:41 2022-07-01 00:02:41 e2
1 2022-07-01 00:07:41 2022-07-01 00:17:41 e3
2 2022-07-01 00:22:41 2022-07-01 00:57:42 e5
3 2022-07-01 01:01:23 2022-07-01 01:01:23 e6
4 2022-07-01 01:01:23 2022-07-01 01:03:51 e7
#df_B
start_date end_date eventname
0 2022-07-01 00:00:41 2022-07-01 00:01:41 e1
1 2022-07-01 00:06:41 2022-07-01 00:14:41 e3
2 2022-07-01 00:17:56 2022-07-01 00:19:42 e4
3 2022-07-01 01:03:40 2022-07-01 02:03:23 e7
I would like to join the rows of df_B to df_A if the time interval has any overlap and my expected result is this:
df_C = pd.DataFrame({'start_date_A':['nan','2022-07-01 00:02:41','2022-07-01 00:07:41','nan','2022-07-01 00:22:41','2022-07-01 01:01:23','2022-07-01 01:01:23',],'end_date_A':['nan','2022-07-01 00:02:41', '2022-07-01 00:17:41','nan','2022-07-01 00:57:42','2022-07-01 01:01:23','2022-07-01 01:03:51',],'start_date_b':['2022-07-01 00:00:41','nan','2022-07-01 00:06:41','2022-07-01 00:17:56','nan','nan','2022-07-01 01:03:40',],'end_date_b':['2022-07-01 00:01:41', 'nan', '2022-07-01 00:14:41', '2022-07-01 00:19:42', 'nan', 'nan', '2022-07-01 02:03:23', ], 'eventname':['e2','e1','e3','e4','e5','e6','e7'] })
#df_C
I tried the solution mentioned here but it gives me only the first value of the column each time, not the value for that row.
I also tried using the IntervalIndex like this:
bins = pd.IntervalIndex.from_arrays(df_A['start_date'],
df_A['end_date'],
closed='both')
out = df_B.assign(interval=pd.cut(df_B['eventname'], bins))
.merge(df_A.assign(interval=bins), on='interval', how='left')
but I get the error that overlapping intervals are not allowed. How can I do this?
Edit: eventname also need to be the same for the join to happen. If not, both the rows have to be present separately so it will be a full join
Answers:
import pandas as pd
df_A = pd.DataFrame({'start_date':['2022-07-01 00:02:41','2022-07-01 00:07:41','2022-07-01 00:22:41','2022-07-01 01:01:23','2022-07-01 01:01:23'],'end_date':['2022-07-01 00:02:41','2022-07-01 00:17:41','2022-07-01 00:57:42','2022-07-01 01:01:23','2022-07-01 01:03:51',], 'eventname':['e2','e3','e5','e6','e7',] })
df_B = pd.DataFrame({'start_date':['2022-07-01 00:00:41','2022-07-01 00:06:41','2022-07-01 00:17:56','2022-07-01 01:03:40',],'end_date':['2022-07-01 00:01:41','2022-07-01 00:14:41','2022-07-01 00:19:42','2022-07-01 02:03:23',], 'eventname':['e1','e3','e4','e7',] })
df_A.rename(columns={'start_date': 'start_date_A', 'end_date': 'end_date_A'}, inplace=True)
df_B.rename(columns={'start_date': 'start_date_B', 'end_date': 'end_date_B'}, inplace=True)
df1 = pd.concat([df_A, df_B]) # or df_A.append(df_B)
# reset index
df1.reset_index(drop=True, inplace=True)
df1.info()
df1
Output-
start_date_A end_date_A eventname start_date_B end_date_B
0 2022-07-01 00:02:41 2022-07-01 00:02:41 e2 NaN NaN
1 2022-07-01 00:07:41 2022-07-01 00:17:41 e3 NaN NaN
2 2022-07-01 00:22:41 2022-07-01 00:57:42 e5 NaN NaN
3 2022-07-01 01:01:23 2022-07-01 01:01:23 e6 NaN NaN
4 2022-07-01 01:01:23 2022-07-01 01:03:51 e7 NaN NaN
5 NaN NaN e1 2022-07-01 00:00:41 2022-07-01 00:01:41
6 NaN NaN e3 2022-07-01 00:06:41 2022-07-01 00:14:41
7 NaN NaN e4 2022-07-01 00:17:56 2022-07-01 00:19:42
8 NaN NaN e7 2022-07-01 01:03:40 2022-07-01 02:03:23
Do full join on eventname
and then split rows where time intervals do not overlap:
df_A.start_date = pd.to_datetime(df_A.start_date)
df_A.end_date = pd.to_datetime(df_A.end_date)
df_B.start_date = pd.to_datetime(df_B.start_date)
df_B.end_date = pd.to_datetime(df_B.end_date)
df = df_A.set_index("eventname").join(df_B.set_index("eventname"), how="outer", lsuffix="_a", rsuffix="_b")
def overlap_processing(row: pd.Series):
if (
(row.start_date_a <= row.start_date_b <= row.end_date_a)
or (row.start_date_a <= row.end_date_b <= row.end_date_a)
or row.isna().any()
):
return row
else:
return pd.Series(
{
"start_date_a": [row.start_date_a, pd.NaT],
"end_date_a": [row.end_date_a, pd.NaT],
"start_date_b": [pd.NaT, row.start_date_b],
"end_date_b": [pd.NaT, row.end_date_b],
}
)
result = df.apply(overlap_processing, axis=1).explode(list(df.columns)).reset_index()
Here is a way to do it.
# first, make sure we have Timestamps
for df in [df_A, df_B]:
for k in ['start_date', 'end_date']:
df[k] = pd.to_datetime(df[k])
# next, merge on eventname, regardless of interval overlapping
zo = df_A.merge(df_B, on='eventname', suffixes=['_a', '_b'], how='outer')
# finally, transform that result to dissociate non-overlapping groups
def proc(g):
assert len(g) == 1
r = g.squeeze()
try:
a = pd.Interval(r['start_date_a'], r['end_date_a'])
b = pd.Interval(r['start_date_b'], r['end_date_b'])
if a.overlaps(b):
return g
# emit two rows instead
cols = ['start_date_a', 'end_date_a', 'eventname', 'start_date_b', 'end_date_b']
sa, ea, ev, sb, eb = r[cols]
return pd.DataFrame([
[pd.NaT, pd.NaT, ev, sb, eb],
[sa, ea, ev, pd.NaT, pd.NaT],
], columns=cols)
except ValueError:
return g
out = zo.groupby(level=0).apply(proc).reset_index(drop=True)
Test on OP’s data, adding an eventname='e8'
with non-overlapping intervals to demo what happens then:
df_A = pd.DataFrame({
'start_date': [
'2022-07-01 00:02:41', '2022-07-01 00:07:41', '2022-07-01 00:22:41',
'2022-07-01 01:01:23', '2022-07-01 01:01:23', '2000-01-01'],
'end_date': [
'2022-07-01 00:02:41', '2022-07-01 00:17:41', '2022-07-01 00:57:42',
'2022-07-01 01:01:23', '2022-07-01 01:03:51', '2000-01-02'],
'eventname': ['e2', 'e3', 'e5', 'e6', 'e7', 'e8'],
})
df_B = pd.DataFrame({
'start_date': [
'2022-07-01 00:00:41', '2022-07-01 00:06:41', '2022-07-01 00:17:56',
'2022-07-01 01:03:40', '2001-01-01'],
'end_date': [
'2022-07-01 00:01:41', '2022-07-01 00:14:41', '2022-07-01 00:19:42',
'2022-07-01 02:03:23', '2001-01-02'],
'eventname': ['e1', 'e3', 'e4', 'e7', 'e8'],
})
# code above (including pd.to_datetime)
>>> out
start_date_a end_date_a eventname start_date_b end_date_b
0 2022-07-01 00:02:41 2022-07-01 00:02:41 e2 NaT NaT
1 2022-07-01 00:07:41 2022-07-01 00:17:41 e3 2022-07-01 00:06:41 2022-07-01 00:14:41
2 2022-07-01 00:22:41 2022-07-01 00:57:42 e5 NaT NaT
3 2022-07-01 01:01:23 2022-07-01 01:01:23 e6 NaT NaT
4 2022-07-01 01:01:23 2022-07-01 01:03:51 e7 2022-07-01 01:03:40 2022-07-01 02:03:23
5 NaT NaT e8 2001-01-01 00:00:00 2001-01-02 00:00:00
6 2000-01-01 00:00:00 2000-01-02 00:00:00 e8 NaT NaT
7 NaT NaT e1 2022-07-01 00:00:41 2022-07-01 00:01:41
8 NaT NaT e4 2022-07-01 00:17:56 2022-07-01 00:19:42
Another way to dissociate rows with non-overlapping intervals
We can also use some numpy tricks to quickly repeat rows that should be dissociated and then selectively null out a
or b
dates in those rows:
# starting from zo as above:
zo = df_A.merge(df_B, on='eventname', suffixes=['_a', '_b'], how='outer')
acols = ['start_date_a', 'end_date_a']
bcols = ['start_date_b', 'end_date_b']
mask = ~zo[acols + bcols].isna().any(axis=1).values
ov = zo.loc[mask].apply(
lambda r: pd.Interval(*r[acols]).overlaps(pd.Interval(*r[bcols])),
axis=1).values
mask[mask] &= ~ov
Now, mask
contains True
only where zo
rows should be dissociated:
>>> zo.loc[mask]
start_date_a end_date_a eventname start_date_b end_date_b
5 2000-01-01 2000-01-02 e8 2001-01-01 2001-01-02
Now we use some numpy
functions to achieve duplication of the rows to dissociate, and anull
and bnull
arrays to indicate where to blank out the result:
sel = np.repeat(np.arange(len(zo)), mask + 1)
anull = np.r_[np.diff(sel), 1] == 0
bnull = np.r_[1, np.diff(sel)] == 0
>>> sel
array([0, 1, 2, 3, 4, 5, 5, 6, 7])
>>> anull
array([False, False, False, False, False, True, False, False, False])
>>> bnull
array([False, False, False, False, False, False, True, False, False])
out = zo.iloc[sel].reset_index(drop=True)
out.loc[anull, acols] = pd.NaT
out.loc[bnull, bcols] = pd.NaT
>>> out
# same as above
I have two dataframes:
df_A = pd.DataFrame({'start_date':['2022-07-01 00:02:41','2022-07-01 00:07:41','2022-07-01 00:22:41','2022-07-01 01:01:23','2022-07-01 01:01:23'],'end_date':['2022-07-01 00:02:41','2022-07-01 00:17:41','2022-07-01 00:57:42','2022-07-01 01:01:23','2022-07-01 01:03:51',], 'eventname':['e2','e3','e5','e6','e7',] })
df_B = pd.DataFrame({'start_date':['2022-07-01 00:00:41','2022-07-01 00:06:41','2022-07-01 00:17:56','2022-07-01 01:03:40',],'end_date':['2022-07-01 00:01:41','2022-07-01 00:14:41','2022-07-01 00:19:42','2022-07-01 02:03:23',], 'eventname':['e1','e3','e4','e7',] })
#df_A
start_date end_date eventname
0 2022-07-01 00:02:41 2022-07-01 00:02:41 e2
1 2022-07-01 00:07:41 2022-07-01 00:17:41 e3
2 2022-07-01 00:22:41 2022-07-01 00:57:42 e5
3 2022-07-01 01:01:23 2022-07-01 01:01:23 e6
4 2022-07-01 01:01:23 2022-07-01 01:03:51 e7
#df_B
start_date end_date eventname
0 2022-07-01 00:00:41 2022-07-01 00:01:41 e1
1 2022-07-01 00:06:41 2022-07-01 00:14:41 e3
2 2022-07-01 00:17:56 2022-07-01 00:19:42 e4
3 2022-07-01 01:03:40 2022-07-01 02:03:23 e7
I would like to join the rows of df_B to df_A if the time interval has any overlap and my expected result is this:
df_C = pd.DataFrame({'start_date_A':['nan','2022-07-01 00:02:41','2022-07-01 00:07:41','nan','2022-07-01 00:22:41','2022-07-01 01:01:23','2022-07-01 01:01:23',],'end_date_A':['nan','2022-07-01 00:02:41', '2022-07-01 00:17:41','nan','2022-07-01 00:57:42','2022-07-01 01:01:23','2022-07-01 01:03:51',],'start_date_b':['2022-07-01 00:00:41','nan','2022-07-01 00:06:41','2022-07-01 00:17:56','nan','nan','2022-07-01 01:03:40',],'end_date_b':['2022-07-01 00:01:41', 'nan', '2022-07-01 00:14:41', '2022-07-01 00:19:42', 'nan', 'nan', '2022-07-01 02:03:23', ], 'eventname':['e2','e1','e3','e4','e5','e6','e7'] })
#df_C
I tried the solution mentioned here but it gives me only the first value of the column each time, not the value for that row.
I also tried using the IntervalIndex like this:
bins = pd.IntervalIndex.from_arrays(df_A['start_date'],
df_A['end_date'],
closed='both')
out = df_B.assign(interval=pd.cut(df_B['eventname'], bins))
.merge(df_A.assign(interval=bins), on='interval', how='left')
but I get the error that overlapping intervals are not allowed. How can I do this?
Edit: eventname also need to be the same for the join to happen. If not, both the rows have to be present separately so it will be a full join
import pandas as pd
df_A = pd.DataFrame({'start_date':['2022-07-01 00:02:41','2022-07-01 00:07:41','2022-07-01 00:22:41','2022-07-01 01:01:23','2022-07-01 01:01:23'],'end_date':['2022-07-01 00:02:41','2022-07-01 00:17:41','2022-07-01 00:57:42','2022-07-01 01:01:23','2022-07-01 01:03:51',], 'eventname':['e2','e3','e5','e6','e7',] })
df_B = pd.DataFrame({'start_date':['2022-07-01 00:00:41','2022-07-01 00:06:41','2022-07-01 00:17:56','2022-07-01 01:03:40',],'end_date':['2022-07-01 00:01:41','2022-07-01 00:14:41','2022-07-01 00:19:42','2022-07-01 02:03:23',], 'eventname':['e1','e3','e4','e7',] })
df_A.rename(columns={'start_date': 'start_date_A', 'end_date': 'end_date_A'}, inplace=True)
df_B.rename(columns={'start_date': 'start_date_B', 'end_date': 'end_date_B'}, inplace=True)
df1 = pd.concat([df_A, df_B]) # or df_A.append(df_B)
# reset index
df1.reset_index(drop=True, inplace=True)
df1.info()
df1
Output-
start_date_A end_date_A eventname start_date_B end_date_B
0 2022-07-01 00:02:41 2022-07-01 00:02:41 e2 NaN NaN
1 2022-07-01 00:07:41 2022-07-01 00:17:41 e3 NaN NaN
2 2022-07-01 00:22:41 2022-07-01 00:57:42 e5 NaN NaN
3 2022-07-01 01:01:23 2022-07-01 01:01:23 e6 NaN NaN
4 2022-07-01 01:01:23 2022-07-01 01:03:51 e7 NaN NaN
5 NaN NaN e1 2022-07-01 00:00:41 2022-07-01 00:01:41
6 NaN NaN e3 2022-07-01 00:06:41 2022-07-01 00:14:41
7 NaN NaN e4 2022-07-01 00:17:56 2022-07-01 00:19:42
8 NaN NaN e7 2022-07-01 01:03:40 2022-07-01 02:03:23
Do full join on eventname
and then split rows where time intervals do not overlap:
df_A.start_date = pd.to_datetime(df_A.start_date)
df_A.end_date = pd.to_datetime(df_A.end_date)
df_B.start_date = pd.to_datetime(df_B.start_date)
df_B.end_date = pd.to_datetime(df_B.end_date)
df = df_A.set_index("eventname").join(df_B.set_index("eventname"), how="outer", lsuffix="_a", rsuffix="_b")
def overlap_processing(row: pd.Series):
if (
(row.start_date_a <= row.start_date_b <= row.end_date_a)
or (row.start_date_a <= row.end_date_b <= row.end_date_a)
or row.isna().any()
):
return row
else:
return pd.Series(
{
"start_date_a": [row.start_date_a, pd.NaT],
"end_date_a": [row.end_date_a, pd.NaT],
"start_date_b": [pd.NaT, row.start_date_b],
"end_date_b": [pd.NaT, row.end_date_b],
}
)
result = df.apply(overlap_processing, axis=1).explode(list(df.columns)).reset_index()
Here is a way to do it.
# first, make sure we have Timestamps
for df in [df_A, df_B]:
for k in ['start_date', 'end_date']:
df[k] = pd.to_datetime(df[k])
# next, merge on eventname, regardless of interval overlapping
zo = df_A.merge(df_B, on='eventname', suffixes=['_a', '_b'], how='outer')
# finally, transform that result to dissociate non-overlapping groups
def proc(g):
assert len(g) == 1
r = g.squeeze()
try:
a = pd.Interval(r['start_date_a'], r['end_date_a'])
b = pd.Interval(r['start_date_b'], r['end_date_b'])
if a.overlaps(b):
return g
# emit two rows instead
cols = ['start_date_a', 'end_date_a', 'eventname', 'start_date_b', 'end_date_b']
sa, ea, ev, sb, eb = r[cols]
return pd.DataFrame([
[pd.NaT, pd.NaT, ev, sb, eb],
[sa, ea, ev, pd.NaT, pd.NaT],
], columns=cols)
except ValueError:
return g
out = zo.groupby(level=0).apply(proc).reset_index(drop=True)
Test on OP’s data, adding an eventname='e8'
with non-overlapping intervals to demo what happens then:
df_A = pd.DataFrame({
'start_date': [
'2022-07-01 00:02:41', '2022-07-01 00:07:41', '2022-07-01 00:22:41',
'2022-07-01 01:01:23', '2022-07-01 01:01:23', '2000-01-01'],
'end_date': [
'2022-07-01 00:02:41', '2022-07-01 00:17:41', '2022-07-01 00:57:42',
'2022-07-01 01:01:23', '2022-07-01 01:03:51', '2000-01-02'],
'eventname': ['e2', 'e3', 'e5', 'e6', 'e7', 'e8'],
})
df_B = pd.DataFrame({
'start_date': [
'2022-07-01 00:00:41', '2022-07-01 00:06:41', '2022-07-01 00:17:56',
'2022-07-01 01:03:40', '2001-01-01'],
'end_date': [
'2022-07-01 00:01:41', '2022-07-01 00:14:41', '2022-07-01 00:19:42',
'2022-07-01 02:03:23', '2001-01-02'],
'eventname': ['e1', 'e3', 'e4', 'e7', 'e8'],
})
# code above (including pd.to_datetime)
>>> out
start_date_a end_date_a eventname start_date_b end_date_b
0 2022-07-01 00:02:41 2022-07-01 00:02:41 e2 NaT NaT
1 2022-07-01 00:07:41 2022-07-01 00:17:41 e3 2022-07-01 00:06:41 2022-07-01 00:14:41
2 2022-07-01 00:22:41 2022-07-01 00:57:42 e5 NaT NaT
3 2022-07-01 01:01:23 2022-07-01 01:01:23 e6 NaT NaT
4 2022-07-01 01:01:23 2022-07-01 01:03:51 e7 2022-07-01 01:03:40 2022-07-01 02:03:23
5 NaT NaT e8 2001-01-01 00:00:00 2001-01-02 00:00:00
6 2000-01-01 00:00:00 2000-01-02 00:00:00 e8 NaT NaT
7 NaT NaT e1 2022-07-01 00:00:41 2022-07-01 00:01:41
8 NaT NaT e4 2022-07-01 00:17:56 2022-07-01 00:19:42
Another way to dissociate rows with non-overlapping intervals
We can also use some numpy tricks to quickly repeat rows that should be dissociated and then selectively null out a
or b
dates in those rows:
# starting from zo as above:
zo = df_A.merge(df_B, on='eventname', suffixes=['_a', '_b'], how='outer')
acols = ['start_date_a', 'end_date_a']
bcols = ['start_date_b', 'end_date_b']
mask = ~zo[acols + bcols].isna().any(axis=1).values
ov = zo.loc[mask].apply(
lambda r: pd.Interval(*r[acols]).overlaps(pd.Interval(*r[bcols])),
axis=1).values
mask[mask] &= ~ov
Now, mask
contains True
only where zo
rows should be dissociated:
>>> zo.loc[mask]
start_date_a end_date_a eventname start_date_b end_date_b
5 2000-01-01 2000-01-02 e8 2001-01-01 2001-01-02
Now we use some numpy
functions to achieve duplication of the rows to dissociate, and anull
and bnull
arrays to indicate where to blank out the result:
sel = np.repeat(np.arange(len(zo)), mask + 1)
anull = np.r_[np.diff(sel), 1] == 0
bnull = np.r_[1, np.diff(sel)] == 0
>>> sel
array([0, 1, 2, 3, 4, 5, 5, 6, 7])
>>> anull
array([False, False, False, False, False, True, False, False, False])
>>> bnull
array([False, False, False, False, False, False, True, False, False])
out = zo.iloc[sel].reset_index(drop=True)
out.loc[anull, acols] = pd.NaT
out.loc[bnull, bcols] = pd.NaT
>>> out
# same as above