merge chronological elements
Question:
I have a set of items that consists of the start and stop dates, as the following:
ID | started | stop |
---|---|---|
1 | 2019-01-14 | 2018-02-05 |
2 | 2019-01-14 | 2019-03-06 |
3 | 2019-03-07 | 2019-03-20-> |
4 | Some-Date | NULL |
5 | 2020-09-08 | 2020-09-14 |
6 | 2020-09-15 | 2020-10-14 |
7 | ->2019-03-21 | 2019-03-30 |
I would like to merge those items that directly follow each other chronologically, i.e. where the next item starts one day after the current one stops: nxtElem.started = elem.stop + 1
The result should look like:
ID | started | stop |
---|---|---|
1 | 2019-01-14 | 2018-02-05 |
2 | 2019-01-14 | 2019-03-30 |
3 | Some-Date | NULL |
4 | 2020-09-08 | 2020-10-14 |
I am currently checking the difference between each pair of dates and grouping them if it is one day; however, I am getting weird results:
class Records:
    """A start/stop date pair plus bookkeeping fields used while grouping.

    Attributes:
        start_dt: first day of the interval.
        stop_dt: last day of the interval.
        groupNum: group label assigned during grouping; ``None`` until set.
        dayDiff: day difference to a neighbouring record; ``None`` until set.
    """

    def __init__(self, start_dt, stop_dt):
        # Grouping metadata is unknown at construction time.
        self.groupNum = None
        self.dayDiff = None
        self.start_dt = start_dt
        self.stop_dt = stop_dt

    def __repr__(self):
        # Without this, print() shows only the object's address.
        return 'Records(start_dt={}, stop_dt={}, groupNum={!r}, dayDiff={!r})'.format(
            self.start_dt, self.stop_dt, self.groupNum, self.dayDiff)

    def setGroupNum(self, groupNum):
        """Store the group label for this record."""
        self.groupNum = groupNum

    def setdayDiff(self, dayDiff):
        """Store the day difference for this record."""
        self.dayDiff = dayDiff
def main():
    """Merge date ranges that directly follow each other and print the result.

    Records are sorted by start date. A record is chained onto the previous
    one when it starts exactly one day after the previous one stops; the
    chain is collapsed into a single record spanning from the first start to
    the last stop. Overlapping or nested ranges are not merged.

    Returns:
        The list of merged ``Records`` (also printed, one per line).
    """
    # Local import so the function does not depend on a module-level import.
    import datetime

    recordsLst = [
        Records(datetime.date(2017, 8, 14), datetime.date(2018, 3, 5)),
        Records(datetime.date(2019, 1, 14), datetime.date(2019, 3, 6)),
        Records(datetime.date(2019, 3, 7), datetime.date(2019, 3, 20)),
        Records(datetime.date(2023, 12, 30), datetime.date(9999, 12, 31)),
        Records(datetime.date(2020, 9, 8), datetime.date(2020, 9, 14)),
        Records(datetime.date(2020, 9, 15), datetime.date(2020, 10, 14)),
        Records(datetime.date(2019, 3, 21), datetime.date(2019, 3, 30)),
    ]
    recordsLst.sort(key=lambda x: x.start_dt)

    one_day = datetime.timedelta(days=1)
    resultLst = []
    prev = None
    for rec in recordsLst:
        # Compare whole dates, not day/month/year fields, so chains across a
        # month or year boundary are found. Subtracting (rather than adding a
        # day to the stop) avoids an OverflowError for a stop of 9999-12-31.
        if prev is not None and rec.start_dt - prev.stop_dt == one_day:
            prev.setdayDiff(-1)  # prev.stop is one day before rec.start
            resultLst[-1].stop_dt = rec.stop_dt
        else:
            resultLst.append(Records(rec.start_dt, rec.stop_dt))
        # Every input record remembers which merged group it ended up in.
        rec.setGroupNum(len(resultLst) - 1)
        prev = rec

    for merged in resultLst:
        print(merged.start_dt, merged.stop_dt)
    return resultLst
New pandas dataset
# Extended sample data: each row is a (start, end) pair; the "->" comments
# give the merged interval the asker expects for the group above them.
df = pd.DataFrame(
    data=[
        (datetime.date(2016, 1, 2), datetime.date(2016, 5, 5)),
        # case A->B, B->C, B->D => A->D
        (datetime.date(2010, 2, 14), datetime.date(2010, 3, 22)),
        (datetime.date(2010, 3, 23), datetime.date(2010, 4, 12)),
        (datetime.date(2010, 3, 23), datetime.date(2010, 5, 14)),
        (datetime.date(2010, 5, 15), datetime.date(2010, 6, 7)),
        # -> 2010-02-14 | 2010-10-20
        # case A->B, A->C, B->D => A->D
        (datetime.date(2011, 1, 1), datetime.date(2011, 2, 2)),
        (datetime.date(2011, 1, 1), datetime.date(2011, 3, 4)),
        (datetime.date(2011, 2, 3), datetime.date(2011, 4, 4)),
        # -> 2011-01-01 | 2011-04-04
        # case A->C, B->C, C->D => A->D
        (datetime.date(2012, 5, 5), datetime.date(2012, 6, 6)),
        (datetime.date(2012, 5, 7), datetime.date(2012, 6, 6)),
        (datetime.date(2012, 6, 7), datetime.date(2012, 12, 12)),
        # -> 2012-05-05 | 2012-12-12
        (datetime.date(2010, 6, 8), datetime.date(2010, 10, 20)),
        (datetime.date(2016, 5, 6), datetime.date(2016, 10, 10)),
        (datetime.date(2011, 1, 1), datetime.date(9999, 12, 31)),
    ],
    columns=['start', 'end'],
)
Thanks in advance.
Answers:
Do you have to use the `Records` class? If not, pandas offers a very clean implementation of what you are looking for:
import datetime
import pandas as pd
import numpy as np
# Sample intervals from the question; each row is [start, end].
df = pd.DataFrame([[datetime.date(2017, 8, 14), datetime.date(2018, 3, 5)],
                   [datetime.date(2019, 1, 14), datetime.date(2019, 3, 6)],
                   [datetime.date(2019, 3, 7), datetime.date(2019, 3, 20)],
                   [datetime.date(2023, 12, 30), datetime.date(9999, 12, 31)],
                   [datetime.date(2020, 9, 8), datetime.date(2020, 9, 14)],
                   [datetime.date(2020, 9, 15), datetime.date(2020, 10, 14)],
                   [datetime.date(2019, 3, 21), datetime.date(2019, 3, 30)]],
                  columns=['start', 'end'])

# Sort by start so that chained intervals end up on adjacent rows.
df = df.sort_values('start').reset_index(drop=True)

# True for rows that start exactly one day after the previous row ends,
# i.e. rows that merely continue the row above them.
mask = df['start'] - pd.to_timedelta('1 day') == df['end'].shift(1)

# A row followed by a continuation gives up its own end date ...
# (fill_value keeps the shifted mask boolean; shift().fillna(False) goes
# through an object-dtype mask and triggers a downcasting FutureWarning.)
df.loc[mask.shift(-1, fill_value=False), 'end'] = np.nan
# ... and inherits the end date of the last row in its chain.
df['end'] = df['end'].bfill()

# Only the first row of every chain is kept.
df = df[~mask]
print(df)
And even if you have to use your class, you could just create it after you have done the data handling in pandas by running:
# Build one Records object per row. A comprehension yields [] for an empty
# frame, whereas df.apply(..., axis=1) then returns a DataFrame, which has no
# .tolist(); it also avoids creating a Series for every row.
resultLst = [Records(start, end) for start, end in zip(df['start'], df['end'])]
EDIT:
Unfortunately, it is not really easy to understand what your underlying rules are, but the following works out almost the same way as what you say:
# Rows sharing an end date are collapsed into one row; min() keeps the
# smallest value of the remaining columns (the earliest start).
df = df.groupby('end').min().reset_index() # If two end dates are identical, we keep the first?
df = df.sort_values('start').reset_index(drop=True)
# Join key: a row continues another row when its start minus one day equals
# that row's end.
df['start_reduced'] = df['start'] - pd.to_timedelta('1 day')
# Remember each row's position so duplicates created by the merges below can
# be traced back to their source row.
df['idx_orig'] = df.index
# Names the self-merge gives to the right-hand copies of the columns.
cols_to_drop = [x+'_y' for x in df.columns]
first_iter = True
seed_start_idx = []
# Each pass extends every row's end by one hop along its chain; stop once no
# row has a successor left. `mask` is first assigned inside the loop, which is
# why `first_iter` is tested before it.
while first_iter or mask.any():
    # Attach to every row the row(s) that start the day after it ends.
    df = df.merge(df, how='left', left_on='end', right_on='start_reduced', suffixes=('', '_y'))
    mask = ~df['end_y'].isna()
    df.loc[mask, 'end'] = df.loc[mask, 'end_y'].values
    if first_iter:
        # Chain heads: rows whose start is not the start of any successor.
        seed_start_idx = df.loc[~df['start'].isin(df.loc[mask, 'start_y']), 'idx_orig'].tolist()
    # Remove the right-hand columns so the next merge starts from a clean frame.
    df = df.drop(columns=cols_to_drop)
    first_iter = False
# Keep only chain heads, one row per original row, and drop the helper columns.
df = df[df['idx_orig'].isin(seed_start_idx)].drop_duplicates(subset='idx_orig', keep='last').drop(columns=['start_reduced', 'idx_orig'])
The only difference is that it is not possible to distinguish which of the ones starting 2011-01-01 should be kept. You state that the one ending 2011-03-04 should not be kept, but the one ending 9999-12-31 should seemingly be kept. I cannot understand the logic behind that differentiation. The rest works though.
I have a set of items that consists of the start and stop dates, as the following:
ID | started | stop |
---|---|---|
1 | 2019-01-14 | 2018-02-05 |
2 | 2019-01-14 | 2019-03-06 |
3 | 2019-03-07 | 2019-03-20-> |
4 | Some-Date | NULL |
5 | 2020-09-08 | 2020-09-14 |
6 | 2020-09-15 | 2020-10-14 |
7 | ->2019-03-21 | 2019-03-30 |
I would like to merge those items that directly follow each other chronologically, i.e. where the next item starts one day after the current one stops: nxtElem.started = elem.stop + 1
The result should look like:
ID | started | stop |
---|---|---|
1 | 2019-01-14 | 2018-02-05 |
2 | 2019-01-14 | 2019-03-30 |
3 | Some-Date | NULL |
4 | 2020-09-08 | 2020-10-14 |
I am currently checking the difference between each pair of dates and grouping them if it is one day; however, I am getting weird results:
class Records:
    """A start/stop date pair plus bookkeeping fields used while grouping.

    Attributes:
        start_dt: first day of the interval.
        stop_dt: last day of the interval.
        groupNum: group label assigned during grouping; ``None`` until set.
        dayDiff: day difference to a neighbouring record; ``None`` until set.
    """

    def __init__(self, start_dt, stop_dt):
        # Grouping metadata is unknown at construction time.
        self.groupNum = None
        self.dayDiff = None
        self.start_dt = start_dt
        self.stop_dt = stop_dt

    def __repr__(self):
        # Without this, print() shows only the object's address.
        return 'Records(start_dt={}, stop_dt={}, groupNum={!r}, dayDiff={!r})'.format(
            self.start_dt, self.stop_dt, self.groupNum, self.dayDiff)

    def setGroupNum(self, groupNum):
        """Store the group label for this record."""
        self.groupNum = groupNum

    def setdayDiff(self, dayDiff):
        """Store the day difference for this record."""
        self.dayDiff = dayDiff
def main():
    """Merge date ranges that directly follow each other and print the result.

    Records are sorted by start date. A record is chained onto the previous
    one when it starts exactly one day after the previous one stops; the
    chain is collapsed into a single record spanning from the first start to
    the last stop. Overlapping or nested ranges are not merged.

    Returns:
        The list of merged ``Records`` (also printed, one per line).
    """
    # Local import so the function does not depend on a module-level import.
    import datetime

    recordsLst = [
        Records(datetime.date(2017, 8, 14), datetime.date(2018, 3, 5)),
        Records(datetime.date(2019, 1, 14), datetime.date(2019, 3, 6)),
        Records(datetime.date(2019, 3, 7), datetime.date(2019, 3, 20)),
        Records(datetime.date(2023, 12, 30), datetime.date(9999, 12, 31)),
        Records(datetime.date(2020, 9, 8), datetime.date(2020, 9, 14)),
        Records(datetime.date(2020, 9, 15), datetime.date(2020, 10, 14)),
        Records(datetime.date(2019, 3, 21), datetime.date(2019, 3, 30)),
    ]
    recordsLst.sort(key=lambda x: x.start_dt)

    one_day = datetime.timedelta(days=1)
    resultLst = []
    prev = None
    for rec in recordsLst:
        # Compare whole dates, not day/month/year fields, so chains across a
        # month or year boundary are found. Subtracting (rather than adding a
        # day to the stop) avoids an OverflowError for a stop of 9999-12-31.
        if prev is not None and rec.start_dt - prev.stop_dt == one_day:
            prev.setdayDiff(-1)  # prev.stop is one day before rec.start
            resultLst[-1].stop_dt = rec.stop_dt
        else:
            resultLst.append(Records(rec.start_dt, rec.stop_dt))
        # Every input record remembers which merged group it ended up in.
        rec.setGroupNum(len(resultLst) - 1)
        prev = rec

    for merged in resultLst:
        print(merged.start_dt, merged.stop_dt)
    return resultLst
New pandas dataset
# Extended sample data: each row is a (start, end) pair; the "->" comments
# give the merged interval the asker expects for the group above them.
df = pd.DataFrame(
    data=[
        (datetime.date(2016, 1, 2), datetime.date(2016, 5, 5)),
        # case A->B, B->C, B->D => A->D
        (datetime.date(2010, 2, 14), datetime.date(2010, 3, 22)),
        (datetime.date(2010, 3, 23), datetime.date(2010, 4, 12)),
        (datetime.date(2010, 3, 23), datetime.date(2010, 5, 14)),
        (datetime.date(2010, 5, 15), datetime.date(2010, 6, 7)),
        # -> 2010-02-14 | 2010-10-20
        # case A->B, A->C, B->D => A->D
        (datetime.date(2011, 1, 1), datetime.date(2011, 2, 2)),
        (datetime.date(2011, 1, 1), datetime.date(2011, 3, 4)),
        (datetime.date(2011, 2, 3), datetime.date(2011, 4, 4)),
        # -> 2011-01-01 | 2011-04-04
        # case A->C, B->C, C->D => A->D
        (datetime.date(2012, 5, 5), datetime.date(2012, 6, 6)),
        (datetime.date(2012, 5, 7), datetime.date(2012, 6, 6)),
        (datetime.date(2012, 6, 7), datetime.date(2012, 12, 12)),
        # -> 2012-05-05 | 2012-12-12
        (datetime.date(2010, 6, 8), datetime.date(2010, 10, 20)),
        (datetime.date(2016, 5, 6), datetime.date(2016, 10, 10)),
        (datetime.date(2011, 1, 1), datetime.date(9999, 12, 31)),
    ],
    columns=['start', 'end'],
)
Thanks in advance.
Do you have to use the `Records` class? If not, pandas offers a very clean implementation of what you are looking for:
import datetime
import pandas as pd
import numpy as np
# Sample intervals from the question; each row is [start, end].
df = pd.DataFrame([[datetime.date(2017, 8, 14), datetime.date(2018, 3, 5)],
                   [datetime.date(2019, 1, 14), datetime.date(2019, 3, 6)],
                   [datetime.date(2019, 3, 7), datetime.date(2019, 3, 20)],
                   [datetime.date(2023, 12, 30), datetime.date(9999, 12, 31)],
                   [datetime.date(2020, 9, 8), datetime.date(2020, 9, 14)],
                   [datetime.date(2020, 9, 15), datetime.date(2020, 10, 14)],
                   [datetime.date(2019, 3, 21), datetime.date(2019, 3, 30)]],
                  columns=['start', 'end'])

# Sort by start so that chained intervals end up on adjacent rows.
df = df.sort_values('start').reset_index(drop=True)

# True for rows that start exactly one day after the previous row ends,
# i.e. rows that merely continue the row above them.
mask = df['start'] - pd.to_timedelta('1 day') == df['end'].shift(1)

# A row followed by a continuation gives up its own end date ...
# (fill_value keeps the shifted mask boolean; shift().fillna(False) goes
# through an object-dtype mask and triggers a downcasting FutureWarning.)
df.loc[mask.shift(-1, fill_value=False), 'end'] = np.nan
# ... and inherits the end date of the last row in its chain.
df['end'] = df['end'].bfill()

# Only the first row of every chain is kept.
df = df[~mask]
print(df)
And even if you have to use your class, you could just create it after you have done the data handling in pandas by running:
# Build one Records object per row. A comprehension yields [] for an empty
# frame, whereas df.apply(..., axis=1) then returns a DataFrame, which has no
# .tolist(); it also avoids creating a Series for every row.
resultLst = [Records(start, end) for start, end in zip(df['start'], df['end'])]
EDIT:
Unfortunately, it is not really easy to understand what your underlying rules are, but the following works out almost the same way as what you say:
# Rows sharing an end date are collapsed into one row; min() keeps the
# smallest value of the remaining columns (the earliest start).
df = df.groupby('end').min().reset_index() # If two end dates are identical, we keep the first?
df = df.sort_values('start').reset_index(drop=True)
# Join key: a row continues another row when its start minus one day equals
# that row's end.
df['start_reduced'] = df['start'] - pd.to_timedelta('1 day')
# Remember each row's position so duplicates created by the merges below can
# be traced back to their source row.
df['idx_orig'] = df.index
# Names the self-merge gives to the right-hand copies of the columns.
cols_to_drop = [x+'_y' for x in df.columns]
first_iter = True
seed_start_idx = []
# Each pass extends every row's end by one hop along its chain; stop once no
# row has a successor left. `mask` is first assigned inside the loop, which is
# why `first_iter` is tested before it.
while first_iter or mask.any():
    # Attach to every row the row(s) that start the day after it ends.
    df = df.merge(df, how='left', left_on='end', right_on='start_reduced', suffixes=('', '_y'))
    mask = ~df['end_y'].isna()
    df.loc[mask, 'end'] = df.loc[mask, 'end_y'].values
    if first_iter:
        # Chain heads: rows whose start is not the start of any successor.
        seed_start_idx = df.loc[~df['start'].isin(df.loc[mask, 'start_y']), 'idx_orig'].tolist()
    # Remove the right-hand columns so the next merge starts from a clean frame.
    df = df.drop(columns=cols_to_drop)
    first_iter = False
# Keep only chain heads, one row per original row, and drop the helper columns.
df = df[df['idx_orig'].isin(seed_start_idx)].drop_duplicates(subset='idx_orig', keep='last').drop(columns=['start_reduced', 'idx_orig'])
The only difference is that it is not possible to distinguish which of the ones starting 2011-01-01 should be kept. You state that the one ending 2011-03-04 should not be kept, but the one ending 9999-12-31 should seemingly be kept. I cannot understand the logic behind that differentiation. The rest works though.