Dataframe of start and end dates into sum of days in an array of periods
Question:
I have a pandas data frame of start and end dates for contracts. I want to work out the number of in force contract days for all periods (e.g. months) covered by the contracts.
Example input:
start_date end_date
0 2022-01-01 2022-02-15
1 2022-02-01 2022-04-01
2 2022-03-01 2022-04-15
Resulting output:
2022-01 30
2022-02 41
2022-03 61
2022-04 14
Freq: M, dtype: int64
I have already written a working solution but it takes a fairly naive approach and I would appreciate suggestions for improved efficiency or more pandas/pythonic approaches.
Once I have figured out the minimum spanning set of periods the solution leaves behind array functions and uses a loop over rows and periods. I want to be able to apply this function to many millions of rows of a dataframe so efficiency will become important.
I looked for array functions that provide something like an overlap, or a timedelta within a period, but it seems start time and end time were the only useful tools available.
import pandas
def days_in_periods(df: pandas.DataFrame,
                    inc_st: bool = True,
                    inc_en: bool = True,
                    period_freq='M') -> pandas.Series:
    """Calculate the days in each period covered by any contract in the dataframe.

    Every contract (row) is compared against every period between the
    earliest ``start_date`` and the latest ``end_date``; the days of overlap
    are accumulated per period.  Progress is printed for each contract and
    period, and the per-period days of each contract are cross-checked
    against the contract's total length with an ``assert``.

    Args:
        df: frame with datetime columns ``start_date`` and ``end_date``.
        inc_st: count the start date itself as an in-force day.
        inc_en: count the end date itself as an in-force day.
        period_freq: pandas period alias used to build the periods.

    Returns:
        Integer Series indexed by the periods, holding the summed days.

    Raises:
        AssertionError: if a contract's per-period days do not add up to its
            total length.

    NOTE(review): the arithmetic relies on ``Timedelta.days``, which drops any
    time-of-day part, so the dates are presumably midnight-normalised.
    """
    periods = pandas.period_range(start=df['start_date'].min(),
                                  end=df['end_date'].max(),
                                  freq=period_freq)
    period_days = pandas.Series(0, index=periods, dtype=int)

    # zip over the two columns: the row index is never used, and this avoids
    # building a Series per row as iterrows() does.
    for st, en in zip(df['start_date'], df['end_date']):
        print(f'contract: {st:%d/%m} - {en:%d/%m}')
        # bools act as 0/1: drop one day for each excluded bound
        total_days = (en - st).days + inc_en - (1 - inc_st)
        # A contract starting and ending on the same date with both bounds
        # excluded covers nothing; the formulas would otherwise give -1.
        empty_contract = st == en and not inc_st and not inc_en
        if empty_contract:
            total_days = 0
        print(f'contract days: {total_days}')

        total_days_check = 0
        for period in periods:
            per_st = period.start_time
            per_en = period.end_time
            print(f'\tperiod: {per_st:%d/%m} - {per_en:%d/%m}', end='')
            if per_en < st or per_st > en:
                # contract lies entirely outside this period
                print('\t0')
                continue
            # start from the full period length (end_time is the last
            # instant of the period, hence the + 1) ...
            days = (per_en - per_st).days + 1
            # ... then trim the part before the contract start ...
            if per_st <= st <= per_en:
                days -= (st - per_st).days + (1 - inc_st)
            # ... and the part after the contract end.
            if per_st <= en <= per_en:
                days -= (per_en - en).days + (1 - inc_en)
            if empty_contract:
                days = 0
            total_days_check += days
            print(f'\t{days}')
            period_days[period] += days

        print(f'total days check: {total_days_check}')
        assert total_days == total_days_check
    return period_days
# Sample contracts, parsed into datetime columns while the frame is built.
df_ex = pandas.DataFrame({
    'start_date': pandas.to_datetime(['2022-01-01', '2022-02-01', '2022-03-01']),
    'end_date': pandas.to_datetime(['2022-02-15', '2022-04-01', '2022-04-15']),
})

# Exercise every inclusive/exclusive combination; only the fully exclusive
# result is printed.
days_in_periods(df_ex, inc_st=True, inc_en=True)
days_in_periods(df_ex, inc_st=True, inc_en=False)
days_in_periods(df_ex, inc_st=False, inc_en=True)
exclusive_days = days_in_periods(df_ex, inc_st=False, inc_en=False)
print(exclusive_days)
Rewrite after sammywemmy’s suggestions below:
import operator
import pandas
def days_in_periods(df: pandas.DataFrame,
                    inc_st: bool = True,
                    inc_en: bool = True,
                    period_freq='M') -> pandas.Series:
    """Calculate the days in each period covered by any contract in the dataframe.

    A daily grid is laid from the earliest ``start_date`` to the latest
    ``end_date``.  Each contract is located on that grid with a binary
    search, the number of contracts in force on each day is obtained from a
    running sum of "+1 at first covered day / -1 after last covered day"
    markers, and the daily counts are summed per period.  This needs memory
    proportional to rows + days rather than rows * days as a cross join would.

    Args:
        df: frame with datetime columns ``start_date`` and ``end_date``.
        inc_st: count the start date itself as an in-force day.
        inc_en: count the end date itself as an in-force day.
        period_freq: pandas period alias to aggregate by (assumed to be
            daily or coarser).

    Returns:
        int64 Series indexed by a period index named ``days``, spanning the
        first to the last period that contains an in-force day (periods in
        between with no in-force day hold 0).  Empty if nothing is in force.
        Rows with a missing start or end date are ignored.
    """
    import numpy  # local import: nothing else in this module needs it

    def _no_days() -> pandas.Series:
        # Result used when no day is covered by any contract.
        return pandas.Series(
            [],
            index=pandas.PeriodIndex([], freq=period_freq, name='days'),
            dtype='int64',
        )

    starts = pandas.DatetimeIndex(df['start_date'])
    ends = pandas.DatetimeIndex(df['end_date'])
    first_day = starts.min()
    last_day = ends.max()
    if pandas.isna(first_day) or pandas.isna(last_day):
        # no rows, or a column with no valid dates at all
        return _no_days()

    days = pandas.date_range(first_day, last_day, freq='D')
    n_days = len(days)

    # Each contract covers grid positions [lo, hi).  The search side decides
    # whether a grid day equal to the bound itself is included.
    lo = days.searchsorted(starts, side='left' if inc_st else 'right')
    hi = days.searchsorted(ends, side='right' if inc_en else 'left')
    usable = (hi > lo) & ~starts.isna() & ~ends.isna()

    # Difference array: +1 where a contract becomes in force, -1 just after
    # it stops; the running sum is the number of contracts in force per day.
    delta = (numpy.bincount(lo[usable], minlength=n_days + 1)
             - numpy.bincount(hi[usable], minlength=n_days + 1))
    in_force = delta[:n_days].cumsum().astype('int64')

    covered = numpy.flatnonzero(in_force)
    if covered.size == 0:
        return _no_days()

    # Keep only the span between the first and last covered day so that
    # leading/trailing periods without any in-force day are not reported.
    first, last = covered[0], covered[-1] + 1
    per_day = pandas.Series(in_force[first:last],
                            index=days[first:last].to_period(period_freq))
    result = per_day.groupby(level=0).sum()
    result.index.name = 'days'
    return result
# Sample contracts, parsed into datetime columns while the frame is built.
df_ex = pandas.DataFrame({
    'start_date': pandas.to_datetime(['2022-01-01', '2022-02-01', '2022-03-01']),
    'end_date': pandas.to_datetime(['2022-02-15', '2022-04-01', '2022-04-15']),
})

# Show the per-period day counts with both contract bounds excluded.
exclusive_days = days_in_periods(df_ex, inc_st=False, inc_en=False)
print(exclusive_days)
Answers:
Looks like some form of inequality join – if that is the case, you can use conditional_join from pyjanitor to get your results, before grouping – should be faster than having to use iterrows:
# pip install pyjanitor
import pandas as pd
import janitor
# Daily calendar from the earliest to the latest date found anywhere in the
# frame, as a Series named 'dates' with a plain 0..n-1 index.
minimum = df_ex.to_numpy().min()
maximum = df_ex.to_numpy().max()
ser = (
    pd.date_range(minimum, maximum, freq='D', name='dates')
    .to_series()
    .reset_index(drop=True)
)

# Pair each contract with every calendar day strictly between its start and
# end (an inequality join), then count the matched days per month start.
(
    df_ex
    .conditional_join(
        ser,
        # each condition is (left column, right column, comparator)
        ('start_date', 'dates', '<'),
        ('end_date', 'dates', '>'),
        # numba may be faster on large inputs; kept off here
        use_numba=False,
    )
    .loc[:, ['dates']]
    .resample(on='dates', rule='MS')
    .size()
)
dates
2022-01-01 30
2022-02-01 41
2022-03-01 61
2022-04-01 14
Freq: MS, dtype: int64
I have a pandas data frame of start and end dates for contracts. I want to work out the number of in force contract days for all periods (e.g. months) covered by the contracts.
Example input:
start_date end_date
0 2022-01-01 2022-02-15
1 2022-02-01 2022-04-01
2 2022-03-01 2022-04-15
Resulting output:
2022-01 30
2022-02 41
2022-03 61
2022-04 14
Freq: M, dtype: int64
I have already written a working solution but it takes a fairly naive approach and I would appreciate suggestions for improved efficiency or more pandas/pythonic approaches.
Once I have figured out the minimum spanning set of periods the solution leaves behind array functions and uses a loop over rows and periods. I want to be able to apply this function to many millions of rows of a dataframe so efficiency will become important.
I looked for array functions that provide something like an overlap, or a timedelta within a period, but it seems start time and end time were the only useful tools available.
import pandas
def days_in_periods(df: pandas.DataFrame,
                    inc_st: bool = True,
                    inc_en: bool = True,
                    period_freq='M') -> pandas.Series:
    """Calculate the days in each period covered by any contract in the dataframe.

    Every contract (row) is compared against every period between the
    earliest ``start_date`` and the latest ``end_date``; the days of overlap
    are accumulated per period.  Progress is printed for each contract and
    period, and the per-period days of each contract are cross-checked
    against the contract's total length with an ``assert``.

    Args:
        df: frame with datetime columns ``start_date`` and ``end_date``.
        inc_st: count the start date itself as an in-force day.
        inc_en: count the end date itself as an in-force day.
        period_freq: pandas period alias used to build the periods.

    Returns:
        Integer Series indexed by the periods, holding the summed days.

    Raises:
        AssertionError: if a contract's per-period days do not add up to its
            total length.

    NOTE(review): the arithmetic relies on ``Timedelta.days``, which drops any
    time-of-day part, so the dates are presumably midnight-normalised.
    """
    periods = pandas.period_range(start=df['start_date'].min(),
                                  end=df['end_date'].max(),
                                  freq=period_freq)
    period_days = pandas.Series(0, index=periods, dtype=int)

    # zip over the two columns: the row index is never used, and this avoids
    # building a Series per row as iterrows() does.
    for st, en in zip(df['start_date'], df['end_date']):
        print(f'contract: {st:%d/%m} - {en:%d/%m}')
        # bools act as 0/1: drop one day for each excluded bound
        total_days = (en - st).days + inc_en - (1 - inc_st)
        # A contract starting and ending on the same date with both bounds
        # excluded covers nothing; the formulas would otherwise give -1.
        empty_contract = st == en and not inc_st and not inc_en
        if empty_contract:
            total_days = 0
        print(f'contract days: {total_days}')

        total_days_check = 0
        for period in periods:
            per_st = period.start_time
            per_en = period.end_time
            print(f'\tperiod: {per_st:%d/%m} - {per_en:%d/%m}', end='')
            if per_en < st or per_st > en:
                # contract lies entirely outside this period
                print('\t0')
                continue
            # start from the full period length (end_time is the last
            # instant of the period, hence the + 1) ...
            days = (per_en - per_st).days + 1
            # ... then trim the part before the contract start ...
            if per_st <= st <= per_en:
                days -= (st - per_st).days + (1 - inc_st)
            # ... and the part after the contract end.
            if per_st <= en <= per_en:
                days -= (per_en - en).days + (1 - inc_en)
            if empty_contract:
                days = 0
            total_days_check += days
            print(f'\t{days}')
            period_days[period] += days

        print(f'total days check: {total_days_check}')
        assert total_days == total_days_check
    return period_days
# Sample contracts, parsed into datetime columns while the frame is built.
df_ex = pandas.DataFrame({
    'start_date': pandas.to_datetime(['2022-01-01', '2022-02-01', '2022-03-01']),
    'end_date': pandas.to_datetime(['2022-02-15', '2022-04-01', '2022-04-15']),
})

# Exercise every inclusive/exclusive combination; only the fully exclusive
# result is printed.
days_in_periods(df_ex, inc_st=True, inc_en=True)
days_in_periods(df_ex, inc_st=True, inc_en=False)
days_in_periods(df_ex, inc_st=False, inc_en=True)
exclusive_days = days_in_periods(df_ex, inc_st=False, inc_en=False)
print(exclusive_days)
Rewrite after sammywemmy’s suggestions below:
import operator
import pandas
def days_in_periods(df: pandas.DataFrame,
                    inc_st: bool = True,
                    inc_en: bool = True,
                    period_freq='M') -> pandas.Series:
    """Calculate the days in each period covered by any contract in the dataframe.

    A daily grid is laid from the earliest ``start_date`` to the latest
    ``end_date``.  Each contract is located on that grid with a binary
    search, the number of contracts in force on each day is obtained from a
    running sum of "+1 at first covered day / -1 after last covered day"
    markers, and the daily counts are summed per period.  This needs memory
    proportional to rows + days rather than rows * days as a cross join would.

    Args:
        df: frame with datetime columns ``start_date`` and ``end_date``.
        inc_st: count the start date itself as an in-force day.
        inc_en: count the end date itself as an in-force day.
        period_freq: pandas period alias to aggregate by (assumed to be
            daily or coarser).

    Returns:
        int64 Series indexed by a period index named ``days``, spanning the
        first to the last period that contains an in-force day (periods in
        between with no in-force day hold 0).  Empty if nothing is in force.
        Rows with a missing start or end date are ignored.
    """
    import numpy  # local import: nothing else in this module needs it

    def _no_days() -> pandas.Series:
        # Result used when no day is covered by any contract.
        return pandas.Series(
            [],
            index=pandas.PeriodIndex([], freq=period_freq, name='days'),
            dtype='int64',
        )

    starts = pandas.DatetimeIndex(df['start_date'])
    ends = pandas.DatetimeIndex(df['end_date'])
    first_day = starts.min()
    last_day = ends.max()
    if pandas.isna(first_day) or pandas.isna(last_day):
        # no rows, or a column with no valid dates at all
        return _no_days()

    days = pandas.date_range(first_day, last_day, freq='D')
    n_days = len(days)

    # Each contract covers grid positions [lo, hi).  The search side decides
    # whether a grid day equal to the bound itself is included.
    lo = days.searchsorted(starts, side='left' if inc_st else 'right')
    hi = days.searchsorted(ends, side='right' if inc_en else 'left')
    usable = (hi > lo) & ~starts.isna() & ~ends.isna()

    # Difference array: +1 where a contract becomes in force, -1 just after
    # it stops; the running sum is the number of contracts in force per day.
    delta = (numpy.bincount(lo[usable], minlength=n_days + 1)
             - numpy.bincount(hi[usable], minlength=n_days + 1))
    in_force = delta[:n_days].cumsum().astype('int64')

    covered = numpy.flatnonzero(in_force)
    if covered.size == 0:
        return _no_days()

    # Keep only the span between the first and last covered day so that
    # leading/trailing periods without any in-force day are not reported.
    first, last = covered[0], covered[-1] + 1
    per_day = pandas.Series(in_force[first:last],
                            index=days[first:last].to_period(period_freq))
    result = per_day.groupby(level=0).sum()
    result.index.name = 'days'
    return result
# Sample contracts, parsed into datetime columns while the frame is built.
df_ex = pandas.DataFrame({
    'start_date': pandas.to_datetime(['2022-01-01', '2022-02-01', '2022-03-01']),
    'end_date': pandas.to_datetime(['2022-02-15', '2022-04-01', '2022-04-15']),
})

# Show the per-period day counts with both contract bounds excluded.
exclusive_days = days_in_periods(df_ex, inc_st=False, inc_en=False)
print(exclusive_days)
Looks like some form of inequality join – if that is the case, you can use conditional_join from pyjanitor to get your results, before grouping – should be faster than having to use iterrows:
# pip install pyjanitor
import pandas as pd
import janitor
# Daily calendar from the earliest to the latest date found anywhere in the
# frame, as a Series named 'dates' with a plain 0..n-1 index.
minimum = df_ex.to_numpy().min()
maximum = df_ex.to_numpy().max()
ser = (
    pd.date_range(minimum, maximum, freq='D', name='dates')
    .to_series()
    .reset_index(drop=True)
)

# Pair each contract with every calendar day strictly between its start and
# end (an inequality join), then count the matched days per month start.
(
    df_ex
    .conditional_join(
        ser,
        # each condition is (left column, right column, comparator)
        ('start_date', 'dates', '<'),
        ('end_date', 'dates', '>'),
        # numba may be faster on large inputs; kept off here
        use_numba=False,
    )
    .loc[:, ['dates']]
    .resample(on='dates', rule='MS')
    .size()
)
dates
2022-01-01 30
2022-02-01 41
2022-03-01 61
2022-04-01 14
Freq: MS, dtype: int64