Dataframe of start and end dates into sum of days in an array of periods

Question

I have a pandas data frame of start and end dates for contracts. I want to work out the number of in force contract days for all periods (e.g. months) covered by the contracts.

Example input:

  start_date   end_date
0 2022-01-01 2022-02-15
1 2022-02-01 2022-04-01
2 2022-03-01 2022-04-15

Resulting output:

2022-01    30
2022-02    41
2022-03    61
2022-04    14
Freq: M, dtype: int64

I have already written a working solution but it takes a fairly naive approach and I would appreciate suggestions for improved efficiency or more pandas/pythonic approaches.

Once I have figured out the minimum spanning set of periods the solution leaves behind array functions and uses a loop over rows and periods. I want to be able to apply this function to many millions of rows of a dataframe so efficiency will become important.

I looked for some array functions providing something like an overlap or a timedelta within a period but it seems start time and end time where the only useful tools available.

import pandas


def days_in_periods(df: pandas.DataFrame, inc_st: bool = True, inc_en: bool = True, period_freq='M') -> pandas.Series:
    """ Calculate the days in each period covered by any contract defined within the dataframe """
    # create period range
    periods = pandas.period_range(start=df['start_date'].min(),
                                  end=df['end_date'].max(),
                                  freq=period_freq)

    period_days = pandas.Series(data=[0] * len(periods),
                                index=periods,
                                dtype=int)

    for index, row in df.iterrows():
        st = row['start_date']
        en = row['end_date']

        print(f'contract: {st:%d/%m} - {en:%d/%m}')

        total_days: int = (en - st).days + inc_en - (1 - inc_st)

        print(f'contract days: {total_days}')

        total_days_check: int = 0

        for period in periods:
            per_st = period.start_time
            per_en = period.end_time
            print(f'tperiod: {per_st:%d/%m} - {per_en:%d/%m}', end='')

            if per_en < st or per_st > en:
                print('t0')
                continue

            days: int = (per_en - per_st).days + 1

            if per_st <= st <= per_en:
                days -= (st - per_st).days + (1 - inc_st)

            if per_st <= en <= per_en:
                days -= (per_en - en).days + (1 - inc_en)

            total_days_check += days

            print(f't{days}')

            period_days[period] += days

        print(f'total days check: {total_days_check}')
        assert total_days == total_days_check

    return period_days


# create sample DataFrame
df_ex = pandas.DataFrame({'start_date': ['2022-01-01', '2022-02-01', '2022-03-01'],
                          'end_date': ['2022-02-15', '2022-04-01', '2022-04-15']})

# convert start_date and end_date to datetime objects
df_ex['start_date'] = pandas.to_datetime(df_ex['start_date'])
df_ex['end_date'] = pandas.to_datetime(df_ex['end_date'])

days_in_periods(df_ex, inc_st=True, inc_en=True)
days_in_periods(df_ex, inc_st=True, inc_en=False)
days_in_periods(df_ex, inc_st=False, inc_en=True)
print(days_in_periods(df_ex, inc_st=False, inc_en=False))

Rewrite after sammywemmy’s suggestions below:

import operator
import pandas


def days_in_periods(df: pandas.DataFrame,
                    inc_st: bool = True,
                    inc_en: bool = True,
                    period_freq='M') -> pandas.DataFrame:
    """ Calculate the days in each period covered by any contract defined within the dataframe """
    day_range = pandas.date_range(df['start_date'].min(),
                                  df['end_date'].max(),
                                  freq='D').to_series(name='days').reset_index(drop=True)

    if inc_st:
        st_op = operator.le
    else:
        st_op = operator.lt

    if inc_en:
        en_op = operator.ge
    else:
        en_op = operator.gt

    df = df.merge(day_range, how='cross')

    df = (df.loc[st_op(df['start_date'], df['days'])
                 & en_op(df['end_date'], df['days'])]
            .resample(on='days', rule=period_freq)
            .size()
          )
    df.index = df.index.to_period(period_freq)

    return df


# create sample DataFrame
df_ex = pandas.DataFrame({'start_date': ['2022-01-01', '2022-02-01', '2022-03-01'],
                          'end_date': ['2022-02-15', '2022-04-01', '2022-04-15']})

# convert start_date and end_date to datetime objects
df_ex['start_date'] = pandas.to_datetime(df_ex['start_date'])
df_ex['end_date'] = pandas.to_datetime(df_ex['end_date'])

print(days_in_periods(df_ex, inc_st=False, inc_en=False))

Asked By: George

||

Source

Answer 1

Looks like some form of inequality join – if that is the case, you can use conditional_join from pyjanitor to get your results, before grouping – should be faster than having to use iterrows:

# pip install pyjanitor
import pandas as pd
import janitor

# build a Pandas series of dates: 
minimum = df_ex.to_numpy().min(axis=None)
maximum = df_ex.to_numpy().max(axis=None)
ser = pd.date_range(minimum, maximum, freq='D', name='dates').to_series()
ser.index = range(len(ser))


(df_ex
.conditional_join(
    ser, 
    # column from left, column from right, comparator
    ('start_date', 'dates', '<'), 
    ('end_date', 'dates', '>'),
    # depending on the data size,
    # you might get more performance with numba
    use_numba = False,
)
.loc(axis=1)[['dates']]
.resample(on='dates', rule='MS')
.size()
)

dates
2022-01-01    30
2022-02-01    41
2022-03-01    61
2022-04-01    14
Freq: MS, dtype: int64

Answered By: sammywemmy

Dataframe of start and end dates into sum of days in an array of periods

Question:

Answers: