What is a faster method to calculate hourly totals from a pandas DataFrame than a for loop?

Question:

I have a pandas DataFrame with about 200,000 rows of raw data. Each row has start and stop times that can span anywhere from an hour to several years. I am using a for loop to calculate a total for each hour of a year; each hourly total sums the records that span that hour. I am calculating hourly totals for a full year, or about 24 * 365 = 8760 hours. Even running this for only 1,000 rows of raw data takes about 40 seconds on my laptop, so I'm looking for a much faster alternative.

I extracted the logic to create this standalone code, which runs successfully in Python 3.10.

import numpy as np
import pandas as pd
from datetime import datetime, timedelta
import random


def make_test_data(records: int = 1000) -> pd.DataFrame:
    """

    :param records: int: the number of records to create for tthe est data set.
    :return: pd.DataFrame with columns ['INTERFACE', 'CLASS', 'START_TIME',
             'STOP_TIME', 'CAPACITY']
    """

    random.seed(0)

    # Random data will be in these bounds.
    valid_interfaces = ('A', 'B', 'C', 'D', 'E', 'F')
    valid_classes = ('FIRM', 'NON-FIRM', 'SECONDARY')
    min_date = datetime(2022, 1, 1)
    max_date = datetime(2023, 1, 1)

    # Random date generator.
    def rand_date(min_date, max_date):
        days = (max_date - min_date).days
        return min_date + timedelta(days=random.randrange(days))

    records = records or 100  # Fall back if records is 0 or None.
    interfaces = random.choices(valid_interfaces, k=records)
    classes = random.choices(valid_classes, k=records)
    starts = [rand_date(min_date, max_date) for _ in range(records)]
    stops = [rand_date(start, max_date) for start in starts]
    capacities = [random.randint(-10000, 10000) for _ in range(records)]

    data = {'INTERFACE': interfaces,
            'CLASS': classes,
            'START_TIME': starts,
            'STOP_TIME': stops,
            'CAPACITY': capacities
            }
    return pd.DataFrame(data)


def calc_hourly_totals(data: pd.DataFrame) -> pd.DataFrame:
    """
    Create a dataframe with net capacity by hour, interface and class.
    :param data: pd.DataFrame with columns ['INTERFACE', 'CLASS', 'START_TIME',
                                            'STOP_TIME', 'CAPACITY']
    :return: pd.DataFrame with columns ['INTERFACE', 'CLASS', 'HOUR_BEGINNING',
                                        'CAPACITY']
    """
    min_date = data.START_TIME.min()
    max_date = data.START_TIME.max()

    n_hours = ((max_date - min_date).days * 24
               + (max_date - min_date).seconds // 3600)
    hourly_dates = [min_date + timedelta(hours=h) for h in range(n_hours)]
    # print('hourly_dates:', hourly_dates)
    result = pd.DataFrame(columns=['INTERFACE', 'CLASS',
                                   'HOUR_BEGINNING', 'CAPACITY'])
    loop = 0
    for hour_start in hourly_dates:
        hour_stop = hour_start + timedelta(hours=1)
        # Filter data. Keep only the records that overlap this hour.
        df = data[(data.START_TIME < np.datetime64(hour_stop))
                  & (data.STOP_TIME > np.datetime64(hour_start))]
        df = df.groupby(['INTERFACE', 'CLASS'])[['CAPACITY']].sum()
        df['HOUR_BEGINNING'] = hour_start
        df.reset_index(inplace=True)  # Move INTERFACE from index to a column.
        df = df[['INTERFACE', 'CLASS', 'HOUR_BEGINNING', 'CAPACITY']]

        result = pd.concat([result, df])
        # loop += 1
        # if loop % 1000 == 0:
        #     print(f'   completed {loop} of {len(hourly_dates)} loops.')

    result.rename(columns={'CAPACITY': 'NET_CAPACITY'}, inplace=True)
    result.reset_index(inplace=True)
    return result


if __name__ == '__main__':
    from time import perf_counter

    test_data = make_test_data(1_000)
    # test_data = pd.read_excel(r'C:\Personal\projects\tsd_python\data\net_tsr_data_2021_04_01-2022_04_01.xlsx',
    #                           sheet_name='Raw TSR Data')
    print('test data rows:', len(test_data))
    t_start = perf_counter()
    result = calc_hourly_totals(test_data)
    t_finish = perf_counter()
    print('computation time:', t_finish - t_start)
    print(result.head())

My results look like this:

test data rows: 1000
computation time: 38.52079169999888
   index INTERFACE      CLASS      HOUR_BEGINNING NET_CAPACITY
0      0         A       FIRM 2022-01-01 00:00:00        -8646
1      1         B  SECONDARY 2022-01-01 00:00:00        -2296
2      2         E  SECONDARY 2022-01-01 00:00:00         7927
3      0         A       FIRM 2022-01-01 01:00:00        -8646
4      1         B  SECONDARY 2022-01-01 01:00:00        -2296

Process finished with exit code 0
Asked By: cadvena


Answers:

If you are willing to trade memory for speed and you have enough of the former, the following could work:

# Calculate all possible time-points - similar to hourly_dates, but with an additional time-point at the end
hour_beginning = pd.Series(pd.date_range(test_data['START_TIME'].min(), test_data['STOP_TIME'].max(), freq='H'))

# Remove this additional (last) time-point
hour_beginning = hour_beginning.drop(hour_beginning.index[-1])
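
Since pd.date_range includes both endpoints by default, that last element has to be dropped. On pandas 1.4 or newer you could do the same in one step with the inclusive argument; a small equivalent sketch:

hour_beginning = pd.Series(pd.date_range(test_data['START_TIME'].min(),
                                         test_data['STOP_TIME'].max(),
                                         freq='H', inclusive='left'))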

You then build the Cartesian product of the possible INTERFACE, CLASS, and hour_beginning values:

interfaces = test_data['INTERFACE'].unique()
classes = test_data['CLASS'].unique()

import itertools

comb = pd.DataFrame(
    itertools.product(interfaces, classes, hour_beginning),
    columns=['INTERFACE', 'CLASS', 'HOUR_BEGINNING'],
)
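
As an aside, pandas can build the same product table on its own with MultiIndex.from_product; this is an equivalent sketch, not part of the original solution:

comb = pd.MultiIndex.from_product(
    [interfaces, classes, hour_beginning],
    names=['INTERFACE', 'CLASS', 'HOUR_BEGINNING'],
).to_frame(index=False)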

Then you merge test_data with the product DataFrame, keeping only the rows where HOUR_BEGINNING falls between START_TIME and STOP_TIME:

df_mrg = test_data.merge(comb, on = ['INTERFACE', 'CLASS']).query('START_TIME <= HOUR_BEGINNING < STOP_TIME')
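
If you prefer explicit boolean indexing over query(), the same merge-and-filter step can be written as follows (a sketch assuming the same column names):

df_mrg = test_data.merge(comb, on=['INTERFACE', 'CLASS'])
df_mrg = df_mrg[(df_mrg['START_TIME'] <= df_mrg['HOUR_BEGINNING'])
                & (df_mrg['HOUR_BEGINNING'] < df_mrg['STOP_TIME'])]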

Finally, you group by the three key columns and sum:

result2 = df_mrg.groupby(['INTERFACE', 'CLASS', 'HOUR_BEGINNING']).sum(numeric_only=True)
result2 = result2.reset_index().sort_values(['INTERFACE', 'CLASS', 'HOUR_BEGINNING']).rename(columns={'CAPACITY': 'NET_CAPACITY'})
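
An equivalent formulation selects the CAPACITY column explicitly instead of relying on numeric_only; a stylistic sketch of the same aggregation:

result2 = (df_mrg.groupby(['INTERFACE', 'CLASS', 'HOUR_BEGINNING'])['CAPACITY']
           .sum()
           .reset_index()
           .sort_values(['INTERFACE', 'CLASS', 'HOUR_BEGINNING'])
           .rename(columns={'CAPACITY': 'NET_CAPACITY'}))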

result2 equals your original result once it is sorted the same way:

result1 = result.drop('index', axis=1).sort_values(['INTERFACE', 'CLASS', 'HOUR_BEGINNING'])
result1.reset_index(drop=True).compare(result2.reset_index(drop=True))  # Returns an empty DataFrame: the results match.

On Colab, the new aggregation takes 1.7 sec versus 1 min 27 sec for the original loop.
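
For reference, here is the whole approach assembled into a single function. This is a sketch that assumes the column layout produced by make_test_data above and the imports from the question:

import itertools

def calc_hourly_totals_fast(data: pd.DataFrame) -> pd.DataFrame:
    """Vectorized hourly totals via Cartesian product + merge + groupby."""
    # Hour-beginning timestamps covering the full data range; drop the final edge.
    # ('H' is the hourly alias used above; newer pandas versions prefer 'h'.)
    hours = pd.Series(pd.date_range(data['START_TIME'].min(),
                                    data['STOP_TIME'].max(), freq='H'))
    hours = hours.drop(hours.index[-1])

    # All (INTERFACE, CLASS, HOUR_BEGINNING) combinations.
    comb = pd.DataFrame(
        itertools.product(data['INTERFACE'].unique(),
                          data['CLASS'].unique(), hours),
        columns=['INTERFACE', 'CLASS', 'HOUR_BEGINNING'],
    )

    # Keep each record only for the hours it actually spans, then aggregate.
    merged = (data.merge(comb, on=['INTERFACE', 'CLASS'])
                  .query('START_TIME <= HOUR_BEGINNING < STOP_TIME'))
    return (merged.groupby(['INTERFACE', 'CLASS', 'HOUR_BEGINNING'])['CAPACITY']
                  .sum()
                  .reset_index()
                  .rename(columns={'CAPACITY': 'NET_CAPACITY'}))

# Usage:
# result2 = calc_hourly_totals_fast(test_data)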

Answered By: notiv