What is a faster method to calculate hourly totals from a pandas DataFrame than a for loop?
Question:
I have a pandas DataFrame with about 200,000 rows of raw data. Each row has start and stop times that can span an hour to years. I am using a for loop to calculate a total for each hour of a year; each hourly total sums records that span that hour. I am calculating hourly totals for a full year, or about 24 * 365 = 8760 hours. Even running this for only 1,000 rows of raw data takes about 40 seconds on my laptop. I’m looking for a much faster alternative.
I extracted the logic to create this standalone code, which runs successfully in Python 3.10.
import numpy as np
import pandas as pd
from datetime import datetime, timedelta
import random
def make_test_data(records: int = 1000) -> pd.DataFrame:
    """
    Build a reproducible random data set for the hourly-total calculation.

    :param records: int: the number of records to create for the test data
        set. A falsy value (0 or None) falls back to 100 records.
    :return: pd.DataFrame with columns ['INTERFACE', 'CLASS', 'START_TIME',
        'STOP_TIME', 'CAPACITY']
    """
    # A private generator seeded with 0 produces the same sequence as seeding
    # the module-level generator, but leaves the caller's global random state
    # untouched.
    rng = random.Random(0)

    # Random data will be in these bounds.
    valid_interfaces = ('A', 'B', 'C', 'D', 'E', 'F')
    valid_classes = ('FIRM', 'NON-FIRM', 'SECONDARY')
    min_date = datetime(2022, 1, 1)
    max_date = datetime(2023, 1, 1)

    # Random date generator: a whole number of days after `lower`, strictly
    # before `upper`.
    def rand_date(lower, upper):
        days = (upper - lower).days
        return lower + timedelta(days=rng.randrange(days))

    records = records or 100
    # The draw order below is significant: it fixes which random numbers feed
    # which column.
    interfaces = rng.choices(valid_interfaces, k=records)
    classes = rng.choices(valid_classes, k=records)
    starts = [rand_date(min_date, max_date) for _ in range(records)]
    # Each stop is drawn from [start, max_date), so it never precedes its start.
    stops = [rand_date(start, max_date) for start in starts]
    capacities = [rng.randint(-10000, 10000) for _ in range(records)]

    return pd.DataFrame({
        'INTERFACE': interfaces,
        'CLASS': classes,
        'START_TIME': starts,
        'STOP_TIME': stops,
        'CAPACITY': capacities,
    })
def calc_hourly_totals(data: pd.DataFrame) -> pd.DataFrame:
    """
    Create a dataframe with net capacity by hour, interface and class.

    One hour is evaluated for every whole hour between the earliest and the
    latest START_TIME. A record contributes to an hour when it overlaps it,
    i.e. START_TIME < end of hour and STOP_TIME > start of hour.

    :param data: pd.DataFrame with columns ['INTERFACE', 'CLASS', 'START_TIME',
        'STOP_TIME', 'CAPACITY']
    :return: pd.DataFrame with columns ['index', 'INTERFACE', 'CLASS',
        'HOUR_BEGINNING', 'NET_CAPACITY']. 'index' is the position of the
        row within its hour; it is kept so existing callers that drop it by
        name keep working. The frame is empty when there is no input or the
        START_TIME values span less than one hour.
    """
    output_columns = ['index', 'INTERFACE', 'CLASS',
                      'HOUR_BEGINNING', 'NET_CAPACITY']
    if data.empty:
        return pd.DataFrame(columns=output_columns)

    one_hour = pd.Timedelta(hours=1)
    min_date = data.START_TIME.min()
    # NOTE(review): the window ends at the latest START_TIME, not the latest
    # STOP_TIME - confirm that this is the intended reporting range.
    max_date = data.START_TIME.max()
    n_hours = (max_date - min_date) // one_hour

    # Collect the per-hour frames and concatenate once at the end; growing a
    # DataFrame with concat inside the loop copies all previous rows each time.
    hourly_frames = []
    for offset in range(n_hours):
        hour_start = min_date + offset * one_hour
        hour_stop = hour_start + one_hour
        # Keep only the records that overlap this hour.
        overlapping = data[(data.START_TIME < hour_stop)
                           & (data.STOP_TIME > hour_start)]
        if overlapping.empty:
            continue
        totals = overlapping.groupby(['INTERFACE', 'CLASS'],
                                     as_index=False)['CAPACITY'].sum()
        # Place the hour between the grouping keys and the total.
        totals.insert(2, 'HOUR_BEGINNING', hour_start)
        hourly_frames.append(totals)

    if not hourly_frames:
        return pd.DataFrame(columns=output_columns)

    result = pd.concat(hourly_frames)
    result = result.rename(columns={'CAPACITY': 'NET_CAPACITY'})
    # Turns the per-hour row positions into the leading 'index' column.
    return result.reset_index()
if __name__ == '__main__':
    from time import perf_counter

    # Synthetic input; a real data set (for example one loaded with
    # pd.read_excel) can be substituted here.
    test_data = make_test_data(1_000)
    print('test data rows:', len(test_data))

    # Time only the aggregation, not the data generation.
    t_start = perf_counter()
    result = calc_hourly_totals(test_data)
    elapsed = perf_counter() - t_start

    print('computation time:', elapsed)
    print(result.head())
My results look like this:
test data rows: 1000
computation time: 38.52079169999888
index INTERFACE CLASS HOUR_BEGINNING NET_CAPACITY
0 0 A FIRM 2022-01-01 00:00:00 -8646
1 1 B SECONDARY 2022-01-01 00:00:00 -2296
2 2 E SECONDARY 2022-01-01 00:00:00 7927
3 0 A FIRM 2022-01-01 01:00:00 -8646
4 1 B SECONDARY 2022-01-01 01:00:00 -2296
Process finished with exit code 0
Answers:
If you are willing to trade memory for speed and you have enough memory, the following could work:
# Calculate all possible time-points - similar to hourly_dates but with an additional time-point at the end.
# The hourly step is given as a Timedelta because the 'H' frequency alias is deprecated in recent pandas releases.
hour_beginning = pd.Series(
    pd.date_range(
        test_data['START_TIME'].min(),
        test_data['STOP_TIME'].max(),
        freq=pd.Timedelta(hours=1),
    )
)
# Remove this additional -last- time-point
hour_beginning = hour_beginning.drop(hour_beginning.index[-1])
You then calculate the cartesian product of the possible `INTERFACE`, `CLASS` and `hour_beginning` values:
import itertools

# Distinct grouping-key values present in the data.
interfaces = test_data['INTERFACE'].unique()
classes = test_data['CLASS'].unique()
# One row per (interface, class, hour) combination.
comb = pd.DataFrame(
    itertools.product(interfaces, classes, hour_beginning),
    columns=['INTERFACE', 'CLASS', 'HOUR_BEGINNING'],
)
Then you merge `test_data` with the product DataFrame `comb`, but select only the time-points between `START_TIME` and `STOP_TIME`:
# Pair every record with every candidate hour of its (INTERFACE, CLASS) ...
df_mrg = test_data.merge(comb, on=['INTERFACE', 'CLASS'])
# ... then keep only the hours that start inside the record's time span.
df_mrg = df_mrg.query('START_TIME <= HOUR_BEGINNING < STOP_TIME')
Finally you `groupby` and `sum`:
# Sum the numeric columns per interface, class and hour, then flatten the
# group keys back into columns and name the total like the loop-based result.
group_keys = ['INTERFACE', 'CLASS', 'HOUR_BEGINNING']
result2 = (
    df_mrg.groupby(group_keys)
    .sum(numeric_only=True)
    .reset_index()
    .sort_values(group_keys)
    .rename(columns={'CAPACITY': 'NET_CAPACITY'})
)
`result2` is equal to "your" `result`, once that is sorted:
# Drop the helper 'index' column and order the loop-based result by the same keys as result2.
result1 = result.drop(columns='index').sort_values(['INTERFACE', 'CLASS', 'HOUR_BEGINNING'])
# Compare on a fresh 0..n-1 index so both frames carry identical labels.
lhs = result1.reset_index(drop=True)
rhs = result2.reset_index(drop=True)
lhs.compare(rhs)  # Raises no error
On Colab the new aggregation takes 1.7 sec, the old 1 min 27 sec.
I have a pandas DataFrame with about 200,000 rows of raw data. Each row has start and stop times that can span an hour to years. I am using a for loop to calculate a total for each hour of a year; each hourly total sums records that span that hour. I am calculating hourly totals for a full year, or about 24 * 365 = 8760 hours. Even running this for only 1,000 rows of raw data takes about 40 seconds on my laptop. I’m looking for a much faster alternative.
I extracted the logic to create this standalone code, which runs successfully in Python 3.10.
import numpy as np
import pandas as pd
from datetime import datetime, timedelta
import random
def make_test_data(records: int = 1000) -> pd.DataFrame:
    """
    Build a reproducible random data set for the hourly-total calculation.

    :param records: int: the number of records to create for the test data
        set. A falsy value (0 or None) falls back to 100 records.
    :return: pd.DataFrame with columns ['INTERFACE', 'CLASS', 'START_TIME',
        'STOP_TIME', 'CAPACITY']
    """
    # A private generator seeded with 0 produces the same sequence as seeding
    # the module-level generator, but leaves the caller's global random state
    # untouched.
    rng = random.Random(0)

    # Random data will be in these bounds.
    valid_interfaces = ('A', 'B', 'C', 'D', 'E', 'F')
    valid_classes = ('FIRM', 'NON-FIRM', 'SECONDARY')
    min_date = datetime(2022, 1, 1)
    max_date = datetime(2023, 1, 1)

    # Random date generator: a whole number of days after `lower`, strictly
    # before `upper`.
    def rand_date(lower, upper):
        days = (upper - lower).days
        return lower + timedelta(days=rng.randrange(days))

    records = records or 100
    # The draw order below is significant: it fixes which random numbers feed
    # which column.
    interfaces = rng.choices(valid_interfaces, k=records)
    classes = rng.choices(valid_classes, k=records)
    starts = [rand_date(min_date, max_date) for _ in range(records)]
    # Each stop is drawn from [start, max_date), so it never precedes its start.
    stops = [rand_date(start, max_date) for start in starts]
    capacities = [rng.randint(-10000, 10000) for _ in range(records)]

    return pd.DataFrame({
        'INTERFACE': interfaces,
        'CLASS': classes,
        'START_TIME': starts,
        'STOP_TIME': stops,
        'CAPACITY': capacities,
    })
def calc_hourly_totals(data: pd.DataFrame) -> pd.DataFrame:
    """
    Create a dataframe with net capacity by hour, interface and class.

    One hour is evaluated for every whole hour between the earliest and the
    latest START_TIME. A record contributes to an hour when it overlaps it,
    i.e. START_TIME < end of hour and STOP_TIME > start of hour.

    :param data: pd.DataFrame with columns ['INTERFACE', 'CLASS', 'START_TIME',
        'STOP_TIME', 'CAPACITY']
    :return: pd.DataFrame with columns ['index', 'INTERFACE', 'CLASS',
        'HOUR_BEGINNING', 'NET_CAPACITY']. 'index' is the position of the
        row within its hour; it is kept so existing callers that drop it by
        name keep working. The frame is empty when there is no input or the
        START_TIME values span less than one hour.
    """
    output_columns = ['index', 'INTERFACE', 'CLASS',
                      'HOUR_BEGINNING', 'NET_CAPACITY']
    if data.empty:
        return pd.DataFrame(columns=output_columns)

    one_hour = pd.Timedelta(hours=1)
    min_date = data.START_TIME.min()
    # NOTE(review): the window ends at the latest START_TIME, not the latest
    # STOP_TIME - confirm that this is the intended reporting range.
    max_date = data.START_TIME.max()
    n_hours = (max_date - min_date) // one_hour

    # Collect the per-hour frames and concatenate once at the end; growing a
    # DataFrame with concat inside the loop copies all previous rows each time.
    hourly_frames = []
    for offset in range(n_hours):
        hour_start = min_date + offset * one_hour
        hour_stop = hour_start + one_hour
        # Keep only the records that overlap this hour.
        overlapping = data[(data.START_TIME < hour_stop)
                           & (data.STOP_TIME > hour_start)]
        if overlapping.empty:
            continue
        totals = overlapping.groupby(['INTERFACE', 'CLASS'],
                                     as_index=False)['CAPACITY'].sum()
        # Place the hour between the grouping keys and the total.
        totals.insert(2, 'HOUR_BEGINNING', hour_start)
        hourly_frames.append(totals)

    if not hourly_frames:
        return pd.DataFrame(columns=output_columns)

    result = pd.concat(hourly_frames)
    result = result.rename(columns={'CAPACITY': 'NET_CAPACITY'})
    # Turns the per-hour row positions into the leading 'index' column.
    return result.reset_index()
if __name__ == '__main__':
    from time import perf_counter

    # Synthetic input; a real data set (for example one loaded with
    # pd.read_excel) can be substituted here.
    test_data = make_test_data(1_000)
    print('test data rows:', len(test_data))

    # Time only the aggregation, not the data generation.
    t_start = perf_counter()
    result = calc_hourly_totals(test_data)
    elapsed = perf_counter() - t_start

    print('computation time:', elapsed)
    print(result.head())
My results look like this:
test data rows: 1000
computation time: 38.52079169999888
index INTERFACE CLASS HOUR_BEGINNING NET_CAPACITY
0 0 A FIRM 2022-01-01 00:00:00 -8646
1 1 B SECONDARY 2022-01-01 00:00:00 -2296
2 2 E SECONDARY 2022-01-01 00:00:00 7927
3 0 A FIRM 2022-01-01 01:00:00 -8646
4 1 B SECONDARY 2022-01-01 01:00:00 -2296
Process finished with exit code 0
If you are willing to trade memory for speed and you have enough memory, the following could work:
# Calculate all possible time-points - similar to hourly_dates but with an additional time-point at the end.
# The hourly step is given as a Timedelta because the 'H' frequency alias is deprecated in recent pandas releases.
hour_beginning = pd.Series(
    pd.date_range(
        test_data['START_TIME'].min(),
        test_data['STOP_TIME'].max(),
        freq=pd.Timedelta(hours=1),
    )
)
# Remove this additional -last- time-point
hour_beginning = hour_beginning.drop(hour_beginning.index[-1])
You then calculate the cartesian product of the possible `INTERFACE`, `CLASS` and `hour_beginning` values:
import itertools

# Distinct grouping-key values present in the data.
interfaces = test_data['INTERFACE'].unique()
classes = test_data['CLASS'].unique()
# One row per (interface, class, hour) combination.
comb = pd.DataFrame(
    itertools.product(interfaces, classes, hour_beginning),
    columns=['INTERFACE', 'CLASS', 'HOUR_BEGINNING'],
)
Then you merge `test_data` with the product DataFrame `comb`, but select only the time-points between `START_TIME` and `STOP_TIME`:
# Pair every record with every candidate hour of its (INTERFACE, CLASS) ...
df_mrg = test_data.merge(comb, on=['INTERFACE', 'CLASS'])
# ... then keep only the hours that start inside the record's time span.
df_mrg = df_mrg.query('START_TIME <= HOUR_BEGINNING < STOP_TIME')
Finally you `groupby` and `sum`:
# Sum the numeric columns per interface, class and hour, then flatten the
# group keys back into columns and name the total like the loop-based result.
group_keys = ['INTERFACE', 'CLASS', 'HOUR_BEGINNING']
result2 = (
    df_mrg.groupby(group_keys)
    .sum(numeric_only=True)
    .reset_index()
    .sort_values(group_keys)
    .rename(columns={'CAPACITY': 'NET_CAPACITY'})
)
`result2` is equal to "your" `result`, once that is sorted:
# Drop the helper 'index' column and order the loop-based result by the same keys as result2.
result1 = result.drop(columns='index').sort_values(['INTERFACE', 'CLASS', 'HOUR_BEGINNING'])
# Compare on a fresh 0..n-1 index so both frames carry identical labels.
lhs = result1.reset_index(drop=True)
rhs = result2.reset_index(drop=True)
lhs.compare(rhs)  # Raises no error
On Colab the new aggregation takes 1.7 sec, the old 1 min 27 sec.