How to efficiently calculate membership counts by month and group
Question:
I have to calculate in Python the number of unique active members by year, month, and group for a large dataset (N ~ 30M). Membership always starts at the beginning of the month and ends at the end of the month. Here is a very small subset of the data.
print(df.head(6))
member_id type start_date end_date
1 10 A 2021-12-01 2022-05-31
2 22 B 2022-01-01 2022-07-31
3 17 A 2022-01-01 2022-06-30
4 57 A 2022-02-02 2022-02-28
5 41 B 2022-02-02 2022-04-30
My current solution is inefficient as it relies on a for loop:
import pandas as pd

# One month-start timestamp per month across the full membership window.
date_list = pd.date_range(
    start=min(df.start_date),
    end=max(df.end_date),
    freq='MS'
)
members = pd.DataFrame()
for d in date_list:
    # Flag rows whose membership interval covers month-start d.
    df['date_filter'] = (
        (d >= df.start_date)
        & (d <= df.end_date)
    )
    # Unique active members per type for this month.
    grouped_members = (
        df
        .loc[df.date_filter]
        .groupby(by='type', as_index=False)
        .member_id
        .nunique()
    )
    member_counts = pd.DataFrame(
        data={'year': d.year, 'month': d.month},  # FIX: comma was missing before index=
        index=[0]
    )
    # Cross join attaches the year/month pair to every type row.
    member_counts = member_counts.merge(
        right=grouped_members,
        how='cross'
    )
    # FIX: pd.concat is a function call, not a subscript (was pd.concat[...]).
    members = pd.concat([members, member_counts])
members = members.reset_index(drop=True)
It produces the following:
print(members)
year month type member_id
0 2021 12 A 1
1 2021 12 B 0
2 2022 1 A 3
3 2022 1 B 1
4 2022 2 A 3
5 2022 2 B 2
6 2022 3 A 2
7 2022 3 B 2
8 2022 4 A 2
9 2022 4 B 2
10 2022 5 A 2
11 2022 5 B 1
12 2022 6 A 1
13 2022 6 B 1
14 2022 7 A 0
15 2022 7 B 1
I’m looking for a completely vectorized solution to reduce computational time.
Answers:
You can try creating temporary column storing pd.date_range
, exploding it and then do pd.crosstab
:
# For every membership row, list the month-end timestamps it covers.
df["range"] = df.apply(
    lambda row: pd.date_range(row["start_date"], row["end_date"], freq="M"),
    axis=1,
)
# Explode only the two needed columns to save memory, then count rows
# per (month, type) with a crosstab.
exploded = df[["range", "type"]].explode("range")
tab = pd.crosstab(exploded["range"], exploded["type"])
df = tab.stack().to_frame("member_id").reset_index()
# Split the month timestamp into separate year and month columns.
month_col = df.pop("range")
df["year"] = month_col.dt.year
df["month"] = month_col.dt.month
print(df[["year", "month", "type", "member_id"]])
Prints:
year month type member_id
0 2021 12 A 1
1 2021 12 B 0
2 2022 1 A 2
3 2022 1 B 1
4 2022 2 A 3
5 2022 2 B 2
6 2022 3 A 2
7 2022 3 B 2
8 2022 4 A 2
9 2022 4 B 2
10 2022 5 A 2
11 2022 5 B 1
12 2022 6 A 1
13 2022 6 B 1
14 2022 7 A 0
15 2022 7 B 1
Updated answer that avoids melt
. Maybe faster? Uses the same idea as before where we don’t actually care about member ids, we are just keeping track of start/end counts
#Create multiindexed series for reindexing later
#(full (type, month) grid so months with no starts/ends still get a row)
months = pd.date_range(
start=df.start_date.min(),
end=df.end_date.max(),
freq='MS',
).to_period('M')
ind = pd.MultiIndex.from_product([df.type.unique(),months],names=['type','month'])
#push each end date to the next month
#(an end in 2022-02 stops counting from 2022-03 onward)
df['end_date'] += pd.DateOffset(1)
#Convert the dates to yyyy-mm
df['start_date'] = df.start_date.dt.to_period('M')
df['end_date'] = df.end_date.dt.to_period('M')
#Get cumsum counts per type/month of start and ends
#NOTE(review): value_counts counts rows, not unique members — assumes one
#row per member/interval (double-counts overlapping ranges, as the answer
#text warns)
gb_counts = (
df.groupby('type').agg(
start = ('start_date','value_counts'),
end = ('end_date','value_counts'),
)
.reindex(ind)
.fillna(0)
.groupby('type')
.cumsum()
.astype(int)
)
#active members per month = running starts minus running ends
counts = (gb_counts.start-gb_counts.end).unstack()
counts
ORIGINAL
Updated answer that works unless the same member_id/group has overlapping date ranges (in which case it double-counts)
The idea is to keep track of when the number of users changes per group instead of exploding out all months per user.
I think this should be very fast and I’m curious how it performs
Output
Code (looks long but is mostly comments)
import pandas as pd
import itertools
#Load example data
import io #just for reading in your example table
df = pd.read_csv(
    io.StringIO("""
0 member_id type start_date end_date
1 10 A 2021-12-01 2022-05-31
2 22 B 2022-01-01 2022-07-31
3 17 A 2022-01-01 2022-06-30
4 57 A 2022-02-02 2022-02-28
5 41 B 2022-02-02 2022-04-30
"""),
    sep=r'\s+',  # FIX: delim_whitespace=True is deprecated since pandas 2.2
    index_col=0,
    parse_dates=['start_date','end_date'],
).reset_index(drop=True)
#Create the full (type, month) index for reindexing and ffill later
months = pd.date_range(
    start=df.start_date.min(),
    end=df.end_date.max(),
    freq='MS',
).to_period('M')
# FIX: pd.Categorical(itertools.product(...)) fails — a generator is not
# list-like, and reindexing the (type, date) MultiIndex below needs a
# MultiIndex of the cross product, not a Categorical of tuples.
full_ind = pd.MultiIndex.from_product(
    [df.type.unique(), months], names=['type', 'date'],
)
#push each end date to the next month
#(so an end acts as a -1 event starting the month after the last active one)
df['end_date'] += pd.DateOffset(1)
#Convert the dates to yyyy-mm
df['start_date'] = df.start_date.dt.to_period('M')
df['end_date'] = df.end_date.dt.to_period('M')
#Melt from:
#
#member_id | type | start_date | end_date
#----------|------|------------|-----------
# 10       | A    | 2021-12    | 2022-06
# ...
#
#to one event row per start/end:
#
# type | active_users | date
#----------------------------
# A    | start_date   | 2021-12
# A    | end_date     | 2022-06
# ...
df = df.melt(
    id_vars='type',
    value_vars=['start_date','end_date'],
    var_name='active_users',
    value_name='date',
).sort_values('date')
#Replace var column with +1/-1 for start/end date rows
df['active_users'] = df.active_users.replace({'start_date':1,'end_date':-1})
#Sum the +/-1 events within each type/date then cumsum the running number
#of active users. (Each type's events net to zero, so the cumsum carries 0
#across the type boundary.)
df = df.groupby(['type','date']).sum().cumsum()
#Reindex to the full grid and ffill months that had no start/end events
df = df.reindex(full_ind).ffill().astype(int)
df.unstack()
Just so you have a naive python solution to use as a baseline:
from collections import defaultdict
from datetime import datetime, timedelta
from random import randint
# Sample memberships; ISO date strings compare lexicographically in
# chronological order, so plain string comparison is safe below.
rows = [
    {'member_id': '10', 'type': 'A', 'start_date': '2021-12-01', 'end_date': '2022-05-31'},
    {'member_id': '22', 'type': 'B', 'start_date': '2022-01-01', 'end_date': '2022-07-31'},
    {'member_id': '17', 'type': 'A', 'start_date': '2022-01-01', 'end_date': '2022-06-30'},
    {'member_id': '57', 'type': 'A', 'start_date': '2022-02-02', 'end_date': '2022-02-28'},
    {'member_id': '41', 'type': 'B', 'start_date': '2022-02-02', 'end_date': '2022-04-30'}
]
# generate 30M rows with random member ids (cardinality = 10M)
rows_30M = [
    {
        "member_id": randint(0, 10000000),
        "type": row["type"],
        "start_date": row["start_date"],
        "end_date": row["end_date"],
    }
    for i in range(200000*30)
    for row in rows
]
# %%time
# build tree-like structure: type -> start_date -> end_date -> {member_id}
# + remember min and max dates
tree = defaultdict(lambda: defaultdict(lambda: defaultdict(set)))
min_dt_str = ""
max_dt_str = ""
for row in rows_30M:
    if min_dt_str == "" or min_dt_str > row["start_date"]:
        min_dt_str = row["start_date"]
    if max_dt_str == "" or max_dt_str < row["end_date"]:
        max_dt_str = row["end_date"]
    tree[row["type"]][row["start_date"]][row["end_date"]].add(row["member_id"])
dt = datetime.strptime(min_dt_str, "%Y-%m-%d").date()
max_dt = datetime.strptime(max_dt_str, "%Y-%m-%d").date()
# building results by iterating dates and types
results = []
while dt <= max_dt:
    dt_str = str(dt)  # ISO string of the month's first day
    for type_, date_ranges in tree.items():
        results.append(
            {
                "type": type_,
                "year": dt.year,
                "month": dt.month,
                # distinct members whose [start, end] interval covers dt
                "mau": len(
                    {
                        member
                        for start_date, data in date_ranges.items()
                        if start_date <= dt_str
                        for end_date, members in data.items()
                        if dt_str <= end_date
                        for member in members
                    }
                ),
            }
        )
    # advance to the first day of the next month: day 1 + 31 days always
    # lands in the following month, then snap back to day 1
    dt = (dt + timedelta(days=31)).replace(day=1)
# CPU times: user 23.2 s, sys: 451 ms, total: 23.6 s
# Wall time: 23.6 s
# In [18]: results
# Out[18]:
# [{'type': 'A', 'year': 2021, 'month': 12, 'mau': 4512755},
# {'type': 'B', 'year': 2021, 'month': 12, 'mau': 0},
# {'type': 'A', 'year': 2022, 'month': 1, 'mau': 6988505},
# {'type': 'B', 'year': 2022, 'month': 1, 'mau': 4511064},
# {'type': 'A', 'year': 2022, 'month': 2, 'mau': 6988505},
# {'type': 'B', 'year': 2022, 'month': 2, 'mau': 4511064},
# {'type': 'A', 'year': 2022, 'month': 3, 'mau': 6988505},
# {'type': 'B', 'year': 2022, 'month': 3, 'mau': 6987830},
# {'type': 'A', 'year': 2022, 'month': 4, 'mau': 6988505},
# {'type': 'B', 'year': 2022, 'month': 4, 'mau': 6987830},
# {'type': 'A', 'year': 2022, 'month': 5, 'mau': 6988505},
# {'type': 'B', 'year': 2022, 'month': 5, 'mau': 4511064},
# {'type': 'A', 'year': 2022, 'month': 6, 'mau': 4511612},
# {'type': 'B', 'year': 2022, 'month': 6, 'mau': 4511064},
# {'type': 'A', 'year': 2022, 'month': 7, 'mau': 0},
# {'type': 'B', 'year': 2022, 'month': 7, 'mau': 4511064}]
If member_id
is already unique in the input data, then we just need to sum lengths of sets like this:
# "mau": sum(
# len(members)
# for start_date, data in date_ranges.items()
# if start_date <= dt_str
# for end_date, members in data.items()
# if dt_str <= end_date
# ),
# CPU times: user 12.4 s, sys: 91.9 ms, total: 12.5 s
# Wall time: 12.5 s
I have to calculate in Python the number of unique active members by year, month, and group for a large dataset (N ~ 30M). Membership always starts at the beginning of the month and ends at the end of the month. Here is a very small subset of the data.
print(df.head(6))
member_id type start_date end_date
1 10 A 2021-12-01 2022-05-31
2 22 B 2022-01-01 2022-07-31
3 17 A 2022-01-01 2022-06-30
4 57 A 2022-02-02 2022-02-28
5 41 B 2022-02-02 2022-04-30
My current solution is inefficient as it relies on a for loop:
import pandas as pd

# One month-start timestamp per month across the full membership window.
date_list = pd.date_range(
    start=min(df.start_date),
    end=max(df.end_date),
    freq='MS'
)
members = pd.DataFrame()
for d in date_list:
    # Flag rows whose membership interval covers month-start d.
    df['date_filter'] = (
        (d >= df.start_date)
        & (d <= df.end_date)
    )
    # Unique active members per type for this month.
    grouped_members = (
        df
        .loc[df.date_filter]
        .groupby(by='type', as_index=False)
        .member_id
        .nunique()
    )
    member_counts = pd.DataFrame(
        data={'year': d.year, 'month': d.month},  # FIX: comma was missing before index=
        index=[0]
    )
    # Cross join attaches the year/month pair to every type row.
    member_counts = member_counts.merge(
        right=grouped_members,
        how='cross'
    )
    # FIX: pd.concat is a function call, not a subscript (was pd.concat[...]).
    members = pd.concat([members, member_counts])
members = members.reset_index(drop=True)
It produces the following:
print(members)
year month type member_id
0 2021 12 A 1
1 2021 12 B 0
2 2022 1 A 3
3 2022 1 B 1
4 2022 2 A 3
5 2022 2 B 2
6 2022 3 A 2
7 2022 3 B 2
8 2022 4 A 2
9 2022 4 B 2
10 2022 5 A 2
11 2022 5 B 1
12 2022 6 A 1
13 2022 6 B 1
14 2022 7 A 0
15 2022 7 B 1
I’m looking for a completely vectorized solution to reduce computational time.
You can try creating temporary column storing pd.date_range
, exploding it and then do pd.crosstab
:
# For every membership row, list the month-end timestamps it covers.
df["range"] = df.apply(
    lambda row: pd.date_range(row["start_date"], row["end_date"], freq="M"),
    axis=1,
)
# Explode only the two needed columns to save memory, then count rows
# per (month, type) with a crosstab.
exploded = df[["range", "type"]].explode("range")
tab = pd.crosstab(exploded["range"], exploded["type"])
df = tab.stack().to_frame("member_id").reset_index()
# Split the month timestamp into separate year and month columns.
month_col = df.pop("range")
df["year"] = month_col.dt.year
df["month"] = month_col.dt.month
print(df[["year", "month", "type", "member_id"]])
Prints:
year month type member_id
0 2021 12 A 1
1 2021 12 B 0
2 2022 1 A 2
3 2022 1 B 1
4 2022 2 A 3
5 2022 2 B 2
6 2022 3 A 2
7 2022 3 B 2
8 2022 4 A 2
9 2022 4 B 2
10 2022 5 A 2
11 2022 5 B 1
12 2022 6 A 1
13 2022 6 B 1
14 2022 7 A 0
15 2022 7 B 1
Updated answer that avoids melt
. Maybe faster? Uses the same idea as before where we don’t actually care about member ids, we are just keeping track of start/end counts
#Create multiindexed series for reindexing later
#(full (type, month) grid so months with no starts/ends still get a row)
months = pd.date_range(
start=df.start_date.min(),
end=df.end_date.max(),
freq='MS',
).to_period('M')
ind = pd.MultiIndex.from_product([df.type.unique(),months],names=['type','month'])
#push each end date to the next month
#(an end in 2022-02 stops counting from 2022-03 onward)
df['end_date'] += pd.DateOffset(1)
#Convert the dates to yyyy-mm
df['start_date'] = df.start_date.dt.to_period('M')
df['end_date'] = df.end_date.dt.to_period('M')
#Get cumsum counts per type/month of start and ends
#NOTE(review): value_counts counts rows, not unique members — assumes one
#row per member/interval (double-counts overlapping ranges, as the answer
#text warns)
gb_counts = (
df.groupby('type').agg(
start = ('start_date','value_counts'),
end = ('end_date','value_counts'),
)
.reindex(ind)
.fillna(0)
.groupby('type')
.cumsum()
.astype(int)
)
#active members per month = running starts minus running ends
counts = (gb_counts.start-gb_counts.end).unstack()
counts
ORIGINAL
Updated answer that works unless the same member_id/group has overlapping date ranges (in which case it double-counts)
The idea is to keep track of when the number of users changes per group instead of exploding out all months per user.
I think this should be very fast and I’m curious how it performs
Output
Code (looks long but is mostly comments)
import pandas as pd
import itertools
#Load example data
import io #just for reading in your example table
df = pd.read_csv(
    io.StringIO("""
0 member_id type start_date end_date
1 10 A 2021-12-01 2022-05-31
2 22 B 2022-01-01 2022-07-31
3 17 A 2022-01-01 2022-06-30
4 57 A 2022-02-02 2022-02-28
5 41 B 2022-02-02 2022-04-30
"""),
    sep=r'\s+',  # FIX: delim_whitespace=True is deprecated since pandas 2.2
    index_col=0,
    parse_dates=['start_date','end_date'],
).reset_index(drop=True)
#Create the full (type, month) index for reindexing and ffill later
months = pd.date_range(
    start=df.start_date.min(),
    end=df.end_date.max(),
    freq='MS',
).to_period('M')
# FIX: pd.Categorical(itertools.product(...)) fails — a generator is not
# list-like, and reindexing the (type, date) MultiIndex below needs a
# MultiIndex of the cross product, not a Categorical of tuples.
full_ind = pd.MultiIndex.from_product(
    [df.type.unique(), months], names=['type', 'date'],
)
#push each end date to the next month
#(so an end acts as a -1 event starting the month after the last active one)
df['end_date'] += pd.DateOffset(1)
#Convert the dates to yyyy-mm
df['start_date'] = df.start_date.dt.to_period('M')
df['end_date'] = df.end_date.dt.to_period('M')
#Melt from:
#
#member_id | type | start_date | end_date
#----------|------|------------|-----------
# 10       | A    | 2021-12    | 2022-06
# ...
#
#to one event row per start/end:
#
# type | active_users | date
#----------------------------
# A    | start_date   | 2021-12
# A    | end_date     | 2022-06
# ...
df = df.melt(
    id_vars='type',
    value_vars=['start_date','end_date'],
    var_name='active_users',
    value_name='date',
).sort_values('date')
#Replace var column with +1/-1 for start/end date rows
df['active_users'] = df.active_users.replace({'start_date':1,'end_date':-1})
#Sum the +/-1 events within each type/date then cumsum the running number
#of active users. (Each type's events net to zero, so the cumsum carries 0
#across the type boundary.)
df = df.groupby(['type','date']).sum().cumsum()
#Reindex to the full grid and ffill months that had no start/end events
df = df.reindex(full_ind).ffill().astype(int)
df.unstack()
Just so you have a naive python solution to use as a baseline:
from collections import defaultdict
from datetime import datetime, timedelta
from random import randint
# Sample memberships; ISO date strings compare lexicographically in
# chronological order, so plain string comparison is safe below.
rows = [
    {'member_id': '10', 'type': 'A', 'start_date': '2021-12-01', 'end_date': '2022-05-31'},
    {'member_id': '22', 'type': 'B', 'start_date': '2022-01-01', 'end_date': '2022-07-31'},
    {'member_id': '17', 'type': 'A', 'start_date': '2022-01-01', 'end_date': '2022-06-30'},
    {'member_id': '57', 'type': 'A', 'start_date': '2022-02-02', 'end_date': '2022-02-28'},
    {'member_id': '41', 'type': 'B', 'start_date': '2022-02-02', 'end_date': '2022-04-30'}
]
# generate 30M rows with random member ids (cardinality = 10M)
rows_30M = [
    {
        "member_id": randint(0, 10000000),
        "type": row["type"],
        "start_date": row["start_date"],
        "end_date": row["end_date"],
    }
    for i in range(200000*30)
    for row in rows
]
# %%time
# build tree-like structure: type -> start_date -> end_date -> {member_id}
# + remember min and max dates
tree = defaultdict(lambda: defaultdict(lambda: defaultdict(set)))
min_dt_str = ""
max_dt_str = ""
for row in rows_30M:
    if min_dt_str == "" or min_dt_str > row["start_date"]:
        min_dt_str = row["start_date"]
    if max_dt_str == "" or max_dt_str < row["end_date"]:
        max_dt_str = row["end_date"]
    tree[row["type"]][row["start_date"]][row["end_date"]].add(row["member_id"])
dt = datetime.strptime(min_dt_str, "%Y-%m-%d").date()
max_dt = datetime.strptime(max_dt_str, "%Y-%m-%d").date()
# building results by iterating dates and types
results = []
while dt <= max_dt:
    dt_str = str(dt)  # ISO string of the month's first day
    for type_, date_ranges in tree.items():
        results.append(
            {
                "type": type_,
                "year": dt.year,
                "month": dt.month,
                # distinct members whose [start, end] interval covers dt
                "mau": len(
                    {
                        member
                        for start_date, data in date_ranges.items()
                        if start_date <= dt_str
                        for end_date, members in data.items()
                        if dt_str <= end_date
                        for member in members
                    }
                ),
            }
        )
    # advance to the first day of the next month: day 1 + 31 days always
    # lands in the following month, then snap back to day 1
    dt = (dt + timedelta(days=31)).replace(day=1)
# CPU times: user 23.2 s, sys: 451 ms, total: 23.6 s
# Wall time: 23.6 s
# In [18]: results
# Out[18]:
# [{'type': 'A', 'year': 2021, 'month': 12, 'mau': 4512755},
# {'type': 'B', 'year': 2021, 'month': 12, 'mau': 0},
# {'type': 'A', 'year': 2022, 'month': 1, 'mau': 6988505},
# {'type': 'B', 'year': 2022, 'month': 1, 'mau': 4511064},
# {'type': 'A', 'year': 2022, 'month': 2, 'mau': 6988505},
# {'type': 'B', 'year': 2022, 'month': 2, 'mau': 4511064},
# {'type': 'A', 'year': 2022, 'month': 3, 'mau': 6988505},
# {'type': 'B', 'year': 2022, 'month': 3, 'mau': 6987830},
# {'type': 'A', 'year': 2022, 'month': 4, 'mau': 6988505},
# {'type': 'B', 'year': 2022, 'month': 4, 'mau': 6987830},
# {'type': 'A', 'year': 2022, 'month': 5, 'mau': 6988505},
# {'type': 'B', 'year': 2022, 'month': 5, 'mau': 4511064},
# {'type': 'A', 'year': 2022, 'month': 6, 'mau': 4511612},
# {'type': 'B', 'year': 2022, 'month': 6, 'mau': 4511064},
# {'type': 'A', 'year': 2022, 'month': 7, 'mau': 0},
# {'type': 'B', 'year': 2022, 'month': 7, 'mau': 4511064}]
If member_id
is already unique in the input data, then we just need to sum lengths of sets like this:
# "mau": sum(
# len(members)
# for start_date, data in date_ranges.items()
# if start_date <= dt_str
# for end_date, members in data.items()
# if dt_str <= end_date
# ),
# CPU times: user 12.4 s, sys: 91.9 ms, total: 12.5 s
# Wall time: 12.5 s