How to efficiently calculate membership counts by month and group

Question:

I have to calculate in Python the number of unique active members by year, month, and group for a large dataset (N ~ 30M). Membership always starts at the beginning of the month and ends at the end of the month. Here is a very small subset of the data.

print(df.head(6))
   member_id  type  start_date    end_date
1         10     A  2021-12-01  2022-05-31
2         22     B  2022-01-01  2022-07-31
3         17     A  2022-01-01  2022-06-30
4         57     A  2022-02-02  2022-02-28
5         41     B  2022-02-02  2022-04-30

My current solution is inefficient as it relies on a for loop:

import pandas as pd


# One month-start timestamp per month covered by the data.
date_list = pd.date_range(
    start=min(df.start_date),
    end=max(df.end_date),
    freq='MS'
)
members = pd.DataFrame()

for d in date_list:
    # Mark rows whose membership interval covers month-start d.
    # NOTE: start dates after the 1st (e.g. 2022-02-02) are NOT matched for
    # their own month because d is always the 1st — compare month periods
    # instead if such rows must count.
    df['date_filter'] = (
        (d >= df.start_date)
        & (d <= df.end_date)
    )
    # Unique active members per type for this month.
    grouped_members = (
        df
        .loc[df.date_filter]
        .groupby(by='type', as_index=False)
        .member_id
        .nunique()
    )
    member_counts = pd.DataFrame(
        data={'year': d.year, 'month': d.month},  # fixed: comma was missing before index=
        index=[0]
    )
    member_counts = member_counts.merge(
        right=grouped_members,
        how='cross'
    )
    # fixed: pd.concat is a function taking a list, not a subscriptable object
    members = pd.concat([members, member_counts])
members = members.reset_index(drop=True)

It produces the following:

print(members)

    year  month  type  member_id
 0  2021     12     A          1
 1  2021     12     B          0
 2  2022      1     A          3
 3  2022      1     B          1
 4  2022      2     A          3
 5  2022      2     B          2
 6  2022      3     A          2
 7  2022      3     B          2
 8  2022      4     A          2
 9  2022      4     B          2
10  2022      5     A          2
11  2022      5     B          1
12  2022      6     A          1
13  2022      6     B          1
14  2022      7     A          0
15  2022      7     B          1

I’m looking for a completely vectorized solution to reduce computational time.

Asked By: David Johnson

||

Answers:

You can try creating temporary column storing pd.date_range, exploding it and then do pd.crosstab:

# One month-end timestamp per active month for each membership row.
df["range"] = df.apply(
    lambda row: pd.date_range(row["start_date"], row["end_date"], freq="M"),
    axis=1,
)

# exploding only two columns to save memory
exploded = df[["range", "type"]].explode("range")
counts = pd.crosstab(exploded["range"], exploded["type"])
# stack() keeps the zero cells, so (month, type) pairs with no members survive
df = counts.stack().to_frame("member_id").reset_index()

# Split the month timestamp into year/month columns and drop it.
month_col = df.pop("range")
df["year"] = month_col.dt.year
df["month"] = month_col.dt.month
print(df[["year", "month", "type", "member_id"]])

Prints:

    year  month type  member_id
0   2021     12    A          1
1   2021     12    B          0
2   2022      1    A          2
3   2022      1    B          1
4   2022      2    A          3
5   2022      2    B          2
6   2022      3    A          2
7   2022      3    B          2
8   2022      4    A          2
9   2022      4    B          2
10  2022      5    A          2
11  2022      5    B          1
12  2022      6    A          1
13  2022      6    B          1
14  2022      7    A          0
15  2022      7    B          1
Answered By: Andrej Kesely

Updated answer that avoids melt. Maybe faster? Uses the same idea as before where we don’t actually care about member ids, we are just keeping track of start/end counts

#Create multiindexed series for reindexing later
#NOTE(review): every (type, month) pair must get a row (filled with 0)
#before the cumulative sum, or months with no starts/ends would be skipped.
months = pd.date_range(
    start=df.start_date.min(),
    end=df.end_date.max(),
    freq='MS',
).to_period('M')

ind = pd.MultiIndex.from_product([df.type.unique(),months],names=['type','month'])

#push each end date to the next month
#(a membership ending 2022-02-28 then stops contributing from 2022-03 on)
df['end_date'] += pd.DateOffset(1) 

#Convert the dates to yyyy-mm
df['start_date'] = df.start_date.dt.to_period('M')
df['end_date'] = df.end_date.dt.to_period('M')

#Get cumsum counts per type/month of start and ends 
#NOTE(review): passing 'value_counts' as a named-aggregation function is
#version-sensitive in pandas — confirm it yields per-month counts (not
#normalized frequencies) on the pandas version in use.
gb_counts = (
    df.groupby('type').agg(
        start = ('start_date','value_counts'),
        end = ('end_date','value_counts'),
    )
    .reindex(ind)
    .fillna(0)
    .groupby('type')
    .cumsum()
    .astype(int)
)

#active members = cumulative starts minus cumulative ends, pivoted to one
#column per month
counts = (gb_counts.start-gb_counts.end).unstack()
counts

ORIGINAL

Updated answer that works unless the same member_id/group has overlapping date ranges (in which case it double-counts)

The idea is to keep track of when the number of users changes per group instead of exploding out all months per user.

I think this should be very fast and I’m curious how it performs

Output

(screenshot of the example output — image not included in this text capture)

Code (looks long but is mostly comments)

import pandas as pd
import itertools

#Load example data
import io #just for reading in your example table
df = pd.read_csv(
    io.StringIO("""
0  member_id  type  start_date    end_date
1         10     A  2021-12-01  2022-05-31
2         22     B  2022-01-01  2022-07-31
3         17     A  2022-01-01  2022-06-30
4         57     A  2022-02-02  2022-02-28
5         41     B  2022-02-02  2022-04-30
"""),
    delim_whitespace=True,  # NOTE(review): deprecated in pandas 2.x; sep=r"\s+" is the modern spelling
    index_col=0,
    parse_dates=['start_date','end_date'],
).reset_index(drop=True)

#Create categorical index for reindexing and ffill
months = pd.date_range(
    start=df.start_date.min(),
    end=df.end_date.max(),
    freq='MS',
).to_period('M')

#NOTE(review): pd.Categorical over a raw itertools.product generator looks
#suspect — it likely needs list(...) (or a pd.MultiIndex.from_product) to
#materialize the (type, month) pairs; confirm on the pandas version in use.
cat_ind = pd.Categorical(itertools.product(df.type.unique(),months))

#push each end date to the next month
#(so a member ending 2022-02-28 is counted through 2022-02 but not 2022-03)
df['end_date'] += pd.DateOffset(1) 

#Convert the dates to yyyy-mm
df['start_date'] = df.start_date.dt.to_period('M')
df['end_date'] = df.end_date.dt.to_period('M')

#Melt from:
#
#member_id | type | start_date |  end_date
#----------|------|------------|-----------
#       10 |   A  | 2021-12-01 | 2022-05-31
# ...
#
#to
#
# type | active_users | date
#----------------------------
#    A |   start_date | 2021-12-01
#    A |     end_date | 2022-05-31
# ...
df = df.melt(
    id_vars='type',
    value_vars=['start_date','end_date'],
    var_name='active_users',
    value_name='date',
).sort_values('date')

#Replace var column with +1/-1 for start/end date rows
#
# type | active_users | date
#----------------------------
#    A |            1 | 2021-12-01
#    A |           -1 | 2022-05-31
# ...
df['active_users'] = df.active_users.replace({'start_date':1,'end_date':-1})

#Sum within each type/date then cumsum the number of active users
#NOTE(review): this cumsum runs over the whole frame, across type
#boundaries — it only works because each type's +1/-1 deltas net to zero
#by its last month; confirm counts do not leak between types.
df = df.groupby(['type','date']).sum().cumsum()

#Reindex to ffill missing dates
df = df.reindex(cat_ind).ffill().astype(int)

df.unstack()
Answered By: mitoRibo

Just so you have a naive python solution to use as a baseline:

from collections import defaultdict
from datetime import datetime, timedelta
from random import randint

# The five example memberships from the question, as plain dicts
# (all values kept as strings; dates in ISO yyyy-mm-dd so they compare
# correctly as strings).
rows = [
    {'member_id': '10', 'type': 'A', 'start_date': '2021-12-01', 'end_date': '2022-05-31'},
    {'member_id': '22', 'type': 'B', 'start_date': '2022-01-01', 'end_date': '2022-07-31'},
    {'member_id': '17', 'type': 'A', 'start_date': '2022-01-01', 'end_date': '2022-06-30'},
    {'member_id': '57', 'type': 'A', 'start_date': '2022-02-02', 'end_date': '2022-02-28'},
    {'member_id': '41', 'type': 'B', 'start_date': '2022-02-02', 'end_date': '2022-04-30'}
]
# generate 30M rows with random member ids (cardinality = 10M)
# NOTE(review): randint(0, 10000000) is inclusive at both ends, so there
# are actually 10,000,001 possible ids — close enough for a benchmark.
# Each of the 6M iterations clones all 5 template rows => 30M rows total.
rows_30M = [
    {
        "member_id": randint(0, 10000000),
        "type": row["type"],
        "start_date": row["start_date"],
        "end_date": row["end_date"],
    }
    for i in range(200000*30)
    for row in rows
]

# %%time
# Index the rows as: type -> start_date -> end_date -> {member_id},
# remembering the overall earliest start and latest end along the way.
tree = defaultdict(lambda: defaultdict(lambda: defaultdict(set)))
min_dt_str = ""
max_dt_str = ""
for row in rows_30M:
    start = row["start_date"]
    end = row["end_date"]
    if not min_dt_str or start < min_dt_str:
        min_dt_str = start
    if not max_dt_str or end > max_dt_str:
        max_dt_str = end
    tree[row["type"]][start][end].add(row["member_id"])


dt = datetime.strptime(min_dt_str, "%Y-%m-%d").date()
max_dt = datetime.strptime(max_dt_str, "%Y-%m-%d").date()

# Walk month by month; for each type, union the member sets of every
# (start, end) bucket covering the current month, then count the union.
results = []
while dt <= max_dt:
    dt_str = str(dt)
    for type_, date_ranges in tree.items():
        active = set()
        for start_date, data in date_ranges.items():
            if start_date > dt_str:
                continue
            for end_date, members in data.items():
                if dt_str <= end_date:
                    active |= members
        results.append(
            {
                "type": type_,
                "year": dt.year,
                "month": dt.month,
                "mau": len(active),
            }
        )
    # jump to the 1st of the next month
    dt = (dt + timedelta(days=31)).replace(day=1)

# CPU times: user 23.2 s, sys: 451 ms, total: 23.6 s
# Wall time: 23.6 s

# In [18]: results
# Out[18]:
# [{'type': 'A', 'year': 2021, 'month': 12, 'mau': 4512755},
#  {'type': 'B', 'year': 2021, 'month': 12, 'mau': 0},
#  {'type': 'A', 'year': 2022, 'month': 1, 'mau': 6988505},
#  {'type': 'B', 'year': 2022, 'month': 1, 'mau': 4511064},
#  {'type': 'A', 'year': 2022, 'month': 2, 'mau': 6988505},
#  {'type': 'B', 'year': 2022, 'month': 2, 'mau': 4511064},
#  {'type': 'A', 'year': 2022, 'month': 3, 'mau': 6988505},
#  {'type': 'B', 'year': 2022, 'month': 3, 'mau': 6987830},
#  {'type': 'A', 'year': 2022, 'month': 4, 'mau': 6988505},
#  {'type': 'B', 'year': 2022, 'month': 4, 'mau': 6987830},
#  {'type': 'A', 'year': 2022, 'month': 5, 'mau': 6988505},
#  {'type': 'B', 'year': 2022, 'month': 5, 'mau': 4511064},
#  {'type': 'A', 'year': 2022, 'month': 6, 'mau': 4511612},
#  {'type': 'B', 'year': 2022, 'month': 6, 'mau': 4511064},
#  {'type': 'A', 'year': 2022, 'month': 7, 'mau': 0},
#  {'type': 'B', 'year': 2022, 'month': 7, 'mau': 4511064}]

If member_id is already unique in the input data, then we just need to sum lengths of sets like this:

#     "mau": sum(
#         len(members)
#         for start_date, data in date_ranges.items()
#         if start_date <= dt_str
#         for end_date, members in data.items()
#         if dt_str <= end_date
#     ),
# CPU times: user 12.4 s, sys: 91.9 ms, total: 12.5 s
# Wall time: 12.5 s
Answered By: westandskif