Pandas Cumcount() over multiple columns

Question

I have a dataframe that looks like this:

data = {'exercise': ['squat', 'squat', 'squat', 'squat', 'bench', 'bench', 'bench', 'bench', 'squat', 'squat', 'squat', 'squat', 'bench', 'bench', 'bench', 'bench'],
        'session': [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1],
        'weight': [100, 100, 120, 120, 80, 80, 100, 110, 120, 130, 140, 150, 80, 90, 100, 110],
        'velocity': [0.30, 0.25, 0.20, 0.15, 0.30, 0.25, 0.20, 0.15, 0.30, 0.25, 0.20, 0.15, 0.30, 0.25, 0.20, 0.15]}
df = pd.DataFrame(data, columns = data.keys())

print(df)

   exercise  session  weight  velocity
0     squat        0     100      0.30
1     squat        0     100      0.25
2     squat        0     120      0.20
3     squat        0     120      0.15
4     bench        0      80      0.30
5     bench        0      80      0.25
6     bench        0     100      0.20
7     bench        0     110      0.15
8     squat        1     120      0.30
9     squat        1     130      0.25
10    squat        1     140      0.20
11    squat        1     150      0.15
12    bench        1      80      0.30
13    bench        1      90      0.25
14    bench        1     100      0.20
15    bench        1     110      0.15

What I want to do is to add two columns, one for set number and one for rep number.
Set number should increase by 1 for each weight change exercise & session is the same, else reset to 0.

Rep number should increase by 1 for each velocity change if exercise, session & weight is the same, else reset to 0.

The logic I wrote above was flawed. What I meant was that the set number should increase for each change in weight (per line), but reset to 0 if either exercise or session changes.

Rep count should then be the number of lines in each set.

Like this:

data = {'exercise': ['squat', 'squat', 'squat', 'squat', 'bench', 'bench', 'bench', 'bench', 'squat', 'squat', 'squat', 'squat', 'bench', 'bench', 'bench', 'bench'],
        'session': [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1],
        'weight': [100, 100, 120, 120, 80, 80, 100, 110, 120, 130, 140, 150, 80, 90, 100, 110],
        'velocity': [0.30, 0.25, 0.20, 0.15, 0.30, 0.25, 0.20, 0.15, 0.30, 0.25, 0.20, 0.15, 0.30, 0.25, 0.20, 0.15],
        'set': [0, 0, 1, 1, 0, 0, 1, 2, 0, 1, 2, 3, 0, 1, 2, 3],
        'rep': [0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]}
df = pd.DataFrame(data, columns = data.keys())
print(df)

   exercise  session  weight  velocity  set  rep
0     squat        0     100      0.30    0    0
1     squat        0     100      0.25    0    1
2     squat        0     120      0.20    1    0
3     squat        0     120      0.15    1    1
4     bench        0      80      0.30    0    0
5     bench        0      80      0.25    0    1
6     bench        0     100      0.20    1    0
7     bench        0     110      0.15    2    0
8     squat        1     120      0.30    0    0
9     squat        1     130      0.25    1    0
10    squat        1     140      0.20    2    0
11    squat        1     150      0.15    3    0
12    bench        1      80      0.30    0    0
13    bench        1      90      0.25    1    0
14    bench        1     100      0.20    2    0
15    bench        1     110      0.15    3    0

I think this should be possible using groupby and cumcount, but I struggle to make it work.

Asked By: Mattias Thalén

||

Source

Answer 1

Use GroupBy.transform with factorize and GroupBy.cumcount:

df['set1'] = (df.groupby(['exercise','session'])['weight']
                .transform(lambda x: pd.factorize(x)[0]))
df['rep1'] = df.groupby(['exercise','session','weight']).cumcount()
print (df)
   exercise  session  weight  velocity  set  rep  set1  rep1
0     squat        0     100      0.30    0    0     0     0
1     squat        0     100      0.25    0    1     0     1
2     squat        0     120      0.20    1    0     1     0
3     squat        0     120      0.15    1    1     1     1
4     bench        0      80      0.30    0    0     0     0
5     bench        0      80      0.25    0    1     0     1
6     bench        0     100      0.20    1    0     1     0
7     bench        0     110      0.15    1    1     2     0
8     squat        1     120      0.30    0    0     0     0
9     squat        1     130      0.25    1    0     1     0
10    squat        1     140      0.20    2    0     2     0
11    squat        1     150      0.15    3    0     3     0
12    bench        1      80      0.30    0    0     0     0
13    bench        1      90      0.25    1    0     1     0
14    bench        1     100      0.20    2    0     2     0
15    bench        1     110      0.15    3    0     3     0

Answered By: jezrael

Answer 2

As the question describe, we should count when there is a change, I suggest the following solution:

data = {'exercise': ['squat', 'squat', 'squat', 'squat', 'bench', 'bench', 'bench', 'bench', 'squat', 'squat', 'squat', 'squat', 'bench', 'bench', 'bench', 'bench'],
        'session': [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1],
        'weight': [100, 100, 120, 120, 80, 80, 100, 110, 120, 130, 140, 150, 80, 90, 100, 110],
        'velocity': [0.30, 0.25, 0.20, 0.15, 0.30, 0.25, 0.20, 0.15, 0.30, 0.25, 0.20, 0.15, 0.30, 0.25, 0.20, 0.15],
        'set': [0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 2, 3, 0, 1, 2, 3],
        'rep': [0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0]}
df = pd.DataFrame(data, columns = data.keys())


# df['velocity'] = 0.30
# df['weight'] = 100
# print(df)

grp_rep = ['exercise', 'session',  'weight']
grp_set = ['exercise','session']
def is_diff(arr):
    return [0] + [int(d) for d in arr[:-1]!=arr[1:]]
    
df['rep_sol'] = df.groupby(grp_rep)['velocity'].transform(lambda x: is_diff(x.values))
df['set_sol'] = df.groupby(grp_set)['weight'].transform(lambda x: is_diff(x.values))
df['rep_sol'] = df.groupby(grp_rep)['rep_sol'].cumsum().astype(int)
df['set_sol'] = df.groupby(grp_set)['set_sol'].cumsum().astype(int)
cols = ['exercise', 'session',  'weight', 'velocity', 'set', 'set_sol', 'rep', 'rep_sol']
print(df[cols])

which return:

   exercise  session  weight  velocity  set  set_sol  rep  rep_sol
0     squat        0     100      0.30    0        0    0        0
1     squat        0     100      0.25    0        0    1        1
2     squat        0     120      0.20    1        1    0        0
3     squat        0     120      0.15    1        1    1        1
4     bench        0      80      0.30    0        0    0        0
5     bench        0      80      0.25    0        0    1        1
6     bench        0     100      0.20    1        1    0        0
7     bench        0     110      0.15    1        2    1        0
8     squat        1     120      0.30    0        0    0        0
9     squat        1     130      0.25    1        1    0        0
10    squat        1     140      0.20    2        2    0        0
11    squat        1     150      0.15    3        3    0        0
12    bench        1      80      0.30    0        0    0        0
13    bench        1      90      0.25    1        1    0        0
14    bench        1     100      0.20    2        2    0        0
15    bench        1     110      0.15    3        3    0        0

This solution still hold if we test for weightand velocityconstant.

data = {'exercise': ['squat', 'squat', 'squat', 'squat', 'bench', 'bench', 'bench', 'bench', 'squat', 'squat', 'squat', 'squat', 'bench', 'bench', 'bench', 'bench'],
        'session': [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1],
        'weight': [100, 100, 120, 120, 80, 80, 100, 110, 120, 130, 140, 150, 80, 90, 100, 110],
        'velocity': [0.30, 0.25, 0.20, 0.15, 0.30, 0.25, 0.20, 0.15, 0.30, 0.25, 0.20, 0.15, 0.30, 0.25, 0.20, 0.15],
        'set': [0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 2, 3, 0, 1, 2, 3],
        'rep': [0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0]}
df = pd.DataFrame(data, columns = data.keys())

df['velocity'] = 0.30
df['weight'] = 100
# print(df)

grp_rep = ['exercise', 'session',  'weight']
grp_set = ['exercise','session']
def is_diff(arr):
    return [0] + [int(d) for d in arr[:-1]!=arr[1:]]
    
df['rep_sol'] = df.groupby(grp_rep)['velocity'].transform(lambda x: is_diff(x.values))
df['set_sol'] = df.groupby(grp_set)['weight'].transform(lambda x: is_diff(x.values))
df['rep_sol'] = df.groupby(grp_rep)['rep_sol'].cumsum().astype(int)
df['set_sol'] = df.groupby(grp_set)['set_sol'].cumsum().astype(int)
#cols = ['exercise', 'session',  'weight', 'velocity', 'set', 'set_sol', 'rep', 'rep_sol']
cols = ['exercise', 'session',  'weight', 'velocity', 'set_sol', 'rep_sol']
print(df[cols])

which will return no changes as what should be expected:

   exercise  session  weight  velocity  set_sol  rep_sol
0     squat        0     100       0.3        0        0
1     squat        0     100       0.3        0        0
2     squat        0     100       0.3        0        0
3     squat        0     100       0.3        0        0
4     bench        0     100       0.3        0        0
5     bench        0     100       0.3        0        0
6     bench        0     100       0.3        0        0
7     bench        0     100       0.3        0        0
8     squat        1     100       0.3        0        0
9     squat        1     100       0.3        0        0
10    squat        1     100       0.3        0        0
11    squat        1     100       0.3        0        0
12    bench        1     100       0.3        0        0
13    bench        1     100       0.3        0        0
14    bench        1     100       0.3        0        0
15    bench        1     100       0.3        0        0

Answered By: antoine

Answer 3

After experimenting a bit, I ended up using this solution:

df['set_'] = df.groupby(['exercise', 'session'])['weight'].apply(lambda x: (x != x.shift()).cumsum() - 1)
df['rep_'] = df.groupby(['exercise', 'session', 'set_']).cumcount()

It works as expected for the posted example and in my real dataset with much more data and more scenarios.

data = {'exercise': ['squat', 'squat', 'squat', 'squat', 'bench', 'bench', 'bench', 'bench', 'squat', 'squat', 'squat', 'squat', 'bench', 'bench', 'bench', 'bench'],
        'session': [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1],
        'weight': [100, 100, 120, 120, 80, 80, 100, 110, 120, 130, 140, 150, 80, 90, 100, 110],
        'velocity': [0.30, 0.25, 0.20, 0.15, 0.30, 0.25, 0.20, 0.15, 0.30, 0.25, 0.20, 0.15, 0.30, 0.25, 0.20, 0.15],
        'set': [0, 0, 1, 1, 0, 0, 1, 2, 0, 1, 2, 3, 0, 1, 2, 3],
        'rep': [0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]}
df = pd.DataFrame(data, columns = data.keys())

df['set_'] = df.groupby(['exercise', 'session'])['weight'].apply(lambda x: (x != x.shift()).cumsum() - 1)
df['rep_'] = df.groupby(['exercise', 'session', 'set_']).cumcount()
df['as_expected'] = (df['set_'] == df['set']) & (df['rep_'] == df['rep'])

print(df)

   exercise  session  weight  velocity  set  rep  set_  rep_  as_expected
0     squat        0     100      0.30    0    0     0     0         True
1     squat        0     100      0.25    0    1     0     1         True
2     squat        0     120      0.20    1    0     1     0         True
3     squat        0     120      0.15    1    1     1     1         True
4     bench        0      80      0.30    0    0     0     0         True
5     bench        0      80      0.25    0    1     0     1         True
6     bench        0     100      0.20    1    0     1     0         True
7     bench        0     110      0.15    2    0     2     0         True
8     squat        1     120      0.30    0    0     0     0         True
9     squat        1     130      0.25    1    0     1     0         True
10    squat        1     140      0.20    2    0     2     0         True
11    squat        1     150      0.15    3    0     3     0         True
12    bench        1      80      0.30    0    0     0     0         True
13    bench        1      90      0.25    1    0     1     0         True
14    bench        1     100      0.20    2    0     2     0         True
15    bench        1     110      0.15    3    0     3     0         True

Answered By: Mattias Thalén

Pandas Cumcount() over multiple columns

Question:

Answers: