Sort by column and append counter using pandas

Question:

I have a large data frame (>1m rows, 10+ cols) that I need to do the following to:

  • Group by two of the columns (A & B in example)
  • Sort within the grouping by another column (C in example)
  • Append an incremental counter to another column, incrementing in the order of the sorted values of C (E in example)
  • Leave other columns unedited (D in example)

I have got the following code working, which gives the correct results. However, it is very slow. Can anyone suggest some pandas magic to get improved performance?

import pandas as pd
import numpy as np
np.random.seed(0)

A = list()
B = list()
C = list()
D = list()
E = list()

np_alphabet = np.array(list('ABCEEFGHIJKLMNOPQRSTUVWXYZ'), dtype="|S1")
np_codes = np.random.choice(np_alphabet, [5, 10])
for a in np_codes:
    for b in range(2):
        for i in range(5):
            A.append(''.join(a))
            B.append('{}_{}'.format(b, A[-1]))
            C.append(np.random.rand())
            D.append(i)
            E.append(B[-1])

df = pd.DataFrame({
    'A': A,
    'B': B,
    'C': C,
    'D': D,
    'E': E
})
df.set_index(['A', 'B'], drop=False, inplace=True)
df.sort_index(inplace=True)

print(df)

grouped_sizes = df.groupby(level=[0, 1]).size()
num_indices = grouped_sizes.shape[0]
print_num = max(1, num_indices // 20)
for idx in grouped_sizes.index:
    if grouped_sizes[idx] > 1:
        tmp_df = df.loc[idx].sort_values('C', inplace=False)
        tmp_df['E'] = map(lambda x: '{}_{}'.format(*x), zip(range(1, tmp_df.size + 1), tmp_df['E']))
        df.loc[idx] = tmp_df
    else:
        df.loc[idx, 'E'] = '1_{}'.format(df.loc[idx, 'E'])

print(df)

which gives the following output

# before
                                  A             B         C  D             E
A          B                                                                
AXEJWVIZMZ 0_AXEJWVIZMZ  AXEJWVIZMZ  0_AXEJWVIZMZ  0.954914  0  0_AXEJWVIZMZ
           0_AXEJWVIZMZ  AXEJWVIZMZ  0_AXEJWVIZMZ  0.758615  1  0_AXEJWVIZMZ
           0_AXEJWVIZMZ  AXEJWVIZMZ  0_AXEJWVIZMZ  0.952573  2  0_AXEJWVIZMZ
           0_AXEJWVIZMZ  AXEJWVIZMZ  0_AXEJWVIZMZ  0.903142  3  0_AXEJWVIZMZ
           0_AXEJWVIZMZ  AXEJWVIZMZ  0_AXEJWVIZMZ  0.154262  4  0_AXEJWVIZMZ
           1_AXEJWVIZMZ  AXEJWVIZMZ  1_AXEJWVIZMZ  0.560586  0  1_AXEJWVIZMZ
           1_AXEJWVIZMZ  AXEJWVIZMZ  1_AXEJWVIZMZ  0.528869  1  1_AXEJWVIZMZ
           1_AXEJWVIZMZ  AXEJWVIZMZ  1_AXEJWVIZMZ  0.115331  2  1_AXEJWVIZMZ
           1_AXEJWVIZMZ  AXEJWVIZMZ  1_AXEJWVIZMZ  0.380718  3  1_AXEJWVIZMZ
           1_AXEJWVIZMZ  AXEJWVIZMZ  1_AXEJWVIZMZ  0.147092  4  1_AXEJWVIZMZ
BVBSPSACVA 0_BVBSPSACVA  BVBSPSACVA  0_BVBSPSACVA  0.824997  0  0_BVBSPSACVA
           0_BVBSPSACVA  BVBSPSACVA  0_BVBSPSACVA  0.264456  1  0_BVBSPSACVA
           0_BVBSPSACVA  BVBSPSACVA  0_BVBSPSACVA  0.282663  2  0_BVBSPSACVA
           0_BVBSPSACVA  BVBSPSACVA  0_BVBSPSACVA  0.678287  3  0_BVBSPSACVA
           0_BVBSPSACVA  BVBSPSACVA  0_BVBSPSACVA  0.409996  4  0_BVBSPSACVA
           1_BVBSPSACVA  BVBSPSACVA  1_BVBSPSACVA  0.149984  0  1_BVBSPSACVA
           1_BVBSPSACVA  BVBSPSACVA  1_BVBSPSACVA  0.711210  1  1_BVBSPSACVA
           1_BVBSPSACVA  BVBSPSACVA  1_BVBSPSACVA  0.840399  2  1_BVBSPSACVA
           1_BVBSPSACVA  BVBSPSACVA  1_BVBSPSACVA  0.804939  3  1_BVBSPSACVA
           1_BVBSPSACVA  BVBSPSACVA  1_BVBSPSACVA  0.290150  4  1_BVBSPSACVA
FHNMIRQRSP 0_FHNMIRQRSP  FHNMIRQRSP  0_FHNMIRQRSP  0.119058  0  0_FHNMIRQRSP
           0_FHNMIRQRSP  FHNMIRQRSP  0_FHNMIRQRSP  0.021955  1  0_FHNMIRQRSP
           0_FHNMIRQRSP  FHNMIRQRSP  0_FHNMIRQRSP  0.299527  2  0_FHNMIRQRSP
           0_FHNMIRQRSP  FHNMIRQRSP  0_FHNMIRQRSP  0.449371  3  0_FHNMIRQRSP
           0_FHNMIRQRSP  FHNMIRQRSP  0_FHNMIRQRSP  0.179845  4  0_FHNMIRQRSP
           1_FHNMIRQRSP  FHNMIRQRSP  1_FHNMIRQRSP  0.075765  0  1_FHNMIRQRSP
           1_FHNMIRQRSP  FHNMIRQRSP  1_FHNMIRQRSP  0.413373  1  1_FHNMIRQRSP
           1_FHNMIRQRSP  FHNMIRQRSP  1_FHNMIRQRSP  0.835250  2  1_FHNMIRQRSP
           1_FHNMIRQRSP  FHNMIRQRSP  1_FHNMIRQRSP  0.371984  3  1_FHNMIRQRSP
           1_FHNMIRQRSP  FHNMIRQRSP  1_FHNMIRQRSP  0.265494  4  1_FHNMIRQRSP
TJSECSLWFT 0_TJSECSLWFT  TJSECSLWFT  0_TJSECSLWFT  0.804553  0  0_TJSECSLWFT
           0_TJSECSLWFT  TJSECSLWFT  0_TJSECSLWFT  0.376646  1  0_TJSECSLWFT
           0_TJSECSLWFT  TJSECSLWFT  0_TJSECSLWFT  0.904908  2  0_TJSECSLWFT
           0_TJSECSLWFT  TJSECSLWFT  0_TJSECSLWFT  0.274501  3  0_TJSECSLWFT
           0_TJSECSLWFT  TJSECSLWFT  0_TJSECSLWFT  0.820866  4  0_TJSECSLWFT
           1_TJSECSLWFT  TJSECSLWFT  1_TJSECSLWFT  0.886687  0  1_TJSECSLWFT
           1_TJSECSLWFT  TJSECSLWFT  1_TJSECSLWFT  0.198887  1  1_TJSECSLWFT
           1_TJSECSLWFT  TJSECSLWFT  1_TJSECSLWFT  0.857795  2  1_TJSECSLWFT
           1_TJSECSLWFT  TJSECSLWFT  1_TJSECSLWFT  0.326926  3  1_TJSECSLWFT
           1_TJSECSLWFT  TJSECSLWFT  1_TJSECSLWFT  0.116743  4  1_TJSECSLWFT
WXEKPQSLQK 0_WXEKPQSLQK  WXEKPQSLQK  0_WXEKPQSLQK  0.249891  0  0_WXEKPQSLQK
           0_WXEKPQSLQK  WXEKPQSLQK  0_WXEKPQSLQK  0.945414  1  0_WXEKPQSLQK
           0_WXEKPQSLQK  WXEKPQSLQK  0_WXEKPQSLQK  0.235062  2  0_WXEKPQSLQK
           0_WXEKPQSLQK  WXEKPQSLQK  0_WXEKPQSLQK  0.082703  3  0_WXEKPQSLQK
           0_WXEKPQSLQK  WXEKPQSLQK  0_WXEKPQSLQK  0.894169  4  0_WXEKPQSLQK
           1_WXEKPQSLQK  WXEKPQSLQK  1_WXEKPQSLQK  0.595575  0  1_WXEKPQSLQK
           1_WXEKPQSLQK  WXEKPQSLQK  1_WXEKPQSLQK  0.769144  1  1_WXEKPQSLQK
           1_WXEKPQSLQK  WXEKPQSLQK  1_WXEKPQSLQK  0.917691  2  1_WXEKPQSLQK
           1_WXEKPQSLQK  WXEKPQSLQK  1_WXEKPQSLQK  0.567448  3  1_WXEKPQSLQK
           1_WXEKPQSLQK  WXEKPQSLQK  1_WXEKPQSLQK  0.832299  4  1_WXEKPQSLQK

and

# after
                                  A             B         C  D               E
A          B                                                                  
AXEJWVIZMZ 0_AXEJWVIZMZ  AXEJWVIZMZ  0_AXEJWVIZMZ  0.154262  4  1_0_AXEJWVIZMZ
           0_AXEJWVIZMZ  AXEJWVIZMZ  0_AXEJWVIZMZ  0.758615  1  2_0_AXEJWVIZMZ
           0_AXEJWVIZMZ  AXEJWVIZMZ  0_AXEJWVIZMZ  0.903142  3  3_0_AXEJWVIZMZ
           0_AXEJWVIZMZ  AXEJWVIZMZ  0_AXEJWVIZMZ  0.952573  2  4_0_AXEJWVIZMZ
           0_AXEJWVIZMZ  AXEJWVIZMZ  0_AXEJWVIZMZ  0.954914  0  5_0_AXEJWVIZMZ
           1_AXEJWVIZMZ  AXEJWVIZMZ  1_AXEJWVIZMZ  0.115331  2  1_1_AXEJWVIZMZ
           1_AXEJWVIZMZ  AXEJWVIZMZ  1_AXEJWVIZMZ  0.147092  4  2_1_AXEJWVIZMZ
           1_AXEJWVIZMZ  AXEJWVIZMZ  1_AXEJWVIZMZ  0.380718  3  3_1_AXEJWVIZMZ
           1_AXEJWVIZMZ  AXEJWVIZMZ  1_AXEJWVIZMZ  0.528869  1  4_1_AXEJWVIZMZ
           1_AXEJWVIZMZ  AXEJWVIZMZ  1_AXEJWVIZMZ  0.560586  0  5_1_AXEJWVIZMZ
BVBSPSACVA 0_BVBSPSACVA  BVBSPSACVA  0_BVBSPSACVA  0.264456  1  1_0_BVBSPSACVA
           0_BVBSPSACVA  BVBSPSACVA  0_BVBSPSACVA  0.282663  2  2_0_BVBSPSACVA
           0_BVBSPSACVA  BVBSPSACVA  0_BVBSPSACVA  0.409996  4  3_0_BVBSPSACVA
           0_BVBSPSACVA  BVBSPSACVA  0_BVBSPSACVA  0.678287  3  4_0_BVBSPSACVA
           0_BVBSPSACVA  BVBSPSACVA  0_BVBSPSACVA  0.824997  0  5_0_BVBSPSACVA
           1_BVBSPSACVA  BVBSPSACVA  1_BVBSPSACVA  0.149984  0  1_1_BVBSPSACVA
           1_BVBSPSACVA  BVBSPSACVA  1_BVBSPSACVA  0.290150  4  2_1_BVBSPSACVA
           1_BVBSPSACVA  BVBSPSACVA  1_BVBSPSACVA  0.711210  1  3_1_BVBSPSACVA
           1_BVBSPSACVA  BVBSPSACVA  1_BVBSPSACVA  0.804939  3  4_1_BVBSPSACVA
           1_BVBSPSACVA  BVBSPSACVA  1_BVBSPSACVA  0.840399  2  5_1_BVBSPSACVA
FHNMIRQRSP 0_FHNMIRQRSP  FHNMIRQRSP  0_FHNMIRQRSP  0.021955  1  1_0_FHNMIRQRSP
           0_FHNMIRQRSP  FHNMIRQRSP  0_FHNMIRQRSP  0.119058  0  2_0_FHNMIRQRSP
           0_FHNMIRQRSP  FHNMIRQRSP  0_FHNMIRQRSP  0.179845  4  3_0_FHNMIRQRSP
           0_FHNMIRQRSP  FHNMIRQRSP  0_FHNMIRQRSP  0.299527  2  4_0_FHNMIRQRSP
           0_FHNMIRQRSP  FHNMIRQRSP  0_FHNMIRQRSP  0.449371  3  5_0_FHNMIRQRSP
           1_FHNMIRQRSP  FHNMIRQRSP  1_FHNMIRQRSP  0.075765  0  1_1_FHNMIRQRSP
           1_FHNMIRQRSP  FHNMIRQRSP  1_FHNMIRQRSP  0.265494  4  2_1_FHNMIRQRSP
           1_FHNMIRQRSP  FHNMIRQRSP  1_FHNMIRQRSP  0.371984  3  3_1_FHNMIRQRSP
           1_FHNMIRQRSP  FHNMIRQRSP  1_FHNMIRQRSP  0.413373  1  4_1_FHNMIRQRSP
           1_FHNMIRQRSP  FHNMIRQRSP  1_FHNMIRQRSP  0.835250  2  5_1_FHNMIRQRSP
TJSECSLWFT 0_TJSECSLWFT  TJSECSLWFT  0_TJSECSLWFT  0.274501  3  1_0_TJSECSLWFT
           0_TJSECSLWFT  TJSECSLWFT  0_TJSECSLWFT  0.376646  1  2_0_TJSECSLWFT
           0_TJSECSLWFT  TJSECSLWFT  0_TJSECSLWFT  0.804553  0  3_0_TJSECSLWFT
           0_TJSECSLWFT  TJSECSLWFT  0_TJSECSLWFT  0.820866  4  4_0_TJSECSLWFT
           0_TJSECSLWFT  TJSECSLWFT  0_TJSECSLWFT  0.904908  2  5_0_TJSECSLWFT
           1_TJSECSLWFT  TJSECSLWFT  1_TJSECSLWFT  0.116743  4  1_1_TJSECSLWFT
           1_TJSECSLWFT  TJSECSLWFT  1_TJSECSLWFT  0.198887  1  2_1_TJSECSLWFT
           1_TJSECSLWFT  TJSECSLWFT  1_TJSECSLWFT  0.326926  3  3_1_TJSECSLWFT
           1_TJSECSLWFT  TJSECSLWFT  1_TJSECSLWFT  0.857795  2  4_1_TJSECSLWFT
           1_TJSECSLWFT  TJSECSLWFT  1_TJSECSLWFT  0.886687  0  5_1_TJSECSLWFT
WXEKPQSLQK 0_WXEKPQSLQK  WXEKPQSLQK  0_WXEKPQSLQK  0.082703  3  1_0_WXEKPQSLQK
           0_WXEKPQSLQK  WXEKPQSLQK  0_WXEKPQSLQK  0.235062  2  2_0_WXEKPQSLQK
           0_WXEKPQSLQK  WXEKPQSLQK  0_WXEKPQSLQK  0.249891  0  3_0_WXEKPQSLQK
           0_WXEKPQSLQK  WXEKPQSLQK  0_WXEKPQSLQK  0.894169  4  4_0_WXEKPQSLQK
           0_WXEKPQSLQK  WXEKPQSLQK  0_WXEKPQSLQK  0.945414  1  5_0_WXEKPQSLQK
           1_WXEKPQSLQK  WXEKPQSLQK  1_WXEKPQSLQK  0.567448  3  1_1_WXEKPQSLQK
           1_WXEKPQSLQK  WXEKPQSLQK  1_WXEKPQSLQK  0.595575  0  2_1_WXEKPQSLQK
           1_WXEKPQSLQK  WXEKPQSLQK  1_WXEKPQSLQK  0.769144  1  3_1_WXEKPQSLQK
           1_WXEKPQSLQK  WXEKPQSLQK  1_WXEKPQSLQK  0.832299  4  4_1_WXEKPQSLQK
           1_WXEKPQSLQK  WXEKPQSLQK  1_WXEKPQSLQK  0.917691  2  5_1_WXEKPQSLQK

EDIT: A new suggestion, timing, and testing of other suggested answers

from __future__ import print_function, division
from timeit import Timer
import pandas as pd
import numpy as np


def create_df():
    np.random.seed(0)
    A = list()
    B = list()
    C = list()
    D = list()
    E = list()

    np_alphabet = np.array(list('ABCEEFGHIJKLMNOPQRSTUVWXYZ'), dtype="|S1")
    np_codes = np.random.choice(np_alphabet, [100, 10])
    for a in np_codes:
        for b in range(2):
            for i in range(5):
                A.append(''.join(a))
                B.append('{}_{}'.format(b, A[-1]))
                C.append(np.random.rand())
                D.append(i)
                E.append(B[-1])

    df = pd.DataFrame({
        'A': A,
        'B': B,
        'C': C,
        'D': D,
        'E': E
    })
    return df.copy()


def method1(df):
    df.set_index(['A', 'B'], drop=False, inplace=True)
    df.sort_index(inplace=True)
    grouped_sizes = df.groupby(level=[0, 1]).size()
    for idx in grouped_sizes.index:
        if grouped_sizes[idx] > 1:
            tmp_df = df.loc[idx].sort_values('C', inplace=False)
            tmp_df['E'] = map(lambda x: '{}_{}'.format(*x), zip(range(1, tmp_df.size + 1), tmp_df['E']))
            df.loc[idx] = tmp_df
        else:
            df.loc[idx, 'E'] = '1_{}'.format(df.loc[idx, 'E'])
    return df


def method1a(df):
    df.set_index(['A', 'B'], drop=False, inplace=True)
    df.sort_values(['A', 'B', 'C'], inplace=True)
    grouped_sizes = df.groupby(level=[0, 1]).size()
    for idx in grouped_sizes.index:
        if grouped_sizes[idx] > 1:
            df.loc[idx, 'E'] = map(lambda x: '{}_{}'.format(*x), zip(range(1, grouped_sizes[idx] + 1), df.loc[idx, 'E']))
        else:
            df.loc[idx, 'E'] = '1_{}'.format(df.loc[idx, 'E'])
    return df


def method2(df):
    df.set_index(['A', 'B'], drop=False, inplace=True)
    df['F'] = 0
    df.sort_values(['A', 'B', 'C'], inplace=True)
    grouped_sizes = df.groupby(level=[0, 1]).size()
    for idx in grouped_sizes.index:
        if grouped_sizes[idx] > 1:
            df.loc[idx, 'F'] = range(1, grouped_sizes[idx] + 1)
        else:
            df.loc[idx, 'F'] = 1
    df['E'] = df['F'].map(str) + '_' + df['E']
    df.drop('F', axis=1, inplace=True)
    return df


def method3(df):
    df.set_index(['A', 'B'], drop=False, inplace=True)
    df['F'] = 0
    df.sort_values(['A', 'B', 'C'], inplace=True)
    grouped_sizes = df.groupby(level=[0, 1]).size()
    for idx in grouped_sizes.index:
        if grouped_sizes[idx] > 1:
            df.loc[idx, 'F'] = map(str, range(1, grouped_sizes[idx] + 1))
        else:
            df.loc[idx, 'F'] = '1'
    df['E'] = df['F'] + '_' + df['E']
    df.drop('F', axis=1, inplace=True)
    return df


def method4(df):
    prefixes = df.groupby(['A', 'B']).C.apply(pd.Series.argsort).add(1).astype(str)
    df['E'] = prefixes + '_' + df.E
    return df


def method5(df):
    df.set_index(['A', 'B'], drop=False, inplace=True)
    df.sort_values(['A', 'B', 'C'], inplace=True)
    df['E'] = df.groupby(level=[0, 1]).cumcount().add(1).astype(str) + '_' + df['E']
    return df


def assert_success(df):
    row = df[(df['A'] == 'AEVFGIURPE') & (df['B'] == '0_AEVFGIURPE') & (df['D'] == 2)].iloc[0]
    if not np.allclose(row['C'], 0.381397) or row['E'] != '3_0_AEVFGIURPE':
        print('A: method{}() failed: {} != 0.871083 or {} != 5_1_XOYRFZNIJU'.format(func, row['C'], row['E']))
        return
    row = df[(df['A'] == 'XOYRFZNIJU') & (df['B'] == '1_XOYRFZNIJU') & (df['D'] == 1)].iloc[0]
    if not np.allclose(row['C'], 0.871083) or row['E'] != '5_1_XOYRFZNIJU':
        print('B: method{}() failed: {} != 0.871083 or {} != 5_1_XOYRFZNIJU'.format(func, row['C'], row['E']))
        return


functions = list()
functions.append('1')
functions.append('1a')
functions.append('2')
functions.append('3')
functions.append('4')
functions.append('5')
for func in functions:
    print('method{}'.format(func),
          Timer(setup='from __main__ import create_df, assert_success, method{} as func'.format(func),
                stmt='df = create_df(); df = func(df); assert_success(df)').repeat(number=10))

Which gives the following results:

method1 [6.581194877624512, 6.625822067260742, 6.722187042236328]
method1a [1.9003210067749023, 1.9387969970703125, 1.9142169952392578]
method2 [0.9547598361968994, 0.9532740116119385, 0.9760739803314209]
method3 [1.0121638774871826, 1.0000989437103271, 0.9709858894348145]
B: method4() failed: 0.871082572438 != 0.871083 or 1_1_XOYRFZNIJU != 5_1_XOYRFZNIJU
B: method4() failed: 0.871082572438 != 0.871083 or 1_1_XOYRFZNIJU != 5_1_XOYRFZNIJU
B: method4() failed: 0.871082572438 != 0.871083 or 1_1_XOYRFZNIJU != 5_1_XOYRFZNIJU
B: method4() failed: 0.871082572438 != 0.871083 or 1_1_XOYRFZNIJU != 5_1_XOYRFZNIJU
B: method4() failed: 0.871082572438 != 0.871083 or 1_1_XOYRFZNIJU != 5_1_XOYRFZNIJU
B: method4() failed: 0.871082572438 != 0.871083 or 1_1_XOYRFZNIJU != 5_1_XOYRFZNIJU
B: method4() failed: 0.871082572438 != 0.871083 or 1_1_XOYRFZNIJU != 5_1_XOYRFZNIJU
B: method4() failed: 0.871082572438 != 0.871083 or 1_1_XOYRFZNIJU != 5_1_XOYRFZNIJU
B: method4() failed: 0.871082572438 != 0.871083 or 1_1_XOYRFZNIJU != 5_1_XOYRFZNIJU
B: method4() failed: 0.871082572438 != 0.871083 or 1_1_XOYRFZNIJU != 5_1_XOYRFZNIJU
B: method4() failed: 0.871082572438 != 0.871083 or 1_1_XOYRFZNIJU != 5_1_XOYRFZNIJU
B: method4() failed: 0.871082572438 != 0.871083 or 1_1_XOYRFZNIJU != 5_1_XOYRFZNIJU
B: method4() failed: 0.871082572438 != 0.871083 or 1_1_XOYRFZNIJU != 5_1_XOYRFZNIJU
B: method4() failed: 0.871082572438 != 0.871083 or 1_1_XOYRFZNIJU != 5_1_XOYRFZNIJU
B: method4() failed: 0.871082572438 != 0.871083 or 1_1_XOYRFZNIJU != 5_1_XOYRFZNIJU
B: method4() failed: 0.871082572438 != 0.871083 or 1_1_XOYRFZNIJU != 5_1_XOYRFZNIJU
B: method4() failed: 0.871082572438 != 0.871083 or 1_1_XOYRFZNIJU != 5_1_XOYRFZNIJU
B: method4() failed: 0.871082572438 != 0.871083 or 1_1_XOYRFZNIJU != 5_1_XOYRFZNIJU
B: method4() failed: 0.871082572438 != 0.871083 or 1_1_XOYRFZNIJU != 5_1_XOYRFZNIJU
B: method4() failed: 0.871082572438 != 0.871083 or 1_1_XOYRFZNIJU != 5_1_XOYRFZNIJU
B: method4() failed: 0.871082572438 != 0.871083 or 1_1_XOYRFZNIJU != 5_1_XOYRFZNIJU
B: method4() failed: 0.871082572438 != 0.871083 or 1_1_XOYRFZNIJU != 5_1_XOYRFZNIJU
B: method4() failed: 0.871082572438 != 0.871083 or 1_1_XOYRFZNIJU != 5_1_XOYRFZNIJU
B: method4() failed: 0.871082572438 != 0.871083 or 1_1_XOYRFZNIJU != 5_1_XOYRFZNIJU
B: method4() failed: 0.871082572438 != 0.871083 or 1_1_XOYRFZNIJU != 5_1_XOYRFZNIJU
B: method4() failed: 0.871082572438 != 0.871083 or 1_1_XOYRFZNIJU != 5_1_XOYRFZNIJU
B: method4() failed: 0.871082572438 != 0.871083 or 1_1_XOYRFZNIJU != 5_1_XOYRFZNIJU
B: method4() failed: 0.871082572438 != 0.871083 or 1_1_XOYRFZNIJU != 5_1_XOYRFZNIJU
B: method4() failed: 0.871082572438 != 0.871083 or 1_1_XOYRFZNIJU != 5_1_XOYRFZNIJU
B: method4() failed: 0.871082572438 != 0.871083 or 1_1_XOYRFZNIJU != 5_1_XOYRFZNIJU
method4 [0.3202958106994629, 0.3348369598388672, 0.33800482749938965]
method5 [0.11518096923828125, 0.10490703582763672, 0.09626197814941406]
Asked By: BrT

||

Answers:

Sort your dataframe first, then use the .groupby() method with a .cumcount():

df.sort_values(['A','B','C'], inplace = True)
df['D'] = df.groupby(['A','B']).cumcount()

If you want to sort it the other way, supply an ascending parameter to .sort_values(): ascending = [True,True,False]. You also actually don’t need to sort by A and B if you want to retain the original ordering of the data (they will still be ‘ranked’ correctly).

A general remark: if you find yourself using a for loop in pandas to iterate over data, chances are that you can speed up your code substantially by making use of pandas’ vectorization.

Answered By: Ken Wei

I think you need sort_values first, then groupby + cumcount for counter, then add 1 and convert to str:

df.sort_values(['A', 'B', 'C'], inplace=True)
df['E'] = df.groupby(level=['A','B']).cumcount().add(1).astype(str) + '_' + df['E']
print(df)
                                  A             B         C  D               E
A          B                                                                  
MPVAEEHJTV 0_MPVAEEHJTV  MPVAEEHJTV  0_MPVAEEHJTV  0.264556  3  1_0_MPVAEEHJTV
           0_MPVAEEHJTV  MPVAEEHJTV  0_MPVAEEHJTV  0.414662  2  2_0_MPVAEEHJTV
           0_MPVAEEHJTV  MPVAEEHJTV  0_MPVAEEHJTV  0.521848  1  3_0_MPVAEEHJTV
           0_MPVAEEHJTV  MPVAEEHJTV  0_MPVAEEHJTV  0.774234  4  4_0_MPVAEEHJTV
           0_MPVAEEHJTV  MPVAEEHJTV  0_MPVAEEHJTV  0.944669  0  5_0_MPVAEEHJTV
           1_MPVAEEHJTV  MPVAEEHJTV  1_MPVAEEHJTV  0.018790  2  1_1_MPVAEEHJTV
           1_MPVAEEHJTV  MPVAEEHJTV  1_MPVAEEHJTV  0.456150  0  2_1_MPVAEEHJTV
           1_MPVAEEHJTV  MPVAEEHJTV  1_MPVAEEHJTV  0.568434  1  3_1_MPVAEEHJTV
           1_MPVAEEHJTV  MPVAEEHJTV  1_MPVAEEHJTV  0.612096  4  4_1_MPVAEEHJTV
           1_MPVAEEHJTV  MPVAEEHJTV  1_MPVAEEHJTV  0.617635  3  5_1_MPVAEEHJTV
RTTTOHABJZ 0_RTTTOHABJZ  RTTTOHABJZ  0_RTTTOHABJZ  0.096098  2  1_0_RTTTOHABJZ
           0_RTTTOHABJZ  RTTTOHABJZ  0_RTTTOHABJZ  0.097101  0  2_0_RTTTOHABJZ
           0_RTTTOHABJZ  RTTTOHABJZ  0_RTTTOHABJZ  0.468651  4  3_0_RTTTOHABJZ
           0_RTTTOHABJZ  RTTTOHABJZ  0_RTTTOHABJZ  0.837945  1  4_0_RTTTOHABJZ
           0_RTTTOHABJZ  RTTTOHABJZ  0_RTTTOHABJZ  0.976459  3  5_0_RTTTOHABJZ
           1_RTTTOHABJZ  RTTTOHABJZ  1_RTTTOHABJZ  0.039188  3  1_1_RTTTOHABJZ
...
...

EDIT:

It seems like bug if dont define levels by columns only if column names are same as index names:

df['E'] = df.groupby(['A','B']).cumcount().add(1).astype(str) + '_' + df['E']

FutureWarning: ‘A’ is both a column name and an index level.
Defaulting to column but this will raise an ambiguity error in a future version
df[‘E’] = df.groupby(column=[‘A’,’B’]).cumcount().add(1).astype(str) + ‘_’ + df[‘E’]

FutureWarning: ‘B’ is both a column name and an index level.
Defaulting to column but this will raise an ambiguity error in a future version
df[‘E’] = df.groupby(column=[‘A’,’B’]).cumcount().add(1).astype(str) + ‘_’ + df[‘E’]

One possible solution is rename_axis if necessary groupby by columns names (or rename columns which same names as index names):

df.sort_values(['A', 'B', 'C'], inplace=True)
df = df.rename_axis(('A_lev','B_lev'))
df['E'] = df.groupby(['A','B']).cumcount().add(1).astype(str) + '_' + df['E']
print(df)
                                  A             B         C  D               E
A_lev      B_lev                                                              
MPVAEEHJTV 0_MPVAEEHJTV  MPVAEEHJTV  0_MPVAEEHJTV  0.264556  3  1_0_MPVAEEHJTV
           0_MPVAEEHJTV  MPVAEEHJTV  0_MPVAEEHJTV  0.414662  2  2_0_MPVAEEHJTV
           0_MPVAEEHJTV  MPVAEEHJTV  0_MPVAEEHJTV  0.521848  1  3_0_MPVAEEHJTV
           0_MPVAEEHJTV  MPVAEEHJTV  0_MPVAEEHJTV  0.774234  4  4_0_MPVAEEHJTV
           0_MPVAEEHJTV  MPVAEEHJTV  0_MPVAEEHJTV  0.944669  0  5_0_MPVAEEHJTV
           1_MPVAEEHJTV  MPVAEEHJTV  1_MPVAEEHJTV  0.018790  2  1_1_MPVAEEHJTV
           1_MPVAEEHJTV  MPVAEEHJTV  1_MPVAEEHJTV  0.456150  0  2_1_MPVAEEHJTV
           1_MPVAEEHJTV  MPVAEEHJTV  1_MPVAEEHJTV  0.568434  1  3_1_MPVAEEHJTV
           1_MPVAEEHJTV  MPVAEEHJTV  1_MPVAEEHJTV  0.612096  4  4_1_MPVAEEHJTV
           1_MPVAEEHJTV  MPVAEEHJTV  1_MPVAEEHJTV  0.617635  3  5_1_MPVAEEHJTV
RTTTOHABJZ 0_RTTTOHABJZ  RTTTOHABJZ  0_RTTTOHABJZ  0.096098  2  1_0_RTTTOHABJZ
           0_RTTTOHABJZ  RTTTOHABJZ  0_RTTTOHABJZ  0.097101  0  2_0_RTTTOHABJZ
           0_RTTTOHABJZ  RTTTOHABJZ  0_RTTTOHABJZ  0.468651  4  3_0_RTTTOHABJZ
           0_RTTTOHABJZ  RTTTOHABJZ  0_RTTTOHABJZ  0.837945  1  4_0_RTTTOHABJZ
           0_RTTTOHABJZ  RTTTOHABJZ  0_RTTTOHABJZ  0.976459  3  5_0_RTTTOHABJZ
           1_RTTTOHABJZ  RTTTOHABJZ  1_RTTTOHABJZ  0.039188  3  1_1_RTTTOHABJZ
           1_RTTTOHABJZ  RTTTOHABJZ  1_RTTTOHABJZ  0.282807  4  2_1_RTTTOHABJZ
           1_RTTTOHABJZ  RTTTOHABJZ  1_RTTTOHABJZ  0.604846  1  3_1_RTTTOHABJZ
           1_RTTTOHABJZ  RTTTOHABJZ  1_RTTTOHABJZ  0.739264  2  4_1_RTTTOHABJZ
           1_RTTTOHABJZ  RTTTOHABJZ  1_RTTTOHABJZ  0.976761  0  5_1_RTTTOHABJZ
...
...
Answered By: jezrael