Looking for alternatives to increase performance in a FOR python loop

Question

I’m currently running these six FOR loops to check certain conditions and fill columns of another dataframe. Both dataframes have approximately 500 rows, so each loop I run iterates over roughly 250k combinations. Although it’s working as expected, it’s taking too much time to execute, and I haven’t found any alternatives yet.

# Brute-force matching: each loop pairs every row of calculos with every row
# of opf and sets a flag column of calculos to 1 on the first matching pair.
# NOTE: `&` does not short-circuit, so every comparison in a condition is
# evaluated for each (calculos row, opf row) pair.

# J = 1 when some opf row has the same E, F and G, with H == 'C' and I == 'OFC'.
for x, linha_calc in calculos.iterrows():
    for z, linha_opf in opf.iterrows():
        if (linha_calc['E'] == linha_opf['E']) & (
                linha_calc['F'] == linha_opf['F']) & (
                linha_calc['G'] == linha_opf['G']) & (
                linha_opf['H'] == 'C') & (
                linha_opf['I'] == 'OFC'):
            calculos.at[x, 'J'] = 1
            break  # one match is enough; continue with the next calculos row

# P = 1 when some opf row has the same K, L and M, with N == 'V' and O == 'OFV'.
for x, linha_calc in calculos.iterrows():
    for z, linha_opf in opf.iterrows():
        if (linha_calc['K'] == linha_opf['K']) & (
                linha_calc['L'] == linha_opf['L']) & (
                linha_calc['M'] == linha_opf['M']) & (
                linha_opf['N'] == 'V') & (
                linha_opf['O'] == 'OFV'):
            calculos.at[x, 'P'] = 1
            break

# V = 1 when some opf row has the same Q, R and S, with T == 'C' and U == 'OFV'.
for x, linha_calc in calculos.iterrows():
    for z, linha_opf in opf.iterrows():
        if (linha_calc['Q'] == linha_opf['Q']) & (
                linha_calc['R'] == linha_opf['R']) & (
                linha_calc['S'] == linha_opf['S']) & (
                linha_opf['T'] == 'C') & (
                linha_opf['U'] == 'OFV'):
            calculos.at[x, 'V'] = 1
            break

# W2 = 1 when some opf row has the same X, Y and Z, with W == 'V' and W1 == 'OFC'.
for x, linha_calc in calculos.iterrows():
    for z, linha_opf in opf.iterrows():
        if (linha_calc['X'] == linha_opf['X']) & (
                linha_calc['Y'] == linha_opf['Y']) & (
                linha_calc['Z'] == linha_opf['Z']) & (
                linha_opf['W'] == 'V') & (
                linha_opf['W1'] == 'OFC'):
            calculos.at[x, 'W2'] = 1
            break

# FF = 1 when some opf row has the same AA, BB and CC, with DD == 'V' and EE == 'OFC'.
for x, linha_calc in calculos.iterrows():
    for z, linha_opf in opf.iterrows():
        if (linha_calc['AA'] == linha_opf['AA']) & (
                linha_calc['BB'] == linha_opf['BB']) & (
                linha_calc['CC'] == linha_opf['CC']) & (
                linha_opf['DD'] == 'V') & (
                linha_opf['EE'] == 'OFC'):
            calculos.at[x, 'FF'] = 1
            break

# D = 1 when some opf row has the same A and B and a non-zero C.
for x, linha_calc in calculos.iterrows():
    for z, linha_opf in opf.iterrows():
        if (linha_calc['A'] == linha_opf['A']) & (
                linha_calc['B'] == linha_opf['B']) & (
                linha_opf['C'] != 0):
            calculos.at[x, 'D'] = 1
            break
    enter code here

Asked By: pedrosjo

||

Source

Answer 1

You could probably combine the loops, then set a bool value for each condition, replacing the breaks.

# Single pass over opf per calculos row: every condition is tested in the same
# inner loop, and a per-row flag replaces the `break` of the separate loops.
for x, linha_calc in calculos.iterrows():
    # The flags must be reset for each calculos row; otherwise only the first
    # row that matches a condition would ever be flagged.
    cond1, cond2 = False, False  # add one flag per remaining condition

    for z, linha_opf in opf.iterrows():
        # `not condN` first, so an already satisfied condition is skipped cheaply
        if (not cond1
                and linha_calc['E'] == linha_opf['E']
                and linha_calc['F'] == linha_opf['F']
                and linha_calc['G'] == linha_opf['G']
                and linha_opf['H'] == 'C'
                and linha_opf['I'] == 'OFC'):
            calculos.at[x, 'J'] = 1
            cond1 = True

        if (not cond2
                and linha_calc['K'] == linha_opf['K']
                and linha_calc['L'] == linha_opf['L']
                and linha_calc['M'] == linha_opf['M']
                and linha_opf['N'] == 'V'
                and linha_opf['O'] == 'OFV'):
            calculos.at[x, 'P'] = 1
            cond2 = True

        # ... repeat the same pattern for the remaining conditions ...

        # Stop scanning opf once every condition has been satisfied for this row
        if cond1 and cond2:  # extend with the remaining flags
            break

This will save you looping through the document for each case, X times.

Answered By: ZWang

Answer 2

Two methods to speed up processing

Method 1: loop using zip rather than iterrows reference
Method 2: Vectorize the process and avoid looping entirely

Testing with Dataframe with 1,000 rows:

Posted Solution took ~2 minutes
Method 1 took ~1 second (~120X speed up)
Method 2 took ~24.8 ms (~4800X speed up)

Method 1

import pandas as pd

# Utility function to handle the various for loops
def update(calculos, opf, input_columns, values, update_column):
    '''
        Set calculos[update_column] to 1 on every row of calculos that has a
        qualifying match in opf. calculos is modified in place; nothing is
        returned.

        input_columns: key columns followed by the condition columns. The key
            columns must exist in both frames; the condition columns are only
            read from opf.
        values: what the condition columns of opf are tested against.
            - list/tuple with several items: the last len(values) columns of
              input_columns must EQUAL these values, in order.
            - scalar (or a one-item list/tuple): the last column of
              input_columns must be DIFFERENT from the value (e.g. not 0).
        update_column: column of calculos that receives 1 on matching rows.
            Other rows are left unchanged; the column is created if a match
            exists and it is missing.

        Uses zip to iterate over the rows of opf and calculos once each; key
        values must be hashable because they are collected in a set.
    '''
    # Normalise values so scalars, lists and tuples share one code path
    if isinstance(values, (list, tuple)):
        expected = tuple(values)
    else:
        expected = (values,)
    n = len(expected)
    if n == 0:
        return      # no condition to test, so no row can qualify

    key_columns = list(input_columns[:-n])

    # Pass over opf: remember the key of every row whose condition columns qualify
    matching_keys = set()
    for opf_ in zip(*(opf[col] for col in input_columns)):
        tail = opf_[-n:]
        if ((n > 1 and tail == expected) or
                (n == 1 and tail[0] != expected[0])):   # scalar case means "not equal"
            matching_keys.add(opf_[:-n])

    if not matching_keys:
        return

    # Pass over calculos: a row is flagged when its key was seen in opf
    if key_columns:
        mask = [calc in matching_keys
                for calc in zip(*(calculos[col] for col in key_columns))]
    else:
        # No key columns: any qualifying opf row matches every calculos row
        mask = [True] * calculos.shape[0]

    # Assign once, after the scan; the boolean list selects rows by position
    if any(mask):
        calculos.loc[mask, update_column] = 1

Usage

import pandas as pd
import numpy as np

from random import randint, seed
import timeit

# Fixed seed so the generated frames are the same on every run
seed(1234)
N = 5            # number of rows
M = 100          # upper bound (inclusive) of the random integers

# One entry per column, in output order. An int is the lower bound of a random
# integer column; a str is the label written on even rows ('X' goes on odd rows).
column_specs = [
    ("E", 0), ("F", 0), ("G", 0), ("H", 'C'), ("I", 'OFC'), ("J", 2),
    ("K", 0), ("L", 0), ("M", 0), ("N", 'C'), ("O", 'OFC'), ("P", 2),
    ("Q", 0), ("R", 0), ("S", 0), ("T", 'C'), ("U", 'OFC'), ("V", 2),
    ("A", 0), ("B", 'C'), ("C", 'OFC'), ("D", 2),
]

# Generate data (columns are filled in listed order, which fixes the random draws)
data = {}
for column, spec in column_specs:
    if isinstance(spec, str):
        data[column] = [spec if row % 2 == 0 else 'X' for row in range(N)]
    else:
        data[column] = [randint(spec, M) for _ in range(N)]

# Both frames are built from the same data, so they start with equal contents
calculos = pd.DataFrame(data)
opf = pd.DataFrame(data)

# Use update for posted for loops with parameters as follows.
# Third argument: the columns compared between the frames, followed by the
# opf columns that are tested against the fourth argument. Fifth argument:
# the calculos column that is set to 1 on matching rows.
update(calculos, opf, ['E', 'F', 'G', 'H', 'I'], ['C', 'OFC'], 'J')
update(calculos, opf, ['K', 'L', 'M', 'N', 'O'], ['V', 'OFV'], 'P')
update(calculos, opf, ['Q', 'R', 'S', 'T', 'U'], ['C', 'OFV'], 'V')
update(calculos, opf, ['A', 'B', 'C'], 0, 'D')   # scalar 0: a row matches when opf column 'C' is not 0

Output

E   F   G   H   I   J   K   L   M   N   ... Q   R   S   T   U   V   A   B   C   D
0   40  37  44  C   OFC 8   11  67  24  C   ... 33  25  36  C   OFC 32  18  C   OFC 1
1   10  95  70  X   X   63  40  7   100 X   ... 56  9   15  X   X   15  44  X   X   1
2   25  85  33  C   OFC 66  31  96  35  C   ... 58  42  63  C   OFC 63  4   C   OFC 1
3   34  13  95  X   X   22  42  82  23  X   ... 17  42  63  X   X   34  21  X   X   1
4   7   76  51  C   OFC 9   63  17  65  C   ... 11  15  24  C   OFC 42  51  C   OFC 1
5 rows × 22 columns

Performance

Summary: ~120X speed up (timing first for loop only)
Timing:

Posted Method ~ 2 minutes
Revised method using update function: ~1 second

Data Generation

N = 1000    # Number of rows
M = 100     # upper bound (inclusive) of the random integers
seed(1234)  # random seed to make results repeatable

row_ids = range(N)

# Key columns first: the comprehension fills E, F and G in that order, which
# fixes the sequence of random draws.
data = {key: [randint(0, M) for _ in row_ids] for key in ("E", "F", "G")}
# Condition columns: the label on even rows, 'X' on odd rows
data["H"] = ['X' if i % 2 else 'C' for i in row_ids]
data["I"] = ['X' if i % 2 else 'OFC' for i in row_ids]
# Column that the timed code overwrites; drawn last
data["J"] = [randint(2, M) for _ in row_ids]

# Two independent frames with equal contents
calculos = pd.DataFrame(data)
opf = pd.DataFrame(data)

Using Update

%timeit update(calculos, opf, ['E', 'F', 'G', 'H', 'I'], ['C', 'OFC'], 'J')

Result: 1.09 s ± 29.8 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)

Posted Method

%%timeit
# First loop of the question, reproduced unchanged as the timing baseline:
# every calculos row is compared with every opf row via iterrows, and J is set
# to 1 on the first opf row with equal E, F, G and H == 'C', I == 'OFC'.
for x, linha_calc in calculos.iterrows():
    for z, linha_opf in opf.iterrows():
        if (linha_calc['E'] == linha_opf['E']) & (
                linha_calc['F'] == linha_opf['F']) & (
                linha_calc['G'] == linha_opf['G']) & (
                linha_opf['H'] == 'C') & (
                linha_opf['I'] == 'OFC'):
            calculos.at[x, 'J'] = 1
            break  # one match is enough; continue with the next calculos row

Result: 2min 51s ± 6.96 s per loop (mean ± std. dev. of 7 runs, 1 loop each)
rows
for rows in linha_calc_rows:

Method 2

Update method becomes

def update(calculos, opf, input_columns, values, update_column):
    '''
        Vectorized version: set calculos[update_column] to 1 on every row of
        calculos that has a qualifying match in opf. calculos is modified in
        place; nothing is returned.

        input_columns: key columns (present in both frames) followed by the
            condition columns (read from opf only).
        values: what the condition columns of opf are tested against.
            - list/tuple with several items: the last len(values) columns of
              input_columns must EQUAL these values, in order.
            - scalar (or a one-item list/tuple): the last column of
              input_columns must be DIFFERENT from the value (e.g. not 0).
        update_column: column of calculos that receives 1 on matching rows.
    '''
    if not isinstance(values, (list, tuple)):
        values = (values,)   # make scalar into tuple

    n = len(values)
    key_columns = list(input_columns[:-n])
    condition_columns = input_columns[-n:]

    # Find rows in opf that satisfy the condition. The mask is built on opf's
    # own index so it stays aligned with opf whatever its length or index is.
    mask = pd.Series(True, index=opf.index)
    if n == 1:
        mask &= opf[condition_columns[0]] != values[0]
    else:
        for v, col in zip(values, condition_columns):
            mask &= opf[col] == v

    # Distinct keys of the qualifying rows. De-duplicating keeps the left merge
    # below at exactly one output row per calculos row.
    matching_keys = opf.loc[mask, key_columns].drop_duplicates()

    if not matching_keys.empty:
        # Merge only the key columns, so other columns of calculos (including
        # one named "_merge") cannot collide with the merge output.
        merged = calculos[key_columns].merge(
            matching_keys, on=key_columns, how="left", indicator=True)
        # The merge result has a fresh 0..n-1 index, so apply the mask by
        # position (numpy array) instead of aligning it on calculos' labels.
        matched = (merged["_merge"] == "both").to_numpy()
        calculos.loc[matched, update_column] = 1

Performance

Using same data (calculos, opf) we have:

%timeit update(calculos, opf, ['E', 'F', 'G', 'H', 'I'], ['C', 'OFC'], 'J')

24.8 ms ± 3.52 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)

Answered By: DarrylG

Looking for alternatives to increase performance in a FOR python loop

Question:

Answers: