Pandas: Create new column that contains the most recent index where a condition related to the current row is met

Question:

In the example below, I wish to return, for each row, the most recent earlier index occurrence at which the "lower" value was >= the current row's "upper" value. I am able to do this with the expected results, but the approach is not truly vectorized and is very inefficient for larger dataframes.

import pandas as pd

# Sample DataFrame
# Sample data: one row per day starting 2020-01-01.
data = {'lower': [7, 1, 6, 1, 1, 1, 1, 11, 1, 1],
        'upper': [2, 3, 4, 5, 6, 7, 8, 9, 10, 11]}

# Build the frame directly on a named DatetimeIndex instead of assigning a
# DATE column and then promoting it with set_index — same resulting frame.
df = pd.DataFrame(
    data,
    index=pd.date_range('2020-01-01', periods=len(data['lower']), name='DATE'),
)

# New column containing, for each row, the most recent earlier index whose
# "lower" value is >= the current row's "upper" value.
def get_most_recent_index(row):
    # Label-slice everything strictly before this row's timestamp: the
    # 1-minute offset moves the (inclusive) slice endpoint just before
    # row.name, so the current row is excluded.
    previous_indices = df.loc[:row.name - pd.Timedelta(minutes=1)]  
    # Latest timestamp among earlier rows whose "lower" covers this row's
    # "upper"; .max() on an empty selection yields NaT.
    recent_index = previous_indices[previous_indices['lower'] >= row['upper']].index.max()
    return recent_index

# Row-wise apply — each row re-scans all earlier rows, so this is O(n^2)
# overall; it is the slow baseline the question asks to speed up.
df['prev'] = df.apply(get_most_recent_index, axis=1) 

print(df)

How would I rewrite this to be most efficient?

EDIT:

Firstly, thank you to you all for your responses.

On the topic of performance between the four viable solutions, the clear winner is bisect, proposed by Andrej Kesely. I have excluded pyjanitor because, with any data volume approaching my actual set, it quickly runs into memory allocation errors.

baseline:
1min 35s ± 5.15 s per loop (mean ± std. dev. of 2 runs, 2 loops each)

bisect:
1.76 s ± 82.5 ms per loop (mean ± std. dev. of 2 runs, 2 loops each)

enumerate:
1min 13s ± 2.17 s per loop (mean ± std. dev. of 2 runs, 2 loops each)

import pandas as pd
import numpy as np
from bisect import bisect_left
import janitor

def get_sample_df(rows=100_000):
    """Build a reproducible benchmark frame.

    Returns a DataFrame with integer 'lower'/'upper' columns (seeded uniform
    draws in [1, 100) truncated to int) on a minute-frequency DatetimeIndex
    named 'DATE' starting at 2020-01-01.
    """
    lower = np.random.default_rng(seed=1).uniform(1, 100, rows).astype(int)
    upper = np.random.default_rng(seed=2).uniform(1, 100, rows).astype(int)
    stamps = pd.date_range('2020-01-01', periods=rows, freq="min", name='DATE')
    return pd.DataFrame({'lower': lower, 'upper': upper}, index=stamps)


def get_baseline():
    """Row-wise apply baseline.

    For each row, re-scan every earlier row and record the latest timestamp
    whose 'lower' is >= this row's 'upper' (NaT when none qualifies).
    """
    df = get_sample_df()

    one_minute = pd.Timedelta(minutes=1)

    def last_qualifying_date(row):
        # Everything strictly before this row (index has 1-minute frequency,
        # so stepping back one minute excludes the current label).
        earlier = df.loc[:row.name - one_minute]
        # max() of an empty DatetimeIndex selection is NaT.
        return earlier.index[earlier['lower'] >= row['upper']].max()

    df['prev'] = df.apply(last_qualifying_date, axis=1)
    return df


def get_pyjanitor():
    """pyjanitor variant: a conditional (range) join pairs each row with all
    earlier rows whose 'lower' >= the later row's 'upper', then keeps only
    the closest earlier match per row.

    NOTE(review): the join materialises every qualifying pair before
    filtering, so memory grows with the match count — this is the variant
    that hit allocation errors on large inputs (see the question's EDIT).
    """

    df = get_sample_df()
    df.reset_index(inplace=True)

    # set the DATE column as an index
    # after the operation you can set the original DATE
    # column as an index
    # index_prev/index_next carry the integer row positions through the join.
    left_df = df.assign(index_prev=df.index)
    right_df = df.assign(index_next=df.index)
    out=(left_df
        .conditional_join(
            right_df, 
            ('lower','upper','>='), 
            ('index_prev','index_next','<'), 
            df_columns='index_prev', 
            right_columns=['index_next','lower','upper'])
        )
    # based on the matches, we may have multiple returns
    # what we need is the closest to the current row
    closest=out.index_next-out.index_prev
    grouper=[out.index_next, out.lower,out.upper]
    min_closest=closest.groupby(grouper).transform('min')
    closest=closest==min_closest
    # we have out matches, which is defined by `index_prev`
    # use index_prev to get the relevant DATE
    prev=out.loc[closest,'index_prev']
    prev=df.loc[prev,'DATE'].array # avoid index alignment here
    index_next=out.loc[closest,'index_next']
    # now assign back to df, based on index_next and prev
    prev=pd.Series(prev,index=index_next)
    df = df.assign(prev=prev)
    return df

   

def get_bisect():
    """Binary-search variant (the benchmark winner).

    Tracks the last-seen timestamp per distinct 'lower' value; for each row,
    bisects into the sorted distinct values to find those >= 'upper' and
    takes the newest timestamp among them (None before any match exists).
    """
    df = get_sample_df()

    def scan(lowers, uppers, stamps):
        distinct = sorted(set(lowers))
        newest = {}  # distinct lower value -> most recent timestamp seen

        for lo, up, ts in zip(lowers, uppers, stamps):
            # First distinct lower value that is >= the current upper.
            start = bisect_left(distinct, up)
            yield max(
                (newest[v] for v in distinct[start:] if v in newest),
                default=None,
            )
            # Record this row only after yielding so it never matches itself.
            newest[lo] = ts

    df["prev"] = list(scan(df["lower"], df["upper"], df.index))
    return df

def get_enumerate():
    """Plain-Python O(n^2) scan.

    For each row, walk backwards through the earlier rows and record the
    first (i.e. most recent) DATE whose 'lower' >= the current 'upper';
    None when no earlier row qualifies.
    """
    df = get_sample_df()
    df.reset_index(inplace=True)

    # Plain lists: per-element access is cheaper than pandas row indexing.
    date_list = df["DATE"].values.tolist()
    lower_list = df["lower"].values.tolist()
    upper_list = df["upper"].values.tolist()

    new_list = []
    for i, y in enumerate(upper_list):
        if i == 0:
            # The first row has no earlier rows to match against.
            new_list.append(None)
            continue
        # for/else appends None when the backward scan finds no match; this
        # replaces the original redundant any() pre-scan, which walked the
        # same prefix twice per row.
        for ll, dl in zip(reversed(lower_list[:i]), reversed(date_list[:i])):
            if ll >= y:
                new_list.append(dl)
                break
        else:
            new_list.append(None)

    df['prev'] = new_list
    df['prev'] = pd.to_datetime(df['prev'])
    return df

# Benchmark driver. NOTE: %timeit is an IPython magic — this section only
# runs in IPython/Jupyter, not under the plain Python interpreter.
print("baseline:")
%timeit -n 2 -r 2 get_baseline()

# pyjanitor is excluded: its conditional join materialises every matching
# pair, which exhausts memory at this data volume.
# Unable to allocate 37.2 GiB for an array with shape (4994299505,) and data type int64
# print("pyjanitor:")
# %timeit -n 2 get_pyjanitor()

print("bisect:")
%timeit -n 2 -r 2 get_bisect()

print("enumerate:")
%timeit -n 2 -r 2 get_enumerate()
Asked By: TheBob

||

Answers:

I’m not sure if this can be vectorized (because you have variables that are dependent on past state). But you can try to speed up the computation using binary search, e.g.:

from bisect import bisect_left


def get_prev(lower, upper, _date):
    """Yield, for each row, the most recent earlier date whose 'lower' value
    was >= the row's 'upper' value, or None when no such row has been seen.

    lower/upper/_date are parallel sequences; dates need only be mutually
    comparable (Timestamps, ints, ...).
    """
    sorted_values = sorted(set(lower))
    seen_at = {}  # lower value -> latest date at which that value occurred

    for lo, up, when in zip(lower, upper, _date):
        # Binary search: first sorted lower value that is >= the current upper.
        start = bisect_left(sorted_values, up)
        yield max(
            (seen_at[v] for v in sorted_values[start:] if v in seen_at),
            default=None,
        )
        # Record this row only after yielding, so a row never matches itself.
        seen_at[lo] = when


# Materialise the generator into the new column; yield order matches df rows.
df["prev_new"] = list(get_prev(df["lower"], df["upper"], df.index))
print(df)

Prints:

            lower  upper       prev   prev_new
DATE                                          
2020-01-01      7      2        NaT        NaT
2020-01-02      1      3 2020-01-01 2020-01-01
2020-01-03      6      4 2020-01-01 2020-01-01
2020-01-04      1      5 2020-01-03 2020-01-03
2020-01-05      1      6 2020-01-03 2020-01-03
2020-01-06      1      7 2020-01-01 2020-01-01
2020-01-07      1      8        NaT        NaT
2020-01-08     11      9        NaT        NaT
2020-01-09      1     10 2020-01-08 2020-01-08
2020-01-10      1     11 2020-01-08 2020-01-08
Answered By: Andrej Kesely

There’s an alternative answer with slightly different result.

import pandas as pd
import numpy as np

# Sample DataFrame
data = {'lower': [7, 1, 6, 1, 1, 1, 1, 11, 1, 1],
        'upper': [2, 3, 4, 5, 6, 7, 8, 9, 10, 11]}

df = pd.DataFrame(data=data)

df['DATE'] = pd.date_range('2020-01-01', periods=len(data['lower']))
df['DATE'] = pd.to_datetime(df['DATE'])
# Start the result column as all-NaT datetimes.
df['prev'] = pd.to_datetime(np.nan)

# Rows where this row's own lower >= upper get stamped with their own DATE.
df['prev'] = np.where(df['lower'] >= df['upper'], df['DATE'], df['prev'])
# Shift down one row, then forward-fill the gaps. .ffill() replaces the
# deprecated fillna(method='ffill'), which was removed in pandas 3.0.
df['prev'] = df['prev'].shift(1).ffill()

print(df)

  lower upper   DATE      prev
0   7   2   2020-01-01  NaT
1   1   3   2020-01-02  2020-01-01
2   6   4   2020-01-03  2020-01-01
3   1   5   2020-01-04  2020-01-03
4   1   6   2020-01-05  2020-01-03
5   1   7   2020-01-06  2020-01-03
6   1   8   2020-01-07  2020-01-03
7   11  9   2020-01-08  2020-01-03
8   1   10  2020-01-09  2020-01-08
9   1   11  2020-01-10  2020-01-08

I’m not sure why we are getting NaT for 2 dates in the middle. My solution does not have NaT in these places.

Answered By: ragas

In my understanding, it is faster to loop through Python objects such as lists and dictionaries than over pandas DataFrame rows (I could be wrong). Therefore, below is what I have tried, and it works for your input df:

# Work on plain Python lists — per-element access is cheaper than indexing
# into pandas rows inside a tight loop. Assumes the question's `df` (with a
# DATE column, before set_index) is in scope.
date_list = df["DATE"].values.tolist()
lower_list = df["lower"].values.tolist()
upper_list = df["upper"].values.tolist()

new_list = []
for i, y in enumerate(upper_list):
    if i == 0:
        # The first row has no earlier rows to match against.
        new_list.append(None)
        continue
    # Scan earlier rows nearest-first; the first hit is the most recent
    # qualifying date. for/else appends None when nothing matches, replacing
    # the original redundant any() pre-scan that walked the prefix twice.
    for ll, dl in zip(reversed(lower_list[:i]), reversed(date_list[:i])):
        if ll >= y:
            new_list.append(dl)
            break
    else:
        new_list.append(None)

df['prev'] = new_list
df['prev'] = pd.to_datetime(df['prev'])
Answered By: Pawan Tolani

You can use a range join to efficiently get your matches – conditional_join from pyjanitor solves this. Kindly share your performance tests when you can.

# pip install pyjanitor
import pandas as pd
import janitor

# set the DATE column as an index
# after the operation you can set the original DATE
# column as an index
# NOTE(review): assumes `df` has integer positions as its index and a DATE
# column (i.e. the question's frame after reset_index) — confirm with caller.
# index_prev/index_next carry the integer row positions through the join.
left_df = df.assign(index_prev=df.index)
right_df = df.assign(index_next=df.index)
# Range join: pair each row with every earlier row (index_prev < index_next)
# whose 'lower' >= the later row's 'upper'.
out=(left_df
    .conditional_join(
        right_df, 
        ('lower','upper','>='), 
        ('index_prev','index_next','<'), 
        df_columns='index_prev', 
        right_columns=['index_next','lower','upper'])
    )
# based on the matches, we may have multiple returns
# what we need is the closest to the current row
closest=out.index_next-out.index_prev
grouper=[out.index_next, out.lower,out.upper]
min_closest=closest.groupby(grouper).transform('min')
closest=closest==min_closest
# we have out matches, which is defined by `index_prev`
# use index_prev to get the relevant DATE
prev=out.loc[closest,'index_prev']
prev=df.loc[prev,'DATE'].array # avoid index alignment here
index_next=out.loc[closest,'index_next']
# now assign back to df, based on index_next and prev
prev=pd.Series(prev,index=index_next)
# NOTE: assign returns a new frame; here the result is displayed, not stored.
df.assign(prev=prev)

   lower  upper       DATE       prev
0      7      2 2020-01-01        NaT
1      1      3 2020-01-02 2020-01-01
2      6      4 2020-01-03 2020-01-01
3      1      5 2020-01-04 2020-01-03
4      1      6 2020-01-05 2020-01-03
5      1      7 2020-01-06 2020-01-01
6      1      8 2020-01-07        NaT
7     11      9 2020-01-08        NaT
8      1     10 2020-01-09 2020-01-08
9      1     11 2020-01-10 2020-01-08
Answered By: sammywemmy
Categories: questions Tags: , ,
Answers are sorted by their score. The answer accepted by the question owner as the best is marked with
at the top-right corner.