How to find a missing row between two similar 2d numpy arrays

Question

Say I have two 2d arrays:

aa = np.array([[1,2],[3,4],[9,10],[48,59]])
bb = np.array([[1,2],[9,10]])

array aa is my ideal array and bb is what I have recorded. I want to find out what rows bb is missing compared to aa. I can do this by looping through the rows but as my dataset is approximately 5Gb I want to know if there is a pythonic way of doing this to minimize run time. It would be preferable to carry out this operation as a numpy array but it doesn’t HAVE to be that way if there is a better option.

Any help is appreciated. Thanks

Asked By: Aidan O'Farrell

||

Source

Answer 1

Try this:

bb_missing = aa[~np.all(aa==bb[:, None], axis=2).any(axis=0)]

Output:

>>> bb_missing
array([[ 3,  4],
       [48, 59]])

Answered By: user17242583

Answer 2

The main step is to find boolean indices of aa that tells if items in aa are present in bb. For example, if:

aa = np.array([[1,2],[3,4],[9,10],[48,59]])
bb = np.array([[1,2],[9,10]])

then we should get:

idx
>>> array([ True, False,  True, False])

And we could use later:

aa[~idx]
>>> 
array([[ 3,  4],
       [48, 59]])

Since aa[~idx] is relatively fast, I’ll try to review all the ways to find idx. They divide between two groups: the way of searching the data (np.isin, np.searchsort, etc.) and the way that data could be made 1-dimensional.

The way of reduction

Way 1. Make arrays aa and bb contiguous then call np.isin on their views:

def _view1D(a, b): # a, b are arrays
    a = np.ascontiguousarray(a, dtype=np.uint32)
    b = np.ascontiguousarray(b, dtype=np.uint32)
    uint32_dt = np.dtype([('', np.uint32)]*2)
    return a.view(uint32_dt).ravel(),  b.view(uint32_dt).ravel()

A, B = _view1D(aa, bb)
idx = np.isin(A, B)

Way 2.
Reduce dimensionality of aa and bb then call np.isin on resulting arrays:

def numpy_dimreduce(arr, M):
    return np.ravel_multi_index(arr.T, M, order='F')

M = aa.max(axis=0)+1
idx = np.isin(numpy_dimreduce(aa, M), numpy_dimreduce(bb, M))

Way 3. Improve performance of dimensionality reduction significantly using numba:

@njit(parallel=True)
def _numba_dot(arr, dimshape, len_arr, len_dimshape, su):
    for i in prange(len_arr):
        for j in range(len_dimshape):
            su[i] = su[i] + arr[i][j] * dimshape[j]
        
def numba_dimreduce(arr, M, dtype=np.int64):
    '''not safe if allocation is exceeded'''
    dimshape = np.cumprod(np.insert(M[:-1], 0, 1))
    su = np.zeros(len(arr), dtype=dtype)
    _numba_dot(arr, dimshape, len(arr), len(dimshape), su)
    return su

M = aa.max(axis=0)+1
idx = np.isin(numba_dimreduce(aa, M), numba_dimreduce(bb, M))

The way of searching

Way 1.
Use np.isinlike in previous solutions

Way 2.
Use np.in1D instead of np.isin.

Way 3.
Use argsort + np.searchsorted:

def searchsort(A, B):
    sidx = A.argsort()
    out = np.zeros(len(A), dtype=bool)
    s = np.searchsorted(A, B, sorter=sidx)    
    out[sidx[s]] = True
    return out

Way 4 Use masking:

def masking(A, B):
    mask = np.empty(shape=np.prod(M), dtype=bool)
    mask[A] = 0
    mask[B] = 1
    return mask[A]

It could be applied in case np.prod(M) is small only. Therefore I put it into category of alternative solutions.

Conclusion

In conclusion, the general pattern of finding idx looks like this:

def isin2D(a, b, view=_view1D, search=np.isin):
    # choose your optional view and search
    A, B = view(a), view(b)
    idx = search(A, B)
    return idx

Actually, neither ndarray.views nor np.searchsorted helps to save time. The best option is to combine dimensionality reduction with np.isin. In OP’s case, it could be applied like so:

def isin2D(a, b):
    M = a.max(axis=0) + 1
    A = np.ravel_multi_index(a.T, M, order='C')
    B = np.ravel_multi_index(b.T, M, order='C')
    idx = np.isin(A, B)
    return idx

idx = isin2D(aa, bb)
aa[~idx]
>>> 
array([[ 3,  4],
       [48, 59]])

Performance

Now let’s time it using benchit package to illustrate what’s winning

import numpy as np
from numba import njit, prange
import benchit
%matplotlib inline
benchit.setparams(rep=3)

sizes = [100000, 300000, 500000, 1000000, 1700000, 3000000, 5000000]#, 10000000, 30000000]
N = sizes[-1]
aa = np.random.randint(0, 1000000, size=(N, 2))
r = np.random.randint(0, 2, size=N).astype(bool)
M = aa.max(axis=0) + 1

def _view1D(arr, dtype=np.uint32):
    a = np.ascontiguousarray(arr, dtype=dtype)
    dt = np.dtype([('', dtype)]*2)
    return a.view(dt).ravel()

def _numpy_dimreduce(arr, M):
    return np.ravel_multi_index(arr.T, M, order='C')

@njit(parallel=True)
def _numba_dot(arr, dimshape, len_arr, len_dimshape, su):
    for i in prange(len_arr):
        for j in range(len_dimshape):
            su[i] = su[i] + arr[i][j] * dimshape[j]
        
def _numba_dimreduce(arr, M, dtype=np.int64):
    '''not safe if allocation is exceeded'''
    dimshape = np.cumprod(np.insert(M[:-1], 0, 1))
    su = np.zeros(len(arr), dtype=dtype)
    _numba_dot(arr, dimshape, len(arr), len(dimshape), su)
    return su


def isin2D(a, b, view=_view1D, search=np.isin):
    A, B = view(a), view(b)
    idx = search(A, B)
    return idx

def searchsort(A, B):
    sidx = A.argsort()
    out = np.zeros(len(A), dtype=bool)
    s = np.searchsorted(A, B, sorter=sidx)    
    out[sidx[s]] = True
    return out

def isin2D_numpy_view(a, b): return isin2D(a, b, view=_view1D, search=np.isin)

def isin2D_numpy_dimr_isin(a, b, M=M): return isin2D(a, b, view=lambda arr, M=M: _numpy_dimreduce(arr, M), search=np.isin)
def isin2D_numba_dimr_isin(a, b, M=M): return isin2D(a, b, view=lambda arr, M=M: _numba_dimreduce(arr, M), search=np.isin)

def isin2D_numpy_dimr_searchsort(a, b, M=M): return isin2D(a, b, view=lambda arr, M=M: _numpy_dimreduce(arr, M), search=searchsort)
def isin2D_numba_dimr_searchsort(a, b, M=M): return isin2D(a, b, view=lambda arr, M=M: _numba_dimreduce(arr[:, ::-1], M[::-1]), search=searchsort)

def isin2D_numpy_dimr_in1D(a, b, M=M): return isin2D(a, b, view=lambda arr, M=M: _numpy_dimreduce(arr, M), search=np.in1d)
def isin2D_numba_dimr_in1D(a, b, M=M): return isin2D(a, b, view=lambda arr, M=M: _numba_dimreduce(arr[:, ::-1], M[::-1]), search=np.in1d)

fns = [isin2D_numpy_dimr, isin2D_numba_dimr, isin2D_numpy_view, isin2D_numpy_dimr_searchsort, isin2D_numba_dimr_searchsort, isin2D_numpy_dimr_in1D, isin2D_numba_dimr_in1D]

in_ = {s/1000000: (aa[:s], aa[:s][r[:s]]) for s in sizes}
t = benchit.timings(fns, in_, multivar=True, input_name='Millions of events')
t.plot(logx=True, figsize=(12, 6), fontsize=14)

Alternative solutions

In case product of M = aa.max(axis=0) + 1 is not too large you could allocate M items for masking. Make sure you won’t blow up your RAM. A couple more ways:

def masking1D(a, b):
    M = aa.max(axis=0) + 1
    A = np.ravel_multi_index(a.T, M, order='C')
    B = np.ravel_multi_index(b.T, M, order='C')
    mask = np.empty(shape=np.prod(M), dtype=bool)
    mask[A] = 1
    mask[B] = 0
    return mask[A]

def masking2D(a, b):
    M = a.max(axis=0) + 1
    mask = np.empty(shape=M, dtype=bool)
    mask[tuple(a.T)] = 1
    mask[tuple(b.T)] = 0
    return mask[tuple(a.T)]

aa = np.array([[988, 986], [299, 327], [756, 30], [327, 189], [988, 294], [64, 232], [11, 46], [38, 223], [933, 770], [141, 141]])
bb = aa[[1, 2, 3, 6, 7, 4]]
masking1D(aa, bb)
>>> array([ True, False, False, False, False,  True, False, False,  True, True])
masking2D(aa, bb)
>>> array([ True, False, False, False, False,  True, False, False,  True, True])

Runtime in case np.max(aa, axis=0) + 1 = (1000, 1000):

Answered By: mathfux