How to find a missing row between two similar 2d numpy arrays
Question:
Say I have two 2d arrays:
aa = np.array([[1,2],[3,4],[9,10],[48,59]])
bb = np.array([[1,2],[9,10]])
array aa
is my ideal array and bb
is what I have recorded. I want to find out what rows bb
is missing compared to aa
. I can do this by looping through the rows but as my dataset is approximately 5Gb I want to know if there is a pythonic way of doing this to minimize run time. It would be preferable to carry out this operation as a numpy array but it doesn’t HAVE to be that way if there is a better option.
Any help is appreciated. Thanks
Answers:
Try this:
# Broadcast-compares every row of bb against every row of aa. The intermediate
# boolean array has shape (len(bb), len(aa), n_columns), so memory use grows
# with len(aa) * len(bb).
bb_missing = aa[~np.all(aa==bb[:, None], axis=2).any(axis=0)]
Output:
>>> bb_missing
array([[ 3, 4],
[48, 59]])
The main step is to find boolean indices of aa
that tell whether the rows of aa
are present in bb
. For example, if:
aa = np.array([[1,2],[3,4],[9,10],[48,59]])
bb = np.array([[1,2],[9,10]])
then we should get:
idx
>>> array([ True, False, True, False])
And we could use later:
aa[~idx]
>>>
array([[ 3, 4],
[48, 59]])
Since aa[~idx]
is relatively fast, I’ll try to review all the ways to find idx
. They divide between two groups: the way of searching the data (np.isin
, np.searchsorted
, etc.) and the way that data could be made 1-dimensional.
The way of reduction
Way 1. Make arrays aa
and bb
contiguous then call np.isin
on their views:
def _view1D(a, b): # a, b are arrays
a = np.ascontiguousarray(a, dtype=np.uint32)
b = np.ascontiguousarray(b, dtype=np.uint32)
uint32_dt = np.dtype([('', np.uint32)]*2)
return a.view(uint32_dt).ravel(), b.view(uint32_dt).ravel()
# Build the 1-D row views, then test each row of aa for membership in bb.
A, B = _view1D(aa, bb)
idx = np.isin(A, B)
Way 2.
Reduce dimensionality of aa
and bb
then call np.isin
on resulting arrays:
def numpy_dimreduce(arr, M):
    """Collapse each row of ``arr`` into one flat index on a grid of shape ``M``.

    The columns of ``arr`` are used as coordinates and combined in
    column-major (``order='F'``) order by ``np.ravel_multi_index``.
    """
    coords = tuple(arr.T)
    return np.ravel_multi_index(coords, dims=M, order='F')
# Per-column max + 1 of aa, used as the grid shape for the flat index.
M = aa.max(axis=0)+1
idx = np.isin(numpy_dimreduce(aa, M), numpy_dimreduce(bb, M))
Way 3. Improve performance of dimensionality reduction significantly using numba
:
@njit(parallel=True)
def _numba_dot(arr, dimshape, len_arr, len_dimshape, su):
    """Add sum_j arr[i, j] * dimshape[j] to su[i] for every row i, in place.

    Rows are iterated with ``prange``; each iteration writes only its own
    ``su`` entry. Nothing is returned; the result is left in ``su``.
    """
    for row in prange(len_arr):
        for col in range(len_dimshape):
            su[row] = su[row] + arr[row, col] * dimshape[col]
def numba_dimreduce(arr, M, dtype=np.int64):
    """Reduce each row of ``arr`` to one flat index via ``_numba_dot``.

    The per-column weights are the cumulative product of ``[1, M[0], ...,
    M[-2]]``. Not safe if the resulting index exceeds what ``dtype`` can
    hold.
    """
    weights = np.insert(M[:-1], 0, 1).cumprod()
    n_rows = len(arr)
    su = np.zeros(n_rows, dtype=dtype)
    # _numba_dot fills `su` in place.
    _numba_dot(arr, weights, n_rows, len(weights), su)
    return su
# Per-column max + 1 of aa, passed to the numba-based reducer for both arrays.
M = aa.max(axis=0)+1
idx = np.isin(numba_dimreduce(aa, M), numba_dimreduce(bb, M))
The way of searching
Way 1.
Use np.isin
like in previous solutions
Way 2.
Use np.in1d
instead of np.isin
.
Way 3.
Use argsort
+ np.searchsorted
:
def searchsort(A, B):
    """Return a boolean mask over 1-D ``A`` marking elements that occur in ``B``.

    ``B`` is sorted once and every element of ``A`` is located in it with
    ``np.searchsorted``; an element counts as present only if the value found
    at its insertion point is equal to it.

    The previous version searched ``B`` inside sorted ``A`` and marked the
    insertion point unconditionally. That flagged the wrong element (or
    raised ``IndexError``) for values of ``B`` absent from ``A``, and marked
    only one of several equal values in ``A``.
    """
    A = np.asarray(A)
    B = np.asarray(B)
    if B.size == 0:
        return np.zeros(len(A), dtype=bool)
    b_sorted = np.sort(B)
    pos = np.searchsorted(b_sorted, A)
    # Insertion point == len(b_sorted) means "larger than everything in B";
    # redirect it to a valid slot so the equality test below simply fails.
    pos[pos == len(b_sorted)] = 0
    return b_sorted[pos] == A
Way 4 Use masking:
def masking(A, B):
    """Return, for each flat index in ``A``, whether it also occurs in ``B``.

    A lookup table of ``np.prod(M)`` booleans is allocated (``M`` is read
    from module scope). Slots for ``A`` are cleared, slots for ``B`` are set,
    and the table is then read back at ``A``. Slots not touched by ``A`` are
    left uninitialised but are never read.
    """
    table_size = np.prod(M)
    mask = np.empty(shape=table_size, dtype=bool)
    mask[A] = False
    mask[B] = True
    return mask[A]
It can be applied only when np.prod(M)
is small. Therefore I put it into the category of alternative solutions.
Conclusion
In conclusion, the general pattern of finding idx
looks like this:
def isin2D(a, b, view=_view1D, search=np.isin):
    """Row-membership test built from a 1-D reduction and a 1-D search.

    ``view`` is called once on each array with a single argument to reduce
    it to one dimension; ``search`` then receives the two reduced arrays and
    its result is returned unchanged.
    """
    # choose your optional view and search
    reduced_a = view(a)
    reduced_b = view(b)
    return search(reduced_a, reduced_b)
Actually, neither ndarray.view
s nor np.searchsorted
helps to save time. The best option is to combine dimensionality reduction with np.isin
. In OP’s case, it could be applied like so:
def isin2D(a, b):
    """Return a boolean mask over the rows of ``a`` marking rows present in ``b``.

    Each row is mapped to a single flat index with ``np.ravel_multi_index``
    and the two index arrays are compared with ``np.isin``.

    The grid is sized from the combined per-column min/max of ``a`` and
    ``b``. Rows of ``b`` outside the range of ``a``, and negative values, are
    therefore handled; previously both raised ``ValueError``. Empty inputs
    return an all-False mask.

    ``np.ravel_multi_index`` still raises ``ValueError`` if the grid is too
    large to index.
    """
    a = np.asarray(a)
    b = np.asarray(b)
    if a.shape[0] == 0 or b.shape[0] == 0:
        return np.zeros(a.shape[0], dtype=bool)
    lo = np.minimum(a.min(axis=0), b.min(axis=0))
    hi = np.maximum(a.max(axis=0), b.max(axis=0))
    dims = hi - lo + 1
    # Shift by `lo` so every coordinate is non-negative and inside `dims`.
    A = np.ravel_multi_index(tuple((a - lo).T), dims, order='C')
    B = np.ravel_multi_index(tuple((b - lo).T), dims, order='C')
    return np.isin(A, B)
# idx marks the rows of aa found in bb; negating it selects the missing rows.
idx = isin2D(aa, bb)
aa[~idx]
>>>
array([[ 3, 4],
[48, 59]])
Performance
Now let’s time it using benchit
package to illustrate what’s winning
import numpy as np
from numba import njit, prange
import benchit
%matplotlib inline
benchit.setparams(rep=3)
# Row counts to benchmark; the two largest sizes are commented out.
sizes = [100000, 300000, 500000, 1000000, 1700000, 3000000, 5000000]#, 10000000, 30000000]
N = sizes[-1]
# N random two-column integer rows with values in [0, 1000000).
aa = np.random.randint(0, 1000000, size=(N, 2))
# Random boolean mask used later to pick a subset of rows of aa.
r = np.random.randint(0, 2, size=N).astype(bool)
# Per-column max + 1 of aa; the grid shape for the dimensionality-reduction helpers.
M = aa.max(axis=0) + 1
def _view1D(arr, dtype=np.uint32):
a = np.ascontiguousarray(arr, dtype=dtype)
dt = np.dtype([('', dtype)]*2)
return a.view(dt).ravel()
def _numpy_dimreduce(arr, M):
return np.ravel_multi_index(arr.T, M, order='C')
@njit(parallel=True)
def _numba_dot(arr, dimshape, len_arr, len_dimshape, su):
    """Add sum_j arr[i, j] * dimshape[j] to su[i] for every row i, in place.

    Rows are iterated with ``prange``; each iteration writes only its own
    ``su`` entry. Nothing is returned; the result is left in ``su``.
    """
    for row in prange(len_arr):
        for col in range(len_dimshape):
            su[row] = su[row] + arr[row, col] * dimshape[col]
def _numba_dimreduce(arr, M, dtype=np.int64):
    """Reduce each row of ``arr`` to one flat index via ``_numba_dot``.

    The per-column weights are the cumulative product of ``[1, M[0], ...,
    M[-2]]``. Not safe if the resulting index exceeds what ``dtype`` can
    hold.
    """
    weights = np.insert(M[:-1], 0, 1).cumprod()
    n_rows = len(arr)
    su = np.zeros(n_rows, dtype=dtype)
    # _numba_dot fills `su` in place.
    _numba_dot(arr, weights, n_rows, len(weights), su)
    return su
def isin2D(a, b, view=_view1D, search=np.isin):
    """Row-membership test built from a 1-D reduction and a 1-D search.

    ``view`` is called once on each array to reduce it to one dimension;
    ``search`` then receives the two reduced arrays and its result is
    returned unchanged.
    """
    reduced_a = view(a)
    reduced_b = view(b)
    return search(reduced_a, reduced_b)
def searchsort(A, B):
    """Return a boolean mask over 1-D ``A`` marking elements that occur in ``B``.

    ``B`` is sorted once and every element of ``A`` is located in it with
    ``np.searchsorted``; an element counts as present only if the value found
    at its insertion point is equal to it.

    The previous version searched ``B`` inside sorted ``A`` and marked the
    insertion point unconditionally. That flagged the wrong element (or
    raised ``IndexError``) for values of ``B`` absent from ``A``, and marked
    only one of several equal values in ``A``.
    """
    A = np.asarray(A)
    B = np.asarray(B)
    if B.size == 0:
        return np.zeros(len(A), dtype=bool)
    b_sorted = np.sort(B)
    pos = np.searchsorted(b_sorted, A)
    # Insertion point == len(b_sorted) means "larger than everything in B";
    # redirect it to a valid slot so the equality test below simply fails.
    pos[pos == len(b_sorted)] = 0
    return b_sorted[pos] == A
def isin2D_numpy_view(a, b):
    """Benchmark variant: structured row view + ``np.isin``."""
    return isin2D(a, b, view=_view1D, search=np.isin)


def isin2D_numpy_dimr_isin(a, b, M=M):
    """Benchmark variant: ``_numpy_dimreduce`` + ``np.isin``."""
    def reduce(arr):
        return _numpy_dimreduce(arr, M)
    return isin2D(a, b, view=reduce, search=np.isin)


def isin2D_numba_dimr_isin(a, b, M=M):
    """Benchmark variant: ``_numba_dimreduce`` + ``np.isin``."""
    def reduce(arr):
        return _numba_dimreduce(arr, M)
    return isin2D(a, b, view=reduce, search=np.isin)


def isin2D_numpy_dimr_searchsort(a, b, M=M):
    """Benchmark variant: ``_numpy_dimreduce`` + ``searchsort``."""
    def reduce(arr):
        return _numpy_dimreduce(arr, M)
    return isin2D(a, b, view=reduce, search=searchsort)


def isin2D_numba_dimr_searchsort(a, b, M=M):
    """Benchmark variant: ``_numba_dimreduce`` on reversed columns + ``searchsort``."""
    def reduce(arr):
        # Columns and M are both reversed before reduction.
        return _numba_dimreduce(arr[:, ::-1], M[::-1])
    return isin2D(a, b, view=reduce, search=searchsort)


def isin2D_numpy_dimr_in1D(a, b, M=M):
    """Benchmark variant: ``_numpy_dimreduce`` + ``np.in1d``."""
    def reduce(arr):
        return _numpy_dimreduce(arr, M)
    return isin2D(a, b, view=reduce, search=np.in1d)


def isin2D_numba_dimr_in1D(a, b, M=M):
    """Benchmark variant: ``_numba_dimreduce`` on reversed columns + ``np.in1d``."""
    def reduce(arr):
        # Columns and M are both reversed before reduction.
        return _numba_dimreduce(arr[:, ::-1], M[::-1])
    return isin2D(a, b, view=reduce, search=np.in1d)
# The np.isin wrappers are defined as isin2D_numpy_dimr_isin and
# isin2D_numba_dimr_isin; the shorter names listed here before were undefined
# and raised NameError.
fns = [isin2D_numpy_dimr_isin, isin2D_numba_dimr_isin, isin2D_numpy_view, isin2D_numpy_dimr_searchsort, isin2D_numba_dimr_searchsort, isin2D_numpy_dimr_in1D, isin2D_numba_dimr_in1D]
# One input pair per size, keyed by size in millions: the first s rows of aa
# and the subset of those rows selected by the boolean mask r.
in_ = {s/1000000: (aa[:s], aa[:s][r[:s]]) for s in sizes}
t = benchit.timings(fns, in_, multivar=True, input_name='Millions of events')
t.plot(logx=True, figsize=(12, 6), fontsize=14)
Alternative solutions
In case product of M = aa.max(axis=0) + 1
is not too large, you could allocate np.prod(M)
items for masking. Make sure you won’t blow up your RAM. A couple more ways:
def masking1D(a, b):
    """Return a mask over the rows of ``a`` that is True where the row is NOT in ``b``.

    Rows are flattened to indices on a grid of shape ``a.max(axis=0) + 1``.
    A boolean table of that many slots is allocated, slots for ``a`` are set,
    slots for ``b`` are cleared, and the table is read back at ``a``.

    The grid shape is now taken from the parameter ``a``; the original read
    the module-level ``aa`` instead. Every row of ``b`` must lie inside that
    grid, otherwise ``np.ravel_multi_index`` raises ``ValueError``.
    """
    M = a.max(axis=0) + 1
    A = np.ravel_multi_index(a.T, M, order='C')
    B = np.ravel_multi_index(b.T, M, order='C')
    # Only slots indexed by A are read back, and all of them are written first,
    # so the uninitialised remainder of the table does not matter.
    mask = np.empty(shape=np.prod(M), dtype=bool)
    mask[A] = 1
    mask[B] = 0
    return mask[A]
def masking2D(a, b):
    """Return a mask over the rows of ``a`` that is True where the row is NOT in ``b``.

    A boolean grid of shape ``a.max(axis=0) + 1`` is indexed directly by the
    columns of each array: cells for ``a`` are set, cells for ``b`` are
    cleared, and the grid is read back at the cells of ``a``.
    """
    grid_shape = a.max(axis=0) + 1
    grid = np.empty(shape=grid_shape, dtype=bool)
    cells_a = tuple(a.T)
    cells_b = tuple(b.T)
    grid[cells_a] = True
    grid[cells_b] = False
    return grid[cells_a]
# Small example: bb is built from six rows of aa selected by index.
aa = np.array([[988, 986], [299, 327], [756, 30], [327, 189], [988, 294], [64, 232], [11, 46], [38, 223], [933, 770], [141, 141]])
bb = aa[[1, 2, 3, 6, 7, 4]]
masking1D(aa, bb)
>>> array([ True, False, False, False, False, True, False, False, True, True])
masking2D(aa, bb)
>>> array([ True, False, False, False, False, True, False, False, True, True])
Runtime in case np.max(aa, axis=0) + 1 = (1000, 1000)
:
Say I have two 2d arrays:
aa = np.array([[1,2],[3,4],[9,10],[48,59]])
bb = np.array([[1,2],[9,10]])
array aa
is my ideal array and bb
is what I have recorded. I want to find out what rows bb
is missing compared to aa
. I can do this by looping through the rows but as my dataset is approximately 5Gb I want to know if there is a pythonic way of doing this to minimize run time. It would be preferable to carry out this operation as a numpy array but it doesn’t HAVE to be that way if there is a better option.
Any help is appreciated. Thanks
Try this:
# Broadcast-compares every row of bb against every row of aa. The intermediate
# boolean array has shape (len(bb), len(aa), n_columns), so memory use grows
# with len(aa) * len(bb).
bb_missing = aa[~np.all(aa==bb[:, None], axis=2).any(axis=0)]
Output:
>>> bb_missing
array([[ 3, 4],
[48, 59]])
The main step is to find boolean indices of aa
that tell whether the rows of aa
are present in bb
. For example, if:
aa = np.array([[1,2],[3,4],[9,10],[48,59]])
bb = np.array([[1,2],[9,10]])
then we should get:
idx
>>> array([ True, False, True, False])
And we could use later:
aa[~idx]
>>>
array([[ 3, 4],
[48, 59]])
Since aa[~idx]
is relatively fast, I’ll try to review all the ways to find idx
. They divide between two groups: the way of searching the data (np.isin
, np.searchsorted
, etc.) and the way that data could be made 1-dimensional.
The way of reduction
Way 1. Make arrays aa
and bb
contiguous then call np.isin
on their views:
def _view1D(a, b): # a, b are arrays
a = np.ascontiguousarray(a, dtype=np.uint32)
b = np.ascontiguousarray(b, dtype=np.uint32)
uint32_dt = np.dtype([('', np.uint32)]*2)
return a.view(uint32_dt).ravel(), b.view(uint32_dt).ravel()
# Build the 1-D row views, then test each row of aa for membership in bb.
A, B = _view1D(aa, bb)
idx = np.isin(A, B)
Way 2.
Reduce dimensionality of aa
and bb
then call np.isin
on resulting arrays:
def numpy_dimreduce(arr, M):
    """Collapse each row of ``arr`` into one flat index on a grid of shape ``M``.

    The columns of ``arr`` are used as coordinates and combined in
    column-major (``order='F'``) order by ``np.ravel_multi_index``.
    """
    coords = tuple(arr.T)
    return np.ravel_multi_index(coords, dims=M, order='F')
# Per-column max + 1 of aa, used as the grid shape for the flat index.
M = aa.max(axis=0)+1
idx = np.isin(numpy_dimreduce(aa, M), numpy_dimreduce(bb, M))
Way 3. Improve performance of dimensionality reduction significantly using numba
:
@njit(parallel=True)
def _numba_dot(arr, dimshape, len_arr, len_dimshape, su):
    """Add sum_j arr[i, j] * dimshape[j] to su[i] for every row i, in place.

    Rows are iterated with ``prange``; each iteration writes only its own
    ``su`` entry. Nothing is returned; the result is left in ``su``.
    """
    for row in prange(len_arr):
        for col in range(len_dimshape):
            su[row] = su[row] + arr[row, col] * dimshape[col]
def numba_dimreduce(arr, M, dtype=np.int64):
    """Reduce each row of ``arr`` to one flat index via ``_numba_dot``.

    The per-column weights are the cumulative product of ``[1, M[0], ...,
    M[-2]]``. Not safe if the resulting index exceeds what ``dtype`` can
    hold.
    """
    weights = np.insert(M[:-1], 0, 1).cumprod()
    n_rows = len(arr)
    su = np.zeros(n_rows, dtype=dtype)
    # _numba_dot fills `su` in place.
    _numba_dot(arr, weights, n_rows, len(weights), su)
    return su
# Per-column max + 1 of aa, passed to the numba-based reducer for both arrays.
M = aa.max(axis=0)+1
idx = np.isin(numba_dimreduce(aa, M), numba_dimreduce(bb, M))
The way of searching
Way 1.
Use np.isin
like in previous solutions
Way 2.
Use np.in1d
instead of np.isin
.
Way 3.
Use argsort
+ np.searchsorted
:
def searchsort(A, B):
    """Return a boolean mask over 1-D ``A`` marking elements that occur in ``B``.

    ``B`` is sorted once and every element of ``A`` is located in it with
    ``np.searchsorted``; an element counts as present only if the value found
    at its insertion point is equal to it.

    The previous version searched ``B`` inside sorted ``A`` and marked the
    insertion point unconditionally. That flagged the wrong element (or
    raised ``IndexError``) for values of ``B`` absent from ``A``, and marked
    only one of several equal values in ``A``.
    """
    A = np.asarray(A)
    B = np.asarray(B)
    if B.size == 0:
        return np.zeros(len(A), dtype=bool)
    b_sorted = np.sort(B)
    pos = np.searchsorted(b_sorted, A)
    # Insertion point == len(b_sorted) means "larger than everything in B";
    # redirect it to a valid slot so the equality test below simply fails.
    pos[pos == len(b_sorted)] = 0
    return b_sorted[pos] == A
Way 4 Use masking:
def masking(A, B):
    """Return, for each flat index in ``A``, whether it also occurs in ``B``.

    A lookup table of ``np.prod(M)`` booleans is allocated (``M`` is read
    from module scope). Slots for ``A`` are cleared, slots for ``B`` are set,
    and the table is then read back at ``A``. Slots not touched by ``A`` are
    left uninitialised but are never read.
    """
    table_size = np.prod(M)
    mask = np.empty(shape=table_size, dtype=bool)
    mask[A] = False
    mask[B] = True
    return mask[A]
It can be applied only when np.prod(M)
is small. Therefore I put it into the category of alternative solutions.
Conclusion
In conclusion, the general pattern of finding idx
looks like this:
def isin2D(a, b, view=_view1D, search=np.isin):
    """Row-membership test built from a 1-D reduction and a 1-D search.

    ``view`` is called once on each array with a single argument to reduce
    it to one dimension; ``search`` then receives the two reduced arrays and
    its result is returned unchanged.
    """
    # choose your optional view and search
    reduced_a = view(a)
    reduced_b = view(b)
    return search(reduced_a, reduced_b)
Actually, neither ndarray.view
s nor np.searchsorted
helps to save time. The best option is to combine dimensionality reduction with np.isin
. In OP’s case, it could be applied like so:
def isin2D(a, b):
    """Return a boolean mask over the rows of ``a`` marking rows present in ``b``.

    Each row is mapped to a single flat index with ``np.ravel_multi_index``
    and the two index arrays are compared with ``np.isin``.

    The grid is sized from the combined per-column min/max of ``a`` and
    ``b``. Rows of ``b`` outside the range of ``a``, and negative values, are
    therefore handled; previously both raised ``ValueError``. Empty inputs
    return an all-False mask.

    ``np.ravel_multi_index`` still raises ``ValueError`` if the grid is too
    large to index.
    """
    a = np.asarray(a)
    b = np.asarray(b)
    if a.shape[0] == 0 or b.shape[0] == 0:
        return np.zeros(a.shape[0], dtype=bool)
    lo = np.minimum(a.min(axis=0), b.min(axis=0))
    hi = np.maximum(a.max(axis=0), b.max(axis=0))
    dims = hi - lo + 1
    # Shift by `lo` so every coordinate is non-negative and inside `dims`.
    A = np.ravel_multi_index(tuple((a - lo).T), dims, order='C')
    B = np.ravel_multi_index(tuple((b - lo).T), dims, order='C')
    return np.isin(A, B)
# idx marks the rows of aa found in bb; negating it selects the missing rows.
idx = isin2D(aa, bb)
aa[~idx]
>>>
array([[ 3, 4],
[48, 59]])
Performance
Now let’s time it using benchit
package to illustrate what’s winning
import numpy as np
from numba import njit, prange
import benchit
%matplotlib inline
benchit.setparams(rep=3)
# Row counts to benchmark; the two largest sizes are commented out.
sizes = [100000, 300000, 500000, 1000000, 1700000, 3000000, 5000000]#, 10000000, 30000000]
N = sizes[-1]
# N random two-column integer rows with values in [0, 1000000).
aa = np.random.randint(0, 1000000, size=(N, 2))
# Random boolean mask used later to pick a subset of rows of aa.
r = np.random.randint(0, 2, size=N).astype(bool)
# Per-column max + 1 of aa; the grid shape for the dimensionality-reduction helpers.
M = aa.max(axis=0) + 1
def _view1D(arr, dtype=np.uint32):
a = np.ascontiguousarray(arr, dtype=dtype)
dt = np.dtype([('', dtype)]*2)
return a.view(dt).ravel()
def _numpy_dimreduce(arr, M):
return np.ravel_multi_index(arr.T, M, order='C')
@njit(parallel=True)
def _numba_dot(arr, dimshape, len_arr, len_dimshape, su):
    """Add sum_j arr[i, j] * dimshape[j] to su[i] for every row i, in place.

    Rows are iterated with ``prange``; each iteration writes only its own
    ``su`` entry. Nothing is returned; the result is left in ``su``.
    """
    for row in prange(len_arr):
        for col in range(len_dimshape):
            su[row] = su[row] + arr[row, col] * dimshape[col]
def _numba_dimreduce(arr, M, dtype=np.int64):
    """Reduce each row of ``arr`` to one flat index via ``_numba_dot``.

    The per-column weights are the cumulative product of ``[1, M[0], ...,
    M[-2]]``. Not safe if the resulting index exceeds what ``dtype`` can
    hold.
    """
    weights = np.insert(M[:-1], 0, 1).cumprod()
    n_rows = len(arr)
    su = np.zeros(n_rows, dtype=dtype)
    # _numba_dot fills `su` in place.
    _numba_dot(arr, weights, n_rows, len(weights), su)
    return su
def isin2D(a, b, view=_view1D, search=np.isin):
    """Row-membership test built from a 1-D reduction and a 1-D search.

    ``view`` is called once on each array to reduce it to one dimension;
    ``search`` then receives the two reduced arrays and its result is
    returned unchanged.
    """
    reduced_a = view(a)
    reduced_b = view(b)
    return search(reduced_a, reduced_b)
def searchsort(A, B):
    """Return a boolean mask over 1-D ``A`` marking elements that occur in ``B``.

    ``B`` is sorted once and every element of ``A`` is located in it with
    ``np.searchsorted``; an element counts as present only if the value found
    at its insertion point is equal to it.

    The previous version searched ``B`` inside sorted ``A`` and marked the
    insertion point unconditionally. That flagged the wrong element (or
    raised ``IndexError``) for values of ``B`` absent from ``A``, and marked
    only one of several equal values in ``A``.
    """
    A = np.asarray(A)
    B = np.asarray(B)
    if B.size == 0:
        return np.zeros(len(A), dtype=bool)
    b_sorted = np.sort(B)
    pos = np.searchsorted(b_sorted, A)
    # Insertion point == len(b_sorted) means "larger than everything in B";
    # redirect it to a valid slot so the equality test below simply fails.
    pos[pos == len(b_sorted)] = 0
    return b_sorted[pos] == A
def isin2D_numpy_view(a, b):
    """Benchmark variant: structured row view + ``np.isin``."""
    return isin2D(a, b, view=_view1D, search=np.isin)


def isin2D_numpy_dimr_isin(a, b, M=M):
    """Benchmark variant: ``_numpy_dimreduce`` + ``np.isin``."""
    def reduce(arr):
        return _numpy_dimreduce(arr, M)
    return isin2D(a, b, view=reduce, search=np.isin)


def isin2D_numba_dimr_isin(a, b, M=M):
    """Benchmark variant: ``_numba_dimreduce`` + ``np.isin``."""
    def reduce(arr):
        return _numba_dimreduce(arr, M)
    return isin2D(a, b, view=reduce, search=np.isin)


def isin2D_numpy_dimr_searchsort(a, b, M=M):
    """Benchmark variant: ``_numpy_dimreduce`` + ``searchsort``."""
    def reduce(arr):
        return _numpy_dimreduce(arr, M)
    return isin2D(a, b, view=reduce, search=searchsort)


def isin2D_numba_dimr_searchsort(a, b, M=M):
    """Benchmark variant: ``_numba_dimreduce`` on reversed columns + ``searchsort``."""
    def reduce(arr):
        # Columns and M are both reversed before reduction.
        return _numba_dimreduce(arr[:, ::-1], M[::-1])
    return isin2D(a, b, view=reduce, search=searchsort)


def isin2D_numpy_dimr_in1D(a, b, M=M):
    """Benchmark variant: ``_numpy_dimreduce`` + ``np.in1d``."""
    def reduce(arr):
        return _numpy_dimreduce(arr, M)
    return isin2D(a, b, view=reduce, search=np.in1d)


def isin2D_numba_dimr_in1D(a, b, M=M):
    """Benchmark variant: ``_numba_dimreduce`` on reversed columns + ``np.in1d``."""
    def reduce(arr):
        # Columns and M are both reversed before reduction.
        return _numba_dimreduce(arr[:, ::-1], M[::-1])
    return isin2D(a, b, view=reduce, search=np.in1d)
# The np.isin wrappers are defined as isin2D_numpy_dimr_isin and
# isin2D_numba_dimr_isin; the shorter names listed here before were undefined
# and raised NameError.
fns = [isin2D_numpy_dimr_isin, isin2D_numba_dimr_isin, isin2D_numpy_view, isin2D_numpy_dimr_searchsort, isin2D_numba_dimr_searchsort, isin2D_numpy_dimr_in1D, isin2D_numba_dimr_in1D]
# One input pair per size, keyed by size in millions: the first s rows of aa
# and the subset of those rows selected by the boolean mask r.
in_ = {s/1000000: (aa[:s], aa[:s][r[:s]]) for s in sizes}
t = benchit.timings(fns, in_, multivar=True, input_name='Millions of events')
t.plot(logx=True, figsize=(12, 6), fontsize=14)
Alternative solutions
In case product of M = aa.max(axis=0) + 1
is not too large, you could allocate np.prod(M)
items for masking. Make sure you won’t blow up your RAM. A couple more ways:
def masking1D(a, b):
    """Return a mask over the rows of ``a`` that is True where the row is NOT in ``b``.

    Rows are flattened to indices on a grid of shape ``a.max(axis=0) + 1``.
    A boolean table of that many slots is allocated, slots for ``a`` are set,
    slots for ``b`` are cleared, and the table is read back at ``a``.

    The grid shape is now taken from the parameter ``a``; the original read
    the module-level ``aa`` instead. Every row of ``b`` must lie inside that
    grid, otherwise ``np.ravel_multi_index`` raises ``ValueError``.
    """
    M = a.max(axis=0) + 1
    A = np.ravel_multi_index(a.T, M, order='C')
    B = np.ravel_multi_index(b.T, M, order='C')
    # Only slots indexed by A are read back, and all of them are written first,
    # so the uninitialised remainder of the table does not matter.
    mask = np.empty(shape=np.prod(M), dtype=bool)
    mask[A] = 1
    mask[B] = 0
    return mask[A]
def masking2D(a, b):
    """Return a mask over the rows of ``a`` that is True where the row is NOT in ``b``.

    A boolean grid of shape ``a.max(axis=0) + 1`` is indexed directly by the
    columns of each array: cells for ``a`` are set, cells for ``b`` are
    cleared, and the grid is read back at the cells of ``a``.
    """
    grid_shape = a.max(axis=0) + 1
    grid = np.empty(shape=grid_shape, dtype=bool)
    cells_a = tuple(a.T)
    cells_b = tuple(b.T)
    grid[cells_a] = True
    grid[cells_b] = False
    return grid[cells_a]
# Small example: bb is built from six rows of aa selected by index.
aa = np.array([[988, 986], [299, 327], [756, 30], [327, 189], [988, 294], [64, 232], [11, 46], [38, 223], [933, 770], [141, 141]])
bb = aa[[1, 2, 3, 6, 7, 4]]
masking1D(aa, bb)
>>> array([ True, False, False, False, False, True, False, False, True, True])
masking2D(aa, bb)
>>> array([ True, False, False, False, False, True, False, False, True, True])
Runtime in case np.max(aa, axis=0) + 1 = (1000, 1000)
: