Efficient way to normalize a Scipy Sparse Matrix
Question:
I’d like to write a function that normalizes the rows of a large sparse matrix (such that they sum to one).
from pylab import *
import scipy.sparse as sp
def normalize(W):
    """Divide ``W`` by its axis-0 totals (the question's attempt).

    ``W.sum(0)`` reduces along axis 0, so the divisor holds one total per
    column.  Totals below 1e-6 are raised to 1e-6 before dividing.  For a
    sparse ``W`` the final division is the step that ends in
    ``NotImplementedError`` in the traceback quoted with the question.
    """
    col_totals = W.sum(0)
    too_small = col_totals < 1e-6
    col_totals[too_small] = 1e-6
    return W / col_totals[None, :]
# 10x10 sample: random values kept only where an independent random draw
# falls below 0.1; everything else is zero.  Stored in CSR format.
w = sp.csr_matrix((rand(10, 10) < 0.1) * rand(10, 10))
# Per the traceback quoted with the question, this is the call that fails.
w = normalize(w)
However this gives the following exception:
File "/usr/lib/python2.6/dist-packages/scipy/sparse/base.py", line 325, in __div__
return self.__truediv__(other)
File "/usr/lib/python2.6/dist-packages/scipy/sparse/compressed.py", line 230, in __truediv__
raise NotImplementedError
Are there any reasonably simple solutions? I have looked at this, but am still unclear on how to actually do the division.
Answers:
This has been implemented in scikit-learn sklearn.preprocessing.normalize.
from sklearn.preprocessing import normalize
# norm='l1' with axis=1 rescales each row by the sum of its absolute values.
w_normalized = normalize(w, norm='l1', axis=1)
axis=1 normalizes by rows; axis=0 normalizes by columns. Use the optional argument copy=False to modify the matrix in place.
Here is my solution:
- transpose A
- calculate the sum of each column
- form a diagonal matrix B holding the reciprocal of each sum
- A*B performs the normalization
- transpose C
import scipy.sparse as sp
import numpy as np
import math
# Demo: row-normalise A by transposing it, scaling its columns with a
# diagonal matrix of reciprocal column sums, and transposing back.
# Single-argument print() calls run on both Python 2 and Python 3.
minf = 0.0001  # column sums with magnitude below this are treated as zero

# 5x5 example: main diagonal 0..4, first superdiagonal 0..3.
A = sp.lil_matrix((5, 5))
b = np.arange(0, 5)
A.setdiag(b[:-1], k=1)
A.setdiag(b)
print(A.todense())

# After the transpose, the rows to normalise are the columns of A.
A = A.T
print(A.todense())

sum_of_col = A.sum(0).tolist()  # nested list holding a single row
print(sum_of_col)

# Reciprocal of every column sum; (near-)zero sums map to 0 so that empty
# columns stay empty instead of triggering a division by zero.
c = [
    0 if math.fabs(j) < minf else 1 / j
    for i in sum_of_col
    for j in i
]
print(c)

# Right-multiplying by a diagonal matrix scales column k by c[k].
B = sp.lil_matrix((5, 5))
B.setdiag(c)
print(B.todense())

C = A * B
print(C.todense())

# Undo the initial transpose: C is now the row-normalised input.
C = C.T
print(C.todense())
While Aaron's answer is correct, I implemented my own solution because I wanted to normalize with respect to the maximum of the absolute values, which sklearn does not offer. My method uses the nonzero entries and finds them in the csr_matrix.data array to replace values there quickly.
def normalize_sparse(csr_matrix):
    """Scale each row of a CSR matrix in place by its largest absolute value.

    Rows are located through ``indptr``, so every stored entry (explicitly
    stored zeros included) is attributed to the correct row and each row is
    visited exactly once.  Rows that store nothing, or whose stored values
    are all zero, are left untouched.

    Args:
        csr_matrix: SciPy matrix in CSR format; its ``data`` array is
            overwritten with the scaled values.

    Returns:
        None.  The matrix is modified in place.
    """
    data = csr_matrix.data
    indptr = csr_matrix.indptr
    for start, stop in zip(indptr[:-1], indptr[1:]):
        if start == stop:
            continue  # nothing stored in this row
        abs_max = np.max(np.abs(data[start:stop]))
        if abs_max != 0:
            # Plain assignment (not ``*=``) keeps item-assignment casting,
            # so non-float ``data`` arrays do not raise here.
            data[start:stop] = 1. / abs_max * data[start:stop]
In contrast to sunan’s solution, this method does not require any casting of the matrix into dense format (which could raise memory problems) and no matrix multiplications either. I tested the method on a sparse matrix of shape (35’000, 486’000) and it took ~ 18 seconds.
Without importing sklearn, converting to dense or multiplying matrices and by exploiting the data representation of csr matrices:
from scipy.sparse import isspmatrix_csr
def normalize(W):
    """Rescale the rows of a CSR matrix in place so each one sums to one.

    Works directly on ``W.data``; ``W.indptr`` gives the slice of stored
    values belonging to each row.  Rows whose stored values sum to zero
    (including rows that store nothing) are left as they are.

    Raises:
        ValueError: if ``W`` is not a SciPy CSR matrix.
    """
    if not isspmatrix_csr(W):
        raise ValueError('W must be in CSR format.')
    values = W.data
    bounds = W.indptr
    for row in range(W.shape[0]):
        lo, hi = bounds[row], bounds[row + 1]
        total = values[lo:hi].sum()
        if total == 0:
            continue
        values[lo:hi] /= total
Remember that W.indices is the array of column indices, W.data is the array of corresponding nonzero values, and W.indptr points to the row starts in indices and data.
You can add a numpy.abs() when taking the sum if you need the L1 norm, or use numpy.max() to normalize by the maximum value per row.
I found this as an elegant way of doing it without using inbuilt functions.
import scipy.sparse as sp
def normalize(W):
    """Return ``W`` with each row divided by its sum, via a diagonal scaling.

    The row sums are stored sparsely, so only nonzero sums are inverted;
    rows whose sum is zero get a scale factor of zero.  The result is the
    product ``diag(1 / row_sum) . W``; ``W`` itself is not modified.
    """
    # Column vector of row sums, kept sparse so zero sums are never inverted.
    totals = sp.csr_matrix(W.sum(axis=1))
    totals.data = 1 / totals.data
    # Flatten the reciprocals into a dense 1-D array for the diagonal.
    weights = totals.transpose().toarray()[0]
    return sp.diags(weights).dot(W)
My setup: Python 3.8.10, SciPy 1.5.4
Working just with csr_array, you can nowadays use its multiply and sum methods:
# w : scipy.sparse.csr_array (a csr_matrix works the same way)
# sum(axis=1) is 1-D for sparse arrays but an (n, 1) matrix for csr_matrix;
# reshaping to a column makes multiply() broadcast along rows in both cases.
row_sum = w.sum(axis=1).reshape(-1, 1)
row_sum[row_sum == 0] = 1 # to avoid divide by zero
# multiply() with a dense operand may return COO; convert back to CSR.
w = w.multiply(1. / row_sum).tocsr()
I’d like to write a function that normalizes the rows of a large sparse matrix (such that they sum to one).
from pylab import *
import scipy.sparse as sp
def normalize(W):
    """Divide ``W`` by its axis-0 totals (the question's attempt).

    ``W.sum(0)`` reduces along axis 0, so the divisor holds one total per
    column.  Totals below 1e-6 are raised to 1e-6 before dividing.  For a
    sparse ``W`` the final division is the step that ends in
    ``NotImplementedError`` in the traceback quoted with the question.
    """
    col_totals = W.sum(0)
    too_small = col_totals < 1e-6
    col_totals[too_small] = 1e-6
    return W / col_totals[None, :]
# 10x10 sample: random values kept only where an independent random draw
# falls below 0.1; everything else is zero.  Stored in CSR format.
w = sp.csr_matrix((rand(10, 10) < 0.1) * rand(10, 10))
# Per the traceback quoted with the question, this is the call that fails.
w = normalize(w)
However this gives the following exception:
File "/usr/lib/python2.6/dist-packages/scipy/sparse/base.py", line 325, in __div__
return self.__truediv__(other)
File "/usr/lib/python2.6/dist-packages/scipy/sparse/compressed.py", line 230, in __truediv__
raise NotImplementedError
Are there any reasonably simple solutions? I have looked at this, but am still unclear on how to actually do the division.
This has been implemented in scikit-learn sklearn.preprocessing.normalize.
from sklearn.preprocessing import normalize
# norm='l1' with axis=1 rescales each row by the sum of its absolute values.
w_normalized = normalize(w, norm='l1', axis=1)
axis=1 normalizes by rows; axis=0 normalizes by columns. Use the optional argument copy=False to modify the matrix in place.
Here is my solution:
- transpose A
- calculate the sum of each column
- form a diagonal matrix B holding the reciprocal of each sum
- A*B performs the normalization
- transpose C
import scipy.sparse as sp
import numpy as np
import math

# Demo: row-normalise A by transposing it, scaling its columns with a
# diagonal matrix of reciprocal column sums, and transposing back.
# Single-argument print() calls run on both Python 2 and Python 3.
minf = 0.0001  # column sums with magnitude below this are treated as zero

# 5x5 example: main diagonal 0..4, first superdiagonal 0..3.
A = sp.lil_matrix((5, 5))
b = np.arange(0, 5)
A.setdiag(b[:-1], k=1)
A.setdiag(b)
print(A.todense())

# After the transpose, the rows to normalise are the columns of A.
A = A.T
print(A.todense())

sum_of_col = A.sum(0).tolist()  # nested list holding a single row
print(sum_of_col)

# Reciprocal of every column sum; (near-)zero sums map to 0 so that empty
# columns stay empty instead of triggering a division by zero.
c = [
    0 if math.fabs(j) < minf else 1 / j
    for i in sum_of_col
    for j in i
]
print(c)

# Right-multiplying by a diagonal matrix scales column k by c[k].
B = sp.lil_matrix((5, 5))
B.setdiag(c)
print(B.todense())

C = A * B
print(C.todense())

# Undo the initial transpose: C is now the row-normalised input.
C = C.T
print(C.todense())
While Aaron's answer is correct, I implemented my own solution because I wanted to normalize with respect to the maximum of the absolute values, which sklearn does not offer. My method uses the nonzero entries and finds them in the csr_matrix.data array to replace values there quickly.
def normalize_sparse(csr_matrix):
    """Scale each row of a CSR matrix in place by its largest absolute value.

    Rows are located through ``indptr``, so every stored entry (explicitly
    stored zeros included) is attributed to the correct row and each row is
    visited exactly once.  Rows that store nothing, or whose stored values
    are all zero, are left untouched.

    Args:
        csr_matrix: SciPy matrix in CSR format; its ``data`` array is
            overwritten with the scaled values.

    Returns:
        None.  The matrix is modified in place.
    """
    data = csr_matrix.data
    indptr = csr_matrix.indptr
    for start, stop in zip(indptr[:-1], indptr[1:]):
        if start == stop:
            continue  # nothing stored in this row
        abs_max = np.max(np.abs(data[start:stop]))
        if abs_max != 0:
            # Plain assignment (not ``*=``) keeps item-assignment casting,
            # so non-float ``data`` arrays do not raise here.
            data[start:stop] = 1. / abs_max * data[start:stop]
In contrast to sunan’s solution, this method does not require any casting of the matrix into dense format (which could raise memory problems) and no matrix multiplications either. I tested the method on a sparse matrix of shape (35’000, 486’000) and it took ~ 18 seconds.
Without importing sklearn, converting to dense or multiplying matrices and by exploiting the data representation of csr matrices:
from scipy.sparse import isspmatrix_csr
def normalize(W):
    """Rescale the rows of a CSR matrix in place so each one sums to one.

    Works directly on ``W.data``; ``W.indptr`` gives the slice of stored
    values belonging to each row.  Rows whose stored values sum to zero
    (including rows that store nothing) are left as they are.

    Raises:
        ValueError: if ``W`` is not a SciPy CSR matrix.
    """
    if not isspmatrix_csr(W):
        raise ValueError('W must be in CSR format.')
    values = W.data
    bounds = W.indptr
    for row in range(W.shape[0]):
        lo, hi = bounds[row], bounds[row + 1]
        total = values[lo:hi].sum()
        if total == 0:
            continue
        values[lo:hi] /= total
Remember that W.indices is the array of column indices, W.data is the array of corresponding nonzero values, and W.indptr points to the row starts in indices and data.
You can add a numpy.abs() when taking the sum if you need the L1 norm, or use numpy.max() to normalize by the maximum value per row.
I found this as an elegant way of doing it without using inbuilt functions.
import scipy.sparse as sp
def normalize(W):
    """Return ``W`` with each row divided by its sum, via a diagonal scaling.

    The row sums are stored sparsely, so only nonzero sums are inverted;
    rows whose sum is zero get a scale factor of zero.  The result is the
    product ``diag(1 / row_sum) . W``; ``W`` itself is not modified.
    """
    # Column vector of row sums, kept sparse so zero sums are never inverted.
    totals = sp.csr_matrix(W.sum(axis=1))
    totals.data = 1 / totals.data
    # Flatten the reciprocals into a dense 1-D array for the diagonal.
    weights = totals.transpose().toarray()[0]
    return sp.diags(weights).dot(W)
My setup: Python 3.8.10, SciPy 1.5.4
Working just with csr_array, you can nowadays use its multiply and sum methods:
# w : scipy.sparse.csr_array (a csr_matrix works the same way)
# sum(axis=1) is 1-D for sparse arrays but an (n, 1) matrix for csr_matrix;
# reshaping to a column makes multiply() broadcast along rows in both cases.
row_sum = w.sum(axis=1).reshape(-1, 1)
row_sum[row_sum == 0] = 1 # to avoid divide by zero
# multiply() with a dense operand may return COO; convert back to CSR.
w = w.multiply(1. / row_sum).tocsr()