Building up an array in numpy/scipy by iteration in Python?
Question:
Often, I am building an array by iterating through some data, e.g.:
# Collect the values in a plain Python list first.
my_array = []
for n in range(1000):
    # do operation, get value
    my_array.append(value)
# cast to array (a single conversion, after the loop has finished)
my_array = array(my_array)
I find that I have to first build a list and then cast it (using “array”) to an array. Is there a way around this? All these casting calls clutter the code… how can I iteratively build up “my_array”, with it being an array from the start?
Answers:
The recommended way to do this is to preallocate the array before the loop and use slicing and indexing to insert the values:
# Preallocate once, before the loop. The shape is passed as ONE tuple
# argument; adjust it to the dimensions of your own data.
my_array = numpy.zeros((1, 1000))
for i in range(1000):
    # The three assignments below are alternatives: keep the one that
    # matches how your array is laid out.
    # for 1D array
    my_array[i] = functionToGetValue(i)
    # OR to fill an entire row
    my_array[i, :] = functionToGetValue(i)
    # or to fill an entire column
    my_array[:, i] = functionToGetValue(i)
numpy does provide an array.resize() method, but this will be far slower due to the cost of reallocating memory inside a loop. If you must have flexibility, then I’m afraid the only way is to create an array from a list.
EDIT: If you are worried that you’re allocating too much memory for your data, I’d use the method above to over-allocate and then, when the loop is done, lop off the unused bits of the array using array.resize(). This will be far, far faster than constantly reallocating the array inside the loop.
EDIT: In response to @user248237’s comment, assuming you know any one dimension of the array (for simplicity’s sake):
# Start with room for 10000 rows; SOMECONSTANT is the known column count.
my_array = numpy.zeros((10000, SOMECONSTANT))
for i in range(someVariable):
    if i >= my_array.shape[0]:
        # Out of room: double the row count so resizes stay rare.
        my_array.resize((my_array.shape[0] * 2, SOMECONSTANT))
    # Write row i only (a bare `i:` would overwrite every later row too).
    my_array[i, :] = someFunction()
# lop off extra bits with resize() here
The general principle is “allocate more than you think you’ll need, and if things change, resize the array as few times as possible”. Doubling the size could be thought of as excessive, but in fact this is the method used by several data structures in several standard libraries in other languages (java.util.Vector does this by default, for example; I think several implementations of std::vector in C++ do this as well).
If I understand your question correctly, this should do what you want:
# Example input: 20 random integers arranged as 5 rows by 4 columns.
ax = NP.random.randint(low=10, high=99, size=20).reshape(5, 4)


# The operation to run on each 1-D slice: the square of its sum.
def fnx(x):
    return NP.sum(x) ** 2


# Run fnx down axis 0 (one result per column), shape the results as a
# single 1x4 row, then stack that row underneath the original array.
new_row = NP.apply_along_axis(fnx, 0, ax).reshape(1, 4)
ax = NP.vstack((ax, new_row))
NumPy provides a ‘fromiter’ method:
def myfunc(n):
    """Yield the squares of 0, 1, ..., n - 1, one value at a time."""
    for i in range(n):
        yield i**2


# fromiter reads the generator straight into a 1-D array of the given dtype.
np.fromiter(myfunc(5), dtype=int)
which yields
array([ 0, 1, 4, 9, 16])
Building up the array using list.append() seems to be much faster than any kind of dynamic resizing of a NumPy array:
import numpy as np
import timeit
class ndarray_builder:
    """Build a 2-D array row by row, growing it in fixed-size steps.

    The backing array ``arr`` starts with ``capacity_step`` rows and gains
    another ``capacity_step`` rows whenever it is full. ``close()`` trims it
    to the rows actually appended; using the builder as a context manager
    calls ``close()`` automatically on exit.

    Args:
        capacity_step: Number of rows allocated initially and added on each
            growth step. Must be at least 1.
        column_count: Number of columns in every row.

    Raises:
        ValueError: If ``capacity_step`` is smaller than 1.
    """

    def __init__(self, capacity_step, column_count):
        # A step of 0 could never make room for a row; fail here rather
        # than with an IndexError on the first append().
        if capacity_step < 1:
            raise ValueError(
                f"capacity_step must be at least 1, got {capacity_step!r}"
            )
        self.capacity_step = capacity_step
        self.column_count = column_count
        self.arr = np.empty((self.capacity_step, self.column_count))
        # Index of the next row to write == number of rows appended so far.
        self.row_pointer = 0

    def __enter__(self):
        return self

    def __exit__(self, type, value, traceback):
        # Trim the array whether or not the with-body raised.
        self.close()

    def append(self, row):
        """Store ``row`` in the next free row, growing the array if full."""
        if self.row_pointer == self.arr.shape[0]:
            grown_rows = self.arr.shape[0] + self.capacity_step
            self.arr.resize((grown_rows, self.column_count))
        self.arr[self.row_pointer] = row
        self.row_pointer += 1

    def close(self):
        """Shrink ``arr`` so it holds exactly the rows that were appended."""
        self.arr.resize((self.row_pointer, self.column_count))
def with_builder():
    """Benchmark case: append 20000 two-column rows through ndarray_builder.

    Returns:
        The builder's array after it has been trimmed by close().
    """
    with ndarray_builder(1000, 2) as b:
        for i in range(10000):
            b.append((1, 2))
            b.append((3, 4))
    # Read b.arr only after the with block has exited, i.e. after close()
    # has cut the array down to the rows that were appended.
    return b.arr
def without_builder():
    """Benchmark case: append 20000 rows to a list, then convert once.

    Returns:
        A (20000, 2) array built from the accumulated list of tuples.
    """
    b = []
    # Kept as an explicit append loop on purpose: list.append() is the
    # operation this benchmark measures.
    for i in range(10000):
        b.append((1, 2))
        b.append((3, 4))
    return np.array(b)
# Run each variant 1000 times and print the total elapsed seconds reported
# by timeit for each.
print(f'without_builder: {timeit.timeit(without_builder, number=1000)}')
print(f'with_builder: {timeit.timeit(with_builder, number=1000)}')
without_builder: 3.4763141250000444
with_builder: 7.960973499999909
Often, I am building an array by iterating through some data, e.g.:
# Collect the values in a plain Python list first.
my_array = []
for n in range(1000):
    # do operation, get value
    my_array.append(value)
# cast to array (a single conversion, after the loop has finished)
my_array = array(my_array)
I find that I have to first build a list and then cast it (using “array”) to an array. Is there a way around this? All these casting calls clutter the code… how can I iteratively build up “my_array”, with it being an array from the start?
The recommended way to do this is to preallocate the array before the loop and use slicing and indexing to insert the values:
# Preallocate once, before the loop. The shape is passed as ONE tuple
# argument; adjust it to the dimensions of your own data.
my_array = numpy.zeros((1, 1000))
for i in range(1000):
    # The three assignments below are alternatives: keep the one that
    # matches how your array is laid out.
    # for 1D array
    my_array[i] = functionToGetValue(i)
    # OR to fill an entire row
    my_array[i, :] = functionToGetValue(i)
    # or to fill an entire column
    my_array[:, i] = functionToGetValue(i)
numpy does provide an array.resize() method, but this will be far slower due to the cost of reallocating memory inside a loop. If you must have flexibility, then I’m afraid the only way is to create an array from a list.
EDIT: If you are worried that you’re allocating too much memory for your data, I’d use the method above to over-allocate and then, when the loop is done, lop off the unused bits of the array using array.resize(). This will be far, far faster than constantly reallocating the array inside the loop.
EDIT: In response to @user248237’s comment, assuming you know any one dimension of the array (for simplicity’s sake):
# Start with room for 10000 rows; SOMECONSTANT is the known column count.
my_array = numpy.zeros((10000, SOMECONSTANT))
for i in range(someVariable):
    if i >= my_array.shape[0]:
        # Out of room: double the row count so resizes stay rare.
        my_array.resize((my_array.shape[0] * 2, SOMECONSTANT))
    # Write row i only (a bare `i:` would overwrite every later row too).
    my_array[i, :] = someFunction()
# lop off extra bits with resize() here
The general principle is “allocate more than you think you’ll need, and if things change, resize the array as few times as possible”. Doubling the size could be thought of as excessive, but in fact this is the method used by several data structures in several standard libraries in other languages (java.util.Vector does this by default, for example; I think several implementations of std::vector in C++ do this as well).
If I understand your question correctly, this should do what you want:
# Example input: 20 random integers arranged as 5 rows by 4 columns.
ax = NP.random.randint(low=10, high=99, size=20).reshape(5, 4)


# The operation to run on each 1-D slice: the square of its sum.
def fnx(x):
    return NP.sum(x) ** 2


# Run fnx down axis 0 (one result per column), shape the results as a
# single 1x4 row, then stack that row underneath the original array.
new_row = NP.apply_along_axis(fnx, 0, ax).reshape(1, 4)
ax = NP.vstack((ax, new_row))
NumPy provides a ‘fromiter’ method:
def myfunc(n):
    """Yield the squares of 0, 1, ..., n - 1, one value at a time."""
    for i in range(n):
        yield i**2


# fromiter reads the generator straight into a 1-D array of the given dtype.
np.fromiter(myfunc(5), dtype=int)
which yields
array([ 0, 1, 4, 9, 16])
Building up the array using list.append() seems to be much faster than any kind of dynamic resizing of a NumPy array:
import numpy as np
import timeit
class ndarray_builder:
    """Build a 2-D array row by row, growing it in fixed-size steps.

    The backing array ``arr`` starts with ``capacity_step`` rows and gains
    another ``capacity_step`` rows whenever it is full. ``close()`` trims it
    to the rows actually appended; using the builder as a context manager
    calls ``close()`` automatically on exit.

    Args:
        capacity_step: Number of rows allocated initially and added on each
            growth step. Must be at least 1.
        column_count: Number of columns in every row.

    Raises:
        ValueError: If ``capacity_step`` is smaller than 1.
    """

    def __init__(self, capacity_step, column_count):
        # A step of 0 could never make room for a row; fail here rather
        # than with an IndexError on the first append().
        if capacity_step < 1:
            raise ValueError(
                f"capacity_step must be at least 1, got {capacity_step!r}"
            )
        self.capacity_step = capacity_step
        self.column_count = column_count
        self.arr = np.empty((self.capacity_step, self.column_count))
        # Index of the next row to write == number of rows appended so far.
        self.row_pointer = 0

    def __enter__(self):
        return self

    def __exit__(self, type, value, traceback):
        # Trim the array whether or not the with-body raised.
        self.close()

    def append(self, row):
        """Store ``row`` in the next free row, growing the array if full."""
        if self.row_pointer == self.arr.shape[0]:
            grown_rows = self.arr.shape[0] + self.capacity_step
            self.arr.resize((grown_rows, self.column_count))
        self.arr[self.row_pointer] = row
        self.row_pointer += 1

    def close(self):
        """Shrink ``arr`` so it holds exactly the rows that were appended."""
        self.arr.resize((self.row_pointer, self.column_count))
def with_builder():
    """Benchmark case: append 20000 two-column rows through ndarray_builder.

    Returns:
        The builder's array after it has been trimmed by close().
    """
    with ndarray_builder(1000, 2) as b:
        for i in range(10000):
            b.append((1, 2))
            b.append((3, 4))
    # Read b.arr only after the with block has exited, i.e. after close()
    # has cut the array down to the rows that were appended.
    return b.arr
def without_builder():
    """Benchmark case: append 20000 rows to a list, then convert once.

    Returns:
        A (20000, 2) array built from the accumulated list of tuples.
    """
    b = []
    # Kept as an explicit append loop on purpose: list.append() is the
    # operation this benchmark measures.
    for i in range(10000):
        b.append((1, 2))
        b.append((3, 4))
    return np.array(b)
# Run each variant 1000 times and print the total elapsed seconds reported
# by timeit for each.
print(f'without_builder: {timeit.timeit(without_builder, number=1000)}')
print(f'with_builder: {timeit.timeit(with_builder, number=1000)}')
without_builder: 3.4763141250000444
with_builder: 7.960973499999909