save numpy array in append mode
Question:
Is it possible to save a numpy array appending it to an already existing npy-file — something like np.save(filename,arr,mode='a')
?
I have several functions that have to iterate over the rows of a large array. I cannot create the array all at once because of memory constraints. To avoid creating the rows over and over again, I want to create each row once and save it to a file, appending it after the previous row. Later I could load the npy-file in mmap_mode, accessing the slices when needed.
Answers:
The built-in .npy
file format is perfectly fine for working with small datasets, without relying on external modules other than numpy
.
However, when you start having large amounts of data, the use of a file format, such as HDF5, designed to handle such datasets, is to be preferred [1].
For instance, below is a solution to save numpy
arrays in HDF5 with PyTables,
Step 1: Create an extendable EArray
storage
import tables
import numpy as np
filename = 'outarray.h5'
ROW_SIZE = 100
NUM_COLUMNS = 200  # NOTE: despite the name, this is the number of rows appended below

# Use the file as a context manager so it is flushed and closed even if
# creating the array or appending a row raises.
with tables.open_file(filename, mode='w') as f:
    atom = tables.Float64Atom()
    # A leading dimension of 0 marks the axis along which the EArray can grow.
    array_c = f.create_earray(f.root, 'data', atom, (0, ROW_SIZE))
    for idx in range(NUM_COLUMNS):
        x = np.random.rand(1, ROW_SIZE)
        array_c.append(x)
Step 2: Append rows to an existing dataset (if needed)
# Re-open in append mode; the with-block guarantees the new row is flushed
# and the handle is closed before the file is opened again elsewhere.
with tables.open_file(filename, mode='a') as f:
    f.root.data.append(x)
Step 3: Read back a subset of the data
# Open read-only and close the handle again once the slice has been printed.
with tables.open_file(filename, mode='r') as f:
    print(f.root.data[1:10,2:20]) # e.g. read from disk only this part of the dataset
For appending data to an already existing file using numpy.save, we should use:
# `file()` only exists in Python 2; `open()` works in both 2 and 3.
# .npy data is binary, so append in 'ab' mode, and let the with-block close
# the handle even if numpy.save raises.
with open(filename, 'ab') as f_handle:
    numpy.save(f_handle, arr)
I have checked that it works in python 2.7 and numpy 1.10.4
I have adapted the code from here, which talks about savetxt method.
.npy
files contain a header that holds the shape and dtype of the array. If you know what your resulting array looks like, you can write the header yourself and then write the data in chunks. E.g., here is the code for concatenating 2d matrices:
import numpy as np
import numpy.lib.format as fmt
def get_header(fnames):
    '''
    Build the .npy header dict for the row-wise concatenation of 2-D arrays.

    fnames: iterable of paths to .npy files, each holding a 2-D array with
    the same dtype and the same number of columns.

    Returns a dict with the keys 'descr', 'fortran_order' and 'shape', where
    shape is (total number of rows, common number of columns).

    Raises ValueError if no file is given, if an array is not 2-D, or if the
    dtypes or column counts disagree.
    '''
    dtype = None
    shape_0 = 0
    shape_1 = None
    for fname in fnames:
        # Memory-map so that only shape and dtype are consulted, not the data.
        m = np.load(fname, mmap_mode='r')
        if m.ndim != 2:
            raise ValueError(
                "%s: expected a 2-D array, got %d dimension(s)" % (fname, m.ndim))
        if dtype is None:
            # The first file defines what every later file must match.
            dtype = m.dtype
            shape_1 = m.shape[1]
        elif m.dtype != dtype:
            raise ValueError(
                "%s: dtype %s does not match %s" % (fname, m.dtype, dtype))
        elif m.shape[1] != shape_1:
            raise ValueError(
                "%s: %d columns, expected %d" % (fname, m.shape[1], shape_1))
        shape_0 += m.shape[0]
    if dtype is None:
        raise ValueError("at least one input file is required")
    return {
        'descr': fmt.dtype_to_descr(dtype),
        'fortran_order': False,
        'shape': (shape_0, shape_1),
    }
def concatenate(res_fname, input_fnames):
    '''
    Concatenate the 2-D arrays stored in input_fnames row-wise into res_fname.

    The combined header is written first, then each input array is loaded
    and its raw C-ordered bytes are written after it, one file at a time.

    res_fname: path of the .npy file to create (overwritten if it exists).
    input_fnames: iterable of paths to the .npy files to concatenate.
    '''
    # The names are traversed twice (header pass, data pass); materialize
    # them so a generator argument is not exhausted after the first pass.
    input_fnames = list(input_fnames)
    header = get_header(input_fnames)
    with open(res_fname, 'wb') as f:
        fmt.write_array_header_2_0(f, header)
        for fname in input_fnames:
            m = np.load(fname)
            # tobytes() replaces the deprecated ndarray.tostring().
            f.write(m.tobytes('C'))
If you need a more general solution (edit header in place while appending) you’ll have to resort to fseek
tricks like in [1].
Inspired by
[1]: https://mail.scipy.org/pipermail/numpy-discussion/2009-August/044570.html (doesn’t work out of the box)
[2]: https://docs.scipy.org/doc/numpy/neps/npy-format.html
[3]: https://github.com/numpy/numpy/blob/master/numpy/lib/format.py
You can try something like reading the existing file and then adding the new data:
import numpy as np
import os.path
x = np.arange(10) #[0 1 2 3 4 5 6 7 8 9]
# Start from an empty array of x's dtype when there is no file yet: a bare []
# is float64, so np.append would silently promote the saved integers to float.
y = np.load("save.npy") if os.path.isfile("save.npy") else np.empty(0, dtype=x.dtype) #get data if exist
np.save("save.npy", np.append(y, x)) #save the new
After running it twice:
print(np.load("save.npy")) #[0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9]
This is an expansion on Mohit Pandey’s answer showing a full save / load example. It was tested using Python 3.6 and Numpy 1.11.3.
from pathlib import Path
import numpy as np
import os
p = Path('temp.npy')

# Append two independent .npy records to the end of the same file.
with p.open(mode='ab') as f:
    np.save(f, np.zeros(2))
    np.save(f, np.ones(2))

# Read the records back one at a time, stacking each below the previous
# ones, until the read position reaches the size of the file.
with p.open(mode='rb') as f:
    fsz = os.fstat(f.fileno()).st_size
    out = np.load(f)
    while fsz > f.tell():
        nxt = np.load(f)
        out = np.vstack((out, nxt))
Result: out == array([[ 0., 0.], [ 1., 1.]])
The following is based upon PaxRomana99’s answer.
It creates a class that you can use to save and load the arrays.
Ideally, one would also change the header of the npy file every time a new array is added in order to modify the description of the shape (see here for the description of the header)
import numpy as np
import pickle
from pathlib import Path
import os
class npyAppendableFile():
    '''
    A file made of several np.save records written one after another.

    Each write() appends a complete .npy record (header plus data), so the
    header at the start of the file only describes the first record.
    load() reads every record and joins them; update_content() rewrites the
    file as a single record that plain np.load() can read in full.
    '''

    def __init__(self, fname, newfile=True):
        '''
        Creates a new instance of the appendable filetype
        If newfile is True, recreate the file even if already exists
        '''
        self.fname = Path(fname)
        if newfile:
            # Opening in "wb" truncates (or creates) the file; the with-block
            # closes the handle, so no explicit close() is needed.
            with open(self.fname, "wb"):
                pass

    def write(self, data):
        '''
        append a new array to the file
        note that this will not change the header
        '''
        with open(self.fname, "ab") as fh:
            np.save(fh, data)

    def load(self, axis=2):
        '''
        Load the whole file, returning all the arrays that were consecutively
        saved on top of each other
        axis defines how the arrays should be concatenated

        If the file holds a single record it is returned as is, without
        concatenation.
        '''
        parts = []
        with open(self.fname, "rb") as fh:
            fsz = os.fstat(fh.fileno()).st_size
            parts.append(np.load(fh))
            # Each np.load consumes exactly one record; keep reading until
            # the position reaches the end of the file.
            while fh.tell() < fsz:
                parts.append(np.load(fh))
        if len(parts) == 1:
            return parts[0]
        # Join once at the end instead of re-copying the growing result
        # for every record.
        return np.concatenate(parts, axis=axis)

    def update_content(self):
        '''
        Rewrite the file as a single record holding the concatenation of all
        records (using the default axis of load()), so that a regular
        np.load() returns the full content.
        '''
        content = self.load()
        with open(self.fname, "wb") as fh:
            np.save(fh, content)

    @property
    def _dtype(self):
        '''dtype of the concatenated content (loads the whole file)'''
        return self.load().dtype

    @property
    def _actual_shape(self):
        '''shape of the concatenated content (loads the whole file)'''
        return self.load().shape

    @property
    def header(self):
        '''
        Reads the header of the npy file

        Returns (version, dict) where the dict has the keys 'descr',
        'fortran_order' and 'shape' as stored in the first record.
        '''
        fmt = np.lib.format
        with open(self.fname, "rb") as fh:
            version = fmt.read_magic(fh)
            # Prefer the public readers; only fall back to the private
            # helper for format versions that have no public reader.
            if version == (1, 0):
                shape, fortran, dtype = fmt.read_array_header_1_0(fh)
            elif version == (2, 0):
                shape, fortran, dtype = fmt.read_array_header_2_0(fh)
            else:
                shape, fortran, dtype = fmt._read_array_header(fh, version)
        return version, {'descr': dtype,
                         'fortran_order' : fortran,
                         'shape' : shape}
# Three random blocks that agree on the first two axes and differ on the last.
arr_a, arr_b, arr_c = (np.random.rand(5, 40, depth) for depth in (10, 7, 3))

# Start from an empty file and append the blocks one record at a time.
f = npyAppendableFile("testfile.npy", newfile=True)
f.write(arr_a)
f.write(arr_b)
f.write(arr_c)

out = f.load()
print(f.header)          # header as stored at the start of the file
print(f._actual_shape)   # shape of everything stored in the file

# after update we can load with regular np.load()
f.update_content()
new_content = np.load('testfile.npy')
print(new_content.shape)
I made a library to append to Numpy .npy
files. Here an excerpt from
https://pypi.org/project/npy-append-array
NpyAppendArray
Create Numpy .npy
files by appending on the growth axis (0 for C order, -1
for Fortran order). It behaves like numpy.concatenate
with the difference
that the result is stored out-of-memory in a .npy
file and can be reused for
further appending. After creation, the file can then be read with memory
mapping (e.g. by adding mmap_mode="r"
) which altogether allows to create and
read files (optionally) larger than the machine’s main memory.
Installation
conda install -c conda-forge npy-append-array
or
pip install npy-append-array
Example
from npy_append_array import NpyAppendArray
import numpy as np
# Two small integer matrices with the same number of columns.
arr1 = np.array([[1, 2],
                 [3, 4]])
arr2 = np.array([[1, 2],
                 [3, 4],
                 [5, 6]])

filename = 'out.npy'

# Append the matrices along the first axis; arr2 is appended twice.
with NpyAppendArray(filename) as npaa:
    npaa.append(arr1)
    npaa.append(arr2)
    npaa.append(arr2)

# Memory-map the result instead of reading it fully into memory.
data = np.load(filename, mmap_mode="r")
print(data)
Implementation Details
NpyAppendArray contains a modified, partial version of format.py
from the
Numpy package. It ensures that array headers are created with 21
(=len(str(8*2**64-1))
) bytes of spare space. This makes it possible to fit an array of
maxed out dimensions (for a 64 bit machine) without increasing the array
header size. The header can therefore simply be rewritten as we append data to the
end of the .npy
file.
Is it possible to save a numpy array appending it to an already existing npy-file — something like np.save(filename,arr,mode='a')
?
I have several functions that have to iterate over the rows of a large array. I cannot create the array all at once because of memory constraints. To avoid creating the rows over and over again, I want to create each row once and save it to a file, appending it after the previous row. Later I could load the npy-file in mmap_mode, accessing the slices when needed.
The built-in .npy
file format is perfectly fine for working with small datasets, without relying on external modules other than numpy
.
However, when you start having large amounts of data, the use of a file format, such as HDF5, designed to handle such datasets, is to be preferred [1].
For instance, below is a solution to save numpy
arrays in HDF5 with PyTables,
Step 1: Create an extendable EArray
storage
import tables
import numpy as np
filename = 'outarray.h5'
ROW_SIZE = 100
NUM_COLUMNS = 200  # NOTE: despite the name, this is the number of rows appended below

# Use the file as a context manager so it is flushed and closed even if
# creating the array or appending a row raises.
with tables.open_file(filename, mode='w') as f:
    atom = tables.Float64Atom()
    # A leading dimension of 0 marks the axis along which the EArray can grow.
    array_c = f.create_earray(f.root, 'data', atom, (0, ROW_SIZE))
    for idx in range(NUM_COLUMNS):
        x = np.random.rand(1, ROW_SIZE)
        array_c.append(x)
Step 2: Append rows to an existing dataset (if needed)
# Re-open in append mode; the with-block guarantees the new row is flushed
# and the handle is closed before the file is opened again elsewhere.
with tables.open_file(filename, mode='a') as f:
    f.root.data.append(x)
Step 3: Read back a subset of the data
# Open read-only and close the handle again once the slice has been printed.
with tables.open_file(filename, mode='r') as f:
    print(f.root.data[1:10,2:20]) # e.g. read from disk only this part of the dataset
For appending data to an already existing file using numpy.save, we should use:
# `file()` only exists in Python 2; `open()` works in both 2 and 3.
# .npy data is binary, so append in 'ab' mode, and let the with-block close
# the handle even if numpy.save raises.
with open(filename, 'ab') as f_handle:
    numpy.save(f_handle, arr)
I have checked that it works in python 2.7 and numpy 1.10.4
I have adapted the code from here, which talks about savetxt method.
.npy
files contain a header that holds the shape and dtype of the array. If you know what your resulting array looks like, you can write the header yourself and then write the data in chunks. E.g., here is the code for concatenating 2d matrices:
import numpy as np
import numpy.lib.format as fmt
def get_header(fnames):
    '''
    Build the .npy header dict for the row-wise concatenation of 2-D arrays.

    fnames: iterable of paths to .npy files, each holding a 2-D array with
    the same dtype and the same number of columns.

    Returns a dict with the keys 'descr', 'fortran_order' and 'shape', where
    shape is (total number of rows, common number of columns).

    Raises ValueError if no file is given, if an array is not 2-D, or if the
    dtypes or column counts disagree.
    '''
    dtype = None
    shape_0 = 0
    shape_1 = None
    for fname in fnames:
        # Memory-map so that only shape and dtype are consulted, not the data.
        m = np.load(fname, mmap_mode='r')
        if m.ndim != 2:
            raise ValueError(
                "%s: expected a 2-D array, got %d dimension(s)" % (fname, m.ndim))
        if dtype is None:
            # The first file defines what every later file must match.
            dtype = m.dtype
            shape_1 = m.shape[1]
        elif m.dtype != dtype:
            raise ValueError(
                "%s: dtype %s does not match %s" % (fname, m.dtype, dtype))
        elif m.shape[1] != shape_1:
            raise ValueError(
                "%s: %d columns, expected %d" % (fname, m.shape[1], shape_1))
        shape_0 += m.shape[0]
    if dtype is None:
        raise ValueError("at least one input file is required")
    return {
        'descr': fmt.dtype_to_descr(dtype),
        'fortran_order': False,
        'shape': (shape_0, shape_1),
    }
def concatenate(res_fname, input_fnames):
    '''
    Concatenate the 2-D arrays stored in input_fnames row-wise into res_fname.

    The combined header is written first, then each input array is loaded
    and its raw C-ordered bytes are written after it, one file at a time.

    res_fname: path of the .npy file to create (overwritten if it exists).
    input_fnames: iterable of paths to the .npy files to concatenate.
    '''
    # The names are traversed twice (header pass, data pass); materialize
    # them so a generator argument is not exhausted after the first pass.
    input_fnames = list(input_fnames)
    header = get_header(input_fnames)
    with open(res_fname, 'wb') as f:
        fmt.write_array_header_2_0(f, header)
        for fname in input_fnames:
            m = np.load(fname)
            # tobytes() replaces the deprecated ndarray.tostring().
            f.write(m.tobytes('C'))
If you need a more general solution (edit header in place while appending) you’ll have to resort to fseek
tricks like in [1].
Inspired by
[1]: https://mail.scipy.org/pipermail/numpy-discussion/2009-August/044570.html (doesn’t work out of the box)
[2]: https://docs.scipy.org/doc/numpy/neps/npy-format.html
[3]: https://github.com/numpy/numpy/blob/master/numpy/lib/format.py
You can try something like reading the existing file and then adding the new data:
import numpy as np
import os.path
x = np.arange(10) #[0 1 2 3 4 5 6 7 8 9]
# Start from an empty array of x's dtype when there is no file yet: a bare []
# is float64, so np.append would silently promote the saved integers to float.
y = np.load("save.npy") if os.path.isfile("save.npy") else np.empty(0, dtype=x.dtype) #get data if exist
np.save("save.npy", np.append(y, x)) #save the new
After running it twice:
print(np.load("save.npy")) #[0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9]
This is an expansion on Mohit Pandey’s answer showing a full save / load example. It was tested using Python 3.6 and Numpy 1.11.3.
from pathlib import Path
import numpy as np
import os
p = Path('temp.npy')

# Append two independent .npy records to the end of the same file.
with p.open(mode='ab') as f:
    np.save(f, np.zeros(2))
    np.save(f, np.ones(2))

# Read the records back one at a time, stacking each below the previous
# ones, until the read position reaches the size of the file.
with p.open(mode='rb') as f:
    fsz = os.fstat(f.fileno()).st_size
    out = np.load(f)
    while fsz > f.tell():
        nxt = np.load(f)
        out = np.vstack((out, nxt))
Result: out == array([[ 0., 0.], [ 1., 1.]])
The following is based upon PaxRomana99’s answer.
It creates a class that you can use to save and load the arrays.
Ideally, one would also change the header of the npy file every time a new array is added in order to modify the description of the shape (see here for the description of the header)
import numpy as np
import pickle
from pathlib import Path
import os
class npyAppendableFile():
    '''
    A file made of several np.save records written one after another.

    Each write() appends a complete .npy record (header plus data), so the
    header at the start of the file only describes the first record.
    load() reads every record and joins them; update_content() rewrites the
    file as a single record that plain np.load() can read in full.
    '''

    def __init__(self, fname, newfile=True):
        '''
        Creates a new instance of the appendable filetype
        If newfile is True, recreate the file even if already exists
        '''
        self.fname = Path(fname)
        if newfile:
            # Opening in "wb" truncates (or creates) the file; the with-block
            # closes the handle, so no explicit close() is needed.
            with open(self.fname, "wb"):
                pass

    def write(self, data):
        '''
        append a new array to the file
        note that this will not change the header
        '''
        with open(self.fname, "ab") as fh:
            np.save(fh, data)

    def load(self, axis=2):
        '''
        Load the whole file, returning all the arrays that were consecutively
        saved on top of each other
        axis defines how the arrays should be concatenated

        If the file holds a single record it is returned as is, without
        concatenation.
        '''
        parts = []
        with open(self.fname, "rb") as fh:
            fsz = os.fstat(fh.fileno()).st_size
            parts.append(np.load(fh))
            # Each np.load consumes exactly one record; keep reading until
            # the position reaches the end of the file.
            while fh.tell() < fsz:
                parts.append(np.load(fh))
        if len(parts) == 1:
            return parts[0]
        # Join once at the end instead of re-copying the growing result
        # for every record.
        return np.concatenate(parts, axis=axis)

    def update_content(self):
        '''
        Rewrite the file as a single record holding the concatenation of all
        records (using the default axis of load()), so that a regular
        np.load() returns the full content.
        '''
        content = self.load()
        with open(self.fname, "wb") as fh:
            np.save(fh, content)

    @property
    def _dtype(self):
        '''dtype of the concatenated content (loads the whole file)'''
        return self.load().dtype

    @property
    def _actual_shape(self):
        '''shape of the concatenated content (loads the whole file)'''
        return self.load().shape

    @property
    def header(self):
        '''
        Reads the header of the npy file

        Returns (version, dict) where the dict has the keys 'descr',
        'fortran_order' and 'shape' as stored in the first record.
        '''
        fmt = np.lib.format
        with open(self.fname, "rb") as fh:
            version = fmt.read_magic(fh)
            # Prefer the public readers; only fall back to the private
            # helper for format versions that have no public reader.
            if version == (1, 0):
                shape, fortran, dtype = fmt.read_array_header_1_0(fh)
            elif version == (2, 0):
                shape, fortran, dtype = fmt.read_array_header_2_0(fh)
            else:
                shape, fortran, dtype = fmt._read_array_header(fh, version)
        return version, {'descr': dtype,
                         'fortran_order' : fortran,
                         'shape' : shape}
# Three random blocks that agree on the first two axes and differ on the last.
arr_a, arr_b, arr_c = (np.random.rand(5, 40, depth) for depth in (10, 7, 3))

# Start from an empty file and append the blocks one record at a time.
f = npyAppendableFile("testfile.npy", newfile=True)
f.write(arr_a)
f.write(arr_b)
f.write(arr_c)

out = f.load()
print(f.header)          # header as stored at the start of the file
print(f._actual_shape)   # shape of everything stored in the file

# after update we can load with regular np.load()
f.update_content()
new_content = np.load('testfile.npy')
print(new_content.shape)
I made a library to append to Numpy .npy
files. Here an excerpt from
https://pypi.org/project/npy-append-array
NpyAppendArray
Create Numpy .npy
files by appending on the growth axis (0 for C order, -1
for Fortran order). It behaves like numpy.concatenate
with the difference
that the result is stored out-of-memory in a .npy
file and can be reused for
further appending. After creation, the file can then be read with memory
mapping (e.g. by adding mmap_mode="r"
) which altogether allows to create and
read files (optionally) larger than the machine’s main memory.
Installation
conda install -c conda-forge npy-append-array
or
pip install npy-append-array
Example
from npy_append_array import NpyAppendArray
import numpy as np
# Two small integer matrices with the same number of columns.
arr1 = np.array([[1, 2],
                 [3, 4]])
arr2 = np.array([[1, 2],
                 [3, 4],
                 [5, 6]])

filename = 'out.npy'

# Append the matrices along the first axis; arr2 is appended twice.
with NpyAppendArray(filename) as npaa:
    npaa.append(arr1)
    npaa.append(arr2)
    npaa.append(arr2)

# Memory-map the result instead of reading it fully into memory.
data = np.load(filename, mmap_mode="r")
print(data)
Implementation Details
NpyAppendArray contains a modified, partial version of format.py
from the
Numpy package. It ensures that array headers are created with 21
(=len(str(8*2**64-1))
) bytes of spare space. This makes it possible to fit an array of
maxed out dimensions (for a 64 bit machine) without increasing the array
header size. The header can therefore simply be rewritten as we append data to the
end of the .npy
file.