How to Save a Tensorflow Dataset

Question:

As the title says, I'm trying to save a TensorSliceDataset object to a file. According to TensorFlow's website, the tf.data.Dataset class has a save function, but it does not appear to be implemented for TensorSliceDataset objects. Pickling did not work for me either.

Example code

import tensorflow as tf
# Minimal reproduction: build a small dataset from a 1-D tensor of ten integers.
t = tf.range(10)
ds = tf.data.Dataset.from_tensor_slices(t)
# This is the call that fails with the AttributeError quoted below.
ds.save()

returns error: AttributeError: 'TensorSliceDataset' object has no attribute 'save'

Asked By: ablanch5

||

Answers:

You can convert the TensorSliceDataset to a NumPy array and then save it.

Sample code:

import numpy as np

# `dataset` is the tf.data.Dataset to save (for example, `ds` from the question).
# Every element is materialized in memory first, so this only suits datasets
# that fit in RAM and whose elements all share the same shape.
elements = list(dataset.as_numpy_iterator())
np.save('data.npy', np.array(elements))
Answered By: maciek97x

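You can simply save the dataset with the built-in save() method (which also accepts checkpoint arguments), or write each element to its own file and list those files in a CSV.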

Sample: Frames or really

import os
from os.path import exists

import numpy as np
import pandas as pd
import tensorflow as tf

# ---------------------------------------------------------------
# Devices
# Sample listing from the original author's machine:
# [PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]
# ---------------------------------------------------------------
# NOTE(review): the author insists on a GPU here; the saving steps below do not
# appear to depend on one, so drop this check to run on a CPU-only machine.
physical_devices = tf.config.experimental.list_physical_devices('GPU')
assert len(physical_devices) > 0, "Not enough GPU hardware devices available"
physical_devices = tf.config.experimental.list_physical_devices()
print(physical_devices)

# ---------------------------------------------------------------
# Variables
# ---------------------------------------------------------------
BATCH_SIZE = 1
IMG_SIZE = (32, 32)
# Raw strings keep the Windows backslashes literal; without the `r` prefix,
# "\t" in "F:\temp" would be read as a tab character.
new_dataset_folder = r"F:\temp\Python\excel"

PATH = r'F:\datasets\downloads\cats_name'
train_dir = os.path.join(PATH, 'train')
validation_dir = os.path.join(PATH, 'validation')

# Load the training images as batches of (images, labels).
train_dataset = tf.keras.utils.image_dataset_from_directory(
    train_dir,
    shuffle=True,
    batch_size=BATCH_SIZE,
    image_size=IMG_SIZE,
)

class_names = train_dataset.class_names

print('class_names: ' + str(class_names))
print(train_dataset)

# ---------------------------------------------------------------
# 1. Save the dataset using .save()
# ---------------------------------------------------------------
path = r"F:\temp\saved_dataset"
train_dataset.save(
    path, compression=None, shard_func=None, checkpoint_args=None
)

# ---------------------------------------------------------------
# 2. Save each batch to its own file and index the files with pandas
# ---------------------------------------------------------------
# Make sure the output folder exists before writing into it.
os.makedirs(new_dataset_folder, exist_ok=True)

dataset = {
    "image": [],
    "label": [],
}

for file_order, data in enumerate(train_dataset):
    images, labels = data[0], data[1]
    # With BATCH_SIZE = 1 each batch holds exactly one image, so the first
    # label is the label of the whole batch.
    label = str(int(labels[0]))
    # Name files by their running index so that two images with the same
    # label never overwrite each other, and so the CSV points at the file
    # that is actually written.
    file_path = os.path.join(new_dataset_folder, str(file_order) + ".npz")
    # Store the raw array; str(tensor) would abbreviate large tensors and
    # could not be loaded back.
    np.savez(file_path, image=images.numpy())
    dataset["image"].append(file_path)
    dataset["label"].append(label)

# Write the file/label index next to the saved arrays.
df = pd.DataFrame(dataset)
df.to_csv(os.path.join(new_dataset_folder, "train.csv"), index=False)

Sample

Answered By: Jirayu Kaewprateep

With TensorFlow 2.10.0 or later, you can use tf.data.Dataset.save:

import tensorflow as tf

# Version reported on the answerer's setup.
print(tf.__version__)
# 2.10.0

save_dir = '/content/'
dataset = tf.data.Dataset.from_tensor_slices(tf.range(10))

# Write the dataset to disk, then read it back from the same location.
dataset.save(save_dir)
restored = tf.data.Dataset.load(save_dir)

Otherwise, use tf.data.experimental.save for older versions:

import tensorflow as tf

path = '/content/'
t = tf.range(10)
ds = tf.data.Dataset.from_tensor_slices(t)

# Write the dataset to disk with the pre-2.10 experimental API.
tf.data.experimental.save(ds, path)

# Pass the element spec explicitly: early releases of
# tf.data.experimental.load require it, and later releases still accept it.
new_ds = tf.data.experimental.load(path, element_spec=ds.element_spec)
Answered By: AloneTogether