TensorFlow: create a TFRecords file from a CSV

Question:

I am trying to write a csv file (all columns are floats) to a tfrecords file then read them back out. All the examples I have seen pack the csv columns then feed it to sess.run() directly but I can’t figure out how to write the feature columns and label column to a tfrecord instead. How could I do this?

Asked By: Nitro

||

Answers:

You will need a separate script to convert your csv file to TFRecords.

Imagine you have a CSV with the following header:

feature_1, feature_2, ..., feature_n, label

You need to read your CSV with something like pandas, construct a tf.train.Example manually, and then write it to a file with TFRecordWriter:

# Load the CSV as a 2-D array: one row per example, with the label in the
# last column and the feature values in every column before it.
csv = pandas.read_csv("your.csv").values
with tf.python_io.TFRecordWriter("csv.tfrecords") as writer:
    for row in csv:
        features, label = row[:-1], row[-1]
        example = tf.train.Example()
        example.features.feature["features"].float_list.value.extend(features)
        # int64_list only accepts integers. When every CSV column is a float
        # (as in the question) the label arrives as a float and appending it
        # unchanged raises TypeError, so convert it explicitly.
        example.features.feature["label"].int64_list.value.append(int(label))
        writer.write(example.SerializeToString())
Answered By: standy
def convert_to(rating_path="/Users/shishir/Documents/botconnect_Playground/tfRecords/ml-100k.train.rating"):
    """Convert a tab-separated rating file into a TFRecords file.

    Each non-blank input line is split on tabs; its first three fields are
    parsed as integers and written as the int64 features "user", "item" and
    "label" of one ``tf.train.Example``. Any further fields on the line are
    ignored.

    The output is written to ``ml-100k.tfrecords`` inside ``wdir`` (a name
    that must be defined by the surrounding module).

    Args:
        rating_path: Path of the rating file to read. Defaults to the
            location used by the original snippet.

    Raises:
        ValueError: If one of the first three fields is not an integer.
        IndexError: If a non-blank line has fewer than three fields.
    """
    filename = os.path.join(wdir, 'ml-100k' + '.tfrecords')
    print('Writing', filename)
    with tf.python_io.TFRecordWriter(filename) as writer:
        with open(rating_path, "r") as f:
            # Iterating the file object stops at EOF on its own, so no
            # manual readline()/sentinel comparison is needed.
            for line in f:
                if not line.strip():
                    # Tolerate blank lines (e.g. a trailing newline).
                    continue
                # Fields are separated by a tab character, not the letter "t".
                arr = line.split("\t")
                u, i, l = int(arr[0]), int(arr[1]), int(arr[2])
                example = tf.train.Example()
                # Plain Python ints can be appended directly; no NumPy
                # round-trip is required.
                example.features.feature["user"].int64_list.value.append(u)
                example.features.feature["item"].int64_list.value.append(i)
                example.features.feature["label"].int64_list.value.append(l)
                writer.write(example.SerializeToString())

So that is my solution, and it works! Hope this helps.

Cheers.

Answered By: Shishir Narayan

The above solution did not work in my case. Another way to read a CSV file and create a TFRecord file is shown below:

The feature set column names are: Sl.No:, Time, Height, Width, Mean, Std, Variance, Non-homogeneity, PixelCount, contourCount, Class.

Sample features that we get from dataset.csv:

Features = [5, 'D', 268, 497, 13.706, 863.4939, 29.385, 0.0427, 39675, 10]

label : medium

import pandas as pd
import tensorflow as tf

def create_tf_example(features, label):
    """Pack one dataset row into a ``tf.train.Example``.

    ``features`` is read by position: index 1 is UTF-8 encoded and stored as
    the 'Time' bytes feature, indices 2, 3, 8 and 9 are stored as int64
    features, and indices 4 to 7 as float features. Index 0 is not used.
    ``label`` is UTF-8 encoded and stored as the 'Class' bytes feature.

    Returns the assembled ``tf.train.Example``.
    """

    def _text(value):
        # Single-element bytes feature holding the UTF-8 encoding of a string.
        return tf.train.Feature(bytes_list=tf.train.BytesList(value=[value.encode('utf-8')]))

    def _integer(value):
        # Single-element int64 feature.
        return tf.train.Feature(int64_list=tf.train.Int64List(value=[value]))

    def _real(value):
        # Single-element float feature.
        return tf.train.Feature(float_list=tf.train.FloatList(value=[value]))

    feature_map = {
        'Time': _text(features[1]),
        'Height': _integer(features[2]),
        'Width': _integer(features[3]),
        'Mean': _real(features[4]),
        'Std': _real(features[5]),
        'Variance': _real(features[6]),
        'Non-homogeneity': _real(features[7]),
        'PixelCount': _integer(features[8]),
        'contourCount': _integer(features[9]),
        'Class': _text(label),
    }
    return tf.train.Example(features=tf.train.Features(feature=feature_map))

# Read every CSV row into an array; the last column is treated as the label
# and everything before it as the feature values.
csv = pd.read_csv("dataset.csv").values
with tf.python_io.TFRecordWriter("dataset.tfrecords") as writer:
    for row in csv:
        features, label = row[:-1], row[-1]
        # A single pre-formatted argument prints the same text under both
        # Python 2 and Python 3 (the bare `print a, b` form is Python 2 only).
        print("%s %s" % (features, label))
        example = create_tf_example(features, label)
        writer.write(example.SerializeToString())
# No explicit writer.close() here: leaving the with-block already closes it.

For more details click here. This works for me; I hope it works for you.

Answered By: Nija I Pillai

@Nija I Pillai's answer, adapted for TensorFlow 2:

import pandas as pd
import tensorflow as tf

def create_tf_example(features, label):
    """Build a ``tf.train.Example`` from one row's positional feature values.

    Index 1 of ``features`` is UTF-8 encoded into the bytes feature 'attr1';
    indices 2, 3 and 8 become the int64 features 'attr2', 'attr3' and 'attr8';
    indices 4 to 7 become the float features 'attr4' to 'attr7'. Index 0 is
    not used.

    NOTE: ``label`` is accepted but is not written into the returned example.
    """
    feature_map = {}

    feature_map['attr1'] = tf.train.Feature(
        bytes_list=tf.train.BytesList(value=[features[1].encode('utf-8')])
    )

    for key, position in (('attr2', 2), ('attr3', 3)):
        feature_map[key] = tf.train.Feature(
            int64_list=tf.train.Int64List(value=[features[position]])
        )

    for key, position in (('attr4', 4), ('attr5', 5), ('attr6', 6), ('attr7', 7)):
        feature_map[key] = tf.train.Feature(
            float_list=tf.train.FloatList(value=[features[position]])
        )

    feature_map['attr8'] = tf.train.Feature(
        int64_list=tf.train.Int64List(value=[features[8]])
    )

    wrapped = tf.train.Features(feature=feature_map)
    return tf.train.Example(features=wrapped)

# Read every CSV row into an array; the last column is treated as the label
# and everything before it as the feature values.
csv = pd.read_csv("dataset.csv").values
with tf.io.TFRecordWriter("dataset.tfrecords") as writer:
    for row in csv:
        features, label = row[:-1], row[-1]
        print(features, label)
        example = create_tf_example(features, label)
        writer.write(example.SerializeToString())
# No explicit writer.close() here: leaving the with-block already closes it.
Answered By: Ángel B.
Categories: questions Tags: ,
Answers are sorted by their score. The answer accepted by the question owner as the best is marked with
at the top-right corner.