tensorflow数据读取和处理

文件匹配

["file0", "file1"]或[("file%d" % i) for i in range(2)]

# Collect the file names that match the glob pattern (the trailing "*" is a wildcard).
files = tf.train.match_filenames_once("C:/path/to/data.tfrecords-*")

读取文件队列

二进制文件：每一个像素点的值占用一个 byte，所以在以二进制存储的图片中，像素点的总数就表示一张图片的大小（字节数）

tf.train.string_input_producer()

传入一个文件列表，系统会自动生成文件名的队列

num_epochs: 使用队列的次数

shuffle: 对传入的文件列表进行打乱

*注意：只使用 tf.train.string_input_producer() 不会吐出数据（里面的文件不会流动起来，处于停滞状态）；只有调用 tf.train.start_queue_runners 后才会使停滞的数据流动起来，程序才不会陷入等待状态。

batch 输出队列

深入tensorflow

读取

tf.WholeFileReader()

读取文件名队列中的文件，每次读取整个文件；如果一个文件中存放了多个文件（多条记录），则不能使用 WholeFileReader，读取一个文件中的多条记录要使用 tf.FixedLengthRecordReader()（以固定的大小读取文件中的内容）

tf.FixedLengthRecordReader()

每次以固定的大小读取一个文件中的片段

https://github.com/YJango/TFRecord-Dataset-Estimator-API/blob/master/TensorFlow%20Dataset%20%2B%20TFRecords.ipynb

TFrecorder

知乎:https://zhuanlan.zhihu.com/p/33223782

制作

# Write three samples into a TFRecord file. `scalars`, `vectors`, `matrices`
# and `tensors` are expected to be defined beforehand; the matrix/tensor items
# are presumably NumPy arrays (they are used via .reshape/.shape/.tobytes).
#
# The writer is used as a context manager so the file is closed even when
# building or writing a sample raises.
with tf.python_io.TFRecordWriter('%s.tfrecord' %'test') as writer:
    # Each of the 3 samples carries 4 features: scalar, vector, matrix, tensor.
    for i in range(3):
        # Feature dictionary for this sample.
        features = {}

        # Scalar, stored as int64. Int64List needs a list, hence the brackets
        # around the single value.
        features['scalar'] = tf.train.Feature(int64_list=tf.train.Int64List(value=[scalars[i]]))

        # Vector, stored as float. It is already a sequence, so no brackets.
        features['vector'] = tf.train.Feature(float_list=tf.train.FloatList(value=vectors[i]))

        # Matrix, stored as float. Approach 1: flatten the matrix into a list.
        features['matrix'] = tf.train.Feature(float_list=tf.train.FloatList(value=matrices[i].reshape(-1)))
        # Flattening drops the shape (e.g. (2, 3)), so store it separately to
        # be able to restore the original shape when loading.
        features['matrix_shape'] = tf.train.Feature(int64_list=tf.train.Int64List(value=matrices[i].shape))

        # Tensor. Approach 2: store the raw bytes and convert back to the
        # original dtype when loading. `tobytes()` replaces the deprecated
        # `tostring()` (removed in NumPy 2.0); both yield the same bytes.
        features['tensor'] = tf.train.Feature(bytes_list=tf.train.BytesList(value=[tensors[i].tobytes()]))
        # Raw bytes carry no shape either (e.g. (806, 806, 3)); store it too.
        features['tensor_shape'] = tf.train.Feature(int64_list=tf.train.Int64List(value=tensors[i].shape))

        # Wrap the feature dictionary in tf.train.Features ...
        tf_features = tf.train.Features(feature=features)
        # ... turn it into a single Example ...
        tf_example = tf.train.Example(features=tf_features)
        # ... serialize the Example ...
        tf_serialized = tf_example.SerializeToString()
        # ... and append it to the file.
        writer.write(tf_serialized)

加载

def parse_function(example_proto):
    """Parse one serialized tf.train.Example into a dict of dense tensors.

    Args:
        example_proto: A serialized Example (the `tf_serialized` value
            produced when the TFRecord file was written).

    Returns:
        A dict with the keys 'scalar', 'vector', 'matrix', 'matrix_shape',
        'tensor' and 'tensor_shape'. 'matrix' and 'tensor' are reshaped to
        the shapes stored in 'matrix_shape' and 'tensor_shape'.
    """
    dics = {
        'scalar': tf.FixedLenFeature(shape=(), dtype=tf.int64, default_value=None),

        # The shape given here also acts as a reshape while parsing,
        # e.g. a stored (3,) vector is returned as (1, 3).
        'vector': tf.FixedLenFeature(shape=(1, 3), dtype=tf.float32),

        # VarLenFeature works for variable-length data but yields a
        # SparseTensor, which is converted to dense below.
        'matrix': tf.VarLenFeature(dtype=tf.float32),
        'matrix_shape': tf.FixedLenFeature(shape=(2,), dtype=tf.int64),

        # The tensor was written as a single byte string, so its feature
        # shape is (). It is parsed as tf.string first and decoded below.
        'tensor': tf.FixedLenFeature(shape=(), dtype=tf.string),
        'tensor_shape': tf.FixedLenFeature(shape=(3,), dtype=tf.int64),
    }

    # Parse all features of a single example according to `dics`.
    parsed_example = tf.parse_single_example(example_proto, dics)

    # Decode the byte string back to numbers.
    # NOTE(review): assumes the tensor was uint8 when written - confirm.
    parsed_example['tensor'] = tf.decode_raw(parsed_example['tensor'], tf.uint8)

    # Convert the SparseTensor produced by VarLenFeature to a dense tensor.
    parsed_example['matrix'] = tf.sparse_tensor_to_dense(parsed_example['matrix'])

    # Restore the original shapes that were stored alongside the data.
    parsed_example['matrix'] = tf.reshape(parsed_example['matrix'], parsed_example['matrix_shape'])
    parsed_example['tensor'] = tf.reshape(parsed_example['tensor'], parsed_example['tensor_shape'])
    return parsed_example

处理

代码应用

21个项目 p28

# Read three image files through a filename queue and write each read result
# to 'read/test_<i>.jpg'. With num_epochs=5 the list is cycled five times.
with tf.Session() as sess:
    # The three images to read: A.jpg, B.jpg, C.jpg.
    filename = ['A.jpg', 'B.jpg', 'C.jpg']
    # string_input_producer builds a queue of file names.
    filename_queue = tf.train.string_input_producer(filename, shuffle=False, num_epochs=5)
    # The reader pulls data from the filename queue via reader.read.
    reader = tf.WholeFileReader()
    key, value = reader.read(filename_queue)
    # string_input_producer defines an epoch counter variable (a local
    # variable) that has to be initialized before use.
    tf.local_variables_initializer().run()
    # The queue is only filled once start_queue_runners has been called.
    # The coordinator lets us stop and join those threads cleanly at the end.
    coord = tf.train.Coordinator()
    threads = tf.train.start_queue_runners(sess=sess, coord=coord)
    i = 0
    try:
        while not coord.should_stop():
            i += 1
            # Fetch the image data and save it.
            image_data = sess.run(value)
            with open('read/test_%d.jpg' % i, 'wb') as f:
                f.write(image_data)
    except tf.errors.OutOfRangeError:
        # Expected: signals that all epochs are done and the queue is closed.
        pass
    finally:
        # Ask the queue-runner threads to stop and wait for them to finish.
        coord.request_stop()
        coord.join(threads)