Tfrecord简介
什么是Tfrecord? Tfrecord是TensorFlow中一种统一的格式,用来存储数据,这个格式就是TFRecords。TFRecords 其实是一种二进制文件,虽然它不如其他格式好理解,但是它能更好地利用内存,更方便复制和移动,并且不需要单独的标签文件,理论上,它能保存所有的信息。
Tfrecord的结构 TFRecord其内部包含了多个tf.train.Example,而Example是protocol buffer(protobuf) 数据标准的实现,在一个Example消息体中包含了一系列的tf.train.Feature属性,而每一个Feature 是一个key-value的键值对,其中,key 是string类型,而value 的取值有三种:
bytes_list: 可以存储string 和byte两种数据类型。
float_list: 可以存储float(float32)与double(float64) 两种数据类型 。
int64_list: 可以存储:bool, enum, int32, uint32, int64, uint64 。
Tfrecord实战
import matplotlib as mpl
import matplotlib. pyplot as plt
% matplotlib inline
import numpy as np
import sklearn
import pandas as pd
import os
import sys
import time
import tensorflow as tf
from tensorflow import keras
print ( tf. __version__)
print ( sys. version_info)
for module in mpl, np , pd, sklearn, tf, keras:
print ( module. __name__, module. __version__)
2.0.0
sys.version_info(major=3, minor=7, micro=6, releaselevel='final', serial=0)
matplotlib 3.1.3
numpy 1.18.1
pandas 1.0.0
sklearn 0.22.1
tensorflow 2.0.0
tensorflow_core.keras 2.2.4-tf
生成Example消息体中的tf.train.Feature属性
# Build the three typed lists a tf.train.Feature value can hold.
# Strings must be encoded to bytes before going into a BytesList.
favorite_books = [name.encode('utf-8')
                  for name in ['machine learning', 'cc150']]
favorite_books_bytelist = tf.train.BytesList(value=favorite_books)
print(favorite_books_bytelist)

hours_floatlist = tf.train.FloatList(value=[15.5, 9.5, 7.0, 8.0])
print(hours_floatlist)

age_int64list = tf.train.Int64List(value=[42])
print(age_int64list)

# Features is a string -> Feature map; each Feature wraps exactly one
# of the three typed lists built above.
features = tf.train.Features(
    feature={
        "favorite_books": tf.train.Feature(bytes_list=favorite_books_bytelist),
        "hours": tf.train.Feature(float_list=hours_floatlist),
        "age": tf.train.Feature(int64_list=age_int64list),
    }
)
print(features)
value: "machine learning"
value: "cc150"
value: 15.5
value: 9.5
value: 7.0
value: 8.0
value: 42
feature {
key: "age"
value {
int64_list {
value: 42
}
}
}
feature {
key: "favorite_books"
value {
bytes_list {
value: "machine learning"
value: "cc150"
}
}
}
feature {
key: "hours"
value {
float_list {
value: 15.5
value: 9.5
value: 7.0
value: 8.0
}
}
}
# Wrap the Features map in an Example message, then serialize it to a
# byte string — the serialized protobuf is what gets written to disk.
example = tf.train.Example(features=features)
print(example)

serialized_example = example.SerializeToString()
print(serialized_example)
features {
feature {
key: "age"
value {
int64_list {
value: 42
}
}
}
feature {
key: "favorite_books"
value {
bytes_list {
value: "machine learning"
value: "cc150"
}
}
}
feature {
key: "hours"
value {
float_list {
value: 15.5
value: 9.5
value: 7.0
value: 8.0
}
}
}
}
b'\n\\\n-\n\x0efavorite_books\x12\x1b\n\x19\n\x10machine learning\n\x05cc150\n\x1d\n\x05hours\x12\x14\x12\x12\n\x10\x00\x00xA\x00\x00\x18A\x00\x00\xe0@\x00\x00\x00A\n\x0c\n\x03age\x12\x05\x1a\x03\n\x01*'
# Write the same serialized Example three times into one TFRecord file.
output_dir = 'tfrecord_basic'
# makedirs with exist_ok avoids the exists()/mkdir race and also creates
# missing parent directories, unlike the bare os.mkdir it replaces.
os.makedirs(output_dir, exist_ok=True)
filename = 'test.tfrecords'
filename_fullpath = os.path.join(output_dir, filename)
with tf.io.TFRecordWriter(filename_fullpath) as writer:
    for _ in range(3):
        writer.write(serialized_example)
# Read the file back: a TFRecordDataset yields one scalar string tensor
# (the raw serialized Example bytes) per record written above.
dataset = tf.data.TFRecordDataset([filename_fullpath])
for serialized_example_tensor in dataset:
    print(serialized_example_tensor)
tf.Tensor(b'\n\\\n-\n\x0efavorite_books\x12\x1b\n\x19\n\x10machine learning\n\x05cc150\n\x1d\n\x05hours\x12\x14\x12\x12\n\x10\x00\x00xA\x00\x00\x18A\x00\x00\xe0@\x00\x00\x00A\n\x0c\n\x03age\x12\x05\x1a\x03\n\x01*', shape=(), dtype=string)
tf.Tensor(b'\n\\\n-\n\x0efavorite_books\x12\x1b\n\x19\n\x10machine learning\n\x05cc150\n\x1d\n\x05hours\x12\x14\x12\x12\n\x10\x00\x00xA\x00\x00\x18A\x00\x00\xe0@\x00\x00\x00A\n\x0c\n\x03age\x12\x05\x1a\x03\n\x01*', shape=(), dtype=string)
tf.Tensor(b'\n\\\n-\n\x0efavorite_books\x12\x1b\n\x19\n\x10machine learning\n\x05cc150\n\x1d\n\x05hours\x12\x14\x12\x12\n\x10\x00\x00xA\x00\x00\x18A\x00\x00\xe0@\x00\x00\x00A\n\x0c\n\x03age\x12\x05\x1a\x03\n\x01*', shape=(), dtype=string)
# Parsing schema: variable-length features come back as SparseTensors,
# while a FixedLenFeature with shape [] parses to a dense scalar.
expected_feature = {
    "favorite_books": tf.io.VarLenFeature(dtype=tf.string),
    "hours": tf.io.VarLenFeature(dtype=tf.float32),
    "age": tf.io.FixedLenFeature([], dtype=tf.int64),
}

dataset = tf.data.TFRecordDataset([filename_fullpath])
for record in dataset:
    parsed = tf.io.parse_single_example(record, expected_feature)
    # VarLenFeature yields a SparseTensor; densify it before iterating.
    books = tf.sparse.to_dense(parsed["favorite_books"])
    for title in books:
        print(title.numpy().decode("UTF-8"))
machine learning
cc150
machine learning
cc150
machine learning
cc150
# Same records, but GZIP-compressed on disk. The reader must be told the
# compression type explicitly — it is not auto-detected from the file.
filename_fullpath_zip = filename_fullpath + '.zip'
options = tf.io.TFRecordOptions(compression_type="GZIP")
with tf.io.TFRecordWriter(filename_fullpath_zip, options) as writer:
    for _ in range(3):
        writer.write(serialized_example)

dataset_zip = tf.data.TFRecordDataset([filename_fullpath_zip],
                                      compression_type='GZIP')
for record in dataset_zip:
    parsed = tf.io.parse_single_example(record, expected_feature)
    books = tf.sparse.to_dense(parsed["favorite_books"])
    for title in books:
        print(title.numpy().decode("UTF-8"))
machine learning
cc150
machine learning
cc150
machine learning
cc150