Viewing Parameter Changes in Real Time
First, create a folder under your code directory (e.g. ./tensorboard) to hold the TensorBoard record files, and instantiate a summary writer in the code:
summary_writer = tf.summary.create_file_writer('./tensorboard')    # the argument is the directory in which the record files are saved
Next, whenever a parameter needs to be recorded during training, use a with statement to specify the writer you want to use, and call tf.summary.scalar(name, tensor, step=batch_index) on each parameter to be recorded (usually a scalar):
summary_writer = tf.summary.create_file_writer('./tensorboard')
# Start model training
for batch_index in range(num_batches):
    # ... (training code; the loss of the current batch is stored in the variable `loss`)
    with summary_writer.as_default():                                # the writer we want to use
        tf.summary.scalar("loss", loss, step=batch_index)
        tf.summary.scalar("MyScalar", my_scalar, step=batch_index)   # other custom variables can be added as well
When we want to visualize the training process, open a terminal in the code directory and run the following command, then view the dashboard in a browser (by default at http://localhost:6006):
tensorboard --logdir=./tensorboard
Viewing Graph and Profile Information
Besides scalars, TensorBoard can also record the computation graph of tf.function-decorated code and profiling information: enable tracing with tf.summary.trace_on before training, and export the collected trace with tf.summary.trace_export afterwards:
tf.summary.trace_on(graph=True, profiler=True)  # enable tracing; records the graph structure and profiling information
# ... training code ...
with summary_writer.as_default():
    tf.summary.trace_export(name="model_trace", step=0, profiler_outdir=log_dir)  # save the trace information to a file
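The snippet above assumes that summary_writer and log_dir are already defined. As a minimal, self-contained sketch (the function my_step, its dummy input, and the ./tensorboard log directory are illustrative placeholders, not part of the original example), tracing might look like this:

import tensorflow as tf

log_dir = './tensorboard'                        # illustrative log directory
summary_writer = tf.summary.create_file_writer(log_dir)

@tf.function
def my_step(x):
    # A trivial computation standing in for one training step.
    return tf.reduce_sum(tf.square(x))

tf.summary.trace_on(graph=True, profiler=True)   # start recording graph and profile information
my_step(tf.random.normal([32, 10]))              # the traced function must run at least once
with summary_writer.as_default():
    tf.summary.trace_export(name="model_trace", step=0, profiler_outdir=log_dir)

After this runs, the Graphs and Profile pages in TensorBoard show the recorded computation graph and profiling data. The complete example below combines tf.train.Checkpoint / tf.train.CheckpointManager with the TensorBoard logging shown above, training a small convolutional network on Fashion-MNIST: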
import tensorflow as tf
from tensorflow.keras.datasets import fashion_mnist


def make_model(n_classes):
    return tf.keras.Sequential(
        [
            tf.keras.layers.Conv2D(
                32, (5, 5), activation=tf.nn.relu, input_shape=(28, 28, 1)
            ),
            tf.keras.layers.MaxPool2D((2, 2), (2, 2)),
            tf.keras.layers.Conv2D(64, (3, 3), activation=tf.nn.relu),
            tf.keras.layers.MaxPool2D((2, 2), (2, 2)),
            tf.keras.layers.Flatten(),
            tf.keras.layers.Dense(1024, activation=tf.nn.relu),
            tf.keras.layers.Dropout(0.5),
            tf.keras.layers.Dense(n_classes),
        ]
    )
def load_data():
    (train_x, train_y), (test_x, test_y) = fashion_mnist.load_data()
    # Add a channel dimension and scale inputs to the [-1, 1] range
    train_x = tf.expand_dims(train_x, -1)
    train_x = (tf.image.convert_image_dtype(train_x, tf.float32) - 0.5) * 2
    train_y = tf.expand_dims(train_y, -1)
    test_x = tf.expand_dims(test_x, -1)
    test_x = (tf.image.convert_image_dtype(test_x, tf.float32) - 0.5) * 2
    test_y = tf.expand_dims(test_y, -1)
    return (train_x, train_y), (test_x, test_y)
def train():
    # Define the model
    n_classes = 10
    model = make_model(n_classes)
    # Input data
    (train_x, train_y), (test_x, test_y) = load_data()
    # Training parameters
    loss = tf.losses.SparseCategoricalCrossentropy(from_logits=True)
    step = tf.Variable(1, name="global_step")
    optimizer = tf.optimizers.Adam(1e-3)
    ckpt = tf.train.Checkpoint(step=step, optimizer=optimizer, model=model)
    manager = tf.train.CheckpointManager(ckpt, "./tf_ckpts", max_to_keep=3)
    ckpt.restore(manager.latest_checkpoint)
    if manager.latest_checkpoint:
        print(f"Restored from {manager.latest_checkpoint}")
    else:
        print("Initializing from scratch.")
    accuracy = tf.metrics.Accuracy()
    mean_loss = tf.metrics.Mean(name="loss")
    # Train step function
    @tf.function
    def train_step(inputs, labels):
        with tf.GradientTape() as tape:
            logits = model(inputs, training=True)  # training=True so that Dropout is active
            loss_value = loss(labels, logits)
        gradients = tape.gradient(loss_value, model.trainable_variables)
        # TODO: apply gradient clipping here
        optimizer.apply_gradients(zip(gradients, model.trainable_variables))
        step.assign_add(1)
        accuracy.update_state(labels, tf.argmax(logits, -1))
        return loss_value, accuracy.result()
    epochs = 10
    batch_size = 32
    nr_batches_train = int(train_x.shape[0] / batch_size)
    print(f"Batch size: {batch_size}")
    print(f"Number of batches per epoch: {nr_batches_train}")
    train_summary_writer = tf.summary.create_file_writer("./log/train")
    with train_summary_writer.as_default():
        for epoch in range(epochs):
            for t in range(nr_batches_train):
                start_from = t * batch_size
                to = (t + 1) * batch_size
                features, labels = train_x[start_from:to], train_y[start_from:to]
                loss_value, accuracy_value = train_step(features, labels)
                mean_loss.update_state(loss_value)
                if t % 10 == 0:
                    print(f"{step.numpy()}: {loss_value} - accuracy: {accuracy_value}")
                    save_path = manager.save()
                    print(f"Checkpoint saved: {save_path}")
                    tf.summary.image(
                        "train_set", features, max_outputs=3, step=step.numpy()
                    )
                    tf.summary.scalar("accuracy", accuracy_value, step=step.numpy())
                    tf.summary.scalar("loss", mean_loss.result(), step=step.numpy())
                    accuracy.reset_states()
                    mean_loss.reset_states()
            print(f"Epoch {epoch} terminated")
            # Measuring accuracy on the whole training set at the end of the epoch
            for t in range(nr_batches_train):
                start_from = t * batch_size
                to = (t + 1) * batch_size
                features, labels = train_x[start_from:to], train_y[start_from:to]
                logits = model(features)
                accuracy.update_state(labels, tf.argmax(logits, -1))
            print(f"Training accuracy: {accuracy.result()}")
            accuracy.reset_states()


if __name__ == "__main__":
    train()
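The train_step function above leaves gradient clipping as a TODO. One way to fill it in, offered only as a sketch (clipping by global norm with a threshold of 1.0 is an illustrative choice, not part of the original script), would be to replace the gradient computation and update with:

gradients = tape.gradient(loss_value, model.trainable_variables)
# Clip by global norm before applying; clip_norm=1.0 is an arbitrary example value.
gradients, _ = tf.clip_by_global_norm(gradients, clip_norm=1.0)
optimizer.apply_gradients(zip(gradients, model.trainable_variables))

After the script has run, the scalars and images written to ./log/train can be inspected with tensorboard --logdir=./log, as described earlier.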