4.5 Tensor Clipping
Outline
- clip_by_value: clip by specific values
- relu: rectified linear unit
- clip_by_norm: clip by norm
- gradient clipping
clip_by_value
a = tf.range(10)
# <tf.Tensor: id=389, shape=(10,), dtype=int32, numpy=array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])>
tf.maximum(a, 2)
# <tf.Tensor: id=581, shape=(10,), dtype=int32, numpy=array([2, 2, 2, 3, 4, 5, 6, 7, 8, 9])>
tf.minimum(a, 2)
# <tf.Tensor: id=635, shape=(10,), dtype=int32, numpy=array([0, 1, 2, 2, 2, 2, 2, 2, 2, 2])>
tf.clip_by_value(a, 2, 8)
# <tf.Tensor: id=530, shape=(10,), dtype=int32, numpy=array([2, 2, 2, 3, 4, 5, 6, 7, 8, 8])>
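clip_by_value applies a lower and an upper bound at once; as a minimal sketch, it is equivalent to composing maximum and minimum on the same tensor a defined above:
tf.minimum(tf.maximum(a, 2), 8)
# numpy=array([2, 2, 2, 3, 4, 5, 6, 7, 8, 8])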
relu
a = a - 5
# <tf.Tensor: id=750, shape=(10,), dtype=int32, numpy=array([-5, -4, -3, -2, -1, 0, 1, 2, 3, 4])>
tf.nn.relu(a)
# <tf.Tensor: id=869, shape=(10,), dtype=int32, numpy=array([0, 0, 0, 0, 0, 0, 1, 2, 3, 4])>
tf.maximum(a,0)
# <tf.Tensor: id=935, shape=(10,), dtype=int32, numpy=array([0, 0, 0, 0, 0, 0, 1, 2, 3, 4])>
clip_by_norm
a = tf.random.normal([2,2], mean=10)
# <tf.Tensor: id=1008, shape=(2, 2), dtype=float32, numpy=
# array([[ 9.298348, 11.598914],
# [10.152704, 10.486983]], dtype=float32)>
tf.norm(a) # <tf.Tensor: id=1149, shape=(), dtype=float32, numpy=20.833826>
aa = tf.clip_by_norm(a,15)
tf.norm(aa) # <tf.Tensor: id=1244, shape=(), dtype=float32, numpy=15.0>
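When the norm exceeds the threshold, clip_by_norm rescales the whole tensor by clip_norm / norm, so its direction is preserved. A minimal sketch of the equivalent computation (illustrative names; valid here because tf.norm(a) > 15):
# rescale by clip_norm / norm; the direction of a is unchanged
norm = tf.norm(a)
aa_manual = a * (15. / norm)  # equals tf.clip_by_norm(a, 15) here, since norm > 15
tf.norm(aa_manual)            # ~15.0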
Gradient clipping
- exploding and vanishing gradients
- set lr=1
- new_grads, total_norm = tf.clip_by_global_norm(grads, 25) (see the sketch below)
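Unlike clipping each tensor on its own, clip_by_global_norm treats the whole gradient list as one flattened vector: it computes the global norm sqrt(sum_i ||g_i||^2) and, only if that exceeds clip_norm, scales every gradient by the same factor, so the relative proportions (and hence the update direction) are preserved. A minimal illustrative sketch of these semantics (not the library's actual implementation):
# illustrative sketch of tf.clip_by_global_norm semantics
def clip_by_global_norm_sketch(grads, clip_norm):
    # global norm over the whole list: sqrt(sum_i ||g_i||^2)
    global_norm = tf.sqrt(tf.add_n([tf.reduce_sum(tf.square(g)) for g in grads]))
    # scale <= 1: shrink everything uniformly when global_norm > clip_norm
    scale = clip_norm / tf.maximum(global_norm, clip_norm)
    return [g * scale for g in grads], global_norm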
before gradient clipping
(x, y), _ = datasets.mnist.load_data()
x = tf.convert_to_tensor(x, dtype=tf.float32) / 50.
# x: (60000, 28, 28)
# y: (60000, 10)
# sample: (128, 28, 28)(128, 10)
# ==before==
# tf.Tensor(95.908844, shape=(), dtype=float32)
# tf.Tensor(2.8925762, shape=(), dtype=float32)
# tf.Tensor(126.49962, shape=(), dtype=float32)
# tf.Tensor(2.468435, shape=(), dtype=float32)
# tf.Tensor(153.886, shape=(), dtype=float32)
# tf.Tensor(3.0194814, shape=(), dtype=float32)
# ==after==
# tf.Tensor(95.908844, shape=(), dtype=float32)
# tf.Tensor(2.8925762, shape=(), dtype=float32)
# tf.Tensor(126.49962, shape=(), dtype=float32)
# tf.Tensor(2.468435, shape=(), dtype=float32)
# tf.Tensor(153.886, shape=(), dtype=float32)
# tf.Tensor(3.0194814, shape=(), dtype=float32)
# 0 loss: 34.782127380371094
With clipping disabled, the ==before== and ==after== norms are identical.
gradient clipping
print('==before==')
for g in grads:
    print(tf.norm(g))
grads, _ = tf.clip_by_global_norm(grads, 15)
print('==after==')
for g in grads:
    print(tf.norm(g))
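Note that tf.clip_by_global_norm returns the clipped list together with the global norm computed before clipping, which is why the second return value is discarded here.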
after gradient clipping
# x: (60000, 28, 28) y: (60000, 10)
# sample: (128, 28, 28) (128, 10)
# ==before==
# tf.Tensor(86.69438, shape=(), dtype=float32)
# tf.Tensor(2.6205728, shape=(), dtype=float32)
# tf.Tensor(121.17316, shape=(), dtype=float32)
# tf.Tensor(2.3026118, shape=(), dtype=float32)
# tf.Tensor(141.0347, shape=(), dtype=float32)
# tf.Tensor(2.6972096, shape=(), dtype=float32)
# ==after==
# tf.Tensor(6.3371544, shape=(), dtype=float32)
# tf.Tensor(0.19155769, shape=(), dtype=float32)
# tf.Tensor(8.857471, shape=(), dtype=float32)
# tf.Tensor(0.16831549, shape=(), dtype=float32)
# tf.Tensor(10.309304, shape=(), dtype=float32)
# tf.Tensor(0.19715965, shape=(), dtype=float32)
# 0 loss: 29.666818618774414
Source code:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import datasets, layers, optimizers
import os

os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

print(tf.__version__)

(x, y), _ = datasets.mnist.load_data()
# note: divided by 50. rather than the usual 255., presumably to keep the
# inputs (and hence the gradients) large for this clipping demo
x = tf.convert_to_tensor(x, dtype=tf.float32) / 50.
y = tf.convert_to_tensor(y)
y = tf.one_hot(y, depth=10)
print('x:', x.shape, 'y:', y.shape)

train_db = tf.data.Dataset.from_tensor_slices((x, y)).batch(128).repeat(30)
x, y = next(iter(train_db))
print('sample:', x.shape, y.shape)
# print(x[0], y[0])


def main():
    # 784 => 512
    w1, b1 = tf.Variable(tf.random.truncated_normal([784, 512], stddev=0.1)), tf.Variable(tf.zeros([512]))
    # 512 => 256
    w2, b2 = tf.Variable(tf.random.truncated_normal([512, 256], stddev=0.1)), tf.Variable(tf.zeros([256]))
    # 256 => 10
    w3, b3 = tf.Variable(tf.random.truncated_normal([256, 10], stddev=0.1)), tf.Variable(tf.zeros([10]))

    optimizer = optimizers.SGD(lr=0.01)

    for step, (x, y) in enumerate(train_db):
        # [b, 28, 28] => [b, 784]
        x = tf.reshape(x, (-1, 784))

        with tf.GradientTape() as tape:
            # layer1
            h1 = x @ w1 + b1
            h1 = tf.nn.relu(h1)
            # layer2
            h2 = h1 @ w2 + b2
            h2 = tf.nn.relu(h2)
            # output
            out = h2 @ w3 + b3
            # out = tf.nn.relu(out)

            # compute loss
            # [b, 10] - [b, 10]
            loss = tf.square(y - out)
            # [b, 10] => [b]
            loss = tf.reduce_mean(loss, axis=1)
            # [b] => scalar
            loss = tf.reduce_mean(loss)

        # compute gradient
        grads = tape.gradient(loss, [w1, b1, w2, b2, w3, b3])
        print('==before==')
        for g in grads:
            print(tf.norm(g))
        grads, _ = tf.clip_by_global_norm(grads, 15)
        print('==after==')
        for g in grads:
            print(tf.norm(g))

        # update w' = w - lr*grad
        optimizer.apply_gradients(zip(grads, [w1, b1, w2, b2, w3, b3]))

        if step % 100 == 0:
            print(step, 'loss:', float(loss))


if __name__ == '__main__':
    main()