Why use Tensors, and the difference between static and dynamic computation graphs
Implement the network using NumPy and PyTorch
import numpy as np
import torch
dtype = torch.float
device = torch.device("cpu")
# device = torch.device("cuda:0")
# N: batch size
# D_in: input dimension
# H: hidden dimension
# D_out: output dimension
N, D_in, H, D_out = 64, 1000, 100, 10
# input and output data
# x = np.random.randn(N, D_in)
# y = np.random.randn(N, D_out)
# tensor input and output
x = torch.randn(N, D_in, device=device, dtype=dtype)
y = torch.randn(N, D_out, device=device, dtype=dtype)
# Randomly initialize weights
# w1 = np.random.randn(D_in, H)
# w2 = np.random.randn(H, D_out)
# tensor
# w1 = torch.randn(D_in, H, device=device, dtype=dtype)
# w2 = torch.randn(H, D_out, device=device, dtype=dtype)
# tensor and autograd
w1 = torch.randn(D_in, H, device=device, dtype=dtype, requires_grad=True)
w2 = torch.randn(H, D_out, device=device, dtype=dtype, requires_grad=True)
learning_rate = 1e-6
for t in range(3):
    # h = x.dot(w1)
    # mm() matrix multiplication
    # h = x.mm(w1)
    # h_relu = np.maximum(h, 0)
    # clamp(input, min, max): clamps input into [min, max]
    # h_relu = h.clamp(min=0)
    # y_pred = h_relu.dot(w2)
    # y_pred = h_relu.mm(w2)
    y_pred = x.mm(w1).clamp(min=0).mm(w2)
    # loss = np.square(y_pred - y).sum()
    loss = (y_pred - y).pow(2).sum()
    # print(t, loss)
    print(t, loss.item())
    # Backprop
    # grad_y_pred = 2.0 * (y_pred - y)
    # grad_w2 = h_relu.T.dot(grad_y_pred)
    # grad_w2 = h_relu.t().mm(grad_y_pred)
    # grad_h_relu = grad_y_pred.dot(w2.T)
    # grad_h_relu = grad_y_pred.mm(w2.t())
    # grad_h = grad_h_relu.copy()
    # grad_h = grad_h_relu.clone()
    # grad_h[h < 0] = 0
    # grad_w1 = x.T.dot(grad_h)
    # grad_w1 = x.t().mm(grad_h)
    # w1 -= learning_rate * grad_w1
    # w2 -= learning_rate * grad_w2
    loss.backward()
    with torch.no_grad():
        w1 -= learning_rate * w1.grad
        w2 -= learning_rate * w2.grad
        w1.grad.zero_()
        w2.grad.zero_()
0 24548232.0
1 19390818.0
2 19421688.0
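As a quick sanity check that autograd reproduces the hand-written backprop shown in the comments above, we can compute both sets of gradients for a single step and compare them. This is a minimal sketch; the variable names and the use of double precision are my own choices, not part of the original post:

import torch

N, D_in, H, D_out = 64, 1000, 100, 10
dtype = torch.double  # double precision keeps the comparison numerically clean

x = torch.randn(N, D_in, dtype=dtype)
y = torch.randn(N, D_out, dtype=dtype)
w1 = torch.randn(D_in, H, dtype=dtype, requires_grad=True)
w2 = torch.randn(H, D_out, dtype=dtype, requires_grad=True)

# Forward pass, then let autograd fill in w1.grad and w2.grad
h = x.mm(w1)
h_relu = h.clamp(min=0)
y_pred = h_relu.mm(w2)
loss = (y_pred - y).pow(2).sum()
loss.backward()

# Manual backprop, same formulas as the commented-out code above
with torch.no_grad():
    grad_y_pred = 2.0 * (y_pred - y)
    grad_w2 = h_relu.t().mm(grad_y_pred)
    grad_h = grad_y_pred.mm(w2.t())
    grad_h[h < 0] = 0
    grad_w1 = x.t().mm(grad_h)

print(torch.allclose(w1.grad, grad_w1))  # expected: True
print(torch.allclose(w2.grad, grad_w2))  # expected: True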
PyTorch: Defining new autograd functions
import torch

class MyReLU(torch.autograd.Function):
    """
    We can implement our own custom autograd Functions by subclassing
    torch.autograd.Function and implementing the forward and backward passes
    which operate on Tensors.
    """
    @staticmethod
    def forward(ctx, input):
        """
        In the forward pass we receive a Tensor containing the input and return
        a Tensor containing the output. ctx is a context object that can be used
        to stash information for backward computation. You can cache arbitrary
        objects for use in the backward pass using the ctx.save_for_backward method.
        """
        ctx.save_for_backward(input)
        return input.clamp(min=0)

    @staticmethod
    def backward(ctx, grad_output):
        """
        In the backward pass we receive a Tensor containing the gradient of the loss
        with respect to the output, and we need to compute the gradient of the loss
        with respect to the input.
        """
        input, = ctx.saved_tensors
        grad_input = grad_output.clone()
        grad_input[input < 0] = 0
        return grad_input
dtype = torch.float
device = torch.device("cpu")
# device = torch.device("cuda:0") # Uncomment this to run on GPU
# N is batch size; D_in is input dimension;
# H is hidden dimension; D_out is output dimension.
N, D_in, H, D_out = 64, 1000, 100, 10
# Create random Tensors to hold input and outputs.
x = torch.randn(N, D_in, device=device, dtype=dtype)
y = torch.randn(N, D_out, device=device, dtype=dtype)
# Create random Tensors for weights.
w1 = torch.randn(D_in, H, device=device, dtype=dtype, requires_grad=True)
w2 = torch.randn(H, D_out, device=device, dtype=dtype, requires_grad=True)
learning_rate = 1e-6
for t in range(3):
    # To apply our Function, we use Function.apply method. We alias this as 'relu'.
    relu = MyReLU.apply
    # Forward pass: compute predicted y using operations; we compute
    # ReLU using our custom autograd operation.
    y_pred = relu(x.mm(w1)).mm(w2)
    # Compute and print loss
    loss = (y_pred - y).pow(2).sum()
    print(t, loss.item())
    # Use autograd to compute the backward pass.
    loss.backward()
    # Update weights using gradient descent
    with torch.no_grad():
        w1 -= learning_rate * w1.grad
        w2 -= learning_rate * w2.grad
        # Manually zero the gradients after updating weights
        w1.grad.zero_()
        w2.grad.zero_()
0 24658138.0
1 19849594.0
2 19743964.0
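Before trusting a custom Function in a training loop, it is worth checking its backward against numerical gradients. torch.autograd.gradcheck does this; below is a minimal sketch, assuming the MyReLU class above is already defined. gradcheck expects double-precision inputs, and an input sitting exactly on the ReLU kink could upset the finite-difference estimate, though that is vanishingly unlikely with random data:

import torch
from torch.autograd import gradcheck

inp = torch.randn(20, 20, dtype=torch.double, requires_grad=True)
# Compares MyReLU's analytic backward against finite differences
print(gradcheck(MyReLU.apply, (inp,), eps=1e-6, atol=1e-4))  # expected: True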
Use TensorFlow to fit a simple two-layer net
import tensorflow as tf
import numpy as np
N, D_in, H, D_out = 64, 1000, 100, 10
x = tf.placeholder(tf.float32, shape=(None, D_in))
y = tf.placeholder(tf.float32, shape=(None, D_out))
w1 = tf.Variable(tf.random_normal((D_in, H)))
w2 = tf.Variable(tf.random_normal((H, D_out)))
h = tf.matmul(x, w1)
h_relu = tf.maximum(h, tf.zeros(1))
y_pred = tf.matmul(h_relu, w2)
loss = tf.reduce_sum((y - y_pred)**2.0)
grad_w1, grad_w2 = tf.gradients(loss, [w1, w2])
learning_rate = 1e-6
new_w1 = w1.assign(w1 - learning_rate * grad_w1)
new_w2 = w2.assign(w2 - learning_rate * grad_w2)
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    x_value = np.random.randn(N, D_in)
    y_value = np.random.randn(N, D_out)
    for _ in range(3):
        loss_value, _, _ = sess.run([loss, new_w1, new_w2],
                                    feed_dict={x: x_value, y: y_value})
        print(loss_value)
29360906.0
24105692.0
22037684.0
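Everything before tf.Session() above only builds the static graph; sess.run() then executes that fixed graph repeatedly with new data, which is the key contrast with the dynamic-graph PyTorch examples. The manual tf.gradients / assign update can also be expressed with a built-in optimizer. A minimal sketch of the replacement lines, assuming the TF 1.x API and the placeholders, loss and learning_rate defined above:

# Replaces the tf.gradients / assign lines above (TF 1.x style)
train_op = tf.train.GradientDescentOptimizer(learning_rate).minimize(loss)

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    x_value = np.random.randn(N, D_in)
    y_value = np.random.randn(N, D_out)
    for _ in range(3):
        loss_value, _ = sess.run([loss, train_op],
                                 feed_dict={x: x_value, y: y_value})
        print(loss_value)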
Use the torch.nn package to implement our two-layer network
import random
import torch
import torch.nn as nn
N, D_in, H, D_out = 64, 1000, 100, 10
x = torch.randn(N, D_in)
y = torch.randn(N, D_out)
# sequential model
# model = nn.Sequential(nn.Linear(D_in, H),
#                       nn.ReLU(),
#                       nn.Linear(H, D_out))
# custom model
# class TwoLayerNet(torch.nn.Module):
#     def __init__(self, D_in, H, D_out):
#         super(TwoLayerNet, self).__init__()
#         self.linear1 = nn.Linear(D_in, H)
#         self.linear2 = nn.Linear(H, D_out)
#     def forward(self, x):
#         h_relu = self.linear1(x).clamp(min=0)
#         y_pred = self.linear2(h_relu)
#         return y_pred
# dynamic graphs and weight sharing
class DynamicNet(torch.nn.Module):
    def __init__(self, D_in, H, D_out):
        super(DynamicNet, self).__init__()
        self.input_linear = torch.nn.Linear(D_in, H)
        self.middle_linear = torch.nn.Linear(H, H)
        self.output_linear = torch.nn.Linear(H, D_out)

    def forward(self, x):
        """
        Since each forward pass builds a dynamic computation graph, we can use normal
        Python control-flow operators like loops or conditional statements when
        defining the forward pass of the model.
        Here we also see that it is perfectly safe to reuse the same Module many
        times when defining a computational graph.
        """
        h_relu = self.input_linear(x).clamp(min=0)
        for _ in range(random.randint(0, 3)):
            h_relu = self.middle_linear(h_relu).clamp(min=0)
        y_pred = self.output_linear(h_relu)
        return y_pred
# model = TwoLayerNet(D_in, H, D_out)
model = DynamicNet(D_in, H, D_out)
criterion = nn.MSELoss(reduction='sum')
learning_rate = 1e-4
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
for t in range(3):
    y_pred = model(x)
    loss = criterion(y_pred, y)
    print(t, loss.item())
    # model.zero_grad()
    optimizer.zero_grad()
    loss.backward()
    # torch.no_grad(): builds a context in which operations are not tracked by autograd
    # use with torch.no_grad() or .data to avoid tracking history in autograd
    # with torch.no_grad():
    #     # SGD
    #     for param in model.parameters():
    #         param -= learning_rate * param.grad
    optimizer.step()
0 663.4624633789062
1 650.722900390625
2 641.6253051757812
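A rough way to see the weight sharing at work: the middle layer is applied a random number of times per forward pass, yet DynamicNet owns only one set of middle_linear parameters, so the total parameter count is independent of the loop depth, and middle_linear's gradients accumulate a contribution from every application. A minimal sketch; the parameter-count formula is just the usual Linear-layer bookkeeping, not something from the original post:

model = DynamicNet(D_in, H, D_out)
n_params = sum(p.numel() for p in model.parameters())
# (D_in*H + H) + (H*H + H) + (H*D_out + D_out), regardless of how many times
# middle_linear is reused inside forward()
print(n_params)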