Table of Contents
[1] RNN (Recurrent Neural Network)
- 3D schematic diagram
- Flat schematic diagram (W stays the same; weights are shared)
- Hidden Layer
[2] Chain-Rule Differentiation
- Derivative of the hidden layer with respect to the previous hidden layer (the Jacobian involves a diagonal matrix)
- Derivative of the Loss with respect to the hidden layer
[3] Gradient Vanishing and Gradient Exploding
- Caused by long chains of repeated multiplication
- Gradient Clipping for Gradient Exploding
- Gradient Vanishing: LSTM and GRU
- Regularization term: control
[4] Code (Forward Propagation)
[5] RNN - NumPy Implementation
- Natural-language encoding and decoding
import numpy as np

word = 'hello'
chars = list(set(word))  # e.g. ['e', 'h', 'l', 'o'] (set order may vary)
# Build the two lookup dictionaries
char_to_ix = {ch: i for i, ch in enumerate(chars)}  # encoding: char -> index
ix_to_char = {i: ch for i, ch in enumerate(chars)}  # decoding: index -> char

def encoding(char_to_ix, data, targets):
    inputs = [char_to_ix[ch] for ch in data]      # convert chars to indices, e.g. [2, 1, 3, 4]
    targets = [char_to_ix[ch] for ch in targets]  # convert target chars to indices
    n_values = len(char_to_ix)
    # One-hot encode the inputs, since the letters carry no ordinal information.
    # The targets can stay as class indices, because prediction is treated as classification.
    inputs = np.eye(n_values)[inputs]
    # targets = np.eye(n_values)[targets]
    return inputs, targets

def decoding(ix_to_char, data):
    outputs = [ix_to_char[ch] for ch in data]
    return outputs
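
A quick round-trip check of the two helpers (a minimal sketch; the exact indices depend on the set order above):

inputs, targets = encoding(char_to_ix, 'hell', 'ello')
print(inputs.shape)                   # (4, 4): one one-hot row per input character
print(decoding(ix_to_char, targets))  # ['e', 'l', 'l', 'o']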
- RNN Forward Propagation
# hyperparameters
hidden_size = 10  # size of hidden layer of neurons
seq_length = 4    # number of steps to unroll the RNN for
learning_rate = 1e-1
vocab_size = 4

# model parameters
Wxh = np.random.randn(vocab_size, hidden_size) * 0.1   # input to hidden
Whh = np.random.randn(hidden_size, hidden_size) * 0.1  # hidden to hidden
Why = np.random.randn(hidden_size, vocab_size) * 0.1   # hidden to output
bh = np.zeros((1, hidden_size))  # hidden bias
by = np.zeros((1, vocab_size))   # output bias

# forward-propagation function
def rnn_forward(inputs, targets, Wxh, Whh, Why, bh, by):
    loss = 0
    hs = np.zeros((seq_length, hidden_size))
    ys = np.zeros((seq_length, vocab_size))
    ps = np.zeros((seq_length, vocab_size))
    for t in range(len(inputs)):
        if t == 0:
            hprev = np.zeros((1, hidden_size))
        else:
            hprev = hs[t-1]
        hs[t] = np.tanh(np.dot(inputs[t], Wxh) + np.dot(hprev, Whh) + bh)  # hidden state
        ys[t] = np.dot(hs[t], Why) + by  # unnormalized log probabilities for next chars
        ps[t] = np.exp(ys[t]) / np.sum(np.exp(ys[t]))  # softmax probabilities
        loss += -np.log(ps[t][int(targets[t])])  # cross-entropy loss of the true next char
    loss = loss / len(inputs)  # report the average per-step loss
    return ps, hs, loss
# inputs are one-hot encoded, targets are plain index vectors
inputs, targets = encoding(char_to_ix, 'hell', 'ello')
# run the forward pass
ps, hs, loss = rnn_forward(inputs, targets, Wxh, Whh, Why, bh, by)
print('probability is', ps)
ts = [p.argmax() for p in ps]
print('label is', ts)
print('letter is', decoding(ix_to_char, ts))
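
In the row-vector convention used by the code, each time step of the forward pass computes

h_t = \tanh(x_t W_{xh} + h_{t-1} W_{hh} + b_h)
y_t = h_t W_{hy} + b_y
p_t = \mathrm{softmax}(y_t)
\ell_t = -\log p_t[\mathrm{target}_t]

and rnn_forward returns the mean of the per-step losses \ell_t over the sequence.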
- RNN Backpropagation
def rnn_backward(inputs, ps, hs, targets, Wxh, Whh, Why, bh, by):
    # np.zeros_like() creates zero matrices with the same shapes as the parameters
    dWxh, dWhh, dWhy = np.zeros_like(Wxh), np.zeros_like(Whh), np.zeros_like(Why)
    dbh, dby = np.zeros_like(bh), np.zeros_like(by)
    dhnext = np.zeros_like(hs[0])  # gradient flowing back from step t+1 into h_t
    for t in reversed(range(len(inputs))):
        # derivative of the Loss w.r.t. y gives dy
        dy = np.copy(ps[t])
        dy[int(targets[t])] -= 1  # backprop into y. see http://cs231n.github.io/neural-networks-case-study/#grad if confused here
        dWhy += np.dot(hs[t].reshape(hidden_size, 1), dy.reshape(1, vocab_size))  # update readout matrix
        dby += dy  # update readout bias
        # key step, the core of BPTT:
        # dh is composed of two parts, one flowing through y and the other through the next step's hidden state
        dh = np.dot(Why, dy) + dhnext
        dhraw = (1 - hs[t] * hs[t]) * dh  # backprop through the tanh nonlinearity
        dbh += dhraw  # update hidden bias
        dWxh += np.dot(inputs[t].reshape(vocab_size, 1), dhraw.reshape(1, hidden_size))  # update Wxh through dhraw
        hprev = np.zeros((1, hidden_size)) if t == 0 else hs[t-1]  # previous hidden state (zeros at t = 0)
        dWhh += np.dot(hprev.reshape(hidden_size, 1), dhraw.reshape(1, hidden_size))  # update Whh through the previous h and this step's dhraw
        dhnext = np.dot(dhraw, Whh.T)  # gradient passed to the previous step's h; crucial, since this step's loss also affects earlier steps
    for dparam in [dWxh, dWhh, dWhy, dbh, dby]:
        # keep every gradient entry inside the range [-5, 5]
        np.clip(dparam, -5, 5, out=dparam)  # clip to mitigate exploding gradients
    return dWxh, dWhh, dWhy, dbh, dby

dWxh, dWhh, dWhy, dbh, dby = rnn_backward(inputs, ps, hs, targets, Wxh, Whh, Why, bh, by)
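
The loop above is the standard BPTT recursion for the forward equations, written in the same row-vector convention (gradients accumulate over the summed per-step losses):

dy_t = p_t - \mathrm{onehot}(\mathrm{target}_t)
dW_{hy} \mathrel{+}= h_t^{\top} dy_t, \qquad db_y \mathrel{+}= dy_t
dh_t = dy_t W_{hy}^{\top} + dh_{\mathrm{next}}
dhraw_t = (1 - h_t \odot h_t) \odot dh_t
db_h \mathrel{+}= dhraw_t, \qquad dW_{xh} \mathrel{+}= x_t^{\top} dhraw_t, \qquad dW_{hh} \mathrel{+}= h_{t-1}^{\top} dhraw_t
dh_{\mathrm{next}} = dhraw_t W_{hh}^{\top}

The repeated factor \mathrm{diag}(1 - h_t \odot h_t)\, W_{hh}^{\top} across many time steps is exactly what drives the gradient vanishing / exploding discussed in section [3], and the np.clip call above is the gradient-clipping remedy from that section.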
- Training and Testing
def train(inputs, targets):
    ps, hs, loss = rnn_forward(inputs, targets, Wxh, Whh, Why, bh, by)
    # print(ps.argmax())
    dWxh, dWhh, dWhy, dbh, dby = rnn_backward(inputs, ps, hs, targets, Wxh, Whh, Why, bh, by)
    # perform the parameter update with plain gradient descent
    learning_rate = 1e-2
    for param, dparam in zip([Wxh, Whh, Why, bh, by], [dWxh, dWhh, dWhy, dbh, dby]):
        param += -learning_rate * dparam  # gradient-descent update
    return loss

for i in range(10000):
    loss = train(inputs, targets)
    if i % 1000 == 0:
        print('Training loss: %f' % loss)  # print progress

# Use ps for prediction; training quality is measured by how close the
# predictions are to the true labels.
ps, _, _ = rnn_forward(inputs, targets, Wxh, Whh, Why, bh, by)
print(ps[0].argmax(), ps[1].argmax(), ps[2].argmax(), ps[3].argmax())
print(decoding(ix_to_char, [1, 2, 2, 0]))  # indices from one particular run; they depend on the set order above
# the general way:
out = [p.argmax() for p in ps]
print(decoding(ix_to_char, out))
[6] RNN - PyTorch Implementation
- Defining the RNN Structure
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable

class RNN(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(RNN, self).__init__()
        self.hidden_size = hidden_size
        self.i2h = nn.Parameter(torch.randn(input_size, hidden_size))
        self.h2h = nn.Parameter(torch.randn(hidden_size, hidden_size))
        self.h2o = nn.Parameter(torch.randn(hidden_size, output_size))
        self.bh = nn.Parameter(torch.randn(1, hidden_size))
        self.bo = nn.Parameter(torch.randn(1, output_size))

    def forward(self, inputs, hidden):
        hidden = torch.tanh(inputs.matmul(self.i2h) + hidden.matmul(self.h2h) + self.bh)
        output = hidden.matmul(self.h2o) + self.bo
        return output, hidden

    def initHidden(self):
        return Variable(torch.zeros(1, self.hidden_size))
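
A single step through the module can be sanity-checked like this (a minimal sketch; the sizes match the 4-character vocabulary above and the hidden_size=2 network trained below):

rnn = RNN(input_size=4, hidden_size=2, output_size=4)
hidden = rnn.initHidden()
x = Variable(torch.eye(4)[0].view(1, -1))  # one one-hot character
out, hidden = rnn(x, hidden)
print(out.shape, hidden.shape)             # torch.Size([1, 4]) torch.Size([1, 2])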
- Testing the Network
# Encode: inputs are one-hot, targets are plain indices
inputs, targets = encoding(char_to_ix, 'hell', 'ello')
# Convert inputs to a torch tensor
inputs = Variable(torch.Tensor(inputs))
# Initialize the hidden state
hidden = rnn.initHidden()
# Feed one one-hot encoded character at a time
for letter in inputs:
    letter = letter.view(1, -1)
    out, hidden = rnn(letter, hidden)
    print(out)
_, indice = out.view(-1).max(0)
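
The index of the largest logit can be mapped back to a character (for an untrained network this is essentially random):

print(ix_to_char[int(indice.data)])  # predicted character after the last input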
- Training
rnn = RNN(input_size=4, hidden_size=2, output_size=4)
learning_rate = 0.02  # If you set this too high, it might explode. If too low, it might not learn

# training function
def train(inputs, targets, rnn, epoch):
    # start of a sequence, so clear any hidden-state history with a fresh init
    criterion = nn.CrossEntropyLoss()
    # initialize the hidden state
    hidden = rnn.initHidden()
    # dim 0 of inputs is the sequence length; -1 lets view() infer the remaining size
    inputs = inputs.view(inputs.size()[0], -1)
    # walk through the sequence step by step
    loss = 0
    # convert targets to a LongTensor
    targets = Variable(torch.LongTensor(targets))
    # inputs.size()[0] is the number of input characters
    for i in range(inputs.size()[0]):
        output, hidden = rnn(inputs[i], hidden)
        # torch.unsqueeze adds a batch dimension along dim=0
        loss += criterion(output, torch.unsqueeze(targets[i], dim=0))
        if epoch % 100 == 0:
            # max(0) searches along dim 0; [1] returns the index of the max
            out = output.view(-1).max(0)[1]   # index of the largest output logit
            inp = inputs[i].view(-1).max(0)[1]  # index of the 1 in the one-hot input
            letter_i = ix_to_char[int(inp.data)]
            letter_o = ix_to_char[int(out.data)]
            print(letter_i, letter_o)
    # Add the parameters' gradients to their values, multiplied by the learning rate.
    # backpropagate the loss; retain_graph keeps the computation graph
    loss.backward(retain_graph=True)
    # parameter update
    for p in rnn.parameters():
        # note the in-place add_: p.data = p.data - learning_rate * p.grad.data
        p.data.add_(p.grad.data, alpha=-learning_rate)
    # reset the gradients to zero
    rnn.zero_grad()
    return output, loss.data

# start training
for i in range(1500):
    output, loss = train(inputs, targets, rnn, i)
    if i % 100 == 0:
        print('loss', loss)
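
For comparison, the same update can be written with torch.optim instead of touching p.data by hand (a sketch, not from the original article; the helper name train_step is illustrative):

import torch.optim as optim

optimizer = optim.SGD(rnn.parameters(), lr=learning_rate)

def train_step(inputs, targets, rnn):
    criterion = nn.CrossEntropyLoss()
    hidden = rnn.initHidden()
    targets = torch.LongTensor(targets)
    loss = 0
    for i in range(inputs.size()[0]):
        output, hidden = rnn(inputs[i].view(1, -1), hidden)
        loss += criterion(output, targets[i].unsqueeze(0))
    optimizer.zero_grad()  # clear old gradients
    loss.backward()        # backpropagate through the whole sequence
    optimizer.step()       # SGD parameter update
    return loss.item()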
[7] RNN - Generation Task
- Automatically generating the next letter from the previous one
# Automatically generate letters (rnn is the network, seed_ix the index of the
# starting letter, n the desired sequence length)
def sample(rnn, seed_ix, n):
    x = torch.zeros(1, 4)
    hidden = rnn.initHidden()
    x[0][seed_ix] = 1
    x = Variable(x)
    ixes = []
    for t in range(n):
        out, hidden = rnn(x, hidden)
        prob = torch.softmax(out, dim=1)
        # Sample from {0, 1, 2, 3} according to the probabilities,
        # rather than always taking the argmax
        m = np.random.choice(range(4), p=prob.data.numpy().ravel())
        x = torch.zeros(1, 4)
        x[0][m] = 1
        x = Variable(x)
        ixes.append(m)
        print(x)
    return ixes
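
Example usage (assuming the rnn trained above; the seed letter 'h' is looked up through char_to_ix, and the sampled indices are decoded back to characters):

ixes = sample(rnn, char_to_ix['h'], 3)
print(decoding(ix_to_char, ixes))  # e.g. ['e', 'l', 'l'] once the network has learned the pattern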