This BERT walkthrough is split into four parts: data loading, fine-tuning, model outputs, and a reading of the PyTorch source code.
The first part, (PyTorch) BERT: data loading, covered how to load the data and get it ready to feed into the model.
This is the second part, on the training process. The training code is in train_eval.py in the GitHub repo; the model and its configuration parameters are under the models folder.
This part shows how to take the loaded data and adjust the weights of a pretrained model for our task, which is what fine-tuning means.
Fine-tuning is really one way of doing transfer learning.
Transfer learning is concerned with what the "knowledge" learned on one task is, and how best to reuse that knowledge on a new task. There are many techniques for this; fine-tuning is just one of them.
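In code, the difference often comes down to whether the pretrained weights receive gradients. A minimal sketch with pytorch_pretrained_bert (the same library the model code later in this post uses):

from pytorch_pretrained_bert import BertModel

bert = BertModel.from_pretrained('./bert_pretrain')  # the pretrained weights downloaded below
for param in bert.parameters():
    param.requires_grad = False   # feature extraction: freeze BERT, train only the layers you add on top
    # param.requires_grad = True  # fine-tuning (this post): update the pretrained weights on the new task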
First, download a pretrained BERT model (the checkpoints are published on Google's GitHub):
BERT-Large, Uncased (Whole Word Masking): 24-layer, 1024-hidden, 16-heads, 340M parameters
BERT-Large, Cased (Whole Word Masking): 24-layer, 1024-hidden, 16-heads, 340M parameters
BERT-Base, Uncased: 12-layer, 768-hidden, 12-heads, 110M parameters
BERT-Large, Uncased: 24-layer, 1024-hidden, 16-heads, 340M parameters
BERT-Base, Cased: 12-layer, 768-hidden, 12-heads, 110M parameters
BERT-Large, Cased: 24-layer, 1024-hidden, 16-heads, 340M parameters
BERT-Base, Multilingual Cased (New, recommended): 104 languages, 12-layer, 768-hidden, 12-heads, 110M parameters
BERT-Base, Multilingual Uncased (Orig; not recommended, use Multilingual Cased instead): 102 languages, 12-layer, 768-hidden, 12-heads, 110M parameters
BERT-Base, Chinese: Chinese Simplified and Traditional, 12-layer, 768-hidden, 12-heads, 110M parameters
The first six are English models, the Multilingual ones cover many languages, and the last is the Chinese model (character-level).
Uncased means all text is lowercased before tokenization, while Cased keeps the original casing.
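The effect is easy to see with the tokenizer. A quick sketch using pytorch_pretrained_bert (the vocab paths are placeholders for wherever you unpacked each model, and the exact subword splits depend on the vocab):

from pytorch_pretrained_bert import BertTokenizer

# Uncased: text is lowercased (and accents stripped) before WordPiece tokenization
tok_u = BertTokenizer.from_pretrained('uncased_L-12_H-768_A-12/vocab.txt', do_lower_case=True)
print(tok_u.tokenize('BERT Rocks'))  # e.g. ['bert', 'rocks']

# Cased: the original casing is kept, which helps tasks such as NER
tok_c = BertTokenizer.from_pretrained('cased_L-12_H-768_A-12/vocab.txt', do_lower_case=False)
print(tok_c.tokenize('BERT Rocks'))  # e.g. ['BE', '##RT', 'Rock', '##s']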
If you want some other pretrained model (there are plenty of choices), just find that model's GitHub page and download it from there.
After unpacking the download you get three files (a json config, a bin weights file, and a vocab.txt). Drop all three into the pretrained-model folder (bert_pretrain in my GitHub repo), laid out as shown below, and we are ready for the main content.
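Concretely, pytorch_pretrained_bert looks for these conventional filenames (rename yours if the download differs):

bert_pretrain/
├── bert_config.json   # model hyperparameters (layer count, hidden size, ...)
├── pytorch_model.bin  # the pretrained weights
└── vocab.txt          # the vocabulary used by the tokenizer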
Loading the model
import time
import argparse
import numpy as np
import torch
from importlib import import_module
# helpers from the repo: build_dataset / build_iterator / get_time_dif were covered
# in the data-loading post (utils.py in my checkout); train lives in train_eval.py
from utils import build_dataset, build_iterator, get_time_dif
from train_eval import train

parser = argparse.ArgumentParser(description='choose a model')
parser.add_argument('--model', default='ERNIE', type=str, help='choose a model: Bert, ERNIE')
args = parser.parse_args()

if __name__ == '__main__':
    dataset = 'datas'  # dataset root directory
    model_name = args.model  # e.g. Bert
    print(model_name)
    x = import_module('models.' + model_name)  # import the chosen model module from models/
    config = x.Config(dataset)  # load the configuration parameters
    np.random.seed(1)
    torch.manual_seed(1)
    torch.cuda.manual_seed_all(1)
    torch.backends.cudnn.deterministic = True  # fix all seeds so results are reproducible

    start_time = time.time()
    print("Loading data...")
    train_data, dev_data, test_data = build_dataset(config)  # load the data (explained in the previous post)
    train_iter = build_iterator(train_data, config)  # wrap the training data as batched tensors
    dev_iter = build_iterator(dev_data, config)  # same for the dev data
    test_iter = build_iterator(test_data, config)  # same for the test data
    time_dif = get_time_dif(start_time)
    print("Time usage:", time_dif)

    # train
    model = x.Model(config).to(config.device)  # build the model and move it to the device
    train(config, model, train_iter, dev_iter, test_iter)  # train
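If this snippet is saved as the entry script (I'll call it run.py here; use whatever the repo actually names it), training is launched from the shell, with --model selecting which module under models/ to import:

python run.py --model Bert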
Training
# imports at the top of train_eval.py (the FocalLoss import path is an assumption;
# the class lives somewhere under the loss_pytorch directory):
# import time
# import numpy as np
# import torch
# import torch.nn.functional as F
# from sklearn import metrics
# from pytorch_pretrained_bert.optimization import BertAdam
# from loss_pytorch import FocalLoss

def train(config, model, train_iter, dev_iter, test_iter):
    start_time = time.time()
    model.train()
    Focal_Loss = FocalLoss()
    param_optimizer = list(model.named_parameters())
    # biases and LayerNorm parameters should not receive weight decay
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}]
    # optimizer = torch.optim.Adam(model.parameters(), lr=config.learning_rate)
    optimizer = BertAdam(optimizer_grouped_parameters,
                         lr=config.learning_rate,
                         warmup=0.05,
                         t_total=len(train_iter) * config.num_epochs)  # optimizer with linear warmup
    total_batch = 0  # how many batches have been processed
    dev_best_loss = float('inf')
    last_improve = 0  # batch count at the last improvement of the dev loss
    flag = False  # set when training has stalled for too long
    for epoch in range(config.num_epochs):
        print('Epoch [{}/{}]'.format(epoch + 1, config.num_epochs))
        for i, (trains, labels) in enumerate(train_iter):  # fetch a batch
            outputs, encoder_out = model(trains)  # forward pass; I additionally return the last encoder layer, see the model code below
            model.zero_grad()
            # loss = F.cross_entropy(outputs, labels, weight=torch.Tensor([0.3, 0.7]).to(config.device))
            labels_2 = translabels(labels, config)  # expand labels to 2-D for focal loss; to use cross_entropy, comment out this line and the next
            loss = Focal_Loss(outputs, labels_2)
            loss.backward()
            optimizer.step()
            if total_batch % 50 == 0:
                # every 50 batches, report metrics on the training and dev sets
                true = labels.data.cpu()  # move the labels off the GPU
                predic = torch.max(outputs.data, 1)[1].cpu()  # predicted classes
                train_acc = metrics.accuracy_score(true, predic)  # training accuracy
                dev_acc, dev_loss = evaluate(config, model, dev_iter)  # run the dev set
                if dev_loss < dev_best_loss:
                    dev_best_loss = dev_loss
                    torch.save(model.state_dict(), config.save_path)  # save the best checkpoint so far
                    improve = '*'
                    last_improve = total_batch
                else:
                    improve = ''
                time_dif = get_time_dif(start_time)
                msg = 'Iter: {0:>6}, Train Loss: {1:>5.2}, Train Acc: {2:>6.2%}, Val Loss: {3:>5.2}, Val Acc: {4:>6.2%}, Time: {5} {6}'
                print(msg.format(total_batch, loss.item(), train_acc, dev_loss, dev_acc, time_dif, improve))
                model.train()  # evaluate() put the model in eval mode; switch back to training mode
            total_batch += 1
            if total_batch - last_improve > config.require_improvement:
                # the dev loss has not improved for config.require_improvement batches: stop early
                print("No optimization for a long time, auto-stopping...")
                flag = True
                break
        if flag:
            break
    test(config, model, test_iter)
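BertAdam's default schedule is 'warmup_linear': the learning rate climbs linearly from 0 to config.learning_rate over the first warmup * t_total steps (5% of training above), then decays linearly back toward 0. A sketch of the per-step learning rate, matching the formula in the pytorch_pretrained_bert version this code targets (later versions tweak the decay slightly):

def warmup_linear_lr(step, t_total, lr=5e-5, warmup=0.05):
    # learning rate at a given optimizer step under BertAdam's 'warmup_linear' schedule
    x = step / t_total
    if x < warmup:
        return lr * x / warmup  # linear ramp-up during the warmup phase
    return lr * (1.0 - x)       # linear decay for the rest of training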
Model code
# models/<model_name>.py -- the Config class below lives in the same file
import torch
import torch.nn as nn
from pytorch_pretrained_bert import BertModel, BertTokenizer

class Model(nn.Module):
    def __init__(self, config):
        super(Model, self).__init__()
        self.bert = BertModel.from_pretrained(config.bert_path)  # load the pretrained model you downloaded
        for param in self.bert.parameters():
            param.requires_grad = True  # fine-tune every BERT parameter
        self.fc = nn.Linear(config.hidden_size, config.num_classes)  # map the pooled vector to class logits

    def forward(self, x):
        context = x[0]  # the input token ids
        mask = x[2]  # mask over the padding, same size as the sentence, 0 at padded positions, e.g. [1, 1, 1, 1, 0, 0]
        type_ids = x[3]  # segment ids: which sentence each token belongs to
        encoder_out, pooled = self.bert(context, token_type_ids=type_ids, attention_mask=mask,
                                        output_all_encoded_layers=False)  # last encoder layer + pooled [CLS] vector
        out = self.fc(pooled)
        return out, encoder_out
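To keep the shapes straight (using batch_size=19 and pad_size=160 from the config below, and hidden_size=768):

# shapes through forward():
# context:     [19, 160]           token ids
# encoder_out: [19, 160, 768]      one 768-d vector per token from the last encoder layer
# pooled:      [19, 768]           pooled [CLS] representation of the whole sentence
# out:         [19, num_classes]   class logits fed to the loss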
Configuration
class Config(object):
    """Configuration parameters"""
    def __init__(self, dataset):
        self.model_name = 'bert'
        self.train_path = dataset + '/data3/train.txt'  # training set
        self.dev_path = dataset + '/data3/dev.txt'  # validation set
        self.test_path = dataset + '/data3/test.txt'  # test set
        self.class_list = [x.strip() for x in open(
            dataset + '/data3/class.txt').readlines()]  # list of class names, one per line
        self.save_path = dataset + '/saved_model/' + self.model_name + '.ckpt'  # where the trained model is saved
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')  # device
        self.require_improvement = 1000  # stop early if the dev loss does not improve within this many batches
        self.num_classes = len(self.class_list)  # number of classes
        self.num_epochs = 10  # number of epochs
        self.batch_size = 19  # mini-batch size
        self.pad_size = 160  # every sentence is padded or truncated to this length
        self.learning_rate = 5e-5  # learning rate
        self.bert_path = './bert_pretrain'  # directory with the pretrained files
        self.tokenizer = BertTokenizer.from_pretrained(self.bert_path)
        self.hidden_size = 768  # must match the pretrained model (768 for BERT-Base)
Evaluation code
def evaluate(config, model, data_iter, test=False):
    model.eval()  # note: eval mode here, versus model.train() during training
    Focal_Loss = FocalLoss()
    loss_total = 0
    predict_all = np.array([], dtype=int)
    labels_all = np.array([], dtype=int)
    with torch.no_grad():
        for texts, labels in data_iter:  # fetch a validation batch
            outputs, encoder_out = model(texts)  # forward pass; the model also returns the last encoder layer
            # loss = F.cross_entropy(outputs, labels, weight=torch.Tensor([0.3, 0.7]).to(config.device))
            labels_2 = translabels(labels, config)  # focal loss needs the 1-D labels expanded to 2-D
            loss = Focal_Loss(outputs, labels_2)  # compute the loss
            loss_total += loss
            labels = labels.data.cpu().numpy()
            predic = torch.max(outputs.data, 1)[1].cpu().numpy()
            labels_all = np.append(labels_all, labels)
            predict_all = np.append(predict_all, predic)
    acc = metrics.accuracy_score(labels_all, predict_all)  # accuracy
    if test:  # on the test set, also compute the classification report and confusion matrix
        try:
            report = metrics.classification_report(labels_all, predict_all, target_names=config.class_list, digits=4)  # precision, recall, F1
            confusion = metrics.confusion_matrix(labels_all, predict_all)  # confusion matrix
            return acc, loss_total / len(data_iter), report, confusion
        except Exception as e:
            print(e)
            np.savetxt("true_labels.txt", labels_all, fmt='%f', delimiter=',')
            np.savetxt("predicted_labels.txt", predict_all, fmt='%f', delimiter=',')
            return acc, loss_total / len(data_iter), 0, 0
    return acc, loss_total / len(data_iter)
Test code
def test(config, model, test_iter):
    # test
    model.load_state_dict(torch.load(config.save_path))  # restore the best checkpoint saved during training
    model.eval()
    start_time = time.time()
    test_acc, test_loss, test_report, test_confusion = evaluate(config, model, test_iter, test=True)  # reuse the evaluation code
    msg = 'Test Loss: {0:>5.2}, Test Acc: {1:>6.2%}'
    print(msg.format(test_loss, test_acc))
    print("Precision, Recall and F1-Score...")
    print(test_report)
    print("Confusion Matrix...")
    print(test_confusion)
    time_dif = get_time_dif(start_time)
    print("Time usage:", time_dif)
Label reshaping code
def translabels(labels, config):  # expand 1-D class labels into a 2-D batch_size x num_classes one-hot matrix
    width = len(config.class_list)
    length = len(labels)
    label = np.zeros((length, width))  # build the matrix
    for i in range(length):
        if int(labels[i].item()) == 2:
            label[i][1] = 1  # dataset-specific: the raw label 2 maps to column 1
        else:
            label[i][int(labels[i].item())] = 1
    label = torch.FloatTensor(label).to(config.device)  # move it to the device
    return label
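For instance, with two classes in config.class_list and raw labels 0 and 2 (a hypothetical batch; note how label 2 lands in column 1):

labels = torch.LongTensor([0, 2, 0])
print(translabels(labels, config))
# tensor([[1., 0.],
#         [0., 1.],
#         [1., 0.]])   (device suffix omitted)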
A few things to note:
- The difference between model.eval() and model.train(). During training, call model.train() to put the model in training mode, so that dropout and batch normalization act as regularizers and help prevent overfitting. During validation and testing, call model.eval(): PyTorch then freezes BN and dropout, using the statistics learned during training rather than per-batch ones. Otherwise, with a small test batch size, the BN layers can badly distort the outputs (the classic symptom in image models is wildly wrong colors).
- The loss function. I used two: FocalLoss() and cross_entropy(). Their calling conventions differ slightly, so take care when switching. PyTorch has no built-in FocalLoss, so it has to be written by hand; mine is under the loss_pytorch directory. To use cross_entropy instead, comment out the following two lines and use the commented-out F.cross_entropy(outputs, labels) call shown in train():
# labels_2 = translabels(labels, config)  # focal loss needs the 1-D labels expanded to 2-D
# loss = Focal_Loss(outputs, labels_2)  # compute the loss
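For reference, here is a minimal multi-class focal loss in the spirit of the one under loss_pytorch (a sketch, not the repo's exact implementation; it assumes one-hot targets as produced by translabels, with the usual default gamma=2):

import torch
import torch.nn as nn

class FocalLoss(nn.Module):
    """Sketch of focal loss: FL(p_t) = -(1 - p_t)^gamma * log(p_t).
    The (1 - p_t)^gamma factor down-weights easy, already-well-classified examples."""
    def __init__(self, gamma=2):
        super(FocalLoss, self).__init__()
        self.gamma = gamma

    def forward(self, logits, onehot_targets):
        probs = torch.softmax(logits, dim=1)      # [batch, num_classes]
        pt = (probs * onehot_targets).sum(dim=1)  # probability assigned to the true class
        loss = -((1.0 - pt) ** self.gamma) * torch.log(pt + 1e-8)
        return loss.mean()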
That wraps up fine-tuning. The trained model is saved under self.save_path = dataset + '/saved_model/' + self.model_name + '.ckpt' and can simply be loaded from there whenever you want to use it.