深度学习调试模型
1.简单训练模型例子
https://zhuanlan.zhihu.com/p/136902153
```
import torch
import torch.nn as nn
import torch.optim as optim

# Minimal training loop demonstrating optimizer / LR-scheduler interaction.
# Note the required call order since PyTorch 1.1: optimizer.step() BEFORE
# lr_scheduler.step().
model = nn.Conv2d(3, 64, 3)
optimizer = optim.SGD(model.parameters(), lr=0.5)
lr_scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=2)
for i in range(5):
    optimizer.zero_grad()
    x = model(torch.randn(3, 3, 64, 64))
    loss = x.sum()
    loss.backward()
    print('{} optim: {}'.format(i, optimizer.param_groups[0]['lr']))
    optimizer.step()
    # get_last_lr() is the public accessor; get_lr() is internal to the
    # scheduler and warns/misreports when called outside scheduler.step().
    print('{} scheduler: {}'.format(i, lr_scheduler.get_last_lr()[0]))
    lr_scheduler.step()
```
2.简单训练模型例子2
```python
import os
import torch
import torch.nn as nn
import torch.optim as optim


class Net(nn.Module):
    """Toy model: a single Linear(2, 3) layer."""

    def __init__(self):
        super(Net, self).__init__()
        self.linear = nn.Linear(2, 3)

    def forward(self, x):
        x = self.linear(x)
        return x


model = Net()
# Load training hyper-parameters from a wenet-style YAML config when it
# exists; fall back to an empty config so the snippet runs standalone.
filename = "exp/u2++_conformer/train.yaml"
configs = {}
if os.path.exists(filename):
    import yaml  # third-party PyYAML; only needed when the config is present
    with open(filename, 'r') as fin:
        configs = yaml.load(fin, Loader=yaml.FullLoader)
optimizer = torch.optim.Adam(model.parameters(), **{'lr': 0.001})
# The original referenced bare `optim.lr_scheduler` without ever importing
# torch.optim as optim -- a NameError; the import is added above.
lr_scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=2)
```
-
espnet模型加载代码
# Load a trained espnet transformer ASR model from an averaged checkpoint,
# then dump its structure and every parameter's shape.
import torch
from espnet.nets.pytorch_backend.e2e_asr_transformer import E2E
from espnet.asr.asr_utils import get_model_conf

model_path = "exp/train_nodev_sp_pytorch_train/results/model.21-30.avg.best"
idim, odi, args = get_model_conf(model_path, None)
# map_location keeps everything on CPU regardless of where it was saved.
model_state_dict = torch.load(model_path, map_location=lambda storage, loc: storage)
model = E2E(idim, odi, args)
model.load_state_dict(model_state_dict)
print(model)
model.__repr__()
for name, parameters in model.named_parameters():
    print(name, ':', parameters.size())
-
查看模型结构
# Three ways to inspect a model's structure:
print(model)                  # module hierarchy (repr)
print(list(model.modules()))  # flat list of all sub-modules
# NOTE(review): this prints the bound method object, not the tensors --
# presumably model.state_dict() was intended; kept as-is.
print(model.state_dict)
-
查看模型参数
# Print every registered parameter name together with its tensor shape.
for name, parameter in model.named_parameters():
    print(name, ':', parameter.size())
-
查看模型梯度
# Inspect the accumulated gradient of one specific parameter; it is None
# until a backward pass has touched this parameter.
model.linear.weight.grad
-
optimizer
# Save and restore an optimizer's full state (lr, momentum buffers, ...).
# The original mixed the spellings `optimizer` and `optimiser`, so the
# save/load lines referenced an undefined name; unified to `optimizer`.
model = torch.nn.Linear(2, 3)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
torch.save(optimizer.state_dict(), 'optimiser.pth')
optimizer.load_state_dict(torch.load('optimiser.pth'))
-
grad clip
# Clip the global gradient 2-norm to `clip`, then update weights only if the
# pre-clip norm is finite: a non-finite norm means the gradients themselves
# are inf/nan, and stepping would corrupt the weights.
clip = 5
grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
# Original called torch.iffinite -- a typo (AttributeError at runtime);
# torch.isfinite is the real API.
if torch.isfinite(grad_norm):
    optimizer.step()
注解: grad_clip 的作用是将所有梯度视为一个向量,将这个向量的 2-norm 限制为 clip(默认是 2-norm,即每一项平方求和后再开方,不除以元素个数)。返回值为裁剪前该向量的 norm 值,类型为 tensor,大小为 [],也就是说是个标量,非向量。
如果判断其 norm 值不是 inf,才进行权重更新;否则因为梯度为 inf,不更新权重。
-
获取数据
GPU 上的张量取标量值:先 .cpu() 拷回主机,再 .item() 转成 Python 数值
best_score.cpu().item()
-
遍历checkpoint和模型参数
# Compare a loaded checkpoint dict against the live model: list checkpoint
# keys, list parameter names, then dump shapes and values for all
# depthwise-conv parameters.
for key in checkpoint.keys():
    print(key)
print("#######")
for name, parameters in model.named_parameters():
    print(name)
for name, parameters in model.named_parameters():
    if "depthwise_conv" in name:
        print("[wyr debug]")
        print(name)
        print(checkpoint[name].size())
        print(parameters.size())
        print(checkpoint[name])
        print(parameters)
-
模型加载
# Duplicate of the espnet loading recipe above: restore an E2E transformer
# from an averaged checkpoint and print its structure and parameter shapes.
import torch
from espnet.nets.pytorch_backend.e2e_asr_transformer import E2E
from espnet.asr.asr_utils import get_model_conf

model_path = "exp/train_nodev_sp_pytorch_train/results/model.21-30.avg.best"
idim, odi, args = get_model_conf(model_path, None)
model_state_dict = torch.load(model_path, map_location=lambda storage, loc: storage)
model = E2E(idim, odi, args)
model.load_state_dict(model_state_dict)
print(model)
model.__repr__()
for name, parameters in model.named_parameters():
    print(name, ':', parameters.size())
-
tensorboard使用
tensorboard:训练时 tensorboard 日志的保存目录。打开方式:tensorboard --logdir checkpoint/tensorboard/train/;查看方式:浏览器中输入 训练服务器ip:6006
-
模型加载
wenet模型保存:
# wenet-style checkpoint save: unwrap (Distributed)DataParallel so the saved
# state_dict keys carry no "module." prefix and can be loaded into a plain model.
logging.info('Checkpoint: save to checkpoint %s' % path)
if isinstance(model, (torch.nn.DataParallel,
                      torch.nn.parallel.DistributedDataParallel)):
    # Both wrappers expose the real network as .module; the two original
    # branches were byte-identical and are merged into one isinstance check.
    state_dict = model.module.state_dict()
else:
    state_dict = model.state_dict()
torch.save(state_dict, path)
wenet模型加载:
# wenet-style checkpoint load: remap tensors onto the CPU when CUDA is absent,
# otherwise let torch.load restore them onto their original devices.
if not torch.cuda.is_available():
    logging.info('Checkpoint: loading from checkpoint %s for CPU' % path)
    checkpoint = torch.load(path, map_location='cpu')
else:
    logging.info('Checkpoint: loading from checkpoint %s for GPU' % path)
    checkpoint = torch.load(path)
FUNASR模型保存:
# FunASR-style full training-state save: model weights plus reporter,
# every optimizer/scheduler state, and the AMP grad scaler, bundled into a
# single resumable checkpoint. The original used full-width CJK quotes
# (U+201C/U+201D), which are a SyntaxError in Python; replaced with ASCII.
torch.save(
    {
        "model": model.state_dict(),
        "reporter": reporter.state_dict(),
        "optimizers": [o.state_dict() for o in optimizers],
        "schedulers": [
            s.state_dict() if s is not None else None
            for s in schedulers
        ],
        "scaler": scaler.state_dict() if scaler is not None else None,
    },
    output_dir / "checkpoint.pth",
)
# Additionally save the bare model weights once per epoch.
torch.save(model.state_dict(), output_dir / f"{iepoch}epoch.pth")
FUNASR模型加载:
# FunASR-style load: place tensors directly on the current CUDA device when
# GPUs are in use, otherwise on the CPU. The original's full-width CJK quotes
# were a SyntaxError; fixed to ASCII quotes.
states = torch.load(
    checkpoint,
    map_location=f"cuda:{torch.cuda.current_device()}" if ngpu > 0 else "cpu",
)
model.load_state_dict(states["model"])
import torch

# Inspect the keys stored in a saved checkpoint file. The original had a
# stray full-width closing quote in the path literal and two bare Chinese
# text lines ("CPU加载"/"GPU加载") that were syntax errors; both are fixed
# (the headings became comments, the GPU variant is shown commented out).
model_name = "checkpoint/1epoch.pth"
# CPU 加载: remap every tensor onto the CPU regardless of where it was saved.
checkpoint = torch.load(model_name, map_location=torch.device('cpu'))
# GPU 加载 (alternative): restore tensors onto their original devices.
# checkpoint = torch.load(model_name)
for key in checkpoint.keys():
    print(key)