GELU
This activation is quite memory-hungry and computationally heavy. It helps with detection and converges faster than ReLU6, but its peak accuracy does not reach ReLU6's. The gradients are largest in the first convolutional layer.
For reference, GELU(x) = x * Φ(x), where Φ is the standard normal CDF; it can be computed exactly with erf or approximated with a tanh expression, as both versions of the code below show.
Module (class) versions:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import torch
import math
from torch import nn
from torch.nn import functional as F


class mish(nn.Module):
    """Mish: x * tanh(softplus(x)). See https://arxiv.org/abs/1908.08681"""
    def __init__(self):
        super(mish, self).__init__()

    def forward(self, x):
        return x * torch.tanh(F.softplus(x))


class Gelu(nn.Module):
    """Exact GELU using erf. Also see https://arxiv.org/abs/1606.08415"""
    def __init__(self):
        super(Gelu, self).__init__()

    def forward(self, x):
        return x * 0.5 * (1.0 + torch.erf(x / math.sqrt(2.0)))


class Gelu_new(nn.Module):
    """Tanh approximation of GELU (the OpenAI GPT / BERT variant). Also see https://arxiv.org/abs/1606.08415"""
    def __init__(self):
        super(Gelu_new, self).__init__()

    def forward(self, x):
        return 0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3))))


class swish(nn.Module):
    """Swish / SiLU: x * sigmoid(x). See https://arxiv.org/abs/1710.05941"""
    def __init__(self):
        super(swish, self).__init__()

    def forward(self, x):
        return x * torch.sigmoid(x)
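
As a quick sanity check (my own sketch, not part of the original snippet): the exact erf-based Gelu module should match PyTorch's built-in torch.nn.functional.gelu, the tanh-based Gelu_new should agree with it to within roughly 1e-3, and the modules drop into a model like any other nn.Module.

if __name__ == "__main__":
    x = torch.randn(4, 8)
    # exact erf-based GELU matches F.gelu (whose default is also the erf form)
    print(torch.allclose(Gelu()(x), F.gelu(x), atol=1e-6))
    # the tanh approximation differs from the exact form only by a small amount
    print(torch.max(torch.abs(Gelu_new()(x) - F.gelu(x))).item())
    # the modules can be used like any other activation layer
    mlp = nn.Sequential(nn.Linear(8, 16), Gelu_new(), nn.Linear(16, 4))
    print(mlp(x).shape)  # torch.Size([4, 4])

The plain-function versions follow below: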
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import torch
import math


def gelu(x):
    """ Original Implementation of the gelu activation function in Google Bert repo when initially created.
        For information: OpenAI GPT's gelu is slightly different (and gives slightly different results):
        0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3))))
        Also see https://arxiv.org/abs/1606.08415
    """
    return x * 0.5 * (1.0 + torch.erf(x / math.sqrt(2.0)))


def gelu_new(x):
    """ Implementation of the gelu activation function currently in Google Bert repo (identical to OpenAI GPT).
        Also see https://arxiv.org/abs/1606.08415
    """
    return 0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3))))


def swish(x):
    return x * torch.sigmoid(x)
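
A brief usage sketch (my addition, for illustration only): the functional forms can be applied directly to tensors, and the two GELU variants agree to roughly 1e-3 or better.

if __name__ == "__main__":
    x = torch.randn(2, 5)
    print(gelu(x))                                               # exact erf-based GELU
    print(torch.max(torch.abs(gelu(x) - gelu_new(x))).item())    # tanh-approximation error, small (~1e-3 or less)
    print(swish(x))                                              # x * sigmoid(x)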