|
|
| import torch
|
| import torch.nn as nn
|
| import math
|
| import warnings
|
|
|
|
|
| def _no_grad_trunc_normal_(tensor, mean, std, a, b):
|
|
|
|
|
| def norm_cdf(x):
|
|
|
| return (1. + math.erf(x / math.sqrt(2.))) / 2.
|
|
|
| if (mean < a - 2 * std) or (mean > b + 2 * std):
|
| warnings.warn("mean is more than 2 std from [a, b] in nn.init.trunc_normal_. "
|
| "The distribution of values may be incorrect.",
|
| stacklevel=2)
|
|
|
| with torch.no_grad():
|
|
|
|
|
|
|
| l = norm_cdf((a - mean) / std)
|
| u = norm_cdf((b - mean) / std)
|
|
|
|
|
|
|
| tensor.uniform_(2 * l - 1, 2 * u - 1)
|
|
|
|
|
|
|
| tensor.erfinv_()
|
|
|
|
|
| tensor.mul_(std * math.sqrt(2.))
|
| tensor.add_(mean)
|
|
|
|
|
| tensor.clamp_(min=a, max=b)
|
| return tensor
|
|
|
|
|
def trunc_normal_(tensor, mean=0., std=1., a=-2., b=2.):
    r"""Fill ``tensor`` in place with samples from a truncated normal distribution.

    The values behave as if drawn from
    :math:`\mathcal{N}(\text{mean}, \text{std}^2)` with anything falling
    outside :math:`[a, b]` redrawn until it lies within the bounds. The
    sampling method works best when :math:`a \leq \text{mean} \leq b`.

    This is a thin public wrapper that forwards every argument to
    ``_no_grad_trunc_normal_`` and returns its result.

    Args:
        tensor: an n-dimensional `torch.Tensor`
        mean: the mean of the normal distribution
        std: the standard deviation of the normal distribution
        a: the minimum cutoff value
        b: the maximum cutoff value

    Returns:
        Whatever ``_no_grad_trunc_normal_`` returns for these arguments.

    Examples:
        >>> w = torch.empty(3, 5)
        >>> nn.init.trunc_normal_(w)
    """
    filled = _no_grad_trunc_normal_(tensor, mean, std, a, b)
    return filled
|
|
|
|
|
def drop_path(x, drop_prob: float = 0., training: bool = False):
    """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).

    This is the same as the DropConnect impl created for EfficientNet-style
    networks; the original name is misleading because 'Drop Connect' is a
    different form of dropout from a separate paper. See discussion:
    https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956
    The layer and argument are therefore named 'drop path' rather than mixing
    DropConnect as a layer name with 'survival rate' as the argument.

    Args:
        x: input tensor; the first dimension is treated as the sample
            dimension, and one keep/drop decision is made per entry along it.
        drop_prob: probability of zeroing a sample, in ``[0, 1]``.
        training: when False the input is returned unchanged.

    Returns:
        ``x`` itself when ``drop_prob == 0`` or ``training`` is False.
        Otherwise a new tensor in which each sample is either zeroed or
        scaled by ``1 / (1 - drop_prob)``.

    Raises:
        ValueError: if dropping is active and ``drop_prob`` is outside
            ``[0, 1]``.
    """
    if drop_prob == 0. or not training:
        return x
    if not 0. <= drop_prob <= 1.:
        raise ValueError(
            "drop_prob must be in the range [0, 1], got {}".format(drop_prob))

    keep_prob = 1 - drop_prob
    # One mask entry per sample, broadcastable over all remaining dimensions.
    shape = (x.shape[0],) + (1,) * (x.ndim - 1)
    random_tensor = keep_prob + torch.rand(shape, dtype=x.dtype, device=x.device)
    random_tensor.floor_()  # binarize: 1 keeps the sample, 0 drops it

    # Rescale kept samples so the expected value is preserved. When
    # keep_prob is 0 every sample is dropped (the mask is all zeros), and
    # dividing by zero would turn the output into NaN, so skip the rescale.
    scaled = x.div(keep_prob) if keep_prob > 0. else x
    return scaled * random_tensor
|
|
|
|
|
class DropPath(nn.Module):
    """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).

    Module wrapper around ``drop_path`` that supplies the module's own
    ``training`` flag.

    Args:
        drop_prob: probability of dropping a sample. ``None`` (the default)
            disables dropping, so the module acts as an identity.
    """

    def __init__(self, drop_prob=None):
        super(DropPath, self).__init__()
        self.drop_prob = drop_prob

    def forward(self, x):
        # The default drop_prob of None means "no stochastic depth". Passing
        # None on to drop_path would fail in training mode on `1 - None`,
        # so short-circuit here.
        if self.drop_prob is None:
            return x
        return drop_path(x, self.drop_prob, self.training)
|
|
|
|
|
class Mlp(nn.Module):
    """Two-layer feed-forward block: linear, activation, dropout, linear, dropout.

    Args:
        in_features: size of the last dimension of the input.
        hidden_features: width of the hidden layer; falls back to
            ``in_features`` when falsy (e.g. ``None``).
        out_features: size of the last dimension of the output; falls back
            to ``in_features`` when falsy (e.g. ``None``).
        act_layer: callable that builds the activation module when called
            with no arguments.
        drop: dropout probability passed to ``nn.Dropout``; the same dropout
            module is applied after the activation and after the second
            linear layer.
    """

    def __init__(self, in_features, hidden_features=None, out_features=None,
                 act_layer=nn.GELU, drop=0.):
        super().__init__()
        # Any falsy size falls back to the input width.
        if not out_features:
            out_features = in_features
        if not hidden_features:
            hidden_features = in_features

        # Submodules are registered in this order: fc1, act, fc2, drop.
        self.fc1 = nn.Linear(in_features, hidden_features)
        self.act = act_layer()
        self.fc2 = nn.Linear(hidden_features, out_features)
        self.drop = nn.Dropout(drop)

    def forward(self, x):
        hidden = self.drop(self.act(self.fc1(x)))
        return self.drop(self.fc2(hidden))