# Source code for torch.nn.init

"""此文件包含用于初始化神经网络参数的实用程序。"""
import math
import warnings

from torch import Tensor
import torch
from typing import Optional as _Optional

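# These no_grad_* functions are necessary as wrappers around the parts of the
# initializers that use `with torch.no_grad()`. The JIT doesn't support context
# managers, so these need to be implemented as builtins. Using these wrappers
# lets us keep those builtins small and re-usable.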
def _no_grad_uniform_(tensor, a, b, generator=None):
    """Run ``tensor.uniform_(a, b)`` with gradient tracking disabled.

    Args:
        tensor: tensor to overwrite in place.
        a: first bound forwarded to ``uniform_``.
        b: second bound forwarded to ``uniform_``.
        generator: optional random generator forwarded to ``uniform_``.

    Returns:
        The result of the in-place ``uniform_`` call.
    """
    with torch.no_grad():
        sampled = tensor.uniform_(a, b, generator=generator)
    return sampled


def _no_grad_normal_(tensor, mean, std, generator=None):
    """Run ``tensor.normal_(mean, std)`` with gradient tracking disabled.

    Args:
        tensor: tensor to overwrite in place.
        mean: mean forwarded to ``normal_``.
        std: standard deviation forwarded to ``normal_``.
        generator: optional random generator forwarded to ``normal_``.

    Returns:
        The result of the in-place ``normal_`` call.
    """
    with torch.no_grad():
        sampled = tensor.normal_(mean, std, generator=generator)
    return sampled


def _no_grad_trunc_normal_(tensor, mean, std, a, b, generator=None):
    """Fill ``tensor`` in place with normal samples truncated to ``[a, b]``.

    The sampling follows the inverse-CDF method described in
    https://people.sc.fsu.edu/~jburkardt/presentations/truncated_normal.pdf:
    draw uniformly between the CDF values of the two bounds, then map the
    draws back through the inverse CDF of the normal distribution.

    Args:
        tensor: tensor to overwrite in place.
        mean: mean of the underlying normal distribution.
        std: standard deviation of the underlying normal distribution.
        a: lower truncation bound.
        b: upper truncation bound.
        generator: optional random generator forwarded to ``uniform_``.

    Returns:
        ``tensor`` after the in-place update.

    Warns:
        Emits a warning when ``mean`` lies more than two ``std`` below ``a``
        or above ``b``.
    """
    sqrt_two = math.sqrt(2.)

    def standard_normal_cdf(value):
        # Standard normal cumulative distribution function, via erf.
        return (1. + math.erf(value / sqrt_two)) / 2.

    below_range = mean < a - 2 * std
    if below_range or (mean > b + 2 * std):
        warnings.warn("mean is more than 2 std from [a, b] in nn.init.trunc_normal_. "
                      "The distribution of values may be incorrect.",
                      stacklevel=2)

    # CDF values at the lower and upper truncation bounds.
    cdf_at_a = standard_normal_cdf((a - mean) / std)
    cdf_at_b = standard_normal_cdf((b - mean) / std)

    with torch.no_grad():
        # Draw uniformly over [cdf_at_a, cdf_at_b], already rescaled to
        # [2 * cdf_at_a - 1, 2 * cdf_at_b - 1] so erfinv can be applied directly.
        tensor.uniform_(2 * cdf_at_a - 1, 2 * cdf_at_b - 1, generator=generator)

        # Inverse-CDF transform to a truncated standard normal, then shift and
        # scale to the requested mean/std, and finally clamp so every value
        # ends up inside [a, b].
        tensor.erfinv_().mul_(std * sqrt_two).add_(mean).clamp_(min=a, max=b)

    return tensor


def _no_grad_fill_(tensor, val):
    """Run ``tensor.fill_(val)`` with gradient tracking disabled.

    Args:
        tensor: tensor to overwrite in place.
        val: value forwarded to ``fill_``.

    Returns:
        The result of the in-place ``fill_`` call.
    """
    with torch.no_grad():
        filled = tensor.fill_(val)
    return filled


def _no_grad_zero_(tensor):
    """Run ``tensor.zero_()`` with gradient tracking disabled.

    Args:
        tensor: tensor to overwrite in place.

    Returns:
        The result of the in-place ``zero_`` call.
    """
    with torch.no_grad():
        zeroed = tensor.zero_()
    return zeroed


[docs]def calculate_gain(nonlinearity, param=None):
    r"""返回给定非线性函数的推荐增益值。

    值如下：

    ================= ====================================================
    nonlinearity      增益
    ================= ====================================================
    Linear / Identity :math:`1`
    Conv{1,2,3}D      :math:`1`
    Sigmoid           :math:`1`
    Tanh              :math:`\frac{5}{3}`
    ReLU              :math:`\sqrt{2}`
    Leaky Relu        :math:`\sqrt{\frac{2}{1 + \text{negative\_slope}^2}}`
    SELU              :math:`\frac{3}{4}`
    ================= ====================================================

    .. 警告::
        为了实现 `自归一化神经网络`_ ，
        你应该使用 ``nonlinearity='linear'`` 而不是 ``nonlinearity='selu'``。
        这使得初始权重的方差为 ``1 / N``，
        这对于在前向传递中诱导稳定的固定点是必要的。
        相比之下，``SELU`` 的默认增益牺牲了归一化效果，
        以在矩形层中获得更稳定的梯度流。

    参数:
        nonlinearity: 非线性函数 (`nn.functional` 名称)
        param: 非线性函数的可选参数

    示例:
        >>> gain = nn.init.calculate_gain('leaky_relu', 0.2)  # leaky_relu with negative_slope=0.2

    .. _自归一化神经网络: https://papers.nips.cc/paper/2017/hash/5d44ee6f2c3f71b73125876103c8f6c4-Abstract.html
    """
    linear_fns = ['linear', 'conv1d', 'conv2d', 'conv3d', 'conv_transpose1d', 'conv_transpose2d', 'conv_transpose3d']
    if nonlinearity in linear_fns or nonlinearity == 'sigmoid':
        return 1
    elif nonlinearity == 'tanh':
        return 5.0 / 3
    elif nonlinearity == 'relu':
        <