# Source code for torch.nn.parallel.comm

import warnings
import torch
from torch.cuda import nccl
from torch._utils import _take_tensors, _flatten_dense_tensors, \
    _unflatten_dense_tensors, _reorder_tensors_as, _get_device_index, _handle_complex
from typing import List

def broadcast(tensor, devices=None, *, out=None):
    r"""Broadcast a tensor to the specified GPU devices.

    Args:
        tensor (Tensor): tensor to broadcast. Can be on CPU or GPU.
        devices (Iterable[torch.device, str or int], optional): an iterable of
            GPU devices among which to broadcast.
        out (Sequence[Tensor], optional, keyword-only): the GPU tensors used
            to store the output results.

    .. note::
        Exactly one of :attr:`devices` and :attr:`out` must be specified.

    Returns:
        - If :attr:`devices` is specified,
            a tuple containing copies of :attr:`tensor`, placed on
            :attr:`devices`.
        - If :attr:`out` is specified,
            a tuple containing :attr:`out` tensors, each containing a copy of
            :attr:`tensor`.

    Raises:
        RuntimeError: if both or neither of :attr:`devices` and :attr:`out`
            are given.
    """
    # NOTE(review): _handle_complex comes from torch._utils; presumably it
    # turns complex tensors into a real-valued view -- confirm there.
    tensor = _handle_complex(tensor)

    # Both-None and both-given are equally invalid: exactly one is required.
    if (devices is None) == (out is None):
        raise RuntimeError(
            f"必须指定 'devices' 和 'out' 中的一个，但得到 devices={devices} 和 out={out}")

    if devices is not None:
        # Normalize every device spec before handing off to the C++ binding.
        devices = [_get_device_index(d) for d in devices]
        return torch._C._broadcast(tensor, devices)
    return torch._C._broadcast_out(tensor, out)

def broadcast_coalesced(tensors, devices, buffer_size=10485760):
    """Broadcast a sequence of tensors to the specified GPUs.

    Small tensors are first coalesced into a buffer to reduce the number of
    synchronizations.

    Args:
        tensors (sequence): tensors to broadcast. Must be on the same device,
            either CPU or GPU.
        devices (Iterable[torch.device, str or int]): an iterable of GPU
            devices among which to broadcast.
        buffer_size (int): maximum size of the buffer used for coalescing.

    Returns:
        A tuple containing copies of :attr:`tensor`, placed on
        :attr:`devices`.
    """
    # Devices are normalized first so an invalid device spec is reported
    # before any tensor is touched.
    devices = [_get_device_index(d) for d in devices]
    # NOTE(review): _handle_complex comes from torch._utils; presumably it
    # turns complex tensors into a real-valued view -- confirm there.
    tensors = [_handle_complex(t) for t in tensors]
    return torch._C._broadcast_coalesced(tensors, devices, buffer_size)


[docs]def reduce_add(inputs, destination=None):
    """从多个GPU中对张量求和。

    所有输入应具有匹配的形状、数据类型和布局。输出张量将具有相同的形状、数据类型和布局。

    参数:
        inputs (Iterable[Tensor]): 要相加的张量。
        destination (int, 可选): 输出将放置的设备（默认: 当前设备）。

    返回:
        一个包含所有输入元素和的张量，放置在 :attr:`destination` 设备上。
    """
    destination = _get_device_index(destination, optional=True)
    input_size = inputs[0].size()
    root_index = None  # 已经位于正确设备上的输入张量的索引
    for i, inp in enumerate(inputs):
        assert inp.device.type != "cpu", "reduce_add 期望所有输入都在GPU上"
        if inp.get_device() == destination:
            root_index = i
        if inp.size() != input_size:
            got = 'x'.join(str(x) for x in inp.size())
            expected = 'x'.join(str(x) for x in input_size)
            raise ValueError(f"输入 {i} 具有无效大小: 得到 {got}, 但期望 {expected}")
    if root_index is None:
        raise RuntimeError("reduce_add 期望目标设备与其中一个张量在同一GPU上")

    if len(inputs) == 1:
        return inputs[0]

    if nccl.is_available(inputs):
        result = torch.empty_like(inputs[root_index])
        nccl.reduce(inputs, output<span class="