A beginner here. I'm writing this as study notes for myself, hoping it also helps others who are just getting started; corrections from more experienced readers are very welcome. Will take this down immediately if it infringes on anything.
Table of Contents

I. Principle
II. Code Walkthrough
  1. __init__
    (1) Parameter descriptions
    (2) Initialization (similar to ColumnParallelLinear)
  2. forward (also similar to ColumnParallelLinear)

✨Note: for the ColumnParallelLinear explanations referenced below, see my earlier posts.
I. Principle
Put simply, this is a model-parallel linear layer whose weight matrix is split by rows across the partitions.
Weight: $A = \begin{bmatrix} A_1 \\ \vdots \\ A_p \end{bmatrix}$, split along its first (row) dimension ($p$ is the number of partitions, i.e. the number of GPUs);
Bias: $b$ (not parallelized);
Input: $X = [X_1, \ldots, X_p]$, split along its second (column) dimension;
Output: $Y$;
Expression: $Y = XA + b = \sum_{i=1}^{p} X_i A_i + b$, i.e. each GPU computes one partial product $X_i A_i$, and the sum is formed by an all-reduce.
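To make the row split concrete, here is a minimal single-process sketch (plain PyTorch, no distributed setup; all sizes are made up for illustration) checking that summing the per-partition products X_i A_i reproduces the full XA:

import torch

torch.manual_seed(0)
p = 4                          # number of partitions ("GPUs")
X = torch.randn(8, 16)         # input; its second dimension gets split
A = torch.randn(16, 32)        # weight; its first dimension gets split
b = torch.zeros(32)

X_parts = X.chunk(p, dim=1)    # X = [X_1, ..., X_p]
A_parts = A.chunk(p, dim=0)    # A = [A_1; ...; A_p] (row blocks)

# Each "GPU" computes one partial product; the all-reduce is a plain sum here.
Y = sum(Xi @ Ai for Xi, Ai in zip(X_parts, A_parts)) + b

assert torch.allclose(Y, X @ A + b, atol=1e-5)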
II. Code Walkthrough
(Code location: model/mpu/layers)
1. __init__
(1) Parameter descriptions
- input_size: first dimension of matrix A (the full input dimension);
- output_size: second dimension of matrix A;
- bias: whether to add a bias (the bias is not parallelized);
- input_is_parallel: if True, the input is assumed to be already split across the GPUs and is not split again; if False, forward splits it itself;
- init_method: method used to initialize the weights;
- stride: for the strided linear layers;
- keep_master_weight_for_test: added for testing and should be set to False; if True, the master weight used for initialization is returned.
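For example (hypothetical numbers): with input_size=1024, output_size=256 and 4 GPUs, each rank stores a (256, 256) weight slice, i.e. output_size × input_size/4, while the (256,)-shaped bias is kept whole on every rank.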
class RowParallelLinear(torch.nn.Module):
    """Linear layer with row parallelism.

    The linear layer is defined as Y = XA + b. A is parallelized along
    its first dimension and X along its second dimension as:
               -   -
              | A_1 |
              | .   |
          A = | .   |        X = [X_1, ..., X_p]
              | .   |
              | A_p |
               -   -

    Arguments:
        input_size: first dimension of matrix A.
        output_size: second dimension of matrix A.
        bias: If true, add bias. Note that bias is not parallelized.
        input_is_parallel: If true, we assume that the input is already
                           split across the GPUs and we do not split
                           again.
        init_method: method to initialize weights. Note that bias is always
                     set to zero.
        stride: For the strided linear layers.
        keep_master_weight_for_test: This was added for testing and should be
                                     set to False. It returns the master weights
                                     used for initialization.
    """
    def __init__(self, input_size, output_size, bias=True,
                 input_is_parallel=False,
                 init_method=init.xavier_normal_, stride=1,
                 keep_master_weight_for_test=False):
        super(RowParallelLinear, self).__init__()

        # Keep input parameters
        self.input_size = input_size
        self.output_size = output_size
        self.input_is_parallel = input_is_parallel
(2) Initialization: similar to ColumnParallelLinear (as there, the stored weight is the transpose of the logical matrix, because F.linear computes XW^T)
        # Divide the weight matrix along the last dimension.
        world_size = get_model_parallel_world_size()  # number of processes in the model-parallel group (by default there is only one group)
        self.input_size_per_partition = divide(input_size, world_size)  # size of each rank's slice of the input dimension

        # Parameters.
        # Note: torch.nn.functional.linear performs XA^T + b and as a result
        # we allocate the transpose.
        self.weight = Parameter(torch.Tensor(self.output_size,
                                             self.input_size_per_partition))
        self.weight.model_parallel = True
        # Bias
        if bias:
            self.bias = Parameter(torch.Tensor(self.output_size))
            # Always initialize bias to zero.
            with torch.no_grad():
                self.bias.zero_()
        else:
            self.register_parameter('bias', None)
        # Initialize (and partition) the weight.
        self.master_weight = _initialize_affine_weight(
            self.weight, self.output_size, self.input_size,
            self.input_size_per_partition, 1, init_method,
            stride=stride, return_master_weight=keep_master_weight_for_test)
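For reference, a simplified sketch of what _initialize_affine_weight does here (my own reconstruction, ignoring the stride handling): every rank initializes the full master weight identically, then keeps only its own slice along the partition dimension (dim 1, the input dimension):

import torch

def _initialize_affine_weight_sketch(weight, output_size, input_size,
                                     per_partition_size, partition_dim,
                                     init_method, rank, world_size):
    # Build and initialize the full ("master") weight on every rank, so the
    # initialization is independent of the number of partitions.
    master_weight = torch.empty(output_size, input_size)
    init_method(master_weight)
    # Split along the partition dimension and copy this rank's slice
    # into the local parameter.
    weight_list = torch.split(master_weight, per_partition_size,
                              dim=partition_dim)
    with torch.no_grad():
        weight.copy_(weight_list[rank])
    return master_weight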
2. forward (also similar to ColumnParallelLinear)
    def forward(self, input_):
        # Set up backprop all-reduce.
        if self.input_is_parallel:
            # The input is already split across the GPUs as (X_1, ..., X_p).
            input_parallel = input_
        else:
            # Otherwise split it here.
            input_parallel = scatter_to_model_parallel_region(input_)
        # Matrix multiply: this rank computes its partial product X_i A_i.
        output_parallel = F.linear(input_parallel, self.weight)
        # All-reduce across all the partitions: sum the partial results and
        # give every rank the final answer (e.g. GPU 0 starts with X_1 A_1
        # and, after the all-reduce, every GPU holds the full XA).
        output_ = reduce_from_model_parallel_region(output_parallel)
        # Bias
        if self.bias is not None:
            output = output_ + self.bias  # Y = XA + b
        else:
            output = output_
        return output
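The two communication helpers are autograd-aware functions from mpu's mappings module; ignoring their backward passes (scatter's backward is a gather, reduce's backward is the identity), their forward behavior is roughly the following sketch (assuming torch.distributed is initialized; the real code uses the model-parallel group, here simplified to the default group):

import torch
import torch.distributed as dist

def scatter_forward_sketch(input_):
    # Split the input along its last dimension and keep only this
    # rank's chunk: X -> X_i.
    world_size = dist.get_world_size()
    rank = dist.get_rank()
    return torch.chunk(input_, world_size, dim=-1)[rank].contiguous()

def reduce_forward_sketch(input_):
    # Sum the partial products across all ranks in place; afterwards
    # every rank holds the same full XA.
    dist.all_reduce(input_, op=dist.ReduceOp.SUM)
    return input_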
Feel free to point out any mistakes in the comments. Thanks!