基于C++与CUDA的N卡GPU并行程序——cuBLAS简介

cuBLAS官方文档
为了使用cuBLAS库,只需要在C代码中导入库文件即可

include "cublas.h"     #旧版
include "cublas_v2.h"  #新版

编译代码文件时需要导入链接库

nvcc myCublasApp.c  -lcublas  -o myCublasApp
nvcc myCublasApp.c  -lcublas_static   -lculibos -o myCublasApp

库文件中有三个API集合,分别是cuBLAS,cuBLASXt和cuBLASLt,在cuBLAS中必须要开辟分配矩阵向量的内存,并且要将数据结果传递回CPU主机端,在cuBLASXt中则可以将数据放在主机端,或者任何类型的设备数据也行,而cuBLASLt则是一个灵活的轻量化的库,可以执行灵活的矩阵运算,选择不同的算法实现,之后可以复用算法流程以应对不同的数据.
矩阵是按列存储的,并且下标索引从1开始,这与fortran是类似的,而C中是按行存储的,两者的下标索引转换代码为,

#define IDX2F(i,j,ld) ((((j)-1)*(ld))+((i)-1))
#define IDX2C(i,j,ld) (((j)*(ld))+(i))

这里列举一个简单的示例,展示了其大致的流程

//Example 1. Application Using C and cuBLAS: 1-based indexing
//-----------------------------------------------------------
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <cuda_runtime.h>
#include "cublas_v2.h"
#define M 6
#define N 5
#define IDX2F(i,j,ld) ((((j)-1)*(ld))+((i)-1))

static __inline__ void modify (cublasHandle_t handle, float *m, int ldm, int n, int p, int q, float alpha, float beta){
    
    
    cublasSscal (handle, n-q+1, &alpha, &m[IDX2F(p,q,ldm)], ldm);
    cublasSscal (handle, ldm-p+1, &beta, &m[IDX2F(p,q,ldm)], 1);
}

int main (void){
    
    
    cudaError_t cudaStat;
    cublasStatus_t stat;
    cublasHandle_t handle;
    int i, j;
    float* devPtrA;
    float* a = 0;
    a = (float *)malloc (M * N * sizeof (*a));
    if (!a) {
    
    
        printf ("host memory allocation failed");
        return EXIT_FAILURE;
    }
    for (j = 1; j <= N; j++) {
    
    
        for (i = 1; i <= M; i++) {
    
    
            a[IDX2F(i,j,M)] = (float)((i-1) * M + j);
        }
    }
    cudaStat = cudaMalloc ((void**)&devPtrA, M*N*sizeof(*a));
    if (cudaStat != cudaSuccess) {
    
    
        printf ("device memory allocation failed");
        return EXIT_FAILURE;
    }
    stat = cublasCreate(&handle);
    if (stat != CUBLAS_STATUS_SUCCESS) {
    
    
        printf ("CUBLAS initialization failed\n");
        return EXIT_FAILURE;
    }
    stat = cublasSetMatrix (M, N, sizeof(*a), a, M, devPtrA, M);
    if (stat != CUBLAS_STATUS_SUCCESS) {
    
    
        printf ("data download failed");
        cudaFree (devPtrA);
        cublasDestroy(handle);
        return EXIT_FAILURE;
    }
    modify (handle, devPtrA, M, N, 2, 3, 16.0f, 12.0f);
    stat = cublasGetMatrix (M, N, sizeof(*a), devPtrA, M, a, M);
    if (stat != CUBLAS_STATUS_SUCCESS) {
    
    
        printf ("data upload failed");
        cudaFree (devPtrA);
        cublasDestroy(handle);
        return EXIT_FAILURE;
    }
    cudaFree (devPtrA);
    cublasDestroy(handle);
    for (j = 1; j <= N; j++) {
    
    
        for (i = 1; i <= M; i++) {
    
    
            printf ("%7.0f", a[IDX2F(i,j,M)]);
        }
        printf ("\n");
    }
    free(a);
    return EXIT_SUCCESS;
}

这里涉及了几个函数

stat = cublasSetMatrix (M, N, sizeof(*a), a, M, devPtrA, M);
stat = cublasGetMatrix (M, N, sizeof(*a), devPtrA, M, a, M);
cublasSetMatrix(int rows, int cols, int elemSize, const void *A, int lda, void *B, int ldb)
cublasGetMatrix (int rows, int cols, int elemSize, const void *A, int lda, void *B, int ldb)

cublasSetMatrix()把CPU主机端矩阵复制到GPU,因为是列优先,所以lda和ldb表示矩阵行数,cublasGetMatrix()把GPU数据复制到CPU主机端.

cublasSscal(cublasHandle_t handle, int n,const float *alpha, float *x, int incx)

cublasSscal()传入矩阵x,总大小为n,每隔incx个数,就乘以alpha.

cublasCreate(cublasHandle_t *handle)
cublasDestroy(cublasHandle_t handle)

使用cuBLAS前,创建handle管理cuBLAS的计算流程,类似于CUDA的流stream,可以同时创建多个handle,也可以绑定到不同的GPU上,多个handle之间的计算工作可以没有影响.应当尽量的创建handle,计算完成时要销毁handle,会隐性调用cublasDeviceSynchronize().如果要使用另一个GPU设备,需要调用cudaSetDevice()之后,就能调用cublasCreate()创建新的handle.

运算

运算库分为三个等级,分别是Level-1、Level-2、Level-3.其参数传递有两种,第一是通过指针引用传入alpha,beta这种可以位于主机端的标量,指针类型设为CUBLAS_POINTER_MODE_HOST,也可以设为CUBLAS_POINTER_MODE_DEVICE,这时就只能在GPU设备端了.第二是类似于asum(),rotg(),rotmg(),dot(),nrm2()这种返回参数,设为CUBLAS_POINTER_MODE_HOST时必须要等到GPU计算完成才能传递回主机端,设为CUBLAS_POINTER_MODE_DEVICE时,则能快速传递完成.
运算所处理的基本类型有单精度浮点型(s),双精度(d),单精度复数©和双精度复数(z),运算函数的名称会有相应的字符名称,表示其处理的是何种类型.首先来看Level-1的运算函数

cublasStatus_t cublasIsamax(cublasHandle_t handle, int n,
                            const float *x, int incx, int *result)
cublasStatus_t cublasIdamax(cublasHandle_t handle, int n,
                            const double *x, int incx, int *result)
cublasStatus_t cublasIcamax(cublasHandle_t handle, int n,
                            const cuComplex *x, int incx, int *result)
cublasStatus_t cublasIzamax(cublasHandle_t handle, int n,
                            const cuDoubleComplex *x, int incx, int *result)

其参数中的handle表示cuBLAS上下文,n为向量元素长度,x为设备端向量,incx为步长,result为主机端或设备端数据.调用运算函数后会产生相应的状态返回值,不同的返回值表示不同的执行效果.

CUBLAS_STATUS_SUCCESS             the operation completed successfully

CUBLAS_STATUS_NOT_INITIALIZED     the library was not initialized

CUBLAS_STATUS_ALLOC_FAILED        the reduction buffer could not be allocated

CUBLAS_STATUS_ARCH_MISMATCH       the device does not support double-precision

CUBLAS_STATUS_EXECUTION_FAILED    the function failed to launch on the GPU

相应的,求最小值的下标索引则有如下运算函数

最小值下标索引

cublasStatus_t cublasIsamin(cublasHandle_t handle, int n,
                            const float *x, int incx, int *result)
cublasStatus_t cublasIdamin(cublasHandle_t handle, int n,
                            const double *x, int incx, int *result)
cublasStatus_t cublasIcamin(cublasHandle_t handle, int n,
                            const cuComplex *x, int incx, int *result)
cublasStatus_t cublasIzamin(cublasHandle_t handle, int n,
                            const cuDoubleComplex *x, int incx, int *result)

向量绝对值求和

#This function computes the sum of the absolute values of the elements of vector x. 
cublasStatus_t  cublasSasum(cublasHandle_t handle, int n,
                            const float           *x, int incx, float  *result)
cublasStatus_t  cublasDasum(cublasHandle_t handle, int n,
                            const double          *x, int incx, double *result)
cublasStatus_t cublasScasum(cublasHandle_t handle, int n,
                            const cuComplex       *x, int incx, float  *result)
cublasStatus_t cublasDzasum(cublasHandle_t handle, int n,
                            const cuDoubleComplex *x, int incx, double *result)

向量数乘相加

#This function multiplies the vector x by the scalar α and adds it to the vector y overwriting the latest vector with the result. Hence, the performed operation is y [ j ] = α × x [ k ] + y [ j ] for i = 1 , … , n , k = 1 + ( i - 1 ) *  incx and j = 1 + ( i - 1 ) *  incy . Notice that the last two equations reflect 1-based indexing used for compatibility with Fortran. 
cublasStatus_t cublasSaxpy(cublasHandle_t handle, int n,
                           const float           *alpha,
                           const float           *x, int incx,
                           float                 *y, int incy)
cublasStatus_t cublasDaxpy(cublasHandle_t handle, int n,
                           const double          *alpha,
                           const double          *x, int incx,
                           double                *y, int incy)
cublasStatus_t cublasCaxpy(cublasHandle_t handle, int n,
                           const cuComplex       *alpha,
                           const cuComplex       *x, int incx,
                           cuComplex             *y, int incy)
cublasStatus_t cublasZaxpy(cublasHandle_t handle, int n,
                           const cuDoubleComplex *alpha,
                           const cuDoubleComplex *x, int incx,
                           cuDoubleComplex       *y, int incy)

向量复制

#This function copies the vector x into the vector y. Hence, the performed operation is y [ j ] = x [ k ] for i = 1 , … , n , k = 1 + ( i - 1 ) *  incx and j = 1 + ( i - 1 ) *  incy . Notice that the last two equations reflect 1-based indexing used for compatibility with Fortran.
cublasStatus_t cublasScopy(cublasHandle_t handle, int n,
                           const float           *x, int incx,
                           float                 *y, int incy)
cublasStatus_t cublasDcopy(cublasHandle_t handle, int n,
                           const double          *x, int incx,
                           double                *y, int incy)
cublasStatus_t cublasCcopy(cublasHandle_t handle, int n,
                           const cuComplex       *x, int incx,
                           cuComplex             *y, int incy)
cublasStatus_t cublasZcopy(cublasHandle_t handle, int n,
                           const cuDoubleComplex *x, int incx,
                           cuDoubleComplex       *y, int incy)

向量内积

#This function computes the dot product of vectors x and y. Hence, the result is ∑  ( x [ k ] × y [ j ] ) where k = 1 + ( i - 1 ) *  incx and j = 1 + ( i - 1 ) *  incy . Notice that in the first equation the conjugate of the element of vector should be used if the function name ends in character ‘c’ and that the last two equations reflect 1-based indexing used for compatibility with Fortran. 
cublasStatus_t cublasSdot (cublasHandle_t handle, int n,
                           const float           *x, int incx,
                           const float           *y, int incy,
                           float           *result)
cublasStatus_t cublasDdot (cublasHandle_t handle, int n,
                           const double          *x, int incx,
                           const double          *y, int incy,
                           double          *result)
cublasStatus_t cublasCdotu(cublasHandle_t handle, int n,
                           const cuComplex       *x, int incx,
                           const cuComplex       *y, int incy,
                           cuComplex       *result)
cublasStatus_t cublasCdotc(cublasHandle_t handle, int n,
                           const cuComplex       *x, int incx,
                           const cuComplex       *y, int incy,
                           cuComplex       *result)
cublasStatus_t cublasZdotu(cublasHandle_t handle, int n,
                           const cuDoubleComplex *x, int incx,
                           const cuDoubleComplex *y, int incy,
                           cuDoubleComplex *result)
cublasStatus_t cublasZdotc(cublasHandle_t handle, int n,
                           const cuDoubleComplex *x, int incx,
                           const cuDoubleComplex *y, int incy,
                           cuDoubleComplex       *result)

向量模长

#This function computes the Euclidean norm of the vector x. 
                            const float           *x, int incx, float  *result)
cublasStatus_t  cublasDnrm2(cublasHandle_t handle, int n,
                            const double          *x, int incx, double *result)
cublasStatus_t cublasScnrm2(cublasHandle_t handle, int n,
                            const cuComplex       *x, int incx, float  *result)
cublasStatus_t cublasDznrm2(cublasHandle_t handle, int n,
                            const cuDoubleComplex *x, int incx, double *result)

向量旋转

#This function applies Givens rotation matrix (i.e., rotation in the x,y plane counter-clockwise by angle defined by cos(alpha)=c, sin(alpha)=s): 
cublasStatus_t  cublasSrot(cublasHandle_t handle, int n,
                           float           *x, int incx,
                           float           *y, int incy,
                           const float  *c, const float           *s)
cublasStatus_t  cublasDrot(cublasHandle_t handle, int n,
                           double          *x, int incx,
                           double          *y, int incy,
                           const double *c, const double          *s)
cublasStatus_t  cublasCrot(cublasHandle_t handle, int n,
                           cuComplex       *x, int incx,
                           cuComplex       *y, int incy,
                           const float  *c, const cuComplex       *s)
cublasStatus_t cublasCsrot(cublasHandle_t handle, int n,
                           cuComplex       *x, int incx,
                           cuComplex       *y, int incy,
                           const float  *c, const float           *s)
cublasStatus_t  cublasZrot(cublasHandle_t handle, int n,
                           cuDoubleComplex *x, int incx,
                           cuDoubleComplex *y, int incy,
                           const double *c, const cuDoubleComplex *s)
cublasStatus_t cublasZdrot(cublasHandle_t handle, int n,
                           cuDoubleComplex *x, int incx,
                           cuDoubleComplex *y, int incy,
                           const double *c, const double          *s)

向量数乘

#This function scales the vector x by the scalar α and overwrites it with the result. Hence, the performed operation is x [ j ] = α × x [ j ] for i = 1 , … , n and j = 1 + ( i - 1 ) *  incx . Notice that the last two equations reflect 1-based indexing used for compatibility with Fortran. 
cublasStatus_t  cublasSscal(cublasHandle_t handle, int n,
                            const float           *alpha,
                            float           *x, int incx)
cublasStatus_t  cublasDscal(cublasHandle_t handle, int n,
                            const double          *alpha,
                            double          *x, int incx)
cublasStatus_t  cublasCscal(cublasHandle_t handle, int n,
                            const cuComplex       *alpha,
                            cuComplex       *x, int incx)
cublasStatus_t cublasCsscal(cublasHandle_t handle, int n,
                            const float           *alpha,
                            cuComplex       *x, int incx)
cublasStatus_t  cublasZscal(cublasHandle_t handle, int n,
                            const cuDoubleComplex *alpha,
                            cuDoubleComplex *x, int incx)
cublasStatus_t cublasZdscal(cublasHandle_t handle, int n,
                            const double          *alpha,
                            cuDoubleComplex *x, int incx)

向量互换

#This function interchanges the elements of vector x and y. Hence, the performed operation is y [ j ] ⇔ x [ k ] for i = 1 , … , n , k = 1 + ( i - 1 ) *  incx and j = 1 + ( i - 1 ) *  incy . Notice that the last two equations reflect 1-based indexing used for compatibility with Fortran. 
cublasStatus_t cublasSswap(cublasHandle_t handle, int n, float           *x,
                           int incx, float           *y, int incy)
cublasStatus_t cublasDswap(cublasHandle_t handle, int n, double          *x,
                           int incx, double          *y, int incy)
cublasStatus_t cublasCswap(cublasHandle_t handle, int n, cuComplex       *x,
                           int incx, cuComplex       *y, int incy)
cublasStatus_t cublasZswap(cublasHandle_t handle, int n, cuDoubleComplex *x,
                           int incx, cuDoubleComplex *y, int incy)

这里再列举几个Level-3的运算函数.

矩阵乘法

#This function performs the matrix-matrix multiplication
#C = α op ( A ) op ( B ) + β C
#where α and β are scalars, and A , B and C are matrices stored in column-major format with dimensions op ( A ) m × k , op ( B ) k × n and C m × n , respectively. 
cublasStatus_t cublasSgemm(cublasHandle_t handle,
                           cublasOperation_t transa, cublasOperation_t transb,
                           int m, int n, int k,
                           const float           *alpha,
                           const float           *A, int lda,
                           const float           *B, int ldb,
                           const float           *beta,
                           float           *C, int ldc)
cublasStatus_t cublasDgemm(cublasHandle_t handle,
                           cublasOperation_t transa, cublasOperation_t transb,
                           int m, int n, int k,
                           const double          *alpha,
                           const double          *A, int lda,
                           const double          *B, int ldb,
                           const double          *beta,
                           double          *C, int ldc)
cublasStatus_t cublasCgemm(cublasHandle_t handle,
                           cublasOperation_t transa, cublasOperation_t transb,
                           int m, int n, int k,
                           const cuComplex       *alpha,
                           const cuComplex       *A, int lda,
                           const cuComplex       *B, int ldb,
                           const cuComplex       *beta,
                           cuComplex       *C, int ldc)
cublasStatus_t cublasZgemm(cublasHandle_t handle,
                           cublasOperation_t transa, cublasOperation_t transb,
                           int m, int n, int k,
                           const cuDoubleComplex *alpha,
                           const cuDoubleComplex *A, int lda,
                           const cuDoubleComplex *B, int ldb,
                           const cuDoubleComplex *beta,
                           cuDoubleComplex *C, int ldc)
cublasStatus_t cublasHgemm(cublasHandle_t handle,
                           cublasOperation_t transa, cublasOperation_t transb,
                           int m, int n, int k,
                           const __half *alpha,
                           const __half *A, int lda,
                           const __half *B, int ldb,
                           const __half *beta,
                           __half *C, int ldc)

处理float类型的矩阵乘法, $C=\alpha * A * B + \beta * C$ ,其中的m为A的行数,n为B的列数,k为A的列数.第二、三个参数cublasOperation_t类型，可以等于CUBLAS_OP_T表示需要转置后运算，或者不需要CUBLAS_OP_N.

矩阵LU分解

#Aarray is an array of pointers to matrices stored in column-major format with dimensions nxn and leading dimension lda.

This function performs the LU factorization of each Aarray[i] for i = 0, ..., batchSize-1 by the following equation

P * Aarray [ i ] = L * U

where P is a permutation matrix which represents partial pivoting with row interchanges. L is a lower triangular matrix with unit diagonal and U is an upper triangular matrix.

Formally P is written by a product of permutation matrices Pj, for j = 1,2,...,n, say P = P1 * P2 * P3 * .... * Pn. Pj is a permutation matrix which interchanges two rows of vector x when performing Pj*x. Pj can be constructed by j element of PivotArray[i] by the following matlab code.
cublasStatus_t cublasSgetrfBatched(cublasHandle_t handle,
                                   int n,
                                   float *Aarray[],
                                   int lda,
                                   int *PivotArray,
                                   int *infoArray,
                                   int batchSize);

cublasStatus_t cublasDgetrfBatched(cublasHandle_t handle,
                                   int n,
                                   double *Aarray[],
                                   int lda,
                                   int *PivotArray,
                                   int *infoArray,
                                   int batchSize);

cublasStatus_t cublasCgetrfBatched(cublasHandle_t handle,
                                   int n,
                                   cuComplex *Aarray[],
                                   int lda,
                                   int *PivotArray,
                                   int *infoArray,
                                   int batchSize);

cublasStatus_t cublasZgetrfBatched(cublasHandle_t handle,
                                   int n,
                                   cuDoubleComplex *Aarray[],
                                   int lda,
                                   int *PivotArray,
                                   int *infoArray,
                                   int batchSize);

矩阵求逆

#Aarray and Carray are arrays of pointers to matrices stored in column-major format with dimensions n*n and leading dimension lda and ldc respectively.
#This function performs the inversion of matrices A[i] for i = 0, ..., batchSize-1.
#Prior to calling cublas<t>getriBatched, the matrix A[i] must be factorized first using the routine cublas<t>getrfBatched. After the call of cublas<t>getrfBatched, the matrix pointing by Aarray[i] will contain the LU factors of the matrix A[i] and the vector pointing by (PivotArray+i) will contain the pivoting sequence.
#Following the LU factorization, cublas<t>getriBatched uses forward and backward triangular solvers to complete inversion of matrices A[i] for i = 0, ..., batchSize-1. The inversion is out-of-place, so memory space of Carray[i] cannot overlap memory space of Array[i]. 
cublasStatus_t cublasSgetriBatched(cublasHandle_t handle,
                                   int n,
                                   float *Aarray[],
                                   int lda,
                                   int *PivotArray,
                                   float *Carray[],
                                   int ldc,
                                   int *infoArray,
                                   int batchSize);

cublasStatus_t cublasDgetriBatched(cublasHandle_t handle,
                                   int n,
                                   double *Aarray[],
                                   int lda,
                                   int *PivotArray,
                                   double *Carray[],
                                   int ldc,
                                   int *infoArray,
                                   int batchSize);

cublasStatus_t cublasCgetriBatched(cublasHandle_t handle,
                                   int n,
                                   cuComplex *Aarray[],
                                   int lda,
                                   int *PivotArray,
                                   cuComplex *Carray[],
                                   int ldc,
                                   int *infoArray,
                                   int batchSize);

cublasStatus_t cublasZgetriBatched(cublasHandle_t handle,
                                   int n,
                                   cuDoubleComplex *Aarray[],
                                   int lda,
                                   int *PivotArray,
                                   cuDoubleComplex *Carray[],
                                   int ldc,
                                   int *infoArray,
                                   int batchSize);

Typically all parameters in cublasgetrfBatched would be passed into cublasgetriBatched. For example,

// step 1: perform in-place LU decomposition, P*A = L*U.
//      Aarray[i] is n*n matrix A[i]
    cublasDgetrfBatched(handle, n, Aarray, lda, PivotArray, infoArray, batchSize);
//      check infoArray[i] to see if factorization of A[i] is successful or not.
//      Array[i] contains LU factorization of A[i]

// step 2: perform out-of-place inversion, Carray[i] = inv(A[i])
    cublasDgetriBatched(handle, n, Aarray, lda, PivotArray, Carray, ldc, infoArray, batchSize);
//      check infoArray[i] to see if inversion of A[i] is successful or not.

cuSOLVER

以上只是小批量的矩阵LU分解和求逆,也有大矩阵的矩阵分解函数,相应资料有cuSOLVER文档

使用Python调用cuBLAS矩阵运算库

使用C语言可能有点繁琐,为此可以直接使用简单方便的python语言调用cuBLAS矩阵运算库,原本以为numba可以调用cublas,但是后来发现好像不行,好像accelerate包和scikit-cuda包都可以调用cublas,但是accelerate好像不好装,所以最后使用scikit-cuda调用cublas了.scikit-cuda安装命令

sudo pip3 install scikit-cuda --target="anaconda3/lib/python3.7/site-packages"

scikit-cuda中需要使用pyCUDA包中的库函数(CUBLAS，CUSOLVER，CUFFT，CULA),所以需要事先安装好pyCUDA.
scikit-cuda官方文档
 scikit-cuda github