一、概述
1、上一篇大致介绍了CUDA的安装和结构,同时以一个基本的例程实现了CUDA的简单使用。这一篇具体介绍一下CUDA关于矩阵的实现步骤。
2、源代码均来自于:https://github.com/Tony-Tan/CUDA_Freshman
3、需要用到的freshman.h
头文件)如下:
#ifndef FRESHMAN_H
#define FRESHMAN_H
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <math.h>
#include <time.h>
#ifdef _WIN32
# include <windows.h>
#else
# include <sys/time.h>
#endif
#ifdef _WIN32
//宏定义
#define CHECK(call)\
{
\
const cudaError_t error=call;\
if(error!=cudaSuccess)\
{
\
printf("ERROR: %s:%d,",__FILE__,__LINE__);\
printf("code:%d,reason:%s\n",error,cudaGetErrorString(error));\
exit(1);\
}\
}
int gettimeofday(struct timeval *tp, void *tzp)
{
time_t clock;
struct tm tm;
SYSTEMTIME wtm;
GetLocalTime(&wtm);
tm.tm_year = wtm.wYear - 1900;
tm.tm_mon = wtm.wMonth - 1;
tm.tm_mday = wtm.wDay;
tm.tm_hour = wtm.wHour;
tm.tm_min = wtm.wMinute;
tm.tm_sec = wtm.wSecond;
tm. tm_isdst = -1;
clock = mktime(&tm);
tp->tv_sec = clock;
tp->tv_usec = wtm.wMilliseconds * 1000;
return (0);
}
#endif
//计时函数
double cpuSecond()
{
struct timeval tp;
gettimeofday(&tp,NULL);
return((double)tp.tv_sec+(double)tp.tv_usec*1e-6);
}
//产生浮点型随机数
void initialData(float* ip,int size)
{
//使用srand和rand产生随机数
time_t t;
srand((unsigned )time(&t)); //https://www.runoob.com/w3cnote/cpp-rand-srand.html
for(int i=0;i<size;i++)
{
ip[i]=(float)(rand()&0xffff)/1000.0f;
}
}
//产生整型随机数
void initialData_int(int* ip, int size)
{
time_t t;
srand((unsigned)time(&t));
for (int i = 0; i<size; i++)
{
ip[i] = int(rand()&0xff);
}
}
//
void printMatrix(float * C,const int nx,const int ny)
{
float *ic=C;
printf("Matrix<%d,%d>:",ny,nx);
for(int i=0;i<ny;i++)
{
for(int j=0;j<nx;j++)
{
printf("%6f ",C[j]);
}
ic+=nx;
printf("\n");
}
}
//初始化设备
void initDevice(int devNum)
{
int dev = devNum;
//打印设备信息
cudaDeviceProp deviceProp;
CHECK(cudaGetDeviceProperties(&deviceProp,dev));
printf("Using device %d: %s\n",dev,deviceProp.name);
printf("Maximun number of thread per block: %d\n",deviceProp.maxThreadsPerBlock);
printf("Maximun size of each dimension of a block: %d x %d x %d\n",deviceProp.maxThreadsDim[0], deviceProp.maxThreadsDim[1], deviceProp.maxThreadsDim[2]);
printf("Maximun size of each dimension of a grid: %d x %d x %d\n",deviceProp.maxGridSize[0],deviceProp.maxGridSize[1],deviceProp.maxGridSize[2]);
printf("Maximu memory pitch %lu bytes\n", deviceProp.memPitch);
printf("Total amount of global memory: %.2f GBytes (%llu bytes)\n",(float)deviceProp.totalGlobalMem / pow(1024.0, 3), deviceProp.totalGlobalMem);
//设置设备ID
CHECK(cudaSetDevice(dev));
}
//检查计算结果
void checkResult(float * hostRef,float * gpuRef,const int N)
{
long epsilon=1.0E-8;
for(int i=0;i<N;i++)
{
if(abs(long(hostRef[i]-gpuRef[i]))>epsilon)
{
printf("Results don\'t match!\n");
printf("%f(hostRef[%d] )!= %f(gpuRef[%d])\n",hostRef[i],i,gpuRef[i],i);
return;
}
}
printf("Check result success!\n");
}
#endif//FRESHMAN_H
二、一维矩阵的运算
- 初始化CUDA以及其ID,这里表示使用ID为0的GPU执行算法。
cudaSetDevice(0);
- 为CUDA内部变量分配内存,主要包括输入和输出的变量内存预分配。
//需要计算的向量的大小:2^14
int nElem = 1 << 14;
printf("Vector size:%d\n", nElem);
//数据的字节数
int nByte = sizeof(float)*nElem;
//d:device表示设备(GPU),a,b为输入数据,res为输出数据
float *a_d, *b_d, *res_d;
cudaMalloc((float**)&a_d, nByte);
cudaMalloc((float**)&b_d, nByte);
cudaMalloc((float**)&res_d, nByte);
- 将CPU的需要计算的矩阵变量通过cudaMemcpy函数拷贝给GPU。
//将CPU数据拷贝到GPU
cudaMemcpy(a_d, a_h, nByte, cudaMemcpyHostToDevice);
cudaMemcpy(b_d, b_h, nByte, cudaMemcpyHostToDevice);
- 为算法分配线程块和线程,以此来执行矩阵中各个元素的加法。
// gridSize表示有多少个block, blockSize表示每个block有多少个thread(最大的线程数好像不能超过1024个)
//定义为一维的线程块
dim3 blocksize(1024);
//定义为一维的线程网格
dim3 gridsize(nElem / blocksize.x);
- 通过定义的核函数,使用CUDA中的各个线程来执行加法操作,实现程序的并行运算。注意:这里需要的等待所有线程执行结束后再执行后面的操作,因为CUDA执行时是先返回控制权给CPU,然后才是线程结束,参考这篇博客。
//“ << < >> >”表示运行时配置符号,在本程序中的定义是 << <1,size >> >,表示分配了一个线程块(Block),每个线程块有分配了size个线程
sumArraysGPU << <gridsize, blocksize >> >(a_d, b_d, res_d); //执行核函数
printf("Execution configuration<<<%d,%d>>>\n", gridsize.x, blocksize.x);
//等待所有线程结束
cudaDeviceSynchronize();
- 将CUDA核函数执行后的结果拷贝到CPU,完成计算
//将GPU的数据拷贝到CPU
cudaMemcpy(res_from_gpu_h, res_d, nByte, cudaMemcpyDeviceToHost);
- 执行完毕,释放内存,重置GPU
cudaFree(a_d);
cudaFree(b_d);
cudaFree(res_d);
cudaDeviceReset();
- 完整代码如下:
#include <cuda_runtime.h>
#include <stdio.h>
#include "freshman.h"
//普通的CPU版本求和方式
void sumArrays(float * a, float * b, float * res, const int size)
{
for (int i = 0; i<size; i += 4)
{
res[i] = a[i] + b[i];
res[i + 1] = a[i + 1] + b[i + 1];
res[i + 2] = a[i + 2] + b[i + 2];
res[i + 3] = a[i + 3] + b[i + 3];
}
}
//使用GPU核函数进行数组求和
__global__ void sumArraysGPU(float*a, float*b, float*res)
{
//根据线程的索引号对数组进行赋值:第几个块*每个块的线程数+线程数
int i = blockIdx.x*blockDim.x + threadIdx.x;
res[i] = a[i] + b[i];
}
int main(int argc, char **argv)
{
// Choose which GPU to run on, change this on a multi-GPU system.
cudaSetDevice(0);
//需要计算的向量的大小:2^14
int nElem = 1 << 14;
printf("Vector size:%d\n", nElem);
//为待计算的数组和输出结果数组分配内存 h:host表示主机(CPU)
int nByte = sizeof(float)*nElem;
float *a_h = (float*)malloc(nByte);
float *b_h = (float*)malloc(nByte);
float *res_h = (float*)malloc(nByte);
float *res_from_gpu_h = (float*)malloc(nByte);
//分配初始值(可以不用)
memset(res_h, 0, nByte);
memset(res_from_gpu_h, 0, nByte);
//使用宏的方式为GPU变量分配内存,对于宏和内联函数的区别可以参考:https://blog.csdn.net/qq_37934101/article/details/81266548
float *a_d, *b_d, *res_d;
CHECK(cudaMalloc((float**)&a_d, nByte));
CHECK(cudaMalloc((float**)&b_d, nByte));
CHECK(cudaMalloc((float**)&res_d, nByte));
//产生随机数
initialData(a_h, nElem);
initialData(b_h, nElem);
//将CPU数据拷贝到GPU
CHECK(cudaMemcpy(a_d, a_h, nByte, cudaMemcpyHostToDevice));
CHECK(cudaMemcpy(b_d, b_h, nByte, cudaMemcpyHostToDevice));
// gridSize表示有多少个block, blockSize表示每个block有多少个thread(最大的线程数好像不能超过1024个)
//定义为一维的线程块
dim3 blocksize(1024);
//定义为一维的线程网格
dim3 gridsize(nElem / blocksize.x);
//“ << < >> >”表示运行时配置符号,在本程序中的定义是 << <1,size >> >,表示分配了一个线程块(Block),每个线程块有分配了size个线程
sumArraysGPU << <gridsize, blocksize >> >(a_d, b_d, res_d); //执行核函数
printf("Execution configuration<<<%d,%d>>>\n", gridsize.x, blocksize.x);
CHECK(cudaDeviceSynchronize()); //等待线程结束
//将GPU的数据拷贝到CPU
CHECK(cudaMemcpy(res_from_gpu_h, res_d, nByte, cudaMemcpyDeviceToHost));
sumArrays(a_h, b_h, res_h, nElem); //执行CPU版本的数组求和
//检查两种求和方式是否一致
checkResult(res_h, res_from_gpu_h, nElem);
//释放GPU内存
cudaFree(a_d);
cudaFree(b_d);
cudaFree(res_d);
//释放CPU内存
free(a_h);
free(b_h);
free(res_h);
free(res_from_gpu_h);
CHECK(cudaDeviceReset());
return 0;
}
- 运行结果
三、二维矩阵运算
- 二维运算方式与一维很相似,唯一不同的在于线程块与线程的分配上。这里主要强调关键部分,其他部分如内存分配,拷贝,释放等于上面一致。首先是定义线程块的维度:
//设置线程块在x和y方向上的维度,均为32
int dimx = 32;
int dimy = 32;
//二维网格和二维块(当然也可以定义二维网格与一维块、一维网格与二维块,具体见完整代码)
dim3 blockSize_0(dimx, dimy);
//这里这样操作主要是为了在最后不足一个线程块时,按一个计算
dim3 gridSize_0((nx - 1) / blockSize_0.x + 1, (ny - 1) / blockSize_0.y + 1);
- 调用计时函数与核函数:
//开始计时
iStart = cpuSecond();
sumMatrix << <gridSize_0, blockSize_0 >> >(A_dev, B_dev, C_dev, nx, ny);
cudaDeviceSynchronize(); //等待线程结束
//计时结束
iElaps = cpuSecond() - iStart;
- CUDA核函数实现矩阵求和
//使用GPU核函数进行举证求和,调用核函数时,就相当于一大地线程同时(不完全同步)执行这个函数
__global__ void sumMatrix(float * MatA, float * MatB, float * MatC, int nx, int ny)
{
//线程在网格中的x坐标
int ix = threadIdx.x + blockDim.x*blockIdx.x;
//线程在网格中的y坐标
int iy = threadIdx.y + blockDim.y*blockIdx.y;
//线程在网格中的索引,与矩阵中某个元素的索引一致
int idx = ix + iy*ny;
if (ix<nx && iy<ny)
{
MatC[idx] = MatA[idx] + MatB[idx];
}
}
- 完整代码如下:
#include "freshman.h"
//这个就是CPU版本的求和函数,用于验证GPU结果是否正确
void sumMatrix2D_CPU(float * MatA, float * MatB, float * MatC, int nx, int ny)
{
float * a = MatA;
float * b = MatB;
float * c = MatC;
for (int j = 0; j<ny; j++)
{
for (int i = 0; i<nx; i++)
{
c[i] = a[i] + b[i];
}
c += nx;
b += nx;
a += nx;
}
}
//使用GPU核函数进行举证求和,调用核函数时,就相当于一大地线程同时(不完全同步)执行这个函数
__global__ void sumMatrix(float * MatA, float * MatB, float * MatC, int nx, int ny)
{
//线程在网格中的x坐标
int ix = threadIdx.x + blockDim.x*blockIdx.x;
//线程在网格中的y坐标
int iy = threadIdx.y + blockDim.y*blockIdx.y;
//线程在网格中的索引,与矩阵中某个元素的索引一致
int idx = ix + iy*ny;
if (ix<nx && iy<ny)
{
MatC[idx] = MatA[idx] + MatB[idx];
}
}
int main(int* argc, char** argv)
{
printf("strating...\n");
initDevice(0);
//设置矩阵大小和字节数
int nx = 1 << 12;
int ny = 1 << 12;
int nxy = nx*ny;
int nBytes = nxy * sizeof(float);
//分配内存(CPU),初始化数据
float* A_host = (float*)malloc(nBytes);
float* B_host = (float*)malloc(nBytes);
float* C_host = (float*)malloc(nBytes);
float* C_from_gpu = (float*)malloc(nBytes);
initialData(A_host, nxy);
initialData(B_host, nxy);
//分配内存(GPU)
float *A_dev = NULL;
float *B_dev = NULL;
float *C_dev = NULL;
CHECK(cudaMalloc((void**)&A_dev, nBytes));
CHECK(cudaMalloc((void**)&B_dev, nBytes));
CHECK(cudaMalloc((void**)&C_dev, nBytes));
//数据拷贝到GPU
CHECK(cudaMemcpy(A_dev, A_host, nBytes, cudaMemcpyHostToDevice));
CHECK(cudaMemcpy(B_dev, B_host, nBytes, cudaMemcpyHostToDevice));
//设置线程块在x和y方向上的维度,均为32
int dimx = 32;
int dimy = 32;
// cpu compute 这里不需要,后面有拷贝函数
// cudaMemcpy(C_from_gpu, C_dev, nBytes, cudaMemcpyDeviceToHost);
double iStart = cpuSecond();
sumMatrix2D_CPU(A_host, B_host, C_host, nx, ny);
double iElaps = cpuSecond() - iStart;
printf("CPU Execution Time elapsed %f sec\n", iElaps);
//二维网格和二维块
dim3 blockSize_0(dimx, dimy);
dim3 gridSize_0((nx - 1) / blockSize_0.x + 1, (ny - 1) / blockSize_0.y + 1); //这里这样操作主要是为了在最后不足一个线程块时,按一个计算
iStart = cpuSecond();
sumMatrix << <gridSize_0, blockSize_0 >> >(A_dev, B_dev, C_dev, nx, ny);
CHECK(cudaDeviceSynchronize()); //等待线程结束
iElaps = cpuSecond() - iStart;
printf("GPU Execution configuration<<<(%d,%d),(%d,%d)>>> Time elapsed %f sec\n",
gridSize_0.x, gridSize_0.y, blockSize_0.x, blockSize_0.y, iElaps);
CHECK(cudaMemcpy(C_from_gpu, C_dev, nBytes, cudaMemcpyDeviceToHost));
checkResult(C_host, C_from_gpu, nxy);
//一维网格和一维块
dimx = 32;
dim3 blockSize_1(dimx);
dim3 gridSize_1((nxy - 1) / blockSize_1.x + 1);
iStart = cpuSecond();
sumMatrix << <gridSize_1, blockSize_1 >> >(A_dev, B_dev, C_dev, nx*ny, 1);
CHECK(cudaDeviceSynchronize()); //等待线程结束
iElaps = cpuSecond() - iStart;
printf("GPU Execution configuration<<<(%d,%d),(%d,%d)>>> Time elapsed %f sec\n",
gridSize_1.x, gridSize_1.y, blockSize_1.x, blockSize_1.y, iElaps);
CHECK(cudaMemcpy(C_from_gpu, C_dev, nBytes, cudaMemcpyDeviceToHost));
checkResult(C_host, C_from_gpu, nxy);
//二维网格和一维块
dimx = 32;
dim3 blockSize_2(dimx);
dim3 gridSize_2((nx - 1) / blockSize_2.x + 1, ny);
//计算开始时间
iStart = cpuSecond();
sumMatrix << <gridSize_2, blockSize_2 >> >(A_dev, B_dev, C_dev, nx, ny);
CHECK(cudaDeviceSynchronize()); //等待线程结束
//获得所有线程结束时间
iElaps = cpuSecond() - iStart;
printf("GPU Execution configuration<<<(%d,%d),(%d,%d)>>> Time elapsed %f sec\n",
gridSize_2.x, gridSize_2.y, blockSize_2.x, blockSize_2.y, iElaps);
CHECK(cudaMemcpy(C_from_gpu, C_dev, nBytes, cudaMemcpyDeviceToHost));
checkResult(C_host, C_from_gpu, nxy);
//释放内存
cudaFree(A_dev);
cudaFree(B_dev);
cudaFree(C_dev);
free(A_host);
free(B_host);
free(C_host);
free(C_from_gpu);
cudaDeviceReset();
return 0;
}
- 运行结果:
四、总结
1、细心的朋友可以发现,CUDA在执行程序时,都会进行数据的拷贝,这一步其实相当耗时。
2、以上均是线程个数和需要计算的矩阵数据个数相同,当分配的线程不足以计算一个矩形块或是需要单个线程处理多次数据时,又该如何处理呢,待续…
五、其他
一个打印线程索引的例子:
#include "freshman.h"
__global__ void printThreadIndex(float *A, const int nx, const int ny)
{
int ix = threadIdx.x + blockIdx.x*blockDim.x;
int iy = threadIdx.y + blockIdx.y*blockDim.y;
unsigned int idx = iy*nx + ix;
printf("thread_id(%d,%d) block_id(%d,%d) coordinate(%d,%d)"
"global index %2d ival %2d\n", threadIdx.x, threadIdx.y,
blockIdx.x, blockIdx.y, ix, iy, idx, A[idx]);
}
int main(int* argc, char** argv)
{
initDevice(0);
//设置矩阵大小和字节数
int nx = 8, ny = 6;
int nxy = nx*ny;
int nBytes = nxy * sizeof(float);
//分配内存(CPU),初始化数据,打印矩阵
float* A_host = (float*)malloc(nBytes);
initialData(A_host, nxy);
printMatrix(A_host, nx, ny);
//分配内存(GPU)
float *A_dev = NULL;
CHECK(cudaMalloc((void**)&A_dev, nBytes));
//数据拷贝
cudaMemcpy(A_dev, A_host, nBytes, cudaMemcpyHostToDevice);
// gridSize表示有多少个block, blockSize表示每个block有多少个thread(最大的线程数好像不能超过1024个)
dim3 blocksize(4, 2);
dim3 gridsize((nx - 1) / blocksize.x + 1, (ny - 1) / blocksize.y + 1);
printThreadIndex << <gridsize, blocksize >> >(A_dev, nx, ny);
//等待所有线程结束
CHECK(cudaDeviceSynchronize());
//释放内存
cudaFree(A_dev);
free(A_host);
//重置设备
cudaDeviceReset();
return 0;
}