在上一篇博客中介绍了异构编程相关概念以及OpenCL框架,都是比较抽象的概念。本文从矢量相加的demo出发,对相关概念做进一步说明,来更深入、直观地理解OpenCL异构编程的过程。
首先,直接将实现矢量相加(每个矢量128个元素)的完整源码贴出来
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <CL/cl.h>
/* OpenCL C kernel source, compiled at runtime via clCreateProgramWithSource().
 * Each work-item adds one pair of elements: C[i] = A[i] + B[i], where i is
 * the work-item's global ID along dimension 0 (get_global_id(0)).
 * The kernel could equally live in a separate .cl file loaded at runtime. */
const char* programSource =
"__kernel void vecadd(__global int* A, __global int* B, __global int* C)\n"
"{ \n"
" int idx=get_global_id(0); \n"
" C[idx]=A[idx]+B[idx]; \n"
"} \n"
;
int main()
{
int *A = NULL;
int *B = NULL;
int *C = NULL;
const int elements= 128;
size_t datasize = sizeof(int)*elements;
A = (int*)malloc(datasize);
B = (int*)malloc(datasize);
C = (int*)malloc(datasize);
for (int i = 0; i < elements; i++)
{
A[i]=i;
B[i]=i;
}
cl_int status;
/*Discover and initialize the platforms*/
cl_uint numPlatforms = 0;
cl_platform_id* platforms = NULL;
status = clGetPlatformIDs(0, NULL, &numPlatforms); //retrieve number of platforms
printf("# of platform:%d\n", numPlatforms);
platforms = (cl_platform_id*)malloc(numPlatforms*sizeof(cl_platform_id)); // malloct memery for platform
status = clGetPlatformIDs(numPlatforms, platforms, NULL); // initialize platforms
/*print platform informations*/
for (int i = 0; i < numPlatforms; i++)
{
size_t size=0;
//name
status = clGetPlatformInfo(platforms[i], CL_PLATFORM_NAME, 0, NULL, &size);
char* name = (char*)malloc(size);
status = clGetPlatformInfo(platforms[i], CL_PLATFORM_NAME, size, name, NULL);
printf("CL_PLATFORM_NAME:%s\n", name);
//vendor
status = clGetPlatformInfo(platforms[i], CL_PLATFORM_VENDOR, 0, NULL, &size);
char *vendor = (char *)malloc(size);
status = clGetPlatformInfo(platforms[i], CL_PLATFORM_VENDOR, size, vendor, NULL);
printf("CL_PLATFORM_VENDOR:%s\n", vendor);
//version
status = clGetPlatformInfo(platforms[i], CL_PLATFORM_VERSION, 0, NULL, &size);
char *version = (char *)malloc(size);
status = clGetPlatformInfo(platforms[i], CL_PLATFORM_VERSION, size, version, NULL);
printf("CL_PLATFORM_VERSION:%s\n", version);
// profile
status = clGetPlatformInfo(platforms[i], CL_PLATFORM_PROFILE, 0, NULL, &size);
char *profile = (char *)malloc(size);
status = clGetPlatformInfo(platforms[i], CL_PLATFORM_PROFILE, size, profile, NULL);
printf("CL_PLATFORM_PROFILE:%s\n", profile);
// extensions
status = clGetPlatformInfo(platforms[i], CL_PLATFORM_EXTENSIONS, 0, NULL, &size);
char *extensions = (char *)malloc(size);
status = clGetPlatformInfo(platforms[i], CL_PLATFORM_EXTENSIONS, size, extensions, NULL);
printf("CL_PLATFORM_EXTENSIONS:%s\n", extensions);
// release
printf("\n\n");
free(name);
free(vendor);
free(version);
free(profile);
free(extensions);
}
/*Discover and initialize devices*/
cl_uint numDevices = 0;
cl_device_id* devices = NULL;
status = clGetDeviceIDs(platforms[0],CL_DEVICE_TYPE_GPU,0,NULL,&numDevices); // retrieve Device number
printf("# of device:%d\n", numDevices);
devices = (cl_device_id*)malloc(numDevices*sizeof(cl_device_id)); // malloct memery for device
status = clGetDeviceIDs(platforms[0],CL_DEVICE_TYPE_GPU,numDevices,devices,NULL); // fill in device
/*print device informations*/
for (int i = 0; i < numDevices; i++)
{
size_t value_size = 0;
//name
status = clGetDeviceInfo(devices[i], CL_DEVICE_NAME, 0, NULL, &value_size); char* name1 = (char*)malloc(value_size); status = clGetDeviceInfo(devices[i], CL_DEVICE_NAME, value_size, name1, NULL); printf("CL_DEVICE_NAME:%s\n", name1); //PARALLEL COMPUTE UNITS(CU) cl_uint maxComputeUnits = 0; status = clGetDeviceInfo(devices[i], CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(maxComputeUnits), &maxComputeUnits, NULL); printf("CL_DEVICE_MAX_COMPUTE_UNITS:%u\n", maxComputeUnits); //maxWorkItemPerGroup size_t maxWorkItemPerGroup = 0; status = clGetDeviceInfo(devices[0], CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(maxWorkItemPerGroup), &maxWorkItemPerGroup, NULL); printf("CL_DEVICE_MAX_WORK_GROUP_SIZE: %d\n", maxWorkItemPerGroup); //maxGlobalMemSize cl_ulong maxGlobalMemSize = 0; status = clGetDeviceInfo(devices[0], CL_DEVICE_GLOBAL_MEM_SIZE, sizeof(maxGlobalMemSize), &maxGlobalMemSize, NULL); printf("CL_DEVICE_GLOBAL_MEM_SIZE: %lu(MB)\n", maxGlobalMemSize / 1024 / 1024); //maxConstantBufferSize cl_ulong maxConstantBufferSize = 0; clGetDeviceInfo(devices[0], CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE, sizeof(maxConstantBufferSize), &maxConstantBufferSize, NULL); printf("CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE: %lu(KB)\n", maxConstantBufferSize / 1024); //maxLocalMemSize cl_ulong maxLocalMemSize = 0; status = clGetDeviceInfo(devices[0], CL_DEVICE_LOCAL_MEM_SIZE, sizeof(maxLocalMemSize), &maxLocalMemSize, NULL); printf("CL_DEVICE_LOCAL_MEM_SIZE: %lu(KB)\n", maxLocalMemSize / 1024);
// release printf("\n\n"); free(name1);
}
/*Creat a context*/
cl_context context = NULL;
context = clCreateContext( NULL, numDevices,devices,NULL,NULL,&status);
// context = clCreateContextFromType(NULL,CL_DEVICE_TYPE_ALL,NULL,NULL,&status);
// cl_device_id device_list;
size_t device_num;
clGetContextInfo(context, CL_CONTEXT_NUM_DEVICES, 0, NULL, &device_num);
printf("Size of cl_device_id:%d\n", sizeof(cl_device_id));
printf("Num of device in Context:%d\n", device_num);
// device_list = clGetContextInfo(context, CL_CONTEXT_DEVICES, 0, NULL, &device_list);
/*Create a command queue*/
cl_command_queue cmdQueue;
cmdQueue = clCreateCommandQueue(context,devices[0],0,&status);
/*Create device buffers*/
cl_mem bufferA;
cl_mem bufferB;
cl_mem bufferC;
bufferA = clCreateBuffer(context,CL_MEM_READ_ONLY,datasize,NULL,&status);
bufferB = clCreateBuffer(context,CL_MEM_READ_ONLY,datasize,NULL,&status);
bufferC = clCreateBuffer(context,CL_MEM_WRITE_ONLY,datasize,NULL,&status);
/*Write host data to device buffers*/
status = clEnqueueWriteBuffer(cmdQueue,bufferA,CL_FALSE,0,datasize,A,0,NULL,NULL);
status = clEnqueueWriteBuffer(cmdQueue,bufferB,CL_FALSE,0,datasize,B,0,NULL,NULL);
// status = clEnqueueWriteBuffer(cmdQueue,bufferC,CL_FALSE,0,datasize,C,0,NULL,NULL);
/*Create and compile the program*/
cl_program program = clCreateProgramWithSource(context,1,(const char **)&programSource,NULL,&status);
status = clBuildProgram(program,numDevices,devices,NULL,NULL,NULL);
if (status!= CL_SUCCESS)
{
size_t len;
char buffer[8 * 1024];
printf("Error: Failed to build program executable!\n");
clGetProgramBuildInfo(program, devices[0], CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, &len);
printf("%s\n", buffer);
}
/*Create the kernel*/
cl_kernel kernel = NULL;
kernel = clCreateKernel(program, "vecadd", &status);
/*Set the kernel arguments*/
status = clSetKernelArg(kernel,0,sizeof(cl_mem),&bufferA);
status = clSetKernelArg(kernel,1,sizeof(cl_mem),&bufferB);
status = clSetKernelArg(kernel,2,sizeof(cl_mem),&bufferC);
/*CONFIGURE THE WORK-ITEM STRUCTURE*/
size_t globalWorkSize[1];
globalWorkSize[0] = elements;
// size_t globalSize[1] = { elements }, localSize[1] = { 256 };
/*Enqueue the kernel for execution*/
status = clEnqueueNDRangeKernel(cmdQueue,kernel,1,NULL,globalWorkSize,NULL,0,NULL,NULL);
/*Read the buffer output back to host*/
clFinish(cmdQueue);
clEnqueueReadBuffer(cmdQueue,bufferC,CL_TRUE,0,datasize,C,0,NULL,NULL);
printf("The calculated outcome:");
for (int i = 0; i < elements; i++)
{
printf("%d", C[i]);
}
printf("\n");
bool result = true;
printf("The right outcome:");
for (int i = 0; i < elements; i++)
{
D[i] = i + i;
printf("%d", D[i]);
}
printf("\n");
for (int i = 0; i < elements; i++)
{
if (C[i] != D[i])
{
result = false;
break;
}
}
if (result)
{
printf("Output is correct!\n");
}
else
{
printf("Output is incorrect!\n");
}
/*Release OpenCL resources*/
clReleaseKernel(kernel);
clReleaseProgram(program);
clReleaseCommandQueue(cmdQueue);
clReleaseMemObject(bufferA);
clReleaseMemObject(bufferB);
clReleaseMemObject(bufferC);
clReleaseContext(context);
free(A);
free(B);
free(C);
free(platforms);
free(devices);
getchar();
}
下面,分别对代码进行分析,了解其是怎么实现异构编程的。
头文件就不用说了,使用OpenCL进行编程的时候需要添加对应的库,该库一般设备商会提供。如果只是在PC端使用VS仿真编程,则去OpenCL官方网站OpenCL下载对应的Header,设置相关库的路径即可。
首先,以下代码:
const char* programSource =
"__kernel void vecadd(__global int* A, __global int* B, __global int* C)\n"
"{ \n"
" int idx=get_global_id(0); \n"
" C[idx]=A[idx]+B[idx]; \n"
"} \n"
;
该部分代码称为Kernel,是真正运行在GPU上的程序。上一篇博客中讲执行模型的时候提到,OpenCL程序分为两部分,一部分运行在宿主机(CPU),另一部分运行在计算设备上(针对GPU编程,指GPU),上述代码就是指后者。这部分代码可以像上面一样,以字符的方式放在CPU程序中,也可以单独写一个.cl
文件放在外面。Kernel的语法有一定的规则,比如内核函数必须以小写的__kernel 限定符开头等,相关的可以查看具体版本OpenCL的Specification,在其官方可以下载,现在可以先不用管。
接下来确定我们的平台和设备。
status = clGetPlatformIDs(0, NULL, &numPlatforms); //retrieve number of platforms
printf("# of platform:%d\n", numPlatforms);
platforms = (cl_platform_id*)malloc(numPlatforms*sizeof(cl_platform_id)); // malloc memory for platforms
status = clGetPlatformIDs(numPlatforms, platforms, NULL); // initialize platform
函数clGetPlatformIDs()用于获取平台信息,该函数通常调用两次,第一次用于查询,第二次则指定我们所使用的平台。设备信息则通过clGetDeviceIDs()获得和指定,用法和clGetPlatformIDs()类似,如下所示:
cl_uint numDevices = 0;
cl_device_id* devices = NULL;
status = clGetDeviceIDs(platforms[0],CL_DEVICE_TYPE_GPU,0,NULL,&numDevices); // retrieve Device number
printf("# of device:%d\n", numDevices);
devices = (cl_device_id*)malloc(numDevices*sizeof(cl_device_id)); // malloc memory for devices
status = clGetDeviceIDs(platforms[0],CL_DEVICE_TYPE_GPU,numDevices,devices,NULL); // fill in device
通过clGetPlatformInfo()和clGetDeviceInfo()可以查询平台和设备的相关信息,根据输入的参数获取对应的信息。上述代码的结果如下图所示:
平台与设备信息获得后,则可以创建Context,将指定的CPU和GPU放在一个context下,则在Context中对其进行操作,其关系可以用下图表示:
P.S:该素材来源于AMD OpenCL大学教程
对应的API
cl_context context = NULL;
context = clCreateContext( NULL, numDevices,devices,NULL,NULL,&status);
同样可以通过 clGetContextInfo()查询Context的相关信息。
Context用于管理对象,对对象的操作则是通过Commandqueue命令队列实现的。
cl_command_queue cmdQueue;
cmdQueue = clCreateCommandQueue(context,devices[0],0,&status);
命令队列创建后,则需要创建内存对象。因为我们的数据一开始是在host上的,我们需要创建Context中的buffer,来实现host和compute device之间的数据传输。
cl_mem bufferA;
cl_mem bufferB;
cl_mem bufferC;
bufferA = clCreateBuffer(context,CL_MEM_READ_ONLY,datasize,NULL,&status);
bufferB = clCreateBuffer(context,CL_MEM_READ_ONLY,datasize,NULL,&status);
bufferC = clCreateBuffer(context,CL_MEM_WRITE_ONLY,datasize,NULL,&status);
通过上述命令创建的内存对象是空的,还没有数据。通过以下命令,将宿主机上的数据传输至GPU上
status = clEnqueueWriteBuffer(cmdQueue,bufferA,CL_FALSE,0,datasize,A,0,NULL,NULL);
status = clEnqueueWriteBuffer(cmdQueue,bufferB,CL_FALSE,0,datasize,B,0,NULL,NULL);
在我们的应用中,只有矢量A和B一开始是有值,C是计算完成后才获得值。一开始写入GPU的数据只有A和B,C在计算完成后再读回host.
接下来创建程序对象,创建结束后build,build为所关联的设备生成可执行体。创建程序对象时,加载的是Kernel字符。如果Kernel是单独写在外面的cl程序,则加载对应.cl
文件即可。
cl_program program = clCreateProgramWithSource(context,1,(const char**)&programSource,NULL,&status);
status = clBuildProgram(program,numDevices,devices,NULL,NULL,NULL);
程序创建后,创建在GPU上执行的Kernel对象,并传入相关参数。
cl_kernel kernel = NULL;
kernel = clCreateKernel(program, "vecadd", &status);
status = clSetKernelArg(kernel,0,sizeof(cl_mem),&bufferA);
status = clSetKernelArg(kernel,1,sizeof(cl_mem),&bufferB);
status = clSetKernelArg(kernel,2,sizeof(cl_mem),&bufferC);
Kernel不同于Program之处在于,Program是属于host上的,只是加载程序字符并且关联到设备
Kernel则是运行函数实体,然后通过设置内核参数API设置参数的值。
下面,则开始执行内核。
内核执行的时候,则根据上一章节所提到的执行模型生成索引空间,索引空间的大小以及每个工作组的大小都需要事先设定,如下所示。
size_t globalWorkSize[1];
globalWorkSize[0] = elements;
然后执行内核:
status = clEnqueueNDRangeKernel(cmdQueue,kernel,1,NULL,globalWorkSize,NULL,0,NULL,NULL);
需要注意的是,此时程序还没有真正运行,只是将其入队,具体什么时候执行,由队列决定。关于这部分我也还不是很清楚,目前知道的就是OpenCL由一定的执行机制,比如顺序执行或者乱序执行等等,等后面有了比较全面的了解后再更新。
GPU计算结束后,我们需要将数据传回给CPU。
clEnqueueReadBuffer(cmdQueue,bufferC,CL_TRUE,0,datasize,C,0,NULL,NULL);
操作完成后,需要释放相关资源与内存。
clReleaseKernel(kernel);
clReleaseProgram(program);
clReleaseCommandQueue(cmdQueue);
clReleaseMemObject(bufferA);
clReleaseMemObject(bufferB);
clReleaseMemObject(bufferC);
clReleaseContext(context);
至此,我们通过OpenCL完成了CPU 、GPU的异构编程。结合Demo,相信你对OpenCL的相关概念以及其过程有了一定认识。
OpenCL还有很多东西需要进一步学习,比如版本之间的更新情况以及命令队列执行机制,事件等等,等后面学习到了,也会分享出来的。
其实,我对这个过程以及相关概念已经比较熟悉了,但是我发现写出来还是有点混乱,逻辑不是很清晰,特别是上一篇。自己也想通过业余时间写博客的方式,记录所学的东西并训练自己的表达能力吧。因为是新手,有很多理解不到位的地方,烦请大家指出,我一定虚心接纳。