OpenCL 介绍

因为公司项目的需要，我开始接触opencl,之前只知道opencl是做平行计算的，可以加速绝大多数数值计算。目前，有很多知名的算法都被用opencl提速，如fft等。

楔子

学习之路漫长，记录工作中的点点滴滴。

opencl框架

opencl能进行算法加速的好处就不在这里累赘的说明了，网上有大把的文章来“赞美它”，摘要最直观的一幅图
CPU与GPU性能比较
上图可以很明显的看出GPU的优势。

opencl主要函数介绍

1 获取平台clGetPlatFormIDs

 cl_int clGetPlatFormIDs(cl_uint num_entries, cl_platform _id * platforms,cl_uint *num_platforms)

num_entries 可加入platforms的数目，如果platforms不等于NULL,num_entries必须大于0.
platforms 返回所找到OpenCL平台列表
num_platforms 返回实际可用的OpenCL平台集。如果num _ platform等于NULL,则被忽略。

2 查询设备clGetDeviceIDs

cl_int clGetDeviceIDs(cl_platform_id platform,cl_device_type  device_type, cl_uint  num_entries, cl_device_id *devices, cl_uint *num_devices);

* platform 是clGetPlatformIDs返回的平台ID或者为NULL.
* device _ type 用来标示OpenCL设备的类型。
* devices 返回一个列表，其中存放所找到的OpenCL设备
* num _ devices 返回与device _ type 相匹配的可用OpenCL设备的数目，如果num _ devices 是NULL ,则忽略此参数。

3 上下文clCreateContext

cl_context clCreateContext(const cl_context_properties * properties, cl_uint num_devices, const cl_device_id * devices, (void)(*pfn_notify) (const char * errinfo, const void *private_info ,size_t cb, void *user_data), void *user _data, cl_int * errcode_ret);

* properties 指向一个列表，其中有上下文属性名称以及其对应的值。每个属性名称后面紧跟器对应的期望值
* num_devices 是参数devices中设备的数目
* devices 是一个指针，指向clGetDeviceIDs所返回的设备列表。
* pfn_notify 应用所注册的一个回调函数
* errcode_ret 用来返回的错误码

4 命令队列clCreateCommandQueue

cl_command_queue clCreateCommandQueue(cl_context context, cl_device_id device, cl_command_queue_properties properties,cl_int *errcode_ret)

context 必须是一个有效的上下文。
device 必须是与context关联的一个设备。
properties 指定命令队列的一系列属性。
errcode_ret 返回的错误码

5创建内存对象clCreateBuffer

cl_mem clCreateBuffer(cl_context context,cl_mem_flags flags,size_t size,void *host_ptr,cl_int *errcode_ret)

* context 是一个有效的OpenCL上下文，用来创建缓冲对象。
* flags 是一个位域，用来指定分配和使用信息，如在哪个内存区域分配缓冲对象以及怎么使用。
* size 是所分配缓冲对象的字节数。
* host_ptr 指定由应用所分配的缓冲数据。
* errcode_ret 返回错误码

6 设置内核参数 clSetKernelArg

cl_int clSetKernelArg（cl_kernel kernel, cl_uint arg_index,size_t arg_size, const void *arg_value）

kernel 是一个有效的内核对象
arg_index 是参数的索引。内核参数的索引从最左边参数的0到n-1，其中n是内核所有声明的参数个数。
arg_size 表明参数值得大小。如果参数是内存对象，则其值位缓冲对象或图像对象的大小。
arg_value 所指数据用来作为索引位arg_index的惨呼，在clSetKernelArg返回后，arg_value所指数据已经被拷贝，其内存可以由应用重新使用。

7 执行内核clEnqueueNDRangeKernel

cl_int clEnqueueNDRangeKernel(cl_command_queue command_queue,cl_kernel kernel, cl_uint work_dim,const size_t *global_work_offset,const size_t *global_work_size,const size_t *local_work_size,cl_uint num_events_in_wait_list,const cl_event *event_wait_list, cl_event *event)

* command_queue 是一个有效的命令队列，内核将被入队并在command_queue所关联的设备上执行。
* kernel 是一个有效的内核对象。kernel 所关联的OpenCL上下文必须和command_queue所关联的保持一致
* work_dim 是维数，用来指定全局工作项和工作组中的工作项。work_dim必须大于0，且小于等3.
* global_work_offset 在当前必须是NULL,将来修订OpenCL时，此参数可能用来指定一组work_dim无符号值作为偏移量，用来计算工作项的全局ID,而不必非得知道从偏移位置(0,0,…,0)起始的全局ID.
* global_work_size 指向一组work_dim无符号值，描述work_dim维度中用来执行内核函数的全局工作项的数目。全局工作项的数目这样计算：global_work_size[0]*…*global_work_size[work_dim - 1]。
* local_work_size 指向一组work_dim 无符号值，描述组成一个工作组(会执行kernel)的工作项的数目.工作组长工作项的总数可以这样算得local_work_size[0]*…*local_work_size[work_dim - 1],此数目必须小于等于CL _DEVICE _MAX _WORK _GROUP _SIZE.
* num_events_in_wait_list
* event_wait_list
* event

8命令队列全部完成

cl_int clFinish(cl_command_queue command_queue)

* command_queue 是一个有效的命令队列
会阻塞，知道之前入队的所有OpenCL命令全部提交并完成。直到command_queue中所有命令被处理并完成后，clFinish才会返回。

9 样例：

#include <CL/cl.h>
#include <stdio.h>
#include <stdlib.h>
#include <iostream>
using namespace std;
int main(void)
{
const int  array_size = 1024;
cl_uint numPlatforms = 0;
cl_platform_id platform = nullptr;
cl_context context = nullptr;
cl_command_queue commandQueue = nullptr;
cl_program program = nullptr;
cl_mem input1MemObj = nullptr;
cl_mem input2MemObj = nullptr;
cl_mem outputMemObj = nullptr;
cl_kernel kernel = nullptr;
cl_int    status = clGetPlatformIDs(0, NULL, &numPlatforms);
if (status != CL_SUCCESS)
{
    cout << "Error: Getting platforms!" << endl;
    return 0;
}


if (numPlatforms > 0)
{
    cl_platform_id* platforms = (cl_platform_id*)malloc(numPlatforms* sizeof(cl_platform_id));
    status = clGetPlatformIDs(numPlatforms, platforms, NULL);
    platform = platforms[0];
    free(platforms);
}
else
{
    puts("Your system does not have any OpenCL platform!");
    return 0;
}


cl_uint                numDevices = 0;
cl_device_id        *devices;
status = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 0, NULL, &numDevices);
if (numDevices == 0) //no GPU available.        
{
    cout << "No GPU device available." << endl;
cout << "Choose CPU as default device." << endl;
status = clGetDeviceIDs(platform, CL_DEVICE_TYPE_CPU, 0, NULL, &numDevices);
devices = (cl_device_id*)malloc(numDevices * sizeof(cl_device_id));

status = clGetDeviceIDs(platform, CL_DEVICE_TYPE_CPU, numDevices, devices, NULL);
}
    else
    {
        devices = (cl_device_id*)malloc(numDevices * sizeof(cl_device_id));
        status = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, numDevices, devices, NULL);
        cout << "The number of devices: " << numDevices << endl;
    }


    context = clCreateContext(NULL, 1, devices, NULL, NULL, NULL);


    commandQueue = clCreateCommandQueue(context, devices[0], 0, NULL);


    // Read the kernel code to the buffer
    FILE *fp = fopen("cl_kernel2.cl", "rb");
    if (fp == nullptr)
    {
        puts("The kernel file not found!");
        goto RELEASE_RESOURCES;
    }
    fseek(fp, 0, SEEK_END);
    size_t kernelLength = ftell(fp);
    fseek(fp, 0, SEEK_SET);
    char *kernelCodeBuffer = (char*)malloc(kernelLength + 1);
    fread(kernelCodeBuffer, 1, kernelLength, fp);
    kernelCodeBuffer[kernelLength] = '\0';
    fclose(fp);

    const char *aSource = kernelCodeBuffer;
    program = clCreateProgramWithSource(context, 1, &aSource, &kernelLength, NULL);


    status = clBuildProgram(program, 1, devices, NULL, NULL, NULL);


    // Do initialization
    int i;
    int input1Buffer[array_size];
    int input2Buffer[array_size];
    int outputBuffer[array_size];
    for (i = 0; i < array_size; i++)
        input1Buffer[i] = input2Buffer[i] = i + 1;
    memset(outputBuffer, 0, sizeof(outputBuffer));

    // Create mmory object
    input1MemObj = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, array_size * sizeof(int), input1Buffer, nullptr);
    input2MemObj = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, array_size * sizeof(int), input2Buffer, nullptr);
    outputMemObj = clCreateBuffer(context, CL_MEM_WRITE_ONLY, array_size * sizeof(int), NULL, NULL);


    kernel = clCreateKernel(program, "cl_add", NULL);


    status = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&outputMemObj);
    status = clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&input1MemObj);
    status = clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *)&input2MemObj);


    size_t global_work_size[1] = { array_size };
    status = clEnqueueNDRangeKernel(commandQueue, kernel, 1, NULL, global_work_size, NULL, 0, NULL, NULL);
    clFinish(commandQueue);


    status = clEnqueueReadBuffer(commandQueue, outputMemObj, CL_TRUE, 0, global_work_size[0] * sizeof(int), outputBuffer, 0, NULL, NULL);

    printf("Veryfy the rsults... ");
    for (i = 0; i < array_size; i++)
    {
        if (outputBuffer[i] != (i + 1) * 2)
        {
            puts("Results not correct!");
            break;
        }
    }
    if (i == array_size)
        puts("Correct!");

RELEASE_RESOURCES:

    status = clReleaseKernel(kernel);//*Release kernel.
    status = clReleaseProgram(program);    //Release the program object.
    status = clReleaseMemObject(input1MemObj);//Release mem object.
    status = clReleaseMemObject(input2MemObj);
    status = clReleaseMemObject(outputMemObj);
    status = clReleaseCommandQueue(commandQueue);//Release  Command queue.
    status = clReleaseContext(context);//Release context.
    free(devices);
    getchar();
    return 1;
}

cl_kernel.cl 函数

__kernel void cl_add(__global int *dst, __global int *src1, __global int *src2)
{
    int index = get_global_id(0);
    dst[index] = src1[index] + src2[index];
}

结果：

附上美女一枚：

参考

倪庆亮 OpenCL规范
Khronos OpenCL Working Group

OpenCL学习之介绍