CUDA：纹理内存入门到精通--纹理对象

导读

CUDA：纹理内存入门到精通–纹理内存概要
 CUDA：纹理内存入门到精通–纹理参考
 CUDA：纹理内存入门到精通–纹理对象

背景

纹理对象是CUDA针对纹理参考缺点而提出的升级版，其作用和纹理参考完全一致，但是使用方法更加灵活。

纹理对象

与纹理参考相比，CUDA对其进行各方面的升级，一方面是可以再代码中申请和销毁，另一方面则可以作为设备函数的参数进行传入；可以满足一些特殊的需求。

纹理对象API

使用纹理对象主要包括纹理对象创建、纹理访问和纹理对象销毁。

纹理对象创建

纹理对象创建之前首先要分别对纹理资源和纹理对象属性进行确定，分别对应cudaResoruceDesc和cudaTextureDesc；然后即可利用cudaCreateTextureObject来创建纹理对象。本文通过代码来解释这个过程：

    // 纹理资源
    struct cudaResourceDesc resDesc;
    memset(&resDesc, 0, sizeof(resDesc));
    //resType指定对应设备内存的形式，主要包括
    //cudaResourceTypeArray(二维纹理内存和二维纹理对象）
    //cudaResourceTypeMipmappedArray（不常用）
    //cudaResourceTypeLinear（一维纹理内存和一维纹理对象）
    //cudaResourceTypePitch2D（一维纹理内存和二维纹理对象）
    resDesc.resType = cudaResourceTypeArray;
    //res是一个枚举变量，针对不同内存也有不同的形式
    //cudaResourceTypeArray 对应 res.array.array
    //cudaResourceTypeMipmappedArray 对应res.mipmap.mipmap
    //cudaResourceTypeLinear 对应 res.linear.devPtr（同时还需要设置res.linear.sizeInBytes和res.linear.desc）
    //cudaResourceTypePitch2D 对应 res.pitch2D.devPtr(同时需要设定res.pitch2D.pitchInBytes,res.pitch2D.width,res.pitch2D.height,res.pitch2D.ddesc)
    resDesc.res.array.array = cuArray;//指定需要绑定的二维纹理内存

    // 纹理对象的属性
    // 由于与纹理参考类似，不再重复介绍
    // 再次提醒一次：cudaFilterModeLinear必须配合cudaReadModeNormalizedFloat使用
    struct cudaTextureDesc texDesc;
    memset(&texDesc, 0, sizeof(texDesc));
    texDesc.addressMode[0]   = cudaAddressModeWrap;
    texDesc.addressMode[1]   = cudaAddressModeWrap;
    texDesc.filterMode       = cudaFilterModeLinear;
    texDesc.readMode         = cudaReadModeNormalizedFloat;
    texDesc.normalizedCoords = 1;

    // 创建纹理对象
    cudaTextureObject_t texObj = 0;
    cudaCreateTextureObject(&texObj, &resDesc, &texDesc, NULL);

纹理访问

纹理对象的纹理访问也和纹理参考一样，也是使用tex1D或tex2D等函数进行操作。

//一维纹理
template<class T>
T tex1D(cudaTextureObject_t texObj, float x);

//二维纹理
template<class T>
T tex2D(cudaTextureObject_t texObj, float x, float y);

纹理对象销毁

纹理对象的销毁直接使用cudaDestroyTextureObject即可。值得注意的是应该先销毁对象，然后再释放对应的设备内存。

__host__ cudaError_t cudaDestroyTextureObject ( cudaTextureObject_t texObject )

示例代码

// 核函数
__global__ void transformKernel(float* output,
                                cudaTextureObject_t texObj,
                                int width, int height,
                                float theta) 
{
    unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
    unsigned int y = blockIdx.y * blockDim.y + threadIdx.y;

    float u = x / (float)width;
    float v = y / (float)height;

    u -= 0.5f;
    v -= 0.5f;
    float tu = u * cosf(theta) - v * sinf(theta) + 0.5f;
    float tv = v * cosf(theta) + u * sinf(theta) + 0.5f;

    // 获取纹理
    output[y * width + x] = tex2D<float>(texObj, tu, tv);
}

int main()
{
    //实验数据
    int width = 10;
    int height = 10;
    float h_data[width*height];

    for(int y=0;y<height;y++)
    {
      for(int x=0;x<width;x++)
      {
        h_data[y*width+x] = x+y;
      }
    }

    cudaChannelFormatDesc channelDesc =
               cudaCreateChannelDesc(32, 0, 0, 0,
                                     cudaChannelFormatKindFloat);
    cudaArray* cuArray;
    cudaMallocArray(&cuArray, &channelDesc, width, height);

    cudaMemcpyToArray(cuArray, 0, 0, h_data, size,
                      cudaMemcpyHostToDevice);

    // 创建纹理对象
    struct cudaResourceDesc resDesc;
    memset(&resDesc, 0, sizeof(resDesc));
    resDesc.resType = cudaResourceTypeArray;
    resDesc.res.array.array = cuArray;

    struct cudaTextureDesc texDesc;
    memset(&texDesc, 0, sizeof(texDesc));
    texDesc.addressMode[0]   = cudaAddressModeWrap;
    texDesc.addressMode[1]   = cudaAddressModeWrap;
    texDesc.filterMode       = cudaFilterModeLinear;
    texDesc.readMode         = cudaReadModeNormalizedFloat;
    texDesc.normalizedCoords = 1;

    cudaTextureObject_t texObj = 0;
    cudaCreateTextureObject(&texObj, &resDesc, &texDesc, NULL);

    float* output;
    cudaMalloc(&output, width * height * sizeof(float));

    // 调用核函数
    dim3 dimBlock(16, 16);
    dim3 dimGrid((width  + dimBlock.x - 1) / dimBlock.x,
                 (height + dimBlock.y - 1) / dimBlock.y);
    transformKernel<<<dimGrid, dimBlock>>>(output,
                                           texObj, width, height,
                                           angle);

    // 销毁纹理对象
    cudaDestroyTextureObject(texObj);

    // 释放设备内存
    cudaFreeArray(cuArray);
    cudaFree(output);

    return 0;
}

额外注意

在使用纹理对象时还有一些特殊注意。通常而言，使用纹理对象的目的就是使用多个纹理对象，因此会申请一个cudaTextureObject_t 数组。注意在使用时必须将其拷贝到device端才能正常使用。下边给出一个示例。

#include "cuda_runtime.h"
#include "device_launch_parameters.h" 

#include <stdio.h>
#include <time.h>

__global__ void kernel_set_value(const cudaTextureObject_t *texObj,float *dev_result,int width,int height)
{
    int x = threadIdx.x + blockIdx.x*blockDim.x;
    int y = threadIdx.y + blockIdx.y*blockDim.y;

    if(x<0 || x>width || y<0 || y>height)
    {
        return;
    }

    float sum = 0;
    for(int i=0;i<4;i++)
    {
        sum += tex2D<float>(texObj[i],x,y);
    }

    int pos = y*width + x;
    dev_result[pos]= sum;

}


int main()
{
    const int array_size_width = 10;
    const int array_size_height = 10;
    float random_array[array_size_width*array_size_height];
    for(int i=0;i<array_size_width*array_size_height;i++)
    {
        random_array[i] = 1;
    }

    //error status
    cudaError_t cuda_status;

    //only chose one GPU
    cuda_status = cudaSetDevice(0);
    if(cuda_status != cudaSuccess)
    {
        fprintf(stderr,"cudaSetDevice failed! Do you have a CUDA-Capable GPU installed?");
        return 1;
    }

    cudaArray *dev_random_array[4];
    cudaTextureObject_t texObj[4];
    for(int i=0;i<4;i++)
    {
        cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<float>();
        //allocate memory on the GPU
        cuda_status = cudaMallocArray(&dev_random_array[i],
                                        &channelDesc,
                                        array_size_width,
                                        array_size_height);
        if(cuda_status != cudaSuccess)
        {
            fprintf(stderr,"cudaMallocArray Failed");
            exit( EXIT_FAILURE );
        }

        cuda_status = cudaMemcpyToArray(dev_random_array[i],
                                            0,
                                            0,
                                            random_array,
                                            sizeof(float)*array_size_height*array_size_width,
                                            cudaMemcpyHostToDevice);
        if(cuda_status != cudaSuccess)
        {
            fprintf(stderr,"cudaMemcpyToArray Failed");
            exit( EXIT_FAILURE );
        }

        // Specify texture
        struct cudaResourceDesc resDesc;
        memset(&resDesc, 0, sizeof(resDesc));
        resDesc.resType         = cudaResourceTypeArray;
        resDesc.res.array.array = dev_random_array[i];

        // Specify texture object parameters
        struct cudaTextureDesc texDesc;
        memset(&texDesc, 0, sizeof(texDesc));
        texDesc.addressMode[0]   = cudaAddressModeWrap;
        texDesc.addressMode[1]   = cudaAddressModeWrap;
        texDesc.filterMode       = cudaFilterModePoint;
        texDesc.readMode         = cudaReadModeElementType;
        texDesc.normalizedCoords = 0;


        cudaCreateTextureObject(&texObj[i], &resDesc, &texDesc, NULL);
    }

    //将纹理对象拷贝到设备端
    cudaTextureObject_t *dev_texObj;
    cudaMalloc((void**)&dev_texObj,sizeof(cudaTextureObject_t)*4);
    cudaMemcpy(dev_texObj,texObj,sizeof(cudaTextureObject_t)*4,cudaMemcpyHostToDevice);

    float *dev_result;
    cudaMalloc((void**)&dev_result,sizeof(float)*array_size_height*array_size_width);

    dim3 threads(16,16);
    dim3 grid((array_size_width+threads.x-1)/threads.x,(array_size_height+threads.y-1)/threads.y);  

    kernel_set_value<<<grid,threads>>>(dev_texObj,dev_result,array_size_width,array_size_height);
    cuda_status = cudaGetLastError();
    if(cuda_status != cudaSuccess)
    {
        fprintf(stderr,"kernel_set_value Failed");
        exit( EXIT_FAILURE );
    }

    cuda_status = cudaMemcpy(random_array,dev_result,sizeof(float)*array_size_width*array_size_height,cudaMemcpyDeviceToHost);//dev_depthMap
    if(cuda_status != cudaSuccess)
    {
        fprintf(stderr,"cudaMemcpy Failed");
        exit( EXIT_FAILURE );
    }

    for(int i=0;i<array_size_width*array_size_height;i++)
    {
        printf("%f\n",random_array[i]);
    }

    //free
    cudaFree(dev_texObj);
    cudaFree(dev_result);
    for(int i=0;i<4;i++)
    {
        cudaFreeArray(dev_random_array[i]);
        cudaDestroyTextureObject(texObj[i]);
    }

    return 0;
}