CUDA 复杂问题 + 细节问题 解答 见 CUDA复杂问题 + 细节问题 解答
首先把程序贴上:
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <cuda_runtime_api.h>
#include <cuda.h>
#include <cstdlib>
#include <iostream>
// Image extent in elements: each row holds `width` floats and there are
// `height` rows (this is how main() passes them to cudaMallocPitch).
// NOTE(review): these are lowercase object-like macros, so any later
// identifier spelled `width` or `height` is textually replaced by the number.
#define width 10
#define height 11
using std::cout;
using std::endl;
// File-scope 2D texture reference over float texels, read back unconverted
// (cudaReadModeElementType). main() binds it to pitched device memory and
// kernel() samples it with tex2D().
// NOTE(review): texture references are the legacy texture API; confirm the
// target CUDA toolkit still supports them.
texture<float, 2, cudaReadModeElementType> tex;
// Copies the 2D texture `tex` into a dense row-major buffer.
//
// Thread mapping: the x launch dimension selects the row (i) and the y launch
// dimension selects the column (j). Threads that fall outside the
// `height` x `width` image do nothing, so the grid may be larger than the image.
//
//   arr_cpy - device buffer with room for at least width * height floats;
//             element (i, j) is stored at index i * width + j.
__global__ void kernel(float *arr_cpy)
{
    // Keep the indices integral: they are used for addressing, and a float
    // cannot represent every large index exactly.
    const int i = threadIdx.x + blockIdx.x * blockDim.x;   // row
    const int j = threadIdx.y + blockIdx.y * blockDim.y;   // column

    // Guard against launch configurations that overshoot the image.
    if (i >= height || j >= width) {
        return;
    }

    // tex2D takes (x, y) = (column, row); + 0.5f addresses the texel centre
    // when unnormalized coordinates are used.
    arr_cpy[i * width + j] = tex2D(tex, j + 0.5f, i + 0.5f);
}
// Host-side copy of the source image; allocated and released in main().
float *arr;

// Reports a failed CUDA runtime call on stderr and terminates the process.
//   status - return code of the call being checked
//   what   - short label identifying the call in the diagnostic
static void checkCuda(cudaError_t status, const char *what)
{
    if (status != cudaSuccess) {
        std::cerr << what << " failed: " << cudaGetErrorString(status) << std::endl;
        std::exit(EXIT_FAILURE);
    }
}

// Demo driver: fills a height x width host image with its own linear indices,
// uploads it to pitched device memory, binds that memory to the 2D texture
// `tex`, reads every texel back through kernel(), and prints the result.
// Returns EXIT_SUCCESS on success; exits with EXIT_FAILURE on any failure.
int main(void)
{
    const size_t count = (size_t)width * height;       // total number of texels
    const size_t row_bytes = width * sizeof(float);    // bytes in one unpadded row

    arr = (float*)malloc(count * sizeof(float));
    float *hos_c = (float*)malloc(count * sizeof(float));
    if (arr == NULL || hos_c == NULL) {
        std::cerr << "host allocation failed" << std::endl;
        free(hos_c);
        free(arr);
        return EXIT_FAILURE;
    }

    // Row-major fill: `height` rows of `width` elements. The outer bound must
    // be the row count, otherwise part of the buffer stays uninitialised (or
    // the loop runs past the end when height < width).
    for (int i = 0; i < height; i++) {
        for (int j = 0; j < width; j++) {
            arr[i * width + j] = (float)(i * width + j);
        }
    }

    // Upload into pitched memory; `pitch` is the padded row size in bytes
    // chosen by the runtime and must be used for the copy and the binding.
    size_t pitch = 0, tex_ofs = 0;
    float *arr_d = 0;
    checkCuda(cudaMallocPitch((void**)&arr_d, &pitch, row_bytes, height),
              "cudaMallocPitch");
    checkCuda(cudaMemcpy2D(arr_d, pitch, arr, row_bytes,
                           row_bytes, height, cudaMemcpyHostToDevice),
              "cudaMemcpy2D");

    tex.normalized = false;   // sample with texel (unnormalized) coordinates
    checkCuda(cudaBindTexture2D(&tex_ofs, &tex, arr_d, &tex.channelDesc,
                                width, height, pitch),
              "cudaBindTexture2D");
    // The kernel samples without applying a binding offset, so a non-zero
    // offset would silently shift every fetch.
    if (tex_ofs != 0) {
        std::cerr << "unexpected non-zero texture offset" << std::endl;
        return EXIT_FAILURE;
    }

    float *arr_cpy = 0;
    checkCuda(cudaMalloc((void**)&arr_cpy, count * sizeof(float)), "cudaMalloc");

    // kernel() takes the row from the x dimension and the column from the y
    // dimension. Launch one block per row with one thread per column so the
    // grid matches the image exactly and no thread maps outside it
    // (requires `width` to fit in a single block).
    const dim3 threads(1, width);
    const dim3 blocks(height, 1);
    kernel<<<blocks, threads>>>(arr_cpy);
    checkCuda(cudaGetLastError(), "kernel launch");          // bad launch configuration
    checkCuda(cudaDeviceSynchronize(), "kernel execution");  // faults raised while running

    checkCuda(cudaMemcpy(hos_c, arr_cpy, count * sizeof(float), cudaMemcpyDeviceToHost),
              "cudaMemcpy");
    for (size_t k = 0; k < count; k++) {
        cout << hos_c[k] << endl;
    }

    // Release everything acquired above.
    checkCuda(cudaUnbindTexture(&tex), "cudaUnbindTexture");
    checkCuda(cudaFree(arr_cpy), "cudaFree(arr_cpy)");
    checkCuda(cudaFree(arr_d), "cudaFree(arr_d)");
    free(hos_c);
    free(arr);
    arr = NULL;

    system("pause");
    return EXIT_SUCCESS;
}
注意几个问题:
第一,一维纹理不管是Linear Memory还是使用cudaMallocPitch,都是可以使用tex1Dfetch和tex1D这两个函数进行采样的。而对于二维纹理,不管是cudaArray还是cudaMallocPitch都是使用tex2D。
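第二,示例中 #define width 10、#define height 11,height 不能小于 width,否则会报错。这并不是 CUDA 纹理本身的限制,而是下标越界造成的:如果初始化循环按 i < width、j < height 遍历,却用 i*width + j 作为线性下标,那么当 height 小于 width 时,最大下标 (width-1)*width + (height-1) 会超出只有 width*height 个元素的缓冲区。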
第三,采样的时候长宽是颠倒的:
float i = threadIdx.x + blockIdx.x*blockDim.x;
float j = threadIdx.y + blockIdx.y*blockDim.y;
arr_cpy[(int)(i*width+j)] = tex2D(tex,j + 0.5f , i + 0.5f);
前面是 j ,后面是 i :tex2D(tex, x, y) 中的 x 是沿宽度方向的列坐标,y 是沿高度方向的行坐标;这里 i 是行号、j 是列号,所以先传 j 再传 i(加 0.5f 是为了对准纹素中心)。