CUDA 复杂问题 + 细节问题 解答 见 CUDA复杂问题 + 细节问题 解答
首先放上代码:
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include "cuda.h"
#include <cstdio>
#include <cstdlib>
#include <iostream>
using std::cout;
using std::endl;
// Number of floats in the output buffer; numerically equal to mwidth * mheight
// (one z-slice of the volume).
#define arrayNum 100
// Volume extent in voxels; passed to make_cudaExtent as (mwidth, mheight, mlength).
#define mwidth 10
#define mheight 10
#define mlength 10
// File-scope 3D texture reference over `short` voxels. cudaReadModeNormalizedFloat
// makes fetches return normalized floats instead of raw integer values.
texture<short, 3, cudaReadModeNormalizedFloat> texVolumeData; // 3D texture
// Device array that backs texVolumeData; allocated in textureVolumeDataInit().
cudaArray *d_volumeArray = 0;
#ifndef CUDA_CHECK
// Abort with file/line and the CUDA error string when a runtime call fails.
// Guarded so it can coexist with an identical definition elsewhere in this file.
#define CUDA_CHECK(call)                                                       \
    do {                                                                       \
        cudaError_t err_ = (call);                                             \
        if (err_ != cudaSuccess) {                                             \
            std::fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__, \
                         cudaGetErrorString(err_));                            \
            std::exit(EXIT_FAILURE);                                           \
        }                                                                      \
    } while (0)
#endif

/*
 * Builds a mwidth x mheight x mlength volume of `short` voxels on the host
 * (voxel i holds the value i), uploads it into the 3D cudaArray
 * d_volumeArray and binds that array to the texture reference texVolumeData.
 *
 * Texture setup: unnormalized coordinates, linear filtering, clamped
 * addressing. Any failing allocation or CUDA call terminates the process.
 *
 * NOTE(review): texture references are a legacy API; newer toolkits expect
 * texture objects (cudaCreateTextureObject) instead.
 */
void textureVolumeDataInit(void) {
    // Use all three extents; the original sized the buffer with
    // mwidth*mlength*mlength, which is only right while mheight == mlength.
    const size_t voxelCount = (size_t)mwidth * mheight * mlength;

    short* texturedata = (short*)std::malloc(voxelCount * sizeof(short));
    if (texturedata == NULL) {
        std::fprintf(stderr, "textureVolumeDataInit: host allocation failed\n");
        std::exit(EXIT_FAILURE);
    }
    for (size_t i = 0; i < voxelCount; i++) {
        texturedata[i] = (short)i;
    }

    // --- Create 3D array
    const cudaExtent volumeSize = make_cudaExtent(mwidth, mheight, mlength);
    cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<short>();
    CUDA_CHECK(cudaMalloc3DArray(&d_volumeArray, &channelDesc, volumeSize));

    // --- Copy data to 3D array (host to device)
    cudaMemcpy3DParms copyParams = { 0 };
    // Host rows are tightly packed, so the pitch is width * sizeof(short).
    copyParams.srcPtr = make_cudaPitchedPtr((void*)texturedata,
                                            volumeSize.width * sizeof(short),
                                            volumeSize.width, volumeSize.height);
    copyParams.dstArray = d_volumeArray;
    copyParams.extent = volumeSize;
    copyParams.kind = cudaMemcpyHostToDevice;
    CUDA_CHECK(cudaMemcpy3D(&copyParams));

    // --- Set texture parameters
    texVolumeData.normalized = false;                 // coordinates are in voxel units, not [0, 1)
    texVolumeData.filterMode = cudaFilterModeLinear;  // linear interpolation
    // Wrap addressing is only supported with normalized coordinates, so
    // request clamping explicitly rather than relying on a silent fallback.
    texVolumeData.addressMode[0] = cudaAddressModeClamp;
    texVolumeData.addressMode[1] = cudaAddressModeClamp;
    texVolumeData.addressMode[2] = cudaAddressModeClamp;

    // --- Bind array to 3D texture
    CUDA_CHECK(cudaBindTextureToArray(texVolumeData, d_volumeArray, channelDesc));

    // cudaMemcpy3D above is a blocking copy, so the host staging buffer can go.
    std::free(texturedata);
}
/*
 * Samples the z = 0 slice of texVolumeData at voxel centres and writes the
 * rescaled values to c.
 *
 * Launch layout: a 2D grid of 2D blocks; thread (x, y) writes element
 * x + y * (blockDim.x * gridDim.x). Writes are limited to the first arrayNum
 * elements, so c must hold at least arrayNum floats.
 *
 * The texture is read with cudaReadModeNormalizedFloat, so each fetch is a
 * normalized float that is scaled back by 65535/2.
 * NOTE(review): for signed 16-bit data 32767 maps to 1.0, so 32767.0f may be
 * the exact factor -- confirm before changing the output scale.
 */
__global__ void cudaTest(float *c)
{
    const int x = threadIdx.x + blockIdx.x * blockDim.x;
    const int y = threadIdx.y + blockIdx.y * blockDim.y;
    const int rowStride = blockDim.x * gridDim.x;
    const int idx = x + y * rowStride;

    // Guard against launches that cover more threads than output elements.
    if (idx >= arrayNum) {
        return;
    }

    // +0.5f addresses the texel centre; float literals keep the arithmetic
    // in single precision.
    c[idx] = 65535.0f / 2.0f * tex3D(texVolumeData, x + 0.5f, y + 0.5f, 0.5f);
}
#ifndef CUDA_CHECK
// Abort with file/line and the CUDA error string when a runtime call fails.
// Guarded so it can coexist with an identical definition elsewhere in this file.
#define CUDA_CHECK(call)                                                       \
    do {                                                                       \
        cudaError_t err_ = (call);                                             \
        if (err_ != cudaSuccess) {                                             \
            std::fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__, \
                         cudaGetErrorString(err_));                            \
            std::exit(EXIT_FAILURE);                                           \
        }                                                                      \
    } while (0)
#endif

/*
 * Allocates an arrayNum-element output buffer on host and device, initialises
 * the 3D texture, runs cudaTest on a 2x2 grid of 5x5 blocks (10x10 threads,
 * one per output element), prints every result and releases all resources.
 *
 * Returns 0 on success; a failing CUDA call terminates the process.
 */
int main()
{
    float *dev_c = 0;
    float *c = (float*)std::malloc(arrayNum * sizeof(float));
    if (c == NULL) {
        std::fprintf(stderr, "main: host allocation failed\n");
        return EXIT_FAILURE;
    }

    CUDA_CHECK(cudaMalloc((void**)&dev_c, arrayNum * sizeof(float)));
    textureVolumeDataInit();

    dim3 blocks(2, 2);
    dim3 threads(5, 5);
    cudaTest<<<blocks, threads>>>(dev_c);
    CUDA_CHECK(cudaGetLastError());  // catches invalid launch configurations

    // Blocking copy: also waits for the kernel and surfaces execution errors.
    CUDA_CHECK(cudaMemcpy(c, dev_c, arrayNum * sizeof(float), cudaMemcpyDeviceToHost));

    for (int i = 0; i < arrayNum; i++) {
        cout << c[i] << endl;
    }

    // Release the texture binding, its backing array and both output buffers.
    CUDA_CHECK(cudaUnbindTexture(texVolumeData));
    CUDA_CHECK(cudaFreeArray(d_volumeArray));
    d_volumeArray = 0;
    CUDA_CHECK(cudaFree(dev_c));
    std::free(c);

    // Keeps the console window open; "pause" is a Windows shell command.
    std::system("pause");
    return 0;
}
注意几点:
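这里通过纹理取到的数据是归一化的(读取模式为 cudaReadModeNormalizedFloat:short 类型被映射到 [-1, 1],unsigned short 类型被映射到 [0, 1]),所以需要乘一个比例系数才能还原成原始数值:short 类型乘 65535/2.0(更精确的系数是 32767,因为 32767 对应 1.0);如果是 unsigned short 类型的数据,就乘 65535。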
65535.0/2.0*tex3D(texVolumeData, (float)(x+0.5), (float)(y+0.5),0+0.5);