通过前面的文章,我们对32以下的迭代循环进行了展开处理,实际上,由于线程块的长度限制(GTX1050Ti是1024),可以说循环次数是确定的,因此可以将循环完全展开,
即 1024、512、256、128、64 都可以展开计算,唯一需要注意的是,每次计算之后都要进行同步,原因在前文已有解释。下面给出代码。
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include "math.h"
#include "stdlib.h"
// Error-checking macro: wraps a CUDA runtime call and aborts on failure.
// do { ... } while (0) makes the macro expand to a single statement, so it
// parses correctly inside un-braced if/else. The original version also had a
// trailing '\' after its closing brace, which spliced the following source
// line into the macro definition; that is removed here.
#define CHECK(call)                                             \
do {                                                            \
    const cudaError_t status = (call);                          \
    if (status != cudaSuccess)                                  \
    {                                                           \
        printf("文件:%s,函数:%s,行号:%d\n", __FILE__,          \
               __FUNCTION__, __LINE__);                         \
        printf("%s\n", cudaGetErrorString(status));             \
        exit(1);                                                \
    }                                                           \
} while (0)
//核函数
// Block-wide sum reduction over global memory with an 8-way unrolled
// pre-accumulation and a fully unrolled reduction tree.
// Grid/block layout: each block owns 8*blockDim.x consecutive ints of d_data
// and writes its partial sum to d_local_sum[blockIdx.x].
// Preconditions (unchanged from the original): blockDim.x is a power of two
// >= 64, and N is a multiple of 8*blockDim.x so every thread's eight loads
// are in bounds (blocks whose segment is cut short skip the pre-accumulation
// and would otherwise reduce stale data).
// Note: the reduction works in GLOBAL memory (d_data is overwritten).
__global__ void Kernel(int *d_data, int *d_local_sum, int N)
{
    int tid = threadIdx.x;
    // Global index of this thread's first element; 'data' is the base of
    // this block's 8*blockDim.x-element segment.
    int index = 8 * blockIdx.x*blockDim.x + threadIdx.x;
    int *data = d_data + 8 * blockIdx.x*blockDim.x;
    if (index + 7 * blockDim.x < N)
    {
        // 8-way unrolled accumulation: fold eight strided elements into this
        // thread's slot, leaving blockDim.x partial sums in data[0..blockDim.x).
        int a  = d_data[index];
        int a1 = d_data[index + blockDim.x];
        int a2 = d_data[index + 2 * blockDim.x];
        int a3 = d_data[index + 3 * blockDim.x];
        int a4 = d_data[index + 4 * blockDim.x];
        int a5 = d_data[index + 5 * blockDim.x];
        int a6 = d_data[index + 6 * blockDim.x];
        int a7 = d_data[index + 7 * blockDim.x];
        d_data[index] = (a + a1 + a2 + a3 + a4 + a5 + a6 + a7);
    }
    __syncthreads();
    // Fully unrolled reduction tree; the runtime blockDim.x guards skip the
    // steps that do not apply to smaller blocks. Each __syncthreads() is a
    // separate statement OUTSIDE the 'if', so all threads reach the barrier
    // (no divergent barrier).
    if (blockDim.x >= 1024 && tid < 512) data[tid] += data[tid + 512]; __syncthreads();
    if (blockDim.x >= 512 && tid < 256) data[tid] += data[tid + 256]; __syncthreads();
    if (blockDim.x >= 256 && tid < 128) data[tid] += data[tid + 128]; __syncthreads();
    if (blockDim.x >= 128 && tid < 64) data[tid] += data[tid + 64]; __syncthreads();
    if (tid < 32)
    {
        // Final warp-level reduction. 'volatile' forces every store to be
        // committed before the next load (the classic pre-Volta idiom), and
        // __syncwarp() is added between dependent steps because under Volta+
        // independent thread scheduling the warp is no longer guaranteed to
        // execute in lockstep, so volatile alone is a data race there.
        volatile int *vmem = data;
        vmem[tid] += vmem[tid + 32]; __syncwarp();
        vmem[tid] += vmem[tid + 16]; __syncwarp();
        vmem[tid] += vmem[tid + 8];  __syncwarp();
        vmem[tid] += vmem[tid + 4];  __syncwarp();
        vmem[tid] += vmem[tid + 2];  __syncwarp();
        vmem[tid] += vmem[tid + 1];
    }
    if (tid == 0)
    {
        // Thread 0 publishes the block's partial sum.
        d_local_sum[blockIdx.x] = data[0];
    }
}
//主函数
// Host driver: fills an N-element array, launches the 8-way-unrolled
// reduction kernel (grid.x/8 blocks, each reducing 8*blockDim.x ints),
// sums the per-block partial results on the CPU, and prints the total
// alongside a CPU reference sum for validation.
int main()
{
    // Basic parameters
    cudaSetDevice(0);
    const int N = 1 << 29;
    int local_length = 1024;
    long long total_sum = 0;
    dim3 grid(((N + local_length - 1) / local_length), 1);
    dim3 block(local_length, 1);
    int *h_data = nullptr;
    int *h_local_sum = nullptr;
    int *d_data = nullptr;
    int *d_local_sum = nullptr;
    // Host & device allocation (kernel produces grid.x/8 partial sums)
    h_data = (int*)malloc(N * sizeof(int));
    h_local_sum = (int*)malloc(int(grid.x / 8) * sizeof(int));
    CHECK(cudaMalloc((void**)&d_data, N * sizeof(int)));
    CHECK(cudaMalloc((void**)&d_local_sum, int(grid.x / 8) * sizeof(int)));
    // CPU reference sum. 'long long', not 'int': 2^29 elements in [-10, 10]
    // can overflow a 32-bit accumulator in the worst case.
    long long s = 0;
    for (int i = 0; i < N; i++)
    {
        // Bounded element values keep the total inside long long range.
        h_data[i] = int(10 * sin(0.02*3.14*i));
        s += h_data[i];
    }
    // Copy input to device
    CHECK(cudaMemcpy(d_data, h_data, N * sizeof(int), cudaMemcpyHostToDevice));
    // Launch the kernel: each block handles 8 "logical" blocks of input.
    Kernel << <grid.x / 8, block >> > (d_data, d_local_sum, N);
    // A kernel launch returns no status directly; bad launch configurations
    // are only visible via cudaGetLastError().
    CHECK(cudaGetLastError());
    // Copy the partial sums back (this blocking copy also waits for the
    // kernel to finish); the explicit sync then surfaces any async errors.
    CHECK(cudaMemcpy(h_local_sum, d_local_sum, int(grid.x / 8) * sizeof(int),
        cudaMemcpyDeviceToHost));
    CHECK(cudaDeviceSynchronize());
    for (int i = 0; i < int(grid.x / 8); i++)
    {
        total_sum += h_local_sum[i];
    }
    // %lld is the portable long long format (%I64d is MSVC-only); printing
    // the CPU reference makes the result verifiable at a glance.
    printf("%lld (CPU reference: %lld)\n", total_sum, s);
    // Release all resources before resetting the device.
    CHECK(cudaFree(d_data));
    CHECK(cudaFree(d_local_sum));
    free(h_data);
    free(h_local_sum);
    CHECK(cudaDeviceReset());
    return 0;
}
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include "math.h"
#include "stdlib.h"
// Error-checking macro: wraps a CUDA runtime call and aborts on failure.
// do { ... } while (0) makes the macro expand to a single statement, so it
// parses correctly inside un-braced if/else. The original version also had a
// trailing '\' after its closing brace, which spliced the following source
// line into the macro definition; that is removed here.
#define CHECK(call)                                             \
do {                                                            \
    const cudaError_t status = (call);                          \
    if (status != cudaSuccess)                                  \
    {                                                           \
        printf("文件:%s,函数:%s,行号:%d\n", __FILE__,          \
               __FUNCTION__, __LINE__);                         \
        printf("%s\n", cudaGetErrorString(status));             \
        exit(1);                                                \
    }                                                           \
} while (0)
// Block-wide sum reduction, template version: the block size is the
// compile-time parameter 'iblocksize', so the reduction-tree guards below
// are resolved at compile time and dead branches are eliminated entirely
// (unlike the runtime blockDim.x checks of the non-template version).
// Grid/block layout: each block owns 8*blockDim.x consecutive ints of d_data
// and writes its partial sum to d_local_sum[blockIdx.x].
// Preconditions: blockDim.x == iblocksize, iblocksize is a power of two
// >= 64, and N is a multiple of 8*iblocksize so every thread's eight loads
// are in bounds. The reduction works in GLOBAL memory (d_data is overwritten).
template<unsigned int iblocksize>
__global__ void Kernel(int *d_data, int *d_local_sum, int N)
{
    int tid = threadIdx.x;
    // Global index of this thread's first element; 'data' is the base of
    // this block's 8*blockDim.x-element segment.
    int index = 8 * blockIdx.x*blockDim.x + threadIdx.x;
    int *data = d_data + 8 * blockIdx.x*blockDim.x;
    if (index + 7 * blockDim.x < N)
    {
        // 8-way unrolled accumulation: fold eight strided elements into this
        // thread's slot, leaving blockDim.x partial sums in data[0..blockDim.x).
        int a  = d_data[index];
        int a1 = d_data[index + blockDim.x];
        int a2 = d_data[index + 2 * blockDim.x];
        int a3 = d_data[index + 3 * blockDim.x];
        int a4 = d_data[index + 4 * blockDim.x];
        int a5 = d_data[index + 5 * blockDim.x];
        int a6 = d_data[index + 6 * blockDim.x];
        int a7 = d_data[index + 7 * blockDim.x];
        d_data[index] = (a + a1 + a2 + a3 + a4 + a5 + a6 + a7);
    }
    __syncthreads();
    // Fully unrolled reduction tree; 'iblocksize >= ...' is a compile-time
    // constant, so untaken steps cost nothing. Each __syncthreads() is a
    // separate statement OUTSIDE the 'if', so all threads reach the barrier.
    if (iblocksize >= 1024 && tid < 512) data[tid] += data[tid + 512]; __syncthreads();
    if (iblocksize >= 512 && tid < 256) data[tid] += data[tid + 256]; __syncthreads();
    if (iblocksize >= 256 && tid < 128) data[tid] += data[tid + 128]; __syncthreads();
    if (iblocksize >= 128 && tid < 64) data[tid] += data[tid + 64]; __syncthreads();
    if (tid < 32)
    {
        // Final warp-level reduction. 'volatile' forces every store to be
        // committed before the next load (the classic pre-Volta idiom), and
        // __syncwarp() is added between dependent steps because under Volta+
        // independent thread scheduling the warp is no longer guaranteed to
        // execute in lockstep, so volatile alone is a data race there.
        volatile int *vmem = data;
        vmem[tid] += vmem[tid + 32]; __syncwarp();
        vmem[tid] += vmem[tid + 16]; __syncwarp();
        vmem[tid] += vmem[tid + 8];  __syncwarp();
        vmem[tid] += vmem[tid + 4];  __syncwarp();
        vmem[tid] += vmem[tid + 2];  __syncwarp();
        vmem[tid] += vmem[tid + 1];
    }
    if (tid == 0)
    {
        // Thread 0 publishes the block's partial sum.
        d_local_sum[blockIdx.x] = data[0];
    }
}
// Host driver for the template kernel: fills an N-element array, dispatches
// the correct Kernel<iblocksize> instantiation for the chosen block size
// (the template argument must be a compile-time constant, hence the switch),
// sums the per-block partial results on the CPU, and prints the total
// alongside a CPU reference sum for validation.
int main()
{
    // Basic parameters
    cudaSetDevice(0);
    const int N = 1 << 29;
    const int local_length = 1024;
    long long total_sum = 0;
    dim3 grid(((N + local_length - 1) / local_length), 1);
    dim3 block(local_length, 1);
    int *h_data = nullptr;
    int *h_local_sum = nullptr;
    int *d_data = nullptr;
    int *d_local_sum = nullptr;
    // Host & device allocation (kernel produces grid.x/8 partial sums)
    h_data = (int*)malloc(N * sizeof(int));
    h_local_sum = (int*)malloc(int(grid.x / 8) * sizeof(int));
    CHECK(cudaMalloc((void**)&d_data, N * sizeof(int)));
    CHECK(cudaMalloc((void**)&d_local_sum, int(grid.x / 8) * sizeof(int)));
    // CPU reference sum. 'long long', not 'int': 2^29 elements in [-10, 10]
    // can overflow a 32-bit accumulator in the worst case.
    long long s = 0;
    for (int i = 0; i < N; i++)
    {
        // Bounded element values keep the total inside long long range.
        h_data[i] = int(10 * sin(0.02*3.14*i));
        s += h_data[i];
    }
    // Copy input to device
    CHECK(cudaMemcpy(d_data, h_data, N * sizeof(int), cudaMemcpyHostToDevice));
    // Dispatch the matching template instantiation for the runtime block
    // size; the default case catches unsupported sizes instead of silently
    // skipping the launch and reading uninitialized partial sums.
    switch (block.x)
    {
    case 1024:
        Kernel<1024> << <grid.x / 8, block >> > (d_data, d_local_sum, N); break;
    case 512:
        Kernel<512> << <grid.x / 8, block >> > (d_data, d_local_sum, N); break;
    case 256:
        Kernel<256> << <grid.x / 8, block >> > (d_data, d_local_sum, N); break;
    case 128:
        Kernel<128> << <grid.x / 8, block >> > (d_data, d_local_sum, N); break;
    case 64:
        Kernel<64> << <grid.x / 8, block >> > (d_data, d_local_sum, N); break;
    default:
        printf("unsupported block size: %u\n", block.x);
        exit(1);
    }
    // A kernel launch returns no status directly; bad launch configurations
    // are only visible via cudaGetLastError().
    CHECK(cudaGetLastError());
    // Copy the partial sums back (this blocking copy also waits for the
    // kernel to finish); the explicit sync then surfaces any async errors.
    CHECK(cudaMemcpy(h_local_sum, d_local_sum, int(grid.x / 8) * sizeof(int),
        cudaMemcpyDeviceToHost));
    CHECK(cudaDeviceSynchronize());
    for (int i = 0; i < int(grid.x / 8); i++)
    {
        total_sum += h_local_sum[i];
    }
    // %lld is the portable long long format (%I64d is MSVC-only); printing
    // the CPU reference makes the result verifiable at a glance.
    printf("%lld (CPU reference: %lld)\n", total_sum, s);
    // Release all resources before resetting the device.
    CHECK(cudaFree(d_data));
    CHECK(cudaFree(d_local_sum));
    free(h_data);
    free(h_local_sum);
    CHECK(cudaDeviceReset());
    return 0;
}