#include<stdio.h>
#include"UnifiedMemory.h"
/**
 * Per-warp sum reduction implemented with warp shuffles.
 *
 * The input is processed in chunks of warpSize consecutive elements. Each
 * chunk is summed by one warp and lane 0 stores the result in
 * dst[chunk_start / warpSize]. Chunks are distributed over the warps of the
 * grid with a stride of blockDim.x * gridDim.x elements.
 *
 * Preconditions:
 *  - 1D launch with blockDim.x a multiple of warpSize, so that every warp is
 *    full and the global index modulo warpSize equals the hardware lane id
 *    (the full-warp shuffle mask below relies on this).
 *  - dst has room for at least ceil(num / warpSize) ints.
 *
 * @param src  input values, num elements
 * @param dst  output, one partial sum per chunk of warpSize inputs
 * @param num  number of valid elements in src
 */
__global__ void sum_test(int *src, int *dst, int num){
    const unsigned int full_mask = 0xffffffffu;

    int pos_start = blockIdx.x * blockDim.x + threadIdx.x;
    int pos_step = blockDim.x * gridDim.x;
    int lane = pos_start % warpSize;

    // Debug output only: device-side printf is serialized and slow.
    int pop_cnt = __popc(pos_start);
    printf("pos_start = %d,pop_cnt = %d\n",pos_start,pop_cnt);

    // Loop on the warp-uniform chunk base instead of the per-thread index so
    // that all 32 lanes execute the same number of iterations. Exiting per
    // lane (i >= num) before a full-mask shuffle is undefined behaviour.
    for(int base = pos_start - lane; base < num; base += pos_step){
        int i = base + lane;
        // Lanes past the end of the input contribute the additive identity.
        int sum = (i < num) ? src[i] : 0;

        // Tree reduction: after the last step lane 0 holds the chunk total.
        // __shfl_down_sync synchronizes the lanes named in the mask itself,
        // so no separate __syncwarp() is required.
        for(int delta = warpSize / 2; delta > 0; delta /= 2){
            sum += __shfl_down_sync(full_mask, sum, delta);
        }

        if(lane == 0) dst[base / warpSize] = sum;
    }
}
/**
 * Demo driver: fills an input buffer with ones, runs sum_test on it and
 * prints every output slot the kernel wrote.
 *
 * UnifiedMemory is declared in UnifiedMemory.h (not visible here); it is
 * presumably a host/device-accessible buffer -- confirm against that header.
 *
 * @return 0 on success, 1 if the kernel launch or execution failed.
 */
int main()
{
    const int num = 200;      // number of input elements
    const int untouched = -1; // sentinel marking output slots never written

    UnifiedMemory src(sizeof(int)*num);
    UnifiedMemory dst(sizeof(int)*num);
    for(int i=0;i<num;++i){
        src.At<int>(i)=1;
        dst.At<int>(i)=untouched;
    }

    sum_test<<<2,32>>>(src.Data<int>(),dst.Data<int>(),num);

    // A kernel launch returns no status; configuration errors are reported
    // by cudaGetLastError(), execution faults by the following synchronize.
    cudaError_t err = cudaGetLastError();
    if(err != cudaSuccess){
        fprintf(stderr,"sum_test launch failed: %s\n",cudaGetErrorString(err));
        return 1;
    }
    err = cudaDeviceSynchronize();
    if(err != cudaSuccess){
        fprintf(stderr,"sum_test execution failed: %s\n",cudaGetErrorString(err));
        return 1;
    }

    // Only report the slots that received a per-warp sum.
    for(int i=0;i<num;++i){
        if(dst.At<int>(i)==untouched)continue;
        printf("i=%d,sum=%d\n",i,dst.At<int>(i));
    }
    return 0;
}
// [CUDA] Reduce operation implemented with warp shuffle
// You may also like
// Reposted from blog.csdn.net/adream307/article/details/83820686
// Today's recommendations
// Weekly ranking