[cuda]使用shuffle实现的reduce操作

#include<stdio.h>
#include"UnifiedMemory.h"

/*
 * Per-warp sum reduction using warp shuffles.
 *
 * The input is treated as consecutive chunks of warpSize elements; chunk k
 * covers src[k*warpSize .. k*warpSize + warpSize - 1] (clipped to num) and its
 * sum is stored in dst[k].  Warps walk over the chunks with a stride of
 * gridDim.x * blockDim.x elements.
 *
 * Parameters:
 *   src  input array with at least `num` readable ints
 *   dst  output array; needs one int per chunk, i.e. at least
 *        (num + warpSize - 1) / warpSize elements
 *   num  number of valid elements in src
 *
 * Preconditions:
 *   - 1D launch with blockDim.x a multiple of warpSize, so that
 *     (global index % warpSize) is the hardware lane id and every warp is
 *     aligned to a chunk boundary.
 *
 * Slots of dst whose chunk starts at or beyond `num` are left untouched.
 */
__global__ void sum_test(int *src, int *dst, int num){
    const unsigned int full_mask = 0xffffffffu;
    int pos_start = blockIdx.x * blockDim.x + threadIdx.x;
    int pos_step = blockDim.x * gridDim.x;
    // Lane id within the warp (valid under the alignment precondition above).
    int lane = pos_start % warpSize;

    // Per-thread trace: global thread index and the number of set bits in it.
    int pop_cnt = __popc(pos_start);
    printf("pos_start = %d,pop_cnt = %d\n",pos_start,pop_cnt);

    // Iterate on the chunk base rather than on the per-thread index: the base
    // is identical for all lanes of a warp, so either the whole warp runs an
    // iteration or none of it does.  This is required because the shuffle
    // below names all 32 lanes in its mask; if some lanes skipped the
    // iteration, the values read from them would be undefined.
    for(int base = pos_start - lane; base < num; base += pos_step){
        int i = base + lane;
        // Lanes past the end of the input contribute the additive identity.
        int sum = (i < num) ? src[i] : 0;

        // Tree reduction: after the last step lane 0 holds the sum of all
        // lanes.  The _sync shuffle synchronises the participating lanes
        // itself, so no separate __syncwarp() is needed.
        for(int delta = warpSize/2; delta > 0; delta /= 2){
            sum += __shfl_down_sync(full_mask, sum, delta);
        }

        // Only lane 0 has the complete sum; it publishes the chunk result.
        if(lane == 0) dst[base / warpSize] = sum;
    }
}

/*
 * Demo driver: fills an input buffer with ones, runs sum_test over it with
 * 2 blocks of 32 threads, and prints every output slot the kernel wrote.
 *
 * Returns 0 on success, 1 if the kernel launch or its execution failed.
 */
int main()
{
    // Element count.  200 is not a multiple of 32, so the last chunk handled
    // by the kernel is only partially filled.
    const int kNum = 200;

    // NOTE(review): UnifiedMemory comes from "UnifiedMemory.h"; presumably it
    // owns a buffer accessible from both host and device - confirm there.
    UnifiedMemory src(sizeof(int)*kNum);
    UnifiedMemory dst(sizeof(int)*kNum);
    for(int i=0;i<kNum;++i){
        src.At<int>(i)=1;
        dst.At<int>(i)=-1;   // sentinel: marks slots the kernel never writes
    }

    sum_test<<<2,32>>>(src.Data<int>(),dst.Data<int>(),kNum);

    // A kernel launch does not return a status; configuration errors are
    // reported through cudaGetLastError().
    cudaError_t err = cudaGetLastError();
    if(err != cudaSuccess){
        fprintf(stderr,"sum_test launch failed: %s\n",cudaGetErrorString(err));
        return 1;
    }

    // Wait for the kernel before reading dst on the host; faults raised while
    // the kernel was running are reported here.
    err = cudaDeviceSynchronize();
    if(err != cudaSuccess){
        fprintf(stderr,"sum_test execution failed: %s\n",cudaGetErrorString(err));
        return 1;
    }

    for(int i=0;i<kNum;++i){
        // Skip slots that still hold the sentinel.
        if(dst.At<int>(i)==-1)continue;
        printf("i=%d,sum=%d\n",i,dst.At<int>(i));
    }
    return 0;
}

猜你喜欢

转载自blog.csdn.net/adream307/article/details/83820686