概述
本章介绍代码在各个并行副本之间的通信和协作。
1,了解不同线程之间的通信机制;
2,了解并行执行线程的同步机制;
5.2 并行线程块的分解
add<<<N,1>>>( dev_a, dev_b, dev_c);
//第一个参数为想要启动的线程块数量;
//第二个参数为CUDA运行时在每个线程块中创建的线程数量;
//共创建了N个线程块,N*1个并行线程;
add<<<N,2>>>( dev_a, dev_b, dev_c);
//共创建了N个线程块,2N个并行线程;
索引的使用(线程块索引&线程索引)
add<<<N,1>>>( dev_a, dev_b, dev_c);
//此时使用了N个线程块,每个线程块1个线程;
//需要使用 线程索引: blockIdx.x
add<<<1,N>>>( dev_a, dev_b, dev_c);
//此时使用了1个线程块,1个线程块有N个线程;
//需要使用 线程索引: threadIdx.x
从并行线程块到并行线程需要进行的两处修改
1,blockIdx.x ==> threadIdx.x
2,add<<<N,1>>>() ==> add<<<1,N>>>()
使用线程实现GPU上的矢量求和(单纯使用线程索引)
#include "../common/book.h"
#define N 10
// Vector-add kernel for a <<<1,N>>> launch: a single block of N threads,
// so the thread index alone identifies the element each thread handles.
__global__ void add( int *a, int *b, int *c ) {
    int idx = threadIdx.x;          // element index == thread index
    if (idx < N)                    // guard: ignore threads past the data
        c[idx] = a[idx] + b[idx];
}
// Host driver: adds two N-element int vectors on the GPU with a single
// block of N threads (<<<1,N>>>). Assumes N does not exceed the device's
// maxThreadsPerBlock limit (N is 10 here).
int main( void ) {
int a[N], b[N], c[N];
int *dev_a, *dev_b, *dev_c;
// allocate the memory on the GPU
HANDLE_ERROR( cudaMalloc( (void**)&dev_a, N * sizeof(int) ) );
HANDLE_ERROR( cudaMalloc( (void**)&dev_b, N * sizeof(int) ) );
HANDLE_ERROR( cudaMalloc( (void**)&dev_c, N * sizeof(int) ) );
// fill the arrays 'a' and 'b' on the CPU
for (int i=0; i<N; i++) {
a[i] = i;
b[i] = i * i;
}
// copy the arrays 'a' and 'b' to the GPU
HANDLE_ERROR( cudaMemcpy( dev_a, a, N * sizeof(int),
cudaMemcpyHostToDevice ) );
HANDLE_ERROR( cudaMemcpy( dev_b, b, N * sizeof(int),
cudaMemcpyHostToDevice ) );
// one block, N threads: each thread adds one element pair
add<<<1,N>>>( dev_a, dev_b, dev_c );
// copy the array 'c' back from the GPU to the CPU
// (this blocking copy also synchronizes with the kernel launch)
HANDLE_ERROR( cudaMemcpy( c, dev_c, N * sizeof(int),
cudaMemcpyDeviceToHost ) );
// display the results
for (int i=0; i<N; i++) {
printf( "%d + %d = %d\n", a[i], b[i], c[i] );
}
// free the memory allocated on the GPU
HANDLE_ERROR( cudaFree( dev_a ) );
HANDLE_ERROR( cudaFree( dev_b ) );
HANDLE_ERROR( cudaFree( dev_c ) );
return 0;
}
在GPU上对更长的矢量求和
因为硬件限制,线程块的数量有上限,对于启动核函数时每个线程块中的线程数量,也有上限。具体地说,最大的线程数量不能超过设备属性结构中maxThreadsPerBlock域的值。对很多的GPU,这个限制值是每个线程块最多512个线程。 对于并行线程大于512的矢量相加,需结合线程和线程块来计算。
从上例,改动两个地方:核函数中的索引计算方法和核函数的调用方式
核函数中的索引计算方法
int tid = threadIdx.x + blockIdx.x * blockDim.x;
//blockDim, 对所有线程块是个常数,为线程块中每一维的线程数量。因为使用的一维线程块,所以只使用blockDim.x。
在chapter 4 中, gridDim有个类似的值,即在线程格中每一维的线程块数量。gridDim是二维的,而blockDim实际上是三维的。CUDA运行时允许启动一个二维线程格,并且线程格中的每个线程块都是一个三维的线程数组。不常用,但是支持。
核函数的调用方式
对于N个线程的核函数调用如下。如每个线程块有128个线程。
add<<<(N+127)/128, 128>>>( dev_a, dev_b, dev_c );
在GPU上对任意长度的矢量求和
概述
线程块的数量也有硬件限制,如线程格每一维大小不能超过65535。
#include "../common/book.h"
#define N (33 * 1024)
// Vector-add kernel for arbitrary-length inputs: each thread starts at its
// global index and advances by the total number of launched threads
// (blockDim.x * gridDim.x), so any grid/block configuration covers all N
// elements — this is the key to handling vectors of any length.
__global__ void add( int *a, int *b, int *c ) {
    int stride = blockDim.x * gridDim.x;   // total threads in the grid
    for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < N; i += stride)
        c[i] = a[i] + b[i];
}
// Host driver: adds two vectors of N = 33*1024 ints on the GPU using a
// fixed 128x128 launch; the kernel's stride loop covers the remaining
// elements, then the result is verified element-by-element on the CPU.
int main( void ) {
int *a, *b, *c;
int *dev_a, *dev_b, *dev_c;
// allocate the memory on the CPU
a = (int*)malloc( N * sizeof(int) );
b = (int*)malloc( N * sizeof(int) );
c = (int*)malloc( N * sizeof(int) );
// allocate the memory on the GPU
HANDLE_ERROR( cudaMalloc( (void**)&dev_a, N * sizeof(int) ) );
HANDLE_ERROR( cudaMalloc( (void**)&dev_b, N * sizeof(int) ) );
HANDLE_ERROR( cudaMalloc( (void**)&dev_c, N * sizeof(int) ) );
// fill the arrays 'a' and 'b' on the CPU
for (int i=0; i<N; i++) {
a[i] = i;
b[i] = 2 * i;
}
// copy the arrays 'a' and 'b' to the GPU
HANDLE_ERROR( cudaMemcpy( dev_a, a, N * sizeof(int),
cudaMemcpyHostToDevice ) );
HANDLE_ERROR( cudaMemcpy( dev_b, b, N * sizeof(int),
cudaMemcpyHostToDevice ) );
// 128 blocks x 128 threads = 16384 threads; fewer than N, so each
// thread processes several elements via the kernel's stride loop
add<<<128,128>>>( dev_a, dev_b, dev_c );
// copy the array 'c' back from the GPU to the CPU
HANDLE_ERROR( cudaMemcpy( c, dev_c, N * sizeof(int),
cudaMemcpyDeviceToHost ) );
// verify that the GPU did the work we requested
bool success = true;
for (int i=0; i<N; i++) {
if ((a[i] + b[i]) != c[i]) {
printf( "Error: %d + %d != %d\n", a[i], b[i], c[i] );
success = false;
}
}
if (success) printf( "We did it!\n" );
// free the memory we allocated on the GPU
HANDLE_ERROR( cudaFree( dev_a ) );
HANDLE_ERROR( cudaFree( dev_b ) );
HANDLE_ERROR( cudaFree( dev_c ) );
// free the memory we allocated on the CPU
free( a );
free( b );
free( c );
return 0;
}
5.2.2 在GPU上使用线程实现滤波效果
#include "cuda.h"
#include "../common/book.h"
#include "../common/cpu_anim.h"
#define DIM 1024
#define PI 3.1415926535897932f
// Ripple animation kernel: each thread computes one pixel of a DIM x DIM
// RGBA bitmap; `ticks` advances the animation phase each frame.
// Expects a 2D launch where blockDim * gridDim spans the full image.
__global__ void kernel( unsigned char *ptr, int ticks ) {
    // map from threadIdx/blockIdx to a 2D pixel position
    int x = threadIdx.x + blockIdx.x * blockDim.x;
    int y = threadIdx.y + blockIdx.y * blockDim.y;
    // Linear offset into the bitmap: blockDim.x * gridDim.x is the image
    // width in pixels (threads per row across the whole grid), so this is
    // the usual row-major index x + y * width.
    int offset = x + y * blockDim.x * gridDim.x;

    // distance of this pixel from the image center
    float fx = x - DIM/2;
    float fy = y - DIM/2;
    float d = sqrtf( fx * fx + fy * fy );
    // Radial cosine wave, phase-shifted by time and damped with distance.
    // Use cosf (not cos) so the math stays in single precision on the device.
    unsigned char grey = (unsigned char)(128.0f + 127.0f *
                                         cosf(d/10.0f - ticks/7.0f) /
                                         (d/10.0f + 1.0f));
    ptr[offset*4 + 0] = grey;
    ptr[offset*4 + 1] = grey;
    ptr[offset*4 + 2] = grey;
    ptr[offset*4 + 3] = 255;   // fully opaque alpha
}
// Per-animation state shared by generate_frame and cleanup.
struct DataBlock {
unsigned char *dev_bitmap;   // GPU-side pixel buffer for the current frame
CPUAnimBitmap *bitmap;       // host-side bitmap the frame is copied into
};
// Called once per animation frame: renders the frame into the GPU bitmap,
// then copies it back to the host bitmap for display.
void generate_frame( DataBlock *d, int ticks ) {
dim3 blocks(DIM/16,DIM/16); // grid: (DIM/16) x (DIM/16) blocks...
dim3 threads(16,16); // ...of 16x16 threads each => one thread per pixel
kernel<<<blocks,threads>>>( d->dev_bitmap, ticks );
HANDLE_ERROR( cudaMemcpy( d->bitmap->get_ptr(),
d->dev_bitmap,
d->bitmap->image_size(),
cudaMemcpyDeviceToHost ) );
}
// clean up memory allocated on the GPU (invoked when the animation exits)
void cleanup( DataBlock *d ) {
HANDLE_ERROR( cudaFree( d->dev_bitmap ) );
}
// Entry point: allocates the GPU frame buffer, then hands control to the
// animation loop, which calls generate_frame with an increasing tick count
// and cleanup when the window is closed.
int main( void ) {
DataBlock data;
CPUAnimBitmap bitmap( DIM, DIM, &data );
data.bitmap = &bitmap;
HANDLE_ERROR( cudaMalloc( (void**)&data.dev_bitmap,
bitmap.image_size() ) );
bitmap.anim_and_exit( (void (*)(void*,int))generate_frame,
(void (*)(void*))cleanup );
}
5.3 共享内存和同步
线程块分解为线程,一方面是为了解决线程块数量的硬件限制。还有一个原因是便于利用共享内存实现线程同步。
对于GPU上启动的每一个线程块,CUDA C都将创建变量的一个副本。线程块中的每个线程都共享这块内存,但线程却无法看到和修改其他线程块的变量副本。这便可以实现线程块内的线程通信。 同时因为shared memory在GPU内部, 访问速度可以很快。
使用线程块中的共享内存,需要注意同步问题。
5.3.1 点积运算(dot product, 内积)
通常来说,归约运算不能充分利用GPU的并行运算能力,一般会传回CPU来运算。本例依旧使用GPU完成归约计算。
#include "../common/book.h"
#define imin(a,b) (a<b?a:b)   // minimum of two values
// Problem size and launch configuration for the dot-product example.
const int N = 33 * 1024;
const int threadsPerBlock = 256;   // must be a power of 2 for the reduction
const int blocksPerGrid =
imin( 32, (N+threadsPerBlock-1) / threadsPerBlock );   // capped at 32 blocks
// Dot-product kernel: each block accumulates partial products a[i]*b[i]
// into shared memory, reduces them to a single value, and stores one
// partial result per block into c[blockIdx.x]. The host finishes the sum.
__global__ void dot( float *a, float *b, float *c ) {
    // One shared slot per thread; each block gets its own private copy,
    // so the buffer only needs threadsPerBlock entries.
    __shared__ float cache[threadsPerBlock];

    int cacheIndex = threadIdx.x;   // this thread's slot in the cache

    // Stride loop over the input: handles vectors of any length N.
    float temp = 0;
    for (int tid = threadIdx.x + blockIdx.x * blockDim.x;
         tid < N;
         tid += blockDim.x * gridDim.x) {
        temp += a[tid] * b[tid];
    }
    cache[cacheIndex] = temp;

    // Every thread must finish writing its partial sum before any thread
    // reads a neighbor's slot below.
    __syncthreads();

    // Tree reduction in shared memory; requires threadsPerBlock to be a
    // power of 2. The barrier stays outside the if so that every thread
    // in the block reaches it.
    for (int i = blockDim.x / 2; i > 0; i >>= 1) {
        if (cacheIndex < i)
            cache[cacheIndex] += cache[cacheIndex + i];
        __syncthreads();
    }

    // Thread 0 publishes this block's partial result.
    if (cacheIndex == 0)
        c[blockIdx.x] = cache[0];
}
// Host driver for the dot product: fills two vectors on the CPU, launches
// the dot kernel to produce one partial sum per block, finishes the
// reduction on the CPU, and checks the result against the closed-form
// value (sum of i * 2i = 2 * sum of squares).
int main( void ) {
    float *a, *b, c, *partial_c;
    float *dev_a, *dev_b, *dev_partial_c;

    // allocate memory on the CPU side
    a = (float*)malloc( N*sizeof(float) );
    b = (float*)malloc( N*sizeof(float) );
    partial_c = (float*)malloc( blocksPerGrid*sizeof(float) );

    // allocate the memory on the GPU (one partial result per block)
    HANDLE_ERROR( cudaMalloc( (void**)&dev_a,
                              N*sizeof(float) ) );
    HANDLE_ERROR( cudaMalloc( (void**)&dev_b,
                              N*sizeof(float) ) );
    HANDLE_ERROR( cudaMalloc( (void**)&dev_partial_c,
                              blocksPerGrid*sizeof(float) ) );

    // fill in the host memory with data
    for (int i=0; i<N; i++) {
        a[i] = i;
        b[i] = i*2;
    }

    // copy the arrays 'a' and 'b' to the GPU
    HANDLE_ERROR( cudaMemcpy( dev_a, a, N*sizeof(float),
                              cudaMemcpyHostToDevice ) );
    HANDLE_ERROR( cudaMemcpy( dev_b, b, N*sizeof(float),
                              cudaMemcpyHostToDevice ) );

    dot<<<blocksPerGrid,threadsPerBlock>>>( dev_a, dev_b,
                                            dev_partial_c );

    // copy the per-block partial sums back from the GPU to the CPU
    HANDLE_ERROR( cudaMemcpy( partial_c, dev_partial_c,
                              blocksPerGrid*sizeof(float),
                              cudaMemcpyDeviceToHost ) );

    // finish up on the CPU side: add up the partial sums
    c = 0;
    for (int i=0; i<blocksPerGrid; i++) {
        c += partial_c[i];
    }

    // Closed-form check: sum_{i<N} i*(2i) = 2 * (N-1)N(2N-1)/6.
    #define sum_squares(x)  (x*(x+1)*(2*x+1)/6)
    printf( "Does GPU value %.6g = %.6g?\n", c,
            2 * sum_squares( (float)(N - 1) ) );

    // free memory on the gpu side
    HANDLE_ERROR( cudaFree( dev_a ) );
    HANDLE_ERROR( cudaFree( dev_b ) );
    HANDLE_ERROR( cudaFree( dev_partial_c ) );

    // free memory on the cpu side
    free( a );
    free( b );
    free( partial_c );

    // explicit success return, consistent with the chapter's other examples
    return 0;
}
5.3.2 (不正确的)点积运算优化
int i = blockDim.x/2;
while (i != 0) {
if (cacheIndex < i)
cache[cacheIndex] += cache[cacheIndex + i];
__syncthreads();
i /= 2;
}
为了充分利用线程块中的空闲线程, 对上面代码段进行下面的优化。 会发生线程发散(thread divergence),即当某些线程需要执行一条指令,而其他线程不需要执行时,这种情况就是线程发散。
正常情况下,发散的分支只会引起某些线程处于空闲状态,而其他线程继续执行分支中的代码。但是对于__syncthreads(),线程发散可能造成死锁(hang):已到达屏障的线程会永远等待那些因分支而不会执行到该屏障的线程。
所以,线程同步__syncthreads()需要保证不在线程发散中。
int i = blockDim.x/2;
while (i != 0) {
if (cacheIndex < i)
{
cache[cacheIndex] += cache[cacheIndex + i];
__syncthreads();
}
i /= 2;
}
5.3.3 基于共享内存的位图
#include "cuda.h"
#include "../common/book.h"
#include "../common/cpu_bitmap.h"
#define DIM 1024
#define PI 3.1415926535897932f
// Shared-memory bitmap kernel: each thread block computes a 16x16 tile of
// sine-pattern values into shared memory, then every thread writes out the
// value computed by the diagonally opposite thread of its tile. The barrier
// in between makes that cross-thread read safe.
// Assumes a launch with 16x16 thread blocks (matching the shared array).
__global__ void kernel( unsigned char *ptr ) {
    // map from threadIdx/blockIdx to pixel position
    int x = threadIdx.x + blockIdx.x * blockDim.x;
    int y = threadIdx.y + blockIdx.y * blockDim.y;
    int offset = x + y * blockDim.x * gridDim.x;   // row-major pixel index

    // one 16x16 tile of values, private to this block
    __shared__ float shared[16][16];

    // now calculate the value at that position
    const float period = 128.0f;
    shared[threadIdx.x][threadIdx.y] =
            255 * (sinf(x*2.0f*PI/ period) + 1.0f) *
                  (sinf(y*2.0f*PI/ period) + 1.0f) / 4.0f;

    // Every thread must finish writing before any thread reads another
    // thread's slot below; removing this barrier produces visible artifacts
    // in the output image. (The original line here began with a single '/',
    // which was a syntax error, not a comment.)
    __syncthreads();

    ptr[offset*4 + 0] = 0;
    ptr[offset*4 + 1] = shared[15-threadIdx.x][15-threadIdx.y];
    ptr[offset*4 + 2] = 0;
    ptr[offset*4 + 3] = 255;
}
// globals needed by the update routine
struct DataBlock {
unsigned char *dev_bitmap;   // GPU-side pixel buffer for the rendered image
};
// Entry point: renders one DIM x DIM image on the GPU with 16x16 thread
// blocks (one thread per pixel), copies it to the host, and displays it.
int main( void ) {
DataBlock data;
CPUBitmap bitmap( DIM, DIM, &data );
unsigned char *dev_bitmap;
HANDLE_ERROR( cudaMalloc( (void**)&dev_bitmap,
bitmap.image_size() ) );
data.dev_bitmap = dev_bitmap;
// (DIM/16) x (DIM/16) blocks of 16x16 threads covers every pixel exactly
dim3 grids(DIM/16,DIM/16);
dim3 threads(16,16);
kernel<<<grids,threads>>>( dev_bitmap );
// blocking copy back to the host (also synchronizes with the kernel)
HANDLE_ERROR( cudaMemcpy( bitmap.get_ptr(), dev_bitmap,
bitmap.image_size(),
cudaMemcpyDeviceToHost ) );
HANDLE_ERROR( cudaFree( dev_bitmap ) );
bitmap.display_and_exit();
}