OpenACC 绘制曼德勃罗集

▶ 《Parallel Programming with OpenACC》一书第四章的示例，用一系列步骤逐步优化曼德勃罗集的计算过程。

● 代码

 1 // constants.h
 2 const unsigned int WIDTH=16384;
 3 const unsigned int HEIGHT=16384;
 4 const unsigned int MAX_ITERS=50;
 5 const unsigned int MAX_COLOR=255;
 6 const double xmin=-1.7;
 7 const double xmax=.5;
 8 const double ymin=-1.2;
 9 const double ymax=1.2;
10 const double dx = (xmax - xmin) / WIDTH;
11 const double dy = (ymax - ymin) / HEIGHT;

// mandelbrot.h
#ifndef MANDELBROT_H
#define MANDELBROT_H

// Returns the gray level of pixel (Px, Py) of the Mandelbrot image.
// The `routine seq` directive asks the OpenACC compiler to also build a
// sequential device version, so the function can be called from inside
// accelerated loops.
#pragma acc routine seq
unsigned char mandelbrot(int Px, int Py);

#endif // MANDELBROT_H

 1 // mandelbrot.cpp
 2 #include <cstdio>
 3 #include <cstdlib>
 4 #include <fstream>
 5 #include "mandelbrot.h"
 6 #include "constants.h"
 7 
 8 using namespace std;
 9 
10 unsigned char mandelbrot(int Px, int Py)
11 {
12     const double x0 = xmin + Px * dx, y0 = ymin + Py * dy;
13     double x = 0.0, y = 0.0;
14     int i;
15     for(i=0; x * x + y * y < 4.0 && i < MAX_ITERS; i++)
16     {
17         double xtemp = x * x - y * y + x0;
18         y = 2 * x * y + y0;
19         x = xtemp;
20     }
21     return (double)MAX_COLOR * i / MAX_ITERS;
22 }

 1 // main.cpp
 2 #include <cstdio>
 3 #include <cstdlib>
 4 #include <fstream>
 5 #include <cstring>
 6 #include <omp.h>
 7 #include <openacc.h>
 8 
 9 #include "mandelbrot.h"
10 #include "constants.h"
11 
12 using namespace std;
13 
14 int main()
15 {
16     unsigned char *image = (unsigned char*)malloc(sizeof(unsigned int) * WIDTH * HEIGHT);
17     FILE *fp=fopen("image.pgm","wb");
18     fprintf(fp,"P5\n\"#comment\"\n%d %d\n%d\n",WIDTH, HEIGHT, MAX_COLOR);
19     
20     acc_init(acc_device_nvidia);
21 #pragma acc parallel num_gangs(1)
22     {
23         image[0] = 0;
24     }        
25     double st = omp_get_wtime();
26 #pragma acc parallel loop
27     for(int y = 0; y < HEIGHT; y++)
28     {
29         for(int x = 0; x < WIDTH; x++)
30             image[y * WIDTH + x] = mandelbrot(x, y);
31     }  
32     double et = omp_get_wtime();
33     printf("Time: %lf seconds.\n", (et-st));
34     fwrite(image,sizeof(unsigned char),WIDTH * HEIGHT, fp);
35     fclose(fp);
36     free(image);
37     return 0;
38 }

● 输出结果

// Ubuntu:
cuan@CUAN:/media/cuan/02FCDA52FCDA4019/Code/ParallelProgrammingWithOpenACC-master/Chapter04/cpp$ pgc++ -std=c++11 -acc -mp -fast -Minfo -c mandelbrot.cpp
mandelbrot(int, int):
      9, Generating acc routine seq
         Generating Tesla code
     10, FMA (fused multiply-add) instruction(s) generated
     15, Loop not vectorized/parallelized: potential early exits
     16, FMA (fused multiply-add) instruction(s) generated
cuan@CUAN:/media/cuan/02FCDA52FCDA4019/Code/ParallelProgrammingWithOpenACC-master/Chapter04/cpp$ pgc++ -std=c++11 -acc -mp -fast -Minfo main.cpp mandelbrot.o -o acc1.exe
main.cpp:
main:
     24, Accelerator kernel generated
         Generating Tesla code
         Generating implicit copyout(image[0])
     27, Accelerator kernel generated
         Generating Tesla code
         30, #pragma acc loop gang /* blockIdx.x */
         31, #pragma acc loop vector(128) /* threadIdx.x */
     27, Generating implicit copy(image[:268435456])
     31, Loop is parallelizable
         Loop not vectorized/parallelized: contains call
cuan@CUAN:/media/cuan/02FCDA52FCDA4019/Code/ParallelProgrammingWithOpenACC-master/Chapter04/cpp$ ./acc1.exe
Time: 0.646578 seconds.

● 优化 03，变化仅在 main.cpp 中

 1 // main.cpp
 2 #include <cstdio>
 3 #include <cstdlib>
 4 #include <fstream>
 5 #include <cstring>
 6 #include <omp.h>
 7 #include <openacc.h>
 8 #include "mandelbrot.h"
 9 #include "constants.h"
10 
11 using namespace std;
12 
13 int main()
14 {
15     const int num_blocks = 16, block_size = HEIGHT / num_blocks * WIDTH;
16     unsigned char *image=(unsigned char*)malloc(sizeof(unsigned int) * WIDTH * HEIGHT);
17     FILE *fp=fopen("image.pgm","wb");
18     fprintf(fp,"P5\n\"#comment\"\n%d %d\n%d\n",WIDTH, HEIGHT, MAX_COLOR);
19 
20     acc_init(acc_device_nvidia);
21 #pragma acc parallel num_gangs(1)
22     {
23         image[0] = 0;
24     }
25     double st = omp_get_wtime();
26 #pragma acc data create(image[WIDTH*HEIGHT])
27     {
28         for(int block = 0; block < num_blocks; block++)
29         {
30             const int start = block * (HEIGHT/num_blocks), end   = start + (HEIGHT/num_blocks);
31 #pragma acc parallel loop async(block)
32             for(int y=start;y<end;y++)
33             {
34                 for(int x=0;x<WIDTH;x++)
35                     image[y*WIDTH+x]=mandelbrot(x,y);
36             }
37 #pragma acc update self(image[block*block_size:block_size]) async(block)
38         }
39     }
40 #pragma acc wait
41   
42     double et = omp_get_wtime();
43     printf("Time: %lf seconds.\n", (et-st));
44     fwrite(image,sizeof(unsigned char), WIDTH * HEIGHT, fp);
45     fclose(fp);
46     free(image);
47     return 0;
48 }

● 输出结果

// Ubuntu:
cuan@CUAN:/media/cuan/02FCDA52FCDA4019/Code/ParallelProgrammingWithOpenACC-master/Chapter04/cpp/task3$ pgc++ -std=c++11 -acc -mp -fast -Minfo -c mandelbrot.cpp
mandelbrot(int, int):
     11, Generating acc routine seq
         Generating Tesla code
     12, FMA (fused multiply-add) instruction(s) generated
     15, Loop not vectorized/parallelized: potential early exits
     17, FMA (fused multiply-add) instruction(s) generated
cuan@CUAN:/media/cuan/02FCDA52FCDA4019/Code/ParallelProgrammingWithOpenACC-master/Chapter04/cpp/task3$ pgc++ -std=c++11 -acc -mp -fast -Minfo main.cpp mandelbrot.o -o acc2.exe
main.cpp:
main:
     22, Accelerator kernel generated
         Generating Tesla code
         Generating implicit copyout(image[0])
     27, Generating create(image[:268435456])
     30, Accelerator kernel generated
         Generating Tesla code
         32, #pragma acc loop gang /* blockIdx.x */
         34, #pragma acc loop vector(128) /* threadIdx.x */
     34, Loop is parallelizable
         Loop not vectorized/parallelized: contains call
     38, Generating update self(image[block*16777216:16777216])
cuan@CUAN:/media/cuan/02FCDA52FCDA4019/Code/ParallelProgrammingWithOpenACC-master/Chapter04/cpp/task3$ ./acc2.exe
Time: 0.577263 seconds.

● 优化 05，在分块异步计算的基础上，用 OpenMP 多线程为每块 GPU 分配一个主机线程（多 GPU 分块调度）

 1 // main.cpp
 2 #include <cstdio>
 3 #include <cstdlib>
 4 #include <fstream>
 5 #include <cstring>
 6 #include <omp.h>
 7 #include <openacc.h>
 8 #include "mandelbrot.h"
 9 #include "constants.h"
10 
11 using namespace std;
12 
13 int main()
14 {
15   const int num_blocks=64, block_size = HEIGHT / num_blocks * WIDTH;  
16   unsigned char *image=(unsigned char*)malloc(sizeof(unsigned int) * WIDTH * HEIGHT);
17   FILE *fp = fopen("image.pgm", "wb");
18   fprintf(fp,"P5\n\"#comment\"\n%d %d\n%d\n",WIDTH, HEIGHT, MAX_COLOR);
19 
20   const int num_gpus = acc_get_num_devices(acc_device_nvidia);
21 
22 #pragma omp parallel num_threads(num_gpus)
23     {
24         acc_init(acc_device_nvidia);
25         acc_set_device_num(omp_get_thread_num(),acc_device_nvidia);
26     }
27         printf("Found %d NVIDIA GPUs.\n", num_gpus);
28 
29     double st = omp_get_wtime();
30 #pragma omp parallel num_threads(num_gpus)
31     {
32         int queue = 1;
33         int my_gpu = omp_get_thread_num();
34         acc_set_device_num(my_gpu,acc_device_nvidia);
35         printf("Thread %d is using GPU %d\n", my_gpu, acc_get_device_num(acc_device_nvidia));
36 #pragma acc data create(image[WIDTH*HEIGHT])
37         {
38 #pragma omp for schedule(static, 1)
39             for(int block = 0; block < num_blocks; block++) 
40             {
41                 const int start = block * (HEIGHT/num_blocks), end   = start + (HEIGHT/num_blocks);
42 #pragma acc parallel loop async(queue)
43                 for(int y=start;y<end;y++)
44                 {
45                     for(int x=0;x<WIDTH;x++)
46                         image[y*WIDTH+x]=mandelbrot(x,y);
47                 }
48 
49 #pragma acc update self(image[block*block_size:block_size]) async(queue)
50         queue = (queue + 1) % 2; 
51             }
52         }
53 #pragma acc wait
54     } 
55   
56     double et = omp_get_wtime();
57     printf("Time: %lf seconds.\n", (et-st));
58     fwrite(image,sizeof(unsigned char), WIDTH * HEIGHT, fp);
59     fclose(fp);
60     free(image);
61     return 0;
62 }

● 输出结果

// Ubuntu:
cuan@CUAN:/media/cuan/02FCDA52FCDA4019/Code/ParallelProgrammingWithOpenACC-master/Chapter04/cpp/task5.multithread$ pgc++ -std=c++11 -acc -mp -fast -Minfo -c mandelbrot.cpp
mandelbrot(int, int):
     11, Generating acc routine seq
         Generating Tesla code
     12, FMA (fused multiply-add) instruction(s) generated
     15, Loop not vectorized/parallelized: potential early exits
     17, FMA (fused multiply-add) instruction(s) generated
cuan@CUAN:/media/cuan/02FCDA52FCDA4019/Code/ParallelProgrammingWithOpenACC-master/Chapter04/cpp/task5.multithread$ pgc++ -std=c++11 -acc -mp -fast -Minfo main.cpp mandelbrot.o -o acc3.exe
main.cpp:
main:
     23, Parallel region activated
     26, Parallel region terminated
     31, Parallel region activated
     37, Generating create(image[:268435456])
     39, Parallel loop activated with static cyclic schedule
     41, Accelerator kernel generated
         Generating Tesla code
         43, #pragma acc loop gang /* blockIdx.x */
         45, #pragma acc loop vector(128) /* threadIdx.x */
     45, Loop is parallelizable
         Loop not vectorized/parallelized: contains call
     50, Generating update self(image[block*4194304:4194304])
     51, Barrier
     54, Parallel region terminated
cuan@CUAN:/media/cuan/02FCDA52FCDA4019/Code/ParallelProgrammingWithOpenACC-master/Chapter04/cpp/task5.multithread$ ./acc3.exe
Found 1 NVIDIA GPUs.
Thread 0 is using GPU 0
Time: 0.497450 seconds.

● nvprof 的结果汇总，三张图分别为 “并行和数据优化”，“优化 03（分块分流）” 和 “优化 05（分块调度）”

OpenACC 绘制曼德勃罗集

猜你喜欢