在第二版的基础上减少通讯时间
第二个版本已经做了一定的优化,并行计算的总时间是:串行时间+并行时间+通讯时间。
前两个版本都是让进程0找出素数后,再通过广播的形式通知到各个进程。各个进程再进行查找筛选,如果在通讯质量不是很好的网络中进行,将会有很多的时间被浪费在进程通信上面。
于是可以在版本二的基础上,让每个进程自己来计算基础的素数,而不是嗷嗷待哺地等待着进程0来给它们喂食。这样将会减少通讯所消耗的时间。
方案:
增加一个小标志数组,让每一个进程自己来计算基础素数:
smallSize = (int) sqrt((double) n);
smallMarked = (char *) malloc(smallSize);
让每个进程单独计算基础素数,计算好用自己计算的素数判断后面的素数:
smallIndex = 0;
prime = 3;
do {
if (prime * prime > port_low_value)
first = (prime * prime - port_low_value)/2;
else {
if (!(port_low_value % prime)) first = 0;
else if(port_low_value % prime % 2 ==0)
first = prime - ((port_low_value % prime)/2); // 此处在求局部first(数组中第一个可以被prime整除的数)的时候非常巧妙
else
first = (prime - (port_low_value % prime))/2;
}
for (i = first; i < smallSize; i += prime){
smallMarked[i] = 1;
}
while (smallMarked[++smallIndex]);
prime = smallIndex * 2 + 3;
} while (prime * prime <= n);
prime = 3;
smallIndex = 0;
do {
if (prime * prime > low_value)
first = (prime * prime - low_value)/2;
else {
if (!(low_value % prime)) first = 0;
else if(low_value % prime % 2 ==0)
first = prime - ((low_value % prime)/2); // 此处在求局部first(数组中第一个可以被prime整除的数)的时候非常巧妙
else
first = (prime - (low_value % prime))/2;
}
// if (!id)
// printf("---%d",first);
// for (i = first; i < smallSize; i += prime){
// smallMarked[i] = 1;
// }
for (i = first; i < size; i += prime){
marked[i] = 1;
}
while (smallMarked[++smallIndex]);
prime = smallIndex * 2 + 3;
// if (p > 1) MPI_Bcast(&prime, 1, MPI_INT, 0, MPI_COMM_WORLD); //广播,将一个进程中的数据发送到所有进程
} while (prime * prime <= n);
这样就可以不用MPI中的MPI_Bcast函数来进行广播。
整体:
#include "mpi.h"
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#define MIN(a, b) ((a)<(b)?(a):(b))
int main(int argc, char *argv[]) {
unsigned long int count; /* Local prime count */
double elapsed_time; /* Parallel execution time */
unsigned long int first; /* Index of first multiple */
unsigned long int global_count = 0; /* Global prime count */
unsigned long long int high_value; /* Highest value on this proc */
unsigned long int i;
int id; /* Process ID number */
unsigned long int index; /* Index of current prime */
unsigned long long int low_value; /* Lowest value on this proc */
char *marked; /* Portion of 2,...,'n' */
unsigned long long int n; /* Sieving from 2, ..., 'n' */
int p; /* Number of processes */
unsigned long int proc0_size; /* Size of proc 0's subarray */
unsigned long int prime; /* Current prime */
unsigned long int size; /* Elements in 'marked' */
unsigned long int low_index; // 低位对应的全局索引值
unsigned long int high_index; // 高位对应的全局索引值
unsigned long int port_low_index; // 低位对应的局部索引值
unsigned long int port_high_index; // 高位对应的局部索引值
unsigned long long int port_high_value; //小的局部最低值
unsigned long long int port_low_value; //最大局部最高值
char *smallMarked; // 来记录从3到根号n对应的素数
unsigned long int smallSize;
unsigned long int smallIndex; // 与index作用类似,用于记录smallMarked的index
MPI_Init(&argc, &argv);
/* Start the timer */
MPI_Comm_rank(MPI_COMM_WORLD, &id); //获取进程id
MPI_Comm_size(MPI_COMM_WORLD, &p); //获取进程数量
MPI_Barrier(MPI_COMM_WORLD); //进行同步
elapsed_time = -MPI_Wtime();
// if (argc != 2) {
// if (!id) printf("Command line: %s <m>\n", argv[0]);
// MPI_Finalize();
// exit(1);
// }
// n = atoll(argv[1]); //获取要计算的数
n = 100; //获取要计算的数
/* Figure out this process's share of the array, as
well as the integers represented by the first and
last array elements 计算这个进程在数组中的份额,以及第一个和最后一个数组元素表示的整数*/
/*********originalSoution*******/
// low_value = 2 + id * (n - 1) / p;
// high_value = 1 + (id + 1) * (n - 1) / p;
// size = high_value - low_value + 1;
/********solution1**********/
if (n % 2 == 0) // 如果给出的是一个偶数,将给出的数减一变为奇数。
n = n -1;
// low_value = 3 + id * (n - 1) / p;
// high_value = 1 + (id + 1) * (n - 1) / p;
low_index = id * ((n - 1) / 2) / p;
high_index = (id+1) * ((n-1) / 2) / p;
port_low_index = 0 * ((n - 1) / 2) / p;
port_high_index = 1 * ((n-1) / 2) / p;
low_value = low_index * 2 + 3;
high_value = high_index * 2 + 1;
port_low_value = port_low_index * 2 + 3;
port_high_value = port_high_index * 2 + 1;
printf("%d:%lld-%lld\n",id,low_value,high_value);
size = (high_value - low_value) / 2 + 1;
smallSize = (int) sqrt((double) n);
/* Bail out if all the primes used for sieving are
not all held by process 0 如果用于筛选的所有质数不都由进程0持有,则退出*/
proc0_size = (n - 1) / p;
if ((2 + proc0_size) < (int) sqrt((double) n)) {
if (!id) printf("Too many processes\n");
MPI_Finalize();
exit(1);
}
/* Allocate this process's share of the array.分配此进程在数组中的份额 */
marked = (char *) malloc(size);
smallMarked = (char *) malloc(smallSize);
if (marked == NULL) {
printf("Cannot allocate enough memory\n");
MPI_Finalize();
exit(1);
}
if (smallMarked == NULL) {
printf("Cannot allocate enough memory\n");
MPI_Finalize();
exit(1);
}
for (i = 0; i < size; i++) marked[i] = 0;
for (i = 0; i < smallSize; i++) smallMarked[i] = 0;
if (!id) index = 0; // !id----->只有0号进程才会执行。
index = 0;
// if (!id) smallIndex = 0;
smallIndex = 0;
prime = 3;
do {
if (prime * prime > port_low_value)
first = (prime * prime - port_low_value)/2;
else {
if (!(port_low_value % prime)) first = 0;
else if(port_low_value % prime % 2 ==0)
first = prime - ((port_low_value % prime)/2); // 此处在求局部first(数组中第一个可以被prime整除的数)的时候非常巧妙
else
first = (prime - (port_low_value % prime))/2;
}
for (i = first; i < smallSize; i += prime){
smallMarked[i] = 1;
}
while (smallMarked[++smallIndex]);
prime = smallIndex * 2 + 3;
} while (prime * prime <= n);
prime = 3;
smallIndex = 0;
do {
if (prime * prime > low_value)
first = (prime * prime - low_value)/2;
else {
if (!(low_value % prime)) first = 0;
else if(low_value % prime % 2 ==0)
first = prime - ((low_value % prime)/2); // 此处在求局部first(数组中第一个可以被prime整除的数)的时候非常巧妙
else
first = (prime - (low_value % prime))/2;
}
// if (!id)
// printf("---%d",first);
// for (i = first; i < smallSize; i += prime){
// smallMarked[i] = 1;
// }
for (i = first; i < size; i += prime){
marked[i] = 1;
}
while (smallMarked[++smallIndex]);
prime = smallIndex * 2 + 3;
// if (p > 1) MPI_Bcast(&prime, 1, MPI_INT, 0, MPI_COMM_WORLD); //广播,将一个进程中的数据发送到所有进程
} while (prime * prime <= n);
count = 0;
for (i = 0; i < size; i++)
if (!marked[i])
{
count++; // 统计单个进程中素数的个数,看有多少个0
// if (id == 0)
// {
// printf("%d:%d ",id, i*2 + low_value); // 进行测试输出
// }
}
if(!id) count++;
printf("%d:count %ld\n",id,count);
if (p > 1)
MPI_Reduce(&count, &global_count, 1, MPI_INT, MPI_SUM,
0, MPI_COMM_WORLD); // 规约,集合通信,由进程0来计算全局的count
/* Stop the timer */
elapsed_time += MPI_Wtime();
/* Print the results */
if (!id) {
printf("The total number of prime: %ld, total time: %10.6f, total node %d\n", global_count, elapsed_time, p);
}
MPI_Finalize();
return 0;
}
输出对比(10的八次方):
0:3-24999999
1:25000001-49999999
2:50000001-74999999
3:75000001-99999999
0:count 1565927
1:count 1435207
2:count 1393170
3:count 1367151
The total number of prime: 5761455, total time: 0.779948, total node 4
由于是在中山大学基于天河2号所搭建的平台上进行的,所以相对于第二版时间的提升不是很多。但是在实际的应用中,会有明显的提升的。
计算10的九次方(16核心):
0:3-62499999
1:62500001-124999999
2:125000001-187499999
3:187500001-249999999
4:250000001-312499999
5:312500001-374999999
8:500000001-562499999
9:562500001-624999999
10:625000001-687499999
11:687500001-749999999
12:750000001-812499999
13:812500001-874999999
14:875000001-937499999
6:375000001-437499999
7:437500001-499999999
15:937500001-999999999
0:count 3701487
1:count 3408655
2:count 3314265
6:count 3153228
15:count 3020834
8:count 3110499
5:count 3179482
4:count 3212954
7:count 3130885
3:count 3254911
12:count 3052785
9:count 3093969
14:count 3029831
13:count 3040903
10:count 3078479
11:count 3064367
The total number of prime: 50847534, total time: 4.045691, total node 16