在第二版的基础上减少通讯时间
第二个版本已经做了一定的优化,并行计算的总时间是:串行时间+并行时间+通讯时间。
前两个版本都是让进程0找出素数后,再通过广播的形式通知到各个进程。各个进程再进行查找筛选,如果在通讯质量不是很好的网络中进行,将会有很多的时间被浪费在进程通信上面。
于是可以在版本二的基础上,让每个进程自己来计算基础的素数,而不是嗷嗷待哺地等待着进程0来给它们喂食。这样将会减少通讯所消耗的时间。
方案:
增加一个小标志数组,让每一个进程自己来计算基础素数:
smallSize = (int) sqrt((double) n);
smallMarked = (char *) malloc(smallSize);
让每个进程单独计算基础素数,计算好用自己计算的素数判断后面的素数:
smallIndex = 0;
prime = 3;
do {
if (prime * prime > port_low_value)
first = (prime * prime - port_low_value)/2;
else {
if (!(port_low_value % prime)) first = 0;
else if(port_low_value % prime % 2 ==0)
first = prime - ((port_low_value % prime)/2); // 此处在求局部first(数组中第一个可以被prime整除的数)的时候非常巧妙
else
first = (prime - (port_low_value % prime))/2;
}
for (i = first; i < smallSize; i += prime){
smallMarked[i] = 1;
}
while (smallMarked[++smallIndex]);
prime = smallIndex * 2 + 3;
} while (prime * prime <= n);
prime = 3;
smallIndex = 0;
do {
if (prime * prime > low_value)
first = (prime * prime - low_value)/2;
else {
if (!(low_value % prime)) first = 0;
else if(low_value % prime % 2 ==0)
first = prime - ((low_value % prime)/2); // 此处在求局部first(数组中第一个可以被prime整除的数)的时候非常巧妙
else
first = (prime - (low_value % prime))/2;
}
// if (!id)
// printf("---%d",first);
// for (i = first; i < smallSize; i += prime){
// smallMarked[i] = 1;
// }
for (i = first; i < size; i += prime){
marked[i] = 1;
}
while (smallMarked[++smallIndex]);
prime = smallIndex * 2 + 3;
// if (p > 1) MPI_Bcast(&prime, 1, MPI_INT, 0, MPI_COMM_WORLD); //广播,将一个进程中的数据发送到所有进程
} while (prime * prime <= n);
这样就可以不用MPI中的MPI_Bcast函数来进行广播。
整体:
#include "mpi.h"
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#define MIN(a, b) ((a)<(b)?(a):(b))
int main(int argc, char *argv[]) {
unsigned long int count; /* Local prime count */
double elapsed_time; /* Parallel execution time */
unsigned long int first; /* Index of first multiple */
unsigned long int global_count = 0; /* Global prime count */
unsigned long long int high_value; /* Highest value on this proc */
unsigned long int i;
int id; /* Process ID number */
unsigned long int index; /* Index of current prime */
unsigned long long int low_value; /* Lowest value on this proc */
char *marked; /* Portion of 2,...,'n' */
unsigned long long int n; /* Sieving from 2, ..., 'n' */
int p; /* Number of processes */
unsigned long int proc0_size; /* Size of proc 0's subarray */
unsigned long int prime; /* Current prime */
unsigned long int size; /* Elements in 'marked' */
unsigned long int low_index; // 低位对应的全局索引值
unsigned long int high_index; // 高位对应的全局索引值
unsigned long int port_low_index; // 低位对应的局部索引值
unsigned long int port_high_index; // 高位对应的局部索引值
unsigned long long int port_high_value; //小的局部最低值
unsigned long long int port_low_value; //最大局部最高值
char *smallMarked; // 来记录从3到根号n对应的素数
unsigned long int smallSize;
unsigned long int smallIndex; // 与index作用类似,用于记录smallMarked的index
MPI_Init(&argc, &argv);
/* Start the timer */
MPI_Comm_rank(MPI_COMM_WORLD, &id); //获取进程id
MPI_Comm_size(MPI_COMM_WORLD, &p); //获取进程数量
MPI_Barrier(MPI_COMM_WORLD); //进行同步
elapsed_time = -MPI_Wtime();
// if (argc != 2) {
// if (!id) printf("Command line: %s <m>\n", argv[0]);
// MPI_Finalize();
// exit(1);
// }
// n = atoll(argv[1]); //获取要计算的数
n = 100; //获取要计算的数
/* Figure out this process's share of the array, as
well as the integers represented by the first and
last array elements 计算这个进程在数组中的份额,以及第一个和最后一个数组元素表示的整数*/
/*********originalSoution*******/
// low_value = 2 + id * (n - 1) / p;
// high_value = 1 + (id + 1) * (n - 1) / p;
// size = high_value - low_value + 1;
/********solution1**********/
if (n % 2 == 0) // 如果给出的是一个偶数,将给出的数减一变为奇数。
n = n -1;
// low_value = 3 + id * (n - 1) / p;
// high_value = 1 + (id + 1) * (n - 1) / p;
low_index = id * ((n - 1) / 2) / p;
high_index = (id+1) * ((n-1) / 2) / p;
port_low_index = 0 * ((n - 1) / 2) / p;
port_high_index = 1 * ((n-1) / 2) / p;
low_value = low_index * 2 + 3;
high_value = high_index * 2 + 1;
port_low_value = port_low_index * 2 + 3;
port_high_value = port_high_index * 2 + 1;
printf("%d:%lld-%lld\n",id,low_value,high_value);
size = (high_value - low_value) / 2 + 1;
smallSize = (int) sqrt((double) n);
/* Bail out if all the primes used for sieving are
not all held by process 0 如果用于筛选的所有质数不都由进程0持有,则退出*/
proc0_size = (n - 1) / p;
if ((2 + proc0_size) < (int) sqrt((double) n)) {
if (!id) printf("Too many processes\n");
MPI_Finalize();
exit(1);
}
/* Allocate this process's share of the array.分配此进程在数组中的份额 */
marked = (char *) malloc(size);
smallMarked = (char *) malloc(smallSize);
if (marked == NULL) {
printf("Cannot allocate enough memory\n");
MPI_Finalize();
exit(1);
}
if (smallMarked == NULL) {
printf("Cannot allocate enough memory\n");
MPI_Finalize();
exit(1);
}
for (i = 0; i < size; i++) marked[i] = 0;
for (i = 0; i < smallSize; i++) smallMarked[i] = 0;
if (!id) index = 0; // !id----->只有0号进程才会执行。
index = 0;
// if (!id) smallIndex = 0;
smallIndex = 0;
prime = 3;
do {
if (prime * prime > port_low_value)
first = (prime * prime - port_low_value)/2;
else {
if (!(port_low_value % prime)) first = 0;
else if(port_low_value % prime % 2 ==0)
first = prime - ((port_low_value % prime)/2); // 此处在求局部first(数组中第一个可以被prime整除的数)的时候非常巧妙
else
first = (prime - (port_low_value % prime))/2;
}
for (i = first; i < smallSize; i += prime){
smallMarked[i] = 1;
}
while (smallMarked[++smallIndex]);
prime = smallIndex * 2 + 3;
} while (prime * prime <= n);
prime = 3;
smallIndex = 0;
do {
if (prime * prime > low_value)
first = (prime * prime - low_value)/2;
else {
if (!(low_value % prime)) first = 0;
else if(low_value % prime % 2 ==0)
first = prime - ((low_value % prime)/2); // 此处在求局部first(数组中第一个可以被prime整除的数)的时候非常巧妙
else
first = (prime - (low_value % prime))/2;
}
// if (!id)
// printf("---%d",first);
// for (i = first; i < smallSize; i += prime){
// smallMarked[i] = 1;
// }
for (i = first; i < size; i += prime){
marked[i] = 1;
}
while (smallMarked[++smallIndex]);
prime = smallIndex * 2 + 3;
// if (p > 1) MPI_Bcast(&prime, 1, MPI_INT, 0, MPI_COMM_WORLD); //广播,将一个进程中的数据发送到所有进程
} while (prime * prime <= n);
count = 0;
for (i = 0; i < size; i++)
if (!marked[i])
{
count++; // 统计单个进程中素数的个数,看有多少个0
// if (id == 0)
// {
// printf("%d:%d ",id, i*2 + low_value); // 进行测试输出
// }
}
if(!id) count++;
printf("%d:count %ld\n",id,count);
if (p > 1)
MPI_Reduce(&count, &global_count, 1, MPI_INT, MPI_SUM,
0, MPI_COMM_WORLD); // 规约,集合通信,由进程0来计算全局的count
/* Stop the timer */
elapsed_time += MPI_Wtime();
/* Print the results */
if (!id) {
printf("The total number of prime: %ld, total time: %10.6f, total node %d\n", global_count, elapsed_time, p);
}
MPI_Finalize();
return 0;
}
输出对比(10的八次方):
0:3-24999999
1:25000001-49999999
2:50000001-74999999
3:75000001-99999999
0:count 1565927
1:count 1435207
2:count 1393170
3:count 1367151
The total number of prime: 5761455, total time: 0.779948, total node 4
由于是在中山大学基于天河2号所搭建的平台上进行的,所以相对于第二版时间的提升不是很多。但是在实际的应用中,会有明显的提升的。
计算10的九次方(16核心):
0:3-62499999
1:62500001-124999999
2:125000001-187499999
3:187500001-249999999
4:250000001-312499999
5:312500001-374999999
8:500000001-562499999
9:562500001-624999999
10:625000001-687499999
11:687500001-749999999
12:750000001-812499999
13:812500001-874999999
14:875000001-937499999
6:375000001-437499999
7:437500001-499999999
15:937500001-999999999
0:count 3701487
1:count 3408655
2:count 3314265
6:count 3153228
15:count 3020834
8:count 3110499
5:count 3179482
4:count 3212954
7:count 3130885
3:count 3254911
12:count 3052785
9:count 3093969
14:count 3029831
13:count 3040903
10:count 3078479
11:count 3064367
The total number of prime: 50847534, total time: 4.045691, total node 16