4.1 MPI_Reduce
Preface: "reduce" means reduction, i.e. combining a set of values into a single result. For example, the sum reduction of the array [1, 2, 3] is 6, and the average reduction is 2.
MPI_Reduce takes an array of input elements on each process and returns an array of output elements to the root process. Its prototype is:
int MPI_Reduce(
    const void *sendbuf,   // send buffer (start address of the data to send)
    void *recvbuf,         // receive buffer (start address for the reduced result; only needs to be set on the root process)
    int count,             // number of elements to send and receive
    MPI_Datatype datatype, // element data type (e.g. MPI_INT, MPI_FLOAT)
    MPI_Op op,             // reduction operation (e.g. MPI_SUM, MPI_MAX, MPI_MIN)
    int root,              // root process (the rank that receives the reduced result)
    MPI_Comm comm          // communicator (e.g. MPI_COMM_WORLD)
);
The op parameter can be one of the following:
- MPI_MAX - returns the maximum element.
- MPI_MIN - returns the minimum element.
- MPI_SUM - sums the elements.
- MPI_PROD - multiplies all elements.
- MPI_LAND - performs a logical AND across the elements.
- MPI_LOR - performs a logical OR across the elements.
- MPI_BAND - performs a bitwise AND across the bits of the elements.
- MPI_BOR - performs a bitwise OR across the bits of the elements.
- MPI_MAXLOC - returns the maximum value and the rank of the process that holds it (see the sketch after this list).
- MPI_MINLOC - returns the minimum value and the rank of the process that holds it.
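Note that MPI_MAXLOC and MPI_MINLOC reduce value/rank pairs rather than plain values, so the buffers must use a paired datatype such as MPI_FLOAT_INT. Below is a minimal sketch of MPI_MAXLOC; it is not part of the original example, and the local value is made up purely for illustration:
#include <stdio.h>
#include <mpi.h>

int main(int argc, char **argv) {
  MPI_Init(NULL, NULL);
  int world_rank;
  MPI_Comm_rank(MPI_COMM_WORLD, &world_rank);

  // MPI_FLOAT_INT matches a struct of one float followed by one int
  struct { float value; int rank; } local, global;
  local.value = 1.5f * world_rank; // made-up local value for illustration
  local.rank = world_rank;

  // Rank 0 receives the largest value together with the rank that holds it
  MPI_Reduce(&local, &global, 1, MPI_FLOAT_INT, MPI_MAXLOC, 0, MPI_COMM_WORLD);
  if (world_rank == 0) {
    printf("Max value %f found on rank %d\n", global.value, global.rank);
  }

  MPI_Finalize();
  return 0;
}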
The full example code is as follows:
#include <stdio.h>
#include <stdlib.h>
#include <mpi.h>
#include <assert.h>
#include <time.h>

// Creates an array of random numbers. Each number has a value from 0 - 1
float *create_rand_nums(int num_elements) {
  float *rand_nums = (float *)malloc(num_elements * sizeof(float));
  assert(rand_nums != NULL);
  for (int i = 0; i < num_elements; i++) {
    rand_nums[i] = ((float)rand() / RAND_MAX);
  }
  return rand_nums;
}

int main(int argc, char **argv) {
  if (argc != 2) {
    fprintf(stderr, "Usage: avg num_elements_per_proc\n");
    exit(1);
  }
  int num_ele_per_proc = atoi(argv[1]);

  MPI_Init(NULL, NULL);
  int world_rank;
  MPI_Comm_rank(MPI_COMM_WORLD, &world_rank);
  int world_size;
  MPI_Comm_size(MPI_COMM_WORLD, &world_size);

  // Seed the random number generator of each process differently
  srand(time(NULL) * world_rank);
  float *rand_nums = create_rand_nums(num_ele_per_proc);

  // Sum the numbers locally
  float local_sum = 0;
  for (int i = 0; i < num_ele_per_proc; i++) {
    local_sum += rand_nums[i];
  }
  printf("Local sum for process %d - %f, avg = %f\n",
         world_rank, local_sum, local_sum / num_ele_per_proc);

  // Reduce all of the local sums into the global sum on rank 0
  float global_sum;
  MPI_Reduce(&local_sum, &global_sum, 1, MPI_FLOAT, MPI_SUM, 0, MPI_COMM_WORLD);
  if (world_rank == 0) {
    printf("Total sum = %f, avg = %f\n", global_sum,
           global_sum / (world_size * num_ele_per_proc));
  }

  // Clean up
  free(rand_nums);
  MPI_Barrier(MPI_COMM_WORLD);
  MPI_Finalize();
  return 0;
}
/******************************************************************
(base) joker@joker-2 4.3 Advanced collective % mpic++ MPI_Reduce_exp1.cc -o MPI_Reduce_exp1
(base) joker@joker-2 4.3 Advanced collective % mpirun -np 4 ./MPI_Reduce_exp1 100
Local sum for process 1 - 48.185650, avg = 0.481856
Local sum for process 2 - 52.371292, avg = 0.523713
Local sum for process 3 - 52.872005, avg = 0.528720
Local sum for process 0 - 51.385098, avg = 0.513851
Total sum = 204.814056, avg = 0.512035
*******************************************************************/
4.2 MPI_Allreduce
As the previous example shows, the final reduction result is stored only on rank 0. MPI_Allreduce exists so that every process can access the reduced result. The function prototype is:
int MPI_Allreduce(
    const void *sendbuf,   // send buffer (start address of the data to send)
    void *recvbuf,         // receive buffer (start address for the reduced result; every process receives the same result)
    int count,             // number of elements to send and receive
    MPI_Datatype datatype, // element data type (e.g. MPI_INT, MPI_FLOAT)
    MPI_Op op,             // reduction operation (e.g. MPI_SUM, MPI_MAX, MPI_MIN)
    MPI_Comm comm          // communicator (e.g. MPI_COMM_WORLD)
);
It does not take a root rank, because the result is distributed to all processes. In terms of communication pattern, MPI_Allreduce is equivalent to performing an MPI_Reduce followed by an MPI_Bcast.
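Here is a minimal sketch of that equivalence; it is a toy program written for illustration (not taken from the original text), in which every rank contributes the value 1.0:
#include <stdio.h>
#include <mpi.h>

int main(int argc, char **argv) {
  MPI_Init(NULL, NULL);
  int world_rank;
  MPI_Comm_rank(MPI_COMM_WORLD, &world_rank);

  float local = 1.0f, via_allreduce, via_reduce_bcast;

  // One MPI_Allreduce: every rank ends up with the global sum
  MPI_Allreduce(&local, &via_allreduce, 1, MPI_FLOAT, MPI_SUM, MPI_COMM_WORLD);

  // The same result via MPI_Reduce onto rank 0 followed by MPI_Bcast from rank 0
  MPI_Reduce(&local, &via_reduce_bcast, 1, MPI_FLOAT, MPI_SUM, 0, MPI_COMM_WORLD);
  MPI_Bcast(&via_reduce_bcast, 1, MPI_FLOAT, 0, MPI_COMM_WORLD);

  printf("Rank %d: allreduce = %f, reduce+bcast = %f\n",
         world_rank, via_allreduce, via_reduce_bcast);

  MPI_Finalize();
  return 0;
}
The following program uses MPI_Allreduce to compute the global mean of random numbers, then reduces the squared differences onto rank 0 to compute the standard deviation: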
#include <stdio.h>
#include <stdlib.h>
#include <mpi.h>
#include <math.h>
#include <time.h>
#include <assert.h>

// Creates an array of random numbers. Each number has a value from 0 - 1
float *create_rand_nums(int num_elements) {
  float *rand_nums = (float *)malloc(sizeof(float) * num_elements);
  assert(rand_nums != NULL);
  int i;
  for (i = 0; i < num_elements; i++) {
    rand_nums[i] = (rand() / (float)RAND_MAX);
  }
  return rand_nums;
}

int main(int argc, char** argv) {
  if (argc != 2) {
    fprintf(stderr, "Usage: avg num_elements_per_proc\n");
    exit(1);
  }
  int num_elements_per_proc = atoi(argv[1]);

  MPI_Init(NULL, NULL);
  int world_rank;
  MPI_Comm_rank(MPI_COMM_WORLD, &world_rank);
  int world_size;
  MPI_Comm_size(MPI_COMM_WORLD, &world_size);

  // Create a random array of elements on all processes.
  srand(time(NULL) * world_rank); // Seed the random number generator of processes uniquely
  float *rand_nums = NULL;
  rand_nums = create_rand_nums(num_elements_per_proc);

  // Sum the numbers locally
  float local_sum = 0;
  int i;
  for (i = 0; i < num_elements_per_proc; i++) {
    local_sum += rand_nums[i];
  }

  // Reduce all of the local sums into the global sum in order to calculate the mean
  float global_sum;
  MPI_Allreduce(&local_sum, &global_sum, 1, MPI_FLOAT, MPI_SUM, MPI_COMM_WORLD);
  float mean = global_sum / (num_elements_per_proc * world_size);

  // Compute the local sum of the squared differences from the mean
  float local_sq_diff = 0;
  for (i = 0; i < num_elements_per_proc; i++) {
    local_sq_diff += (rand_nums[i] - mean) * (rand_nums[i] - mean);
  }

  // Reduce the global sum of the squared differences to the root process
  // and print off the answer
  float global_sq_diff;
  MPI_Reduce(&local_sq_diff, &global_sq_diff, 1, MPI_FLOAT, MPI_SUM, 0, MPI_COMM_WORLD);

  // The standard deviation is the square root of the mean of the squared differences
  if (world_rank == 0) {
    float stddev = sqrt(global_sq_diff / (num_elements_per_proc * world_size));
    printf("Mean = %f, Standard deviation = %f\n", mean, stddev);
  }

  // Clean up
  free(rand_nums);
  MPI_Barrier(MPI_COMM_WORLD);
  MPI_Finalize();
  return 0;
}
/******************************************************************
(base) joker@joker-2 4.3 Advanced collective % mpic++ MPI_AllReduce.cc -o MPI_AllReduce
(base) joker@joker-2 4.3 Advanced collective % mpirun -np 4 ./MPI_AllReduce 100
Mean = 0.507307, Standard deviation = 0.289248
*******************************************************************/