4.1 MPI_Reduce
Preface: "reduce" means reduction, i.e. combining a set of values into a single result. For example, the sum reduction of the array [1, 2, 3] is 6, and the average reduction is 2.
MPI_Reduce takes an array of input elements on each process and returns an array of output elements to the root process. Its prototype is:
int MPI_Reduce(
    const void *sendbuf,   // send buffer (start address of the data to send)
    void *recvbuf,         // receive buffer (start address for the reduced result; only needs to be set on the root process)
    int count,             // number of elements to send and receive
    MPI_Datatype datatype, // element data type (e.g. MPI_INT, MPI_FLOAT)
    MPI_Op op,             // reduction operation (e.g. MPI_SUM, MPI_MAX, MPI_MIN)
    int root,              // root process (the rank that receives the reduced result)
    MPI_Comm comm          // communicator (e.g. MPI_COMM_WORLD)
);
The op parameter can be one of the following:
- MPI_MAX - returns the maximum element.
- MPI_MIN - returns the minimum element.
- MPI_SUM - sums the elements.
- MPI_PROD - multiplies all elements.
- MPI_LAND - performs a logical AND across the elements.
- MPI_LOR - performs a logical OR across the elements.
- MPI_BAND - performs a bitwise AND across the bits of the elements.
- MPI_BOR - performs a bitwise OR across the bits of the elements.
- MPI_MAXLOC - returns the maximum value and the rank of the process that holds it (see the sketch after this list).
- MPI_MINLOC - returns the minimum value and the rank of the process that holds it.
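Note that MPI_MAXLOC and MPI_MINLOC reduce value/rank pairs rather than plain values, so the buffers must use a paired datatype such as MPI_FLOAT_INT. Below is a minimal sketch of MPI_MAXLOC; it is not part of the original example, and the local value is made up purely for illustration:
#include <stdio.h>
#include <mpi.h>

int main(int argc, char **argv) {
  MPI_Init(NULL, NULL);
  int world_rank;
  MPI_Comm_rank(MPI_COMM_WORLD, &world_rank);

  // MPI_FLOAT_INT matches a struct of one float followed by one int
  struct { float value; int rank; } local, global;
  local.value = 1.5f * world_rank; // made-up local value for illustration
  local.rank = world_rank;

  // Rank 0 receives the largest value together with the rank that holds it
  MPI_Reduce(&local, &global, 1, MPI_FLOAT_INT, MPI_MAXLOC, 0, MPI_COMM_WORLD);
  if (world_rank == 0) {
    printf("Max value %f found on rank %d\n", global.value, global.rank);
  }

  MPI_Finalize();
  return 0;
}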
The full example code is as follows:
#include <stdio.h>
#include <stdlib.h>
#include <mpi.h>
#include <assert.h>
#include <time.h>

// Creates an array of random numbers. Each number has a value from 0 - 1
float *create_rand_nums(int num_elements) {
  float *rand_nums = (float *)malloc(num_elements * sizeof(float));
  assert(rand_nums != NULL);
  for (int i = 0; i < num_elements; i++) {
    rand_nums[i] = ((float)rand() / RAND_MAX);
  }
  return rand_nums;
}

int main(int argc, char **argv) {
  if (argc != 2) {
    fprintf(stderr, "Usage: avg num_elements_per_proc\n");
    exit(1);
  }
  int num_ele_per_proc = atoi(argv[1]);

  MPI_Init(NULL, NULL);
  int world_rank;
  MPI_Comm_rank(MPI_COMM_WORLD, &world_rank);
  int world_size;
  MPI_Comm_size(MPI_COMM_WORLD, &world_size);

  // Seed the random number generator of each process differently
  srand(time(NULL) * world_rank);
  float *rand_nums = create_rand_nums(num_ele_per_proc);

  // Sum the numbers locally
  float local_sum = 0;
  for (int i = 0; i < num_ele_per_proc; i++) {
    local_sum += rand_nums[i];
  }
  printf("Local sum for process %d - %f, avg = %f\n",
         world_rank, local_sum, local_sum / num_ele_per_proc);

  // Reduce all of the local sums into the global sum on rank 0
  float global_sum;
  MPI_Reduce(&local_sum, &global_sum, 1, MPI_FLOAT, MPI_SUM, 0, MPI_COMM_WORLD);
  if (world_rank == 0) {
    printf("Total sum = %f, avg = %f\n", global_sum,
           global_sum / (world_size * num_ele_per_proc));
  }

  // Clean up
  free(rand_nums);
  MPI_Barrier(MPI_COMM_WORLD);
  MPI_Finalize();
  return 0;
}
/******************************************************************
(base) joker@joker-2 4.3 Advanced collective % mpic++ MPI_Reduce_exp1.cc -o MPI_Reduce_exp1
(base) joker@joker-2 4.3 Advanced collective % mpirun -np 4 ./MPI_Reduce_exp1 100
Local sum for process 1 - 48.185650, avg = 0.481856
Local sum for process 2 - 52.371292, avg = 0.523713
Local sum for process 3 - 52.872005, avg = 0.528720
Local sum for process 0 - 51.385098, avg = 0.513851
Total sum = 204.814056, avg = 0.512035
*******************************************************************/
4.2 MPI_Allreduce
As the previous example shows, the final reduction result is stored only on rank 0. MPI_Allreduce exists so that every process can access the reduced result. The function prototype is:
int MPI_Allreduce(
    const void *sendbuf,   // send buffer (start address of the data to send)
    void *recvbuf,         // receive buffer (start address for the reduced result; every process receives the same result)
    int count,             // number of elements to send and receive
    MPI_Datatype datatype, // element data type (e.g. MPI_INT, MPI_FLOAT)
    MPI_Op op,             // reduction operation (e.g. MPI_SUM, MPI_MAX, MPI_MIN)
    MPI_Comm comm          // communicator (e.g. MPI_COMM_WORLD)
);
It does not take a root rank, because the result is distributed to all processes. In terms of communication pattern, MPI_Allreduce is equivalent to performing an MPI_Reduce followed by an MPI_Bcast.
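Here is a minimal sketch of that equivalence; it is a toy program written for illustration (not taken from the original text), in which every rank contributes the value 1.0:
#include <stdio.h>
#include <mpi.h>

int main(int argc, char **argv) {
  MPI_Init(NULL, NULL);
  int world_rank;
  MPI_Comm_rank(MPI_COMM_WORLD, &world_rank);

  float local = 1.0f, via_allreduce, via_reduce_bcast;

  // One MPI_Allreduce: every rank ends up with the global sum
  MPI_Allreduce(&local, &via_allreduce, 1, MPI_FLOAT, MPI_SUM, MPI_COMM_WORLD);

  // The same result via MPI_Reduce onto rank 0 followed by MPI_Bcast from rank 0
  MPI_Reduce(&local, &via_reduce_bcast, 1, MPI_FLOAT, MPI_SUM, 0, MPI_COMM_WORLD);
  MPI_Bcast(&via_reduce_bcast, 1, MPI_FLOAT, 0, MPI_COMM_WORLD);

  printf("Rank %d: allreduce = %f, reduce+bcast = %f\n",
         world_rank, via_allreduce, via_reduce_bcast);

  MPI_Finalize();
  return 0;
}
The following program uses MPI_Allreduce to compute the global mean of random numbers, then reduces the squared differences onto rank 0 to compute the standard deviation: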
#include <stdio.h>
#include <stdlib.h>
#include <mpi.h>
#include <math.h>
#include <time.h>
#include <assert.h>

// Creates an array of random numbers. Each number has a value from 0 - 1
float *create_rand_nums(int num_elements) {
  float *rand_nums = (float *)malloc(sizeof(float) * num_elements);
  assert(rand_nums != NULL);
  int i;
  for (i = 0; i < num_elements; i++) {
    rand_nums[i] = (rand() / (float)RAND_MAX);
  }
  return rand_nums;
}

int main(int argc, char** argv) {
  if (argc != 2) {
    fprintf(stderr, "Usage: avg num_elements_per_proc\n");
    exit(1);
  }
  int num_elements_per_proc = atoi(argv[1]);

  MPI_Init(NULL, NULL);
  int world_rank;
  MPI_Comm_rank(MPI_COMM_WORLD, &world_rank);
  int world_size;
  MPI_Comm_size(MPI_COMM_WORLD, &world_size);

  // Create a random array of elements on all processes.
  srand(time(NULL) * world_rank); // Seed the random number generator of processes uniquely
  float *rand_nums = NULL;
  rand_nums = create_rand_nums(num_elements_per_proc);

  // Sum the numbers locally
  float local_sum = 0;
  int i;
  for (i = 0; i < num_elements_per_proc; i++) {
    local_sum += rand_nums[i];
  }

  // Reduce all of the local sums into the global sum in order to calculate the mean
  float global_sum;
  MPI_Allreduce(&local_sum, &global_sum, 1, MPI_FLOAT, MPI_SUM, MPI_COMM_WORLD);
  float mean = global_sum / (num_elements_per_proc * world_size);

  // Compute the local sum of the squared differences from the mean
  float local_sq_diff = 0;
  for (i = 0; i < num_elements_per_proc; i++) {
    local_sq_diff += (rand_nums[i] - mean) * (rand_nums[i] - mean);
  }

  // Reduce the global sum of the squared differences to the root process
  // and print off the answer
  float global_sq_diff;
  MPI_Reduce(&local_sq_diff, &global_sq_diff, 1, MPI_FLOAT, MPI_SUM, 0, MPI_COMM_WORLD);

  // The standard deviation is the square root of the mean of the squared differences
  if (world_rank == 0) {
    float stddev = sqrt(global_sq_diff / (num_elements_per_proc * world_size));
    printf("Mean = %f, Standard deviation = %f\n", mean, stddev);
  }

  // Clean up
  free(rand_nums);
  MPI_Barrier(MPI_COMM_WORLD);
  MPI_Finalize();
  return 0;
}
/******************************************************************
(base) joker@joker-2 4.3 Advanced collective % mpic++ MPI_AllReduce.cc -o MPI_AllReduce
(base) joker@joker-2 4.3 Advanced collective % mpirun -np 4 ./MPI_AllReduce 100
Mean = 0.507307, Standard deviation = 0.289248
*******************************************************************/