【问题标题】:Send an array over a custom communicator通过自定义通信器发送数组
【发布时间】:2022-01-21 04:16:23
【问题描述】:

1。目标

我必须通过自定义通信器(不是MPI_COMM_WORLD)分发一个名为A_loc 的数组。假设我们要在mesh_r 通信器上分发一个数组:

P0-P1
|  |  
P2-P3

其中- 代表mesh_r (mesh_rows) 通信器,| 代表mesh_c (mesh_columns) 通信器,通过build_mesh 过程构建。

2。代码

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <stdbool.h>
#include <mpi.h>

bool is_divisible(int, int);
void build_mesh(MPI_Comm*, MPI_Comm*, MPI_Comm*, int, int, int, int, int*);
int *fill_matrix(int*, int, int);
void print_matrix(int*, int, int, int, int);
void handle_errors(int, int, int, int);
void distribute(int*, int*, int, int, int, int, int, int, int);
void debug(int*, int*, int, int, int, int, int, int, int);

int main(int argc, char *argv[])
{
    int process_rank, world_size;
    int mesh_rows, mesh_columns;
    int mesh_dimension = 2;
    int *process_coordinates;
    MPI_Comm mesh, mesh_r, mesh_c;
    int process_rank_mesh;
    int *A, *A_loc;
    int *B, *B_loc;
    int m, n, mloc, nloc;

    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &process_rank);
    MPI_Comm_size(MPI_COMM_WORLD, &world_size);

    /* Root picks the global matrix size; the Bcasts below make it known
       on every rank before it is used. */
    if (process_rank == 0) {
        m = n = world_size * 1; /* multiple of world_size (e.g. 4) */
    }

    MPI_Bcast(&m, 1, MPI_INT, 0, MPI_COMM_WORLD);
    MPI_Bcast(&n, 1, MPI_INT, 0, MPI_COMM_WORLD);
    A = fill_matrix(A, m, n);
    B = fill_matrix(B, m, n); /* BUG FIX: was fill_matrix(A, ...) */

    if (process_rank == 0)
        mesh_rows = 2;

    /* BUG FIX: mesh_rows was only assigned on rank 0 but read by every
       rank in is_divisible() — an uninitialized read (UB) on non-root
       ranks. Broadcast it BEFORE anyone uses it. */
    MPI_Bcast(&mesh_rows, 1, MPI_INT, 0, MPI_COMM_WORLD);

    if (is_divisible(world_size, mesh_rows))
        mesh_columns = world_size / mesh_rows;
    else {
        mesh_rows = 1;
        mesh_columns = world_size / mesh_rows;
    }

    /* All ranks now computed mesh_rows/mesh_columns from the same data;
       this Bcast is kept as a cheap belt-and-braces consistency step. */
    MPI_Bcast(&mesh_columns, 1, MPI_INT, 0, MPI_COMM_WORLD);

    process_coordinates = (int*) calloc(mesh_dimension, sizeof(int));
    build_mesh(&mesh, &mesh_r, &mesh_c, process_rank, world_size, mesh_rows, mesh_columns, process_coordinates);
    MPI_Comm_rank(mesh, &process_rank_mesh);

    /* Local tile dimensions: rows split across mesh_rows, columns across
       mesh_columns. BUG FIX: nloc was computed as m / mesh_columns, which
       only worked because m == n here. */
    mloc = m / mesh_rows;
    nloc = n / mesh_columns;

    handle_errors(m, n, world_size, process_rank);

    A_loc = (int*) calloc(mloc * nloc, sizeof(int));
    distribute(A, A_loc, m, n, mloc, nloc, world_size, mesh_rows, mesh_columns);

    B_loc = (int*) calloc(mloc * nloc, sizeof(int));
    distribute(B, B_loc, m, n, mloc, nloc, world_size, mesh_rows, mesh_columns);

    /* Hard-coded diagonal exchange over MPI_COMM_WORLD (P0 -> P1, P3 -> P2).
       NOTE(review): to do this per mesh row instead, replace this block with
       a single MPI_Bcast over mesh_r — each process belongs to exactly one
       row sub-communicator, so one Bcast call performs one broadcast per row. */
    int *A_loc_add = (int*) calloc(mloc * nloc, sizeof(int));
    if (process_rank == 0) {
        MPI_Send(A_loc, mloc * nloc, MPI_INT, 1, 10, MPI_COMM_WORLD);
    } else if (process_rank == 3) {
        MPI_Send(A_loc, mloc * nloc, MPI_INT, 2, 20, MPI_COMM_WORLD);
    }
    MPI_Status status;
    if (process_rank == 1) {
        MPI_Recv(A_loc_add, mloc * nloc, MPI_INT, 0, 10, MPI_COMM_WORLD, &status);
    } else if (process_rank == 2) {
        MPI_Recv(A_loc_add, mloc * nloc, MPI_INT, 3, 20, MPI_COMM_WORLD, &status);
    }

    /* Release everything this rank allocated before finalizing. */
    free(A_loc_add);
    free(B_loc);
    free(A_loc);
    free(process_coordinates);
    free(B);
    free(A);

    MPI_Finalize();
    return 0;
}


/*
 * Scatter the m x n root matrix Mat (significant on rank 0 only) into
 * mloc x nloc tiles, one per process, stored contiguously in Mat_loc.
 *
 * A vector type describes one tile inside the full row-major matrix
 * (mloc rows of nloc ints, stride n); resizing its extent to one int
 * lets the displacement array address tiles in units of elements.
 *
 * BUG FIX: the original leaked send_counts/displs and never freed either
 * MPI datatype — this function is called once per matrix, so the leaks
 * accumulated. All four resources are now released before returning.
 */
void distribute(int *Mat, int *Mat_loc, int m, int n, int mloc, int nloc, int world_size, int mesh_rows, int mesh_columns)
{
    MPI_Datatype square_block;
    int stride = n;
    int count = mloc;
    int block_length = nloc;
    MPI_Type_vector(count, block_length, stride, MPI_INT, &square_block);
    MPI_Datatype square_block_resized;
    MPI_Type_create_resized(square_block, 0, sizeof(int), &square_block_resized);
    MPI_Type_commit(&square_block_resized);

    int *send_counts = (int*) calloc(world_size, sizeof(int));
    int *displs = (int*) calloc(world_size, sizeof(int));
    for (int i = 0; i < mesh_rows; i++) {
        for (int j = 0; j < mesh_columns; j++) {
            /* one tile per rank; displacement = start of tile (i,j)
               in element units within the row-major matrix */
            send_counts[i * mesh_columns + j] = 1;
            displs[i * mesh_columns + j] = i * n * block_length + j * block_length;
        }
    }

    MPI_Scatterv(Mat, send_counts, displs, square_block_resized, Mat_loc, mloc * nloc, MPI_INT, 0, MPI_COMM_WORLD);

    free(displs);
    free(send_counts);
    MPI_Type_free(&square_block_resized);
    MPI_Type_free(&square_block);
}


/* True when divisor evenly divides dividend (remainder is zero). */
bool is_divisible(int dividend, int divisor)
{
    int remainder = dividend % divisor;
    return remainder == 0;
}

/*
 * Build a non-periodic mesh_rows x mesh_columns Cartesian communicator
 * over MPI_COMM_WORLD and derive two sub-communicators from it:
 *   - mesh_r: keeps dimension 1 (the calling process's row),
 *   - mesh_c: keeps dimension 0 (the calling process's column).
 * process_coordinates (caller-allocated, length 2) receives this rank's
 * (row, column) coordinates in the mesh.
 *
 * BUG FIX: the three scratch arrays were leaked; they are only needed
 * during communicator construction and are now freed before returning.
 */
void build_mesh(MPI_Comm *mesh, MPI_Comm *mesh_r, MPI_Comm *mesh_c, int process_rank, int world_size,
    int mesh_rows, int mesh_columns, int *process_coordinates)
{
    int mesh_dimension = 2;
    int *mesh_n_dimension;
    int mesh_reorder = 0; /* keep world ranks: rank in mesh == rank in world */
    int *mesh_period;
    int *remain_dims = (int*) calloc(mesh_dimension, sizeof(int));
    mesh_n_dimension = (int*) calloc(mesh_dimension, sizeof(int));
    mesh_n_dimension[0] = mesh_rows;
    mesh_n_dimension[1] = mesh_columns;
    mesh_period = (int*) calloc(mesh_dimension, sizeof(int));
    mesh_period[0] = mesh_period[1] = 0; /* no wrap-around in either dimension */
    MPI_Cart_create(MPI_COMM_WORLD, mesh_dimension, mesh_n_dimension, mesh_period, mesh_reorder, mesh);
    MPI_Cart_coords(*mesh, process_rank, mesh_dimension, process_coordinates);
    /* Row communicator: drop dim 0, keep dim 1. */
    remain_dims[0] = 0;
    remain_dims[1] = 1;
    MPI_Cart_sub(*mesh, remain_dims, mesh_r);
    /* Column communicator: keep dim 0, drop dim 1. */
    remain_dims[0] = 1;
    remain_dims[1] = 0;
    MPI_Cart_sub(*mesh, remain_dims, mesh_c);

    free(mesh_period);
    free(mesh_n_dimension);
    free(remain_dims);
}

/*
 * Allocate and return an m x n row-major matrix filled with 1..m*n.
 * The incoming Mat pointer is ignored and overwritten — callers must not
 * pass an already-allocated buffer here (it would leak); pass NULL.
 * Returns NULL on allocation failure.
 */
int *fill_matrix(int *Mat, int m, int n)
{
    int k = 0;
    Mat = (int*) calloc((size_t)m * n, sizeof(int));
    if (Mat == NULL)
        return NULL; /* BUG FIX: unchecked calloc would crash in the loop */
    for (int i = 0; i < m; i++)
        for (int j = 0; j < n; j++)
            Mat[i * n + j] = ++k;
    return Mat;
}

如您所见，这工作正常，但我希望可以重新编写注释部分，以便利用 mesh_r 通信器把 A_loc 分发到 mesh_r 上的每个处理器，而不是在 MPI_COMM_WORLD 上硬编码的 MPI_Send/MPI_Recv（dest = 1、dest = 2）。

  • 有什么帮助吗?

【问题讨论】:

    标签: c parallel-processing mpi distributed-computing


    【解决方案1】:

    您应该使用Bcast,而不是发送和接收,就像您在早期版本的代码中所做的那样。您的问题是您没有以分布式方式思考,而是试图保持全局视图。我的意思是,在您创建子通信器mesh_r 之后,每个进程似乎都在该通信器中,但是来了:有多个mesh_r 通信器,每个进程都是正好是其中的一部分。每个 MPI 进程都能准确地看到它所属的 mesh_r 通信器。因此,单个代码行MPI_Bcast( ...buffer stuff...., mesh_r ) 进行多次广播,每个网格行中一次。

    【讨论】:

    • 一开始我想按照你刚才的建议使用MPI_Bcast,但我需要再使用一个变量,即A_loc_add,因为在那之后我不能重写A_loc值播送。此外,MPI_Bcast(A_loc, mloc * nloc, MPI_INT, 0, mesh_r); 发送的值不正确。有什么方法可以在不丢失旧的A_loc 值的情况下使用该广播?我尝试了memcpy(A_loc_add, A_loc, mloc * nloc);,然后广播A_loc_add,但即使我存储了旧的A_loc 值,A_loc_add 值也没有正确分布(只有第一个元素)
    • bcast 发送不正确的值是什么意思?在根目录打印它们并在非根目录打印它们并准确显示发生了什么。
    • 如果我 MPI_Bcast(A_loc_add, mloc * nloc, MPI_INT, 0, mesh_r); 然后 P1 得到 1(正确)但 P2 得到 3(而不是 4)。我想广播处理器 P0 和 P3 的A_loc(在网格对角线上)
    • 如果你想在对角线上广播,你需要为此创建一个子传播者。现在对角线进程正在不同的通信器中广播,所以什么都没有发生。
    猜你喜欢
    • 1970-01-01
    • 1970-01-01
    • 1970-01-01
    • 1970-01-01
    • 2016-05-07
    • 1970-01-01
    • 2013-08-28
    • 1970-01-01
    • 1970-01-01
    相关资源
    最近更新 更多