线程初始化答案

【问题标题】：Thread initiallization线程初始化
【发布时间】：2021-06-17 04:43:27
【问题描述】：

我想加快模拟的执行速度。这是主要功能的一部分：

double computeStart = wtime();
    /* Main loop */

    for (t = 0.0; t < t_end; t += del_t, iters++) {

        setTimestepInterval(&del_t, imax, jmax, delx, dely, u, v, Re, tau);

        ifluid = (imax * jmax) - ibound;

        computeTentativeVelocity(u, v, f, g, flag, imax, jmax,
            del_t, delx, dely, gamma, Re);

        computeRhs(f, g, rhs, flag, imax, jmax, del_t, delx, dely);

        if (ifluid > 0) {
            itersor = poissonSolver(p, rhs, flag, imax, jmax, delx, dely,
                        eps, itermax, omega, &res, ifluid);
        } else {
            itersor = 0;
        }

        if (verbose > 1) {
            printf("%d t:%g, del_t:%g, SOR iters:%3d, res:%e, bcells:%d\n",
                iters, t+del_t, del_t, itersor, res, ibound);
        }

        updateVelocity(u, v, f, g, p, flag, imax, jmax, del_t, delx, dely);

        applyBoundaryConditions(u, v, flag, imax, jmax, ui, vi);
    } /* End of main loop */
    double computeEnd = wtime();

我想并行化另一个文件中的函数 updateVelocity 和 applyBoundaryConditions。但是我不想用

#pragma omp parallel num_threads(2)

每次调用这两个函数时，开销都会很大。有没有一种方法可以只初始化线程一次，而不是每次调用 updateVelocity 和 applyBoundaryConditions 时都初始化它们。这些是函数：

void updateVelocity(float **u, float **v, float **f, float **g, float **p,
    char **flag, int imax, int jmax, float del_t, float delx, float dely)
{
    int i, j;

    #pragma omp for schedule(static)
    for (i=1; i<=imax-1; i++) {
        for (j=1; j<=jmax; j++) {
            /* only if both adjacent cells are fluid cells */
            if ((flag[i][j] & C_F) && (flag[i+1][j] & C_F)) {
                float pcalc = p[i+1][j]-p[i][j];
                float mul = pcalc*del_t;
                float div = mul/delx;
                u[i][j] = f[i][j] - div;
                // u[i][j] = f[i][j]-(p[i+1][j]-p[i][j])*del_t/delx;
            }
        }
    }
    #pragma omp for schedule(static)
    for (i=1; i<=imax; i++) {
        for (j=1; j<=jmax-1; j++) {
            /* only if both adjacent cells are fluid cells */
            if ((flag[i][j] & C_F) && (flag[i][j+1] & C_F)) {
                float pcalcv = p[i][j+1]-p[i][j];
                float mulv = pcalcv*del_t;
                float divv = mulv / dely;
                v[i][j] = g[i][j] - divv;
                // v[i][j] = g[i][j]-(p[i][j+1]-p[i][j])*del_t/dely;
            }
        }
    }
}

void applyBoundaryConditions(float **u, float **v, char **flag,
    int imax, int jmax, float ui, float vi)
{
    int i, j;
    // imax is 600
    // jmax is 200
    // the larger the loop iterations the more efficient the parallelisation
    // may not be optimisable under parallelisation
    // 2 threads optimal
    for (j=0; j<=jmax+1; j++) {
        /* Fluid freely flows in from the west */
        u[0][j] = u[1][j];
        v[0][j] = v[1][j];

        /* Fluid freely flows out to the east */
        u[imax][j] = u[imax-1][j];
        v[imax+1][j] = v[imax][j];
    }
    for (i=0; i<=imax+1; i++) {
        /* The vertical velocity approaches 0 at the north and south
         * boundaries, but fluid flows freely in the horizontal direction */
        v[i][jmax] = 0.0;
        u[i][jmax+1] = u[i][jmax];

        v[i][0] = 0.0;
        u[i][0] = u[i][1];
    }

    /* Apply no-slip boundary conditions to cells that are adjacent to
     * internal obstacle cells. This forces the u and v velocity to
     * tend towards zero in these cells.
     */
    #pragma omp for schedule(static)  // decrease runtime by about 9 secs
    for (i=1; i<=imax; i++) {
        for (j=1; j<=jmax; j++) {
            if (flag[i][j] & B_NSEW) {
                switch (flag[i][j]) {
                    case B_N:
                        v[i][j]   = 0.0;
                        u[i][j]   = -u[i][j+1];
                        u[i-1][j] = -u[i-1][j+1];
                        break;
                    case B_E:
                        u[i][j]   = 0.0;
                        v[i][j]   = -v[i+1][j];
                        v[i][j-1] = -v[i+1][j-1];
                        break;
                    case B_S:
                        v[i][j-1] = 0.0;
                        u[i][j]   = -u[i][j-1];
                        u[i-1][j] = -u[i-1][j-1];
                        break;
                    case B_W:
                        u[i-1][j] = 0.0;
                        v[i][j]   = -v[i-1][j];
                        v[i][j-1] = -v[i-1][j-1];
                        break;
                    case B_NE:
                        v[i][j]   = 0.0;
                        u[i][j]   = 0.0;
                        v[i][j-1] = -v[i+1][j-1];
                        u[i-1][j] = -u[i-1][j+1];
                        break;
                    case B_SE:
                        v[i][j-1] = 0.0;
                        u[i][j]   = 0.0;
                        v[i][j]   = -v[i+1][j];
                        u[i-1][j] = -u[i-1][j-1];
                        break;
                    case B_SW:
                        v[i][j-1] = 0.0;
                        u[i-1][j] = 0.0;
                        v[i][j]   = -v[i-1][j];
                        u[i][j]   = -u[i][j-1];
                        break;
                    case B_NW:
                        v[i][j]   = 0.0;
                        u[i-1][j] = 0.0;
                        v[i][j-1] = -v[i-1][j-1];
                        u[i][j]   = -u[i][j+1];
                        break;
                }
            }
        }
    }


    /* Finally, fix the horizontal velocity at the  western edge to have
     * a continual flow of fluid into the simulation.
     */

    v[0][0] = 2*vi-v[1][0];
    for (j=1;j<=jmax;j++) {
        u[0][j] = ui;
        v[0][j] = 2*vi-v[1][j];
    }
}

任何帮助将不胜感激。

【问题讨论】：

您断言“开销会很大”，但是，虽然肯定会有开销，但在任何理智的库中它都不会“巨大”，因为 OpenMP 运行时库维护一个线程池，所以创建操作系统级线程的高成本只在第一个并行区域支付。因此，其他平行区域的成本应该只比屏障高一点。

标签： c multithreading performance parallel-processing openmp

【解决方案1】：

但我不想用

#pragma omp parallel num_threads(2)

您需要使用#pragma omp parallel 来创建线程。

代替：

#pragma omp for schedule(static)
for (i=1; i<=imax-1; i++) {
    for (j=1; j<=jmax; j++) {
        /* only if both adjacent cells are fluid cells */
        if ((flag[i][j] & C_F) && (flag[i+1][j] & C_F)) {
            float pcalc = p[i+1][j]-p[i][j];
            float mul = pcalc*del_t;
            float div = mul/delx;
            u[i][j] = f[i][j] - div;
            // u[i][j] = f[i][j]-(p[i+1][j]-p[i][j])*del_t/delx;
        }
    }
}
#pragma omp for schedule(static)
for (i=1; i<=imax; i++) {
    for (j=1; j<=jmax-1; j++) {
        /* only if both adjacent cells are fluid cells */
        if ((flag[i][j] & C_F) && (flag[i][j+1] & C_F)) {
            float pcalcv = p[i][j+1]-p[i][j];
            float mulv = pcalcv*del_t;
            float divv = mulv / dely;
            v[i][j] = g[i][j] - divv;
            // v[i][j] = g[i][j]-(p[i][j+1]-p[i][j])*del_t/dely;
        }
    }
}

试试下面的

#pragma omp parallel
{
    #pragma omp for private(j)
    for (i=1; i<=imax-1; i++) {
      for (j=1; j<=jmax; j++) {
        /* only if both adjacent cells are fluid cells */
        if ((flag[i][j] & C_F) && (flag[i+1][j] & C_F)) {
            float pcalc = p[i+1][j]-p[i][j];
            float mul = pcalc*del_t;
            float div = mul/delx;
            u[i][j] = f[i][j] - div;
            // u[i][j] = f[i][j]-(p[i+1][j]-p[i][j])*del_t/delx;
         }
      }
  }
  #pragma omp for private(j)
  for (i=1; i<=imax; i++) {
      for (j=1; j<=jmax-1; j++) {
        /* only if both adjacent cells are fluid cells */
        if ((flag[i][j] & C_F) && (flag[i][j+1] & C_F)) {
            float pcalcv = p[i][j+1]-p[i][j];
            float mulv = pcalcv*del_t;
            float divv = mulv / dely;
            v[i][j] = g[i][j] - divv;
            // v[i][j] = g[i][j]-(p[i][j+1]-p[i][j])*del_t/dely;
         }
      }
   }
}

您添加了#pragma omp for schedule(static)的另一个循环

    #pragma omp for schedule(static)  // decrease runtime by about 9 secs
    for (i=1; i<=imax; i++) {
        for (j=1; j<=jmax; j++) {
            if (flag[i][j] & B_NSEW) {
                switch (flag[i][j]) {

不应该并行化，因为最外层循环的迭代之间存在依赖关系，即：

            case B_N:
                v[i][j]   = 0.0;
                u[i][j]   = -u[i][j+1];
                u[i-1][j] = -u[i-1][j+1]; // <-- This one

等等。

【讨论】：