使用 OpenMP 的多线程 Nagel–Schreckenberg 模型（交通模拟）答案

【问题标题】：Multithreaded Nagel–Schreckenberg model (traffic simulation) with OpenMP使用 OpenMP 的多线程 Nagel–Schreckenberg 模型（交通模拟）
【发布时间】：2022-01-27 10:22:33
【问题描述】：

我正在尝试用 c 语言编写一个多线程的 Nagel–Schreckenberg 模型模拟，当线程访问尚未计算的数据时遇到了一些问题。

这是一个工作代码，它只并行计算每行的速度：

#define L 3000         // number of cells in row
#define num_iters 3000 // number of iterations
#define density 0.48  // how many positives
#define vmax 2
#define p 0.2

    for (int i = 0; i < num_iters - 1; i++)
    {
            int temp[L] = {0};

            #pragma omp parallel for
            for (int x = 0; x < L; x++)
            {
                if (iterations[i][x] > -1)
                {
                    int vi = iterations[i][x]; // velocity of previews iteration
                    int d = 1;                 // index of the next vehicle

                    while (iterations[i][(x + d) % L] < 0)
                        d++;

                    int vtemp = min(min(vi + 1, d - 1), vmax);    // increase speed, but avoid hitting the next car
                    int v = r2() < p ? max(vtemp - 1, 0) : vtemp; // stop the vehicle with probability p
                    temp[x] = v;
                }
            }
            
            for (int x = 0; x < L; x++) // write the velocities to the next line
            {
                if (iterations[i][x] > -1)
                {
                    int v = temp[x];
                    iterations[i + 1][(x + v) % L] = v;
                }
            }
    }

这很好用，但速度不够快。我正在尝试使用卷积来提高性能，但由于尚未计算，它有一半的时间无法读取相邻线程的数据。这是我使用的代码：

#include <omp.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <math.h>
#include <string.h>
#include <sys/time.h>

#define L 4000         // number of cells in row
#define num_iters 4000 // number of iterations
#define density 0.48  // how many positives
#define vmax 2
#define p 0.2
#define BLOCKS_Y 4
#define BLOCKS_X 4
#define BLOCKSIZEY (L / BLOCKS_Y)
#define BLOCKSIZEX (L / BLOCKS_X)

time_t t;

#ifndef min
#define min(a, b) (((a) < (b)) ? (a) : (b))
#endif

#ifndef max
#define max(a, b) (((a) > (b)) ? (a) : (b))
#endif

void shuffle(int *array, size_t n)
{
  if (n > 1)
  {
    size_t i;
    for (i = 0; i < n - 1; i++)
    {
      size_t j = i + rand() / (RAND_MAX / (n - i) + 1);
      int t = array[j];
      array[j] = array[i];
      array[i] = t;
    }
  }
}

double r2()
{
  return (double)rand() / (double)RAND_MAX;
}

void writeImage(int *iterations[], char filename[])
{
    int h = L;
    int w = num_iters;
    FILE *f;
    unsigned char *img = NULL;
    int filesize = 54 + 3 * w * h;

    img = (unsigned char *)malloc(3 * w * h);
    memset(img, 0, 3 * w * h);

    for (int i = 0; i < w; i++)
    {
        for (int j = 0; j < h; j++)
        {
            int x = i;
            int y = (h - 1) - j;
            int color = iterations[i][j] == 0 ? 0 : 255;
            img[(x + y * w) * 3 + 2] = (unsigned char)(color);
            img[(x + y * w) * 3 + 1] = (unsigned char)(color);
            img[(x + y * w) * 3 + 0] = (unsigned char)(color);
        }
    }

    unsigned char bmpfileheader[14] = {'B', 'M', 0, 0, 0, 0, 0, 0, 0, 0, 54, 0, 0, 0};
    unsigned char bmpinfoheader[40] = {40, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 24, 0};
    unsigned char bmppad[3] = {0, 0, 0};

    bmpfileheader[2] = (unsigned char)(filesize);
    bmpfileheader[3] = (unsigned char)(filesize >> 8);
    bmpfileheader[4] = (unsigned char)(filesize >> 16);
    bmpfileheader[5] = (unsigned char)(filesize >> 24);

    bmpinfoheader[4] = (unsigned char)(w);
    bmpinfoheader[5] = (unsigned char)(w >> 8);
    bmpinfoheader[6] = (unsigned char)(w >> 16);
    bmpinfoheader[7] = (unsigned char)(w >> 24);
    bmpinfoheader[8] = (unsigned char)(h);
    bmpinfoheader[9] = (unsigned char)(h >> 8);
    bmpinfoheader[10] = (unsigned char)(h >> 16);
    bmpinfoheader[11] = (unsigned char)(h >> 24);

    f = fopen(filename, "wb");
    fwrite(bmpfileheader, 1, 14, f);
    fwrite(bmpinfoheader, 1, 40, f);
    for (int i = 0; i < h; i++)
    {
        fwrite(img + (w * (h - i - 1) * 3), 3, w, f);
        fwrite(bmppad, 1, (4 - (w * 3) % 4) % 4, f);
    }

    free(img);
    fclose(f);
}

void simulation()
{
    printf("L=%d, num_iters=%d\n", L, num_iters);
    int z = 0;
    z++;
    int current_index = 0;
    int success_moves = 0;

    const int cars_num = (int)(density * L);

    int **iterations = (int **)malloc(num_iters * sizeof(int *));
    for (int i = 0; i < num_iters; i++)
        iterations[i] = (int *)malloc(L * sizeof(int));

    for (int i = 0; i < L; i++)
    {
        iterations[0][i] = i <= cars_num ? 0 : -1;
    }
    shuffle(iterations[0], L);

    for (int i = 0; i < num_iters - 1; i++)
        for (int x = 0; x < L; x++)
            iterations[i + 1][x] = -1;

    double *randoms = (double *)malloc(L * num_iters * sizeof(double));
    for (int i = 0; i < L * num_iters; i++) {
        randoms[i] = r2();
    }

    #pragma omp parallel for collapse(2)
    for (int blocky = 0; blocky < BLOCKS_Y; blocky++)
    {
        for (int blockx = 0; blockx < BLOCKS_X; blockx++)
        {
            int ystart = blocky * BLOCKSIZEY;
            int yend = ystart + BLOCKSIZEY;
            int xstart = blockx * BLOCKSIZEX;
            int xend = xstart + BLOCKSIZEX;

            for (int y = ystart; y < yend; y++)
            {
                for (int x = xstart; x < xend; x++)
                {
                    if (iterations[y][x] > -1)
                    {
                        int vi = iterations[y][x];
                        int d = 1;

                        int start = (x + d) % L;
                        int i;
                        for (i = start; i < L && iterations[y][i] < 0; ++i);
                        d += i - start;
                        if (i == L)
                        {
                            for (i = 0; i < start && iterations[y][i] < 0; ++i);
                            d += i;
                        }

                        int vtemp = min(min(vi + 1, d - 1), vmax);
                        int v = randoms[x * y] < p ? max(vtemp - 1, 0) : vtemp;
                        iterations[y + 1][(x + v) % L] = v;
                    }
                }
            }
        }
    }

    if (L <= 4000)
        writeImage(iterations, "img.bmp");
    free(iterations);
}

void main() {
    srand((unsigned)time(&t));
    simulation();
}

如您所见，在计算第二个块时，第一个块可能还没有计算出来，从而产生了那个空白空间。

我认为可以通过卷积来解决这个问题，但我只是做错了什么，我不确定是什么。如果您能就如何解决此问题提供任何建议，我将不胜感激。

【问题讨论】：

num_iters 和 L 有多大？
截图上都是3000，但最终都是20000+
关于第一个代码：它受内存限制（计算量很少，内存读/写很多），因此如果它不能很好地与使用的内核数量一起扩展也就不足为奇了。您是否需要存储所有先前迭代的结果？关于第二个代码：我对 Nagel-Schreckenberg 模型一无所知，但如果瓶颈是内存读/写，它可能不会明显更快。另一方面，我看不到第一个和第二个代码之间的联系。
1.你有多少个内核，你得到了什么加速？ 2.你设置OMP_PROC_BIND吗？ 3.您的第一个代码如何正确？您并行化了 x，但每个索引都依赖于其他索引。
您能否明确输入“与上述相同的代码”以确保没有基本错误？有一个最小的 reproducible 示例会有所帮助。

标签： multithreading concurrency openmp traffic-simulation data-race

【解决方案1】：

第二个代码中有一个竞争条件，因为iterations 可以由一个线程读取并由另一个线程写入。更具体地说，iterations[y + 1][(x + v) % L] = v 设置了一个值，当两个线程处理连续的y 值（两个连续的blocky 值）时，另一个线程应该在检查iterations[y][x] 或iterations[y][(x + d) % L] 时读取该值。

此外，r2 函数必须是线程安全的。它似乎是一个随机数生成器 (RNG)，但这种随机函数通常使用通常不是线程安全的 全局变量 来实现。一种简单有效的解决方案是改用thread_local 变量。另一种解决方案是将参数显式传递给随机函数的可变状态。当您设计并行应用程序时，后者是一种很好的做法，因为它可以显示内部状态的变化，并提供更好地控制 RNG 确定性的方法。

除此之外，请注意模数通常很昂贵，特别是如果L 不是编译时常量。您可以通过在循环之前预先计算余数或拆分循环来删除其中的一些，以便仅在边界附近执行检查。这是while 的（未经测试的）示例：

int start = (x + d) % L;
int i;

for(i=start ; i < L && iterations[y][i] < 0 ; ++i);
d += i - start;

if(i == L) {
    for(i=0 ; i < start && iterations[y][i] < 0 ; ++i);
    d += i;
}

最后，请注意块应该能被 4 整除。否则，当前代码无效（可能需要最小/最大钳位）。

【讨论】：

非常感谢您的详细回答，我用所有建议的更改更新了问题，但它仍然无法正常工作。我使用预先生成的随机数只是为了在测试时更加安全。我知道比赛条件，但不知道如何摆脱它
while 部分的建议更改是为了提高性能而不是解决竞争条件。我认为主要问题仍然存在于iterations[y + 1][(x + v) % L] = ... 行中。您可能需要多个数组或额外的同步（甚至多个步骤）来防止线程同时访问相同的数组位置。这高度依赖于底层算法（我还不完全理解）。
我更新了代码，现在应该可以使用了。该算法是一种流量模拟，这里很好解释de.zxc.wiki/wiki/Nagel-Schreckenberg-Modell我不确定多个数组是否有帮助，因为数组已经分配，我认为更多的是关于访问顺序。我尝试在示例中对其应用卷积算法，但我认为我做错了