omp simd减少没有加速答案

【问题标题】：no speedup for omp simd reductionomp simd减少没有加速
【发布时间】：2021-04-09 09:27:32
【问题描述】：

我正在尝试使用矢量化（openmp simd）来加速矩阵乘法。为了利用矢量化，我转置了第二个矩阵（以使变化最快的索引通过连续内存）。我正在 3000 x 3000 阵列上运行我的测试。因为我无法测量有与没有 open mp pragma 时的挂壁时间差异，所以我想确认我实际上正在为我正在相乘的单个数组获得加速（事实并非如此）。正因为如此，我插入了一些相同大小的虚拟数组，以检查是否使用 SIMD 对它们进行加速（至少在使用缩减子句时）。

现在我的问题是阻碍 SIMD 加速的问题是什么，我唯一的猜测是它一定是数组的二维，但我不完全明白为什么会导致减速。

或者我的代码是否还有其他我看不到的问题？

#include <stdlib.h>
#include <stdio.h>
#include <omp.h>

#include <time.h>

const int N = 3000;

struct timespec begin, end;

double **create_a() {
    double *a = (double *)aligned_alloc(32, sizeof(double) * N * N);
    double **a_indexed = (double **)aligned_alloc(32, sizeof(double *) * N);
    for (int i = 0; i<N; i++){
        a_indexed[i] = a+i*N;
    }

    for (int i = 0; i< N; i++) {
        for (int j = 0; j<N; j++) {
            a_indexed[i][j] = i * j;
        }
    }
    return a_indexed;
}

double **create_b(){
    double *b = (double *)aligned_alloc(32, sizeof(double) * N * N);
    double **b_indexed = (double **)aligned_alloc(32, sizeof(double *) * N);
    for (int i = 0; i<N; i++){
        b_indexed[i] = b+i*N;
    }

    for (int i = 0; i< N; i++) {
        for (int j = 0; j<N; j++) {
            b_indexed[i][j] = (i == j) ? 1:0;
        }
    }
    return b_indexed;
}

double **transpose( double **matrix) {
    double *t = (double *)aligned_alloc(32, sizeof(double) * N * N);
    double **t_indexed = (double **)aligned_alloc(32, sizeof(double *) * N);
    for (int i = 0; i<N; i++){
        t_indexed[i] = t+i*N;
    }

    for (int i = 0; i< N; i++) {
        for (int j = 0; j<N; j++) {
            t_indexed[i][j] = matrix[j][i];
        }
    }
    return t_indexed;
}

double **mul(double **a, double **b) {
    double **b_t = transpose(b);
    double *res = (double *)aligned_alloc(32, sizeof(double) * N * N);
    double **res_indexed = (double **)aligned_alloc(32, sizeof(double *) * N);
    for (int i = 0; i<N; i++){
        res_indexed[i] = res+i*N;
    }

    for (int row = 0; row< 1; row++) {
        for (int col = 0; col < 1; col++) {
    double cell_res = 0;

    // problematic calculation that I can't get to speed up no matter what pragma I use
    clock_gettime(CLOCK_REALTIME, &begin);
    #pragma omp simd aligned(a, b_t:32) reduction(+:cell_res)
    for (int i = 0; i < N; i++) {
        cell_res += a[0][i] * b_t[0][i];
    }
    clock_gettime(CLOCK_REALTIME, &end);

    long seconds = end.tv_sec - begin.tv_sec;
    long nanoseconds = end.tv_nsec - begin.tv_nsec;
    double elapsed = seconds + nanoseconds*1e-9;

    printf("Time simd reduce measured: %.9f seconds.\n", elapsed);


    // dummy array measurements

    struct timespec begin2, end2;
    struct timespec begin3, end3;
    struct timespec begin4, end4;

    double *a2 = (double *)aligned_alloc(32, sizeof(double) * N);
    double *b2 = (double *)aligned_alloc(32, sizeof(double) * N);
    for (int i = 0; i < N ; i++){
        a2[i] = 1;
        b2[i] = 1;
    }

    // measurement with reduction is significantly faster than others
    clock_gettime(CLOCK_REALTIME, &begin2);
    #pragma omp simd aligned(a2, b2:32) reduction(+:cell_res)
    for (int i = 0; i < N; i++) {
        cell_res += a2[i] * b2[i];
    }

    clock_gettime(CLOCK_REALTIME, &end2);
    long seconds2 = end2.tv_sec - begin2.tv_sec;
    long nanoseconds2 = end2.tv_nsec - begin2.tv_nsec;
    double elapsed2 = seconds2 + nanoseconds2*1e-9;
    
    printf("time2 (simd reduction): %.9f seconds.\n", elapsed2);

    // no speedup compared to without simd (slightly faster than problematic calculation)
    clock_gettime(CLOCK_REALTIME, &begin3);
    #pragma omp simd aligned(a2, b2:32)
    for (int i = 0; i < N; i++) {
        cell_res += a2[i] * b2[i];
    }
    clock_gettime(CLOCK_REALTIME, &end3);

    long seconds3 = end3.tv_sec - begin3.tv_sec;
    long nanoseconds3 = end3.tv_nsec - begin3.tv_nsec;
    double elapsed3 = seconds3 + nanoseconds3*1e-9;
    printf("time3 (simd): %.9f seconds.\n", elapsed3);

    // no pragma (slightly faster than problematic calculation)
    clock_gettime(CLOCK_REALTIME, &begin4);
    for (int i = 0; i < N; i++) {
        cell_res += a2[i] * b2[i];
    }
    clock_gettime(CLOCK_REALTIME, &end4);

    long seconds4 = end4.tv_sec - begin4.tv_sec;
    long nanoseconds4 = end4.tv_nsec - begin4.tv_nsec;
    double elapsed4 = seconds4 + nanoseconds4*1e-9;
    printf("time4: %.9f seconds.\n", elapsed4);


    res_indexed[0][0] = cell_res;
        }
    }
    return res_indexed;
}


int main (int argc, char **argv) {
    //init a(i,j) = i * j
    double **a = create_a();

    //init b(i,j) = (i == j) ? 1:0;
    double **b = create_b();

    //multiply
    double **res = mul(a,b);
}

Time simd reduce measured: 0.000004188 seconds. // problematic
time2 (simd reduction): 0.000001762 seconds. // faster
time3 (simd): 0.000003475 seconds. //slightly faster
time4: 0.000003476 seconds. //slightly faster

【问题讨论】：

标签： c performance parallel-processing openmp simd

【解决方案1】：

我已经在我的机器上测试了前两个循环，我可以重现相同的行为。

Time simd reduce measured: 0.000006000 seconds.
time2 (simd reduction): 0.000004000 seconds.

我的猜测是有两个问题：

第一个问题：

多个版本的执行时间之间的差异似乎更多地与缓存有关，而不是矢量化。因此，当您使用具有 3000 个元素（24 KB）的虚拟数组进行测试时：

double *a2 = (double *)aligned_alloc(32, sizeof(double) * N);
double *b2 = (double *)aligned_alloc(32, sizeof(double) * N);
for (int i = 0; i < N ; i++){
    a2[i] = 1;
    b2[i] = 1;
}

这些小数组在初始化期间被加载到缓存中（即a2[i] = 1）。另一方面，矩阵a 和b_t 的大小为3000 x 3000（72 兆字节），这使得第一行不太可能完全位于缓存中（尽管这取决于测试环境的缓存）。

我改变了以下循环：

// problematic calculation that I can't get to speed up no matter what pragma I use
clock_gettime(CLOCK_REALTIME, &begin);
#pragma omp simd aligned(a, b_t:32) reduction(+:cell_res)
for (int i = 0; i < N; i++) {
    cell_res += a[0][i] * b_t[0][i];
}
clock_gettime(CLOCK_REALTIME, &end);

通过将矩阵a和b_t的第一行分别打包成两个新矩阵a_2和b_2，即：

for(int i = 0; i < N; i++){
      a_2[0][i]  = a[0][i];
      bt_2[0][i] = b_t[0][i];
 }

// problematic calculation that I can't get to speed up no matter what pragma I use
clock_gettime(CLOCK_REALTIME, &begin);
#pragma omp simd aligned(a_2, bt_2) reduction(+:cell_res)
for (int i = 0; i < N; i++) {
    cell_res += a_2[0][i] * bt_2[0][i];
}

通过将这些矩阵打包成只有一行的较小矩阵，我可以将这些矩阵加载到缓存中，从而减少第一个版本的执行时间。新结果：

Time simd reduce measured: 0.000004000 seconds.
time2 (simd reduction): 0.000004000 seconds.

IMO 您不应该在同一个函数中测试所有这些循环，因为编译器可能会以不同方式优化这些循环，然后存在缓存这些值的问题，等等。我会在单独的运行中测试它们。

现在我的问题是阻碍 SIMD 的问题是什么加速 我唯一的猜测是它一定是二维的数组，但我不完全明白为什么会导致减速。

我还测试了将矩阵a 和b_t 的第一行直接打包到单独的一维数组（而不是矩阵）中，但结果完全相同。接受它的价值。现在您应该在您的环境中进行分析，即测试缓存未命中。

更重要的是，测试这个版本：

clock_gettime(CLOCK_REALTIME, &begin);
for (int i = 0; i < N; i++) {
    cell_res += a[0][i] * b_t[0][i];
}
clock_gettime(CLOCK_REALTIME, &end);

有和没有#pragma omp simd aligned(a_2, bt_2:32) reduction(+:cell_res)，以及有和没有打包到数组中，但分别测试所有这些版本。此外，针对不同的输入大小测试它们。

第二个问题：

另一个问题是：

for (int i = 0; i < N; i++) {
        cell_res += a[0][i] * b_t[0][i];
}

受内存限制，因此使用SIMD 获得收益的机会较少，不应该从双精度浮点乘积中获得expect much gains 和SIMD。一种解决方法是将矩阵从双精度数更改为浮点数，从而将所需的内存带宽减少到一半，并使您可以执行的SIMD 操作数加倍。尽管如此，上述代码 sn-p 仍将受内存限制。尽管如此，您可能会获得一些收益，主要是当值在缓存中时。

在我的机器中，从 double 更改为 floats，使 SIMD 版本明显比没有它的版本快，即使不使用包装也是如此。这也可能是您遇到的问题。

【讨论】：