【问题标题】:no speedup for omp simd reduction(omp simd 归约没有加速)
【发布时间】:2021-04-09 09:27:32
【问题描述】:

我正在尝试使用矢量化(OpenMP SIMD)来加速矩阵乘法。为了利用矢量化,我转置了第二个矩阵(使变化最快的索引沿连续内存访问)。我在 3000 x 3000 的数组上运行测试。由于我测不出加与不加 OpenMP pragma 时挂钟时间(wall time)的差异,我想先确认:对于我正在相乘的各个数组,是否确实获得了加速(事实并非如此)。为此,我插入了一些相同大小的虚拟数组,以检查使用 SIMD 能否让它们加速(至少在使用 reduction(归约)子句时可以)。

现在我的问题是:是什么阻碍了 SIMD 加速?我唯一的猜测是这与数组是二维的有关,但我不完全明白这为什么会导致变慢。

或者我的代码是否还有其他我看不到的问题?

#include <stdlib.h>
#include <stdio.h>
#include <omp.h>

#include <time.h>

/* Matrix dimension: every matrix built in this file is N x N doubles. */
const int N = 3000;

/* File-scope timestamps that bracket the first timed loop in mul(). */
struct timespec begin, end;

/**
 * Allocate an N x N matrix of doubles and fill it with a(i, j) = i * j.
 *
 * The elements live in one contiguous buffer requested with 32-byte
 * alignment; the returned array of N row pointers indexes into that buffer,
 * so result[0] is the start of the element storage and result[i][j]
 * addresses row i, column j.
 *
 * @return the row-pointer array, or NULL if either allocation fails.
 *         Release with free(result[0]); free(result);
 */
double **create_a() {
    double *a = aligned_alloc(32, sizeof *a * N * N);
    double **a_indexed = aligned_alloc(32, sizeof *a_indexed * N);
    if (a == NULL || a_indexed == NULL) {
        /* free(NULL) is a no-op, so this covers either allocation failing. */
        free(a);
        free(a_indexed);
        return NULL;
    }

    for (int i = 0; i < N; i++) {
        /* Row i starts i * N elements into the contiguous buffer. */
        a_indexed[i] = a + i * N;
        for (int j = 0; j < N; j++) {
            a_indexed[i][j] = i * j;
        }
    }
    return a_indexed;
}

/**
 * Allocate an N x N matrix of doubles and fill it with the identity matrix:
 * b(i, j) is 1 on the diagonal and 0 elsewhere.
 *
 * The elements live in one contiguous buffer requested with 32-byte
 * alignment; the returned array of N row pointers indexes into that buffer,
 * so result[0] is the start of the element storage.
 *
 * @return the row-pointer array, or NULL if either allocation fails.
 *         Release with free(result[0]); free(result);
 */
double **create_b(){
    double *b = aligned_alloc(32, sizeof *b * N * N);
    double **b_indexed = aligned_alloc(32, sizeof *b_indexed * N);
    if (b == NULL || b_indexed == NULL) {
        /* free(NULL) is a no-op, so this covers either allocation failing. */
        free(b);
        free(b_indexed);
        return NULL;
    }

    for (int i = 0; i < N; i++) {
        /* Row i starts i * N elements into the contiguous buffer. */
        b_indexed[i] = b + i * N;
        for (int j = 0; j < N; j++) {
            b_indexed[i][j] = (i == j) ? 1.0 : 0.0;
        }
    }
    return b_indexed;
}

/**
 * Build a newly allocated transpose of an N x N matrix.
 *
 * @param matrix  array of N row pointers, each addressing N doubles;
 *                it is only read, never modified.
 * @return a new row-pointer array t with t[i][j] == matrix[j][i], backed by
 *         one contiguous buffer requested with 32-byte alignment, or NULL if
 *         matrix is NULL or an allocation fails.
 *         Release with free(result[0]); free(result);
 */
double **transpose( double **matrix) {
    if (matrix == NULL) {
        return NULL;
    }

    double *t = aligned_alloc(32, sizeof *t * N * N);
    double **t_indexed = aligned_alloc(32, sizeof *t_indexed * N);
    if (t == NULL || t_indexed == NULL) {
        /* free(NULL) is a no-op, so this covers either allocation failing. */
        free(t);
        free(t_indexed);
        return NULL;
    }

    for (int i = 0; i < N; i++) {
        /* Row i starts i * N elements into the contiguous buffer. */
        t_indexed[i] = t + i * N;
        for (int j = 0; j < N; j++) {
            /* Writes are contiguous along row i; reads walk down column i. */
            t_indexed[i][j] = matrix[j][i];
        }
    }
    return t_indexed;
}

/*
 * Timing harness built around a matrix product.
 *
 * Despite the name, only element [0][0] of the result is written: the
 * row/col loops run a single iteration and the timed loops always read
 * a[0] and b_t[0]. Four dot-product loops are timed with clock_gettime()
 * and each elapsed time is printed to stdout:
 *   1. a[0] . b_t[0]  with "omp simd ... reduction"
 *   2. a2 . b2        with "omp simd ... reduction"  (dummy 1-D arrays)
 *   3. a2 . b2        with "omp simd" only
 *   4. a2 . b2        with no pragma
 *
 * cell_res is never reset between the loops, so the value stored in
 * res_indexed[0][0] is the sum of all four dot products, not just the first.
 *
 * Parameters: a and b are arrays of N row pointers, each row holding N
 * doubles. Returns a newly allocated row-pointer array for the result; all
 * elements other than [0][0] are never written here.
 *
 * NOTE(review): b_t, a2 and b2 are allocated here and never freed in this
 * function.
 */
double **mul(double **a, double **b) {
    // Transposed copy of b, so b_t[col] is a contiguous run of column col.
    double **b_t = transpose(b);
    double *res = (double *)aligned_alloc(32, sizeof(double) * N * N);
    double **res_indexed = (double **)aligned_alloc(32, sizeof(double *) * N);
    // Point each row pointer at its slice of the contiguous result buffer.
    for (int i = 0; i<N; i++){
        res_indexed[i] = res+i*N;
    }

    // Both loops are capped at one iteration: only a single cell is measured.
    for (int row = 0; row< 1; row++) {
        for (int col = 0; col < 1; col++) {
    double cell_res = 0;

    // problematic calculation that I can't get to speed up no matter what pragma I use
    // NOTE(review): the aligned clause lists the double** row-pointer arrays
    // a and b_t, not the rows a[0] / b_t[0] that the loop actually reads --
    // confirm this is the intended alignment hint.
    clock_gettime(CLOCK_REALTIME, &begin);
    #pragma omp simd aligned(a, b_t:32) reduction(+:cell_res)
    for (int i = 0; i < N; i++) {
        cell_res += a[0][i] * b_t[0][i];
    }
    clock_gettime(CLOCK_REALTIME, &end);

    // Elapsed time in seconds; a negative nanosecond difference is absorbed
    // by the whole-second difference when the two are summed.
    long seconds = end.tv_sec - begin.tv_sec;
    long nanoseconds = end.tv_nsec - begin.tv_nsec;
    double elapsed = seconds + nanoseconds*1e-9;

    printf("Time simd reduce measured: %.9f seconds.\n", elapsed);


    // dummy array measurements

    struct timespec begin2, end2;
    struct timespec begin3, end3;
    struct timespec begin4, end4;

    // Two N-element arrays of ones, written immediately before being timed.
    double *a2 = (double *)aligned_alloc(32, sizeof(double) * N);
    double *b2 = (double *)aligned_alloc(32, sizeof(double) * N);
    for (int i = 0; i < N ; i++){
        a2[i] = 1;
        b2[i] = 1;
    }

    // measurement with reduction is significantly faster than others
    clock_gettime(CLOCK_REALTIME, &begin2);
    #pragma omp simd aligned(a2, b2:32) reduction(+:cell_res)
    for (int i = 0; i < N; i++) {
        cell_res += a2[i] * b2[i];
    }

    clock_gettime(CLOCK_REALTIME, &end2);
    long seconds2 = end2.tv_sec - begin2.tv_sec;
    long nanoseconds2 = end2.tv_nsec - begin2.tv_nsec;
    double elapsed2 = seconds2 + nanoseconds2*1e-9;
    
    printf("time2 (simd reduction): %.9f seconds.\n", elapsed2);

    // no speedup compared to without simd (slightly faster than problematic calculation)
    // NOTE(review): this simd loop has no reduction clause although cell_res
    // is accumulated across iterations -- confirm that is intentional.
    clock_gettime(CLOCK_REALTIME, &begin3);
    #pragma omp simd aligned(a2, b2:32)
    for (int i = 0; i < N; i++) {
        cell_res += a2[i] * b2[i];
    }
    clock_gettime(CLOCK_REALTIME, &end3);

    long seconds3 = end3.tv_sec - begin3.tv_sec;
    long nanoseconds3 = end3.tv_nsec - begin3.tv_nsec;
    double elapsed3 = seconds3 + nanoseconds3*1e-9;
    printf("time3 (simd): %.9f seconds.\n", elapsed3);

    // no pragma (slightly faster than problematic calculation)
    clock_gettime(CLOCK_REALTIME, &begin4);
    for (int i = 0; i < N; i++) {
        cell_res += a2[i] * b2[i];
    }
    clock_gettime(CLOCK_REALTIME, &end4);

    long seconds4 = end4.tv_sec - begin4.tv_sec;
    long nanoseconds4 = end4.tv_nsec - begin4.tv_nsec;
    double elapsed4 = seconds4 + nanoseconds4*1e-9;
    printf("time4: %.9f seconds.\n", elapsed4);


    // Stores the running total of all four loops above into the one cell.
    res_indexed[0][0] = cell_res;
        }
    }
    return res_indexed;
}


/* Release a matrix laid out as one contiguous element buffer (m[0]) plus an
 * array of row pointers (m), as built by create_a/create_b/mul. */
static void free_matrix(double **m) {
    if (m != NULL) {
        free(m[0]);
        free(m);
    }
}

/**
 * Benchmark driver: builds a(i,j) = i * j and the identity matrix b, then
 * calls mul(), which prints its timing measurements to stdout.
 *
 * Command-line arguments are not used.
 *
 * @return EXIT_SUCCESS, or EXIT_FAILURE if an input matrix could not be
 *         created.
 */
int main (int argc, char **argv) {
    (void)argc;
    (void)argv;

    //init a(i,j) = i * j
    double **a = create_a();

    //init b(i,j) = (i == j) ? 1:0;
    double **b = create_b();

    if (a == NULL || b == NULL) {
        fprintf(stderr, "matrix allocation failed\n");
        free_matrix(a);
        free_matrix(b);
        return EXIT_FAILURE;
    }

    //multiply
    double **res = mul(a,b);

    free_matrix(res);
    free_matrix(b);
    free_matrix(a);
    return EXIT_SUCCESS;
}
Time simd reduce measured: 0.000004188 seconds. // problematic
time2 (simd reduction): 0.000001762 seconds. // faster
time3 (simd): 0.000003475 seconds. //slightly faster
time4: 0.000003476 seconds. //slightly faster

【问题讨论】:

    标签: c performance parallel-processing openmp simd


    【解决方案1】:

    我已经在我的机器上测试了前两个循环,我可以重现相同的行为。

    Time simd reduce measured: 0.000006000 seconds.
    time2 (simd reduction): 0.000004000 seconds.
    

    我的猜测是有两个问题:

    第一个问题:

    多个版本的执行时间之间的差异似乎更多地与缓存有关,而不是矢量化。因此,当您使用具有 3000 个元素(24 KB)的虚拟数组进行测试时:

    double *a2 = (double *)aligned_alloc(32, sizeof(double) * N);
    double *b2 = (double *)aligned_alloc(32, sizeof(double) * N);
    for (int i = 0; i < N ; i++){
        a2[i] = 1;
        b2[i] = 1;
    }
    

    这些小数组在初始化期间(a2[i] = 1)就被加载到了缓存中。另一方面,矩阵 a 和 b_t 的大小为 3000 x 3000(各 72 兆字节),这使得它们的第一行不太可能完全位于缓存中(尽管这取决于测试环境的缓存)。

    我改变了以下循环:

    // problematic calculation that I can't get to speed up no matter what pragma I use
    clock_gettime(CLOCK_REALTIME, &begin);
    #pragma omp simd aligned(a, b_t:32) reduction(+:cell_res)
    for (int i = 0; i < N; i++) {
        cell_res += a[0][i] * b_t[0][i];
    }
    clock_gettime(CLOCK_REALTIME, &end);
    

    把矩阵 a 和 b_t 的第一行分别打包到两个新矩阵 a_2 和 bt_2 中,即:

    for(int i = 0; i < N; i++){
          a_2[0][i]  = a[0][i];
          bt_2[0][i] = b_t[0][i];
     }
    
    // problematic calculation that I can't get to speed up no matter what pragma I use
    clock_gettime(CLOCK_REALTIME, &begin);
    #pragma omp simd aligned(a_2, bt_2) reduction(+:cell_res)
    for (int i = 0; i < N; i++) {
        cell_res += a_2[0][i] * bt_2[0][i];
    }
    

    通过将这些矩阵打包成只有一行的较小矩阵,我可以将这些矩阵加载到缓存中,从而减少第一个版本的执行时间。新结果:

    Time simd reduce measured: 0.000004000 seconds.
    time2 (simd reduction): 0.000004000 seconds.
    

    IMO 您不应该在同一个函数中测试所有这些循环,因为编译器可能会以不同方式优化这些循环,然后存在缓存这些值的问题,等等。我会在单独的运行中测试它们。

    > 现在我的问题是:是什么阻碍了 SIMD 加速?我唯一的猜测是这与数组是二维的有关,但我不完全明白这为什么会导致变慢。

    我还测试了将矩阵 a 和 b_t 的第一行直接打包到单独的一维数组(而不是矩阵)中,但结果完全相同。这个结果仅供参考。接下来您应该在自己的环境中进行性能分析,例如测量缓存未命中。

    更重要的是,测试这个版本:

    clock_gettime(CLOCK_REALTIME, &begin);
    for (int i = 0; i < N; i++) {
        cell_res += a[0][i] * b_t[0][i];
    }
    clock_gettime(CLOCK_REALTIME, &end);
    

    分别在有和没有 #pragma omp simd aligned(a_2, bt_2:32) reduction(+:cell_res) 的情况下,以及在打包和不打包到数组中的情况下进行测试;但所有这些版本都要单独测试。此外,还要针对不同的输入大小测试它们。

    第二个问题:

    另一个问题是:

    for (int i = 0; i < N; i++) {
            cell_res += a[0][i] * b_t[0][i];
    } 
    

    受内存带宽限制,因此使用 SIMD 获得收益的机会较少;对于双精度浮点数的乘积,不应期望 SIMD 带来很大的收益。一种解决方法是将矩阵元素从 double 改为 float,从而将所需的内存带宽减半,并使每条 SIMD 指令能处理的元素数加倍。即便如此,上述代码片段仍将受内存限制。不过,您仍可能获得一些收益,主要是在数据位于缓存中时。

    在我的机器上,把 double 改为 float 之后,SIMD 版本明显快于不使用 SIMD 的版本,即使不做打包也是如此。这也可能就是您遇到的问题。

    【讨论】:

      猜你喜欢
      • 1970-01-01
      • 1970-01-01
      • 2017-08-11
      • 1970-01-01
      • 2018-07-20
      • 2016-10-04
      • 2019-07-06
      • 2016-05-03
      • 2015-06-04
      相关资源
      最近更新 更多