【发布时间】:2021-04-09 09:27:32
【问题描述】:
我正在尝试使用矢量化(OpenMP SIMD)来加速矩阵乘法。为了利用矢量化,我转置了第二个矩阵(使变化最快的索引访问连续内存)。我在 3000 x 3000 的数组上运行测试。由于无论是否使用 OpenMP pragma,我都测量不出挂钟时间(wall-clock time)的差异,所以我想确认在参与相乘的单行数组上是否真的获得了加速(事实并非如此)。为此,我插入了一些相同大小的虚拟数组,以检查 SIMD 是否能对它们加速(可以,至少在使用归约(reduction)子句时如此)。
现在我的问题是:是什么阻碍了 SIMD 加速?我唯一的猜测是问题出在数组是二维的,但我不完全明白这为什么会导致变慢。
或者我的代码是否还有其他我看不到的问题?
#include <stdlib.h>
#include <stdio.h>
#include <omp.h>
#include <time.h>
/* Side length of every matrix in this file; matrix buffers hold N * N doubles. */
const int N = 3000;
/* Timestamps taken immediately before and after the first timed loop in mul(). */
struct timespec begin, end;
/**
 * Allocate an N x N matrix of doubles and fill it with a(i, j) = i * j.
 *
 * The elements live in one contiguous, 32-byte aligned buffer. The returned
 * array holds N row pointers into that buffer, so element (i, j) is reached
 * as result[i][j] and result[0] is the start of the buffer.
 *
 * @return the row-pointer array, or NULL if either allocation fails.
 *         Release with free(result[0]) followed by free(result).
 */
double **create_a() {
    double *data = aligned_alloc(32, (size_t)N * N * sizeof *data);
    double **rows = aligned_alloc(32, (size_t)N * sizeof *rows);

    if (data == NULL || rows == NULL) {
        /* free(NULL) is a no-op, so this releases whichever half succeeded. */
        free(data);
        free(rows);
        return NULL;
    }

    for (int i = 0; i < N; i++) {
        rows[i] = data + (size_t)i * N;
    }

    for (int i = 0; i < N; i++) {
        for (int j = 0; j < N; j++) {
            /* Multiply in double so the product cannot overflow int. */
            rows[i][j] = (double)i * j;
        }
    }
    return rows;
}
/**
 * Allocate an N x N matrix of doubles and fill it with the identity matrix:
 * b(i, j) is 1 when i == j and 0 otherwise.
 *
 * The elements live in one contiguous, 32-byte aligned buffer. The returned
 * array holds N row pointers into that buffer, so result[0] is the start of
 * the buffer.
 *
 * @return the row-pointer array, or NULL if either allocation fails.
 *         Release with free(result[0]) followed by free(result).
 */
double **create_b(){
    double *data = aligned_alloc(32, (size_t)N * N * sizeof *data);
    double **rows = aligned_alloc(32, (size_t)N * sizeof *rows);

    if (data == NULL || rows == NULL) {
        /* free(NULL) is a no-op, so this releases whichever half succeeded. */
        free(data);
        free(rows);
        return NULL;
    }

    for (int i = 0; i < N; i++) {
        rows[i] = data + (size_t)i * N;
    }

    for (int i = 0; i < N; i++) {
        for (int j = 0; j < N; j++) {
            rows[i][j] = (i == j) ? 1.0 : 0.0;
        }
    }
    return rows;
}
/**
 * Build a freshly allocated transpose of an N x N matrix.
 *
 * The result uses one contiguous, 32-byte aligned buffer plus an array of N
 * row pointers into it, with result[i][j] == matrix[j][i]. The input is only
 * read, never modified.
 *
 * @param matrix array of N row pointers, each addressing N doubles.
 * @return the row-pointer array of the transpose, or NULL if matrix is NULL
 *         or an allocation fails. Release with free(result[0]) followed by
 *         free(result).
 */
double **transpose( double **matrix) {
    if (matrix == NULL) {
        return NULL;
    }

    double *data = aligned_alloc(32, (size_t)N * N * sizeof *data);
    double **rows = aligned_alloc(32, (size_t)N * sizeof *rows);

    if (data == NULL || rows == NULL) {
        /* free(NULL) is a no-op, so this releases whichever half succeeded. */
        free(data);
        free(rows);
        return NULL;
    }

    for (int i = 0; i < N; i++) {
        rows[i] = data + (size_t)i * N;
    }

    for (int i = 0; i < N; i++) {
        for (int j = 0; j < N; j++) {
            rows[i][j] = matrix[j][i];
        }
    }
    return rows;
}
/* Difference stop - start expressed in seconds. */
static double mul_elapsed_seconds(const struct timespec *start,
                                  const struct timespec *stop) {
    long seconds = stop->tv_sec - start->tv_sec;
    long nanoseconds = stop->tv_nsec - start->tv_nsec;
    return seconds + nanoseconds * 1e-9;
}

/**
 * Benchmark harness around the dot product that yields cell (0, 0) of a * b.
 *
 * b is transposed first so that both operands of the dot product are read
 * from contiguous memory. Four loops are timed and each elapsed time is
 * printed to stdout:
 *   1. row 0 of a times row 0 of the transposed b, omp simd with reduction;
 *   2. two N-element dummy vectors of ones, omp simd with reduction;
 *   3. the same dummy vectors, omp simd without a reduction clause;
 *   4. the same dummy vectors, no pragma.
 *
 * Only res[0][0] is written; every other cell of the returned matrix is left
 * uninitialised. The dummy loops accumulate into their own variable so they
 * do not leak into res[0][0].
 *
 * Precondition: a[0] must be 32-byte aligned, because the aligned clause of
 * loop 1 promises that to the compiler. (Row 0 of the transposed b comes
 * straight from aligned_alloc(32, ...) in transpose().)
 *
 * Side effect: the file-scope begin/end timestamps are overwritten with the
 * start and stop times of loop 1.
 *
 * @param a left operand: N row pointers, each addressing N doubles.
 * @param b right operand, same layout.
 * @return row-pointer array of the result, or NULL if a is NULL, the
 *         transpose fails or an allocation fails. Release with
 *         free(res[0]) followed by free(res).
 */
double **mul(double **a, double **b) {
    double **b_t = transpose(b);
    double *res = aligned_alloc(32, (size_t)N * N * sizeof *res);
    double **res_indexed = aligned_alloc(32, (size_t)N * sizeof *res_indexed);
    double *a2 = aligned_alloc(32, (size_t)N * sizeof *a2);
    double *b2 = aligned_alloc(32, (size_t)N * sizeof *b2);

    if (a == NULL || b_t == NULL || res == NULL || res_indexed == NULL ||
        a2 == NULL || b2 == NULL) {
        free(a2);
        free(b2);
        free(res);
        free(res_indexed);
        if (b_t != NULL) {
            free(b_t[0]); /* row 0 is the start of the element buffer */
            free(b_t);
        }
        return NULL;
    }

    for (int i = 0; i < N; i++) {
        res_indexed[i] = res + (size_t)i * N;
    }

    /*
     * Hoist the row pointers: the loop reads a[0][i] and b_t[0][i], so the
     * alignment promise has to be made about the rows themselves, not about
     * the arrays of row pointers.
     */
    const double *a_row = a[0];
    const double *bt_row = b_t[0];
    double cell_res = 0;

    // problematic calculation that I can't get to speed up no matter what pragma I use
    clock_gettime(CLOCK_REALTIME, &begin);
    #pragma omp simd aligned(a_row, bt_row:32) reduction(+:cell_res)
    for (int i = 0; i < N; i++) {
        cell_res += a_row[i] * bt_row[i];
    }
    clock_gettime(CLOCK_REALTIME, &end);
    printf("Time simd reduce measured: %.9f seconds.\n",
           mul_elapsed_seconds(&begin, &end));

    // dummy array measurements
    struct timespec start, stop;
    double dummy_res = 0;
    for (int i = 0; i < N; i++) {
        a2[i] = 1;
        b2[i] = 1;
    }

    // measurement with reduction is significantly faster than others
    clock_gettime(CLOCK_REALTIME, &start);
    #pragma omp simd aligned(a2, b2:32) reduction(+:dummy_res)
    for (int i = 0; i < N; i++) {
        dummy_res += a2[i] * b2[i];
    }
    clock_gettime(CLOCK_REALTIME, &stop);
    printf("time2 (simd reduction): %.9f seconds.\n",
           mul_elapsed_seconds(&start, &stop));

    // no speedup compared to without simd (slightly faster than problematic calculation)
    /*
     * NOTE(review): kept without a reduction clause on purpose, as the
     * baseline of the experiment. Every iteration updates the same scalar,
     * so the sum produced here should not be relied upon; add
     * reduction(+:dummy_res) if the value ever matters.
     */
    clock_gettime(CLOCK_REALTIME, &start);
    #pragma omp simd aligned(a2, b2:32)
    for (int i = 0; i < N; i++) {
        dummy_res += a2[i] * b2[i];
    }
    clock_gettime(CLOCK_REALTIME, &stop);
    printf("time3 (simd): %.9f seconds.\n",
           mul_elapsed_seconds(&start, &stop));

    // no pragma (slightly faster than problematic calculation)
    clock_gettime(CLOCK_REALTIME, &start);
    for (int i = 0; i < N; i++) {
        dummy_res += a2[i] * b2[i];
    }
    clock_gettime(CLOCK_REALTIME, &stop);
    printf("time4: %.9f seconds.\n", mul_elapsed_seconds(&start, &stop));

    /* Consume the dummy sum so the dummy loops are not dead code. */
    volatile double sink = dummy_res;
    (void)sink;

    res_indexed[0][0] = cell_res;

    free(a2);
    free(b2);
    free(b_t[0]); /* row 0 is the start of the element buffer */
    free(b_t);
    return res_indexed;
}
/**
 * Program entry point: builds the two input matrices, runs the benchmark in
 * mul(), then releases every matrix.
 *
 * Each matrix is an array of row pointers whose first entry is the start of
 * the contiguous element buffer, so it is released with free(m[0]) followed
 * by free(m).
 *
 * @return EXIT_SUCCESS when mul() produced a result, EXIT_FAILURE when any
 *         matrix pointer came back NULL.
 */
int main (int argc, char **argv) {
    (void)argc;
    (void)argv;

    //init a(i,j) = i * j
    double **a = create_a();
    //init b(i,j) = (i == j) ? 1:0;
    double **b = create_b();
    double **res = NULL;
    int status = EXIT_FAILURE;

    if (a != NULL && b != NULL) {
        //multiply
        res = mul(a, b);
    }
    if (res != NULL) {
        status = EXIT_SUCCESS;
    } else {
        fprintf(stderr, "matrix allocation failed\n");
    }

    if (res != NULL) {
        free(res[0]);
        free(res);
    }
    if (b != NULL) {
        free(b[0]);
        free(b);
    }
    if (a != NULL) {
        free(a[0]);
        free(a);
    }
    return status;
}
Time simd reduce measured: 0.000004188 seconds. // problematic
time2 (simd reduction): 0.000001762 seconds. // faster
time3 (simd): 0.000003475 seconds. //slightly faster
time4: 0.000003476 seconds. //slightly faster
【问题讨论】:
标签: c performance parallel-processing openmp simd