正如评论者所说,这些是所谓的“竞争条件”的症状。
OpenMP 使用的线程彼此独立,但矩阵乘法的各个循环的结果不是,因此一个线程的位置可能与另一个线程不同,如果您依赖于结果的顺序。
你只能并行化最外层的循环:
#include <stdio.h>
#include <stdlib.h>
#include <sys/time.h>
int main(int argc, char **argv)
{
int n;
double **A, **B, **C, **D, t;
int i, j, k;
struct timeval start, stop;
if (argc != 2) {
fprintf(stderr, "Usage: %s a positive integer >= 2 and < 1 mio\n", argv[0]);
exit(EXIT_FAILURE);
}
n = atoi(argv[1]);
if (n <= 2 || n >= 1000000) {
fprintf(stderr, "Usage: %s a positive integer >= 2 and < 1 mio\n", argv[0]);
exit(EXIT_FAILURE);
}
// make it repeatable
srand(0xdeadbeef);
// allocate memory for and initialize A
A = malloc(sizeof(*A) * n);
for (i = 0; i < n; i++) {
A[i] = malloc(sizeof(**A) * n);
for (j = 0; j < n; j++) {
A[i][j] = (double) ((rand() % 100) / 99.);
}
}
// do the same for B
B = malloc(sizeof(*B) * n);
for (i = 0; i < n; i++) {
B[i] = malloc(sizeof(**B) * n);
for (j = 0; j < n; j++) {
B[i][j] = (double) ((rand() % 100) / 99.);
}
}
// and C but initialize with zero
C = malloc(sizeof(*C) * n);
for (i = 0; i < n; i++) {
C[i] = malloc(sizeof(**C) * n);
for (j = 0; j < n; j++) {
C[i][j] = 0.0;
}
}
// ditto with D
D = malloc(sizeof(*D) * n);
for (i = 0; i < n; i++) {
D[i] = malloc(sizeof(**D) * n);
for (j = 0; j < n; j++) {
D[i][j] = 0.0;
}
}
// some coarse timing
gettimeofday(&start, NULL);
// naive matrix multiplication
for (i = 0; i < n; i++) {
for (j = 0; j < n; j++) {
for (k = 0; k < n; k++) {
C[i][j] = C[i][j] + A[i][k] * B[k][j];
}
}
}
gettimeofday(&stop, NULL);
t = ((stop.tv_sec - start.tv_sec) * 1000000u +
stop.tv_usec - start.tv_usec) / 1.e6;
printf("Timing for naive run = %.10g\n", t);
gettimeofday(&start, NULL);
#pragma omp parallel shared(A, B, C) private(i, j, k)
#pragma omp for
for (i = 0; i < n; i++) {
for (j = 0; j < n; j++) {
for (k = 0; k < n; k++) {
D[i][j] = D[i][j] + A[i][k] * B[k][j];
}
}
}
gettimeofday(&stop, NULL);
t = ((stop.tv_sec - start.tv_sec) * 1000000u +
stop.tv_usec - start.tv_usec) / 1.e6;
printf("Timing for parallel run = %.10g\n", t);
// check result
for (i = 0; i < n; i++) {
for (j = 0; j < n; j++) {
if (D[i][j] != C[i][j]) {
printf("Cell %d,%d differs with delta(D_ij-C_ij) = %.20g\n", i, j,
D[i][j] - C[i][j]);
}
}
}
// clean up
for (i = 0; i < n; i++) {
free(A[i]);
free(B[i]);
free(C[i]);
free(D[i]);
}
free(A);
free(B);
free(C);
free(D);
puts("All ok? Bye");
exit(EXIT_SUCCESS);
}
(n>2000 可能需要一些耐心才能得到结果)
但这并不完全正确。您可以(但不应该)尝试使用
之类的方式获取最内层循环
sum = 0.0;
#pragma omp parallel for reduction(+:sum)
for (k = 0; k < n; k++) {
sum += A[i][k] * B[k][j];
}
D[i][j] = sum;
似乎没有更快,小n 甚至更慢。
使用原代码和n = 2500(只运行一次):
Timing for naive run = 124.466307
Timing for parallel run = 44.154538
与减法差不多:
Timing for naive run = 119.586365
Timing for parallel run = 43.288371
使用较小的n = 500
Timing for naive run = 0.444061
Timing for parallel run = 0.150842
缩小到那个尺寸已经变慢了:
Timing for naive run = 0.447894
Timing for parallel run = 0.245481
非常大n可能会赢,但我缺乏必要的耐心。
不过,最后一个 n = 4000(仅限 OpenMP 部分):
正常:
Timing for parallel run = 174.647404
减少:
Timing for parallel run = 179.062463
这种差异仍然完全在误差线之内。
使用 Schönhage-Straßen 算法(在 ca. n>100)是一种更好的乘法矩阵的方法。
哦:我只是为了方便而使用方阵,而不是因为它们必须是那种形式!但是如果你有一个长比很大的矩形矩阵,你可能会尝试改变循环的运行方式;列优先或行优先在这里可以产生重大影响。