使 OpenMP 程序与 Pthreads 一起工作，segfault 错误答案

【问题标题】：Making an OpenMP program work with Pthreads, segfault error使 OpenMP 程序与 Pthreads 一起工作，segfault 错误
【发布时间】：2015-05-03 18:04:27
【问题描述】：

我编写了一个程序，用 C 语言执行高斯消元并返回矩阵的 L2 范数。该程序的名称类似于./exec n k，其中n 是n x n 矩阵的大小，k 是将用于执行该程序的线程数（最多4 个）。我为 n x n+1 矩阵分配空间，因为具有增广矩阵是高斯消除的一部分。

它在 OpenMP 中完美运行。如下面的代码所示，我只有 1 个并行。我现在的目标是使用 Pthreads 而不是 OpenMP 使并行 for 循环同时运行。我将 for 循环并行化为一个单独的函数并创建 pthread 来处理它。我的猜测是，pthread 并不是每个都执行循环的不同部分（基本上是 j 的不同迭代），而是 4 个 Pthread 只是运行整个循环。我像./gauss 30 4 一样运行程序，它有时会工作，有时会出现段错误，尽管当它工作时，L2 规范不是 0（如果程序运行良好，L2 将返回 0），所以线程部分显然有问题。当我通过 GDB 运行它时，由于某种原因，它会在一个循环中出现段错误，但同样的循环在 OpenMP 中运行完美……有人可以帮帮我吗

GDB

http://i.stack.imgur.com/V99yt.png

OpenMP 代码：

 #include <stdlib.h>
    #include <stdio.h>
    #include <math.h>
    #include <omp.h>
    #include <time.h>
    #include <sys/time.h>
    //globals
    double **a, *vect, *bvect, scalar, ratio, sum, delta, *temp;
    int i,j,k,ptr, z;
    int y,z;
    int bvectcount = 0;
    struct timeval start, end;
    // a is matrix, b is vector, x is the solution vector, and n is the size 
    double L2(double **a, double *bvect, double *vect, int matrixSize) {
        double sum;
        double res[matrixSize];
        int i, j;
        for (i=0; i < matrixSize; i++) {
            sum = (double) 0;
            for (j=0; j < matrixSize; j++) {
                sum += a[i][j] * vect[j];
            }
            res[i] = sum;
        }
        for (i=0; i < matrixSize; i++) {
            res[i] -= vect[i];
        }
        double sum_squares = (double) 0;
        for (i=0; i < matrixSize; i++) {
            sum_squares += res[i] * res[i];
        }
        return sqrt(sum_squares);
    }
    int checkargs(int argc, char* argv[]){
        if(argc != 3){
            fprintf(stderr, "Error: Usage is size threadNum\n" );
            exit(1);
        }
    }
    int main(int argc, char* argv[]){
        //check for args
        checkargs(argc, argv);
        int matrixSize = atoi(argv[1]);
        int threadNum = atoi(argv[2]);
        int chunk = matrixSize/threadNum;
        //memory allocation
        a = (double**)malloc(matrixSize*sizeof(double*));
        for(i = 0; i < matrixSize ; i++)
            a[i] = (double*)malloc(matrixSize*sizeof(double) * matrixSize);
        vect = (double*)malloc(matrixSize*sizeof(double));
        bvect = (double*)malloc(matrixSize*sizeof(double));
        temp = (double*)malloc(matrixSize*sizeof(double));
        for(i = 0; i < matrixSize; ++i){
            for(j = 0; j < matrixSize + 1; ++j){
                a[i][j] = drand48(); 
            }
        }   
        j = 0;
        j += matrixSize;
        for(i = 0; i < matrixSize; ++i){
            bvect[i] = a[i][j];
        }
        //generation of scalar matrix (diagonal vector)
        gettimeofday(&start, NULL);
        for(i=0; i<matrixSize; i++){
            scalar = a[i][i];
            //initialization of p to travel throughout matrix
            ptr = i;
            //find largest number in column and row number of it
            for(k = i+1; k < matrixSize; k++){
            if(fabs(scalar) < fabs(a[k][i])){
                //k is row of scalar, while
               scalar = a[k][i];
               ptr = k;
            }
        }
        //swaping the elements of diagonal row and row containing largest no
        for(j = 0; j <= matrixSize; j++){
            temp[0] = a[i][j];
            a[i][j]= a[ptr][j];
            a[ptr][j] = temp[0];
        }
        //calculating triangular matrix
        //threading needs to be done HERE
        ratio = a[i][i];
        for(k = 0; k < matrixSize + 1; k++){
            a[i][k] = a[i][k] / ratio;
        }
        double temp2;
        #pragma omp parallel default(none) num_threads(threadNum)  shared(a,i,matrixSize,vect) private(j,z,ratio,temp2)
        {
        #pragma omp for schedule(static)
        for(j = i + 1; j<matrixSize; j++){
            temp2 = a[j][i]/a[i][i];
            for(z = 0; z<matrixSize + 1; z++){
                a[j][z] = a[j][z] - temp2 * a[i][z];
                }
            }
        }
    }
        //backward substitution method
        for(i=matrixSize-1; i >=0; i--){
            for(k = i; k > 0; k--){
                a[k-1][matrixSize] -= a[k-1][i] * a[i][matrixSize];
                a[k-1][i] -= a[k-1][i] * a[i][i];
            }     
        }
        for(i = 0; i < matrixSize; ++i){
            vect[i] = a[i][matrixSize];
        }
        double l2Norm;
        l2Norm = L2(a, bvect, vect, matrixSize);
        printf("THIS IS L2 NORM: %f\n", l2Norm);
        gettimeofday(&end, NULL);
        delta = ((end.tv_sec  - start.tv_sec) * 1000000u + 
             end.tv_usec - start.tv_usec) / 1.e6;
        printf("end time: %f\n", delta);
    }

Pthreads代码：

#include <stdlib.h>
#include <stdio.h>
#include <math.h>
#include <omp.h>
#include <time.h>
#include <sys/time.h>
#include <pthread.h>

//globals
double **a, *vect, *bvect, scalar, ratio, sum, delta, *temp;
int i,j,k,ptr, z;
int y,z;
int bvectcount = 0;
int threadcount;
pthread_t workerThreads[4];
typedef struct threader {
    int counter;
    int matrixl;
} threader;
struct timeval start, end;
    void *retval;

int checkargs(int argc, char* argv[]);
// a is matrix, b is vector, x is the solution vector, and n is the size 
double L2(double **a, double *bvect, double *vect, int matrixSize) {
    double sum;
    double res[matrixSize];
    int i, j;
    for (i=0; i < matrixSize; i++) {
        sum = (double) 0;
        for (j=0; j < matrixSize; j++) {
            sum += a[i][j] * vect[j];
        }
        res[i] = sum;
    }
    for (i=0; i < matrixSize; i++) {
        res[i] -= vect[i];
    }
    double squaresum = (double) 0;
    for (i=0; i < matrixSize; i++) {
        squaresum += res[i] * res[i];
    }
    return sqrt(squaresum);
}
int checkargs(int argc, char* argv[]){
    if(argc != 3){
        fprintf(stderr, "Error: Usage is size threadNum\n" );
        exit(1);
    }
}

void *parallelstuff(void *args){
    threader temp = *((threader *)args);
    int i, matrixSize;
    i = temp.counter;
    matrixSize = temp.matrixl;
    double temp2;
    for(j = i + 1; j<matrixSize; j++){
        temp2 = a[j][i]/a[i][i];
        for(z = 0; z<matrixSize + 1; z++){
            a[j][z] = a[j][z] - temp2 * a[i][z];
        }
    }
}
int main(int argc, char* argv[]){
    //check for args
    checkargs(argc, argv);
    int matrixSize = atoi(argv[1]);
    int threadNum = atoi(argv[2]);
    //memory allocation
    a = (double**)malloc(matrixSize*sizeof(double*));
    for(i = 0; i < matrixSize ; i++)
        a[i] = (double*)malloc(matrixSize*sizeof(double) * matrixSize);
    vect = (double*)malloc(matrixSize*sizeof(double));
    bvect = (double*)malloc(matrixSize*sizeof(double));
    temp = (double*)malloc(matrixSize*sizeof(double));
    for(i = 0; i < matrixSize; ++i){
        for(j = 0; j < matrixSize + 1; ++j){
            a[i][j] = drand48(); 
        }
    }
    j = 0;
    j += matrixSize;
    for(i = 0; i < matrixSize; ++i){
        bvect[i] = a[i][j];
    }
//generation of scalar matrix (diagonal vector)
    gettimeofday(&start, NULL);
    for(i=0; i<matrixSize; i++){
        scalar = a[i][i];
    //initialization of p to travel throughout matrix
        ptr = i;
    //find largest number in column and row number of it
        for(k = i+1; k < matrixSize; k++){
        if(fabs(scalar) < fabs(a[k][i])){
            //k is row of scalar, while
           scalar = a[k][i];
           ptr = k;
        }
    }
    //swaping the elements of diagonal row and row containing largest no
    for(j = 0; j <= matrixSize; j++)
    {
        temp[0] = a[i][j];
        a[i][j]= a[ptr][j];
        a[ptr][j] = temp[0];
    }
    ratio = a[i][i];
    for(k = 0; k < matrixSize + 1; k++){
        a[i][k] = a[i][k] / ratio;
    }   
    threader stuff;
    stuff.counter = i;
    stuff.matrixl = matrixSize;
    //MAKE EACH THREAD DO SOMETHING DIFF
    // parallelstuff(int i, int matrixSize, double **a){
    for(threadcount = 0; threadcount < threadNum; threadcount++){
        if(pthread_create (&workerThreads[threadcount], NULL, parallelstuff, (void *) &stuff ) != 0){
            fprintf(stderr, "Error: consumer create problem\n");
            exit(1);
            }
        }
    while(threadcount != 0){
        if(pthread_join (workerThreads[threadcount-1], &retval ) != 0){
        fprintf(stderr, "Error: consumer create problem\n");
        exit(1);
        }
        threadcount--;
    }

    //create matrix of n size 
    //backward substitution method
    for(i=matrixSize-1; i >=0; i--){
        for(k = i; k > 0; k--){
            a[k-1][matrixSize] -= a[k-1][i] * a[i][matrixSize];
            a[k-1][i] -= a[k-1][i] * a[i][i];
        }  
    }
    for(i = 0; i < matrixSize; ++i){
        vect[i] = a[i][matrixSize];
        }       
    double l2Norm;
    l2Norm = L2(a, bvect, vect, matrixSize);
    printf("THIS IS L2 NORM: %f\n", l2Norm);
    gettimeofday(&end, NULL);
    delta = ((end.tv_sec  - start.tv_sec) * 1000000u + 
         end.tv_usec - start.tv_usec) / 1.e6;
    printf("end time: %f\n", delta);
}
        }

【问题讨论】：

您正在访问parallelstuff() 中的全局double **a，对它进行读写。我看不到同步。如果这些访问既不是原子的也不是同步的，它们可能会出现竞争条件，从而导致未定义的行为。

标签： c parallel-processing pthreads openmp

【解决方案1】：

请注意，j , z 应在每个线程中声明为本地（私有）变量。
在 OpenMP 代码中，您在第 100 行关闭了 for 循环的括号：

    gettimeofday(&start, NULL);
    for(i=0; i<matrixSize; i++){
        scalar = a[i][i];
        //initialization of p to travel throughout matrix
        .......
        ......
        .....

} //line 100

但在 pthreads 代码中，您在第 149 行将其关闭，因此完整代码：

#include <stdlib.h>
#include <stdio.h>
#include <math.h>
#include <omp.h>
#include <time.h>
#include <sys/time.h>
#include <pthread.h>

//globals
double **a, *vect, *bvect, scalar, ratio, sum, delta, *temp;
int i,j,k,ptr, z;
int y; //z?
int bvectcount = 0;
int threadcount;
pthread_t workerThreads[4];
typedef struct threader {
    int counter;
    int matrixl;
} threader;
struct timeval start, end;
    void *retval;

int checkargs(int argc, char* argv[]);
// a is matrix, b is vector, x is the solution vector, and n is the size 
double L2(double **a, double *bvect, double *vect, int matrixSize) {
    double sum;
    double res[matrixSize];
    int i, j;
    for (i=0; i < matrixSize; i++) {
        sum = (double) 0;
        for (j=0; j < matrixSize; j++) {
            sum += a[i][j] * vect[j];
        }
        res[i] = sum;
    }
    for (i=0; i < matrixSize; i++) {
        res[i] -= vect[i];
    }
    double squaresum = (double) 0;
    for (i=0; i < matrixSize; i++) {
        squaresum += res[i] * res[i];
    }
    return sqrt(squaresum);
}
int checkargs(int argc, char* argv[]){
    if(argc != 3){
        fprintf(stderr, "Error: Usage is size threadNum\n" );
        exit(1);
    }
}

void *parallelstuff(void *args){
    threader temp = *((threader *)args);
    int i, matrixSize;
    i = temp.counter;
    matrixSize = temp.matrixl;
    //printf("matrixSize=%d counter=%d\n" , matrixSize ,temp.counter );
    double temp2;
    int j , z; //houssam
    for(j = i + 1; j<matrixSize; j++){
        temp2 = a[j][i]/a[i][i];
        for(z = 0; z<matrixSize + 1; z++){
            a[j][z] = a[j][z] - temp2 * a[i][z];
        }
    }


}
int main(int argc, char* argv[]){
    //check for args
    checkargs(argc, argv);
    int matrixSize = atoi(argv[1]);
    int threadNum = atoi(argv[2]);
    //memory allocation
    a = (double**)malloc(matrixSize*sizeof(double*));
    for(i = 0; i < matrixSize ; i++)
        a[i] = (double*)malloc(matrixSize*sizeof(double) * matrixSize);
    vect = (double*)malloc(matrixSize*sizeof(double));
    bvect = (double*)malloc(matrixSize*sizeof(double));
    temp = (double*)malloc(matrixSize*sizeof(double));
    for(i = 0; i < matrixSize; ++i){
        for(j = 0; j < matrixSize + 1; ++j){
            a[i][j] = drand48();
        }
    }
    j = 0;
    j += matrixSize;
    for(i = 0; i < matrixSize; ++i){
        bvect[i] = a[i][j];
    }
//generation of scalar matrix (diagonal vector)
    gettimeofday(&start, NULL);
    for(i=0; i<matrixSize; i++){
        scalar = a[i][i];
    //initialization of p to travel throughout matrix
        ptr = i;
    //find largest number in column and row number of it
        for(k = i+1; k < matrixSize; k++){
        if(fabs(scalar) < fabs(a[k][i])){
            //k is row of scalar, while
           scalar = a[k][i];
           ptr = k;
        }
    }
    //swaping the elements of diagonal row and row containing largest no
    for(j = 0; j <= matrixSize; j++)
    {
        temp[0] = a[i][j];
        a[i][j]= a[ptr][j];
        a[ptr][j] = temp[0];
    }
    ratio = a[i][i];
    for(k = 0; k < matrixSize + 1; k++){
        a[i][k] = a[i][k] / ratio;
    }   

    threader  stuff;
    stuff.counter = i;
    stuff.matrixl = matrixSize;
    //printf("i=%d\n" , i);
    //MAKE EACH THREAD DO SOMETHING DIFF
    // parallelstuff(int i, int matrixSize, double **a){
    for(threadcount = 0; threadcount < threadNum; threadcount++){
        if(pthread_create (&workerThreads[threadcount], NULL, parallelstuff, (void *) &stuff ) != 0){
            fprintf(stderr, "Error: consumer create problem\n");
            exit(1);
            }
        }

    while(threadcount != 0){
        if(pthread_join (workerThreads[threadcount-1], &retval ) != 0){
        fprintf(stderr, "Error: consumer create problem\n");
        exit(1);
        }
        threadcount--;
    }
}
    //create matrix of n size 
    //backward substitution method
    for(i=matrixSize-1; i >=0; i--){
        for(k = i; k > 0; k--){
            a[k-1][matrixSize] -= a[k-1][i] * a[i][matrixSize];
            a[k-1][i] -= a[k-1][i] * a[i][i];
        }  
    }
    for(i = 0; i < matrixSize; ++i){
        vect[i] = a[i][matrixSize];
        }       
    double l2Norm;
    l2Norm = L2(a, bvect, vect, matrixSize);
    printf("THIS IS L2 NORM: %f\n", l2Norm);
    gettimeofday(&end, NULL);
    delta = ((end.tv_sec  - start.tv_sec) * 1000000u + 
         end.tv_usec - start.tv_usec) / 1.e6;
    printf("end time: %f\n", delta);

}

【讨论】：

【解决方案2】：

所写的两个代码是不等价的。观察 OpenMP 代码：

#pragma omp for schedule(static)
for(j = i + 1; j<matrixSize; j++){
    temp2 = a[j][i]/a[i][i];
    for(z = 0; z<matrixSize + 1; z++){
        a[j][z] = a[j][z] - temp2 * a[i][z];
    }
}

OpenMP 中组合的parallel for 构造是一个工作共享构造，即它在团队中的线程之间分配以下循环中的迭代。给定schedule(static) 子句，迭代空间被分成#threads 块，每个块分配给不同的线程。

您的 Pthreads 代码不共享工作：

i = temp.counter;
matrixSize = temp.matrixl;
...
for(j = i + 1; j<matrixSize; j++){
    temp2 = a[j][i]/a[i][i];
    for(z = 0; z<matrixSize + 1; z++){
        a[j][z] = a[j][z] - temp2 * a[i][z];
    }
}

假设将相同的stuff 对象传递给所有线程，它们都会收到相同的i 和matrixSize 值并循环整个迭代空间，因此会出现错误的结果。

你要做的是模拟#pragma omp for schedule(static) 所做的事情，即让每个线程只做一些matrixSize - (i+1) + 1 迭代。您应该向每个线程传递一个包含开始和结束迭代的唯一数据对象：

typedef struct threader {
    int start;
    int end;
    int i;
    int matrixSize;
} threader;

...

void *parallelstuff(void *args){
    threader *temp = (threader *)args;
    int start, end, i, matrixSize;
    start = temp->start;
    end = temp->end;
    i = temp->i;
    matrixSize = temp->matrixSize;
    double temp2;
    int j , z; //houssam
    for(j = start + 1; j<end; j++){
        temp2 = a[j][i]/a[i][i];
        for(z = 0; z<matrixSize + 1; z++){
            a[j][z] = a[j][z] - temp2 * a[i][z];
        }
    }
}

...

threader stuff[threadNum];
//MAKE EACH THREAD DO SOMETHING DIFF
// parallelstuff(int i, int matrixSize, double **a){
for(threadcount = 0; threadcount < threadNum; threadcount++){
    stuff[threadcount].start = i + threadcount*(matrixSize / threadNum);
    stuff[threadcount].end = i + (threadcount+1)*(matrixSize / threadNum);
    stuff[threadcount].i = i;
    stuff[threadcount].matrixSize = matrixSize;
    if(pthread_create (&workerThreads[threadcount], NULL, parallelstuff, (void *) &stuff ) != 0){
        fprintf(stderr, "Error: consumer create problem\n");
        exit(1);
    }
}

理论上，您也可以让每个线程知道有多少线程，并让它自己计算迭代范围，但由于 Pthreads API 缺少 omp_get_thread_num() 的等价物，这就变得复杂了。有一个高级技巧使用对齐的内存分配和在传递的指针的最后一位中编码的数字线程 ID。

【讨论】：