OpenMP 并行计算素数答案

【问题标题】：OpenMP Parallel Computing Prime NumbersOpenMP 并行计算素数
【发布时间】：2018-05-25 16:14:48
【问题描述】：

以下是计算素数的简单代码：

#define END   100000000

// Only pass odd values to this function
int is_prime(uint32_t v)
{
    uint32_t end = sqrt(v);
    for (uint32_t i = 3; i <= end; i += 2) {
        if ((v % i) == 0) {
            return 0;
        }
    }
    return 1;
}

int main(int argc, char **argv)
{
    // We'll grab 2 as it's the only even prime
    int prime_count = 1;

    uint32_t bracket = 10;

    #pragma omp parallel for num_threads(4)
    for (uint32_t i = 3; i < END; i += 2) {
        if (i > bracket) {
            printf("%12d\t%12d\n", bracket, prime_count);
            bracket *= 10;
        }
        if (is_prime(i)) {
            prime_count++;
        }
    }
    printf("%12d\t%12d\n", bracket, prime_count);
    return 0;
}

我们的任务是使用 OpenMP pragma 来加快计算速度，因为循环遍历 100,000,000 个整数需要很长时间。我尝试将#pragma 语句放在我粘贴的代码中我当前拥有的位置，当我运行代码时，我得到以下输出：

        10              4
        10              1
        10              4
        10              4
       100             25
      1000             25
     10000             25
    100000             25
  10000000            262
1410065408        5737822

显然认为这是不正确的，我尝试在 is_prime() 函数中的 for 循环上方放置一个#pragma，但我得到编译器错误提示

错误：“到/从 OpenMP 结构化块的无效分支”。

我意识到这个错误来自可能进入 if 语句并在并行操作中间返回 0 的线程。线程提前终止，整个操作分开。我觉得将#pragma 放在函数调用中是最合适的，因为它可以将每次调用函数的运行时间减少v/4。但是，我不知道执行此操作的正确语法。如果有人可以请帮助我，我将不胜感激。谢谢

【问题讨论】：

标签： c multithreading parallel-processing openmp

【解决方案1】：

user3666197 为您提供了如何手动执行此操作的示例。这很好理解，但您不需要自己做——相反，您应该使用 OpenMP 提供的抽象——这是一种简化。减少将为每个线程提供一个私有变量，并且在并行计算之后，将它们一起添加到原始变量中。没有中间输出的正确示例是：

int prime_count = 1;

#pragma omp parallel for reduction(+:prime_count)
for (uint32_t i = 3; i < END; i += 2) {
    if (is_prime(i)) {
        prime_count++;
    }
}
printf("%12d\t%12d\n", 0, prime_count);

并行再现中间输出。我宁愿让所有线程都在一个 bracket 上工作，然后一个报告汇总结果。这仍然可以通过减少轻松实现，并为您提供每个块中有多少素数的正确汇总信息：

int prime_count = 1;

#pragma omp parallel
for (uint32_t bracket = 10; bracket <= END; bracket *= 10)
{
    uint32_t start = bracket / 10;
    if (start == 1) start = 3;
    else start++;
    #pragma omp for reduction(+:prime_count)
    for (uint32_t i = start; i < bracket; i += 2) {
        if (is_prime(i)) {
            prime_count++;
        }
    }
    #pragma omp single
    printf("%12d\t%12d %p\n", bracket, prime_count);
}
printf("%12d\t%12d\n", 0, prime_count);

我将for 和parallel 分开，因为一次又一次地重新打开并行区域的成本很高。

【讨论】：

【解决方案2】：

不要寻求语法上的改变，
需要改变思维的概念——它是并行发生的

鉴于上面已指示num_threads( 4 )，接下来的操作将在其他三个之外发生。这意味着如果一个线程进入prime_count++ 并获取prime_count 的“当前”值，以便添加+1，作为指令命令，则其他线程在您的代码中无法等待，在第一个之前接受者完成++ 并存储“新”值（应该由其他接受者使用）。在没有任何协调工具的情况下，不受控制的添加乍一看还不错，但是一旦您意识到，您有很多次错误的“初始”值（在很多情况下可能同时也是 ++-由任何其他线程编辑），所以最终结果只是一场灾难。

显式协调和/或共享变量是可能的，但时间成本很高。

[END] PRIME COUNT was omp.FOUND ==    664579 <= 10000000
[END] PRIME COUNT was omp.FOUND ==   1270607 <= 20000000
[END] PRIME COUNT was omp.FOUND ==   1857859 <= 30000000

INF(t:  0)|||: _id
|||(t:  0)|||:          10                1
|||(t:  0)|||:         100                7
|||(t:  0)|||:        1000               44
|||(t:  0)|||:       10000              311
INF(t:  3)|||: _id             
|||(t:  3)|||:          10                0
|||(t:  3)|||:         100                5
|||(t:  3)|||:        1000               37
|||(t:  3)|||:       10000              295
INF(t:  2)|||: _id             
|||(t:  2)|||:          10                1
|||(t:  2)|||:         100                6
|||(t:  2)|||:        1000               43
|||(t:  2)|||:       10000              308
INF(t:  1)|||: _id             
|||(t:  1)|||:          10                1
|||(t:  1)|||:         100                6
|||(t:  1)|||:        1000               43
|||(t:  1)|||:       10000              314
|||(t:  0)|||:      100000             2409
|||(t:  1)|||:      100000             2399
|||(t:  3)|||:      100000             2384
|||(t:  2)|||:      100000             2399
|||(t:  2)|||:     1000000            19669
|||(t:  3)|||:     1000000            19552
|||(t:  1)|||:     1000000            19623
|||(t:  0)|||:     1000000            19653
|||(t:  2)|||:    10000000           166237
|||(t:  0)|||:    10000000           166161
|||(t:  1)|||:    10000000           166204
|||(t:  3)|||:    10000000           165976
:--(t:  0):--:   100000000           464549
:--(t:  1):--:   100000000           464491
:--(t:  2):--:   100000000           464530
:--(t:  3):--:   100000000           464288
[END] PRIME COUNT was omp.FOUND ==  1857859 <= 30000000

生成正确结果后，性能很重要：

更好的做法是主要避免所有级别的读取/写入冲突（并最好避免错误对齐和缓存效率低下）：

_{希望你也喜欢扩展的语法装饰，借用true-[PARALLEL] languageoccam-pi，以强调线程对齐的代码执行）}

#include<stdio.h>
#include<math.h>
#include<omp.h>

#define END   10000000

int is_prime( long long v )  // Only pass odd values to this function
{             long long end = sqrt( (double) v );
    for (     long long i  = 3;
                        i <= end;
                        i += 2 ) {
        if (      ( v % i ) == 0 ) return 0;
    }
    return 1;
}

int main( int argc, char **argv )
{
#define                                    OMP_NUM_OF_THREADS 4   // CONFIG:
                                                                  //   [SEQ]
    __uint64_t global_per_thread_bracket[  OMP_NUM_OF_THREADS];   //   :--
    __uint64_t global_per_thread_sub_count[OMP_NUM_OF_THREADS];   //   :--
    #pragma omp parallel num_threads(      OMP_NUM_OF_THREADS )   //   :-- [PAR]x4--------------------------||||-4-threads
    {                                                             //         {|...}                         ||||-4-threads
           int        thread_local_ID      = omp_get_thread_num();//         {|...}     // SET: _id         ||||-4-threads
           int        thread_local_COUNT   =  0;                  //         {|...}     // SET: SAFE ZERO   ||||-4-threads
           __uint64_t thread_local_BRACKET = 10;                  //         {|...}     // SET: SAFE   10   ||||-4-threads
                                                                  //         {|...}     //                  ||||-4-threads
           printf( "INF(t:%3d)|||: _id\n",                       thread_local_ID );     // PRINT:?(ORD)     ||||-4-threads
                                                                  //         {|...}     //                  ||||-4-threads
           for ( long long   i  = ( 3 +                    ( 2 * thread_local_ID ) );   // CALC:---------// ||||-4-threads
                             i <  END;                            //         {|...}     //               // ||||-4-threads
                             i += ( 2 * OMP_NUM_OF_THREADS )      //         {|...}     //               // ||||-4-threads
                             )                                    //         {|...}     //               // ||||-4-threads
           {        if (     i >  thread_local_BRACKET )          //         {|...}     // IF:local      // ||||-4-threads
                    {  printf( "|||(t:%3d)|||:%12d\t%12d\n",     thread_local_ID,       // PRINT:?(ORD)  // ||||-4-threads
                                (int)thread_local_BRACKET,        //         {|...}     //    local      // ||||-4-threads
                                (int)thread_local_COUNT           //         {|...}     //    local      // ||||-4-threads
                                );                                //         {|...}     //               // ||||-4-threads
                       thread_local_BRACKET *= (__uint64_t)10;    //         {|...}     // UPDATE:*=10   // ||||-4-threads
                    }                                             //         {|...}     //               // ||||-4-threads
                    if ( is_prime(i) ) thread_local_COUNT++;      //         {|...}     // IF:local++    // ||||-4-threads
           }  // ------------------------------------------------------------{|...}----------------------// ||||-4-threads
           global_per_thread_sub_count[                          thread_local_ID] = thread_local_COUNT;  // ||||-4-threads
           global_per_thread_bracket[                            thread_local_ID] = thread_local_BRACKET;// ||||-4-threads
    } // ---------------------------------------------------------//_________{|___}_________________________||||-4-threads
    long long prime_count = 1;                                    //   :--  SET:=1 as 2 is prime
    for ( int thread_OrdNo  = 0;                                  //   :--  CONSOLIDATE: -----//
              thread_OrdNo <  OMP_NUM_OF_THREADS;                 //   :--                    //
              thread_OrdNo++                                      //   :--                    //
              )                                                   //   :--                    //
    {   prime_count+= global_per_thread_sub_count[thread_OrdNo];  //   :--  UPDATE: +=        //
        printf( ":--(t:%3d):--:%12d\t%12d\n",(int)thread_OrdNo,   //   :--  PRINT:            //
                 (int)global_per_thread_bracket[  thread_OrdNo],  //   :--                    //
                 (int)global_per_thread_sub_count[thread_OrdNo]   //   :--                    //
                 );                                               //   :--                    //
    } // --------------------------------------------------------------:-- -------------------//
    printf( "[END] PRIME COUNT was omp.FOUND == %9lld <= %lld\n", //   :--
             prime_count,                                         //   :--
             END                                                  //   :--
             );                                                   //   :--
}

Feel free to further experiment with the code Live / on-line.

【讨论】：

感谢您非常详细的回复！这清楚了很多！
我注意到您的代码运行速度比固定的原始代码慢得多。有趣的是，is_prime 的性能似乎在很大程度上取决于数据类型：int64_t << uint64_t << uint32_t。

【解决方案3】：

这种方法比修正后的 OP 快 5 倍。它首先生成一个素数数组，直到平方根，单线程。然后多个线程只从该数组中读取。

/* Prime numbers by trial division, with parallel 2nd step */
/* Trick is to make sure the SQR array has top prime > SQR to make the for loop work until the end / the last p[i]
   Lowest MAX is 4 */
/* Timing: 0.13s for up to 10M (4 cores, turbo boost 3GHz) */

/* single threaded, array filling step only: timing:
./a.out 1000000000000
top: 78497 1000003 1000006000009
-> 0.04s for 78K primes up to 1M, ready for 1T (would take hours/years) 
-> does not matter overall
*/

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <math.h>
#include <omp.h>

/* Fill p[] with primes up to 'max', and one more */
int
primearray(int max, int p[]) {

    p[0] = 3;
    int top = 0, n, i;

    for (n = 5;; n += 2)
        for (i = 0;; i++)

            if (n % p[i] == 0)
                break;
            else
                if (p[i]*p[i] > n) {

                    p[++top] = n;
                    if (n > max)
                        return top;          // return WITH last n
                    break;
                }
}
/* Use primes in p[] to check up to 'max' */
/* "5" is p[1] is the 3rd prime, so start count from 2 */
int
primecount_omp(long max, int p[]) {

    int n, i, sum = 2;

    #pragma omp parallel for private(i) schedule(guided) reduction(+:sum)

    for (n = 5; n <= max; n += 2)
        for (i = 0;; i++)

            if (n % p[i] == 0)
                break;
            else
                if (p[i]*p[i] > n) {
                    sum++;
                    break;
                }
    return sum;
}
/* Call primearray() with SQR and then primecount_omp() with MAX */
/* PNT (x/logx) to find approx. size of array p[] 'psize'. Needs some safety margin. */
int
main(int argc, char** argv) {

    if (argc < 2) {
        printf("No limit given\n");
        return 1;
    }
    long MAX = strtoul(argv[1], 0, 10);
    int  SQR = sqrt(MAX);

    int   psize = 50 + 1.2 * SQR / log(SQR);
    int      *p = malloc(psize * sizeof*p);

    int top = primearray(SQR, p);

    printf("psize: %d top: %d %d %ld\n", psize, top, p[top], (long)p[top]*p[top]);

    int sum = primecount_omp(MAX, p);

    printf("Count: %d (%.2f%%)\n", sum, (double) 100*sum / MAX) ;
    double lnx = MAX / log(MAX);
    printf("LnX:   %.2f (%f)\n",  lnx, sum / lnx);

    return 0;
}

这两个功能相似，但不完全相同。第一个有：

   p[++top] = n;
   if (n > max)
       return top;
   break;

这比第二个更复杂，并行的：

   sum++;
   break;

虽然sum++ 可以通过reduction(+:sum) 子句完成，但++top 必须在线程之间进行协调。

我相信这是最快的简单试除法方法。它有一个数组、两个步骤和 OMP，但仍然很简单。

输出是：

psize: 520 top: 445 3163 10004569
Count: 664579 (6.65%)
LnX:   620420.69 (1.071175)

表示分配了 520 个整数，使用了 445 个：3163 是所需的最大素数，最大的平方为 10M。

确切的（分配的）数组不是那么重要，只要足够大。几千块会让你走得很远。

我喜欢这个，因为它实现了递归的想法：

要找到不超过 N 的素数，首先要找到不超过 SQRT(N) 的素数

只有一个“递归”（正式地 w/o），并提供了轻松并行化第二个重要步骤的机会。

如果它是一步完成的，那么要么您需要额外的测试，要么它继续将数组填充到最后。

如果它真的是递归完成的，你可以sqrt降低到 3 或 2，然后使用短数组来制作更长的数组，所有这些都以相同的方式。

【讨论】：

【解决方案4】：

所有线程可以同时传入if块，修改bracket不一致

pragma "#pragma omp critical" 可以防止这种情况：

#define END   100000000

// Only pass odd values to this function
int is_prime(uint32_t v)
{
    uint32_t end = sqrt(v);
    for (uint32_t i = 3; i <= end; i += 2) {
        if ((v % i) == 0) {
            return 0;
        }
    }
    return 1;
}

int main(int argc, char **argv)
{
    // We'll grab 2 as it's the only even prime
    int prime_count = 1;

    uint32_t bracket = 10;

    #pragma omp parallel for num_threads(4)
    for (uint32_t i = 3; i < END; i += 2) {
        #pragma omp critical
        {
            if (i > bracket) {
                printf("%12d\t%12d\n", bracket, prime_count);
                bracket *= 10;
            }
        }
        if (is_prime(i)) {
            prime_count++;
        }
    }
    printf("%12d\t%12d\n", bracket, prime_count);
    return 0;
}

上面的代码不能完全回答你的问题：我想你想知道有多少素数在 10、100、1000 之下...

但处理for 的openmp 方式如下：它将循环切入4：

the first from 0 to N/4,
the second from N/4 to N/2,
the first from N/2 to 3*N/4,
the second from 3*N/4 to N.

要知道素数是如何分布的，你应该有一些不同的方法：

int main(int argc, char **argv)
{

    /* number_of_prime[0] is the number of primes between   0 and 10 
       number_of_prime[1] is the number of primes between 10 and 100 
       ...  */
    int number_of_prime[8] = {1, 0};

    uint32_t bracket = 10;

    #pragma omp parallel for num_threads(4)
    for (uint32_t i = 3; i < END; i += 2) {            
        if (is_prime(i)) {
            /* here, i is a prime */
            if (i > 10000000)
            {
                #pragma omp critical
                number_of_prime[7]++;
            }
            else if (i > 1000000)
            {
                #pragma omp critical
                number_of_prime[6]++;
            }
            else if (i > 100000)
            {
                #pragma omp critical
                number_of_prime[5]++;
            }
            else if (i > 10000)
            {
                #pragma omp critical
                number_of_prime[4]++;
            }
            else if (i > 1000)
            {
                #pragma omp critical
                number_of_prime[3]++;
            }
            else if (i > 100)
            {
                #pragma omp critical
                number_of_prime[2]++;
            }
            else if (i > 10)
            {
                #pragma omp critical
                number_of_prime[1]++;
            }
            else 
            {
                #pragma omp critical
                number_of_prime[0]++;
            }                            
        }            
    }

    /* add some printf hero to display results */

    return 0;
}

编辑：

正如祖蓝所说，我至少应该用过atomic instead of critical

【讨论】：

prime_count 在您的示例中仍然不正确地递增（竞争条件）。使用omp critical 来增加计数器是一种性能。请改用reduction，或至少使用atomic。
1.使用减少计数，2.括号更新可能是错误的，但至少是不优雅的：将括号循环放在并行循环之外的因素。

不要寻求语法上的改变，需要改变思维的概念——它是并行发生的

生成正确结果后，性能很重要：

不要寻求语法上的改变，
需要改变思维的概念——它是并行发生的