我想我设法获得了一个迭代次数阈值的下限:当循环迭代次数低于该阈值时,串行执行优于并行执行。但是,我希望您对此发表评论,看看是否有更好的方法来估计这个折衷点——即循环体迭代次数低于多少时,使用并行 for 区域就不再有收益。(非常感谢)
让我们说:
N:是并行区域中运行循环的线程数。
R:是打开和关闭并行区域的成本(为了简单起见,以循环迭代次数而不是纳秒来衡量。)
S:是权衡点的迭代次数,即在该迭代次数下,串行执行所需的时间与并行执行(包括并行区域开销)所需的时间恰好相等;迭代次数大于 S 时并行才有收益。
所以基本上我们正在寻找 S 使得:并行执行时间 = 串行执行时间。这将是:R + S/N = S。
当求解 S 时:S = R x N / (N-1)。所以我用下面的代码测试了这个想法:
我在哪里执行以下步骤:
- 第 78-96 行:测量创建/分叉和最终确定/加入并行区域的成本。
- 第 99-111 行:测量循环的一次迭代的成本。
- 第 113-116 行:使用上面讨论的公式获得权衡。
- 第 119 行:我运行测试只是为了比较并行执行与串行执行。
#include <chrono>
#include <cmath>
#include <iostream>
// #include <ctime>
#include <omp.h>
// Element type of the test arrays; switch to the float pair below to test float.
// #define dtype float
// #define NROWS 1350 // float
#define dtype double
#define NUMEL 1000 // double
// Number of times the (almost empty) parallel region is entered while
// measuring the fork/join overhead R.
#define NPAR_REGION 100000
// Iteration count used when timing one serial pass over the loop body.
#define TEST_REP 1000
// Outer repetition count for the serial-vs-parallel comparison run.
#define NREP 1000
// Debug helpers: print an expression together with its source text, and
// print the current source line number.
#define printvar(x) std::cout << #x << " = " << x << std::endl;
#define printline std::cout << "LINE: " << __LINE__ << std::endl;
// #pragma omp declare simd
/// The loop body under test: sqrt(x*sin(x)/(1+y)).
/// @param x first operand (also the argument of sin)
/// @param y second operand; y == -1 divides by zero — the callers in this
///          file pass non-negative data, so it is not guarded here
/// @return sqrt(x*sin(x)/(1+y)); NaN when the quotient is negative
template <typename T>
T loop_body_func(T x, T y){
    // std:: qualification selects the overload matching T (float vs double);
    // the original unqualified calls bound to the C double-precision
    // functions pulled in transitively, silently promoting float inputs.
    return std::sqrt(x*std::sin(x)/(1+y));
}
using namespace std;
/// Runs nrep repetitions of the numel-element test loop twice — once with an
/// OpenMP parallel-for, once serially — and prints both wall-clock times (ns).
/// @param numel elements per inner loop (the computed tradeoff point S)
/// @param nrep  outer repetition count
/// @return 0 on success, 1 if an allocation failed
int compare_seri_vs_para(int numel, int nrep){
    dtype *m = (dtype*) malloc(sizeof(dtype)*numel) ;
    dtype *n = (dtype*) malloc(sizeof(dtype)*numel) ;
    dtype *mn = (dtype*) malloc(sizeof(dtype)*numel) ;
    if (!m || !n || !mn){
        free(m); free(n); free(mn);
        return 1;
    }
    // Initialize the inputs. The original timed loops read uninitialized
    // memory — undefined behavior, and sqrt of garbage can produce NaNs,
    // which are handled much more slowly on some CPUs and skew the timings.
    for(int i = 0; i < numel; i++){
        m[i] = (dtype)(i % 97);
        n[i] = (dtype)(i % 89);
    }
    // --- Parallel run ------------------------------------------------------
    chrono::steady_clock::time_point st = chrono::steady_clock::now();
#pragma noparallel
    for(int k = 0; k < nrep; k++){
#pragma omp parallel for if (true)
#pragma novector
        for(int i = 0; i < numel; i++){
            mn[i] = loop_body_func(m[i], n[i]);
        }
    }
    chrono::steady_clock::time_point en = chrono::steady_clock::now();
    std::cout << "parallel ellapsed time: \t" <<
        chrono::duration_cast<chrono::nanoseconds>(en - st).count() << std::endl;
    // --- Serial run --------------------------------------------------------
    st = chrono::steady_clock::now();
#pragma noparallel
    for(int k = 0; k < nrep; k++){
#pragma noparallel
#pragma novector
        for(int i = 0; i < numel; i++){
            mn[i] = loop_body_func(m[i], n[i]);
        }
    }
    en = chrono::steady_clock::now();
    std::cout << "serial ellapsed time: \t\t" <<
        chrono::duration_cast<chrono::nanoseconds>(en - st).count() << std::endl;
    free(m); free(n); free(mn);
    return 0;
}
int main(){
printline
int nthread = omp_get_max_threads ();
printvar(nthread)
// chrono::steady_clock::time_point *thread_cost =
// (chrono::steady_clock::time_point *) calloc(nthread, sizeof(chrono::steady_clock::time_point));
double *thread_cost = (double*) calloc(nthread, sizeof(double));
chrono::steady_clock::time_point sc = chrono::steady_clock::now();
#pragma noparallel
#pragma novector
for(int i = 0; i < NPAR_REGION; i++){
#pragma omp parallel
{
int tid = omp_get_thread_num();
thread_cost[tid] += tid; // a dummy task
// chrono::duration_cast<chrono::nanoseconds>(chrono::steady_clock::now() - sc).count()
}
}
printline
double paralell_region_cost = chrono::duration_cast<chrono::nanoseconds>(chrono::steady_clock::now() - sc).count();
// for(int i = 0; i < nthread; i++)
// paralell_region_cost+= thread_cost[i];
paralell_region_cost /= NPAR_REGION;
printvar(paralell_region_cost)
printline
dtype *m = (dtype*) malloc(sizeof(dtype)*TEST_REP) ;
dtype *n = (dtype*) malloc(sizeof(dtype)*TEST_REP) ;
dtype *mn = (dtype*) malloc(sizeof(dtype)*TEST_REP) ;
sc = chrono::steady_clock::now();
#pragma noparallel
#pragma novector
for(int i = 0; i < TEST_REP; i++){
mn[i] = loop_body_func(m[i], n[i]);
}
double loop_body_cost = \
chrono::duration_cast<chrono::nanoseconds>(chrono::steady_clock::now() - sc).count()/TEST_REP;
printvar(loop_body_cost)
printline
int loop_parallel_serial_tradeoff = (paralell_region_cost / loop_body_cost)*(nthread/(double(nthread)-1.0));
printline
printvar(loop_parallel_serial_tradeoff)
printline
compare_seri_vs_para(loop_parallel_serial_tradeoff, NREP);
free(m); free(n); free(mn);
printvar("-=program ended!=-")
}
我使用以下命令行编译了我的代码 sn-p:
icl.exe -Qopt-report:5 -Qopt-report-phase:all /Ob0 -Qopenmp -Qsimd -Qopenmp-simd -arch:avx2 -Qdiag-error-limit:5 -c omp_if_test.cpp -Fo:omp_if_test.obj
Intel(R) C++ Intel(R) 64 Compiler for applications running on Intel(R) 64, Version 19.1.2.254 Build 20200623
Copyright (C) 1985-2020 Intel Corporation. All rights reserved.
icl: remark #10397: optimization reports are generated in *.optrpt files in the output location
omp_if_test.cpp
xilink.exe omp_if_test.obj -LIBPATH:../../Debug/lib -out:omp_if_test.exe
xilink: executing 'link'
Microsoft (R) Incremental Linker Version 14.28.29913.0
Copyright (C) Microsoft Corporation. All rights reserved.
omp_if_test.obj
-LIBPATH:../../Debug/lib
-out:omp_if_test.exe
最后,这里的输出有点奇怪,因为串行执行似乎总是优于并行执行几乎 3 倍。 (lol) (我期待一个波动的情况)。
C:\simdtests>omp_if_test.exe
LINE: 72
nthread = 8
LINE: 90
paralell_region_cost = 2053.75
LINE: 97
loop_body_cost = 32
LINE: 113
LINE: 115
loop_parallel_serial_tradeoff = 73
LINE: 118
parallel ellapsed time: 3392300
serial ellapsed time: 1141000
"-=program ended!=-" = -=program ended!=-
C:\simdtests>omp_if_test.exe
LINE: 72
nthread = 8
LINE: 90
paralell_region_cost = 2304
LINE: 97
loop_body_cost = 33
LINE: 113
LINE: 115
loop_parallel_serial_tradeoff = 79
LINE: 118
parallel ellapsed time: 3275000
serial ellapsed time: 1216000
"-=program ended!=-" = -=program ended!=-
C:\simdtests>omp_if_test.exe
LINE: 72
nthread = 8
LINE: 90
paralell_region_cost = 2132.9
LINE: 97
loop_body_cost = 32
LINE: 113
LINE: 115
loop_parallel_serial_tradeoff = 76
LINE: 118
parallel ellapsed time: 3337900
serial ellapsed time: 1123100
"-=program ended!=-" = -=program ended!=-
C:\simdtests>omp_if_test.exe
LINE: 72
nthread = 8
LINE: 90
paralell_region_cost = 2062.77
LINE: 97
loop_body_cost = 32
LINE: 113
LINE: 115
loop_parallel_serial_tradeoff = 73
LINE: 118
parallel ellapsed time: 3739700
serial ellapsed time: 1118400
"-=program ended!=-" = -=program ended!=-
C:\simdtests>omp_if_test.exe
LINE: 72
nthread = 8
LINE: 90
paralell_region_cost = 2121.74
LINE: 97
loop_body_cost = 91
LINE: 113
LINE: 115
loop_parallel_serial_tradeoff = 26
LINE: 118
parallel ellapsed time: 5627200
serial ellapsed time: 356300
"-=program ended!=-" = -=program ended!=-
所以看起来我获得了权衡值的下限,而不是实际值。
如果您能对此发表评论,我们将不胜感激。
问候