即使把所有代码都放在 main() 中、应用程序没有被编译器优化掉,您仍然可以通过一些简单的改进来提高代码的性能。下面是您的函数和我的改进版本(位于 calculate.c 中):
#include "calculate.h"
#include <omp.h>
#include <math.h>
#define DATA_SIZE 500
#define NUMBER_OF_STEPS 100

/* Input data shared by func() and func2(); filled once by initialize_data(). */
static double a[DATA_SIZE];
static double d_title[DATA_SIZE];
static double b[DATA_SIZE];
static double c[DATA_SIZE];

/* Fill every element of the four static input arrays with fixed test values.
 * Must be called before func()/func2(). */
void initialize_data()
{
    /* NOTE(review): c[] gets 4086 with no decimal point, unlike the other
     * constants -- confirm 4086 (and not e.g. 4.086) is intended. */
    for (int idx = 0; idx < DATA_SIZE; idx++) {
        a[idx] = 0.3291;
        d_title[idx] = 2.414;
        b[idx] = 3.8037;
        c[idx] = 4086;
    }
}
/* Baseline implementation: numerically integrates the triple integral over
 * (mu, nu, z) for every entry of the input arrays and returns the total.
 * Kept as the "before" version for the benchmark comparison with func2().
 * All accumulation goes through a single shared `integral` guarded by
 * `omp atomic`, which serializes the additions across threads -- this is
 * the bottleneck func2() removes with a reduction. */
double func()
{
double nu_start = 0;
double mu_start = 0;
double z_start = 0;
/* Fixed angular step sizes: nu spans [0, 2*pi], mu spans [0, pi]. */
double step_nu = 2 * M_PI / NUMBER_OF_STEPS;
double step_mu = M_PI / NUMBER_OF_STEPS;
double step_z = 0;
double nu = 0;
double mu = 0;
double z = 0;
double integral = 0;
double d_uv = 0;
int i = 0;
int j = 0;
int k = 0;
int loop = 0;
/* `loop` is the parallel loop variable and therefore implicitly private;
 * everything else must be listed explicitly because of default(none). */
#pragma omp parallel for default(none) shared(a, d_title, b, c, nu_start, mu_start, z_start, step_nu, step_mu, integral) private(i, j, k, mu, nu, step_z, z, d_uv)
for (loop = 0; loop < DATA_SIZE; loop++)
{
for (i = 0; i < NUMBER_OF_STEPS; i++)
{
mu = mu_start + (i + 1) * step_mu;
for (j = 0; j < NUMBER_OF_STEPS; j++)
{
nu = nu_start + (j + 1) * step_nu;
for (k = 0; k < DATA_SIZE; k++)
{
/* d_uv depends only on (mu, nu, loop); it is recomputed for every k here
 * (func2() hoists nothing either -- the article notes the compiler already
 * optimizes the repeated sin/cos calls). */
d_uv = (sin(mu) * sin(mu) * cos(nu) * cos(nu) + sin(mu) * sin(mu) * (a[loop] * sin(nu) - d_title[loop] * cos(nu)) * (a[loop] * sin(nu) - d_title[loop] * cos(nu)) + b[loop] * b[loop] * cos(mu) * cos(mu)) / (c[loop] * c[loop]);
/* z step is scaled by d_uv so z covers [0, 20/d_uv]. */
step_z = 20 / (d_uv * DATA_SIZE);
z = z_start + (k + 1) * step_z;
/* Atomic update on every innermost iteration: correct but serializing. */
#pragma omp atomic
integral += sin(mu) * (1 - 3 * sin(mu) * sin(mu) * cos(nu) * cos(nu)) * exp(-d_uv * z) * log(1 + z * z) * step_z * step_mu * step_nu / (c[loop] * c[loop]);
}
}
}
}
return integral;
}
double func2()
{
double integral = 0;
int loop = 0;
int i = 0;
#pragma omp parallel for default(none) shared(a, d_title, b, c) reduction(+: integral) collapse(2)
for (loop = 0; loop < DATA_SIZE; loop++)
{
for (i = 0; i < NUMBER_OF_STEPS; i++)
{
const double mu_start = 0;
const double step_mu = M_PI / NUMBER_OF_STEPS;
const double mu = mu_start + (i + 1) * step_mu;
int j;
for (j = 0; j < NUMBER_OF_STEPS; j++)
{
const double nu_start = 0;
const double step_nu = 2 * M_PI / NUMBER_OF_STEPS;
const double nu = nu_start + (j + 1) * step_nu;
int k;
for (k = 0; k < DATA_SIZE; k++)
{
const double z_start = 0;
const double d_uv = (sin(mu) * sin(mu) * cos(nu) * cos(nu) + sin(mu) * sin(mu) * (a[loop] * sin(nu) - d_title[loop] * cos(nu)) * (a[loop] * sin(nu) - d_title[loop] * cos(nu)) + b[loop] * b[loop] * cos(mu) * cos(mu)) / (c[loop] * c[loop]);
const double step_z = 20 / (d_uv * DATA_SIZE);
const double z = z_start + (k + 1) * step_z;
integral += sin(mu) * (1 - 3 * sin(mu) * sin(mu) * cos(nu) * cos(nu)) * exp(-d_uv * z) * log(1 + z * z) * step_z * step_mu * step_nu / (c[loop] * c[loop]);
}
}
}
}
return integral;
}
标头(calculate.h):
#ifndef CALCULATE_H
#define CALCULATE_H

#ifdef __cplusplus
extern "C"
{
#endif

/* Fill the module's static input arrays with fixed test data.
 * Call once before func()/func2().
 * NOTE: `(void)` gives a real prototype in C -- a bare `()` would declare
 * an unprototyped (K&R) function and disable argument checking. */
void initialize_data(void);

/* Baseline triple-integral computation (per-addition `omp atomic`). */
double func(void);

/* Optimised variant of func() (`omp reduction` + `collapse(2)`);
 * returns the same value. */
double func2(void);

#ifdef __cplusplus
}
#endif

#endif /* CALCULATE_H */
使用Google Benchmark的主应用程序(openmp-performance.cpp):
#include <benchmark/benchmark.h>
#include "calculate.h"
// Measures the original, atomic-based implementation. The result is
// published as a user counter both for display and so the computation
// has an observable side effect the optimizer cannot discard.
static void BM_original_func(benchmark::State& state)
{
    initialize_data();
    for (auto _ : state)
    {
        state.counters["result"] = func();
    }
}
// Measures the reduction-based implementation; same setup and counter
// reporting as BM_original_func so the two timings are comparable.
static void BM_func2(benchmark::State& state)
{
    initialize_data();
    for (auto _ : state)
    {
        state.counters["result"] = func2();
    }
}
// Register both variants; report wall time in seconds (runs take >10s each).
BENCHMARK(BM_original_func)->Unit(benchmark::kSecond);
BENCHMARK(BM_func2)->Unit(benchmark::kSecond);
// Expands to a main() that runs every registered benchmark.
BENCHMARK_MAIN();
我使用 Makefile 构建:
# -fopenmp is needed at both compile and link time; the link rule passes
# $(CXXFLAGS), so it is covered there too.
CXXFLAGS+=-Wall -march=native -g -fopenmp -O2
CFLAGS=$(CXXFLAGS)
LDFLAGS=-lpthread -lbenchmark
TARGET = benchmark

all : $(TARGET)

$(TARGET) : calculate.o openmp-performance.o
	g++ $(CXXFLAGS) $^ -o $@ $(LDFLAGS)

# Both translation units include calculate.h; without this dependency the
# objects would go stale when the header changes.
calculate.o openmp-performance.o : calculate.h

.PHONY : all clean
clean :
	rm -f $(TARGET) *.o
我在 Linux 机器上构建并执行了代码,但您应该得到类似的结果:
[dan@cpp-slave openmp-performance]$ make && ./benchmark
cc -Wall -march=native -g -fopenmp -O2 -c -o calculate.o calculate.c
g++ -Wall -march=native -g -fopenmp -O2 -c -o openmp-performance.o openmp-performance.cpp
g++ -Wall -march=native -g -fopenmp -O2 calculate.o openmp-performance.o -o benchmark -lpthread -lbenchmark
2021-11-12T16:13:01+01:00
Running ./benchmark
Run on (4 X 2394 MHz CPU s)
CPU Caches:
L1 Data 32 KiB (x4)
L1 Instruction 32 KiB (x4)
L2 Unified 4096 KiB (x4)
L3 Unified 16384 KiB (x1)
Load Average: 0.58, 0.52, 0.59
---------------------------------------------------------------------------
Benchmark Time CPU Iterations UserCounters...
---------------------------------------------------------------------------
BM_original_func 104 s 102 s 1 result=46.2432k
BM_func2 27.8 s 26.7 s 1 result=46.2432k
只需稍微更改 OpenMP 声明并将变量向下推一点,我就能显着提高函数的性能:
- 用归约(reduction)替换原子操作(atomic)会产生很大的不同。
- 使用原子操作会导致来自多个线程的所有添加都被序列化。
- 在此处使用归约将最后各个线程的小计相加。
- 预先计算 sin(mu) 和 cos(nu) 没有任何区别,这表明编译器自己发现了优化。