【发布时间】:2017-04-03 17:05:36
【问题描述】:
我刚开始使用 OpenMP 在 C++ 中进行并行计算。该程序的并行性能很差。由于我不太了解多线程分析工具(不像简单的单线程gprof),所以我编写了一个示例程序来测试性能。
我有一个 2D 矩阵 (N * N),每个元素都有一个 3d 向量 (x, y, z)。我只是做一个双循环来设置矩阵中的每个值:
for (int i = 0; i < N; ++i) {
for (int j = 0; j < N; ++j) {
vectorStack[i][j] = VECTOR3D(1.0*i*i, 1.0*j*j, 1.0*i*j);
}
}
其中VECTOR3D 是一个具有x, y, z 属性的简单类:
class VECTOR3D {
double x, y, z; // component along each axis
}
另一方面,我也可以使用 (N * N * 3) 3D 数组来执行此操作:
for (int i = 0; i < N; ++i) {
for (int j = 0; j < N; ++j) {
arrayHeap[i][j][0] = (1.0*i*i);
arrayHeap[i][j][1] = (1.0*j*j);
arrayHeap[i][j][2] = (1.0*i*j);
}
}
从内存方面来说,也有几种不同的选择,比如使用原始指针手动分配和释放:
double ***arrayHeap;
arrayHeap = new double** [N];
for(int i = 0; i < N; ++i) {
arrayHeap[i] = new double* [N];
for(int j = 0; j < N; ++j) {
arrayHeap[i][j] = new double[3];
}
}
或者干脆使用std::vector:
vector< vector<VECTOR3D>> vectorStack(N, vector<VECTOR3D>(N, VECTOR3D(0, 0, 0)));
我还考虑像LAMMP(Molecular Simulation Package 源代码中那样为数组手动分配连续内存。
所以我的N=10000 的结果列在这里:
对于单线程:
OMP_NUM_THREADS=1 ./a.out
为堆上的数组分配内存...
======= 堆上的数组结果 =======
在时间(总)内完成:0.720385秒
在时间(实际)内完成:0.720463秒
正在为堆上的数组分配内存...
为数组连续分配内存...
======= 数组连续结果 =======
在时间(总)内完成:0.819733秒
在时间(实际)内完成:0.819774秒
为数组连续释放内存...
正在为堆上的向量分配内存...
======= 堆结果向量 =======
在时间(总)内完成:3.08715秒
在时间(实际)内完成:3.08725秒
正在为堆上的向量释放内存...
正在为堆栈上的向量分配内存...
======= 堆栈结果中的向量 =======
在时间(总)内完成:1.49888秒
在时间(实际)内完成:1.49895秒
对于多线程(threads=4):
OMP_NUM_THREADS=4 ./a.out
为堆上的数组分配内存...
======= 堆上的数组结果 =======
在时间(总)内完成:2.29184秒
在时间(实际)内完成:0.577807秒
正在为堆上的数组分配内存...
为数组连续分配内存...
======= 数组连续结果 =======
在时间(总)内完成:1.79501秒
在时间(实际)内完成:0.454139秒
为数组连续释放内存...
正在为堆上的向量分配内存...
======= 堆结果向量 =======
在时间(总)内完成:6.80917秒
在时间(实际)内完成:1.92541秒
正在为堆上的向量释放内存...
正在为堆栈上的向量分配内存...
======= 堆栈结果中的向量 =======
在时间(总)内完成:1.64086秒
在时间(实际)内完成:0.411 秒
整体并行效率不好。没想到,花哨的连续内存分配没有用?!为什么会这样?看来std::vector 足以应付这种情况?
有人可以为我解释一下结果吗?我还需要关于如何改进代码的建议。
非常感谢!!!
附上所有源代码。 (请直接进入main,一开始有几个手动管理内存的功能)。
#include <iostream>
#include <omp.h>
#include <vector>
#include <stdlib.h>
#include <cinttypes>
#include "vector3d.h"
typedef int64_t bigint;
void *smalloc(bigint nbytes, const char *name)
{
if (nbytes == 0) return NULL;
void *ptr = malloc(nbytes);
return ptr;
}
template <typename TYPE>
TYPE ***create(TYPE ***&array, int n1, int n2, int n3, const char *name)
{
bigint nbytes = ((bigint) sizeof(TYPE)) * n1*n2*n3;
TYPE *data = (TYPE *) smalloc(nbytes,name);
nbytes = ((bigint) sizeof(TYPE *)) * n1*n2;
TYPE **plane = (TYPE **) smalloc(nbytes,name);
nbytes = ((bigint) sizeof(TYPE **)) * n1;
array = (TYPE ***) smalloc(nbytes,name);
int i,j;
bigint m;
bigint n = 0;
for (i = 0; i < n1; i++) {
m = ((bigint) i) * n2;
array[i] = &plane[m];
for (j = 0; j < n2; j++) {
plane[m+j] = &data[n];
n += n3;
}
}
return array;
}
template <typename TYPE>
TYPE ***create3d_offset(TYPE ***&array, int n1lo, int n1hi,
int n2, int n3, const char *name)
{
int n1 = n1hi - n1lo + 1;
create(array,n1,n2,n3,name);
array -= n1lo;
return array;
}
void sfree(void *ptr) {
if (ptr == NULL) return;
free(ptr);
}
template <typename TYPE>
void destroy(TYPE ***&array)
{
if (array == NULL) return;
sfree(array[0][0]);
sfree(array[0]);
sfree(array);
array = NULL;
}
template <typename TYPE>
void destroy3d_offset(TYPE ***&array, int offset)
{
if (array == NULL) return;
sfree(&array[offset][0][0]);
sfree(&array[offset][0]);
sfree(&array[offset]);
array = NULL;
}
////////////////////////////////////////////////////////
////////////////////////////////////////////////////////
////////////////////////////////////////////////////////
int main() {
using namespace std;
const int N = 10000;
///////////////////////////////////////
double sum = 0.0;
clock_t t;
double startTime, stopTime, secsElapsed;
printf("\n\nAllocating memory for array on heap...\n");
double ***arrayHeap;
arrayHeap = new double** [N];
for(int i = 0; i < N; ++i) {
arrayHeap[i] = new double* [N];
for(int j = 0; j < N; ++j) {
arrayHeap[i][j] = new double[3];
}
}
printf("======= Array on heap Results =======\n");
sum = 0.0;
t = clock();
startTime = omp_get_wtime();
#pragma omp parallel
{
//#pragma omp for schedule(dynamic)
//#pragma omp for collapse(2)
#pragma omp for
for (int i = 0; i < N; ++i) {
for (int j = 0; j < N; ++j) {
arrayHeap[i][j][0] = (1.0*i*i);
arrayHeap[i][j][1] = (1.0*j*j);
arrayHeap[i][j][2] = (1.0*i*j);
}
}
}
t = clock() - t;
cout << "Finished within time (total): " << ((double) t) / CLOCKS_PER_SEC << " seconds" << endl;
stopTime = omp_get_wtime();
secsElapsed = stopTime - startTime;
cout << "Finished within time (real): " << secsElapsed << " seconds" << endl;
printf("Deallocating memory for array on heap...\n");
for(int i = 0; i < N; ++i) {
for(int j = 0; j < N; ++j) {
delete [] arrayHeap[i][j];
}
delete [] arrayHeap[i];
}
delete [] arrayHeap;
///////////////////////////////////////
printf("\n\nAllocating memory for array continous...\n");
double ***array_continuous;
create3d_offset(array_continuous,0, N, N, 3, "array");
printf("======= Array continuous Results =======\n");
sum = 0.0;
t = clock();
startTime = omp_get_wtime();
#pragma omp parallel
{
//#pragma omp for schedule(dynamic)
//#pragma omp for collapse(2)
#pragma omp for
for (int i = 0; i < N; ++i) {
for (int j = 0; j < N; ++j) {
array_continuous[i][j][0] = (1.0*i*i);
array_continuous[i][j][1] = (1.0*j*j);
array_continuous[i][j][2] = (1.0*i*j);
}
}
}
t = clock() - t;
cout << "Finished within time (total): " << ((double) t) / CLOCKS_PER_SEC << " seconds" << endl;
stopTime = omp_get_wtime();
secsElapsed = stopTime - startTime;
cout << "Finished within time (real): " << secsElapsed << " seconds" << endl;
printf("Deallocating memory for array continuous...\n");
destroy3d_offset(array_continuous, 0);
///////////////////////////////////////k
printf("\n\nAllocating memory for vector on heap...\n");
VECTOR3D ***vectorHeap;
vectorHeap = new VECTOR3D**[N];
for(int i = 0; i < N; ++i) {
vectorHeap[i] = new VECTOR3D* [N];
for(int j = 0; j < N; ++j) {
vectorHeap[i][j] = new VECTOR3D(0,0,0);
}
}
printf("======= Vector on heap Results =======\n");
sum = 0.0;
t = clock();
startTime = omp_get_wtime();
#pragma omp parallel
{
//#pragma omp for schedule(dynamic)
//#pragma omp for collapse(2)
#pragma omp for
for (int i = 0; i < N; ++i) {
for (int j = 0; j < N; ++j) {
vectorHeap[i][j] = new VECTOR3D(1.0*i*i, 1.0*j*j, 1.0*i*j);
}
}
}
t = clock() - t;
cout << "Finished within time (total): " << ((double) t) / CLOCKS_PER_SEC << " seconds" << endl;
stopTime = omp_get_wtime();
secsElapsed = stopTime - startTime;
cout << "Finished within time (real): " << secsElapsed << " seconds" << endl;
printf("Deallocating memory for vector on heap...\n");
for(int i = 0; i < N; ++i) {
for(int j = 0; j < N; ++j) {
delete [] vectorHeap[i][j];
}
delete [] vectorHeap[i];
}
delete [] vectorHeap;
/////////////////////////////////////////////////
printf("\n\nAllocating memory for vector on stack...\n");
vector< vector<VECTOR3D>> vectorStack(N, vector<VECTOR3D>(N, VECTOR3D(0, 0, 0)));
printf("======= Vector on stack Results =======\n");
sum = 0.0;
t = clock();
startTime = omp_get_wtime();
#pragma omp parallel
{
//#pragma omp for schedule(dynamic)
//#pragma omp for collapse(2)
#pragma omp for
for (int i = 0; i < N; ++i) {
for (int j = 0; j < N; ++j) {
vectorStack[i][j] = VECTOR3D(1.0*i*i, 1.0*j*j, 1.0*i*j);
}
}
}
t = clock() - t;
cout << "Finished within time (total): " << ((double) t) / CLOCKS_PER_SEC << " seconds" << endl;
stopTime = omp_get_wtime();
secsElapsed = stopTime - startTime;
cout << "Finished within time (real): " << secsElapsed << " seconds" << endl;
/////////////////////////////////
return 0;
}
还有VECTOR3D 类:
#ifndef _VECTOR3D_H
#define _VECTOR3D_H
#include <iostream>
#include <cmath>
#include <iomanip>
#include <limits>
class VECTOR3D {
public:
double x, y, z; // component along each axis (cartesian)
VECTOR3D(double xx = 0.0, double yy = 0.0, double zz = 0.0) : x(xx), y(yy), z(zz) // make a 3d vector
{
}
}
【问题讨论】:
标签: c++ multithreading performance multidimensional-array openmp