【发布时间】:2017-05-10 13:10:52
【问题描述】:
为什么这个普通数组实现比 std::vector 实现性能慢?
由于我在手头的工作中看到了一些奇怪的结果,我决定编写一个简化的测试,来比较 std::vector 和普通数组的效率。
我有一个以两种方式实现的结构,
1 使用普通数组(不同大小)
// Segment record that stores its samples inline in a fixed-size array.
struct a_segment_t {
    uint16_t index;     // position of the segment in the segment list
    uint16_t nvals;     // how many leading entries of vals are in use
    uint16_t vals[50];  // sample storage with a fixed capacity of 50
    double mean;        // mean of the used samples
};
2 使用 STL
// Segment record that stores its samples in a dynamically sized std::vector.
struct b_segment_t {
    uint16_t index;              // position of the segment in the segment list
    uint16_t nvals;              // number of samples the segment is meant to hold
    std::vector<uint16_t> vals;  // sample storage
    uint32_t mean;               // mean of the samples, kept as an integer
};
我并不关心在内存中创建这个对象的开销(所以我不介意使用 push_back());我要分析的是对象已经在内存中之后,对它进行操作的效率。 vals 中填充了一些随机数据。
操作遍历存储在每个段中的 val,在本例中是简单的均值计算。测试如下:
using namespace std;
#include <stdint.h>
#include <stdlib.h> // srand, rand
#include <time.h>
#include <iostream>
#include <iomanip>
#include <vector>
#include <array>
#define NSEGMENTS 100
#define MAX_NPXS 50
#define N 10000
// plain array approach
// Segment record that stores its samples inline in a fixed-size array.
struct a_segment_t {
    uint16_t index;           // position of the segment in the segment list
    uint16_t nvals;           // how many leading entries of vals are in use
    uint16_t vals[MAX_NPXS];  // sample storage with a fixed capacity of MAX_NPXS
    double mean;              // mean of the used samples, written by operation()
};
// Forward declarations for the plain-array variant.
// operation(): computes and stores the mean of every segment.
uint16_t operation(uint16_t nsegments, a_segment_t* p_segments);
// print(): writes every segment's index, count and values to stdout.
uint16_t print(uint16_t nsegments, a_segment_t* p_segments);
// stl vector approach
// Segment record that stores its samples in a dynamically sized std::vector.
struct b_segment_t {
    uint16_t index;              // position of the segment in the segment list
    uint16_t nvals;              // number of samples the segment is meant to hold
    std::vector<uint16_t> vals;  // sample storage
    uint32_t mean;               // mean of the samples, kept as an integer
};
// Forward declarations for the vector-based variant.
// operation(): computes and stores the mean of every segment.
uint16_t operation(uint16_t nsegments, vector<b_segment_t>* p_segments);
// print(): writes every segment's index, count and values to stdout.
uint16_t print(uint16_t nsegments, vector<b_segment_t>* p_segments);
// delta_time(): stores the difference t2 - t1 in dt.
void delta_time(struct timespec* t1, struct timespec* t2, struct timespec* dt);
uint16_t operation(uint16_t nsegments, a_segment_t* p_segments) {
// the operation (plain array approach)
uint64_t sum;
for( uint16_t nsegment = 0; nsegment < nsegments; ++nsegment ) {
sum = 0;
for(uint16_t nval = 0; nval < p_segments[nsegment].nvals; ++nval){
sum = sum + p_segments[nsegment].vals[nval];
}
p_segments[nsegment].mean = sum/p_segments[nsegment].nvals;
}
return nsegments;
}
uint16_t print(uint16_t nsegments, a_segment_t* p_segments) {
// print data (plain array approach)
for( uint16_t nsegment = 0; nsegment < nsegments; ++nsegment ) {
cout << "index : " << setfill('0') << setw(3) << p_segments[nsegment].index;
cout << "\tnval : " << setfill('0') << setw(3) << p_segments[nsegment].nvals;
cout << "\tvals : [";
for(uint16_t nval = 0; nval < p_segments[nsegment].nvals; ++nval){
cout << p_segments[nsegment].vals[nval] << ",";
}
cout << "\b]" << endl;
}
return nsegments;
}
uint16_t operation(uint16_t nsegments, vector<b_segment_t>* p_segments) {
// the operation (stl vector approach)
uint32_t sum;
for (vector<b_segment_t>::iterator p_segment = p_segments->begin(); p_segment<p_segments->end(); ++p_segment) {
sum = 0;
for (vector<uint16_t>::iterator p_val = (p_segment->vals).begin(); p_val<(p_segment->vals).end(); ++p_val) {
sum = sum + (*p_val);
}
p_segment->mean = sum/(p_segment->nvals);
}
return nsegments;
}
uint16_t print(uint16_t nsegments, vector<b_segment_t>* p_segments) {
// print data (stl vector approach)
for (vector<b_segment_t>::iterator p_segment = p_segments->begin(); p_segment<p_segments->end(); ++p_segment) {
cout << "index : " << setfill('0') << setw(3) << p_segment->index;
cout << "\tnval : " << setfill('0') << setw(3) << p_segment->nvals;
cout << "\tvals : [";
for (vector<uint16_t>::iterator p_val = (p_segment->vals).begin(); p_val<(p_segment->vals).end(); ++p_val) {
cout << *p_val << ",";
}
cout << "\b]" << endl;
}
return nsegments;
}
/**
 * Stores the difference t2 - t1 in dt.
 *
 * When the nanosecond part of t2 is smaller than that of t1, one second is
 * borrowed so that the resulting tv_nsec is not negative.
 *
 * @param t1 start timestamp
 * @param t2 end timestamp
 * @param dt receives t2 - t1
 */
void delta_time(struct timespec* t1, struct timespec* t2, struct timespec* dt) {
    const long nsec_diff = t2->tv_nsec - t1->tv_nsec;
    const bool borrow = nsec_diff < 0;

    dt->tv_sec = t2->tv_sec - t1->tv_sec - (borrow ? 1 : 0);
    dt->tv_nsec = borrow ? nsec_diff + 1000000000 : nsec_diff;
}
int main(int argc, char const *argv[]) {
uint16_t nsegments = NSEGMENTS;
uint16_t nsegment = 0;
uint16_t i = 0;
//create an populate the segments with dummy data (plain array approach)
a_segment_t* a_segments = new a_segment_t[nsegments];
for( nsegment = 0; nsegment < nsegments; ++nsegment ) {
a_segments[nsegment].index = nsegment;
srand(nsegment);
a_segments[nsegment].nvals = rand() % MAX_NPXS + 1;
for(uint16_t nval = 0; nval < a_segments[nsegment].nvals; ++nval){
a_segments[nsegment].vals[nval] = nval;
}
}
//create an populate the segments with dummy data (stl vector approach)
nsegment = 0;
vector<b_segment_t> b_segments(nsegments);
for (vector<b_segment_t>::iterator p_segment = b_segments.begin(); p_segment<b_segments.end(); ++p_segment) {
p_segment->index = nsegment;
srand(nsegment);
p_segment->nvals = rand() % MAX_NPXS + 1;
for(uint16_t nval = 0; nval < p_segment->nvals; ++nval){
p_segment->vals.push_back(nval);
}
nsegment++;
}
// print(nsegments, a_segments);
// cout << "===================================" << endl;
// print(nsegments, &b_segments);
// cout << "===================================" << endl;
// ======================= plain array timing measure ========================
struct timespec a_times[N];
for(i = 0; i < N; i++) {
nsegments = operation(nsegments, a_segments);
clock_gettime(CLOCK_REALTIME, &(a_times[i]));
}
// ===========================================================================
// ========================= vector timing measure ===========================
struct timespec b_times[N];
for(i = 0; i < N; i++) {
nsegments = operation(nsegments, &b_segments);
clock_gettime(CLOCK_REALTIME, &(b_times[i]));
}
// ===========================================================================
// =========================== timing console log ============================
struct timespec a_deltatime[N], a_elapsedtime[N], b_deltatime[N], b_elapsedtime[N];
cout << "\t\t plain array\t\t stl vector" << endl;
cout << "frame #\telapsedtime\tdeltatime\telapsedtime\tdeltatime" << endl;
for(i = 0; i < N-1; i=i+1000) {
delta_time(&(a_times[0]), &(a_times[i]), &(a_elapsedtime[i]));
delta_time(&(a_times[i]), &(a_times[i+1]), &(a_deltatime[i]));
delta_time(&(b_times[0]), &(b_times[i]), &(b_elapsedtime[i]));
delta_time(&(b_times[i]), &(b_times[i+1]), &(b_deltatime[i]));
cout << i << ",\t"
<< a_elapsedtime[i].tv_sec << "." << setfill('0') << setw(9) << a_elapsedtime[i].tv_nsec << ",\t"
<< a_deltatime[i].tv_sec << "." << setfill('0') << setw(9) << a_deltatime[i].tv_nsec << ",\t"
<< b_elapsedtime[i].tv_sec << "." << setfill('0') << setw(9) << b_elapsedtime[i].tv_nsec << ",\t"
<< b_deltatime[i].tv_sec << "." << setfill('0') << setw(9) << b_deltatime[i].tv_nsec << endl;
}
// ===========================================================================
}
online version。注意:所有的测试都是用 -O3 编译的
谁能指出为什么普通数组实现比
std::vector实现慢?普通数组实现不应该更快吗?
如何提高普通数组实现的速度?
【问题讨论】:
-
“普通数组的实现不应该更快吗?” - 我无意冒犯,但标准库的实现通常出自一群非常聪明、合计拥有数十年经验的人之手。您这种朴素的实现不太可能具备开箱即用的优化。
-
时差是多少?我可以想象
std::vector实现看到数组适当对齐以在整个数组上应用向量操作,而普通数组实现很可能没有对齐适合向量化的值。但是,如果没有更多细节,这就像一个盲人试图在黑暗中向目标射击。 -
无意冒犯,如果这是一个复杂的算法,我完全同意。但这是数组(或向量)遍历的简单练习。
-
在单核 512mb ram 虚拟机上进行 9000 次迭代,普通数组实现为 0.062658325 秒(每个约 0.0000058 秒),向量实现为 0.042909131 秒(每个约 0.0000042 秒)。
-
实际上,似乎还有一个虚假的区别:这两个实现做了不同的事情!其中一种实现将
double用于mean,而另一种实现使用uint32_t。您可能希望开始让这两个实现做同样的事情......
标签: c++ arrays memory-management stl