为什么 std::vector 这么快（或者我的实现太慢了）答案

【问题标题】：Why is std::vector so fast ( or is my implementation is too slow )为什么 std::vector 这么快（或者我的实现太慢了）
【发布时间】：2013-11-14 18:47:14
【问题描述】：

前几天我在玩，想看看我能优化多远。我决定从一个简单的地图开始，它只进行线性搜索以查找元素是否存在，然后尝试对其进行大部分优化。另外，为了比较，我使用 std::find 对 std::map 和 std::vector 做同样的事情。

地图的结果是预期的，创建和销毁比我的地图慢，但速度要快得多（实际上，我无法测量它，它总是返回 0）。问题出在 std::vector 上。我预计它会比我的实现慢，但事实并非如此，而且我真的不明白它怎么会相同或更快，因为我的实现正在跳过最坏的情况（该值不在向量中）并且是使用结果缓存。

任何人都可以在这里阐明一下吗？我知道stl背后的人是半神，但是，这没有任何意义。

基准测试结果（i3、Windows 8.1 Pro 64、Visual Studio 2013）：

std::vector :
    Build : 85.0042 ms
    Loop : 37.0011 ms
    Find : 1.82259 ms  -> First : Found, Second : Found, Third : Not Found
    Release : 0 ms
--------------------
std::map :
    Build : 6929.41 ms
    Loop : 570.032 ms
    Find : 0 ms  -> First : Found, Second : Found, Third : Not Found
    Release : 1425.08
--------------------
Linear Map V0:
    Build : 194.012 ms
    Loop : 49.0052 ms
    Find : 1.88915 ms -> First : Found, Second : Found, Third : Not Found
    Release : 109.004

这是地图的代码：

template<typename T>
class LinearMap0
{
public:
LinearMap0()
{
    _end = _root = new Node;
    _prebuffer = nullptr;
    prebufferCapacity = 0;
    _alive = true;
    prebufferMarker = 0;
    _cache = _mm_set1_epi32(-1);
    for (auto& ptr : _cacheBuffer) ptr = nullptr;
    MinID = INT32_MAX - 1;
    MaxID = -1;
}
void PreAllocate(int Count)
{
    prebufferCapacity = Count;
    _prebuffer = new Node[Count];
}
~LinearMap0()
{
    if (_alive)
    {
        Release();
    }
}
void Release()
{
    Node* marker = _end;
    while (marker->Prev)
    {
        marker = marker->Prev;
        if (!marker->Next->IsPreAllocated) delete marker->Next;
    }

    if (!_root->IsPreAllocated) delete _root;
    delete[] _prebuffer;

    _alive = false;
}

void AddElement(int ID,T element)
{
    Node* tmp = nullptr;
    if (prebufferMarker < prebufferCapacity)
    {
        // Use a pre-allocated object
        tmp = &_prebuffer[prebufferMarker];
        prebufferMarker++;
        tmp->IsPreAllocated = true;
    }
    else
    {
        tmp = new Node;
    }

    tmp->ID = ID;
    tmp->Data = element;

    // Update list
    _end->Next = tmp;
    Node* prevEnd = _end;
    _end = tmp;
    _end->Prev = prevEnd;
    bool isMin = ID < MinID; MinID = ID * isMin + (1 - isMin) * MinID;
    bool isMax = ID > MaxID; MaxID = ID * isMax + (1 - isMax) * MaxID;
}
void DeleteLast()
{
    Node* tmp = _end;

    _end = _end->Prev;
    _end->Next = nullptr;

    delete tmp;
}

template<class Function>
void Loop(Function&& f, bool Forward = true)
{
    if (Forward)
    {
        Node* marker = _root;
        while (marker->Next)
        {
            marker = marker->Next;
            f(marker->Data);
        }
    }
    else
    {
        Node* marker = _end;
        while (marker->Prev)
        {
            marker = marker->Prev;
            f(marker->Data);
        }
    }
}

T* Find(int ID)
{
    // Bounds check
    if (ID < MinID || ID > MaxID) return nullptr;

    // Check it it's in the cache

    // Compare the value to every value in the cache
    __m128i idxSSE = _mm_set1_epi32(ID);
    __m128i C = _mm_cmpeq_epi32(_cache, idxSSE);

    // To change form -1 to 1
    C = _mm_mul_epi32(C, _mm_set1_epi32(-1));

    // Now C holds 1 if true, or 0 if false (in each of its 4 members). It should only be ONE set at 1
    __m128i tmp = _mm_set1_epi32(1);
    __m128i S = _mm_sub_epi32(tmp, C);

    // Now find the index
    int i = S.m128i_i32[0] * (C.m128i_i32[1] + S.m128i_i32[1] * (2 * C.m128i_i32[2] + S.m128i_i32[2] * (3 * C.m128i_i32[3] + S.m128i_i32[3] * -1)));

    if (i != -1)
        return _cacheBuffer[i];

    // Traverse the list
    Node* marker0 = _root;
    T* obj = nullptr;

    while (true)
    {
        if (marker0->ID == ID)
        {
            obj = &marker0->Data;
        }

        if (marker0->Next) marker0 = marker0->Next; else break;
    }

    // Cache value and return
    _cache.m128i_i32[cacheMarker] = ID;
    _cacheBuffer[cacheMarker] = obj;
    cacheMarker = (cacheMarker + 1) & 3; // x & 3 = x % 4

    return obj;
}
private:
struct Node
{
    Node()
    {
        Prev = nullptr;
        Next = nullptr;
        IsPreAllocated = false;
        ID = -1;
    }
    T Data;
    Node* Prev;
    Node* Next;
    bool IsPreAllocated;
    int ID;
};

Node* _root;
Node* _end;

Node* _prebuffer;
int prebufferCapacity;
int prebufferMarker;

bool _alive;

__m128i _cache;
T* _cacheBuffer[4];
int cacheMarker;
int MinID, MaxID;
};

这是基准：

// Initialize seeds
const __int64 ecount = 5 * 1000*1000;
vector<__int64> seed(ecount);
for (__int64 i = 0; i < ecount; i++)
{
    seed[i] = i;
}
random_shuffle(seed.begin(), seed.end());

///////////// std::vector

vector<__int64> v;

cout << "--------------------" << endl;
cout << "std::vector :" << endl;
cout << "   Build : " << time_call([&]()
{
    v.resize(ecount/2);
    for (__int64 i = 0; i < ecount; i++)
    {
        if (i < (ecount / 2))
            v[i] = seed[i];
        else
            v.push_back(seed[i]);
    }
}) << " ms" << endl;

cout << "   Loop : " << time_call([&]()
{
    for (auto& n : v)
        n /= 2;
}) << " ms" << endl;

bool found1, found2, found3;
cout << "   Find : " << (((float)time_call([&]()
{
    for (int i = 0; i < 15; i++)
    {
        // Should exist
        found1 = find(v.begin(), v.end(), seed[5] / 2) != v.end();//find(seed[5]) != m.end();
        found2 = find(v.begin(), v.end(), seed[1000] / 2) != v.end();

        // Shouldn't exist
        found3 = find(v.begin(), v.end(), -1234) != v.end();
    }
})) / 15.0) / 3.0;
cout << " ms " << " -> First : " << ((found1) ? "Found" : "Not Found") << ", Second : " << ((found2) ? "Found" : "Not Found") << ", Third : " << ((found3) ? "Found" : "Not Found") << endl;

cout << "   Release : " << time_call([&]()
{
    v.clear();
}) << " ms" << endl;

///////////// std::map

map<__int64, __int64> m;

cout << "--------------------" << endl;
cout << "std::map :" << endl;
cout << "   Build : " << time_call([&]()
{
    for (__int64 i = 0; i < ecount; i++)
    {
        m[seed[i]] = seed[i];
    }
}) << " ms" << endl;

cout << "   Loop : " << time_call([&]()
{
    for (auto& n : m)
        n.second /= 2;
}) << " ms" << endl;

cout << "   Find : " << (((float)time_call([&]()
{
    for (int i = 0; i < 15; i++)
    {
        // Should exist
        found1 = m.find(seed[5]) != m.end();
        found2 = m.find(seed[1000]) != m.end();

        // Shouldn't exist
        found3 = m.find(-1234) != m.end();
    }
})) / 15.0) / 3.0;
cout << " ms " << " -> First : " << ((found1) ? "Found" : "Not Found") << ", Second : " << ((found2) ? "Found" : "Not Found") << ", Third : " << ((found3) ? "Found" : "Not Found") << endl;

cout << "   Release : " << time_call([&]()
{
    m.clear();
}) << endl;

///////////// Linear Map V0

LinearMap0<__int64> c;

cout << "--------------------" << endl;
cout << "Linear Map V0:" << endl;
cout << "   Build : " << time_call([&]()
{
    c.PreAllocate(ecount / 2);
    for (__int64 i = 0; i < ecount; i++)
    {
        c.AddElement(seed[i],seed[i]);
    }
}) << " ms" << endl;

cout << "   Loop : " << time_call([&]()
{
    c.Loop([](__int64& Data)
    {
        Data /= 2;
    });
}) << " ms" << endl;

cout << "   Find : " << (((float)time_call([&]()
{
    for (int i = 0; i < 15; i++)
    {
        // Should exist
        found1 = c.Find(seed[5]);
        found2 = c.Find(seed[1000]);

        // Shouldn't exist
        found3 = c.Find(-1234);
    }
})) / 15.0) / 3.0;
cout << " ms -> First : " << ((found1) ? "Found" : "Not Found") << ", Second : " << ((found2) ? "Found" : "Not Found") << ", Third : " << ((found3) ? "Found" : "Not Found") << endl;

cout << "   Release : " << time_call([&]()
{
    c.Release();
}) << endl;

编辑：time_call 是：

template <class Function>
double time_call(Function&& f)
{
    chrono::time_point<chrono::high_resolution_clock> start, end;
    start = chrono::high_resolution_clock::now();
        f();
    end = chrono::high_resolution_clock::now();

    return ((double)(chrono::duration_cast<chrono::nanoseconds>(end - start).count())) / 1000000.0;
}

【问题讨论】：

你实现了一个链表。 std::vector 是一个动态大小的数组。
你为什么使用像__int64这样的非标准内容而不是std::int64_t？
我知道，但性能上的所有差异真的只是 CPU 缓存吗？因为链表和动态大小的数组并没有那么不同，而且正如我所说，我的只做了 2 个查找，所有其他 43 个都提前退出，无论是缓存还是边界检查。
是的，缓存的使用方式会极大地影响性能。
排序数组的速度与您将获得的速度差不多。

标签： c++ performance c++11 stl

【解决方案1】：

您的容器是一个链表，而std::vector 是一个动态大小的数组。

链表方法有一些好处，例如能够插入元素而无需任何重新分配。

但是数组方法有一些显着的优势：

线性搜索只是扫描内存，这正是缓存和预取器的用途。扫描链表的效率会降低，因为每次跳转到未缓存的内存都意味着代价高昂的缓存未命中。
线性阵列扫描易于矢量化。如果您使用-O3 进行编译，那么编译器可能会使用std::find 的矢量化版本。由于内存依赖性，无法对链表扫描进行矢量化。
已使用的内存量。您的链表必须维护一个next 指针，它可以有效地使您的元素更大。此外，每个未预分配的Node 必须支付分配器的开销（即new 和delete 的会计数据）。这意味着您将更快地达到内存带宽限制，并且您可以在缓存中容纳更少的元素。

【讨论】：

好吧，既然你这么说，那就有道理了。不过，我猜我制作了一个非常快的链表。
PS：真的没那么快：std::list：构建：292.019 毫秒循环：49.0047 毫秒查找：9.73399 毫秒-> 第一个：找到，第二个：找到，第三个：未找到发布：150.009 毫秒
不要自责。我们都试图击败 STL，但被知道自己在做什么的人多年的优化而感到谦卑。他们做得更好的唯一方法是发明一种复杂性较低的新容器类型（即更快的大哦）或使用仅适用于您的问题的快捷方式（即在一般情况下中断，因此 STL 不能使用它们）。
我还建议您阅读 Stroustrup 的常见问题解答条目：stroustrup.com/bs_faq2.html#slow-containers

【解决方案2】：

std vector 的所有优点是元素被紧密打包（内存中的元素 1 就在元素 0 之后，依此类推）。这对 CPU 来说是一个很大的优势，因为内存读取更加可预测。当你在堆上分配了节点时，CPU 必须疯狂地来回跳来获取内存。

查看this 线程。

【讨论】：