【问题标题】:Why is this implemented binary search so much slower than std::binary_search()?为什么这个实现的二进制搜索比 std::binary_search() 慢得多?
【发布时间】:2015-09-26 08:31:00
【问题描述】:

在检测到 std::upper_bound 之前,我实现了自己的 binarySearch 版本来确定所需元素的索引。该实现有效,但与线性搜索相比,我的 binarySearch 仅快一点。我的实现和标准库之间的因素随着搜索区域的增长而增加。

为了快速自我测试,我在本文末尾插入了完整的代码。快速浏览一下我的 searchBinary 实现:

template<typename T> T searchBinary(const std::vector<std::vector<T> > vectorList, const std::vector<T> compareVector) {
    long iteration = 0;
    size_t leftIndex = 0;
    size_t rightIndex = vectorList.size()-1;
    size_t pos;

    while (leftIndex <= rightIndex) {
        iteration++;
        pos = (leftIndex + rightIndex) / 2;

        if (compareVector < vectorList[pos]) {
            rightIndex = pos - 1;
        } else if (compareVector > vectorList[pos]) {
            leftIndex = pos + 1;
        } else {
            cout << "Match at binary search after " << iteration << " iterations.\n";
            return pos;
        }
    }

    cout << "No match at binary search after " << iteration << " iterations.\n";
    return -1;
}

这就是我弄乱运行时的方式:

void searchBinaryOwn_messure(std::vector<std::vector<u_char> > vectorList, std::vector<u_char> compareVector) {
    struct timeval begin, end;
    long seconds, useconds;

    if (gettimeofday(&begin,(struct timezone *)0)) {
        fprintf(stderr, "can not get time\n");
        exit(1);
    }

    searchBinary(vectorList, compareVector);

    if (gettimeofday(&end,(struct timezone *)0)) {
        fprintf(stderr, "can not get time\n");
        exit(1);
    }

    seconds = end.tv_sec - begin.tv_sec;
    useconds = end.tv_usec - begin.tv_usec;
    if(useconds < 0) {
        useconds += 1000000;
        seconds--;
    }

    printf("searchBinaryOwn(): %ld sec %ld usec\n\n", seconds, useconds);
    return;
}

在这里看不到任何问题。如果我用 8 000 运行这个程序 000 个元素:

  • searchLinear() 大约需要 3.7 秒
  • searchBinaryOwn() 大约需要 2.8 秒
  • searchBinaryStd() 需要 ~0,0007 秒

那么为什么这两种二进制搜索之间会有如此巨大的差异呢?(使用 gcc 4.8.2 编译) 注意:因为 "cout..." 大约需要 30 微秒,所以 std::binarySearch 实际上比显示的要快

这里是完整的代码:

#include <iostream>
#include <vector>
#include <sys/time.h>
#include <algorithm>
#include <string>
#include <stdio.h>
using namespace std;


template<typename T> T searchBinary(const std::vector<std::vector<T> > vectorList, const std::vector<T> compareVector) {
    long iteration = 0;
    size_t leftIndex = 0;
    size_t rightIndex = vectorList.size()-1;
    size_t pos;


    while (leftIndex <= rightIndex) {
        iteration++;
        pos = (leftIndex + rightIndex) / 2;

        if (compareVector < vectorList[pos]) {
            rightIndex = pos - 1;
        } else if (compareVector > vectorList[pos]) {
            leftIndex = pos + 1;
        } else {
            cout << "Match at binary search after " << iteration << " iterations.\n";
            return pos;
        }
    }

    cout << "No match at binary search after " << iteration << " iterations.\n";
    return -1;
}

size_t searchLinear(std::vector<std::vector<u_char> > vectorList, std::vector<u_char> compareVector) {
    size_t vectorListSize = vectorList.size();
    for (size_t i = 0; i < vectorListSize; i++) {
        if (vectorList[i] == compareVector) {
            return i;
        }
    }
    return (size_t)-1;
}

void searchLinear_messure(std::vector<std::vector<u_char> > vectorList, std::vector<u_char> compareVector) {
    struct timeval begin, end;
    long seconds, useconds;

    if (gettimeofday(&begin,(struct timezone *)0)) {
        fprintf(stderr, "can not get time\n");
        exit(1);
    }

    //search
    cout << "\nPos: " << searchLinear(vectorList, compareVector) << endl;

    if (gettimeofday(&end,(struct timezone *)0)) {
        fprintf(stderr, "can not get time\n");
        exit(1);
    }

    seconds = end.tv_sec - begin.tv_sec;
    useconds = end.tv_usec - begin.tv_usec;
    if(useconds < 0) {
        useconds += 1000000;
        seconds--;
    }

    printf("searchLinear(): %ld sec %ld usec\n\n", seconds, useconds);
    return;
}

void searchBinaryStd_messure(std::vector<std::vector<u_char> > vectorList, std::vector<u_char> compareVector) {
    struct timeval begin, end;
    long seconds, useconds;

    if (gettimeofday(&begin,(struct timezone *)0)) {
        fprintf(stderr, "can not get time\n");
        exit(1);
    }

    //search
    cout << "found: " << std::binary_search(vectorList.begin(), vectorList.end(), compareVector) << endl;

    if (gettimeofday(&end,(struct timezone *)0)) {
        fprintf(stderr, "can not get time\n");
        exit(1);
    }

    seconds = end.tv_sec - begin.tv_sec;
    useconds = end.tv_usec - begin.tv_usec;
    if(useconds < 0) {
        useconds += 1000000;
        seconds--;
    }

    printf("searchBinaryStd(): %ld sec %ld usec\n\n", seconds, useconds);
    return;
}

void searchBinaryOwn_messure(std::vector<std::vector<u_char> > vectorList, std::vector<u_char> compareVector) {
    struct timeval begin, end;
    long seconds, useconds;

    if (gettimeofday(&begin,(struct timezone *)0)) {
        fprintf(stderr, "can not get time\n");
        exit(1);
    }

    searchBinary(vectorList, compareVector);

    if (gettimeofday(&end,(struct timezone *)0)) {
        fprintf(stderr, "can not get time\n");
        exit(1);
    }

    seconds = end.tv_sec - begin.tv_sec;
    useconds = end.tv_usec - begin.tv_usec;
    if(useconds < 0) {
        useconds += 1000000;
        seconds--;
    }

    printf("searchBinaryOwn(): %ld sec %ld usec\n\n", seconds, useconds);
    return;
}


int main() {
    std::vector<u_char> compareVector;
    compareVector.clear();
    compareVector.push_back(0xF8);
    compareVector.push_back(0xD1);
    compareVector.push_back(0x11);
    compareVector.push_back(0xFF);

    std::vector<std::vector<u_char> > vectorList;
    vectorList.clear();
    std::vector<u_char> temp;
    for (unsigned int i = 0; i < ((unsigned int)-1); i++) {
        if (i == 8000000) {
//      if (i == 15000000) {
            break;
        }
        temp.clear();
        temp.push_back(0x11);
        temp.push_back(0x22);
        temp.push_back(0x33);
        temp.push_back(0x44);
        vectorList.push_back(temp);
    }

    vectorList[7999999] = compareVector;

    cout << "Elements in vectorList: " << vectorList.size() << endl;

    searchLinear_messure(vectorList, compareVector);
    searchBinaryStd_messure(vectorList, compareVector);
    searchBinaryOwn_messure(vectorList, compareVector);

    return 0;
}

【问题讨论】:

  • 我没有时间仔细查看,但是您的 searchBinary 创建了您传递的两个向量的副本,因此这至少是一个瓶颈。
  • 因为很多人在算法标准化之前工作了很长时间。 ;-)

标签: c++ performance binary-search


【解决方案1】:
  1. 将您的函数原型更改为

template&lt;typename T&gt; T searchBinary(const std::vector&lt;std::vector&lt;T&gt; &gt;&amp; vectorList, const std::vector&lt;T&gt;&amp; compareVector) {

即通过常量引用而不是值传递。这将避免两个向量副本。

  1. 您可以在每次迭代时使用单个条件测试 &lt; 进行重构。 (您还需要更改 while 条件)。

  2. iteration 是否必须是 long?不能更短吗?收敛的最坏情况是什么?

第 1 点很重要。 2 非常重要,3 是一种微优化,在某些系统上可能根本不会产生影响。

【讨论】:

  • 您的第 2 项不仅仅是“微”优化。这几乎是两倍。除非我误解了,否则您仍然需要对正在排序的值进行 测试可以节省近一半的时间。然后在一段时间之后,您需要进行额外的测试以查看 leftIndex-1 是否是您正在搜索的项目(如果 leftIndex 被初始化为 1 而不是 0,这将更安全、更快)。如果打印的迭代次数真的很有趣,那么整个想法就失败了。但我认为不是。
  • 你当然是对的。我提高了我的语言。如果您要在答案中提供代码,我当然会赞成。
【解决方案2】:

向量按值传递给searchBinary,因此将创建副本,这需要时间。

如果您将签名更改为

template<typename T> T searchBinary(const std::vector<std::vector<T> >& vectorList, const std::vector<T>& compareVector)

它与 std 实现一样快:http://melpon.org/wandbox/permlink/qozapTfn3MrGv5JA

【讨论】:

    【解决方案3】:

    这个template&lt;typename T&gt; T searchBinary(const std::vector&lt;std::vector&lt;T&gt; &gt; vectorList, const std::vector&lt;T&gt; compareVector) 会复制输入向量(当您通过值传递它时),它是时间线性的。所以你得到的结果实际上是预期的。

    顺便说一句。一个诙谐的回答可能是标准库是由相当优秀的开发人员编写的,预计它几乎不会被超越。

    【讨论】:

      猜你喜欢
      • 1970-01-01
      • 1970-01-01
      • 2017-04-09
      • 2017-07-21
      • 1970-01-01
      • 2017-05-10
      • 2015-11-11
      • 2020-09-17
      相关资源
      最近更新 更多