嵌套 for 循环的奇怪性能问题答案

【问题标题】：Odd performance issue with nested for loops嵌套 for 循环的奇怪性能问题
【发布时间】：2014-12-09 19:41:15
【问题描述】：

以下是完整的源代码，您只需将其复制粘贴到 Visual Studio 中即可轻松复制。

#include <Windows.h>

#include <algorithm>
#include <vector>
#include <iostream>
#include <sstream>

LARGE_INTEGER gFreq;

struct CProfileData;

// Yes, we map the pointer itself not the string, for performance reasons
std::vector<CProfileData*> gProfileData;

// simulate a draw buffer access to avoid CBlock::Draw being optimized away
float gDrawBuffer = 0;

struct CTimer
{
    CTimer()
    {
        Reset();
    }

    size_t GetElapsedMicro()
    {
        LARGE_INTEGER now;
        ::QueryPerformanceCounter(&now);
        return (1000000 * (now.QuadPart - m_timer.QuadPart)) / gFreq.QuadPart;
    }

    inline void Reset()
    {
        ::QueryPerformanceCounter(&m_timer);
    }

    LARGE_INTEGER m_timer;
};

struct CProfileData
{
    CProfileData() : m_hitCount(0), m_totalTime(0), m_minTime(-1),
        m_maxTime(0), m_name(NULL)
    {
        gProfileData.push_back(this);
    }

    size_t m_totalTime;
    size_t m_minTime;
    size_t m_maxTime;
    size_t m_hitCount;
    const char * m_name;
};

class CSimpleProfiler
{
public:
    CSimpleProfiler(const char * aLocationName, CProfileData * aData)
        : m_location(aLocationName), m_data(aData)
    {
        ::QueryPerformanceCounter(&m_clock);
    }

    ~CSimpleProfiler()
    {
        CProfileData & data = *m_data;
        data.m_name = m_location;
        ++data.m_hitCount;


        LARGE_INTEGER now;
        ::QueryPerformanceCounter(&now);

        size_t elapsed = (1000000 * (now.QuadPart - m_clock.QuadPart)) / gFreq.QuadPart;
        data.m_totalTime += elapsed;

        elapsed < data.m_minTime ? data.m_minTime = elapsed : true;
        elapsed > data.m_maxTime ? data.m_maxTime = elapsed : true;
    }

    static void PrintAll()
    {
        std::stringstream str;
        str.width(20);
        str << "Location";
        str.width(15);
        str << "Total time";
        str.width(15);
        str << "Average time";
        str.width(15);
        str << "Hit count";
        str.width(15);
        str << "Min";
        str.width(15);
        str << "Max" << std::endl;

        ::OutputDebugStringA(str.str().c_str());

        for (auto i = gProfileData.begin(); i != gProfileData.end(); ++i)
        {
            CProfileData & data = **i;
            std::stringstream str;
            str.width(20);
            str << data.m_name;
            str.width(15);
            str << data.m_totalTime;
            str.width(15);
            str << data.m_totalTime / (float)data.m_hitCount;
            str.width(15);
            str << data.m_hitCount;
            str.width(15);
            str << data.m_minTime;
            str.width(15);
            str << data.m_maxTime << std::endl;

            ::OutputDebugStringA(str.str().c_str());
        }
    }

    static void Clear()
    {
        for (auto i = gProfileData.begin(); i != gProfileData.end(); ++i)
        {
            (*i)->m_totalTime = 0;
            (*i)->m_minTime = 0;
            (*i)->m_maxTime = 0;
            (*i)->m_hitCount = 0;
        }
    }

private:
    LARGE_INTEGER m_clock;
    const char * m_location;
    CProfileData * m_data;
};


#define PROFILING_ENABLED

#ifdef PROFILING_ENABLED
#define SIMPLE_PROFILE \
    static CProfileData pdata ## __LINE__; \
    CSimpleProfiler p ## __LINE__(__FUNCTION__, & pdata ## __LINE__)

#define SIMPLE_PROFILE_WITH_NAME(Name) \
    static CProfileData pdata ## __LINE__; \
    CSimpleProfiler p ## __LINE__(Name, & pdata ## __LINE__)
#else
#define SIMPLE_PROFILE __noop
#define SIMPLE_PROFILE_WITH_NAME(Name) __noop
#endif


void InvalidateL1Cache()
{
    const int size = 256 * 1024; 
    static char *c = (char *)malloc(size);
    for (int i = 0; i < 0x0fff; i++)
        for (int j = 0; j < size; j++)
            c[j] = i*j;
}

int _tmain(int argc, _TCHAR* argv[])
{
    ::QueryPerformanceFrequency(&gFreq);

    LARGE_INTEGER pc;
    ::QueryPerformanceCounter(&pc);

    struct CBlock
    {
        float x;
        float y;

        void Draw(float aBlend)
        {   
            for (size_t i = 0; i < 100; ++i )
                gDrawBuffer += aBlend;
        }
    };


    typedef std::vector<std::vector<CBlock>> Layer;
    typedef std::vector<Layer> Layers;
    Layers mBlocks;

    // populate with dummy data;
    mBlocks.push_back(Layer());
    Layer & layer = mBlocks.back();
    layer.resize(109);

    srand(0); // for reprodicibility (determinism)
    for (auto i = layer.begin(); i != layer.end(); ++i)
    {
        i->resize(25 + rand() % 10 - 5);
    }

    // end populating dummy data

    while (1)
    {
        CSimpleProfiler::Clear();

        float aBlend = 1.f / (rand() % 100);
        {
            for (auto i = mBlocks.begin(); i != mBlocks.end(); ++i)
            {
                for (auto j = i->begin(); j != i->end(); ++j)
                {

                    CTimer t;

                    {
                        SIMPLE_PROFILE_WITH_NAME("Main_Draw_3");
                        for (auto blockIt = j->begin(); blockIt != j->end();)
                        {
                            CBlock * b = nullptr;
                            {
                                b = &*blockIt;
                            }
                        {
                            b->Draw(aBlend);
                        }
                        {
                            ++blockIt;
                        }
                        }
                    }

                    if (t.GetElapsedMicro() > 1000)
                    {
                        ::OutputDebugStringA("SLOWDOWN!\n");
                        CSimpleProfiler::PrintAll();
                    }
                }
            }
        }
    }

    return 0;
}

我不时得到以下分析，以微秒表示：

SLOWDOWN!
            Location     Total time   Average time      Hit count            Min            Max
         Main_Draw_3           2047        36.5536             56              0           1040

这种情况不时出现。通常，Main_Draw_3 块需要 100 微秒才能完成，但它有时会飙升至 1000（Max 列）。这是什么原因造成的？

我知道缓存未命中可能会起作用，但在这种情况下真的是这样吗？...这里发生了什么，我该如何缓解这种情况？

更多信息：

编译器VS 2013，用Maximize Speed (/O2)编译

【问题讨论】：

绝对可重现的性能测量样本。太棒了！很少见到更好的：-P ...
@πάνταῥεῖ 不知道你的意思，但是谢谢，我猜？
“但是谢谢，我猜？” 讽刺，通常是不那么受欢迎的事情之一，但不幸的是，也是不那么理解的事情之一。
你需要使用向量吗？也许使用普通的 3D 数组会提供更一致的性能？
@Zadirion - 基本上说“为什么我的代码慢/快？”的帖子应随附用于构建测试的编译器和选项。您没有说明您正在使用什么编译器，您是否正在计时优化构建等。

标签： c++ performance vector

【解决方案1】：

我认为可能有两个问题：

您是否使用优化进行编译？标志是什么？
也许您可以增加样本量（例如，通过在一次分析运行中运行此代码十次（或一百次或数千次等））。原因是如果样本量小，标准差就很高

【讨论】：

在 VS 2013 中使用 Maximize Speed (/O2) 编译。同样的样本也在更大的集合（大约 2000 个元素）上进行，结果是相同的。大多数迭代很快，但偶尔其中一个会在 15,000 微秒左右达到峰值
2000 个元素并不是很多元素。尝试至少做一百万次，并多次重复数据收集。 :-p
@Zadirion：“大约 15,000 微秒”是系统调度程序的默认时间片。上下文切换导致您的线程停止运行，因此另一段代码（一直在耐心等待轮到它）可以让处理器运行 15 毫秒。
@BenVoigt 你和我想的一样
您 100% 确定 Draw() 也不会触发不同线程中的任何内容？