Windows 多线程使内存使用量增加答案

【问题标题】：Windows multi-thread makes memory usage increasing（Windows 多线程使内存使用量增加）
【发布时间】：2016-04-07 17:08:07
【问题描述】：

我正在使用SSE 指令和Windows multi-thread 来实现矩阵乘法。当矩阵的维度很大时，比如1024×1024，需要很长时间才能得到结果。

当它以 4096*4096 运行时，exe 占用的内存很少，比如 192M，但是，我的机器的内存使用率从 20% 增加到 97%，我认为可能是 Windows multi-thread 有问题。

以下是我的代码。

main.cpp

#include "sse_matrix.h"
#include <ctime>

// Entry point: fills two dim x dim matrices, multiplies them with
// SSE_Matrix_Multiply using 4 worker threads, and prints the elapsed time.
int main(int argc, char* argv[])
{
    // The vectors are plain locals so their storage is released on every
    // exit path, including exceptions; no manual new/delete pairing is needed.
    vector<float> left(size, 0);
    vector<float> right(size, 0);
    vector<float> result(size, 0);

    // Every row of both inputs holds the values 0, 1, ..., dim - 1.
    for (int i = 0; i < dim; i ++)
    {
        for (int j = 0; j < dim; j ++)
        {
            left[i*dim + j] = j;
            right[i*dim + j] = j;
        }
    }
    cout << "1. INFO: value initialized, starting matrix multiplication" << endl;

    // Time only the multiplication itself.
    const clock_t start = clock();
    SSE_Matrix_Multiply(&left, &right, &result, 4);
    const clock_t stop = clock();
    cout << "2. INFO: SSE matrix multiplication result has got" << endl;
    // clock() counts ticks, not milliseconds; scale by CLOCKS_PER_SEC so the
    // printed number really is in ms regardless of the tick rate.
    cout << "3. INFO: time(ms): " << float(stop - start) * 1000.0f / CLOCKS_PER_SEC << endl;

    // Keep the console window open until a key is pressed.
    system("pause");

    return 0;
}

sse_matrix.h

#ifndef __SSE_MATRIX_H__
#define __SSE_MATRIX_H__

#include <vector>
#include <iostream>
#include <Windows.h>
using std::cin;
using std::cout;
using std::endl;
using std::vector;

// Edge length of the square matrices; elements are addressed as row * dim + column.
const int dim = 4096;
// Number of elements in one dim x dim matrix stored as a flat array.
const int size = dim * dim;

// Describes one tile product: which tile of A is multiplied by which tile of
// B, and which tile of C receives the result. All three matrices are flat
// arrays indexed as row * n + column.
struct Matrix_Info
{
    // Left operand and the row / column offset of its tile.
    vector<float> * A;
    int ax;
    int ay;

    // Right operand and the row / column offset of its tile.
    vector<float> * B;
    int bx;
    int by;

    // Output matrix and the row / column offset of the tile that is updated.
    vector<float> * C;
    int cx;
    int cy;

    // Edge length of one tile.
    int m;
    // Row stride (number of columns) of the full matrices.
    int n;
};

// Transposes, in place, the 4x4 row-major float matrix that starts at
// `matrix` (16 consecutive floats). Unaligned loads and stores are used, so
// the pointer does not need any particular alignment.
void Transpose_Matrix_SSE(float * matrix)
{
    float * const r0 = matrix;
    float * const r1 = matrix + 4;
    float * const r2 = matrix + 8;
    float * const r3 = matrix + 12;

    // Pull the four rows into registers.
    __m128 v0 = _mm_loadu_ps(r0);
    __m128 v1 = _mm_loadu_ps(r1);
    __m128 v2 = _mm_loadu_ps(r2);
    __m128 v3 = _mm_loadu_ps(r3);

    // The macro rewrites its four arguments with the transposed rows.
    _MM_TRANSPOSE4_PS(v0, v1, v2, v3);

    // Write the transposed rows back over the input.
    _mm_storeu_ps(r0, v0);
    _mm_storeu_ps(r1, v1);
    _mm_storeu_ps(r2, v2);
    _mm_storeu_ps(r3, v3);
}

// Computes a 4x4 result where element (row, col) is the dot product of row
// `row` of `left` with row `col` of `right`. Both inputs are 16 consecutive
// floats in row-major order; passing an already transposed right-hand matrix
// therefore yields the ordinary matrix product.
//
// Returns a buffer of 16 floats allocated with new[]; the caller owns it.
float * Shuffle_Matrix_Multiply(float * left, float * right)
{
    float * product = new float[16];
    for (int row = 0; row < 4; ++row)
    {
        // The left row is the same for all four columns of this output row.
        const __m128 left_row = _mm_loadu_ps(left + row * 4);
        float * out = product + row * 4;
        for (int col = 0; col < 4; ++col)
        {
            // Multiply lane by lane, then add the four lanes left to right.
            float lanes[4];
            _mm_storeu_ps(lanes, _mm_mul_ps(left_row, _mm_loadu_ps(right + col * 4)));
            out[col] = lanes[0] + lanes[1] + lanes[2] + lanes[3];
        }
    }
    return product;
}

// Multiplies the tile of A at (ax, ay) by the tile of B at (bx, by), as
// described by `my_info`, and returns the product tile.
//
// The scratch tiles hold 16 floats and Transpose_Matrix_SSE works on a 4x4
// matrix, so my_info->m must not exceed 4.
//
// Returns the buffer produced by Shuffle_Matrix_Multiply (16 floats allocated
// with new[]); the caller must release it with delete[].
float * SSE_4_Matrix(struct Matrix_Info * my_info)
{
    const int m = my_info->m;
    const int n = my_info->n;
    const int ax = my_info->ax;
    const int ay = my_info->ay;
    const int bx = my_info->bx;
    const int by = my_info->by;
    const vector<float> & A = *my_info->A;
    const vector<float> & B = *my_info->B;

    // 1. Copy the two tiles into contiguous scratch buffers. They live on the
    //    stack: the previous new[] / free() pairing was undefined behavior and
    //    a heap round-trip per tile is needless work in the worker threads.
    float a_tile[16] = {0};
    float b_tile[16] = {0};
    for (int i = 0; i < m; i ++)
    {
        for (int j = 0; j < m; j ++)
        {
            a_tile[i*m + j] = A[(i + ax) * n + j + ay];
            b_tile[i*m + j] = B[(i + bx) * n + j + by];
        }
    }

    // 2. Transpose the B tile so that its columns become rows.
    Transpose_Matrix_SSE(b_tile);

    // 3. Row-by-row dot products of A with the transposed B give the product.
    return Shuffle_Matrix_Multiply(a_tile, b_tile);
}

// Thread entry point: computes one tile product and accumulates it into C.
//
// `my_info` must point to a Matrix_Info that stays valid until this function
// returns. The m x m product of the A and B tiles is added to the tile of C
// at (cx, cy). No locking is done here, so the caller must not run two
// instances that target the same tile of C at the same time.
//
// Always returns 0.
DWORD WINAPI Matrix_Multiply(LPVOID my_info)
{
    struct Matrix_Info * info = static_cast<struct Matrix_Info *>(my_info);
    const int m = info->m;
    const int n = info->n;
    const int cx = info->cx;
    const int cy = info->cy;
    vector<float> & C = *info->C;

    // Compute the product tile once. It used to be recomputed (and
    // reallocated) for every one of the m * m output elements.
    float * tile = SSE_4_Matrix(info);
    for (int i = 0; i < m; i ++)
    {
        for (int j = 0; j < m; j ++)
        {
            C[(i + cx) * n + j + cy] += tile[i*m + j];
        }
    }
    // The tile comes from new[], so it must be released with delete[].
    delete[] tile;
    return 0;
}

void SSE_Matrix_Multiply(vector<float> * left, vector<float> * right, vector<float> * result, int thread_num)
{
    struct Matrix_Info * my_info = new struct  Matrix_Info[thread_num];
    HANDLE * handle = new HANDLE[thread_num];
    for (int i = 0; i < thread_num; i ++)
    {
        my_info[i].A = left;
        my_info[i].B = right;
        my_info[i].C = result;
        my_info[i].n = dim;
        my_info[i].m = 4;
    }
    int id = 0;
    // Matrix A row:i, column:j
    for (int i = 0; i < dim; i += 4)
    {
        for (int j = 0; j < dim; j += 4)
        {
            // Matrix B row:j column:k
            for (int k = 0; k < dim; k += 4)
            {
                my_info[id].ax = i;
                my_info[id].ay = j;
                my_info[id].bx = j;
                my_info[id].by = k;
                my_info[id].cx = i;
                my_info[id].cy = k;
                if (id < thread_num)
                {
                     handle[id] = CreateThread(NULL, 0, Matrix_Multiply, (LPVOID)(&my_info[id]), 0, 0 );
                     id ++;
                }
                if (id == thread_num)
                {
                    for (int _i = 0; _i < id; _i ++)
                        WaitForMultipleObjects(thread_num, &handle[_i], TRUE, INFINITE);
                    id = 0;
                }
            }
        }
    }
    free(my_info);
    free(handle);
}

#endif

所以，当dim为4096时，当exe运行时，大概占用了192M的内存，但是在我得到结果之前，内存使用率从20%增加到了97%。

我的操作系统是Windows 10，IDE是Visual Studio 2012，我的内存是8G。

【问题讨论】：

创建线程需要内存.. 是的.. 这是预期的和设计上的.. 甚至可能在很大程度上被称为必要.. 没有仔细查看代码，但你确实似乎正在创建很多线程..你真的需要吗？仅供参考，创建比机器上的内核数量更多的线程不会对您的吞吐量有太大帮助..实际上会损害性能和内存（如您所见）。因此，将线程数保持在合理的数量（核心数的 2-3 倍）。
您在线程中执行了大量的 malloc 和 free 操作。这可能会增加内存开销（如果内存系统太忙而无法清理已释放的块），而且肯定会损害线程性能，因为 malloc 不是并行执行的（因此 4 个线程中会有 3 个在等待另外 1 个）。只需将这 16 个元素的浮点数组存储在 Matrix_Info 结构中，不要在线程代码中分配任何内存。（所有这些 CreateThread 调用也会产生很多开销。）
@1201ProgramAlarm，我必须释放一些内存，否则可能会导致内存泄漏。
@AlexanderYau new vector<…> 是不必要的。只需将它们声明为局部变量：std::vector<float> left(size); ….
您的代码中有无数错误。您认为您的代码很好但缺陷在于 Windows 的信念是您真正的问题。如果您不查看自己的代码，您将如何发现自己的错误。

标签： c++ windows multithreading winapi memory

【解决方案1】：

您在调用 CreateThread 时传入了 dwStackSize = 0，这会使每个线程使用默认的 1MB 堆栈大小（default stack size）。任务管理器不会正确显示线程所需的内存，您可以使用以下小测试应用程序对其进行测试：

#include <windows.h>
#include <stdio.h>
#include <conio.h>

// Thread body for the thread-count experiment: the thread suspends itself so
// that it stays alive without consuming CPU time. `pData` is not used.
DWORD WINAPI thread(LPVOID pData)
{
   const HANDLE self = GetCurrentThread();

   // SuspendThread reports failure as (DWORD)-1; keep retrying until the
   // suspension succeeds.
   for (;;)
   {
      if (SuspendThread(self) != (DWORD)-1)
      {
         break;
      }
   }

   return (0);
}

int main()
{
   int cnt;
   HANDLE hThread;
   DWORD tid;

   cnt = 0;
   do
   {
      hThread = CreateThread(NULL, 4096, thread, NULL, 0, &tid);
      if (hThread != NULL)
      {
         cnt++;
      }

      SleepEx(10, FALSE);
   }
   while (hThread != NULL);

   printf("%d threads; error: %d\n", cnt, GetLastError());

   printf("\n\nend.");
   getch();

   return (0);
}

我能够在我的机器上创建 1597 个线程，因此我的应用程序的内存使用量应该在 1.5GB 左右，但任务管理器显示只有 45MB 左右。

我的猜测是您观察到的行为是由任务管理器的显示引起的...这可能值得进一步调查...

关于您的实施的另一件事：

在创建新线程之前，您会等待所有线程结束。您可以考虑设置 bWaitAll = FALSE 并检查 WaitForMultipleObjects 的返回值，这样一旦有线程结束就可以立即创建一个新线程，而不必等待所有线程都处理完毕...
如前所述，您不应分配像float[16] 这样的少量内存。想办法避免这种情况...

【讨论】：

任务管理器正确显示线程使用的内存。您被限制为 1597 个线程，因为您正在编译 32 位并且用完 地址空间。默认线程占用大约 8 kB 的内存和正好 1 MB 的地址空间。（确切的内存使用取决于运行时库和线程局部变量）
这个答案不正确且不完整，只是遗漏了问题代码中的一连串错误。
You're waiting for all threads to end before creating new threads. You may want to check set bWaitAll = FALSE and evaluate the return value of WaitForMultipleObjects then create a new thread right away instead of waiting for all threads to finish processing. 我认为这对我来说是个好建议。