【发布时间】:2016-04-07 17:08:07
【问题描述】:
我正在使用SSE 指令和Windows multi-thread 来实现矩阵乘法。当矩阵的维度很大时,比如1024×1024,需要很长时间才能得到结果。
当它以 4096*4096 运行时,exe 占用的内存很少,比如 192M,但是,我的机器的内存使用率从 20% 增加到 97%,我认为 @987654324 可能有问题@。
以下是我的代码。
main.cpp
#include "sse_matrix.h"
#include <ctime>
int main(int argc, char* argv[])
{
vector<float> * left = new vector<float>(size, 0);
vector<float> * right = new vector<float>(size, 0);
vector<float> * result = new vector<float>(size, 0);
// initialize value
for (int i = 0; i < dim; i ++)
{
for (int j = 0; j < dim; j ++)
{
(*left)[i*dim + j] = j;
(*right)[i*dim + j] = j;
}
}
cout << "1. INFO: value initialized, starting matrix multiplication" << endl;
// calculate the result
clock_t my_time = clock();
SSE_Matrix_Multiply(left, right, result, 4);
cout << "2. INFO: SSE matrix multiplication result has got" << endl;
cout << "3. INFO: time(ms): " << float(clock() - my_time) << endl;
delete left;
delete right;
delete result;
system("pause");
return 0;
}
sse_matrix.h
#ifndef __SSE_MATRIX_H__
#define __SSE_MATRIX_H__
#include <vector>
#include <iostream>
#include <Windows.h>
using std::cin;
using std::cout;
using std::endl;
using std::vector;
const int dim = 4096;
const int size = dim * dim;
struct Matrix_Info
{
vector<float> * A;
int ax, ay;
vector<float> * B;
int bx, by;
vector<float> * C;
int cx, cy;
int m;
int n;
};
void Transpose_Matrix_SSE(float * matrix)
{
__m128 row1 = _mm_loadu_ps(&matrix[0*4]);
__m128 row2 = _mm_loadu_ps(&matrix[1*4]);
__m128 row3 = _mm_loadu_ps(&matrix[2*4]);
__m128 row4 = _mm_loadu_ps(&matrix[3*4]);
_MM_TRANSPOSE4_PS(row1, row2, row3, row4);
_mm_storeu_ps(&matrix[0*4], row1);
_mm_storeu_ps(&matrix[1*4], row2);
_mm_storeu_ps(&matrix[2*4], row3);
_mm_storeu_ps(&matrix[3*4], row4);
}
float * Shuffle_Matrix_Multiply(float * left, float * right)
{
__m128 _t1, _t2, _sum;
_sum = _mm_setzero_ps(); // set all value of _sum to zero
float * _result = new float[16];
float _res[4] = {0};
for (int i = 0; i < 4; i ++)
{
for (int j = 0; j < 4; j ++)
{
_t1 = _mm_loadu_ps(left + i * 4);
_t2 = _mm_loadu_ps(right + j * 4);
_sum = _mm_mul_ps(_t1, _t2);
_mm_storeu_ps(_res, _sum);
_result[i * 4 + j] = _res[0] + _res[1] + _res[2] + _res[3];
}
}
return _result;
}
float * SSE_4_Matrix(struct Matrix_Info * my_info)
{
int m = my_info->m;
int n = my_info->n;
int ax = my_info->ax;
int ay = my_info->ay;
int bx = my_info->bx;
int by = my_info->by;
//1. split Matrix A and Matrix B
float * _a = new float[16];
float * _b = new float[16];
for (int i = 0; i < m; i ++)
{
for (int j = 0; j < m; j ++)
{
_a[i*m + j] = (*my_info->A)[(i + ax) * n + j + ay];
_b[i*m + j] = (*my_info->B)[(i + bx) * n + j + by];
}
}
//2. transpose Matrix B
Transpose_Matrix_SSE(_b);
//3. calculate result and return a float pointer
float * result = Shuffle_Matrix_Multiply(_a, _b);
free(_a);
free(_b);
return result;
}
DWORD WINAPI Matrix_Multiply(LPVOID my_info)
{
int m = ((struct Matrix_Info *)my_info)->m;
int n = ((struct Matrix_Info *)my_info)->n;
int cx = ((struct Matrix_Info *)my_info)->cx;
int cy = ((struct Matrix_Info *)my_info)->cy;
for (int i = 0; i < m; i ++)
{
for (int j = 0; j < m; j ++)
{
float * temp = SSE_4_Matrix(((struct Matrix_Info *)my_info));
(*((struct Matrix_Info *)my_info)->C)[(i + cx) * n + j + cy] += temp[i*m + j];
free(temp);
}
}
return 0;
}
void SSE_Matrix_Multiply(vector<float> * left, vector<float> * right, vector<float> * result, int thread_num)
{
struct Matrix_Info * my_info = new struct Matrix_Info[thread_num];
HANDLE * handle = new HANDLE[thread_num];
for (int i = 0; i < thread_num; i ++)
{
my_info[i].A = left;
my_info[i].B = right;
my_info[i].C = result;
my_info[i].n = dim;
my_info[i].m = 4;
}
int id = 0;
// Matrix A row:i, column:j
for (int i = 0; i < dim; i += 4)
{
for (int j = 0; j < dim; j += 4)
{
// Matrix B row:j column:k
for (int k = 0; k < dim; k += 4)
{
my_info[id].ax = i;
my_info[id].ay = j;
my_info[id].bx = j;
my_info[id].by = k;
my_info[id].cx = i;
my_info[id].cy = k;
if (id < thread_num)
{
handle[id] = CreateThread(NULL, 0, Matrix_Multiply, (LPVOID)(&my_info[id]), 0, 0 );
id ++;
}
if (id == thread_num)
{
for (int _i = 0; _i < id; _i ++)
WaitForMultipleObjects(thread_num, &handle[_i], TRUE, INFINITE);
id = 0;
}
}
}
}
free(my_info);
free(handle);
}
#endif
所以,当dim为4096时,当exe运行时,大概占用了192M的内存,但是在我得到结果之前,内存使用率从20%增加到了97%。
我的操作系统是Windows 10,IDE是Visual Studio 2012,我的内存是8G。
【问题讨论】:
-
创建线程需要内存.. 是的.. 这是预期的和设计上的.. 甚至可能在很大程度上被称为必要.. 没有仔细查看代码,但你确实似乎正在创建很多线程..你真的需要吗?仅供参考,创建比机器上的内核数量更多的线程不会对您的吞吐量有太大帮助..实际上会损害性能和内存(如您所见)。因此,将线程数保持在合理的数量(核心数的 2-3 倍)。
-
您在线程中执行了大量的 malloc 和 free 操作。这可能会增加内存开销(如果内存系统太忙而无法清理已释放的块),但肯定会破坏线程性能,因为 malloc 不是多线程的(因此 3 个线程将等待其中的 1)。只需将这 16 个浮点数组存储在 Matrix_Info 结构中,不要在线程代码中分配任何内容。 (所有这些 CreateThread 调用也会产生很多开销。)
-
@1201ProgramAlarm,我必须释放一些内存,否则可能会导致内存泄漏。
-
@AlexanderYau
new vector<…>是不必要的。只需将它们声明为本地人:std::vector<float> left(size); …. -
您的代码中有无数错误。您认为您的代码很好但缺陷在于 Windows 的信念是您真正的问题。如果您不查看自己的代码,您将如何发现自己的错误。
标签: c++ windows multithreading winapi memory