【发布时间】:2018-07-17 08:27:01
【问题描述】:
我对 CUDA 编程很陌生,我想尝试使用 Parallel Reduction 实现矩阵乘法。我想出了这段代码,想澄清一下:
- 代码返回错误结果的原因。
- 为什么它的运行时间比CUDA C Programming Guide 第 3 章第 25 页中描述的使用共享内存的方法要长得多。
作为参考,我在计算能力为 2.1 的 NVIDIA GeForce GTX 675M 上运行它。
#include <cuda_runtime_api.h>
#include "device_launch_parameters.h"
#include <cuda.h>
#include <device_functions.h>
#include "cuda_runtime.h"
#include <stdlib.h>
#include <stdio.h>
#include <time.h>
#include <math.h>
#define BLOCK_OPT 1024
typedef struct {
int width;
int height;
int stride;
float* elements;
}Matrix;
__global__ void MatMulOPT(Matrix A, Matrix B, Matrix C)
{
__shared__ float result[BLOCK_OPT];
int e = threadIdx.x;
int row = blockIdx.x;
int col = blockIdx.y;
result[e] = A.elements[row*A.stride + e] * B.elements[e*B.stride + col];
__syncthreads();
if (e < 512)
{
result[e] += result[e + 512];
}
__syncthreads();
if (e < 256)
{
result[e] += result[e + 256];
}
__syncthreads();
if (e < 128)
{
result[e] += result[e + 128];
}
__syncthreads();
if (e < 64)
{
result[e] += result[e + 64];
}
__syncthreads();
if (e < 32)
{
result[e] += result[e + 32];
result[e] += result[e + 16];
result[e] += result[e + 8];
result[e] += result[e + 4];
result[e] += result[e + 2];
result[e] += result[e + 1];
}
if (e == 0)C.elements[row*C.stride + col] = result[0];
}
void MatMulCPU(Matrix A, Matrix B, Matrix C)
{
for (int i = 0; i < A.height; i++)
{
for (int j = 0; j < B.width; j++)
{
for (int k = 0; k < B.height; k++)
{
C.elements[i*C.stride + j] += A.elements[i*A.stride+k] * B.elements[k*B.stride + j];
}
}
}
}
float randomFloat()
{
return (float)rand() / (float)RAND_MAX;
}
int main()
{
clock_t begin, end;
srand(time(NULL));
//Common Setup
float cpu_t = 0, gpu_t = 0;
int x = 1024;
int N = x * x;
size_t size = N * sizeof(float);
Matrix A;
A.width = x;
A.stride = x;
A.height = x;
A.elements = (float*)malloc(size);
for (int i = 0; i < N; i++)
A.elements[i] = randomFloat();
Matrix B;
B.width = x;
B.stride = x;
B.height = x;
B.elements = (float*)malloc(size);
for (int j = 0; j < N; j++)
B.elements[j] = randomFloat();
Matrix C;
C.width = x;
C.stride = x;
C.height = x;
C.elements = (float*)malloc(size);
for (int k = 0; k < N; k++)
C.elements[k] = 0;
Matrix D;
D.width = x;
D.stride = x;
D.height = x;
D.elements = (float*)malloc(size);
for (int l = 0; l < N; l++)
D.elements[l] = 0;
//Execute CPU code & time it
begin = clock();
MatMulCPU(A, B, D);
end = clock();
cpu_t = (float)end - begin;
// GPU setup
Matrix d_A, d_B, d_C;
d_A.width = x;
d_A.stride = x;
d_A.height = x;
d_B.width = x;
d_B.stride = x;
d_B.height = x;
d_C.width = x;
d_C.stride = x;
d_C.height = x;
cudaMalloc(&d_A.elements, size);
cudaMemcpy(d_A.elements, A.elements, size, cudaMemcpyHostToDevice);
cudaMalloc(&d_B.elements, size);
cudaMemcpy(d_B.elements, B.elements, size, cudaMemcpyHostToDevice);
cudaMalloc(&d_C.elements, size);
cudaMemcpy(d_C.elements, C.elements, size, cudaMemcpyHostToDevice);
//Special Parameters
int optBlockSize = BLOCK_OPT;
dim3 optDimGrid(x, x);
//Execute GPU Kernel and time it
begin = clock();
cudaThreadSynchronize();
MatMulOPT << <optDimGrid, optBlockSize >> > (d_A, d_B, d_C);
cudaThreadSynchronize();
end = clock();
gpu_t = (float)end - begin;
cudaMemcpy(C.elements, d_C.elements, size, cudaMemcpyDeviceToHost);
//Do a memcheck
bool passed = true;
for (int k = 0; k < N; k++)
{
//printf("%f ",C.elements[k]);
if (fabs(C.elements[k] -D.elements[k] ) > 1e-6)
{
passed = false;
printf("\nFAIL\n");
printf("C[%d] = %f, D[%d] = %f\n",k,C.elements[k],k,D.elements[k]);
break;
}
}
printf("\n");
if (passed)
printf("PASSED\n");
printf("CPU Elapsed Time: %f\n", cpu_t);
printf("GPU Elapsed Time: %f\n", gpu_t);
//Clear all GPU memory
cudaFree(d_A.elements);
cudaFree(d_B.elements);
cudaFree(d_C.elements);
//Clear all CPU memory
free(A.elements);
free(B.elements);
free(C.elements);
}
【问题讨论】: