【发布时间】:2020-05-21 22:48:40
【问题描述】:
我正在使用 VS2019 并拥有 NVIDIA GeForce GPU。我尝试了此链接中的代码:https://towardsdatascience.com/writing-lightning-fast-code-with-cuda-c18677dcdd5f
该帖子的作者声称使用 CUDA 后获得了加速。但在我这里,串行版本大约需要 7 毫秒,而 CUDA 版本大约需要 28 毫秒。为什么这段代码的 CUDA 版本反而更慢?我使用的代码如下:
/**
 * Element-wise in-place vector add: y[i] = x[i] + y[i] for every 0 <= i < n.
 *
 * Launch layout: 1-D grid of 1-D blocks (only the .x components are read).
 * The kernel uses a grid-stride loop, so any grid/block size produces the
 * same result, including a grid that is smaller or larger than n.
 * No shared memory is used.
 *
 * @param n  number of elements to process; n <= 0 does nothing
 * @param x  input vector, read only by this kernel
 * @param y  input/output vector, updated in place
 *
 * NOTE(review): x and y must be dereferenceable from device code (e.g.
 * cudaMalloc / cudaMallocManaged allocations) - the kernel cannot check this.
 */
__global__
void add(int n, float* x, float* y)
{
    // Do the index arithmetic in 64 bits. blockIdx.x * blockDim.x and
    // blockDim.x * gridDim.x are 32-bit products that can wrap for large
    // grids, and "i += stride" on an int can overflow (undefined behaviour)
    // once i + stride exceeds INT_MAX, which happens for n close to 2^31.
    const long long stride = static_cast<long long>(blockDim.x) * gridDim.x;
    long long i = static_cast<long long>(blockIdx.x) * blockDim.x + threadIdx.x;
    for (; i < n; i += stride)
        y[i] = x[i] + y[i];
}
/**
 * Host-side reference implementation of the vector add.
 *
 * Walks both arrays front to back and stores x[i] + y[i] into y[i] for the
 * first n elements. A count of zero or less leaves both arrays untouched.
 *
 * @param n  number of elements to process
 * @param x  input array (only read)
 * @param y  array that is read and overwritten with the sums
 */
void addSerial(int n, float* x, float* y)
{
    const float* src = x;
    float* dst = y;
    int remaining = n;
    while (remaining > 0) {
        *dst = *src + *dst;
        ++src;
        ++dst;
        --remaining;
    }
}
/**
 * Benchmark driver: times the serial vector add on the host and the CUDA
 * kernel on the device, each over 1 << 20 floats, and prints the maximum
 * deviation from the expected value 3.0 together with the elapsed time in
 * milliseconds.
 *
 * Changes compared with a naive timing of the first kernel launch:
 *  - every CUDA runtime call and the kernel launch are checked;
 *  - managed memory is prefetched only when the device reports
 *    cudaDevAttrConcurrentManagedAccess (cudaMemPrefetchAsync fails otherwise);
 *  - a zero-element warm-up launch plus a synchronize run before the clock
 *    starts, so pending prefetch/migration work and first-launch overhead are
 *    not attributed to the kernel;
 *  - durations are reported as fractional milliseconds instead of being
 *    truncated to whole milliseconds.
 *
 * @return 0 on success, 1 if any CUDA call fails.
 */
int main()
{
    using Clock = std::chrono::high_resolution_clock;
    using Millis = std::chrono::duration<double, std::milli>;

    // Reports a failed CUDA runtime call on stderr; true means success.
    auto cudaOk = [](cudaError_t status, const char* what) -> bool {
        if (status == cudaSuccess)
            return true;
        std::cerr << "CUDA error in " << what << ": "
                  << cudaGetErrorString(status) << std::endl;
        return false;
    };

    // ---- Serial reference run -------------------------------------------
    const int NSerial = 1 << 20;
    float* xSerial = new float[NSerial];
    float* ySerial = new float[NSerial];
    for (int i = 0; i < NSerial; i++) {
        xSerial[i] = 1.0f;
        ySerial[i] = 2.0f;
    }
    const auto t1Serial = Clock::now();
    addSerial(NSerial, xSerial, ySerial);
    const auto t2Serial = Clock::now();
    const double durationSerial = Millis(t2Serial - t1Serial).count();
    float maxErrorSerial = 0.0f;
    for (int i = 0; i < NSerial; i++)
        maxErrorSerial = fmax(maxErrorSerial, fabs(ySerial[i] - 3.0f));
    std::cout << "Max error Serial: " << maxErrorSerial << std::endl;
    std::cout << "durationSerial: " << durationSerial << std::endl;
    delete[] xSerial;
    delete[] ySerial;

    // ---- CUDA run ---------------------------------------------------------
    const int N = 1 << 20;
    const size_t bytes = static_cast<size_t>(N) * sizeof(float);
    float* x = nullptr;
    float* y = nullptr;
    // Common failure exit: release whatever was allocated (cudaFree accepts
    // a null pointer) and signal the error through the exit code.
    auto fail = [&]() -> int {
        cudaFree(x);
        cudaFree(y);
        return 1;
    };

    if (!cudaOk(cudaMallocManaged(&x, bytes), "cudaMallocManaged(x)"))
        return fail();
    if (!cudaOk(cudaMallocManaged(&y, bytes), "cudaMallocManaged(y)"))
        return fail();
    for (int i = 0; i < N; i++) {
        x[i] = 1.0f;
        y[i] = 2.0f;
    }

    int device = -1;
    if (!cudaOk(cudaGetDevice(&device), "cudaGetDevice"))
        return fail();

    // cudaMemPrefetchAsync to a GPU requires a non-zero
    // cudaDevAttrConcurrentManagedAccess; skip the prefetch where the
    // attribute is 0 instead of issuing a call that is documented to fail.
    int concurrentManagedAccess = 0;
    if (!cudaOk(cudaDeviceGetAttribute(&concurrentManagedAccess,
                                       cudaDevAttrConcurrentManagedAccess, device),
                "cudaDeviceGetAttribute"))
        return fail();
    if (concurrentManagedAccess != 0) {
        if (!cudaOk(cudaMemPrefetchAsync(x, bytes, device, 0), "cudaMemPrefetchAsync(x)"))
            return fail();
        if (!cudaOk(cudaMemPrefetchAsync(y, bytes, device, 0), "cudaMemPrefetchAsync(y)"))
            return fail();
    }

    // Warm-up: a zero-element launch leaves x and y untouched (the kernel
    // loop never executes) but absorbs one-time launch costs. The following
    // synchronize also waits for the asynchronous prefetches, so none of that
    // work lands inside the timed region.
    add<<<1, 1>>>(0, x, y);
    if (!cudaOk(cudaGetLastError(), "warm-up launch"))
        return fail();
    if (!cudaOk(cudaDeviceSynchronize(), "warm-up synchronize"))
        return fail();

    const int blockSize = 1024;
    const int numBlocks = (N + blockSize - 1) / blockSize;  // ceil-div
    const auto t1 = Clock::now();
    add<<<numBlocks, blockSize>>>(N, x, y);
    // Collect both statuses before reading the clock so that error reporting
    // does not add to the measured time.
    const cudaError_t launchStatus = cudaGetLastError();
    const cudaError_t syncStatus = cudaDeviceSynchronize();
    const auto t2 = Clock::now();
    if (!cudaOk(launchStatus, "add kernel launch"))
        return fail();
    if (!cudaOk(syncStatus, "cudaDeviceSynchronize"))
        return fail();
    const double duration = Millis(t2 - t1).count();

    // The synchronize above has completed, so the host may read y again.
    float maxError = 0.0f;
    for (int i = 0; i < N; i++)
        maxError = fmax(maxError, fabs(y[i] - 3.0f));
    std::cout << "Max error: " << maxError << std::endl;
    std::cout << "duration CUDA: " << duration << std::endl;

    const bool freedX = cudaOk(cudaFree(x), "cudaFree(x)");
    const bool freedY = cudaOk(cudaFree(y), "cudaFree(y)");
    return (freedX && freedY) ? 0 : 1;
}
【问题讨论】:
-
7 毫秒可能太短了,看不出任何效果。请尝试增加工作量。
-
当我改为 NSerial = N = 1<<30 时,串行版本只需要 4 秒,而 CUDA 版本需要 19 秒。
-
这可能是因为将数据从主机内存(RAM)传输到 GPU 所需的时间,比在 CPU 上完成这种简单计算还要长。这取决于硬件。
-
循环运行内核约 100 次并取平均值。您的同步调用也会影响性能。
-
对此类问题的规范答案似乎是“请运行以 Release(发布)模式构建的代码,而不是 Debug(调试)模式构建的代码”。
标签: c++ parallel-processing cuda gpu