【发布时间】:2021-05-30 21:45:20
【问题描述】:
我正在使用 GTX Titan X GPU、CUDA 8.0 和驱动程序版本 367.48 的系统。当我使用 nvidia-smi 时,我的 GPU 已正确列出。
我正在使用下面的代码对 Pi 进行数值逼近,并测量代码的执行时间 5000 次。但是,内核返回 0.0 作为近似结果。为什么会这样?
#include "cuda_runtime.h"
#include <stdio.h>
#define ITERATIONS 96000000
const int threads = 256;
// Synchronous error checking call. Enable with nvcc -DEBUG
inline void checkCUDAError(const char *fileName, const int line)
{
#ifdef DEBUG
cudaThreadSynchronize();
cudaError_t error = cudaGetLastError();
if(error != cudaSuccess)
{
printf("Error at %s: line %i: %s\n", fileName, line, cudaGetErrorString(error));
exit(-1);
}
#endif
}
__global__ void integrateSimple(float *sum)
{
__shared__ float ssums[threads];
// Each thread computes its own sum.
int global_idx = threadIdx.x + blockIdx.x * blockDim.x;
if(global_idx < ITERATIONS)
{
float step = 1.0f / ITERATIONS;
float x = (global_idx + 0.5f) * step;
ssums[threadIdx.x] = 4.0f / (1.0f + x * x);
}
else
{
ssums[threadIdx.x] = 0.0f;
}
// The 1st thread will gather all sums from all other threads of this block into one
__syncthreads();
if(threadIdx.x == 0)
{
float local_sum = 0.0f;
for(int i = 0; i < threads; ++i)
{
local_sum += ssums[i];
}
sum[blockIdx.x] = local_sum;
}
}
int main()
{
const float PI = 3.14159265358979323846264;
int deviceCount = 0;
printf("Starting...");
cudaError_t error = cudaGetDeviceCount(&deviceCount);
if (error != cudaSuccess)
{
printf("cudaGetDeviceCount returned %d\n-> %s\n", (int)error, cudaGetErrorString(error));
return 1;
}
deviceCount == 0 ? printf("There are no available CUDA device(s)\n") : printf("%d CUDA Capable device(s) detected\n", deviceCount);
/*--------- Simple Kernel ---------*/
int blocks = (ITERATIONS + threads - 1) / threads;
float *sum_d;
float step = 1.0f / ITERATIONS;
for (int i = 0; i < 5000; ++i)
{
// Allocate device memory
cudaMallocManaged((void **)&sum_d, blocks * sizeof(float));
// CUDA events needed to measure execution time
cudaEvent_t start, stop;
float gpuTime;
// Start timer
cudaEventCreate(&start);
cudaEventCreate(&stop);
cudaEventRecord(start, 0);
/*printf("\nCalculating Pi using simple GPU kernel over %i intervals...\n", (int)ITERATIONS);*/
integrateSimple<<<blocks, threads>>>(sum_d);
cudaDeviceSynchronize(); // wait until the kernel execution is completed
checkCUDAError(__FILE__, __LINE__);
// Stop timer
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&gpuTime, start, stop);
// Sum result on host
float piSimple = 0.0f;
for (int i = 0; i < blocks; i++)
{
piSimple += sum_d[i];
}
piSimple *= step;
cudaFree(sum_d);
// Stop timer
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&gpuTime, start, stop);
// Print execution times
/*printf("\n======================================\n\n");*/
printf("%.23lf,%.23lf,%f", piSimple, fabs(piSimple - PI), gpuTime/1000);
printf("\n");
}
// Reset Device
cudaDeviceReset();
return 0;
}
最后几行的输出
0.00000000000000000000000,3.14159274101257324218750,0.000009 0.00000000000000000000000,3.14159274101257324218750,0.000009 0.00000000000000000000000,3.14159274101257324218750,0.000009 0.00000000000000000000000,3.14159274101257324218750,0.000008 0.00000000000000000000000,3.14159274101257324218750,0.000008 0.00000000000000000000000,3.14159274101257324218750,0.000008 P>
另外,当我编译时收到此警告:
nvcc 警告:“compute_20”、“sm_20”和“sm_21”架构 > 已弃用,可能会在未来版本中删除(使用 -Wno-deprecated-gpu-targets 来抑制警告)。
【问题讨论】:
-
它在我的系统上运行良好。典型的输出线;
3.14135432243347167968750,0.00023841857910156250000,0.009575. -
@einpoklum 是的,这是正确的。我还尝试在另一台 GTX 1060 (6GB) 机器上运行,但不明白为什么在那台机器上无法运行。
-
您是否在您的机器上编译到正确的 GPU 架构?
-
@AnderBiguri 老实说,我对 CUDA 还是很陌生。我用过:
nvcc pi_cuda.cu -o pi_cudapi_cuda.cu 是文件。