【发布时间】:2014-10-18 06:14:32
【问题描述】:
我是 CUDA 的新手,所以如果我犯了任何愚蠢的错误,我很抱歉,但这让我感到困惑。以下代码非常适用于最多 620 个元素的数组。当我们将 NV def(涡旋数)从 621 更改为更高时,内核中的所有数组都变为 NAN。我希望有人能解释一下。
#include <stdio.h>
#include <time.h>
#define NP 20000
#define DT 0.01
#define NV 620 // Fails if 621 or larger
#define cudaCheckErrors(msg) \
do { \
cudaError_t __err = cudaGetLastError(); \
if (__err != cudaSuccess) { \
fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
msg, cudaGetErrorString(__err), \
__FILE__, __LINE__); \
fprintf(stderr, "*** FAILED - ABORTING\n"); \
exit(1); \
} \
} while (0)
__device__ float d_x0[NV];
__device__ float d_y0[NV];
__global__ static void calc(float *d_x, float *d_y, float Lx, float Ly ){
int i = blockDim.x * blockIdx.x + threadIdx.x;
float fx, fy, t0, t1, t2, t3, t4, dx, dy, pi = acos(-1.0);
int j, n;
if (i<NV) {
// For array error detection
if (isnan(d_x0[i])) printf(" dx(%d)!",i);
if (isnan(d_y0[i])) printf(" dy(%d)!",i);
if (isnan(d_x[i])) printf(" x(%d)!",i);
if (isnan(d_y[i])) printf(" y(%d)!",i);
fx = 0.0; fy = 0.0;
for (j = 0 ; j < NV ; j++){
dx = d_x0[i] - d_x0[j];
dy = d_y0[i] - d_y0[j];
t0 = 2.0 * dy / Ly;
t1 = sin(2.0 * pi * dx / Lx);
t3 = cos(2.0 * pi * dx / Lx);
for (n = -10 ; n <= 10 ; n++){
if (n == 0){
if (j != i){
t2 = cosh(2.0 * pi * Ly / Lx * (dy / Ly + n));
t4 = sinh(2.0 * pi * Ly/Lx * (dy / Ly + n));
fx = fx + t1 / (t2 - t3);
fy = fy + t4 / (t2 - t3);
}
}
else{
t2 = cosh(2.0 * pi * Ly / Lx * (dy / Ly + n));
t4 = sinh(2.0 * pi * Ly/Lx * (dy / Ly + n));
fx = fx + t1 / (t2 - t3);
fy = fy + t4 / (t2 - t3);
}
}
fy = fy - t0;
}
fx = fx * pi / Lx;
fy = fy * pi / Lx;
d_x[i] = d_x0[i] + fx * DT;
d_y[i] = d_y0[i] + fy * DT;
// Clip box
if(d_x[i] > Lx) d_x[i] = d_x[i] - (abs(d_x[i] / Lx) * Lx);
if(d_x[i] < 0.0) d_x[i] = d_x[i] + ((abs(d_x[i] / Lx) + 1.0) * Lx);
if(d_y[i] > Ly) d_y[i] = d_y[i] - (abs(d_y[i] / Ly) * Ly);
if(d_y[i] < 0.0) d_y[i] = d_y[i] + ((abs(d_y[i] / Ly) + 1.0) * Ly);
}
}
__global__ static void update(float *d_x, float *d_y ){
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i<NV) {
d_x0[i] = d_x[i];
d_y0[i] = d_y[i];
}
}
int main(int argc,char **argv) {
float Lx, Ly, dv;
int i, k;
int size = (NV) * sizeof(float);
float* x = (float*)malloc(size);
float* y = (float*)malloc(size);
float* x0 = (float*)malloc(size);
float* y0 = (float*)malloc(size);
dv = 0.12 * 16.0;
Lx = sqrt(2.0 / 3.0 * sqrt(3.0) * NV / dv);
Ly = Lx * sqrt(3.0) / 2.0;
for(i=0 ; i < NV ; i++){
x0[i] = Lx * (rand() % 1000)/1000;
y0[i] = Ly * (rand() % 1000)/1000;
}
// GPU mem management
float *d_x = NULL, *d_y = NULL;
cudaMalloc((void**)&d_x, size);
cudaCheckErrors("cudaMalloc fail 1");
cudaMalloc((void**)&d_y, size);
cudaCheckErrors("cudaMalloc fail 2");
cudaMemcpyToSymbol(d_x0, x0, size);
cudaCheckErrors("cudaMemcpyToSymbol fail 1");
cudaMemcpyToSymbol(d_y0, y0, size);
cudaCheckErrors("cudaMemcpyToSymbol fail 2");
int threadsPerBlock = 512;
int blocksPerGrid = (NV + threadsPerBlock - 1) / threadsPerBlock;
for(k = 0; k < NP ; k++){
calc<<<blocksPerGrid, threadsPerBlock>>>( d_x, d_y, Lx, Ly);
cudaCheckErrors("kernel 1 call fail");
cudaDeviceSynchronize();
update<<<blocksPerGrid, threadsPerBlock>>>( d_x, d_y);
cudaCheckErrors("kernel 2 call fail");
if (k%((NP)/200)==0) {
cudaMemcpy(x, d_x, size, cudaMemcpyDeviceToHost);
cudaCheckErrors("cudaMemCopy fail 1");
cudaMemcpy(y, d_y, size, cudaMemcpyDeviceToHost);
cudaCheckErrors("cudaMemCopy fail 2");
printf("(%d%%) ",100*k/NP);
for(i = 1 ; i <= 5 ; i++) printf(",%5.2f,%5.2f ", x[i], y[i]);
printf("\n\n");
}
}
cudaMemcpy(x, d_x, size, cudaMemcpyDeviceToHost);
cudaCheckErrors("cudaMemcpy fail 1");
cudaMemcpy(y, d_y, size, cudaMemcpyDeviceToHost);
cudaCheckErrors("cudaMemcpy fail 2");
cudaMemcpyFromSymbol(x0, d_x0, size);
cudaCheckErrors("cudaMemcpyFromSymbol fail 1");
cudaMemcpyFromSymbol(y0, d_y0, size);
cudaCheckErrors("cudaMemcpyFromSymbol fail 2");
cudaFree(d_x);
cudaFree(d_y);
return 0;
}
我尝试更改块和网格结构,使用 -arch=sm_35 -arch=sm_30 和 --cudart=shared 选项进行编译,甚至将数组从 float 更改为 double,但没有任何效果。
【问题讨论】: