指向主机结构内的设备数组的指针答案

【问题标题】：Pointer to device array inside host struct指向主机结构内的设备数组的指针
【发布时间】：2017-04-01 19:11:44
【问题描述】：

我正在尝试创建一个结构，它将主机和设备数组保存在一个地方，并且应该驻留在主机上。我后来打算将它扩展为链表的一个元素。基本结构如下所示：

/* Pairs a pointer to a host-side array with a pointer to a device-side array. */
struct Data {
    double *h;  /* array of doubles in host memory */
    double *d;  /* array of doubles in device memory */
};
typedef struct Data Data;

其中 *h 指向主机上的双精度数组，*d 指向设备上的双精度数组。

关于将整个结构复制到设备 (CUDA cudaMemcpy Struct of Arrays) 有各种答案，但没有一个完全符合我的需要。我有以下代码，但不断收到非法内存访问错误。

#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include "cuda.h"

/*
* CUDA Error stuff
*/

/*
 * Terminates the program when a CUDA runtime call did not succeed.
 *
 * err  - status returned by the CUDA call being checked
 * file - source file name to report (HANDLE_ERROR passes __FILE__)
 * line - source line to report (HANDLE_ERROR passes __LINE__)
 *
 * For any status other than cudaSuccess the CUDA error string and the
 * location are printed and the process exits with EXIT_FAILURE; on
 * cudaSuccess the function simply returns.
 */
static void HandleError( cudaError_t err,
                         const char *file,
                         int line ) {
    if (err == cudaSuccess) {
        return;
    }

    const char *description = cudaGetErrorString( err );
    printf( "%s in %s at line %d\n", description, file, line );
    exit( EXIT_FAILURE );
}
/* Checks a CUDA runtime call and reports the call site on failure. */
#define HANDLE_ERROR( err ) (HandleError( err, __FILE__, __LINE__ ))


/*
 * Terminates the program when a host allocation returned NULL.
 *
 * a - pointer expression to test.
 *
 * The argument is parenthesized so that compound expressions (for example
 * `cond ? p : q`) are compared against NULL as a whole, and the body is
 * wrapped in do { } while (0) so that `HANDLE_NULL(p);` is one statement and
 * can be used inside an unbraced if/else.
 */
#define HANDLE_NULL( a ) do { if ((a) == NULL) { \
                            printf( "Host memory failed in %s at line %d\n", \
                                    __FILE__, __LINE__ ); \
                            exit( EXIT_FAILURE );} } while (0)

/*
 * Prints a diagnostic message on its own line and hands back the given code.
 *
 * message   - text to print to stdout; a NULL pointer is printed as "(null)"
 *             rather than being handed to printf's %s, which is undefined.
 * errorCode - returned unchanged, so callers can write
 *             `return errMsg("...", code);`.
 *
 * Returns errorCode.
 */
int errMsg(const char *message, int errorCode)
{
    printf("%s\n", message != NULL ? message : "(null)");
    return errorCode;
}

/*
 * Host-resident record that keeps two array pointers side by side.
 * In main() `h` receives a malloc() buffer and `d` a cudaMalloc() buffer.
 */
struct Data {
    double *h;  /* array of doubles in host memory */
    double *d;  /* array of doubles in device memory */
};
typedef struct Data Data;

/*
 * Stores 2 into element `tid` of the array reached through d->d, for every
 * thread whose global index is below 100.
 *
 * d - pointer to a Data struct; it is dereferenced here in device code
 *     (d->d), so the struct itself must be readable from the device.
 */
__global__ void kernel(Data *d)
{
    /* Flat global thread index for a 1D launch. */
    int tid = blockIdx.x * blockDim.x + threadIdx.x;
    /* Bounds guard: threads with index 100 or above do nothing. */
    if(tid<100){
        /* Two dereferences: first the struct pointer d, then the array d->d. */
        d->d[tid] = 2;
    }

}

/*
 * Demo driver from the question: fills a 100-element host array, copies it
 * to the device, launches kernel(), then copies the array back and prints
 * one element before and after.
 */
int main()
{
    Data *d;
    /* The struct itself is allocated with malloc(), i.e. in host memory. */
    d = (Data*)malloc(sizeof(Data));

    /* h: host buffer (malloc); d: device buffer (cudaMalloc). Neither malloc
       result is checked for NULL. */
    d->h = (double*)malloc(sizeof(double)*100);
    HANDLE_ERROR( cudaMalloc((void**) &(d->d), 100*sizeof(double)) );

    /* Initialise the host array with 0..99. */
    for(int i=0; i<100; i++){
        d->h[i] = i;
    }

    HANDLE_ERROR( cudaMemcpy(d->d, d->h, 100*sizeof(double), cudaMemcpyHostToDevice) );

    printf("%f\n", d->h[1]);

    /* NOTE(review): `d` is the malloc()'d host pointer from above, and
       kernel() dereferences it (d->d) in device code. 102 threads are
       launched for 100 elements; the kernel's tid<100 guard covers the
       surplus threads. */
    kernel<<<1, 102>>>(d);

    printf("done\n");

    {
    /* Wait for the kernel; a failure is only reported, execution continues. */
    cudaError_t cudaerr = cudaDeviceSynchronize();
    if (cudaerr != cudaSuccess)
        printf("kernel launch failed with error \"%s\"->\n",
               cudaGetErrorString(cudaerr));
    }

    /* Copy the device array back; HANDLE_ERROR exits if this call fails. */
    HANDLE_ERROR( cudaMemcpy(d->h, d->d, 100*sizeof(double), cudaMemcpyDeviceToHost) );
    printf("%f\n", d->h[99]);


    /* No free()/cudaFree() calls are made before returning. */
    return 0;
}

我得到的输出是：

1.000000
done
kernel launch failed with error "an illegal memory access was encountered"->
an illegal memory access was encountered in linkedListGPU.cu at line 77

我怀疑我的指针有点搞砸了。错误处理代码来自Wiley Introduction to CUDA book，如果这里不允许出现代码，我会删除它。

谢谢。

【问题讨论】：

您将主机指针传递给设备并尝试在内核中访问它。这显然是行不通的
谢谢，只需将内核调用更改为 (d->d) 并调整内核代码即可修复它。很抱歉，我现在对设备上的结构感到困惑。

标签： struct cuda

【解决方案1】：

问题在于 d 本身是一个指向主机端分配的结构体的指针（该结构体中包含 d 和 h 两个指针）。当您像下面这样把结构体指针 d 传递给内核时：

kernel<<<1, 102>>>(d);
                   ^
                   this is a pointer to memory on the host

然后尝试在此处取消引用设备代码中的该指针：

    d->...;
     ^ 
     This operator dereferences the pointer to the left of it

就会发生非法内存访问。

至少有两种明显的方法可以解决这个问题：

1. 按值而不是按指针传递结构体。

这是一个例子：

$ cat t1311.cu
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include "cuda.h"

/*
* CUDA Error stuff
*/

/*
 * Terminates the program when a CUDA runtime call did not succeed.
 *
 * err  - status returned by the CUDA call being checked
 * file - source file name to report (HANDLE_ERROR passes __FILE__)
 * line - source line to report (HANDLE_ERROR passes __LINE__)
 *
 * For any status other than cudaSuccess the CUDA error string and the
 * location are written to stderr, so the diagnostic is not mixed into the
 * program's regular stdout output, and the process exits with EXIT_FAILURE.
 * On cudaSuccess the function simply returns.
 */
static void HandleError( cudaError_t err,
                         const char *file,
                         int line ) {
    if (err != cudaSuccess) {
        fprintf( stderr, "%s in %s at line %d\n", cudaGetErrorString( err ),
                 file, line );
        exit( EXIT_FAILURE );
    }
}
/* Checks a CUDA runtime call and reports the call site on failure. */
#define HANDLE_ERROR( err ) (HandleError( err, __FILE__, __LINE__ ))


/*
 * Terminates the program when a host allocation returned NULL.
 *
 * a - pointer expression to test.
 *
 * The argument is parenthesized so that compound expressions (for example
 * `cond ? p : q`) are compared against NULL as a whole, and the body is
 * wrapped in do { } while (0) so that `HANDLE_NULL(p);` is one statement and
 * can be used inside an unbraced if/else.
 */
#define HANDLE_NULL( a ) do { if ((a) == NULL) { \
                            printf( "Host memory failed in %s at line %d\n", \
                                    __FILE__, __LINE__ ); \
                            exit( EXIT_FAILURE );} } while (0)

/*
 * Prints a diagnostic message on its own line and hands back the given code.
 *
 * message   - text to print to stdout; a NULL pointer is printed as "(null)"
 *             rather than being handed to printf's %s, which is undefined.
 * errorCode - returned unchanged, so callers can write
 *             `return errMsg("...", code);`.
 *
 * Returns errorCode.
 */
int errMsg(const char *message, int errorCode)
{
    printf("%s\n", message != NULL ? message : "(null)");
    return errorCode;
}

/*
 * Record that keeps two array pointers side by side.
 * In main() `h` receives a malloc() buffer and `d` a cudaMalloc() buffer.
 */
struct Data {
    double *h;  /* array of doubles in host memory */
    double *d;  /* array of doubles in device memory */
};
typedef struct Data Data;

/*
 * Stores 2 into element `idx` of the array d.d for every thread whose global
 * index is below 100; all other threads return without touching memory.
 *
 * d - Data struct received by value, so the only pointer dereferenced in
 *     device code is the member d.d.
 *
 * Expects a 1D launch (only the .x components are used).
 */
__global__ void kernel(Data d)
{
    const int idx = threadIdx.x + blockDim.x * blockIdx.x;

    /* Surplus threads beyond the 100 elements have nothing to do. */
    if (idx >= 100) {
        return;
    }

    d.d[idx] = 2;
}

/*
 * Demo driver: fills a host array with 0..99, copies it to the device, runs
 * kernel() on it, copies the result back and prints one element before and
 * after the kernel.
 *
 * The Data struct is a local variable passed to the kernel by value, so
 * device code only ever dereferences d.d, which comes from cudaMalloc().
 *
 * Returns 0 on success. A failed host allocation, CUDA call, launch or
 * synchronization terminates the process with EXIT_FAILURE.
 */
int main()
{
    /* Element count; kernel() hard-codes the same bound of 100. */
    const int count = 100;
    const size_t bytes = count * sizeof(double);

    Data d;

    d.h = (double*)malloc(bytes);
    HANDLE_NULL( d.h );  /* stop if the host allocation failed */
    HANDLE_ERROR( cudaMalloc((void**) &(d.d), bytes) );

    for(int i=0; i<count; i++){
        d.h[i] = i;
    }

    HANDLE_ERROR( cudaMemcpy(d.d, d.h, bytes, cudaMemcpyHostToDevice) );

    printf("%f\n", d.h[1]);

    /* 102 threads are launched for 100 elements; the kernel's bounds check
       turns the two surplus threads into no-ops. */
    kernel<<<1, 102>>>(d);
    /* Launch-configuration errors surface via cudaGetLastError(). */
    HANDLE_ERROR( cudaGetLastError() );

    printf("done\n");

    {
    /* Faults raised while the kernel runs are reported by the next
       synchronizing call, so wait here and stop if it failed instead of
       copying back results from a failed kernel. */
    cudaError_t cudaerr = cudaDeviceSynchronize();
    if (cudaerr != cudaSuccess) {
        printf("kernel launch failed with error \"%s\"->\n",
               cudaGetErrorString(cudaerr));
        exit( EXIT_FAILURE );
    }
    }

    HANDLE_ERROR( cudaMemcpy(d.h, d.d, bytes, cudaMemcpyDeviceToHost) );
    printf("%f\n", d.h[99]);

    /* Release the device and host buffers. */
    HANDLE_ERROR( cudaFree(d.d) );
    free(d.h);

    return 0;
}
$ nvcc -arch=sm_35 -o t1311 t1311.cu
$ cuda-memcheck ./t1311
========= CUDA-MEMCHECK
1.000000
done
2.000000
========= ERROR SUMMARY: 0 errors
$

2. 为主机指针 d 所指向的结构体创建一份设备端副本，并把该副本的指针传给内核：

这是一个例子：

$ cat t1311.cu
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include "cuda.h"

/*
* CUDA Error stuff
*/

/*
 * Terminates the program when a CUDA runtime call did not succeed.
 *
 * err  - status returned by the CUDA call being checked
 * file - source file name to report (HANDLE_ERROR passes __FILE__)
 * line - source line to report (HANDLE_ERROR passes __LINE__)
 *
 * For any status other than cudaSuccess the CUDA error string and the
 * location are written to stderr, so the diagnostic is not mixed into the
 * program's regular stdout output, and the process exits with EXIT_FAILURE.
 * On cudaSuccess the function simply returns.
 */
static void HandleError( cudaError_t err,
                         const char *file,
                         int line ) {
    if (err != cudaSuccess) {
        fprintf( stderr, "%s in %s at line %d\n", cudaGetErrorString( err ),
                 file, line );
        exit( EXIT_FAILURE );
    }
}
/* Checks a CUDA runtime call and reports the call site on failure. */
#define HANDLE_ERROR( err ) (HandleError( err, __FILE__, __LINE__ ))


/*
 * Terminates the program when a host allocation returned NULL.
 *
 * a - pointer expression to test.
 *
 * The argument is parenthesized so that compound expressions (for example
 * `cond ? p : q`) are compared against NULL as a whole, and the body is
 * wrapped in do { } while (0) so that `HANDLE_NULL(p);` is one statement and
 * can be used inside an unbraced if/else.
 */
#define HANDLE_NULL( a ) do { if ((a) == NULL) { \
                            printf( "Host memory failed in %s at line %d\n", \
                                    __FILE__, __LINE__ ); \
                            exit( EXIT_FAILURE );} } while (0)

/*
 * Prints a diagnostic message on its own line and hands back the given code.
 *
 * message   - text to print to stdout; a NULL pointer is printed as "(null)"
 *             rather than being handed to printf's %s, which is undefined.
 * errorCode - returned unchanged, so callers can write
 *             `return errMsg("...", code);`.
 *
 * Returns errorCode.
 */
int errMsg(const char *message, int errorCode)
{
    printf("%s\n", message != NULL ? message : "(null)");
    return errorCode;
}

/*
 * Record that keeps two array pointers side by side.
 * In main() `h` receives a malloc() buffer and `d` a cudaMalloc() buffer;
 * a byte-wise copy of the whole record is also placed in device memory.
 */
struct Data {
    double *h;  /* array of doubles in host memory */
    double *d;  /* array of doubles in device memory */
};
typedef struct Data Data;

/*
 * Stores 2 into element `idx` of the array reached through d->d for every
 * thread whose global index is below 100; all other threads return without
 * touching memory.
 *
 * d - pointer to a Data struct. It is dereferenced in device code, so it
 *     must point to a struct the device can read (main() passes the
 *     cudaMalloc'd copy dev_d).
 *
 * Expects a 1D launch (only the .x components are used).
 */
__global__ void kernel(Data *d)
{
    const int idx = threadIdx.x + blockDim.x * blockIdx.x;

    /* Surplus threads beyond the 100 elements have nothing to do. */
    if (idx >= 100) {
        return;
    }

    double *values = d->d;  /* first dereference: the struct pointer itself */
    values[idx] = 2;        /* second dereference: the device array */
}

/*
 * Demo driver: fills a host array with 0..99, copies it to the device, runs
 * kernel() on it, copies the result back and prints one element before and
 * after the kernel.
 *
 * The host-side Data struct `d` is mirrored into a cudaMalloc'd struct
 * `dev_d`, and the kernel receives `dev_d`, so the struct pointer it
 * dereferences refers to device memory.
 *
 * Returns 0 on success. A failed host allocation, CUDA call, launch or
 * synchronization terminates the process with EXIT_FAILURE.
 */
int main()
{
    /* Element count; kernel() hard-codes the same bound of 100. */
    const int count = 100;
    const size_t bytes = count * sizeof(double);

    Data *d, *dev_d;
    d = (Data*)malloc(sizeof(Data));
    HANDLE_NULL( d );  /* stop if the host struct could not be allocated */
    HANDLE_ERROR(cudaMalloc(&dev_d, sizeof(Data)));
    d->h = (double*)malloc(bytes);
    HANDLE_NULL( d->h );  /* stop if the host array could not be allocated */
    HANDLE_ERROR( cudaMalloc((void**) &(d->d), bytes) );

    for(int i=0; i<count; i++){
        d->h[i] = i;
    }

    HANDLE_ERROR( cudaMemcpy(d->d, d->h, bytes, cudaMemcpyHostToDevice) );
    /* Mirror the struct only after d->d has been filled in by cudaMalloc, so
       the device copy carries the device array pointer. The copied h member
       is still a host pointer and is not used by the kernel. */
    HANDLE_ERROR(cudaMemcpy(dev_d, d, sizeof(Data), cudaMemcpyHostToDevice));
    printf("%f\n", d->h[1]);

    /* 102 threads are launched for 100 elements; the kernel's bounds check
       turns the two surplus threads into no-ops. */
    kernel<<<1, 102>>>(dev_d);
    /* Launch-configuration errors surface via cudaGetLastError(). */
    HANDLE_ERROR( cudaGetLastError() );

    printf("done\n");

    {
    /* Faults raised while the kernel runs are reported by the next
       synchronizing call, so wait here and stop if it failed instead of
       copying back results from a failed kernel. */
    cudaError_t cudaerr = cudaDeviceSynchronize();
    if (cudaerr != cudaSuccess) {
        printf("kernel launch failed with error \"%s\"->\n",
               cudaGetErrorString(cudaerr));
        exit( EXIT_FAILURE );
    }
    }

    HANDLE_ERROR( cudaMemcpy(d->h, d->d, bytes, cudaMemcpyDeviceToHost) );
    printf("%f\n", d->h[99]);

    /* Release device allocations first, then the host array and struct. */
    HANDLE_ERROR( cudaFree(d->d) );
    HANDLE_ERROR( cudaFree(dev_d) );
    free(d->h);
    free(d);

    return 0;
}
$ nvcc -arch=sm_35 -o t1311 t1311.cu
$ cuda-memcheck ./t1311
========= CUDA-MEMCHECK
1.000000
done
2.000000
========= ERROR SUMMARY: 0 errors
$

顺便说一句，按照原回答中 “here” 链接所列出的方法（链接在转载时已丢失），您可以把自己的调试过程推进得更远。

【讨论】：