设备代码中的错误 memcpy答案

【问题标题】：Error memcpy in device code设备代码中的错误 memcpy
【发布时间】：2016-01-19 19:26:24
【问题描述】：

我编写了一个代码，获取可能性向量的第一个 _var 位置（即，矩阵 _size*_var 与 _var=3 和 _size=27）并在我的内核中调用此函数（32 个线程，即每个线程都有一个对象) 但我没有得到函数的任何返回值，也没有 NULL 指针。
程序正常退出，但内核中的 printf 行没有执行或显示（即使使用 sm_20 或更高版本编译），就好像程序之前停止了一样。
dataIntern.h：

#include <math.h>
#include <stdlib.h>
#include <stdio.h>
#define _MIN -1
#define _MAX 1

#ifdef __CUDACC__
#define CUDA_CALLABLE_MEMBER __host__ __device__
#else
#define CUDA_CALLABLE_MEMBER
#endif

template <class a_type>
class dataIntern{
private:
    a_type *possibilities;
    int _assign;
    int _size;
    int _var;
    int _maxsize;

public:
    CUDA_CALLABLE_MEMBER dataIntern(){
    }

    CUDA_CALLABLE_MEMBER dataIntern(int var){
        _var = var;
        _size = (int)pow(3.0, (double)_var);
        _maxsize = _size * _var;
        _assign = 1;
        possibilities = (a_type*)malloc(_maxsize*sizeof(a_type));
        if(!possibilities){
            exit(1);
        }
        createTable();
    }

    CUDA_CALLABLE_MEMBER void createTable(){
        int i, j, k, limit, pos;
        a_type value;
        if(_assign == 1){
            for(i=0; i<_var; i++){
                #ifdef __CUDA_ARCH__
                    limit = (int)pow(3.0, _var-i-1);
                #else
                    limit = (int)pow(3.0, (double)_var-i-1);
                #endif

                value = (a_type)_MIN;
                k = 0;
                for(j=0; j<_size; j++){
                    pos = _var*j+i;
                    if(k >= limit){
                        value++;
                        if(value > _MAX){
                            value = (a_type)_MIN;
                        }
                        k = 0;
                    }
                    possibilities[pos] = value;
                    k++;
                }
            }           
        }
    }

    CUDA_CALLABLE_MEMBER void print(){
        int i;

        printf("Printing.\n");
        if(_assign == 1){
            for(i=0; i<_size*_var; i++){
                printf("%d ", possibilities[i]);
                if(i%_var == _var-1){
                    printf("\n");
                }
            }
        }
        else{
            printf("Not assigned.\n");
        }
    }

    CUDA_CALLABLE_MEMBER void retify(int posChanged, a_type valueRetified){
        int i, pos, count, initpos, attrib;
        a_type *newnode;
        a_type *newlist = NULL, *morelist = NULL;
        pos = posChanged;
        initpos = 0;
        count = 0;      

        if(_assign == 1){
            attrib = 0;
            newnode = (a_type*)malloc(_var*sizeof(a_type));
            for(i=0; i<_size; i++){
                if(possibilities[pos] == valueRetified){
                    memcpy(newnode, &possibilities[i*_var], _var*sizeof(a_type));

                    count++;
                    if(newlist!=NULL){
                        morelist = (a_type*)malloc(count*_var*sizeof(a_type));
                        memcpy(morelist, newlist, (count-1)*_var*sizeof(a_type));
                    }
                    newlist = (a_type*)malloc(count*_var*sizeof(a_type));
                    memcpy(newlist, morelist, (count-1)*_var*sizeof(a_type));
                    memcpy(&newlist[initpos], newnode, _var*sizeof(a_type));

                    initpos+=_var;
                    attrib = 1;
                }
                pos+=_var;
            }

            if(attrib == 1){
                _size = count;
                possibilities = (a_type*)malloc(_size*_var*sizeof(a_type));
                if(possibilities == NULL){
                    printf("Allocation fail in newlist retify.\n");
                    exit(1);
                }
                memcpy(possibilities, newlist, _size*_var*sizeof(a_type));
            }
            else{
                _assign = 0;
            }
        }
    }

    CUDA_CALLABLE_MEMBER a_type* unstack(){
        a_type* solution = NULL, *backup = NULL;

        if(_assign == 1){
            if(_size>0){
                backup = (a_type*)malloc(_var*_size*sizeof(a_type));
                if(backup == NULL){
                    printf("Erro to alloc backup pointer on unstack function in data intern\n");
                    return NULL;
                }

                solution = (a_type*)malloc(_var*sizeof(a_type));
                if(solution == NULL){
                    printf("Erro to alloc solution pointer on unstack function in data intern\n");
                    return NULL;
                }
                memcpy(backup, possibilities, _size*_var*sizeof(a_type));                   
                memcpy(solution, possibilities, _var*sizeof(a_type));

                free(possibilities);

                _size--;
                possibilities = (a_type*)malloc(_size*_var*sizeof(a_type));
                if(possibilities == NULL){
                    printf("Error to realloc possibilities pointer in data intern\n");
                    return NULL;
                }
                memcpy(possibilities, &backup[_var], _size*_var*sizeof(a_type));

                free(backup);
                return solution;
            }           
        }
        return NULL;
    }

    CUDA_CALLABLE_MEMBER int get_size(){
        return _size;
    }

    CUDA_CALLABLE_MEMBER ~dataIntern(){
        _assign = 0;
        if(possibilities)
            free(possibilities);
    }
};

deviceCode.h：

#ifndef DEVICECODE_H
#define DEVICECODE_H

void CallingInMain();
__global__ void kernel();
#endif

deviceCode.cu：

#include "deviceCode.h"
#include "dataIntern.h"
#include <iostream>
#include <stdio.h>


//I declared like this to my kernel:
__global__ void kernel(){
    __shared__ dataIntern<int> data[32];
    int *vetor;

    vetor = NULL;
    data[threadIdx.x] = dataIntern<int>(3);

    //_var == 3 in the class above
    vetor = (int*)malloc(sizeof(int)*3);

    vetor = data[threadIdx.x].unstack();
    while(vetor!=NULL){
        //never past here
        printf("%d %d %d %d\n", threadIdx.x, vetor[0], vetor[1], vetor[2]);
        vetor = data[threadIdx.x].unstack();
    }
    //neither here in if or else
    if(vetor)
        printf("Not null\n");
    else
        printf("Null final\n");

    free(vetor);
}

void CallingInMain(){
    kernel<<<1, 32>>>();
    cudaDeviceSynchronize();
}

main.cu:

#include <iostream>
#include <stdio.h>

#ifndef deviceCode_H
#include "deviceCode.h"
#endif

int main(int argc, char* argv[]){


    CallingInMain();

    return 0;
}

【问题讨论】：

您的代码不可编译。除其他外，它缺少一个复制构造函数和一个main 例程。所以expects 对于这些问题（“为什么这段代码不起作用？”），你提供一个MCVE。这是其他人可以编译、运行并查看问题的完整代码。此外，您应该使用proper cuda error checking。如果您使用 cuda-memcheck 运行代码会发生什么？
我只发布主要部分。我没有复制构造函数，真的有必要吗？我的主要问题是关于设备中动态指针的返回，有可能并且性能良好？
您现在发布的代码将编译（尽管它有您应该注意的警告），但在运行时它会出现各种错误。我建议添加我提到的正确的 cuda 错误检查以及使用cuda-memcheck 运行您的代码。您现在在 cmets 中发布的问题似乎与您在问题中发布的问题完全不同，这似乎是在询问为什么代码不起作用。不过，您的代码无法正常工作，我建议在您提出关于性能的问题之前先解决这个问题（我不清楚）。
事实上，如果第二个问题（指针返回）可能是第一个问题的原因，我很困惑。关于cuda错误检查，我会尝试使用它。

标签： cuda memcpy

【解决方案1】：

有同事向我指出，您的代码似乎有错误。

在你的内核中考虑这一行：

data[threadIdx.x] = dataIntern<int>(3);

这一行实例化了一个临时的dataIntern<int> 对象，运行带有3 值的构造函数，然后从该对象复制到data[threadIdx.x] 中的存储。注意构造函数执行malloc操作：

CUDA_CALLABLE_MEMBER dataIntern(int var){
    ...
    possibilities = (a_type*)malloc(_maxsize*sizeof(a_type));

但由于原始对象是临时的，C++ 标准允许在语句终止时删除对象，即此处的分号处：

data[threadIdx.x] = dataIntern<int>(3);
                                      ^

复制构建过程完成后。但是对象的删除触发了析构函数，它对possibilities进行了free操作：

CUDA_CALLABLE_MEMBER ~dataIntern(){
    _assign = 0;
    if(possibilities)
        free(possibilities);
}

因此在这行代码之后分配的指针的用法：

data[threadIdx.x] = dataIntern<int>(3);

比如unstack这里：

vetor = data[threadIdx.x].unstack();

将无效。

这违反了 C++ 编程规则，错误不是 CUDA 特有的。

【讨论】：

我认为这是错误，因为当我让析构函数完美运行时，我将数据结构更改为结构，从内核初始化并将它们传递到设备内存。还是谢谢你。
我有一个超出该上下文的问题。在共享内存中声明变量时，所有块的内容都相同吗？即如果我用块 0 更改您的值，变量的内容也会更改为其他块？
不，共享内存的范围仅适用于单个块。内核中的每个块都有自己单独范围的共享内存。