【问题标题】:Copying a 2D malloc pitched device memory to a 3D array in the device（将 cudaMallocPitch 分配的 2D pitched 设备内存复制到设备上的 3D 数组）
【发布时间】:2020-09-11 07:49:48
【问题描述】:

内核 reorder_raw 执行后会输出更新过的设备内存(即下面代码中的 d_ordered)。在此之后,我想在另一个内核中做分层(layered)插值。我知道为此必须先创建一个 3D 数组,再把设备内存 d_ordered 复制到数组内存 d_ordered_array 中。但是,当我使用 cudaMemcpy2DToArray 函数时,代码末尾的 mexPrintf(cudaGetErrorString(cudaGetLastError())); 会报告错误 invalid memory

反之,如果我把 cudaMemcpy2DToArray 这一调用注释掉,就不会收到任何错误。

// Uploads the raw samples in h_raw to the device, launches reorder_raw with
// d_ordered as its output argument, then copies d_ordered into a temporary
// layered cudaArray and prints the last CUDA error string through mexPrintf.
//
//   h_raw      host buffer; samples*channels*scanlines shorts are copied from it
//   d_ordered  device buffer handed to reorder_raw and used as the copy source;
//              read with a row pitch of in_pitch bytes
//   in_pitch   row pitch of d_ordered in bytes (the kernel receives it divided
//              by sizeof(short), i.e. in elements)
//   d_delay, pitch, speed_sound, sample_freq, delay_offset and out_pitch are
//   not referenced anywhere in this body.
//
// reorder_raw and order_X / order_Y / order_Z are defined outside this snippet.
// NOTE(review): none of the CUDA return codes is checked; the only error
// reporting is the cudaGetLastError() query on the last line.
void delay_US_linear(
    short *h_raw, short *d_ordered, float *d_delay,
    int samples, int channels, int scanlines, int elements, 
    float pitch, float speed_sound, float sample_freq, float delay_offset,
    size_t in_pitch, size_t out_pitch
){
    // Allocate a linear (unpitched) device buffer for the raw data and upload
    // h_raw into it; d_ordered itself is allocated by the caller
    short *d_raw;
    cudaMalloc((void**)& d_raw, sizeof(short)*samples*channels*scanlines);
    cudaMemcpy(d_raw, h_raw, sizeof(short)*samples*channels*scanlines, cudaMemcpyHostToDevice);

    // Grid dimensions: ceil-division so the blocks cover all `samples` along x,
    // and one grid row per scanline along y
    int griddim_x = (samples + order_X - 1) / order_X;
    int griddim_y = (scanlines);
    int griddim_z = 1;

    dim3 dimGrid(griddim_x, griddim_y, griddim_z);
    dim3 dimBlock(order_X, order_Y, order_Z);

    // Dynamic shared memory: one short per thread of the block
    int shared_size = order_X * order_Y * order_Z * sizeof(short);

    // Only need to change the channel order, independency in axial and scanline dimension
    reorder_raw << <dimGrid, dimBlock, shared_size >> > (
        d_raw, d_ordered, samples, channels, scanlines, elements, in_pitch/sizeof(short));

    // Block the host until the kernel has finished before copying its output
    cudaDeviceSynchronize();
    // Create a layered 3D array of single-channel 16-bit signed elements with
    // extent (samples, channels, scanlines)
    cudaArray *d_ordered_array;
    cudaChannelFormatDesc  desc = cudaCreateChannelDesc(16, 0, 0, 0, cudaChannelFormatKindSigned);
    cudaMalloc3DArray(&d_ordered_array, &desc, make_cudaExtent(samples, channels, scanlines), 
    cudaArrayLayered);

    // Copy device memory to the 3D array: channels*scanlines rows of
    // sizeof(short)*samples bytes each, read from d_ordered with in_pitch bytes
    // between row starts.
    // NOTE(review): the question text reports that the error only shows up when
    // this call is present; the posted answer replaces it with cudaMemcpy3D.
    cudaMemcpy2DToArray(d_ordered_array, 0, 0, d_ordered, in_pitch, sizeof(short)*samples, 
    channels*scanlines,cudaMemcpyDeviceToDevice);

    // The array and the raw buffer are released right away; d_ordered is not
    // freed here
    cudaFreeArray(d_ordered_array);
    cudaFree(d_raw);

    // Print the message of the last recorded CUDA error (cudaGetLastError also
    // resets it). NOTE(review): the message is passed as the format string.
    mexPrintf(cudaGetErrorString(cudaGetLastError()));
}

作为参考,设备指针 d_ordered 指向一块带行间距(pitched)的 2D 设备内存,此前是这样分配的:

// Pitched 2D allocation for d_ordered; the runtime reports the row stride it
// chose (in bytes) through in_pitch.
size_t in_pitch;
cudaMallocPitch(
    reinterpret_cast<void**>(&d_ordered), &in_pitch,
    samples * sizeof(short),   // requested row width in bytes
    channels * scanlines);     // number of rows
    

【问题讨论】:

    标签: arrays cuda


    【解决方案1】:

    用 cudaMalloc3D(而不是 cudaMallocPitch)来分配 d_ordered,并用 cudaMemcpy3D(而不是 cudaMemcpy2DToArray)来执行复制,就可以正常工作。这两个 API 与您的 3D cudaArray 相匹配。下面是一个例子:

    $ cat t1733.cu
    #include <iostream>
    
    // Copies the pitched 3D device allocation `d_ordered` into a temporary
    // layered cudaArray of extent (samples, channels, scanlines) and prints the
    // resulting CUDA status string to stdout ("no error" on success).
    //
    //   d_ordered                     pitched device allocation used as the copy source
    //   samples, channels, scanlines  extent of the array, in elements
    //   All other parameters are kept for interface compatibility and are not
    //   used in this body.
    //
    // Every runtime call is checked and the first failure is the one reported,
    // so a failed array allocation is neither hidden by a later error nor
    // followed by a copy/free on an invalid array handle.
    void delay_US_linear(
        short *h_raw, cudaPitchedPtr d_ordered, float *d_delay,
        int samples, int channels, int scanlines, int elements,
        float pitch, float speed_sound, float sample_freq, float delay_offset,
        size_t in_pitch, size_t out_pitch
    ){
        // Create a layered 3D array whose element type is short
        cudaArray *d_ordered_array = NULL;
        cudaChannelFormatDesc  desc = cudaCreateChannelDesc<short>();
        cudaExtent my_ext = make_cudaExtent(samples, channels, scanlines);
        cudaError_t status = cudaMalloc3DArray(&d_ordered_array, &desc, my_ext, cudaArrayLayered);
        if (status == cudaSuccess) {
            // Copy device memory to the 3D array; because a cudaArray takes part
            // in the copy, the extent is expressed in elements rather than bytes
            cudaMemcpy3DParms p = {0};
            p.srcPtr = d_ordered;
            p.dstArray = d_ordered_array;
            p.extent = my_ext;
            p.kind = cudaMemcpyDeviceToDevice;
            status = cudaMemcpy3D(&p);
            // Release the array even when the copy failed, but keep the copy's
            // error as the one to report
            cudaError_t free_status = cudaFreeArray(d_ordered_array);
            if (status == cudaSuccess) status = free_status;
        }
        // Read and reset the thread's last-error state, as the original code
        // did; it is only reported when none of the calls above failed directly
        cudaError_t last = cudaGetLastError();
        if (status == cudaSuccess) status = last;
        std::cout << cudaGetErrorString(status) << std::endl;
    }
    
    
    // Test driver: allocates a pitched 3D device buffer of
    // samples x channels x scanlines shorts with cudaMalloc3D, passes it to
    // delay_US_linear, then releases it. Returns 1 if the allocation fails and
    // 0 otherwise.
    int main(){
      const int samples = 4864;
      const int channels = 64;
      const int scanlines = 128;
      cudaPitchedPtr d_ordered;
      // Placeholder arguments: delay_US_linear does not use any of these
      size_t in_pitch=0, out_pitch = 0;
      short *h_raw = NULL;
      float *d_delay = NULL;
      const int elements = 0;
      float pitch = 0;
      float speed_sound = 0;
      float sample_freq = 0;
      float delay_offset = 0;
      // For cudaMalloc3D the extent width is given in bytes, hence the sizeof(short) factor
      cudaExtent my_ext = make_cudaExtent(samples*sizeof(short), channels, scanlines);
      cudaError_t alloc_status = cudaMalloc3D(&d_ordered, my_ext);
      if (alloc_status != cudaSuccess) {
        // Without a valid source buffer the copy below would be meaningless
        std::cerr << "cudaMalloc3D failed: " << cudaGetErrorString(alloc_status) << std::endl;
        return 1;
      }
    //  Replaces: cudaMallocPitch((void**) &d_ordered,&in_pitch,sizeof(short)*samples,channels*scanlines);
      delay_US_linear(h_raw, d_ordered, d_delay, samples, channels, scanlines, elements,
        pitch, speed_sound, sample_freq, delay_offset, in_pitch, out_pitch);
      // Release the pitched allocation instead of leaking it until process exit
      cudaFree(d_ordered.ptr);
      return 0;
    }
    $ nvcc -o t1733 t1733.cu
    $ cuda-memcheck ./t1733
    ========= CUDA-MEMCHECK
    no error
    ========= ERROR SUMMARY: 0 errors
    $
    

    【讨论】:

      猜你喜欢
      • 2018-07-17
      • 2013-03-25
      • 1970-01-01
      • 1970-01-01
      • 2019-10-15
      • 2017-03-23
      • 2014-10-18
      • 1970-01-01
      • 1970-01-01
      相关资源
      最近更新 更多