【问题标题】:CUDA illegal memory accessCUDA非法内存访问
【发布时间】:2015-06-28 00:17:44
【问题描述】:

我正在尝试让此代码与 3D 类型结构一起使用。我正在使用 Cuda 的 2D 函数。所以主机端线性数据('board')的大小是宽度 * 高度 * 深度,而 2D malloc 是宽度 x 高度 * 深度(这里的宽度和高度都是 DIMxDIM 元素)。内核处理从 A 到 B 的数据。我在该行遇到非法内存访问错误(使用内存检查器)

dst[offset] = curr;

如果我将 malloc 更改为 HEIGHT * 2,错误就会消失,但大小似乎匹配。我错过了什么?也欢迎其他批评,我是 C++ 和 CUDA 的新手。

#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <iostream>
#include <stdio.h>
#include <stdlib.h>

typedef signed int sint;
typedef unsigned int uint;

#define DIM 512
#define TPB 32 // Threads per block

#define CLEARANCE 5
#define MAPLAYERS 2
#define WIDTH (sizeof(sint) * DIM)
#define HEIGHT (DIM * MAPLAYERS)

void route(sint *A, size_t &pitchA, sint *B, size_t &pitchB, sint *board, int src, int dest);
__global__ void map(sint *src, size_t pitchSrc, sint *dst, size_t pitchDst, unsigned long *index);
__device__ bool hasClearance(sint* src, sint x, sint y, sint z, size_t pitch);
__device__ bool inBounds(sint x, sint y, sint z, sint xoff, sint yoff, sint zoff);
__device__ inline long long calcOffset(sint x, sint y, sint z, sint xoff, sint yoff, sint zoff, size_t pitch);

dim3 blocks(DIM / TPB, DIM / TPB, MAPLAYERS);
dim3 threads(TPB, TPB);

/** CUDA Error Check */
#define CER(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line)
{
    if (code != cudaSuccess)
    {
        fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
        int tmp;
        std::cin >> tmp;
        exit(code);
    }
}

int main(void) {

    sint *A;
    sint *B;
    size_t pitchA, pitchB;
    sint *board = new sint[WIDTH*HEIGHT];

    CER(cudaMallocPitch(&A, &pitchA, WIDTH, HEIGHT));
    CER(cudaMallocPitch(&B, &pitchB, WIDTH, HEIGHT));
    CER(cudaMemset2D(A, pitchA, 0, WIDTH, HEIGHT));
    CER(cudaMemset2D(B, pitchA, 0, WIDTH, HEIGHT));

    route(A, pitchA, B, pitchB, board, 0, DIM*DIM - 1);

    CER(cudaFree(A));
    CER(cudaFree(B));
    delete[] board;
}

void route(sint *A, size_t &pitchA, sint *B, size_t &pitchB, sint *board, int src, int dest) {
    unsigned long *dev_index;
    unsigned long index = NULL;

    CER(cudaMalloc((void**)&dev_index, sizeof(unsigned long)));
    CER(cudaMemcpy(dev_index, &index, sizeof(unsigned long), cudaMemcpyHostToDevice));

    CER(cudaMemcpy2D(A, pitchA, board, WIDTH, WIDTH, HEIGHT, cudaMemcpyHostToDevice));
    CER(cudaMemcpy2D(B, pitchB, board, WIDTH, WIDTH, HEIGHT, cudaMemcpyHostToDevice));

    map << <blocks, threads >> >(B, pitchB, A, pitchA, dev_index);
    CER(cudaPeekAtLastError());
    CER(cudaMemcpy(&index, dev_index, sizeof(unsigned long), cudaMemcpyDeviceToHost));
    if (index != NULL) {
        // break condition
    }

}

__global__ void map(sint *src, size_t pitchSrc, sint *dst, size_t pitchDst, unsigned long *index) {
    unsigned int x = threadIdx.x + blockIdx.x * blockDim.x;
    unsigned int y = threadIdx.y + blockIdx.y * blockDim.y;
    unsigned int z = blockIdx.z + blockIdx.z * blockDim.z;
    unsigned long long offset = calcOffset(x, y, z, 0, 0, 0, pitchDst);

    sint curr;

    if (!inBounds(x, y, z, 0, 0, 0))
        return;

    curr = src[calcOffset(x, y, z, 0, 0, 0, pitchSrc)];
    if (z % 2 == 0 && curr == 0 && hasClearance(src, x, y, z, pitchSrc)) {
        // Processing
    }
    else
        dst[offset] = 1;

    return;
}

/** Finds linear offset for a given pixel and offset. */
__device__ inline long long calcOffset(sint x, sint y, sint z, sint xoff, sint yoff, sint zoff, size_t pitch) {
    return (x + xoff) + (y + yoff) * pitch + ((z + zoff) * pitch * (HEIGHT / MAPLAYERS));
}


/** Checks if position is valid on the map. */
__device__ bool inBounds(sint x, sint y, sint z, sint xoff, sint yoff, sint zoff) {
    if (0 > (x + xoff) || (x + xoff) >= DIM || 0 > (y + yoff) || (y + yoff) >= DIM || 0 > (z + zoff) || (z + zoff) >= MAPLAYERS)
        return false;
    return true;
}


/** Returns true if a block has clearnace */
__device__ bool hasClearance(sint* src, sint x, sint y, sint z, size_t pitch) {
    for (int c = -CLEARANCE; c <= CLEARANCE; c++) {
        for (int r = -CLEARANCE; r <= CLEARANCE; r++){
            if (inBounds(x, y, z, r, c, 0)){
                if (src[calcOffset(x, y, z, r, c, 0, pitch)] == 2 || src[calcOffset(x, y, z, r, c, 0, pitch)] == 1)
                    return false;
            }
            else {
                return false;
            }
        }
    }
    return true;
}

CUDA 调试器的输出:

Memory Checker detected 384 access violations.
error = access violation on load (global memory)
gridid = 18
blockIdx = {0,8,0}
threadIdx = {0,4,0}
address = 0x05d08000
accessSize = 4

【问题讨论】:

  • 请提供其他人可以编译的代码。如果您不确定那是什么,请将此问题中的代码复制到一个全新的项目中,并继续修复所有编译错误,直到没有任何编译错误。然后确保该代码演示了您所询问的访问冲突。然后将该固定代码粘贴回问题中。

标签: c++ cuda


【解决方案1】:

这看起来不对:

sint *board = new sint[WIDTH*HEIGHT];

我认为你的意思是:

sint *board = new sint[DIM*HEIGHT];

这看起来不对:

unsigned int z = blockIdx.z + blockIdx.z * blockDim.z;

我认为你的意思是:

unsigned int z = threadIdx.z + blockIdx.z * blockDim.z;

但问题的症结在于,您在计算sint 数组中的索引的算术中使用了间距值(计算行宽的字节)。当您以这种方式计算索引时,您需要通过sizeof(sint) 缩放您的音高值。即使这样也不是相当正确的。正确的做法是转换为unsigned char 指针,以行乘以间距(即字节)为单位进行算术运算,然后将行指针从unsigned char 转换回sint,然后从那里通过(x+xoff)。实际上,这意味着您的calcOffset 例程需要重写,并且需要接受底层指针作为参数,并返回一个指针。

所以这段代码有这些变化:

#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <iostream>
#include <stdio.h>
#include <stdlib.h>

typedef signed int sint;
typedef unsigned int uint;

#define DIM 512
#define TPB 32 // Threads per block

#define CLEARANCE 5
#define MAPLAYERS 2
#define WIDTH (sizeof(sint) * DIM)
#define HEIGHT (DIM * MAPLAYERS)

void route(sint *A, size_t &pitchA, sint *B, size_t &pitchB, sint *board, int src, int dest);
__global__ void map(sint *src, size_t pitchSrc, sint *dst, size_t pitchDst, unsigned long *index);
__device__ bool hasClearance(sint* src, sint x, sint y, sint z, size_t pitch);
__device__ bool inBounds(sint x, sint y, sint z, sint xoff, sint yoff, sint zoff);
__device__ inline sint * calcOffset(sint *ptr, sint x, sint y, sint z, sint xoff, sint yoff, sint zoff, size_t pitch);

dim3 blocks(DIM / TPB, DIM / TPB, MAPLAYERS);
dim3 threads(TPB, TPB);

/** CUDA Error Check */
#define CER(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line)
{
    if (code != cudaSuccess)
    {
        fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
        int tmp;
        std::cin >> tmp;
        exit(code);
    }
}

int main(void) {

    sint *A;
    sint *B;
    size_t pitchA, pitchB;
    sint *board = new sint[DIM*HEIGHT];

    CER(cudaMallocPitch(&A, &pitchA, WIDTH, HEIGHT));
    CER(cudaMallocPitch(&B, &pitchB, WIDTH, HEIGHT));
    CER(cudaMemset2D(A, pitchA, 0, WIDTH, HEIGHT));
    CER(cudaMemset2D(B, pitchA, 0, WIDTH, HEIGHT));

    route(A, pitchA, B, pitchB, board, 0, DIM*DIM - 1);

    CER(cudaFree(A));
    CER(cudaFree(B));
    delete[] board;
}

void route(sint *A, size_t &pitchA, sint *B, size_t &pitchB, sint *board, int src, int dest) {
    unsigned long *dev_index;
    unsigned long index = 0;

    CER(cudaMalloc((void**)&dev_index, sizeof(unsigned long)));
    CER(cudaMemcpy(dev_index, &index, sizeof(unsigned long), cudaMemcpyHostToDevice));

    CER(cudaMemcpy2D(A, pitchA, board, WIDTH, WIDTH, HEIGHT, cudaMemcpyHostToDevice));
    CER(cudaMemcpy2D(B, pitchB, board, WIDTH, WIDTH, HEIGHT, cudaMemcpyHostToDevice));

    map << <blocks, threads >> >(B, pitchB, A, pitchA, dev_index);
    CER(cudaPeekAtLastError());
    CER(cudaMemcpy(&index, dev_index, sizeof(unsigned long), cudaMemcpyDeviceToHost));
    if (index != 0) {
        // break condition
    }

}

__global__ void map(sint *src, size_t pitchSrc, sint *dst, size_t pitchDst, unsigned long *index) {
    int x = threadIdx.x + blockIdx.x * blockDim.x;
    int y = threadIdx.y + blockIdx.y * blockDim.y;
    int z = threadIdx.z + blockIdx.z * blockDim.z;
    sint *dst_offset = calcOffset(dst, x, y, z, 0, 0, 0, pitchDst);

    sint curr;

    if (!inBounds(x, y, z, 0, 0, 0))
        return;

    curr = *calcOffset(src, x, y, z, 0, 0, 0, pitchSrc);
    if (z % 2 == 0 && curr == 0 && hasClearance(src, x, y, z, pitchSrc)) {
        // Processing
    }
    else
        *dst_offset = 1;

    return;
}

/** Finds linear offset for a given pixel and offset. */
__device__ sint* calcOffset(sint *ptr, sint x, sint y, sint z, sint xoff, sint yoff, sint zoff, size_t pitch) {
    unsigned char *my_ptr = reinterpret_cast<unsigned char *>(ptr);
    return (x + xoff) + reinterpret_cast<sint *>(my_ptr + (((y + yoff) * pitch) + ((z + zoff) * pitch * (HEIGHT / MAPLAYERS))));
}


/** Checks if position is valid on the map. */
__device__ bool inBounds(sint x, sint y, sint z, sint xoff, sint yoff, sint zoff) {
    if (0 > (x + xoff) || (x + xoff) >= DIM || 0 > (y + yoff) || (y + yoff) >= DIM || 0 > (z + zoff) || (z + zoff) >= MAPLAYERS)
        return false;
    return true;
}


/** Returns true if a block has clearnace */
__device__ bool hasClearance(sint* src, sint x, sint y, sint z, size_t pitch) {
    for (int c = -CLEARANCE; c <= CLEARANCE; c++) {
        for (int r = -CLEARANCE; r <= CLEARANCE; r++){
            if (inBounds(x, y, z, r, c, 0)){
                if ((*calcOffset(src, x, y, z, r, c, 0, pitch) == 2) || (*calcOffset(src, x, y, z, r, c, 0, pitch)) == 1)
                    return false;
            }
            else {
                return false;
            }
        }
    }
    return true;
}

将来,您可能希望使用非间距分配让您的代码正常工作。一切正常后,您可以查看添加倾斜分配是否会给您带来任何性能优势。

我还想到,如果 (x+xoff) 为负数(或者如果 (x+xoff) 导致索引到 next 行),即使这样也行不通。您不能以这种方式在倾斜分配中从一行向后索引到上一行(或下一行)。有必要首先将 (x+xoff) 解析为实际引用的行,然后为该行开发一个索引,然后针对该行进行定位计算。

【讨论】:

    猜你喜欢
    • 2020-12-27
    • 2018-12-27
    • 2021-03-25
    • 2015-05-02
    • 2022-01-18
    • 2015-07-23
    • 1970-01-01
    • 2021-08-12
    • 1970-01-01
    相关资源
    最近更新 更多