CUDA - “不支持未对齐的内存访问”答案

【问题标题】：CUDA - "Unaligned memory accesses not supported"CUDA - “不支持未对齐的内存访问”
【发布时间】：2015-02-26 15:16:30
【问题描述】：

之前关于这个程序的问题：

Blur effect on bitmap using C

Translating four nested loops into a CUDA kernel

我正在使用 Visual Studio 2012 和 CUDA 6 代码应该使用 CUDA 在 BMP 文件上添加模糊效果。在转换为 CUDA 之前，一切都运行良好。这是我第一个同时使用 C 和 CUDA 的项目，所以我可能犯了一些愚蠢的错误。我的代码出现 76 个错误，其中大多数是“此声明没有存储类或类型说明符”，还有更多没有任何意义的错误。我在 http://computer-graphics.se/hello-world-for-cuda.html 的 Hello World 程序之前尝试过，它工作正常。有同样的错误，所以我并不真正关心它们。

但我有两个不同的错误：

Error    2    error : Unaligned memory accesses not supported  C:\Users\Karpińscy\documents\visual studio 2012\Projects\blur\blur\kernel.cu    blur

还有：

错误 3 错误 MSB3721：命令 ""C:\Program Files\NVIDIA GPU 计算工具包\CUDA\v6.0\bin\nvcc.exe" -gencode=arch=compute_10,code=\"sm_10,compute_10\" --use-local-env --cl-version 2012 -ccbin "C:\Program Files (x86)\Microsoft Visual Studio 11.0\VC\bin" -I"C:\Program Files\NVIDIA GPU 计算 Toolkit\CUDA\v6.0\include" -I"C:\Program Files\NVIDIA GPU 计算工具包\CUDA\v6.0\include" -G -maxrregcount=0 --machine 32 --compile -cudart static -g -DWIN32 -D_DEBUG -D_CONSOLE -D_MBCS -Xcompiler "/EHsc /W3 /nologo /Od /Zi /RTC1 /MDd " -o Debug\kernel.cu.obj "C:\Users\Karpińscy \文档\视觉工作室 2012\Projects\blur\blur\kernel.cu"" 退出，代码为 2。C:\Program 文件 (x86)\MSBuild\Microsoft.Cpp\v4.0\V110\BuildCustomizations\CUDA 6.0.targets 597 9 模糊

我什至在 google.com 的第二个网站上搜索过答案，但我还没有为适合我的解决方案提供资金。请帮帮我！

程序代码：

#include <stdio.h>
#include <stdlib.h>
#include <Windows.h>


#pragma pack(push,1)
/* Windows 3.x bitmap file header */
typedef struct {
    char         filetype[2];   /* magic - always 'B' 'M' */
    unsigned int filesize;
    short        reserved1;
    short        reserved2;
    unsigned int dataoffset;    /* offset in bytes to actual bitmap data */
} file_header;

/* Windows 3.x bitmap full header, including file header */
typedef struct {
    file_header  fileheader;
    unsigned int headersize;
    int          width;
    int          height;
    short        planes;
    short        bitsperpixel;  /* we only support the value 24 here */
    unsigned int compression;   /* we do not support compression */
    unsigned int bitmapsize;
    int          horizontalres;
    int          verticalres;
    unsigned int numcolors;
    unsigned int importantcolors;
} bitmap_header;
#pragma pack(pop)

__global__ void blur(bitmap_header* hp, unsigned char *data)
{
    int xx,yy,x,y, avgB, avgG, avgR, ile;
    int blurSize = 5;

    xx = blockIdx.y * blockDim.y + threadIdx.y;
    yy = blockIdx.x * blockDim.x + threadIdx.x;

    if(xx >= hp->width || yy >= hp->height)
        return;


    avgB = avgG = avgR = 0;
    ile = 0;

    for(x = xx; x < hp->width && x < xx + blurSize; x++)
    {


        for(y = yy; y < hp->height && y < yy + blurSize; y++)
        {
            avgB += data[x*3 + y*hp->width*3 + 0];
            avgG += data[x*3 + y*hp->width*3 + 1];
            avgR += data[x*3 + y*hp->width*3 + 2];
            ile++;
        }
    }

    avgB = avgB / ile;
    avgG = avgG / ile;
    avgR = avgR / ile;

    data[xx*3 + yy*hp->width*3 + 0] = avgB;
    data[xx*3 + yy*hp->width*3 + 1] = avgG;
    data[xx*3 + yy*hp->width*3 + 2] = avgR;
}

int filter(char* input, char *output)
{
    FILE *fp,*out;
    bitmap_header* hp;
    bitmap_header* d_hp;
    unsigned char *data;
    unsigned char *d_data;

    //Open input file:
    fp = fopen(input, "r");
    if(fp==NULL)
        return 1;

    //Read the input file headers:
    hp=(bitmap_header*)malloc(sizeof(bitmap_header));

    cudaMalloc( &d_hp, sizeof(bitmap_header));

    if(hp==NULL)
        return 1;

    fread(hp, sizeof(bitmap_header), 1, fp);

    cudaMemcpy(d_hp, hp, sizeof(bitmap_header), cudaMemcpyHostToDevice);

    //Read the data of the image:
    data = (unsigned char*)malloc(sizeof(char)*hp->bitmapsize);

    cudaMalloc( &d_data, sizeof(char)*hp->bitmapsize);

    fseek(fp,sizeof(char)*hp->fileheader.dataoffset,SEEK_SET);
    fread(data,sizeof(char),hp->bitmapsize, fp);

    cudaMemcpy(d_data, data, sizeof(char)*hp->bitmapsize, cudaMemcpyHostToDevice);

    //Not sure if correctly calling function
    dim3 block(16,16);
    dim3 grid ( (hp->height + 15)/16, (hp->width + 15)/16 );
    blur<<<grid,block>>>(d_hp, d_data);

    cudaMemcpy(data, d_data, sizeof(char)*hp->bitmapsize, cudaMemcpyDeviceToHost);

    //Open output file:
    out = fopen(output, "wb");
    if(out==NULL)
    {
        fclose(fp);
        free(hp);
        free(data);
        cudaFree(d_data);
        cudaFree(d_hp);
        return 1;
    }

    fwrite(hp,sizeof(char),sizeof(bitmap_header),out);

    fseek(out,sizeof(char)*hp->fileheader.dataoffset,SEEK_SET);
    fwrite(data,sizeof(char),hp->bitmapsize,out);

    fclose(fp);
    fclose(out);
    free(hp);
    free(data);

    cudaFree(d_data);
    cudaFree(d_hp);
    return 0;
}

int main(int argc, char* argv[])
{
    char *path = "file.bmp";
    filter(path,path);

    return 0;
}

What is the canonical way to check for errors using the CUDA runtime API? 已要求我实施错误检查，但我不知道它如何或真正能帮助我。

编辑：

感谢@DanielKamilKozar，我解决了这些问题。程序编译但模糊没有被添加到 BMP 文件中。 CUDA 语法是否正确调用了模糊函数？

【问题讨论】：

我几乎没有关于 CUDA 的经验或知识，但在对齐方面，data[x*3 + y*hp->width*3 + 2]; 行敲响了警钟。您似乎正在尝试以字节粒度访问内存，而 CUDA 显然不支持。
另外，this 似乎证实了我的怀疑：设备可以通过与其大小对齐的 32、64 或 128 字节事务访问全局内存。
另外，您在结构上使用了包装，这意味着它们的成员肯定不会与单词边界对齐。这是另一件事要检查。
“这是我第一个使用 C 和 CUDA 的项目” - 请注意，“CUDA C”实际上是 C++ 的一种方言，因此您可能需要相应地修改您的代码。
BMP 格式使用不将数据元素与其自然对齐方式对齐的标题格式。这将导致 GPU 和许多其他架构出现问题。

标签： c visual-studio-2012 cuda

【解决方案1】：

我通过不通过函数参数发送完整的 BMP 标头来解决它，但它是必要的内容。我遇到了另一个函数没有被调用的问题，我通过更新 CUDA 软件解决了这个问题。

【讨论】：

【解决方案2】：

我可以通过将 arch 值从 sm_10 更改为 sm_20 来解决这个问题。我的应用在 Win 8.1 x64 VS2012 的 GT750M 上运行。

【讨论】：