Cuda 将 char** 传递给内核答案

【问题标题】：Cuda passing char** to kernel（Cuda 将 char** 传递给内核）
【发布时间】：2016-04-14 13:11:43
【问题描述】：

我对这个基本的 CUDA 代码有些烦恼。

我有一个char**，它是一个二维密码数组，我目前的实现是让 CUDA 简单地遍历这个列表并显示密码。但是，当我去显示它们时，我只会得到“（NULL）”。我不太确定这是为什么。有人可以解释发生了什么吗？

主要：

// Host-side table of numberOfPwd string pointers, allocated with new[].
char ** pwdAry;
pwdAry = new char *[numberOfPwd];

//pwdAry given some values (flat 2d array layout)
// NOTE(review): sizeof(pwdAry) is the size of the pointer variable itself,
// not the number of passwords; pwdArySize is not used anywhere below.
const int pwdArySize = sizeof(pwdAry);    
// 2D grid of 2D blocks (gridSize/blockSize are defined outside this fragment).
dim3 grid(gridSize,gridSize);
dim3 block(blockSize,blockSize);

// NOTE(review): pwdAry comes from new[] on the host and is handed to the
// kernel as-is; no cudaMalloc/cudaMemcpy appears in this fragment.
searchKeywordKernel << <grid, block >> >(pwdAry);

return EXIT_SUCCESS;

CUDA：

// Debug kernel for a 2D grid of 2D blocks: every thread reports its
// block-local linear index and its block coordinates, then prints the
// password stored at that block-local index.
// passwordList is dereferenced on the device, once for the table entry and
// again by printf for the string it points to.
__global__ void searchKeywordKernel(char **passwordList)
{
    // Global 2D coordinates of this thread.
    const int col = blockIdx.x * blockDim.x + threadIdx.x;
    const int row = blockIdx.y * blockDim.y + threadIdx.y;

    // Threads per grid row, and this thread's flattened global index.
    const int rowWidth = gridDim.x * blockDim.x;
    const int flat = row * rowWidth + col;

    // Split the flat index back into row/column, then derive block coordinates.
    const int flatRow = flat / rowWidth;
    const int flatCol = flat - (flatRow * rowWidth);
    const int blockCol = flatCol / blockDim.x;
    const int blockRow = flatRow / blockDim.y;

    // Linear index of the thread inside its own block (not grid-wide).
    const int currentThread = threadIdx.y * blockDim.x + threadIdx.x;

    printf("hi, i am thread: %i, and my block x: %i, and y: %i\n", currentThread, blockCol, blockRow);
    printf("My password is: %s\n", passwordList[currentThread]);
}

【问题讨论】：

const int pwdArySize = sizeof(pwdAry); 如果这是您的目标，这不会给您数组中元素的数量。
指针数组不是平面数组。我将首先了解扁平化数组在 CUDA 中的含义，然后将您的代码转换为使用扁平化数组。如果您需要有关无法正常工作的代码的帮助，您应该提供 MCVE。
这和我在这里创建一个二维数组的做法不同吗？ stackoverflow.com/questions/5397976/…
stackoverflow.com/a/6137517/681865 包含一个完整的工作示例。
当然，我没有将其中任何一个作为“答案”。这里列出的是“评论”（comments）。这些链接都旨在指出在不使用双指针方法的情况下处理字符数组数据（实际上是多个字符串）的各种方法。如果您想使用双指针方法，规范示例是 @talonmies 已经指出的示例。对于初学者，这不是推荐的方法。它还会导致更复杂的代码，更难维护，更容易出错。而且它可能会导致代码运行速度比通过单个指针引用的数据慢。

标签： c++ cuda

【解决方案1】：

根据评论中的讨论，这里有一个示例代码，大致遵循问题中的代码，使用 3 种不同的方法：

使用“扁平化”数组。对于询问如何处理双指针数组（char **，或任何其他类型）或任何包含embedded pointers 的数据结构的初学者，这是传统的建议。基本思想是创建一个相同类型的单个指针数组（例如char *），并将所有数据端到端复制到该数组中。在这种情况下，由于数组元素是可变长度的，我们还需要传递一个包含每个字符串的起始索引的数组（在这种情况下）。
使用直接双指针方法。我认为这段代码很难写。它也可能对性能产生影响。典型示例是here，算法上所需内容的逐步描述是here 和/或here 是带有方法描述的3D（即三指针）工作示例（糟糕！）。这基本上是在 CUDA 中进行深拷贝，我认为它比典型的 CUDA 编码要困难一些。
使用managed memory 子系统，它在CUDA platforms that support it 中可用。在编码方面，这可能比上述两种方法中的任何一种都简单。

这是所有 3 种方法的一个有效示例：

$ cat t1035.cu
#include <stdio.h>
#include <string.h>

#define nTPB 256

// One thread per string: thread t prints the NUL-terminated string that
// starts at offset indices[t] inside the flattened buffer `data`.
// Expects a 1D launch; threads with an index >= num_strings do nothing.
__global__ void kern_1D(char *data, unsigned *indices, unsigned num_strings){

  const int tid = blockIdx.x*blockDim.x + threadIdx.x;
  // surplus threads of the last block have no string to print
  if (tid >= num_strings) return;
  const char *my_string = data + indices[tid];
  printf("Hello from thread %d, my string is %s\n", tid, my_string);
}

// One thread per string: thread t prints the NUL-terminated string that
// data[t] points to. Both the pointer table and the strings are read on
// the device. Expects a 1D launch; threads with an index >= num_strings
// do nothing.
__global__ void kern_2D(char **data, unsigned num_strings){

  const int tid = blockIdx.x*blockDim.x + threadIdx.x;
  // surplus threads of the last block have no string to print
  if (tid >= num_strings) return;
  const char *my_string = data[tid];
  printf("Hello from thread %d, my string is %s\n", tid, my_string);
}

// Demonstrates three ways to hand a host-side char** table of strings to a kernel:
//   1. flatten the strings into one char buffer plus an array of start offsets (kern_1D),
//   2. deep-copy: a device pointer table whose entries are device buffers (kern_2D),
//   3. rebuild the table in managed memory and pass it directly (kern_2D).
// CUDA return codes are deliberately left unchecked to keep the example short;
// run it under cuda-memcheck when trying it out.
// Returns 0.
int main(){

  const int num_strings = 3;
  const char s0[] = "s1\0";
  const char s1[] = "s2\0";
  const char s2[] = "s3\0";
  const char *src[num_strings] = {s0, s1, s2};
  // ds[i] is the full size of each source array, terminators included
  int ds[num_strings];
  ds[0] = sizeof(s0)/sizeof(char);
  ds[1] = sizeof(s1)/sizeof(char);
  ds[2] = sizeof(s2)/sizeof(char);
  // pretend we have a dynamically allocated char** array
  char **data = (char **)malloc(num_strings*sizeof(char *));
  unsigned total = 0;
  for (int i = 0; i < num_strings; i++){
    data[i] = (char *)malloc(ds[i]*sizeof(char));
    // copy all ds[i] bytes so that no uninitialized byte is sent to the device later
    memcpy(data[i], src[i], ds[i]*sizeof(char));
    total += ds[i];}
  // method 1: "flattening"
  char *fdata = (char *)malloc(total*sizeof(char));
  unsigned *ind = (unsigned *)malloc(num_strings*sizeof(unsigned));
  unsigned next = 0;
  for (int i = 0; i < num_strings; i++){
    memcpy(fdata+next, data[i], ds[i]*sizeof(char));
    ind[i] = next;   // start offset of string i inside fdata
    next += ds[i];}
  //copy to device
  char *d_fdata;
  unsigned *d_ind;
  cudaMalloc(&d_fdata, next*sizeof(char));
  cudaMalloc(&d_ind, num_strings*sizeof(unsigned));
  cudaMemcpy(d_fdata, fdata, next*sizeof(char), cudaMemcpyHostToDevice);
  cudaMemcpy(d_ind, ind, num_strings*sizeof(unsigned), cudaMemcpyHostToDevice);
  printf("method 1:\n");
  kern_1D<<<(num_strings+nTPB-1)/nTPB, nTPB>>>(d_fdata, d_ind, num_strings);
  cudaDeviceSynchronize();
  //method 2: "2D" (pointer-to-pointer) array
  char **d_data;
  cudaMalloc(&d_data, num_strings*sizeof(char *));
  // host-side staging array holding the device pointers of the individual strings
  char **d_temp_data = (char **)malloc(num_strings*sizeof(char *));
  for (int i = 0; i < num_strings; i++){
    cudaMalloc(&(d_temp_data[i]), ds[i]*sizeof(char));
    cudaMemcpy(d_temp_data[i], data[i], ds[i]*sizeof(char), cudaMemcpyHostToDevice);}
  // the pointer table itself goes to the device in a single copy
  cudaMemcpy(d_data, d_temp_data, num_strings*sizeof(char *), cudaMemcpyHostToDevice);
  printf("method 2:\n");
  kern_2D<<<(num_strings+nTPB-1)/nTPB, nTPB>>>(d_data, num_strings);
  cudaDeviceSynchronize();
  // method 3: managed allocations
  // start over with a managed char** array
  char **m_data;
  cudaMallocManaged(&m_data, num_strings*sizeof(char *));
  for (int i = 0; i < num_strings; i++){
    cudaMallocManaged(&(m_data[i]), ds[i]*sizeof(char));
    // initialize said array
    memcpy(m_data[i], src[i], ds[i]*sizeof(char));}
  // call kernel directly on managed data
  printf("method 3:\n");
  kern_2D<<<(num_strings+nTPB-1)/nTPB, nTPB>>>(m_data, num_strings);
  // the sync also makes it legal to touch m_data from the host again below
  cudaDeviceSynchronize();

  // release everything: per-string buffers first, then the tables
  for (int i = 0; i < num_strings; i++){
    cudaFree(m_data[i]);
    cudaFree(d_temp_data[i]);
    free(data[i]);}
  cudaFree(m_data);
  cudaFree(d_data);
  free(d_temp_data);
  cudaFree(d_ind);
  cudaFree(d_fdata);
  free(ind);
  free(fdata);
  free(data);

  return 0;
}


$ nvcc -arch=sm_35 -o t1035 t1035.cu
$ cuda-memcheck ./t1035
========= CUDA-MEMCHECK
method 1:
Hello from thread 0, my string is s1
Hello from thread 1, my string is s2
Hello from thread 2, my string is s3
method 2:
Hello from thread 0, my string is s1
Hello from thread 1, my string is s2
Hello from thread 2, my string is s3
method 3:
Hello from thread 0, my string is s1
Hello from thread 1, my string is s2
Hello from thread 2, my string is s3
========= ERROR SUMMARY: 0 errors
$

注意事项：

如果您只是第一次测试，我建议使用cuda-memcheck 运行此代码。为简洁起见，我省略了proper cuda error checking，但我建议您在遇到 CUDA 代码问题时使用它。此代码的正确执行取决于是否有可用的托管内存子系统（阅读我提供的文档链接）。如果您的平台不支持它，按原样运行此代码可能会导致段错误，因为我没有包含正确的错误检查。
将双指针数组从设备复制到主机虽然没有在本示例中明确介绍，但本质上与 3 种方法中的每一种方法的步骤相反。对于方法 1，一个 cudaMemcpy 调用就可以做到。对于方法 2，它需要一个 for 循环来反转复制到设备的步骤（包括使用临时指针）。对于方法 3，除了正确遵守托管内存编码实践（例如在内核调用之后、再次从主机代码访问数据之前使用 cudaDeviceSynchronize()）之外，什么都不需要。
在提供将char ** 数组传递给 CUDA 内核的方法方面，我不想争论方法 1 和 3 是否严格符合问题的字面要求。如果您的关注点那么窄，请使用方法2，否则完全忽略此答案。

编辑：基于下面评论中的一个问题，这里是上面的代码，修改了主机端字符串的不同初始化序列（在第 42 行）。现在有编译警告，但这些警告是由 OP 专门要求使用的代码引起的：

$ cat t1036.cu
#include <stdio.h>
#include <string.h>

#define nTPB 256

// Prints one string per thread from a flattened buffer: thread t reads the
// NUL-terminated string beginning at data + indices[t].
// Expects a 1D launch; threads whose index is >= num_strings exit early.
__global__ void kern_1D(char *data, unsigned *indices, unsigned num_strings){

  const int gid = blockIdx.x*blockDim.x + threadIdx.x;
  // nothing to do for threads past the last string
  if (gid >= num_strings) return;
  const unsigned start = indices[gid];
  printf("Hello from thread %d, my string is %s\n", gid, data+start);
}

// Prints one string per thread through a pointer table: thread t reads the
// NUL-terminated string at data[t]. The table and the strings are both
// dereferenced on the device.
// Expects a 1D launch; threads whose index is >= num_strings exit early.
__global__ void kern_2D(char **data, unsigned num_strings){

  const int gid = blockIdx.x*blockDim.x + threadIdx.x;
  // nothing to do for threads past the last string
  if (gid >= num_strings) return;
  char *entry = data[gid];
  printf("Hello from thread %d, my string is %s\n", gid, entry);
}

int main(){
  // Same three-method demo, fed from pwdAry; the pwdAry line below is kept verbatim on line 42 of t1036.cu, which the quoted nvcc warnings cite.
  const int num_strings = 3;
#if 0
  const char s0[] = "s1\0";
  const char s1[] = "s2\0";
  const char s2[] = "s3\0";
  int ds[num_strings];
  ds[0] = sizeof(s0)/sizeof(char);
  ds[1] = sizeof(s1)/sizeof(char);
  ds[2] = sizeof(s2)/sizeof(char);
  // pretend we have a dynamically allocated char** array
  char **data;
  data = (char **)malloc(num_strings*sizeof(char *));
  data[0] = (char *)malloc(ds[0]*sizeof(char));
  data[1] = (char *)malloc(ds[1]*sizeof(char));
  data[2] = (char *)malloc(ds[2]*sizeof(char));
  // initialize said array
  strcpy(data[0], s0);
  strcpy(data[1], s1);
  strcpy(data[2], s2);
#endif
  char ** pwdAry; pwdAry = new char *[num_strings]; for (int a = 0; a < num_strings; a++) { pwdAry[a] = new char[1024]; } for (int a = 0; a < 3; a++) { pwdAry[a] = "hello\0"; }
  // After the line above every pwdAry[a] points at the 7-byte literal "hello\0";
  // the 1024-byte buffers from new[] are no longer reachable. Reading 1024 bytes
  // through pwdAry[a] would run past the literal, so every copy below stops at
  // the string terminator instead.
  const unsigned slot = 1024;   // bytes reserved per string in every representation
  // method 1: "flattening"
  // calloc zero-fills, so each slot stays NUL-terminated after the bounded copy
  char *fdata = (char *)calloc(slot*num_strings, sizeof(char));
  unsigned *ind   = (unsigned *)malloc(num_strings*sizeof(unsigned));
  unsigned next = 0;
  for (int i = 0; i < num_strings; i++){
    strncpy(fdata+next, pwdAry[i], slot-1);
    ind[i] = next;   // start offset of string i inside fdata
    next += slot;}
  //copy to device
  char *d_fdata;
  unsigned *d_ind;
  cudaMalloc(&d_fdata, next*sizeof(char));
  cudaMalloc(&d_ind, num_strings*sizeof(unsigned));
  cudaMemcpy(d_fdata, fdata, next*sizeof(char), cudaMemcpyHostToDevice);
  cudaMemcpy(d_ind, ind, num_strings*sizeof(unsigned), cudaMemcpyHostToDevice);
  printf("method 1:\n");
  kern_1D<<<(num_strings+nTPB-1)/nTPB, nTPB>>>(d_fdata, d_ind, num_strings);
  cudaDeviceSynchronize();
  //method 2: "2D" (pointer-to-pointer) array
  char **d_data;
  cudaMalloc(&d_data, num_strings*sizeof(char *));
  // host-side staging array holding the device pointers of the individual strings
  char **d_temp_data;
  d_temp_data = (char **)malloc(num_strings*sizeof(char *));
  for (int i = 0; i < num_strings; i++){
    // copy only the characters that exist; the memset supplies the terminator
    size_t len = strlen(pwdAry[i]);
    if (len > slot-1) len = slot-1;
    cudaMalloc(&(d_temp_data[i]), slot*sizeof(char));
    cudaMemset(d_temp_data[i], 0, slot*sizeof(char));
    cudaMemcpy(d_temp_data[i], pwdAry[i], len*sizeof(char), cudaMemcpyHostToDevice);}
  // the pointer table itself goes to the device in a single copy
  cudaMemcpy(d_data, d_temp_data, num_strings*sizeof(char *), cudaMemcpyHostToDevice);
  printf("method 2:\n");
  kern_2D<<<(num_strings+nTPB-1)/nTPB, nTPB>>>(d_data, num_strings);
  cudaDeviceSynchronize();
  // method 3: managed allocations
  // start over with a managed char** array
  char **m_data;
  cudaMallocManaged(&m_data, num_strings*sizeof(char *));
  for (int i = 0; i < num_strings; i++){
    cudaMallocManaged(&(m_data[i]), slot*sizeof(char));
    // initialize said array (bounded copy, explicitly terminated)
    strncpy(m_data[i], pwdAry[i], slot-1);
    m_data[i][slot-1] = '\0';}
  // call kernel directly on managed data
  printf("method 3:\n");
  kern_2D<<<(num_strings+nTPB-1)/nTPB, nTPB>>>(m_data, num_strings);
  // the sync also makes it legal to touch m_data from the host again below
  cudaDeviceSynchronize();

  // release everything: per-string buffers first, then the tables
  for (int i = 0; i < num_strings; i++){
    cudaFree(m_data[i]);
    cudaFree(d_temp_data[i]);}
  cudaFree(m_data);
  cudaFree(d_data);
  free(d_temp_data);
  cudaFree(d_ind);
  cudaFree(d_fdata);
  free(ind);
  free(fdata);
  // the entries of pwdAry point at string literals, so only the table is deleted
  delete[] pwdAry;

  return 0;
}


$ nvcc -arch=sm_35 -o t1036 t1036.cu
t1036.cu(42): warning: conversion from a string literal to "char *" is deprecated

t1036.cu(42): warning: conversion from a string literal to "char *" is deprecated

$ cuda-memcheck ./t1036
========= CUDA-MEMCHECK
method 1:
Hello from thread 0, my string is hello
Hello from thread 1, my string is hello
Hello from thread 2, my string is hello
method 2:
Hello from thread 0, my string is hello
Hello from thread 1, my string is hello
Hello from thread 2, my string is hello
method 3:
Hello from thread 0, my string is hello
Hello from thread 1, my string is hello
Hello from thread 2, my string is hello
========= ERROR SUMMARY: 0 errors
$

【讨论】：

很棒的例子，谢谢！但如果我有这样的事情：char ** pwdAry; pwdAry = new char *[3]; for (int a = 0; a < 3; a++) { pwdAry[a] = new char[1024]; } for (int a = 0; a < 3; a++) { pwdAry[a] = "hello\0"; } 那么我将如何在您的方法 2 实现中传递它？
我认为这几乎只是一个 c/c++ 编程问题。但是，我在答案中添加了一个附加代码，它显示了您在评论中提供的确切初始化代码。请注意，现在有一个编译时警告。这是由您指出的代码引起的，所以我不会费心修复它。