根据评论中的讨论,下面给出一段示例代码,它大致沿用问题中的代码,并分别使用 3 种不同的方法:
使用“扁平化”数组。对于询问如何处理双指针数组(char **,或任何其他类型)或任何包含嵌入指针(embedded pointers)的数据结构的初学者,这是传统的建议。基本思想是创建一个相同类型的单级指针数组(例如 char *),并将所有数据首尾相接地复制到该数组中。在本例中,由于数组元素(字符串)的长度可变,我们还需要传递一个记录每个字符串起始索引的数组。
使用直接双指针方法。我认为这种代码很难写,而且可能对性能产生影响。典型示例见 here;算法上所需步骤的逐步描述见 here;此外 here 还有一个带方法说明的 3D(即三级指针)可运行示例(天哪!)。这基本上是在 CUDA 中进行深拷贝,我认为它比典型的 CUDA 编码要困难一些。
使用托管内存(managed memory)子系统,它在支持该功能的 CUDA 平台上可用。在编码方面,这可能比上述两种方法中的任何一种都简单。
这是所有 3 种方法的一个有效示例:
$ cat t1035.cu
#include <stdio.h>
#include <string.h>
#define nTPB 256
// Prints one greeting per string, reading the strings out of a single flat
// character buffer.
//   data        - character buffer that contains every string
//   indices     - indices[i] is the offset into data at which string i starts
//   num_strings - number of strings; threads whose global index is not below
//                 this value print nothing
// Only the .x components of the launch configuration are used to form the
// thread index. Each string is printed with %s, so it must be NUL-terminated.
__global__ void kern_1D(char *data, unsigned *indices, unsigned num_strings){
  const int tid = blockIdx.x*blockDim.x + threadIdx.x;
  if (tid >= num_strings) return;  // surplus thread beyond the last string
  const char *my_string = data + indices[tid];
  printf("Hello from thread %d, my string is %s\n", tid, my_string);
}
// Prints one greeting per string, reading the strings through an array of
// pointers.
//   data        - array of pointers; data[i] points at string i
//   num_strings - number of entries in data; threads whose global index is
//                 not below this value print nothing
// Only the .x components of the launch configuration are used to form the
// thread index. Each string is printed with %s, so it must be NUL-terminated.
__global__ void kern_2D(char **data, unsigned num_strings){
  const int tid = blockIdx.x*blockDim.x + threadIdx.x;
  if (tid >= num_strings) return;  // surplus thread beyond the last string
  const char *my_string = data[tid];
  printf("Hello from thread %d, my string is %s\n", tid, my_string);
}
// Demonstrates three ways of handing an array of C strings (a host char **)
// to a kernel:
//   1. flattening all strings into one buffer plus an array of start offsets,
//   2. building a device-side char ** with one device allocation per string,
//   3. building the char ** out of managed allocations.
// Each method launches a kernel that prints every string, and all host,
// device and managed allocations are released before returning.
// CUDA return codes are deliberately not checked, to keep the example short.
int main(){
  const int num_strings = 3;
  const char s0[] = "s1\0";
  const char s1[] = "s2\0";
  const char s2[] = "s3\0";
  // ds[i] is the size in bytes of string i, terminators included
  int ds[num_strings];
  ds[0] = sizeof(s0)/sizeof(char);
  ds[1] = sizeof(s1)/sizeof(char);
  ds[2] = sizeof(s2)/sizeof(char);
  // pretend we have a dynamically allocated char** array
  char **data;
  data = (char **)malloc(num_strings*sizeof(char *));
  data[0] = (char *)malloc(ds[0]*sizeof(char));
  data[1] = (char *)malloc(ds[1]*sizeof(char));
  data[2] = (char *)malloc(ds[2]*sizeof(char));
  // initialize said array
  strcpy(data[0], s0);
  strcpy(data[1], s1);
  strcpy(data[2], s2);

  // method 1: "flattening"
  // fdata receives the strings one after another; ind[i] records where
  // string i starts inside fdata
  char *fdata = (char *)malloc((ds[0]+ds[1]+ds[2])*sizeof(char));
  unsigned *ind = (unsigned *)malloc(num_strings*sizeof(unsigned));
  unsigned next = 0;
  for (int i = 0; i < num_strings; i++){
    strcpy(fdata+next, data[i]);
    ind[i] = next;
    next += ds[i];}
  // copy to device (next now holds the total number of bytes in fdata)
  char *d_fdata;
  unsigned *d_ind;
  cudaMalloc(&d_fdata, next*sizeof(char));
  cudaMalloc(&d_ind, num_strings*sizeof(unsigned));
  cudaMemcpy(d_fdata, fdata, next*sizeof(char), cudaMemcpyHostToDevice);
  cudaMemcpy(d_ind, ind, num_strings*sizeof(unsigned), cudaMemcpyHostToDevice);
  printf("method 1:\n");
  kern_1D<<<(num_strings+nTPB-1)/nTPB, nTPB>>>(d_fdata, d_ind, num_strings);
  cudaDeviceSynchronize();

  // method 2: "2D" (pointer-to-pointer) array
  // d_data is the pointer array in device memory; d_temp_data is a host-side
  // array holding the same device pointers, because the host cannot write
  // into d_data[i] directly
  char **d_data;
  cudaMalloc(&d_data, num_strings*sizeof(char *));
  char **d_temp_data;
  d_temp_data = (char **)malloc(num_strings*sizeof(char *));
  for (int i = 0; i < num_strings; i++){
    cudaMalloc(&(d_temp_data[i]), ds[i]*sizeof(char));
    cudaMemcpy(d_temp_data[i], data[i], ds[i]*sizeof(char), cudaMemcpyHostToDevice);
    // store the device pointer of string i into slot i of the device array
    cudaMemcpy(d_data+i, &(d_temp_data[i]), sizeof(char *), cudaMemcpyHostToDevice);}
  printf("method 2:\n");
  kern_2D<<<(num_strings+nTPB-1)/nTPB, nTPB>>>(d_data, num_strings);
  cudaDeviceSynchronize();

  // method 3: managed allocations
  // start over with a managed char** array
  char **m_data;
  cudaMallocManaged(&m_data, num_strings*sizeof(char *));
  cudaMallocManaged(&(m_data[0]), ds[0]*sizeof(char));
  cudaMallocManaged(&(m_data[1]), ds[1]*sizeof(char));
  cudaMallocManaged(&(m_data[2]), ds[2]*sizeof(char));
  // initialize said array
  strcpy(m_data[0], s0);
  strcpy(m_data[1], s1);
  strcpy(m_data[2], s2);
  // call kernel directly on managed data
  printf("method 3:\n");
  kern_2D<<<(num_strings+nTPB-1)/nTPB, nTPB>>>(m_data, num_strings);
  cudaDeviceSynchronize();

  // release everything allocated above; the per-string buffers go first
  // because their pointers are stored in the arrays freed afterwards
  for (int i = 0; i < num_strings; i++){
    cudaFree(m_data[i]);
    cudaFree(d_temp_data[i]);
    free(data[i]);}
  cudaFree(m_data);
  cudaFree(d_data);
  cudaFree(d_ind);
  cudaFree(d_fdata);
  free(d_temp_data);
  free(ind);
  free(fdata);
  free(data);
  return 0;
}
$ nvcc -arch=sm_35 -o t1035 t1035.cu
$ cuda-memcheck ./t1035
========= CUDA-MEMCHECK
method 1:
Hello from thread 0, my string is s1
Hello from thread 1, my string is s2
Hello from thread 2, my string is s3
method 2:
Hello from thread 0, my string is s1
Hello from thread 1, my string is s2
Hello from thread 2, my string is s3
method 3:
Hello from thread 0, my string is s1
Hello from thread 1, my string is s2
Hello from thread 2, my string is s3
========= ERROR SUMMARY: 0 errors
$
注意事项:
如果您是第一次试运行这段代码,我建议用 cuda-memcheck 来运行它。为简洁起见,我省略了适当的 CUDA 错误检查(proper cuda error checking),但只要您的 CUDA 代码出现问题,我都建议您加上它。此代码能否正确执行取决于是否有可用的托管内存子系统(请阅读我提供的文档链接)。如果您的平台不支持它,按原样运行此代码可能会导致段错误,因为我没有包含适当的错误检查。
将双指针数组从设备复制回主机虽然没有在本示例中明确演示,但本质上就是把 3 种方法各自的步骤反过来做。对于方法 1,一次 cudaMemcpy 调用就可以做到。对于方法 2,需要一个 for 循环来逆向执行复制到设备时的各个步骤(包括使用临时指针)。对于方法 3,只需正确遵守托管内存的编码规范(例如在内核调用之后、主机代码再次访问这些数据之前调用 cudaDeviceSynchronize()),除此之外什么都不需要做。
就“提供一种将 char ** 数组传递给 CUDA 内核的方法”而言,我不想争论方法 1 和 3 是否严格符合问题的字面要求。如果您的关注点就是那么窄,请使用方法 2,或者干脆完全忽略此答案。
编辑:针对下面评论中的一个问题,这里给出修改后的上述代码,其中主机端字符串改用了另一种初始化方式(在第 42 行)。现在编译时会出现警告,但这些警告是由 OP 明确要求使用的代码引起的:
$ cat t1036.cu
#include <stdio.h>
#include <string.h>
#define nTPB 256
// Prints one greeting per string, reading the strings out of a single flat
// character buffer.
//   data        - character buffer that contains every string
//   indices     - indices[i] is the offset into data at which string i starts
//   num_strings - number of strings; threads whose global index is not below
//                 this value print nothing
// Only the .x components of the launch configuration are used to form the
// thread index. Each string is printed with %s, so it must be NUL-terminated.
__global__ void kern_1D(char *data, unsigned *indices, unsigned num_strings){
  const int tid = blockIdx.x*blockDim.x + threadIdx.x;
  if (tid >= num_strings) return;  // surplus thread beyond the last string
  const char *my_string = data + indices[tid];
  printf("Hello from thread %d, my string is %s\n", tid, my_string);
}
// Prints one greeting per string, reading the strings through an array of
// pointers.
//   data        - array of pointers; data[i] points at string i
//   num_strings - number of entries in data; threads whose global index is
//                 not below this value print nothing
// Only the .x components of the launch configuration are used to form the
// thread index. Each string is printed with %s, so it must be NUL-terminated.
__global__ void kern_2D(char **data, unsigned num_strings){
  const int tid = blockIdx.x*blockDim.x + threadIdx.x;
  if (tid >= num_strings) return;  // surplus thread beyond the last string
  const char *my_string = data[tid];
  printf("Hello from thread %d, my string is %s\n", tid, my_string);
}
// Same three-method demonstration as the first program (flattened buffer,
// device-side char **, managed char **), but the host strings come from the
// pwdAry initialization sequence requested in the comments, and every string
// occupies a fixed slot of slot_bytes bytes.
// Each source string is copied with a bounded, zero-padding copy, so no more
// bytes are read from pwdAry[i] than the string actually contains.
// CUDA return codes are deliberately not checked, to keep the example short.
int main(){
  const int num_strings = 3;
  // fixed number of bytes reserved for each string on the host and the device
  const unsigned slot_bytes = 1024;
#if 0
  const char s0[] = "s1\0";
  const char s1[] = "s2\0";
  const char s2[] = "s3\0";
  int ds[num_strings];
  ds[0] = sizeof(s0)/sizeof(char);
  ds[1] = sizeof(s1)/sizeof(char);
  ds[2] = sizeof(s2)/sizeof(char);
  // pretend we have a dynamically allocated char** array
  char **data;
  data = (char **)malloc(num_strings*sizeof(char *));
  data[0] = (char *)malloc(ds[0]*sizeof(char));
  data[1] = (char *)malloc(ds[1]*sizeof(char));
  data[2] = (char *)malloc(ds[2]*sizeof(char));
  // initialize said array
  strcpy(data[0], s0);
  strcpy(data[1], s1);
  strcpy(data[2], s2);
#endif
  // Initialization sequence kept exactly as requested by the OP.
  // NOTE: the second loop rebinds every pwdAry[a] to a string literal, so the
  // 1024-byte buffers allocated by the first loop become unreachable, and each
  // pwdAry[a] then refers to only 7 bytes ("hello" plus two terminators).
  // This assignment is also what triggers the deprecated-conversion warnings.
  char ** pwdAry;
  pwdAry = new char *[num_strings];
  for (int a = 0; a < num_strings; a++) { pwdAry[a] = new char[1024]; }
  for (int a = 0; a < 3; a++) { pwdAry[a] = "hello\0"; }

  // method 1: "flattening"
  // every string gets its own slot_bytes-sized slot inside fdata; ind[i]
  // records where slot i starts
  char *fdata = (char *)malloc((slot_bytes*num_strings)*sizeof(char));
  unsigned *ind = (unsigned *)malloc(num_strings*sizeof(unsigned));
  unsigned next = 0;
  for (int i = 0; i < num_strings; i++){
    // strncpy stops reading at the terminator and zero-fills the rest of the
    // slot; a raw memcpy of slot_bytes would read past the end of the literal
    strncpy(fdata+next, pwdAry[i], slot_bytes);
    fdata[next+slot_bytes-1] = '\0';  // guarantee termination for over-long input
    ind[i] = next;
    next += slot_bytes;}
  // copy to device (next now holds the total number of bytes in fdata)
  char *d_fdata;
  unsigned *d_ind;
  cudaMalloc(&d_fdata, next*sizeof(char));
  cudaMalloc(&d_ind, num_strings*sizeof(unsigned));
  cudaMemcpy(d_fdata, fdata, next*sizeof(char), cudaMemcpyHostToDevice);
  cudaMemcpy(d_ind, ind, num_strings*sizeof(unsigned), cudaMemcpyHostToDevice);
  printf("method 1:\n");
  kern_1D<<<(num_strings+nTPB-1)/nTPB, nTPB>>>(d_fdata, d_ind, num_strings);
  cudaDeviceSynchronize();

  // method 2: "2D" (pointer-to-pointer) array
  // d_data is the pointer array in device memory; d_temp_data is a host-side
  // array holding the same device pointers, because the host cannot write
  // into d_data[i] directly
  char **d_data;
  cudaMalloc(&d_data, num_strings*sizeof(char *));
  char **d_temp_data;
  d_temp_data = (char **)malloc(num_strings*sizeof(char *));
  // host staging slot: lets a full slot_bytes be transferred to the device
  // without reading slot_bytes from the (shorter) source string
  char *staging = (char *)malloc(slot_bytes*sizeof(char));
  for (int i = 0; i < num_strings; i++){
    strncpy(staging, pwdAry[i], slot_bytes);
    staging[slot_bytes-1] = '\0';
    cudaMalloc(&(d_temp_data[i]), slot_bytes*sizeof(char));
    cudaMemcpy(d_temp_data[i], staging, slot_bytes*sizeof(char), cudaMemcpyHostToDevice);
    // store the device pointer of string i into slot i of the device array
    cudaMemcpy(d_data+i, &(d_temp_data[i]), sizeof(char *), cudaMemcpyHostToDevice);}
  printf("method 2:\n");
  kern_2D<<<(num_strings+nTPB-1)/nTPB, nTPB>>>(d_data, num_strings);
  cudaDeviceSynchronize();

  // method 3: managed allocations
  // start over with a managed char** array
  char **m_data;
  cudaMallocManaged(&m_data, num_strings*sizeof(char *));
  cudaMallocManaged(&(m_data[0]), slot_bytes*sizeof(char));
  cudaMallocManaged(&(m_data[1]), slot_bytes*sizeof(char));
  cudaMallocManaged(&(m_data[2]), slot_bytes*sizeof(char));
  // initialize said array (bounded copy, same reasoning as in method 1)
  for (int i = 0; i < num_strings; i++){
    strncpy(m_data[i], pwdAry[i], slot_bytes);
    m_data[i][slot_bytes-1] = '\0';}
  // call kernel directly on managed data
  printf("method 3:\n");
  kern_2D<<<(num_strings+nTPB-1)/nTPB, nTPB>>>(m_data, num_strings);
  cudaDeviceSynchronize();

  // release everything that is still reachable; the per-string buffers go
  // first because their pointers are stored in the arrays freed afterwards
  for (int i = 0; i < num_strings; i++){
    cudaFree(m_data[i]);
    cudaFree(d_temp_data[i]);}
  cudaFree(m_data);
  cudaFree(d_data);
  cudaFree(d_ind);
  cudaFree(d_fdata);
  free(staging);
  free(d_temp_data);
  free(ind);
  free(fdata);
  // only the pointer array itself: its elements point at string literals
  delete[] pwdAry;
  return 0;
}
$ nvcc -arch=sm_35 -o t1036 t1036.cu
t1036.cu(42): warning: conversion from a string literal to "char *" is deprecated
t1036.cu(42): warning: conversion from a string literal to "char *" is deprecated
$ cuda-memcheck ./t1036
========= CUDA-MEMCHECK
method 1:
Hello from thread 0, my string is hello
Hello from thread 1, my string is hello
Hello from thread 2, my string is hello
method 2:
Hello from thread 0, my string is hello
Hello from thread 1, my string is hello
Hello from thread 2, my string is hello
method 3:
Hello from thread 0, my string is hello
Hello from thread 1, my string is hello
Hello from thread 2, my string is hello
========= ERROR SUMMARY: 0 errors
$