您尝试使用的结构中带有一个指向结构数组的指针,而数组中的每个结构又各自嵌入一个指针,这会使基于 cudaMalloc 的实现变得非常复杂。如果改用 cudaMallocManaged,会稍微简单一些,但仍然不必要地复杂。复杂性来自两方面:一是 cudaMalloc 在特定设备上分配空间,(默认情况下)其他设备无法访问这些数据;二是这些嵌入式指针要求进行各种“深拷贝”。下面是一个可运行的例子:
$ cat t1492.cu
#include <iostream>
#include <stdio.h>
// One per-device data buffer: a device pointer to an array of doubles
// (only the "real" component of a complex value is modeled here).
typedef struct ComplexArray
{
double *real;
} ComplexArray;
// Wrapper holding a pointer to an array of ComplexArray entries (one per
// GPU); in main below, Arr is allocated with cudaMalloc on each device,
// which is what forces the "deep copy" of the pointer table.
typedef struct ComplexArrayArray
{
ComplexArray* Arr;
} ComplexArrayArray;
// Adds the device index `dev` to each element of device `dev`'s chunk,
// reached through the per-device pointer table stateVector.Arr.
// Expects a 1D launch with at least `ds` total threads; surplus threads
// are guarded out by the bounds check.
__global__ void kernel(ComplexArrayArray stateVector, int dev, int ds)
{
    int index = threadIdx.x + blockDim.x * blockIdx.x;
    if (index >= ds) return;
    stateVector.Arr[dev].real[index] += dev;
}
const int nTPB = 256;  // threads per block for every kernel launch

// Deep-copy demo: each device gets (a) its own chunk of hdata and (b) a full
// device-resident table of ComplexArray entries so its kernel can locate its
// own pointer through stateVector.Arr[dev].
// NOTE(review): assumes ds is evenly divisible by numberOfGPU -- trailing
// elements would be silently dropped otherwise.
int main(){
  int numberOfGPU;
  cudaGetDeviceCount(&numberOfGPU);
  std::cout << "GPU count: " << numberOfGPU << std::endl;
  ComplexArrayArray *stateVector = new ComplexArrayArray[numberOfGPU];
  const int ds = 32;                 // total element count across all GPUs
  const int chunk = ds/numberOfGPU;  // per-device slice (hoisted; was recomputed at every use)
  double *hdata = new double[ds]();  // zero-initialized host buffer
  ComplexArray *ddata = new ComplexArray[numberOfGPU];
  // Stage 1: per device, allocate the pointer table and the data chunk,
  // then upload that device's slice of hdata.
  for (int i = 0; i < numberOfGPU; i++){
    cudaSetDevice(i);
    cudaMalloc(&(stateVector[i].Arr), sizeof(ComplexArray) * numberOfGPU);
    cudaMalloc(&(ddata[i].real), chunk*sizeof(double));
    cudaMemcpy(ddata[i].real, hdata + i*chunk, chunk*sizeof(double), cudaMemcpyHostToDevice);
  }
  // Stage 2: the "deep copy" -- publish the complete table of device
  // pointers (ddata) into every device's table allocation.
  for (int i = 0; i < numberOfGPU; i++){
    cudaSetDevice(i);
    cudaMemcpy(stateVector[i].Arr, ddata, sizeof(ComplexArray)*numberOfGPU, cudaMemcpyHostToDevice);
  }
  // Stage 3: one launch per device over its own chunk.
  for (int i = 0; i < numberOfGPU; i++){
    cudaSetDevice(i);
    kernel<<<(chunk+nTPB-1)/nTPB,nTPB>>>(stateVector[i], i, chunk);
  }
  // Kernel launches return no status; surface configuration errors here.
  cudaError_t err = cudaGetLastError();
  if (err != cudaSuccess)
    std::cout << "launch error: " << cudaGetErrorString(err) << std::endl;
  // Stage 4: copy each chunk back (cudaMemcpy synchronizes with the kernel).
  for (int i = 0; i < numberOfGPU; i++){
    cudaSetDevice(i);
    cudaMemcpy(hdata + i*chunk, ddata[i].real, chunk*sizeof(double), cudaMemcpyDeviceToHost);
  }
  for (int i = 0; i < ds; i++)
    std::cout << hdata[i] << " ";
  std::cout << std::endl;
  // Release device and host resources (leaked in the original).
  for (int i = 0; i < numberOfGPU; i++){
    cudaSetDevice(i);
    cudaFree(ddata[i].real);
    cudaFree(stateVector[i].Arr);
  }
  delete[] ddata;
  delete[] hdata;
  delete[] stateVector;
}
$ nvcc -o t1492 t1492.cu
$ cuda-memcheck ./t1492
========= CUDA-MEMCHECK
GPU count: 4
0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 3 3 3 3 3 3 3 3
========= ERROR SUMMARY: 0 errors
$
但是,如果您只是想把主机数组划分成每个 GPU 一块,则不需要那种程度的复杂性。下面是一个更简单的例子:
$ cat t1493.cu
#include <iostream>
#include <stdio.h>
// One per-device data buffer: a device pointer to an array of doubles
// (only the "real" component of a complex value is modeled here).
typedef struct ComplexArray
{
double *real;
} ComplexArray;
// NOTE(review): unused in this program -- the kernel and main below take
// ComplexArray directly; kept only to mirror the first example's layout.
typedef struct ComplexArrayArray
{
ComplexArray* Arr;
} ComplexArrayArray;
// Adds the device index `dev` to each element of the chunk owned by the
// struct passed by value; no device-side pointer table is involved.
// Expects a 1D launch with at least `ds` total threads.
__global__ void kernel(ComplexArray stateVector, int dev, int ds)
{
    int gid = threadIdx.x + blockDim.x * blockIdx.x;
    if (gid < ds)
        stateVector.real[gid] += dev;
}
const int nTPB = 256;  // threads per block for every kernel launch

// Simpler scheme: one ComplexArray per device, each holding only its own
// chunk pointer -- no pointer table on the device, hence no deep copy.
// NOTE(review): assumes ds is evenly divisible by numberOfGPU.
int main(){
  int numberOfGPU;
  cudaGetDeviceCount(&numberOfGPU);
  std::cout << "GPU count: " << numberOfGPU << std::endl;
  ComplexArray *stateVector = new ComplexArray[numberOfGPU];
  const int ds = 32;                 // total element count across all GPUs
  const int chunk = ds/numberOfGPU;  // per-device slice (hoisted; was recomputed at every use)
  double *hdata = new double[ds]();  // zero-initialized host buffer
  // Per device: allocate its chunk and upload that slice of hdata.
  for (int i = 0; i < numberOfGPU; i++){
    cudaSetDevice(i);
    cudaMalloc(&(stateVector[i].real), chunk*sizeof(double));
    cudaMemcpy(stateVector[i].real, hdata + i*chunk, chunk*sizeof(double), cudaMemcpyHostToDevice);
  }
  // One launch per device; the struct is passed by value, so the kernel
  // receives its device pointer directly as a kernel argument.
  for (int i = 0; i < numberOfGPU; i++){
    cudaSetDevice(i);
    kernel<<<(chunk+nTPB-1)/nTPB,nTPB>>>(stateVector[i], i, chunk);
  }
  // Kernel launches return no status; surface configuration errors here.
  cudaError_t err = cudaGetLastError();
  if (err != cudaSuccess)
    std::cout << "launch error: " << cudaGetErrorString(err) << std::endl;
  // Copy each chunk back (cudaMemcpy synchronizes with the kernel).
  for (int i = 0; i < numberOfGPU; i++){
    cudaSetDevice(i);
    cudaMemcpy(hdata + i*chunk, stateVector[i].real, chunk*sizeof(double), cudaMemcpyDeviceToHost);
  }
  for (int i = 0; i < ds; i++)
    std::cout << hdata[i] << " ";
  std::cout << std::endl;
  // Release device and host resources (leaked in the original).
  for (int i = 0; i < numberOfGPU; i++){
    cudaSetDevice(i);
    cudaFree(stateVector[i].real);
  }
  delete[] hdata;
  delete[] stateVector;
}
$ nvcc -o t1493 t1493.cu
$ cuda-memcheck ./t1493
========= CUDA-MEMCHECK
GPU count: 4
0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 3 3 3 3 3 3 3 3
========= ERROR SUMMARY: 0 errors
$
请注意,您的问题似乎提到了把数据分成块、并且每个内核都可能访问所有块的想法。这将需要使用托管内存(managed memory),或者需要确认系统支持 GPU 之间的 P2P 访问。那会进一步增加复杂性,超出了本回答的范围;本回答关注的是内核无法访问“它自己的”那块数据的问题。
由于我们应该能够为可参与的 GPU 数量设置一个上限(这里设为最多 8 个),因此可以避免第一种方法中的深拷贝,同时仍让每个 GPU 拥有全部指针。下面是一个修改过的例子:
$ cat t1495.cu
#include <iostream>
#include <stdio.h>
// Upper bound on participating GPUs; sizing the member array at compile
// time replaces the embedded device pointer (and thus the deep copy)
// used in the first example.
const int maxGPU=8;
// Fixed-size table of per-device chunk pointers, passed to kernels by
// value so every device sees every pointer without any device-side copy.
typedef struct ComplexArray
{
double *real[maxGPU];
} ComplexArray;
// Adds the device index `dev` to each element of device `dev`'s slice,
// selected from the fixed-size pointer table inside the by-value struct.
// Expects a 1D launch with at least `ds` total threads.
__global__ void kernel(ComplexArray stateVector, int dev, int ds)
{
    int idx = threadIdx.x + blockDim.x * blockIdx.x;
    if (idx >= ds) return;
    stateVector.real[dev][idx] += dev;
}
const int nTPB = 256;  // threads per block for every kernel launch

// All per-device chunk pointers live in one fixed-size struct passed by
// value, so every kernel sees every pointer without a device-side table.
// NOTE(review): assumes ds is evenly divisible by numberOfGPU.
int main(){
  int numberOfGPU;
  cudaGetDeviceCount(&numberOfGPU);
  std::cout << "GPU count: " << numberOfGPU << std::endl;
  // Guard the fixed-size real[maxGPU] table: on a system with more than
  // maxGPU devices the original code would write past the array.
  if (numberOfGPU > maxGPU) numberOfGPU = maxGPU;
  ComplexArray stateVector;
  const int ds = 32;                 // total element count across all GPUs
  const int chunk = ds/numberOfGPU;  // per-device slice (hoisted; was recomputed at every use)
  double *hdata = new double[ds]();  // zero-initialized host buffer
  // Per device: allocate its chunk and upload that slice of hdata.
  for (int i = 0; i < numberOfGPU; i++){
    cudaSetDevice(i);
    cudaMalloc(&(stateVector.real[i]), chunk*sizeof(double));
    cudaMemcpy(stateVector.real[i], hdata + i*chunk, chunk*sizeof(double), cudaMemcpyHostToDevice);
  }
  // One launch per device; the whole pointer table travels by value.
  for (int i = 0; i < numberOfGPU; i++){
    cudaSetDevice(i);
    kernel<<<(chunk+nTPB-1)/nTPB,nTPB>>>(stateVector, i, chunk);
  }
  // Kernel launches return no status; surface configuration errors here.
  cudaError_t err = cudaGetLastError();
  if (err != cudaSuccess)
    std::cout << "launch error: " << cudaGetErrorString(err) << std::endl;
  // Copy each chunk back (cudaMemcpy synchronizes with the kernel).
  for (int i = 0; i < numberOfGPU; i++){
    cudaSetDevice(i);
    cudaMemcpy(hdata + i*chunk, stateVector.real[i], chunk*sizeof(double), cudaMemcpyDeviceToHost);
  }
  for (int i = 0; i < ds; i++)
    std::cout << hdata[i] << " ";
  std::cout << std::endl;
  // Release device and host resources (leaked in the original).
  for (int i = 0; i < numberOfGPU; i++){
    cudaSetDevice(i);
    cudaFree(stateVector.real[i]);
  }
  delete[] hdata;
}
$ nvcc -o t1495 t1495.cu
$ cuda-memcheck ./t1495
========= CUDA-MEMCHECK
GPU count: 4
0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 3 3 3 3 3 3 3 3
========= ERROR SUMMARY: 0 errors
$