今天想起一个问题:看到的绝大多数 CUDA 代码使用的都是一维数组,那么是否可以在 CUDA 中使用多维数组(多重指针的形式)?为此尝试了各种办法,被错误状态码 77 和段错误折磨了许久,最后发现有一个 cudaMallocManaged 函数,它可以很好地组织多维数组的多重指针形式。后来发现,这个问题之前在 Stack Overflow 上就有很好的解答。先贴一下我自己的代码实现:
#include "cuda_runtime.h"
#include "device_launch_parameters.h"

#include <stdio.h>

// Number of elements in each input vector.
const int arraySize = 5;

cudaError_t addWithCuda(int *c, const int *a, const int *b, size_t size);

// Element-wise vector add into a 2-row output table.
//
// Expects a launch of <<<1, 2*arraySize>>>: threads [0, arraySize) fill
// row 0, threads [arraySize, 2*arraySize) fill row 1 (both rows end up
// holding the same sums).  c is a device-readable table of two row
// pointers (here allocated with cudaMallocManaged so the host can fill
// it in directly).  The upper bound check protects against launches
// larger than 2*arraySize, which previously wrote out of bounds.
__global__ void addKernel(int **c, const int *a, const int *b)
{
    int i = threadIdx.x;
    if (i < arraySize)
        c[0][i] = a[i] + b[i];
    else if (i < 2 * arraySize)
        c[1][i - arraySize] = a[i - arraySize] + b[i - arraySize];
}

int main()
{
    const int a[arraySize] = { 1, 2, 3, 4, 5 };
    const int b[arraySize] = { 10, 20, 30, 40, 50 };
    int c[arraySize] = { 0 };

    // Add vectors in parallel.
    cudaError_t cudaStatus = addWithCuda(c, a, b, arraySize);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "addWithCuda failed!");
        return 1;
    }

    printf("{1,2,3,4,5} + {10,20,30,40,50} = {%d,%d,%d,%d,%d}\n",
           c[0], c[1], c[2], c[3], c[4]);

    // cudaDeviceReset must be called before exiting in order for profiling
    // and tracing tools such as Nsight and Visual Profiler to show complete
    // traces.  (cudaThreadExit is deprecated; cudaDeviceReset replaces it.)
    cudaStatus = cudaDeviceReset();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaDeviceReset failed!");
        return 1;
    }

    return 0;
}

// Helper function for using CUDA to add vectors in parallel.
//
// Adds a[0..size) and b[0..size) on the GPU and copies one result row back
// into c[0..size).  Demonstrates building a 2D device array as a managed
// table of row pointers (dev_c) that index into one contiguous device
// allocation (dev_c0), so both host and kernel can use c[row][col] syntax.
//
// Returns the first failing cudaError_t, or cudaSuccess.  All device
// allocations are released on every exit path (the original version leaked
// dev_c0 and dev_c).
cudaError_t addWithCuda(int *c, const int *a, const int *b, size_t size)
{
    int *dev_a = 0;
    int *dev_b = 0;
    int *dev_c0 = 0;   // contiguous storage backing both output rows
    int **dev_c = 0;   // managed row-pointer table, visible to host and device
    cudaError_t cudaStatus;

    // Choose which GPU to run on; change this on a multi-GPU system.
    cudaStatus = cudaSetDevice(0);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaSetDevice failed!  Do you have a CUDA-capable GPU installed?");
        goto Error;
    }

    // Managed memory lets the host write the row pointers below while the
    // kernel can still dereference the same table on the device.
    cudaStatus = cudaMallocManaged(&dev_c, 2 * sizeof(int*));
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMallocManaged failed: %s\n", cudaGetErrorString(cudaStatus));
        goto Error;
    }

    // One contiguous allocation holding both rows back to back.
    cudaStatus = cudaMalloc((void**)&dev_c0, size * sizeof(int) * 2);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed: %s\n", cudaGetErrorString(cudaStatus));
        goto Error;
    }

    // Row pointers computed from the helper's size parameter (the original
    // hard-coded the global arraySize here, breaking any other size).
    dev_c[0] = dev_c0;
    dev_c[1] = dev_c0 + size;

    cudaStatus = cudaMalloc((void**)&dev_a, size * sizeof(int));
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed: %s\n", cudaGetErrorString(cudaStatus));
        goto Error;
    }

    cudaStatus = cudaMalloc((void**)&dev_b, size * sizeof(int));
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed: %s\n", cudaGetErrorString(cudaStatus));
        goto Error;
    }

    // Copy input vectors from host memory to GPU buffers.
    cudaStatus = cudaMemcpy(dev_a, a, size * sizeof(int), cudaMemcpyHostToDevice);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy failed: %s\n", cudaGetErrorString(cudaStatus));
        goto Error;
    }

    cudaStatus = cudaMemcpy(dev_b, b, size * sizeof(int), cudaMemcpyHostToDevice);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy failed: %s\n", cudaGetErrorString(cudaStatus));
        goto Error;
    }

    // Launch: one block, one thread per output element (two rows' worth).
    addKernel<<<1, (unsigned int)(size * 2)>>>(dev_c, dev_a, dev_b);

    // Catch launch-configuration errors (bad grid/block dims, etc.), which
    // are NOT reported by the synchronize call's return value alone.
    cudaStatus = cudaGetLastError();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "addKernel launch failed: %s\n", cudaGetErrorString(cudaStatus));
        goto Error;
    }

    // cudaDeviceSynchronize waits for the kernel to finish and returns any
    // errors encountered during execution.  (cudaThreadSynchronize is
    // deprecated; cudaDeviceSynchronize replaces it.)
    cudaStatus = cudaDeviceSynchronize();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching addKernel!\n", cudaStatus);
        goto Error;
    }

    // Copy one output row from the GPU back to host memory (both rows hold
    // the same sums; row 1 is copied to match the original behavior).
    cudaStatus = cudaMemcpy(c, dev_c[1], size * sizeof(int), cudaMemcpyDeviceToHost);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy failed: %s\n", cudaGetErrorString(cudaStatus));
        goto Error;
    }

Error:
    // Release everything on both success and failure paths; the original
    // version leaked dev_c0 and the managed table dev_c.
    cudaFree(dev_a);
    cudaFree(dev_b);
    cudaFree(dev_c0);
    cudaFree(dev_c);

    return cudaStatus;
}