今天想起一个问题:看到的绝大多数 CUDA 代码使用的都是一维数组,那么是否可以在 CUDA 中使用多维数组(多重指针的形式)?为此尝试了各种办法,被错误状态码 77 和段错误折磨了许久,最后发现有一个 cudaMallocManaged 函数,它可以很好地组织多维数组的多重指针形式。后来发现,这个问题之前在 Stack Overflow 上就有很好的解答。先贴一下我自己的代码实现:
#include "cuda_runtime.h"
#include "device_launch_parameters.h"

#include <stdio.h>

// Number of elements in each input vector.
const int arraySize = 5;

cudaError_t addWithCuda(int *c, const int *a, const int *b, size_t size);

// Element-wise vector add into a 2-row output table.
//
// Expects a launch of <<<1, 2*arraySize>>>: threads [0, arraySize) fill
// row 0, threads [arraySize, 2*arraySize) fill row 1 (both rows end up
// holding the same sums).  c is a device-readable table of two row
// pointers (here allocated with cudaMallocManaged so the host can fill
// it in directly).  The upper bound check protects against launches
// larger than 2*arraySize, which previously wrote out of bounds.
__global__ void addKernel(int **c, const int *a, const int *b)
{
    int i = threadIdx.x;
    if (i < arraySize)
        c[0][i] = a[i] + b[i];
    else if (i < 2 * arraySize)
        c[1][i - arraySize] = a[i - arraySize] + b[i - arraySize];
}

int main()
{
    const int a[arraySize] = { 1, 2, 3, 4, 5 };
    const int b[arraySize] = { 10, 20, 30, 40, 50 };
    int c[arraySize] = { 0 };

    // Add vectors in parallel.
    cudaError_t cudaStatus = addWithCuda(c, a, b, arraySize);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "addWithCuda failed!");
        return 1;
    }

    printf("{1,2,3,4,5} + {10,20,30,40,50} = {%d,%d,%d,%d,%d}\n",
           c[0], c[1], c[2], c[3], c[4]);

    // cudaDeviceReset must be called before exiting in order for profiling
    // and tracing tools such as Nsight and Visual Profiler to show complete
    // traces.  (cudaThreadExit is deprecated; cudaDeviceReset replaces it.)
    cudaStatus = cudaDeviceReset();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaDeviceReset failed!");
        return 1;
    }

    return 0;
}

// Helper function for using CUDA to add vectors in parallel.
//
// Adds a[0..size) and b[0..size) on the GPU and copies one result row back
// into c[0..size).  Demonstrates building a 2D device array as a managed
// table of row pointers (dev_c) that index into one contiguous device
// allocation (dev_c0), so both host and kernel can use c[row][col] syntax.
//
// Returns the first failing cudaError_t, or cudaSuccess.  All device
// allocations are released on every exit path (the original version leaked
// dev_c0 and dev_c).
cudaError_t addWithCuda(int *c, const int *a, const int *b, size_t size)
{
    int *dev_a = 0;
    int *dev_b = 0;
    int *dev_c0 = 0;   // contiguous storage backing both output rows
    int **dev_c = 0;   // managed row-pointer table, visible to host and device
    cudaError_t cudaStatus;

    // Choose which GPU to run on; change this on a multi-GPU system.
    cudaStatus = cudaSetDevice(0);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaSetDevice failed!  Do you have a CUDA-capable GPU installed?");
        goto Error;
    }

    // Managed memory lets the host write the row pointers below while the
    // kernel can still dereference the same table on the device.
    cudaStatus = cudaMallocManaged(&dev_c, 2 * sizeof(int*));
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMallocManaged failed: %s\n", cudaGetErrorString(cudaStatus));
        goto Error;
    }

    // One contiguous allocation holding both rows back to back.
    cudaStatus = cudaMalloc((void**)&dev_c0, size * sizeof(int) * 2);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed: %s\n", cudaGetErrorString(cudaStatus));
        goto Error;
    }

    // Row pointers computed from the helper's size parameter (the original
    // hard-coded the global arraySize here, breaking any other size).
    dev_c[0] = dev_c0;
    dev_c[1] = dev_c0 + size;

    cudaStatus = cudaMalloc((void**)&dev_a, size * sizeof(int));
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed: %s\n", cudaGetErrorString(cudaStatus));
        goto Error;
    }

    cudaStatus = cudaMalloc((void**)&dev_b, size * sizeof(int));
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed: %s\n", cudaGetErrorString(cudaStatus));
        goto Error;
    }

    // Copy input vectors from host memory to GPU buffers.
    cudaStatus = cudaMemcpy(dev_a, a, size * sizeof(int), cudaMemcpyHostToDevice);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy failed: %s\n", cudaGetErrorString(cudaStatus));
        goto Error;
    }

    cudaStatus = cudaMemcpy(dev_b, b, size * sizeof(int), cudaMemcpyHostToDevice);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy failed: %s\n", cudaGetErrorString(cudaStatus));
        goto Error;
    }

    // Launch: one block, one thread per output element (two rows' worth).
    addKernel<<<1, (unsigned int)(size * 2)>>>(dev_c, dev_a, dev_b);

    // Catch launch-configuration errors (bad grid/block dims, etc.), which
    // are NOT reported by the synchronize call's return value alone.
    cudaStatus = cudaGetLastError();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "addKernel launch failed: %s\n", cudaGetErrorString(cudaStatus));
        goto Error;
    }

    // cudaDeviceSynchronize waits for the kernel to finish and returns any
    // errors encountered during execution.  (cudaThreadSynchronize is
    // deprecated; cudaDeviceSynchronize replaces it.)
    cudaStatus = cudaDeviceSynchronize();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching addKernel!\n", cudaStatus);
        goto Error;
    }

    // Copy one output row from the GPU back to host memory (both rows hold
    // the same sums; row 1 is copied to match the original behavior).
    cudaStatus = cudaMemcpy(c, dev_c[1], size * sizeof(int), cudaMemcpyDeviceToHost);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy failed: %s\n", cudaGetErrorString(cudaStatus));
        goto Error;
    }

Error:
    // Release everything on both success and failure paths; the original
    // version leaked dev_c0 and the managed table dev_c.
    cudaFree(dev_a);
    cudaFree(dev_b);
    cudaFree(dev_c0);
    cudaFree(dev_c);

    return cudaStatus;
}