线程之间保留的 Cuda 展开循环变量答案

【问题标题】：Cuda unrolled loop variable preserved between threads（线程之间保留的 CUDA 展开循环变量）
【发布时间】：2015-02-21 05:25:36
【问题描述】：

我一直在研究一个有一些循环展开的 cuda 程序，看起来展开的变量是在线程之间维护的。我不完全确定这不是共享内存问题，但我认为我的索引是正确的。我使用的是 256 的块大小和一维布局。

test.cu

#include <stdio.h>
#include <iostream>

#include <cuda.h>
#include <cuda_runtime.h>

#define BLOCK_SIZE 256

using namespace std;

// Reproduction kernel from the question. Every thread stages the bytes 0..31
// in its own 32-byte slice of shared memory and then copies that slice to
// `arr`.
//
// Launch layout: 1D grid of 1D blocks. The shared buffer holds
// 32 * BLOCK_SIZE bytes, so blockDim.x must not exceed BLOCK_SIZE.
//
// NOTE(review): this is the version that exhibits the reported behaviour; the
// destination index in the second loop makes neighbouring threads write to
// overlapping ranges of `arr` (see the comment on that loop).
__global__ void test_kernel(unsigned char *arr) {
    // Index of this thread inside its block.
    int tid = threadIdx.x;
    // Global thread index across the whole grid (despite the name, this is
    // not a block id).
    int bid = blockIdx.x*blockDim.x + threadIdx.x;

    // One 32-byte slice per thread of the block.
    __shared__ unsigned char sharr[32 * BLOCK_SIZE];

    // Fill this thread's slice with 0, 1, ..., 31.
    #pragma unroll
    for (int i=0; i < 32; ++i) {
        sharr[tid*32+i] = i;
    }

    // Block-wide barrier. Each thread only reads back the slice it wrote
    // itself in the loop below.
    __syncthreads();

    // Thread `bid` stores to arr[bid] .. arr[bid+31], while thread `bid+1`
    // stores to arr[bid+1] .. arr[bid+32]: the ranges overlap, so several
    // threads store different values to the same element with no ordering
    // between them. Disjoint 32-byte output slices would need a destination
    // index of bid*32+j.
    #pragma unroll
    for (int j=0; j < 32; ++j) {
        arr[bid+j] = sharr[tid*32+j];
    }
}

// Host driver: allocates size * 32 bytes on the device, launches test_kernel
// with one thread per 32-byte slice, copies the result back and prints the
// first five rows of 32 values as decimal numbers.
//
// Returns 0 on success and 1 if a host allocation or any CUDA call fails.
int main(int argc, char **argv) {
    const int size = 1024;  // total number of threads launched
    const size_t bytes = (size_t)size * 32 * sizeof(unsigned char);
    int status = 1;

    unsigned char *device_test_arr = NULL;
    unsigned char *host_test_arr = (unsigned char *)malloc(bytes);
    if (host_test_arr == NULL) {
        fprintf(stderr, "host allocation of %lu bytes failed\n", (unsigned long)bytes);
        return 1;
    }

    cudaError_t err = cudaMalloc((void **) &device_test_arr, bytes);

    if (err == cudaSuccess) {
        // The kernel's shared array is sized from BLOCK_SIZE, so the launch
        // uses the same constant. `size` must be a multiple of the block
        // size because the kernel performs no bounds check.
        const dim3 block_size(BLOCK_SIZE);
        const dim3 num_blocks(size / block_size.x);

        test_kernel<<<num_blocks, block_size>>>(device_test_arr);
        // A launch does not return a status; configuration errors are
        // reported through cudaGetLastError().
        err = cudaGetLastError();
    }

    if (err == cudaSuccess) {
        // Blocking copy on the default stream: it completes after the kernel
        // and reports any error raised while the kernel was running.
        err = cudaMemcpy(host_test_arr, device_test_arr, bytes, cudaMemcpyDeviceToHost);
    }

    if (err == cudaSuccess) {
        for (int i=0; i < 5; ++i) {
            for (int j=0; j < 32; ++j) {
                // Convert to int so the value is printed as a number; an
                // unsigned char would be streamed as a raw character.
                cout << static_cast<int>(host_test_arr[i*32+j]) << ", ";
            }
            cout << "\n";
        }
        status = 0;
    } else {
        fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(err));
    }

    // cudaFree on a null pointer is a no-op, so this is safe on every path.
    cudaFree(device_test_arr);
    free(host_test_arr);
    return status;
}

我希望输出是：

0, 1, 2, 3, 4, ..., 30, 31
0, 1, 2, 3, 4, ..., 30, 31

相反，我得到：

0, 1, 2, 3, 4, ..., 30, 31
31, 31, 31, 31, ..., 31, 31

我正在使用计算能力 3.5 和 cuda 7.0 进行测试

【问题讨论】：

到目前为止，您采取了哪些步骤来自行调试？
如果你注释掉 #pragma unroll，输出会有变化吗？
我试过注释掉 #pragma unroll 但我认为没有它 cuda 编译器会自动执行它，如果可能的话我不确定如何禁用它。

标签： cuda

【解决方案1】：

我做的修改比必要的多一些，但关键只有下面这一行：每个线程要写入 32 个连续字节，所以全局偏移必须是 bid*32+j；原来的 bid+j 会让相邻线程的写入区间互相重叠，先写入的值会被其他线程覆盖。

arr[bid*32+j] = sharr[tid*32+j];

完整代码：

#include <stdio.h>
#include <iostream>

#include <cuda.h>
#include <cuda_runtime.h>

#define BLOCK_SIZE 256

using namespace std;

// Each thread stages the bytes 0, 1, ..., 31 in its own 32-byte slice of
// shared memory and then copies that slice to its own 32-byte slice of `arr`,
// so the output is the pattern 0..31 repeated once per thread.
//
// Launch layout: 1D grid of 1D blocks. The shared buffer holds
// 32 * BLOCK_SIZE bytes, so blockDim.x must not exceed BLOCK_SIZE.
// `arr` must hold at least gridDim.x * blockDim.x * 32 bytes; the kernel
// performs no bounds check.
__global__ void test_kernel(unsigned char *arr) {
    const int kPerThread = 32;  // bytes produced by every thread

    __shared__ unsigned char sharr[32 * BLOCK_SIZE];

    // Global thread index across the grid: 0 .. gridDim.x * blockDim.x - 1.
    const int globalThread = blockIdx.x * blockDim.x + threadIdx.x;

    // This thread's private slices of the staging buffer and of the output.
    unsigned char *stage = sharr + threadIdx.x * kPerThread;
    unsigned char *out = arr + globalThread * kPerThread;

    #pragma unroll
    for (int k = 0; k < kPerThread; ++k) {
        stage[k] = k;
    }

    // Block-wide barrier between the shared-memory writes and reads.
    __syncthreads();

    // Multiplying the thread index by 32 (via `out`) keeps the output slices
    // of different threads disjoint.
    #pragma unroll
    for (int k = 0; k < kPerThread; ++k) {
        out[k] = stage[k];
    }
}

// Reports a failed CUDA runtime call on stderr.
// Returns true when `err` is cudaSuccess, false otherwise.
static bool cuda_ok(cudaError_t err, const char *what) {
    if (err == cudaSuccess) {
        return true;
    }
    fprintf(stderr, "CUDA error in %s: %s\n", what, cudaGetErrorString(err));
    return false;
}

// Host driver: launches test_kernel with `size` threads (32 output bytes per
// thread), copies the result back and prints the first five rows of 32
// values as decimal numbers.
//
// Returns 0 on success and 1 if any CUDA call fails.
int main(int argc, char **argv) {
    // Compile-time constants, so the host buffer below is an ordinary
    // fixed-size array rather than a non-standard variable-length array.
    const int size = 1024;      // total number of threads launched
    const int per_thread = 32;  // bytes written by each thread
    const size_t bytes = (size_t)size * per_thread * sizeof(unsigned char);

    unsigned char *device_test_arr = NULL;
    if (!cuda_ok(cudaMalloc((void **) &device_test_arr, bytes), "cudaMalloc")) {
        return 1;
    }

    // The kernel's shared array is sized from BLOCK_SIZE, so the launch uses
    // the same constant. `size` must be a multiple of the block size because
    // the kernel performs no bounds check.
    const dim3 block_size(BLOCK_SIZE);
    const dim3 num_blocks(size / block_size.x);
    test_kernel<<<num_blocks, block_size>>>(device_test_arr);

    unsigned char host_test_arr[size * per_thread];

    // cudaGetLastError() reports launch-configuration errors; the blocking
    // cudaMemcpy completes after the kernel and reports errors raised while
    // it was running.
    const bool ok =
        cuda_ok(cudaGetLastError(), "test_kernel launch") &&
        cuda_ok(cudaMemcpy(host_test_arr, device_test_arr, bytes, cudaMemcpyDeviceToHost),
                "cudaMemcpy");

    if (ok) {
        for (int i=0; i < 5; ++i) {
            for (int j=0; j < per_thread; ++j) {
                // Convert to int so the value is printed as a number.
                cout << static_cast<int>(host_test_arr[i*per_thread+j]) << ", ";
            }
            cout << endl;
        }
    }

    // Release the device buffer on both the success and the failure path.
    cudaFree(device_test_arr);
    return ok ? 0 : 1;
}

【讨论】：

非常感谢，我真的很困惑如何在线程之间共享局部变量，结果我只是不擅长索引。