内核中的 CUDA lambda 表达式答案

【问题标题】：CUDA lambda expression in kernel内核中的 CUDA lambda 表达式
【发布时间】：2019-11-22 01:24:08
【问题描述】：

我在 CUDA 内核代码中实现 lambda 表达式时遇到问题，它可以编译但执行失败。我使用 Ubuntu 18.04 和 CUDA 10.1 并使用 --expt-extended-lambda 编译。

我只是想以简洁明了的方式在自定义设备矩阵上实现一些基本操作，如点乘、加法、提取等。

我已经测试了推力，但它在与更复杂的自定义 CUDA 内核代码混合时导致了几个设备内存错误。手动 CUDA 分配内存，将 cudaMalloc 转换为 thrust::device_ptr，然后使用推力例程效果不佳，我宁愿摆脱推力。

这是一个失败的模板表达式的基本使用示例，我不知道为什么。 transform / transformProcess 方法失败。显然 binaryFunction 传递的 lambda 表达式不能应用于设备代码。

EDIT 2（修正代码没有编译错误）

Types.cuh

#ifndef TEST_CUDA_DEVICE_LAMBDA_PROCESSING_TYPES_CUH
#define TEST_CUDA_DEVICE_LAMBDA_PROCESSING_TYPES_CUH

#include <cuda_runtime.h>
#include <nvfunctional>
#include <iostream>
#include <vector>
#include <string>

typedef unsigned int uint;

inline bool check(int e, int iLine, const char *szFile) {
    if (e < 0) {
        std::cerr << "General error " << e << " at line " << iLine << " in file " << szFile << std::endl;
        return false;
    }
    return true;
}

#define ck(call) check(call, __LINE__, __FILE__)

template <typename precision>
struct CudaMatrix {
    typedef nvstd::function<precision(precision, precision)> binaryFunction;

    CudaMatrix(uint width, uint height) : width(width), height(height) { }

    __device__ __host__ uint size() const { return width * height; }

    uint       bytesSize() const { return size() * sizeof(precision); }
    void       fill(precision value);
    void       setValuesFromVector(const std::vector<precision> &vector);
    void       display(const std::string &name = "") const;
    CudaMatrix transform(const CudaMatrix &A, binaryFunction lambda);

    CudaMatrix  operator+=(const CudaMatrix &m) { return transform(m, [=] __device__ (precision x, precision y) { return x + y; }); }
    CudaMatrix  operator-=(const CudaMatrix &m) { return transform(m, [=] __device__ (precision x, precision y) { return x - y; }); }
    CudaMatrix  operator*=(const CudaMatrix &m) { return transform(m, [=] __device__ (precision x, precision y) { return x * y; }); }

    precision      *data;
    uint           width,
                   height;
};

#endif //TEST_CUDA_DEVICE_LAMBDA_PROCESSING_TYPES_CUH

Types.cu


#include "Types.cuh"

/**
 * Device code to set a matrix value to the given one
 *
 * @tparam precision - The matrix precision
 *
 * @param matrix - The matrix to set the value to
 * @param value - The value to set
 */
template <typename precision>
__global__ void fillProcess(CudaMatrix<precision> matrix, precision value)
{
    int x = blockDim.x * blockIdx.x + threadIdx.x;

    if (x >= matrix.size()) {
        return;
    }

    *(matrix.data + x) = value;
}

/**
 * Device code to apply a function f for each element of matrix A and B with A = f(A, B)
 *
 * @tparam precision - The matrix precision
 *
 * @param A - The matrix A to store the result in
 * @param B - The matrix B to compute the result from
 * @param transform - The function to apply on each A'elements such as A(i) = transform(A(i), B(i))
 */
template<typename precision>
__global__ void transformProcess(               CudaMatrix<precision>                 A,
                                                CudaMatrix<precision>                 B,
                                 const typename CudaMatrix<precision>::binaryFunction &transform
) {
    int x = blockDim.x * blockIdx.x + threadIdx.x;

    if (x >= A.size()) {
        return;
    }

    *(A.data + x) = transform(*(A.data + x), *(B.data + x));
}

/**
 * Display the matrix
 *
 * @tparam precision - The matrix precision
 *
 * @param name - The matrix name
 */
template <typename precision>
void CudaMatrix<precision>::display(const std::string &name) const
{
    precision *hostValues;

    ck(cudaMallocHost(&hostValues, bytesSize()));
    ck(cudaMemcpy(hostValues, data, bytesSize(), cudaMemcpyDeviceToHost));

    std::cout << "Matrix " << name << " " << width << " x " << height << " pixels of " << typeid(precision).name()
              << "\n\n";

    for (int i = 0; i < height; ++i) {
        std::cout << "{ ";

        for (int j = 0; j < width - 1; ++j) {
            std::cout << *(hostValues + i * width + j) << ", ";
        }

        std::cout << *(hostValues + (i + 1) * width - 1) << " }\n";
    }

    std::cout << std::endl;

    ck(cudaFreeHost(hostValues));
}

/**
 * Fill the matrix with the given value
 *
 * @tparam precision - The matrix precision
 *
 * @param value - The value to set all matrix's elements with
 */
template <typename precision>
void CudaMatrix<precision>::fill(precision value)
{
    const uint threadsPerBlock = 128;
    const uint numBlock        = size() / threadsPerBlock + 1;

    fillProcess<<< numBlock, threadsPerBlock >>>(*this, value);
}

/**
 * Set the matrix values in device CUDA memory from a host standard vector
 *
 * @param vector - The values to set
 */
template <typename precision>
void CudaMatrix<precision>::setValuesFromVector(const std::vector<precision> &vector)
{
    ck(cudaMemcpy(data, vector.data(), vector.size() * sizeof(precision), cudaMemcpyHostToDevice));
}

/**
 * Apply the function "fn" to all elements of the current matrix such as *this[i] = fn(*this[i], A[i])
 *
 * @tparam precision - The matrix precision
 *
 * @param A - The input matrix A
 * @param op - The binary function to apply
 *
 * @return This
 */
template<typename precision>
CudaMatrix<precision> CudaMatrix<precision>::transform(const CudaMatrix &A, binaryFunction fn)
{
    const uint threadsPerBlock = 128;
    const uint numBlock        = size() / threadsPerBlock + 1;

    transformProcess<<< numBlock, threadsPerBlock >>>(*this, A, fn);

    return *this;
}

// Forward template declarations

template struct CudaMatrix<double>;
template struct CudaMatrix<float>;
template struct CudaMatrix<int>;

main.cpp

#include "Types.cuh"

int main(int argc, char **argv)
{
    // Allocate memory
    CudaMatrix<double> m1(3, 3);
    CudaMatrix<double> m2(3, 3);

    ck(cudaMalloc(&m1.data, m1.bytesSize()));
    ck(cudaMalloc(&m2.data, m2.bytesSize()));

    // Test here

    m1.setValuesFromVector({1, 1, 1, 2, 2, 2, 3, 3, 3});
    m2.fill(10);

    m1.display("m1");
    m2.display("m2");

    m1 *= m2;

    m1.display("m1 * m2");

    m1 += m2;

    m1.display("m1 + m2");

    // Clean memory

    ck(cudaFree(m1.data));
    ck(cudaFree(m2.data));

    return EXIT_SUCCESS;
}

输出

Matrix m1 3 x 3 pixels of d

{ 1, 1, 1 }
{ 2, 2, 2 }
{ 3, 3, 3 }

Matrix m2 3 x 3 pixels of d

{ 10, 10, 10 }
{ 10, 10, 10 }
{ 10, 10, 10 }

Matrix m1 * m2 3 x 3 pixels of d

{ 1, 1, 1 }
{ 2, 2, 2 }
{ 3, 3, 3 }

Matrix m1 + m2 3 x 3 pixels of d

Segmentation fault (core dumped)

编辑 3

Robert Crovella 的“嵌套”模板策略解决方案运行良好。

【问题讨论】：

您有实际问题要问吗？作为备注，CUDA GPU 操作不会导致主机端分段错误。这将需要诸如越界主机内存访问或冲洗堆栈之类的东西。因此，（a）您要问什么以及（b）您显示的段错误与您发布的 CUDA 代码有什么关系并不明显
您好，我认为segfault是由于内核函数transformProcess执行的操作对设备内存的非法操作造成的。问题不在于段错误，而是应用给定 lambda 函数导致此段错误的原因。这条线*(A.data + x) = transform(*(A.data + x), *(B.data + x)); 失败了，我问为什么。预期结果是A(i) = A(i) + B(i)。这是我提出的问题，段错误只是这种不当行为的结果，这显然是由于修改设备内存的 CUDA 内核代码造成的。
参见项目 1 here。如果你问为什么你的代码不能工作，你应该提供一个minimal reproducible example。到目前为止，您所展示的不是一个。我已投票结束您的问题。
您是否真的尝试精确编译并且只编译您在此处显示的内容？因为那是我试图做的。它没有用。如果你尝试一下，你也会发现它不起作用。在我向您的Types.cuh 添加了大约 4 或 5 项所需的东西之后，我最终得到了setValuesFromVector 的缺失函数实现。是的，我可能也可以解决这个问题。如果您认为我应该做所有这些来尝试运行您的代码，那么对不起，我不同意您的观点。我相信声称您的代码不完整是一个非常可辩护的声明。
在我看来，没有必要在这个问题中提供任何进一步的信息。我已经根据您的代码创建了一个工作示例，并解决了您代码中的问题，并为您提供了解决方案。您问题最后一段中的主题已得到解释和修复。如果您的问题仍未以某种方式得到解答，我的建议是创建一个新问题。

标签： c++ templates lambda cuda

【解决方案1】：

nvstd::function

您将Types.cuh 包含在main.cpp 中，但Types.cuh 包含设备代码和诸如__device__ 之类的构造，它们无法被主机编译器识别。默认情况下，扩展名为.cpp 的文件将主要由主机编译器处理。当然，您有可能将 -x cu 编译器开关传递给 nvcc 以在您的 Makefile 中处理此问题，但我不知道，所以为了未来读者的利益，我指出这一点。在下面的“固定”代码中，我没有对您的 main.cpp 进行任何更改，只是将其重命名为 main.cu 以解决此问题。

您在至少 2 个内核中对 Types.cu 进行了一些不正确的范围检查：

__global__ void fillProcess(
...
    if (x > matrix.size()) {  // off-by-one size check
        return;
    }
...
__global__ void transformProcess( 
...
    if (x >  A.size()) {      // off-by-one size check
        return;
    }

标准的计算机科学偏离 1 错误。

您的代码中至少缺少六项才能编译。

最需要修复的项目是第一个。为此，我选择使用“嵌套”模板策略，以允许对 lambda 进行模板化，这是 (approximately) 我所知道的将 lambda 从主机传输到设备的唯一方法。我想还有其他可能的方法，您可能会考虑使用函子来代替您在这里拥有的二进制函数（因为它们都有相同的输入-输出原型）。

以下解决了这些问题，并给出了合理的输出。

$ cat Types.cuh
#include <cublas_v2.h>
#include <string>
#include <vector>
#include <cassert>
#define ck(x) x

typedef unsigned int uint;


template <typename precision>
struct CudaMatrix {
    //typedef nvstd::function<precision(precision, precision)> binaryFunction;

    CudaMatrix(uint width, uint height, cublasHandle_t cublasHandle = nullptr) :
               width(width), height(height), cublasHandle(cublasHandle) { }

    __device__ __host__ uint size() const { return width * height; }

    uint       bytesSize() const { return size() * sizeof(precision); }
    void       fill(precision value);
    void       display(const std::string &name = "") const;
    void       setValuesFromVector(const std::vector<precision> vals) const;
    template <typename T>
    CudaMatrix transform(const CudaMatrix &A, T fn);

    CudaMatrix& operator=(CudaMatrix m);
    CudaMatrix  operator+=(const CudaMatrix &m) { return transform(m, [=] __device__ (precision x, precision y) { return x + y; }); }
    CudaMatrix  operator-=(const CudaMatrix &m) { return transform(m, [=] __device__ (precision x, precision y) { return x - y; }); }
    CudaMatrix  operator*=(const CudaMatrix &m) { return transform(m, [=] __device__ (precision x, precision y) { return x * y; }); }

    precision      *data;
    uint           width,
                   height;
    cublasHandle_t cublasHandle;
};
$ cat Types.cu
#include "Types.cuh"
#include <iostream>
/**
 * Device code to set a matrix value to the given one
 *
 * @tparam precision - The matrix precision
 *
 * @param matrix - The matrix to set the value to
 * @param value - The value to set
 */
template <typename precision>
__global__ void fillProcess(CudaMatrix<precision> matrix, precision value)
{
    int x = blockDim.x * blockIdx.x + threadIdx.x;

    if (x >= matrix.size()) { 
        return;
    }

    *(matrix.data + x) = value;
}

/**
 * Device code to apply a function f for each element of matrix A and B with A = f(A, B)
 *
 * @tparam precision - The matrix precision
 *
 * @param A - The matrix A to store the result in
 * @param B - The matrix B to compute the result from
 * @param transform - The function to apply on each A'elements such as A(i) = transform(A(i), B(i))
 */
template <typename precision, typename T>
__global__ void transformProcess(               CudaMatrix<precision>                 A,
                                                CudaMatrix<precision>                 B,
                                                T                                     transform
) {
    int x = blockDim.x * blockIdx.x + threadIdx.x;

    if (x >= A.size()) {  
        return;
    }

    // transform(*(A.data + x), *(B.data + x)) seems to return nothing but do not crash ...

    *(A.data + x) = transform(*(A.data + x), *(B.data + x));
}

/**
 * Apply the function "fn" to all elements of the current matrix such as *this[i] = fn(*this[i], A[i])
 *
 * @tparam precision - The matrix precision
 *
 * @param A - The input matrix A
 * @param op - The binary function to apply
 *
 * @return This
 */
template<typename precision> template<typename T>
CudaMatrix<precision> CudaMatrix<precision>::transform(const CudaMatrix &A, T fn)
{
    const uint threadsPerBlock = 128;
    const uint numBlock        = size() / threadsPerBlock + 1;

    assert(width == A.width);
    assert(height == A.height);

    transformProcess<<< numBlock, threadsPerBlock >>>(*this, A, fn);

    return *this;
}

/**
 * Fill the matrix with the given value
 *
 * @tparam precision - The matrix precision
 *
 * @param value - The value to set all matrix's elements with
 */
template <typename precision>
void CudaMatrix<precision>::fill(precision value)
{
    const uint threadsPerBlock = 128;
    const uint numBlock        = size() / threadsPerBlock + 1;

    // @fixme thrust fill method gives error after 1 iteration
    // thrust::device_ptr<precision> thrustPtr = thrust::device_pointer_cast(data);
    // thrust::uninitialized_fill(thrustPtr, thrustPtr + size(), value);

    fillProcess<<< numBlock, threadsPerBlock >>>(*this, value);
}
template <typename precision>
void CudaMatrix<precision>::setValuesFromVector(const std::vector<precision> vals) const
{

  cudaMemcpy((*this).data, vals.data(), vals.size()*sizeof(precision), cudaMemcpyHostToDevice);

}
/**
 * Display the matrix
 *
 * @tparam precision - The matrix precision
 *
 * @param name - The matrix name
 */
template <typename precision>
void CudaMatrix<precision>::display(const std::string &name) const
{
    precision *hostValues;

    ck(cudaMallocHost(&hostValues, bytesSize()));
    ck(cudaMemcpy(hostValues, data, bytesSize(), cudaMemcpyDeviceToHost));

    std::cout << "Matrix " << name << " " << width << " x " << height << " pixels of " << typeid(precision).name()
              << "\n\n";

    for (int i = 0; i < height; ++i) {
        std::cout << "{ ";

        for (int j = 0; j < width - 1; ++j) {
            std::cout << *(hostValues + i * width + j) << ", ";
        }

        std::cout << *(hostValues + (i + 1) * width - 1) << " }\n";
    }

    std::cout << std::endl;

    ck(cudaFreeHost(hostValues));
}
template class CudaMatrix<double>;
$ cat main.cu
#include "Types.cuh"

int main(int argc, char **argv)
{
    // Allocate memory
    cublasHandle_t cublasHandle = nullptr;

    cublasCreate(&cublasHandle);

    CudaMatrix<double> m1(3, 3, cublasHandle);
    CudaMatrix<double> m2(3, 3, cublasHandle);

    ck(cudaMalloc(&m1.data, m1.bytesSize()));
    ck(cudaMalloc(&m2.data, m2.bytesSize()));

    // Test here

    m1.setValuesFromVector({1, 1, 1, 2, 2, 2, 3, 3, 3});
    m2.fill(10);

    m1.display("m1");
    m2.display("m2");

    // Fails here
    m1 *= m2;

    m1.display("m1 * m1");

    // Clean memory

    cublasDestroy(cublasHandle);

    ck(cudaFree(m1.data));
    ck(cudaFree(m2.data));

    return EXIT_SUCCESS;
}
$ nvcc -std=c++11  -o test main.cu Types.cu --expt-extended-lambda -lcublas -lineinfo
$ cuda-memcheck ./test
========= CUDA-MEMCHECK
Matrix m1 3 x 3 pixels of d

{ 1, 1, 1 }
{ 2, 2, 2 }
{ 3, 3, 3 }

Matrix m2 3 x 3 pixels of d

{ 10, 10, 10 }
{ 10, 10, 10 }
{ 10, 10, 10 }

Matrix m1 * m1 3 x 3 pixels of d

{ 10, 10, 10 }
{ 20, 20, 20 }
{ 30, 30, 30 }

========= ERROR SUMMARY: 0 errors
$

【讨论】：

您好，我测试了您的解决方案（在仅使用您提供的代码的新项目中），但出现编译错误：error: ‘CudaMatrix<precision> CudaMatrix<precision>::transform(const CudaMatrix<precision>&, lambdaT) [with lambdaT = CudaMatrix<precision>::operator*=(const CudaMatrix<precision>&) [with precision = double]::<lambda(double, double)>; precision = double]’, declared using local type ‘CudaMatrix<precision>::operator*=(const CudaMatrix<precision>&) [with precision = double]::<lambda(double, double)>’, is used but never defined [-fpermissive]
这是因为你的 main 是 main.cpp 文件，而不是 main.cu 文件。正如我在回答中的第 2 项中指出的那样，您需要将您的 main 文件设为 main.cu 文件，并使用 nvcc 对其进行编译。之所以必须是.cu文件并由nvcc处理，我的回答第2项也有说明。
就是这样，以前从来没有过这种问题。 CMake 让我变得愚蠢。再次感谢您。