解决 CUDA 中的一般稀疏线性系统答案

【问题标题】：Solving general sparse linear systems in CUDA解决 CUDA 中的一般稀疏线性系统
【发布时间】：2015-08-05 18:40:27
【问题描述】：

我目前正在研究CUDA 并尝试使用cuBLAS 和cuSPARSE 库解决Ax = b。我查看了 NVIDIA 提供的示例代码，包括 conjugateGradient 和 conjugateGradientPrecond。但是，共轭梯度法只适用于正定矩阵，是一种迭代法。现在，我有一些通用的稀疏矩阵，我认为我应该利用 cuSPARSE 库。有谁知道如何使用cuSPARSE 和cuBLAS 库解决Ax = b？我找不到对我有用的 API。通常，矩阵预计至少为1000x1000，在某些情况下，它会上升到100000x100000。我应该使用直接方法吗？

【问题讨论】：

您一般会如何解决？如果在主机上解决，你会用什么方法？
我正在考虑使用 Cholesky 或 LU 分解（不是 cuSparse 库中不完整的分解），然后得到类似 LUx = Pb 这样我就可以求解 x。
如果您想使用 Cholesky，Tim Davis 在suitesparse 中的 CHOLMOD 实现现在支持 GPU。是否使用直接方法的选择可能取决于您尚未说明的矩阵大小。
感谢提醒！我已经编辑了问题。
1000x1000 对于稀疏系统来说是 tiny。您可能想研究主机上的直接方法，甚至只使用密集矩阵代数。 double 的 1000x1000 密集矩阵仅占用 8MB。 “大于 1000x1000”实际上根本没有提供任何信息。它没有上限。为您关心的矩阵大小提供一个有界范围。

标签： cuda

【解决方案1】：

在 CUDA 中解决一般稀疏线性系统的一种可能性是使用cuSOLVER。

cuSOLVER 有三个有用的例程：

cusolverSpDcsrlsvlu，适用于平方线性系统（未知数等于方程数），内部使用sparse LU factorization with partial pivoting；
cusolverSpDcsrlsvqr，适用于平方线性系统（未知数等于方程数），内部使用sparse QR factorization；
cusolverSpDcsrlsqvqr，适用于矩形线性系统（未知数的数量与方程的数量不同）并在内部求解least square problem。

对于上述所有例程，支持的矩阵类型为CUSPARSE_MATRIX_TYPE_GENERAL。如果A 是对称的/Hermitian 且仅使用了下部/上部或有意义，则必须扩展其缺少的上部/下部。

注意cusolverSpDcsrlsvlu

注意两个输入参数：tol和reorder。对于前者，如果系统矩阵A 是奇异的，则LU 分解的矩阵U 的一些对角元素为零。如果|U(j,j)|<tol，该算法决定为零。关于后者，cuSOLVER 提供了重新排序以减少零填充，极大地影响了LU factorization 的性能。 reorder 在重新排序 (reorder=1) 或不重新排序 (reorder=0) 之间切换。

还要注意一个输出参数：singularity。如果A 是可逆的，则为-1，否则它提供第一个索引j 使得U(j,j)=0。

注意cusolverSpDcsrlsvqr

注意输入/输出参数与之前相同。特别是，tol 用于判断奇点，reorder 无效，如果A 可逆，singularity 是-1，否则返回第一个索引j，使得R(j,j)=0。

注意cusolverSpDcsrlsqvqr

注意输入参数tol，用于决定A的等级。

还要注意输出参数rankA，表示A、p的数值秩，长度等于A列数的置换向量（请看有关详细信息的文档）和min_norm，这是残差||Ax - b||的规范。

目前，截至CUDA 10.0，以上三个功能仅适用于主机通道，这意味着它们尚未在GPU上运行。它们必须被称为：

cusolverSpDcsrlsvluHost;
cusolverSpDcsrlsvqrHost;
cusolverSpDcsrlsqvqrHost,

并且输入参数应该都驻留在主机上。

请在下面找到一个使用上述所有三种可能性的完整示例：

#include <stdio.h>
#include <stdlib.h>
#include <assert.h>

#include <cusparse.h>
#include <cusolverSp.h>

/*******************/
/* iDivUp FUNCTION */
/*******************/
//extern "C" int iDivUp(int a, int b){ return ((a % b) != 0) ? (a / b + 1) : (a / b); }
__host__ __device__ int iDivUp(int a, int b){ return ((a % b) != 0) ? (a / b + 1) : (a / b); }

/********************/
/* CUDA ERROR CHECK */
/********************/
// --- Credit to http://stackoverflow.com/questions/14038589/what-is-the-canonical-way-to-check-for-errors-using-the-cuda-runtime-api
void gpuAssert(cudaError_t code, const char *file, int line, bool abort = true)
{
    if (code != cudaSuccess)
    {
        fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
        if (abort) { exit(code); }
    }
}

extern "C" void gpuErrchk(cudaError_t ans) { gpuAssert((ans), __FILE__, __LINE__); }

/**************************/
/* CUSOLVE ERROR CHECKING */
/**************************/
static const char *_cusolverGetErrorEnum(cusolverStatus_t error)
{
    switch (error)
    {
    case CUSOLVER_STATUS_SUCCESS:
        return "CUSOLVER_SUCCESS";

    case CUSOLVER_STATUS_NOT_INITIALIZED:
        return "CUSOLVER_STATUS_NOT_INITIALIZED";

    case CUSOLVER_STATUS_ALLOC_FAILED:
        return "CUSOLVER_STATUS_ALLOC_FAILED";

    case CUSOLVER_STATUS_INVALID_VALUE:
        return "CUSOLVER_STATUS_INVALID_VALUE";

    case CUSOLVER_STATUS_ARCH_MISMATCH:
        return "CUSOLVER_STATUS_ARCH_MISMATCH";

    case CUSOLVER_STATUS_EXECUTION_FAILED:
        return "CUSOLVER_STATUS_EXECUTION_FAILED";

    case CUSOLVER_STATUS_INTERNAL_ERROR:
        return "CUSOLVER_STATUS_INTERNAL_ERROR";

    case CUSOLVER_STATUS_MATRIX_TYPE_NOT_SUPPORTED:
        return "CUSOLVER_STATUS_MATRIX_TYPE_NOT_SUPPORTED";

    }

    return "<unknown>";
}

inline void __cusolveSafeCall(cusolverStatus_t err, const char *file, const int line)
{
    if (CUSOLVER_STATUS_SUCCESS != err) {
        fprintf(stderr, "CUSOLVE error in file '%s', line %d, error: %s \nterminating!\n", __FILE__, __LINE__, \
            _cusolverGetErrorEnum(err)); \
            assert(0); \
    }
}

extern "C" void cusolveSafeCall(cusolverStatus_t err) { __cusolveSafeCall(err, __FILE__, __LINE__); }

/***************************/
/* CUSPARSE ERROR CHECKING */
/***************************/
static const char *_cusparseGetErrorEnum(cusparseStatus_t error)
{
    switch (error)
    {

    case CUSPARSE_STATUS_SUCCESS:
        return "CUSPARSE_STATUS_SUCCESS";

    case CUSPARSE_STATUS_NOT_INITIALIZED:
        return "CUSPARSE_STATUS_NOT_INITIALIZED";

    case CUSPARSE_STATUS_ALLOC_FAILED:
        return "CUSPARSE_STATUS_ALLOC_FAILED";

    case CUSPARSE_STATUS_INVALID_VALUE:
        return "CUSPARSE_STATUS_INVALID_VALUE";

    case CUSPARSE_STATUS_ARCH_MISMATCH:
        return "CUSPARSE_STATUS_ARCH_MISMATCH";

    case CUSPARSE_STATUS_MAPPING_ERROR:
        return "CUSPARSE_STATUS_MAPPING_ERROR";

    case CUSPARSE_STATUS_EXECUTION_FAILED:
        return "CUSPARSE_STATUS_EXECUTION_FAILED";

    case CUSPARSE_STATUS_INTERNAL_ERROR:
        return "CUSPARSE_STATUS_INTERNAL_ERROR";

    case CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED:
        return "CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED";

    case CUSPARSE_STATUS_ZERO_PIVOT:
        return "CUSPARSE_STATUS_ZERO_PIVOT";
    }

    return "<unknown>";
}

inline void __cusparseSafeCall(cusparseStatus_t err, const char *file, const int line)
{
    if (CUSPARSE_STATUS_SUCCESS != err) {
        fprintf(stderr, "CUSPARSE error in file '%s', line %Ndims\Nobjs %s\nerror %Ndims: %s\nterminating!\Nobjs", __FILE__, __LINE__, err, \
            _cusparseGetErrorEnum(err)); \
            cudaDeviceReset(); assert(0); \
    }
}

extern "C" void cusparseSafeCall(cusparseStatus_t err) { __cusparseSafeCall(err, __FILE__, __LINE__); }

/********/
/* MAIN */
/********/
int main()
{
    // --- Initialize cuSPARSE
    cusparseHandle_t handle;    cusparseSafeCall(cusparseCreate(&handle));

    const int Nrows = 4;                        // --- Number of rows
    const int Ncols = 4;                        // --- Number of columns
    const int N = Nrows;

    // --- Host side dense matrix
    double *h_A_dense = (double*)malloc(Nrows*Ncols*sizeof(*h_A_dense));

    // --- Column-major ordering
    h_A_dense[0] = 1.0f; h_A_dense[4] = 4.0f; h_A_dense[8] = 0.0f; h_A_dense[12] = 0.0f;
    h_A_dense[1] = 0.0f; h_A_dense[5] = 2.0f; h_A_dense[9] = 3.0f; h_A_dense[13] = 0.0f;
    h_A_dense[2] = 5.0f; h_A_dense[6] = 0.0f; h_A_dense[10] = 0.0f; h_A_dense[14] = 7.0f;
    h_A_dense[3] = 0.0f; h_A_dense[7] = 0.0f; h_A_dense[11] = 9.0f; h_A_dense[15] = 0.0f;

    //create device array and copy host to it
    double *d_A_dense;  gpuErrchk(cudaMalloc(&d_A_dense, Nrows * Ncols * sizeof(*d_A_dense)));
    gpuErrchk(cudaMemcpy(d_A_dense, h_A_dense, Nrows * Ncols * sizeof(*d_A_dense), cudaMemcpyHostToDevice));

    // --- Descriptor for sparse matrix A
    cusparseMatDescr_t descrA;      cusparseSafeCall(cusparseCreateMatDescr(&descrA));
    cusparseSetMatType(descrA, CUSPARSE_MATRIX_TYPE_GENERAL);
    cusparseSetMatIndexBase(descrA, CUSPARSE_INDEX_BASE_ZERO);

    int nnz = 0;                                // --- Number of nonzero elements in dense matrix
    const int lda = Nrows;                      // --- Leading dimension of dense matrix
    // --- Device side number of nonzero elements per row
    int *d_nnzPerVector;    gpuErrchk(cudaMalloc(&d_nnzPerVector, Nrows * sizeof(*d_nnzPerVector)));
    cusparseSafeCall(cusparseDnnz(handle, CUSPARSE_DIRECTION_ROW, Nrows, Ncols, descrA, d_A_dense, lda, d_nnzPerVector, &nnz));
    // --- Host side number of nonzero elements per row
    int *h_nnzPerVector = (int *)malloc(Nrows * sizeof(*h_nnzPerVector));
    gpuErrchk(cudaMemcpy(h_nnzPerVector, d_nnzPerVector, Nrows * sizeof(*h_nnzPerVector), cudaMemcpyDeviceToHost));

    printf("Number of nonzero elements in dense matrix = %i\n\n", nnz);
    for (int i = 0; i < Nrows; ++i) printf("Number of nonzero elements in row %i = %i \n", i, h_nnzPerVector[i]);
    printf("\n");

    // --- Device side dense matrix
    double *d_A;            gpuErrchk(cudaMalloc(&d_A, nnz * sizeof(*d_A)));
    int *d_A_RowIndices;    gpuErrchk(cudaMalloc(&d_A_RowIndices, (Nrows + 1) * sizeof(*d_A_RowIndices)));
    int *d_A_ColIndices;    gpuErrchk(cudaMalloc(&d_A_ColIndices, nnz * sizeof(*d_A_ColIndices)));

    cusparseSafeCall(cusparseDdense2csr(handle, Nrows, Ncols, descrA, d_A_dense, lda, d_nnzPerVector, d_A, d_A_RowIndices, d_A_ColIndices));

    // --- Host side dense matrix
    double *h_A = (double *)malloc(nnz * sizeof(*h_A));
    int *h_A_RowIndices = (int *)malloc((Nrows + 1) * sizeof(*h_A_RowIndices));
    int *h_A_ColIndices = (int *)malloc(nnz * sizeof(*h_A_ColIndices));
    gpuErrchk(cudaMemcpy(h_A, d_A, nnz*sizeof(*h_A), cudaMemcpyDeviceToHost));
    gpuErrchk(cudaMemcpy(h_A_RowIndices, d_A_RowIndices, (Nrows + 1) * sizeof(*h_A_RowIndices), cudaMemcpyDeviceToHost));
    gpuErrchk(cudaMemcpy(h_A_ColIndices, d_A_ColIndices, nnz * sizeof(*h_A_ColIndices), cudaMemcpyDeviceToHost));

    for (int i = 0; i < nnz; ++i) printf("A[%i] = %.0f ", i, h_A[i]); printf("\n");

    for (int i = 0; i < (Nrows + 1); ++i) printf("h_A_RowIndices[%i] = %i \n", i, h_A_RowIndices[i]); printf("\n");

    for (int i = 0; i < nnz; ++i) printf("h_A_ColIndices[%i] = %i \n", i, h_A_ColIndices[i]);

    // --- Allocating and defining dense host and device data vectors
    double *h_y = (double *)malloc(Nrows * sizeof(double));
    h_y[0] = 100.0;  h_y[1] = 200.0; h_y[2] = 400.0; h_y[3] = 500.0;

    double *d_y;        gpuErrchk(cudaMalloc(&d_y, Nrows * sizeof(double)));
    gpuErrchk(cudaMemcpy(d_y, h_y, Nrows * sizeof(double), cudaMemcpyHostToDevice));

    // --- Allocating the host and device side result vector
    double *h_x = (double *)malloc(Ncols * sizeof(double));
    double *d_x;        gpuErrchk(cudaMalloc(&d_x, Ncols * sizeof(double)));

    // --- CUDA solver initialization
    cusolverSpHandle_t solver_handle;
    cusolverSpCreate(&solver_handle);

    // --- Using LU factorization
    int singularity;
    cusolveSafeCall(cusolverSpDcsrlsvluHost(solver_handle, N, nnz, descrA, h_A, h_A_RowIndices, h_A_ColIndices, h_y, 0.000001, 0, h_x, &singularity));
    // --- Using QR factorization
    //cusolveSafeCall(cusolverSpDcsrlsvqrHost(solver_handle, N, nnz, descrA, h_A, h_A_RowIndices, h_A_ColIndices, h_y, 0.000001, 0, h_x, &singularity));

    //int rankA;
    //int *p = (int *)malloc(N * sizeof(int));
    //double min_norm;
    //cusolveSafeCall(cusolverSpDcsrlsqvqrHost(solver_handle, N, N, nnz, descrA, h_A, h_A_RowIndices, h_A_ColIndices, h_y, 0.000001, &rankA, h_x, p, &min_norm));

    printf("Showing the results...\n");
    for (int i = 0; i < N; i++) printf("%f\n", h_x[i]);

}

【讨论】：