使用 cuSPARSE 进行密集到稀疏和稀疏到密集的转换答案

【问题标题】：Dense-to-sparse and sparse-to-dense conversions using cuSPARSE使用 cuSPARSE 进行密集到稀疏和稀疏到密集的转换
【发布时间】：2014-01-31 17:24:50
【问题描述】：

以下程序使用cuSPARSE 测试密集到稀疏的转换。它在前几行输出中产生垃圾。但是，如果我将标有(2) 的行移到标有(1) 的行之后，程序就可以正常工作。谁能告诉我可能是什么原因？

编辑：为了让演示更清晰，我用thrust重写了程序，同样的问题仍然存在。

编辑：按照罗伯特的建议，我把它改回了没有thrust的版本，并添加了api级别的错误检查代码。

#include <iostream>
#include <cusparse_v2.h>

using std::cerr;
using std::cout;
using std::endl;

#define WRAP(x) do {x} while (0)
#define CHKcusparse(x) WRAP(                                        \
  cusparseStatus_t err = (x);                                       \
  if (err != CUSPARSE_STATUS_SUCCESS) {                             \
    cerr << "Cusparse Error #" << int(err) << "\"TODO\" at Line "   \
         << __LINE__ << " of " << __FILE__ << ": " << #x << endl;   \
    exit(1);                                                        \
  }                                                                 \
)
#define CHKcuda(x) WRAP(                                             \
  cudaError_t err = (x);                                             \
  if (err != cudaSuccess) {                                          \
    cerr << "Cuda Error #" << int(err) << ", \""                     \
         << cudaGetErrorString(err) << "\" at Line " << __LINE__     \
         << " of " << __FILE__ << ": " << #x << endl;                \
    exit(1);                                                         \
  }                                                                  \
)
#define ALLOC(X, T, N) do {                            \
  h##X = (T*) malloc(sizeof(T) * (N));                 \
  CHKcuda(cudaMalloc((void**)&d##X, sizeof(T) * (N))); \
} while(0)

int main() {
  srand(100);

  cusparseHandle_t g_cusparse_handle;
  CHKcusparse(cusparseCreate(&g_cusparse_handle));

  const int n = 100, in_degree = 10;
  int nnz = n * in_degree, nn = n * n;

  int *dnnz, *dridx, *dcols;
  int *hnnz, *hridx, *hcols;
  float *dvals, *dmat;
  float *hvals, *hmat;

  // (1) The number of non-zeros in each column.
  ALLOC(nnz, int, n);

  // The dense matrix.
  ALLOC(mat, float, nn);

  // The values in sparse matrix.
  ALLOC(vals, float, nnz);

  // (2) The row indices of the sparse matrix.
  ALLOC(ridx, int, nnz);

  // The column offsets of the sparse matrix.
  ALLOC(cols, int, n+1);

  // Fill and copy dense matrix and number of non-zeros.
  for (int i = 0; i < nn; i++) {hmat[i] = rand();}
  for (int i = 0; i < n; i++) {hnnz[i] = in_degree;}
  CHKcuda(cudaMemcpyAsync(dnnz, hnnz, sizeof(int) * n, cudaMemcpyHostToDevice));
  CHKcuda(cudaMemcpyAsync(dmat, hmat, sizeof(float) * nn, cudaMemcpyHostToDevice));
  CHKcuda(cudaDeviceSynchronize());

  // Perform dense to CSC format
  cusparseMatDescr_t cspMatDesc;
  CHKcusparse(cusparseCreateMatDescr(&cspMatDesc));
  CHKcusparse(cusparseSdense2csc(
      g_cusparse_handle, n, n, cspMatDesc, dmat, n,
      dnnz, dvals, dridx, dcols
  ));

  // Copy row indices back.
  CHKcuda(cudaMemcpyAsync(hridx, dridx, sizeof(int) * nnz, cudaMemcpyDeviceToHost));
  CHKcuda(cudaDeviceSynchronize());
  CHKcusparse(cusparseDestroyMatDescr(cspMatDesc));

  // Display row indices.
  for (int i = 0; i < n; i++) {
    for (int j = 0; j < in_degree; j++) {
      std::cout << hridx[i * in_degree + j] << ", ";
    }
    std::cout << std::endl;
  }

  CHKcuda(cudaFree(dnnz));
  CHKcuda(cudaFree(dvals));
  CHKcuda(cudaFree(dridx));
  CHKcuda(cudaFree(dcols));
  CHKcuda(cudaFree(dmat));
  free(hnnz);
  free(hmat);
  free(hvals);
  free(hridx);
  free(hcols);
  return 0;
}

【问题讨论】：

没有错误检查？在向其他人寻求帮助之前，您应该利用 CUDA 和 cusparse API 的基本 API 级别错误检查。无论第 1 行或第 2 行的位置如何，您都有返回错误的 cusparse 函数。您声明每列 nnz 为 10，但实际上您正在初始化密集矩阵，每列有超过 10 个非零元素，这导致密集到稀疏的转换来炸毁。 Cusparse 提供了一个函数来预先计算每列的 nnz。但在您的情况下，您只需将 in_degree 设置为 100 而不是 10 即可消除错误。
感谢您的提醒。我从一个大型代码库中提取它来提问。我已经测试了这些调用，它们都成功返回。至于密集矩阵，我打算将密集矩阵 n 乘以 n，然后将其转换为大小为 n 乘 n 的稀疏矩阵，每列有 10 个非零元素。如果这是设置，我调用转换函数的方式是否正确？还是我理解错了什么？
@RobertCrovella 对不起，我明白你的意思，你的意思是如果我的密集矩阵在每列中有超过 10 个非零元素，我不能调用每列 nnz 等于 10 的转换？转换不会自动选择 10 个最不为零的元素吗？
不，它不会自动选择 10 个最非零元素，(??)。如果您仔细考虑这一点，我想您会意识到，如果每列的 nnz 与您传递的矩阵不匹配，则结果可能会模棱两可。您传递的每列的 nnz 应与您传递的实际密集矩阵相匹配。如果您愿意，请使用cusparse function available for this purpose。当我运行您发布的第一个代码并测试 API 级别错误时，我在最后一次 cudaMemcpyAsync 调用时收到错误。
你也可以看到你的推力版本的错误。使用cuda-memcheck 运行您的代码或查看proper cuda error checking（其中还讨论了推力错误检查）。如果您没有看到错误，则说明您的错误检查方法已损坏。

标签： cuda sparse-matrix

【解决方案1】：

基本问题是您将内部不一致的数据传递给dense-to-sparse routine。您正在传递一个每列有 100 个非零元素的密集矩阵，但您告诉 cusparse 每列只有 10 个非零元素。

如果您使用cuda-memcheck 运行代码，您会看到 cusparse 出现错误。

对于此代码，您可以通过将 in_degree 变量更改为 100 来解决此问题。

对于一般情况，cusparse 提供a convenient routine 以正确填充每列的非零元素数。

【讨论】：

所以我认为我真正的问题是如何有效地从每列中选择具有最大绝对值的 k 个元素并将其余元素设置为零。你有什么主意吗？我认为这归结为并行的多个 k-selection 问题。
唯一想到的是在每一列上做一个sort-by-key（键=元素绝对值，值=元素索引），然后按值填充前k个原始元素（元素索引）回到一列零。我建议您将其作为一个新的 SO 问题提出，以获得更好的想法。

【解决方案2】：

正如 Robert Crovella 已经强调的那样，使用 cuSPARSE 通过 cusparse<t>nnz() 和 cusparse<t>dense2csr() 例程可以有效地执行从密集到稀疏的传递。反之亦然，可以通过cusparse<t>csr2dense() 例程来完成。下面是一个完整的示例，展示了如何在 CSR 格式中使用 cuSPARSE 从密集传递到稀疏，反之亦然。

cuSparseUtilities.cuh

#ifndef CUSPARSEUTILITIES_CUH
#define CUSPARSEUTILITIES_CUH

#include "cusparse_v2.h"

void setUpDescriptor(cusparseMatDescr_t &, cusparseMatrixType_t, cusparseIndexBase_t);
void dense2SparseD(const double * __restrict__ d_A_dense, int **d_nnzPerVector, double **d_A,
    int **d_A_RowIndices, int **d_A_ColIndices, int &nnz, cusparseMatDescr_t descrA,
    const cusparseHandle_t handle, const int Nrows, const int Ncols);

#endif

cuSparseUtilities.cu

#include "cuSparseUtilities.cuh"
#include "Utilities.cuh"

/*****************************/
/* SETUP DESCRIPTOR FUNCTION */
/*****************************/
void setUpDescriptor(cusparseMatDescr_t &descrA, cusparseMatrixType_t matrixType, cusparseIndexBase_t indexBase) {
    cusparseSafeCall(cusparseCreateMatDescr(&descrA));
    cusparseSafeCall(cusparseSetMatType(descrA, matrixType));
    cusparseSafeCall(cusparseSetMatIndexBase(descrA, indexBase));
}

/********************************************************/
/* DENSE TO SPARSE CONVERSION FOR REAL DOUBLE PRECISION */
/********************************************************/
void dense2SparseD(const double * __restrict__ d_A_dense, int **d_nnzPerVector, double **d_A, 
                   int **d_A_RowIndices, int **d_A_ColIndices, int &nnz, cusparseMatDescr_t descrA, 
                   const cusparseHandle_t handle, const int Nrows, const int Ncols) {

    const int lda = Nrows;                      // --- Leading dimension of dense matrix

    gpuErrchk(cudaMalloc(&d_nnzPerVector[0], Nrows * sizeof(int)));

    // --- Compute the number of nonzero elements per row and the total number of nonzero elements in the dense d_A_dense
    cusparseSafeCall(cusparseDnnz(handle, CUSPARSE_DIRECTION_ROW, Nrows, Ncols, descrA, d_A_dense, lda, d_nnzPerVector[0], &nnz));

    // --- Device side sparse matrix
    gpuErrchk(cudaMalloc(&d_A[0], nnz * sizeof(double)));
    gpuErrchk(cudaMalloc(&d_A_RowIndices[0], (Nrows + 1) * sizeof(int)));
    gpuErrchk(cudaMalloc(&d_A_ColIndices[0], nnz * sizeof(int)));

    cusparseSafeCall(cusparseDdense2csr(handle, Nrows, Ncols, descrA, d_A_dense, lda, d_nnzPerVector[0], d_A[0], d_A_RowIndices[0], d_A_ColIndices[0]));

}

kernel.cu

#include "cuda_runtime.h"
#include "device_launch_parameters.h"

#include <stdio.h>

#include <cusparse_v2.h>

#include "cuSparseUtilities.cuh"
#include "Utilities.cuh"

/********/
/* MAIN */
/********/
int main() {

    cusparseHandle_t    handle;

    // --- Initialize cuSPARSE
    cusparseSafeCall(cusparseCreate(&handle));

    cusparseMatDescr_t  descrA = 0;

    /**************************/
    /* SETTING UP THE PROBLEM */
    /**************************/
    const int Nrows = 5;                        // --- Number of rows
    const int Ncols = 4;                        // --- Number of columns
    const int N = Nrows;

    // --- Host side dense matrix
    double *h_A_dense = (double*)malloc(Nrows * Ncols * sizeof(*h_A_dense));

    // --- Column-major storage
    h_A_dense[ 0] = 0.4612f;  h_A_dense[ 5] = -0.0006f;   h_A_dense[10] = 1.3f;     h_A_dense[15] = 0.0f;
    h_A_dense[ 1] = 0.0f;     h_A_dense[ 6] = 1.443f;     h_A_dense[11] = 0.0f;     h_A_dense[16] = 0.0f;
    h_A_dense[ 2] = -0.0006f; h_A_dense[ 7] = 0.4640f;    h_A_dense[12] = 0.0723f;  h_A_dense[17] = 0.0f;
    h_A_dense[ 3] = 0.3566f;  h_A_dense[ 8] = 0.0723f;    h_A_dense[13] = 0.7543f;  h_A_dense[18] = 0.0f;
    h_A_dense[ 4] = 0.f;      h_A_dense[ 9] = 0.0f;       h_A_dense[14] = 0.0f;     h_A_dense[19] = 0.1f;

    // --- Create device array and copy host array to it
    double *d_A_dense;  gpuErrchk(cudaMalloc(&d_A_dense, Nrows * Ncols * sizeof(double)));
    gpuErrchk(cudaMemcpy(d_A_dense, h_A_dense, Nrows * Ncols * sizeof(*d_A_dense), cudaMemcpyHostToDevice));

    /*******************************/
    /* FROM DENSE TO SPARSE MATRIX */
    /*******************************/
    // --- Descriptor for sparse matrix A
    setUpDescriptor(descrA, CUSPARSE_MATRIX_TYPE_GENERAL, CUSPARSE_INDEX_BASE_ONE);

    int nnz = 0;                                // --- Number of nonzero elements in dense matrix
    int *d_nnzPerVector;                        // --- Device side number of nonzero elements per row

    double *d_A;                                // --- Sparse matrix values - array of size nnz
    int *d_A_RowIndices;                        // --- "Row indices"
    int *d_A_ColIndices;                        // --- "Column indices"

    dense2SparseD(d_A_dense, &d_nnzPerVector, &d_A, &d_A_RowIndices, &d_A_ColIndices, nnz, descrA, handle, Nrows, Ncols);

    /*******************************************************/
    /* CHECKING THE RESULTS FOR DENSE TO SPARSE CONVERSION */
    /*******************************************************/
    // --- Host side number of nonzero elements per row
    int *h_nnzPerVector = (int *)malloc(Nrows * sizeof(int));
    gpuErrchk(cudaMemcpy(h_nnzPerVector, d_nnzPerVector, Nrows * sizeof(int), cudaMemcpyDeviceToHost));

    printf("Number of nonzero elements in dense matrix = %i\n\n", nnz);
    for (int i = 0; i < Nrows; ++i) printf("Number of nonzero elements in row %i = %i \n", i, h_nnzPerVector[i]);
    printf("\n");

    // --- Host side sparse matrix
    double *h_A = (double *)malloc(nnz * sizeof(double));
    int *h_A_RowIndices = (int *)malloc((Nrows + 1) * sizeof(int));
    int *h_A_ColIndices = (int *)malloc(nnz * sizeof(int));
    gpuErrchk(cudaMemcpy(h_A, d_A, nnz * sizeof(double), cudaMemcpyDeviceToHost));
    gpuErrchk(cudaMemcpy(h_A_RowIndices, d_A_RowIndices, (Nrows + 1) * sizeof(int), cudaMemcpyDeviceToHost));
    gpuErrchk(cudaMemcpy(h_A_ColIndices, d_A_ColIndices, nnz * sizeof(int), cudaMemcpyDeviceToHost));

    printf("\nOriginal matrix in CSR format\n\n");
    for (int i = 0; i < nnz; ++i) printf("A[%i] = %f\n", i, h_A[i]); printf("\n");

    printf("\n");
    for (int i = 0; i < (Nrows + 1); ++i) printf("h_A_RowIndices[%i] = %i \n", i, h_A_RowIndices[i]); printf("\n");

    for (int i = 0; i < nnz; ++i) printf("h_A_ColIndices[%i] = %i \n", i, h_A_ColIndices[i]);

    /*******************************/
    /* FROM SPARSE TO DENSE MATRIX */
    /*******************************/
    double *d_A_denseReconstructed; gpuErrchk(cudaMalloc(&d_A_denseReconstructed, Nrows * Ncols * sizeof(double)));
    cusparseSafeCall(cusparseDcsr2dense(handle, Nrows, Ncols, descrA, d_A, d_A_RowIndices, d_A_ColIndices,
                                        d_A_denseReconstructed, Nrows));

    /*******************************************************/
    /* CHECKING THE RESULTS FOR SPARSE TO DENSE CONVERSION */
    /*******************************************************/
    double *h_A_denseReconstructed = (double *)malloc(Nrows * Ncols * sizeof(double));
    gpuErrchk(cudaMemcpy(h_A_denseReconstructed, d_A_denseReconstructed, Nrows * Ncols * sizeof(double), cudaMemcpyDeviceToHost));

    printf("\nReconstructed dense matrix \n");
    for (int m = 0; m < Nrows; m++) {
        for (int n = 0; n < Ncols; n++) 
            printf("%f\t", h_A_denseReconstructed[n * Nrows + m]);
        printf("\n");
    }

    return 0;
}

【讨论】：