【发布时间】:2021-07-25 16:39:31
【问题描述】:
我目前正在尝试使 cublasSgelsbatched (https://docs.nvidia.com/cuda/cublas/index.html) 版本正常工作。我首先制作了一个小测试用例,以查看确切需要哪些参数以及如何输入它们。然而,经过多次试验和错误,我仍然无法让它工作,我得到 13 的状态返回,这对应于 CUBLAS_STATUS_EXECUTION_FAILED 这是一个非常模糊的错误,我还尝试了一些其他 cublas 测试用例,它们似乎工作正常。我还在 MATlab 中测试了输入矩阵,它确实有 LS 解决方案。
#include "stdafx.h"
#include "device_launch_parameters.h"
#include <iostream>
#include <stdlib.h>
#include <stdio.h>
#include <cuda_runtime.h>
#include "cublas_v2.h"
#include <algorithm>
#include <cmath>
#include <Windows.h>
int main()
{
//init id, handle and stat
int id = cudaGetDevice(&id);
cublasHandle_t m_cuBLAS;
cublasStatus_t stat;
// create handle
stat = cublasCreate(&m_cuBLAS);
//params
const int C = 3;
const int M = 2;
long lda = C;
long ldb = M;
//init variables
float *Amat, *Ymat, *Xmat;
float *gAmat, *gYmat;
//allocate mem
Amat = (float*) malloc(M * C * sizeof(float));
Ymat = (float*) malloc(C * sizeof(float));
Xmat = (float*) malloc(M * sizeof(float));
srand(100);
for (int i = 0; i < C * M; i++) {
Amat[i] = rand() % 10 + 1;
Amat[i] = (float)Amat[i];
}
for (int i = 0; i < C; i++) {
Ymat[i] = rand() % 10 + 1;
Ymat[i] = (float)Ymat[i];
}
//allocate mem
cudaMalloc( &gAmat, M * C * sizeof(float));
cudaMalloc( &gYmat, C * sizeof(float));
//copy mem
cudaMemcpy(gAmat, Amat, M * C * sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(gYmat, Ymat, C * 1 * sizeof(float), cudaMemcpyHostToDevice);
//init info params
int info = 0;
int devInfoArray[1] = { 0 };
//Synchronize (not necesarry I think, but just to test)
cudaDeviceSynchronize();
//run cublas
cublasStatus_t status = cublasSgelsBatched(m_cuBLAS,
CUBLAS_OP_N,
C,
M,
1,
&gAmat,
lda, //or 1
&gYmat,
lda,
&info,
NULL,
1);
//Output info
std::cout << "status = " << status << std::endl;
std::cout << "info = " << info << std::endl;
std::cout << "devInfoArray = " << devInfoArray[0] << std::endl;
cudaMemcpy(Xmat, gYmat, C * 1 * sizeof(float), cudaMemcpyDeviceToHost);
//Output printed
std::cout << Xmat[0] << ", " << Xmat[1] << ", " << Xmat[2] << std::endl;
//free memory
free(Amat);
free(Ymat);
free(Xmat);
cudaFree(gAmat);
cudaFree(gYmat);
//destory handle
cublasDestroy(m_cuBLAS);
return 0;
}
我在使用 CUDA 9.0 在 MVS 中运行的 Windows 10 上
非常感谢您的帮助
【问题讨论】:
-
docs.nvidia.com/cuda/cublas/… -- “Aarray 是指向以列优先格式存储的矩阵的指针数组”。我在您的代码中没有看到任何指向矩阵的指针数组,是吗?