【发布时间】:2014-03-22 01:17:02
【问题描述】:
我有一个混合的 mex 和 cuda 代码来评估 phi = 1/2*x'*A*x - b'*x,其中 x 和 b 都是 m x 1 向量,A 是 m x m 矩阵。该代码可以编译和执行,它也给了我正确的答案。
但是,当我退出 MATLAB 时,我不断收到错误 Segmentation fault (core dumped)。我在代码中所做的是,我在 MATLAB 中生成 A、b 和 x,使用 mex 函数将它们传递给 cuda。然后我在 GPU 上评估 phi = 1/2*x'*A*x - b'*x(使用 cuda 线性代数库 cublas)并使用 mex 将 phi 传输回 MATLAB。
谁能帮我看看问题出在哪里?提前致谢。
顺便说一句,这是我的编译方式:
nvcc -arch=sm_20 -c test.cu -Xcompiler -fPIC -I/site/local/matlab-r2012a/extern/include/
mex -L/usr/local/cuda/lib64 -lcudart -lcublas test.o
启动 MATLAB 时,需要预加载(LD_PRELOAD)libstdc++ 库:
LD_PRELOAD=/usr/lib/x86_64-linux-gnu/libstdc++.so.6 matlab
在 MATLAB 中,我做了以下代码测试:
N = 500; x = randn(N,1);B = randn(N);A = B'*B; b = randn(N,1);
tic, 1/2*x'*A*x-b'*x, toc
tic, phi = cublas_mex_test(x,A,b),toc
下面是我的代码test.cu:
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <cuda_runtime.h>
#include "cublas_v2.h"
#include "cuda.h"
#include "mex.h"
#include <time.h>
#define IDX2C(i,j,ld) (((j)*(ld))+(i))
// Frees the device buffers and the cuBLAS handle used by PhiEval.
// Safe to call on partially initialised state: cudaFree(NULL) performs no
// operation and a NULL handle is skipped.
static void PhiEvalRelease(cublasHandle_t handle,
                           double *A_d, double *x_d, double *b_d, double *Ax_d)
{
    cudaFree(A_d);
    cudaFree(x_d);
    cudaFree(b_d);
    cudaFree(Ax_d);
    if (handle != NULL)
        cublasDestroy(handle);
}

// Evaluates phi = 1/2*x'*A*x - b'*x on the GPU with cuBLAS.
//
// Parameters:
//   x   - host pointer to m doubles (vector x)
//   A   - host pointer to m*m doubles, column-major with leading dimension m
//   b   - host pointer to m doubles (vector b)
//   phi - receives the result; written only when the function succeeds
//   m   - problem dimension
//
// Returns EXIT_SUCCESS on success and EXIT_FAILURE on any CUDA/cuBLAS error
// (a diagnostic is printed and every acquired resource is released first).
// For m == 0 the result is 0 and the GPU is not touched.
int PhiEval(double *x, double *A, double *b, double &phi, size_t m)
{
    // Empty problem: both terms are empty sums.
    if (m == 0) {
        phi = 0.0;
        return EXIT_SUCCESS;
    }

    // cuBLAS takes dimensions as int; refuse sizes that do not fit, and
    // sizes whose byte count m*m*sizeof(double) would overflow size_t.
    const int n = (int)m;
    if (n <= 0 || (size_t)n != m || m > ((size_t)-1) / sizeof(double) / m) {
        printf ("problem size is too large\n");
        return EXIT_FAILURE;
    }

    cudaError_t cudaStat;
    cublasStatus_t stat;
    cublasHandle_t handle = NULL;
    clock_t start, end;

    // device data (NULL-initialised so cleanup is valid at any point)
    double *A_d = NULL, *x_d = NULL, *b_d = NULL, *Ax_d = NULL;

    start = clock();
    // Stop at the first failing allocation instead of overwriting its status.
    cudaStat = cudaMalloc ((void**)&A_d, m*m*sizeof(double));
    if (cudaStat == cudaSuccess)
        cudaStat = cudaMalloc ((void**)&x_d, m*sizeof(double));
    if (cudaStat == cudaSuccess)
        cudaStat = cudaMalloc ((void**)&b_d, m*sizeof(double));
    if (cudaStat == cudaSuccess)
        cudaStat = cudaMalloc ((void**)&Ax_d, m*sizeof(double));
    if (cudaStat != cudaSuccess) {
        printf ("device memory allocation failed: %s\n", cudaGetErrorString(cudaStat));
        PhiEvalRelease(handle, A_d, x_d, b_d, Ax_d);
        return EXIT_FAILURE;
    }
    end = clock();
    // clock_t is not int; cast explicitly so the format matches the argument.
    printf ("It takes: %ld clicks (%.7f seconds) to allocate the memory.\n",(long)(end-start),(double)(end-start)/CLOCKS_PER_SEC);

    start = clock();
    stat = cublasCreate(&handle);
    if (stat != CUBLAS_STATUS_SUCCESS) {
        printf ("CUBLAS initialization failed\n");
        handle = NULL;  // nothing to destroy
        PhiEvalRelease(handle, A_d, x_d, b_d, Ax_d);
        return EXIT_FAILURE;
    }

    // copy host data to device, checking every transfer
    stat = cublasSetMatrix (n, n, sizeof(double), A, n, A_d, n);
    if (stat == CUBLAS_STATUS_SUCCESS)
        stat = cublasSetVector (n, sizeof(double), x, 1, x_d, 1);
    if (stat == CUBLAS_STATUS_SUCCESS)
        stat = cublasSetVector (n, sizeof(double), b, 1, b_d, 1);
    if (stat != CUBLAS_STATUS_SUCCESS) {
        printf ("data download failed\n");
        PhiEvalRelease(handle, A_d, x_d, b_d, Ax_d);
        return EXIT_FAILURE;
    }
    end = clock();
    printf ("It takes: %ld clicks (%.7f seconds) to copy the data to GPU.\n",(long)(end-start),(double)(end-start)/CLOCKS_PER_SEC);

    start = clock();
    // calculate A*x and store the result in Ax_d
    const double alpha = 1;
    const double beta = 0;
    stat = cublasDgemv(handle, CUBLAS_OP_N, n, n, &alpha, A_d, n, x_d, 1, &beta, Ax_d, 1);
    if (stat != CUBLAS_STATUS_SUCCESS) {
        printf ("matrix-vector product failed\n");
        PhiEvalRelease(handle, A_d, x_d, b_d, Ax_d);
        return EXIT_FAILURE;
    }

    // Inner products x'*(A*x) and b'*x. A freshly created handle uses host
    // pointer mode, so cublasDdot writes each scalar straight into host
    // memory; no device scalars or explicit copies back are needed.
    double xAx = 0.0;
    double bx = 0.0;
    stat = cublasDdot(handle, n, x_d, 1, Ax_d, 1, &xAx);
    if (stat == CUBLAS_STATUS_SUCCESS)
        stat = cublasDdot(handle, n, b_d, 1, x_d, 1, &bx);
    if (stat != CUBLAS_STATUS_SUCCESS) {
        printf ("inner product failed\n");
        PhiEvalRelease(handle, A_d, x_d, b_d, Ax_d);
        return EXIT_FAILURE;
    }
    end = clock();
    printf ("It takes: %ld clicks (%.7f seconds) to call functions in cublas.\n",(long)(end-start),(double)(end-start)/CLOCKS_PER_SEC);

    // calculate phi = 1/2*x'*A*x - b'*x
    phi = .5*xAx - bx;

    start = clock();
    // free the memory
    PhiEvalRelease(handle, A_d, x_d, b_d, Ax_d);
    end = clock();
    printf ("It takes: %ld clicks (%.7f seconds) to free the memory.\n",(long)(end-start),(double)(end-start)/CLOCKS_PER_SEC);
    return EXIT_SUCCESS;
}
/* The gateway function.
 *
 * MATLAB usage: phi = cublas_mex_test(x, A, b)
 *   prhs[0] = x, prhs[1] = A, prhs[2] = b; plhs[0] receives the scalar phi.
 *
 * Raises a MATLAB error when the argument counts are wrong, when an input is
 * not a real, full (non-sparse) double array, when A is not square, when x or
 * b does not have as many elements as A has rows, or when the GPU evaluation
 * reports failure.
 */
void mexFunction( int nlhs, mxArray *plhs[],
                  int nrhs, const mxArray *prhs[])
{
    double phi = 0.0;
    double *A, *b, *x;
    size_t mrows,ncols;
    int i;

    /* check for proper number of arguments */
    if(nrhs!=3)
        mexErrMsgIdAndTxt( "MATLAB:MinTest:invalidNumInputs",
                           "Three inputs required.");
    if(nlhs!=1)
        mexErrMsgIdAndTxt( "MATLAB:MinTest:invalidNumOutputs",
                           "One output required.");

    /* mxGetPr is only meaningful for real, full double arrays; anything else
       would hand PhiEval a pointer to data of the wrong type or length */
    for(i=0; i<3; ++i) {
        if(!mxIsDouble(prhs[i]) || mxIsComplex(prhs[i]) || mxIsSparse(prhs[i]))
            mexErrMsgIdAndTxt( "MATLAB:MinTest:invalidInputType",
                               "Inputs must be real, full double arrays.");
    }

    /* get the dimensions of the matrix input A */
    mrows = mxGetM(prhs[1]);
    ncols = mxGetN(prhs[1]);
    if(mrows!=ncols)
        mexErrMsgIdAndTxt( "MATLAB:MinTest:invalidMatrixInput",
                           "A has to be a square matrix");

    /* PhiEval reads mrows elements from x and from b, so their lengths must
       match A to avoid reading past the end of the MATLAB arrays */
    if(mxGetNumberOfElements(prhs[0])!=mrows ||
       mxGetNumberOfElements(prhs[2])!=mrows)
        mexErrMsgIdAndTxt( "MATLAB:MinTest:invalidVectorInput",
                           "x and b must have as many elements as A has rows.");

    /* create pointers to the input vector x, matrix A and vector b */
    x = mxGetPr(prhs[0]);
    A = mxGetPr(prhs[1]);
    b = mxGetPr(prhs[2]);

    /* call the cpp subroutine; do not return an undefined phi on failure */
    if(PhiEval(x,A,b,phi,mrows)!=EXIT_SUCCESS)
        mexErrMsgIdAndTxt( "MATLAB:MinTest:gpuFailure",
                           "GPU evaluation of phi failed.");

    plhs[0] = mxCreateDoubleScalar(phi);
}
【问题讨论】:
-
这里(here)给出的建议有帮助吗?(即,在调用 cublas_mex_test 例程之前,
先尝试运行一次类似 gpuDevice() 的无害 GPU 操作) -
是的,当我运行 gpuDevice 时,问题就消失了!谢谢!
标签: c++ matlab cuda segmentation-fault mex