【发布时间】:2016-08-14 00:09:30
【问题描述】:
我的问题: 我希望有人能指出我在 CUDA 中实现零拷贝的方式里的错误,或者从更“幕后”的角度解释为什么零拷贝方法不比 memcpy 方法快。顺便说一句,我正在 NVIDIA 的 TK1 处理器上使用 Ubuntu 进行测试。
我的问题与高效使用 NVIDIA TK1 的(物理上)统一内存架构和 CUDA 有关。 NVIDIA 为 GPU/CPU 内存传输抽象提供了 2 种方法。
- 统一内存抽象(使用 cudaHostAlloc 和 cudaHostGetDevicePointer)
- 显式地将数据复制到设备,并将结果从设备复制回主机(使用 cudaMalloc() 和 cudaMemcpy)
我的测试代码的简短描述:我使用方法 1 和 2 测试了相同的 CUDA 内核。鉴于方法 1 既不需要将源数据复制到设备,也不需要从设备复制结果数据,我预期方法 1 更快。但是,结果与我的假设相反(方法 1 慢了 50%)。以下是我的测试代码:
#include <libfreenect/libfreenect.hpp>
#include <iostream>
#include <vector>
#include <cmath>
#include <pthread.h>
#include <cxcore.h>
#include <time.h>
#include <sys/time.h>
#include <memory.h>
///CUDA///
#include <cuda.h>
#include <cuda_runtime.h>
///OpenCV 2.4
#include <highgui.h>
#include <cv.h>
#include <opencv2/gpu/gpu.hpp>
using namespace cv;
using namespace std;
/**
 * Test kernel: for every element of a height x width float image, back-projects
 * the depth value into (nx, ny, nz) and stores nx*M[4] + ny*M[5] + nz*M[6].
 *
 * Launch layout: one block per image row (gridDim.x >= height). The threads of
 * a block stride across the columns of that row, so any blockDim.x >= 1 gives
 * the same result; <<<height, 1>>> reproduces the original one-thread-per-row
 * behaviour. No shared memory is used.
 *
 * @param dst         output, indexed as row*width + column
 * @param src         input depth values, indexed as row*width + column
 * @param M           coefficient array; only elements 4, 5 and 6 are read
 *                    (read once up front, so dst must not overlap M)
 * @param height      number of rows
 * @param width       number of columns
 * @param scaleFactor multiplier applied to the centred pixel coordinates
 * @param minDistance offset added to the depth before scaling x and y
 */
__global__ void cudaCalcXYZ( float *dst, float *src, float *M, int height, int width, float scaleFactor, int minDistance)
{
    // blockIdx.x is unsigned; convert it to int BEFORE subtracting the centre,
    // otherwise rows above the centre wrap around to a huge positive value.
    const int row = (int)blockIdx.x;
    if (row >= height)
    {
        return; // grid has more blocks than the image has rows
    }

    const int heightCenter = height / 2;
    const int widthCenter = width / 2;
    const float jFactor = (row - heightCenter) * scaleFactor;

    // Loop-invariant coefficients of the single output row being solved for.
    const float m4 = M[4];
    const float m5 = M[5];
    const float m6 = M[6];

    const int rowStart = row * width;
    for (int i = (int)threadIdx.x; i < width; i += (int)blockDim.x)
    {
        const float nz = src[rowStart + i];
        const float nzpminD = nz + minDistance;
        const float nx = (i - widthCenter) * nzpminD * scaleFactor;
        const float ny = jFactor * nzpminD;
        //Solve for only Y matrix (height values)
        dst[rowStart + i] = nx*m4 + ny*m5 + nz*m6;
    }
}
//Function fwd declarations
// Timing helpers, defined further down in this file; both return a double.
double getMillis();
double getMicros();
// Benchmark drivers, defined further down in this file. main() calls each one
// as (iterations, columns, rows).
void runCudaTestZeroCopy(int iter, int cols, int rows);
void runCudaTestDeviceCopy(int iter, int cols, int rows);
/**
 * Runs both benchmarks on a 640x480 image for 20 iterations each: first the
 * explicit-copy variant, then the zero-copy (mapped host memory) variant.
 *
 * @return 0 on success, 1 if host-memory mapping could not be enabled.
 */
int main(int argc, char **argv) {
    (void)argc;
    (void)argv;

    //ZERO COPY FLAG (allows runCudaTestZeroCopy to run without fail)
    // Requested before any other CUDA runtime call in this program.
    cudaError_t flagStatus = cudaSetDeviceFlags(cudaDeviceMapHost);
    if (flagStatus != cudaSuccess)
    {
        std::cerr << "cudaSetDeviceFlags(cudaDeviceMapHost) failed: "
                  << cudaGetErrorString(flagStatus) << std::endl;
        return 1;
    }

    //Runs kernel using explicit data copy to 'device' and back from 'device'
    runCudaTestDeviceCopy(20, 640,480);
    //Lets the device work directly on mapped host data (no explicit copies)
    runCudaTestZeroCopy(20,640, 480);

    std::cout << "Stopping test" << std::endl;
    return 0;
}
// Reports a failed CUDA runtime call made by the zero-copy test on stderr.
// Returns true when `status` is cudaSuccess, false otherwise.
static bool zeroCopyCallOk(cudaError_t status, const char *what)
{
    if (status == cudaSuccess)
    {
        return true;
    }
    std::cerr << "CUDA Test::ZEROCOPY: " << what << " failed: "
              << cudaGetErrorString(status) << std::endl;
    return false;
}

/**
 * Times cudaCalcXYZ when the source image and the result live in mapped,
 * page-locked host memory that the kernel accesses through device aliases
 * obtained with cudaHostGetDevicePointer (no per-iteration copies). The small
 * coefficient matrix M is copied once into ordinary device memory.
 *
 * Prints the total and the average time per iteration. If any CUDA call fails,
 * a diagnostic is written to stderr, the timing lines are skipped and all
 * buffers allocated so far are released.
 *
 * @param iter number of timed kernel launches (must be > 0)
 * @param cols image width in elements (must be > 0)
 * @param rows image height in elements (must be > 0)
 */
void runCudaTestZeroCopy(int iter, int cols, int rows)
{
    cout << "CUDA Test::ZEROCOPY" << endl;

    if (iter <= 0 || cols <= 0 || rows <= 0)
    {
        // Zero blocks is an invalid launch and zero iterations divides by zero below.
        std::cerr << "CUDA Test::ZEROCOPY: iter, cols and rows must all be positive" << std::endl;
        return;
    }

    const int src_rows = rows;
    const int src_cols = cols;
    const int m_rows = 4;
    const int m_cols = 4;
    const size_t imageBytes = (size_t)src_rows * src_cols * sizeof(float);
    const size_t matrixBytes = (size_t)m_rows * m_cols * sizeof(float);

    //Host buffers (mapped, page-locked)
    float *psrcMat = 0;
    float *pmMat = 0;
    float *pdstMat = 0;
    //Device-side views
    float *d_psrcMat = 0; // alias of psrcMat, NOT a separate allocation
    float *d_pmMat = 0;   // real device allocation
    float *d_pdstMat = 0; // alias of pdstMat, NOT a separate allocation

    bool ok = zeroCopyCallOk(cudaHostAlloc((void **)&psrcMat, imageBytes, cudaHostAllocMapped), "cudaHostAlloc(src)")
           && zeroCopyCallOk(cudaHostAlloc((void **)&pmMat, matrixBytes, cudaHostAllocMapped), "cudaHostAlloc(M)")
           && zeroCopyCallOk(cudaHostAlloc((void **)&pdstMat, imageBytes, cudaHostAllocMapped), "cudaHostAlloc(dst)");

    if (ok)
    {
        //configure src and m mats
        for(int i = 0; i < src_rows*src_cols; i++)
        {
            psrcMat[i] = (float)i;
        }
        for(int i = 0; i < m_rows*m_cols; i++)
        {
            pmMat[i] = 0.1234;
        }

        //Map device to host pointers, then copy matrix M to device memory
        ok = zeroCopyCallOk(cudaHostGetDevicePointer((void **)&d_psrcMat, (void *)psrcMat, 0), "cudaHostGetDevicePointer(src)")
          && zeroCopyCallOk(cudaHostGetDevicePointer((void **)&d_pdstMat, (void *)pdstMat, 0), "cudaHostGetDevicePointer(dst)")
          && zeroCopyCallOk(cudaMalloc((void **)&d_pmMat, matrixBytes), "cudaMalloc(M)")
          && zeroCopyCallOk(cudaMemcpy(d_pmMat, pmMat, matrixBytes, cudaMemcpyHostToDevice), "cudaMemcpy(M)");
    }

    if (ok)
    {
        //Additional Variables for kernels
        float scaleFactor = 0.0021;
        int minDistance = -10;
        int blocks = src_rows; // one block per image row

        double perfStart = getMillis();
        for(int i = 0; i < iter && ok; i++)
        {
            cudaCalcXYZ<<<blocks,1>>>(d_pdstMat, d_psrcMat, d_pmMat, src_rows, src_cols, scaleFactor, minDistance);
            // Launch errors show up in cudaGetLastError(); execution errors at the sync.
            // The sync is also what makes the host-side timing cover the kernel.
            ok = zeroCopyCallOk(cudaGetLastError(), "cudaCalcXYZ launch")
              && zeroCopyCallOk(cudaDeviceSynchronize(), "cudaDeviceSynchronize");
        }
        double perfStop = getMillis();

        if (ok)
        {
            double perfDelta = perfStop - perfStart;
            cout << "Ran " << iter << " iterations totaling " << perfDelta << "ms" << endl;
            cout << " Average time per iteration: " << (perfDelta/(float)iter) << "ms" << endl;
        }
    }

    // Only d_pmMat came from cudaMalloc. d_psrcMat / d_pdstMat are aliases of the
    // host allocations and are released by the cudaFreeHost calls below.
    if (d_pmMat)
    {
        zeroCopyCallOk(cudaFree(d_pmMat), "cudaFree(M)");
    }
    if (psrcMat)
    {
        zeroCopyCallOk(cudaFreeHost(psrcMat), "cudaFreeHost(src)");
    }
    if (pmMat)
    {
        zeroCopyCallOk(cudaFreeHost(pmMat), "cudaFreeHost(M)");
    }
    if (pdstMat)
    {
        zeroCopyCallOk(cudaFreeHost(pdstMat), "cudaFreeHost(dst)");
    }
}
// Reports a failed CUDA runtime call made by the device-copy test on stderr.
// Returns true when `status` is cudaSuccess, false otherwise.
static bool deviceCopyCallOk(cudaError_t status, const char *what)
{
    if (status == cudaSuccess)
    {
        return true;
    }
    std::cerr << "CUDA Test::DEVICE COPY: " << what << " failed: "
              << cudaGetErrorString(status) << std::endl;
    return false;
}

/**
 * Times cudaCalcXYZ with explicit transfers: every iteration copies the source
 * image and the matrix M to device memory, launches the kernel, and copies the
 * result back to the host. The copies are inside the timed region.
 *
 * Prints the total and the average time per iteration. If any CUDA call fails,
 * a diagnostic is written to stderr, the timing lines are skipped and all
 * buffers allocated so far are released.
 *
 * @param iter number of timed copy/launch/copy rounds (must be > 0)
 * @param cols image width in elements (must be > 0)
 * @param rows image height in elements (must be > 0)
 */
void runCudaTestDeviceCopy(int iter, int cols, int rows)
{
    cout << "CUDA Test::DEVICE COPY" << endl;

    if (iter <= 0 || cols <= 0 || rows <= 0)
    {
        // Zero blocks is an invalid launch and zero iterations divides by zero below.
        std::cerr << "CUDA Test::DEVICE COPY: iter, cols and rows must all be positive" << std::endl;
        return;
    }

    const int src_rows = rows;
    const int src_cols = cols;
    const int m_rows = 4;
    const int m_cols = 4;
    const size_t imageBytes = (size_t)src_rows * src_cols * sizeof(float);
    const size_t matrixBytes = (size_t)m_rows * m_cols * sizeof(float);

    //Host buffers (page-locked)
    float *psrcMat = 0;
    float *pmMat = 0;
    float *pdstMat = 0;
    //Device buffers
    float *d_psrcMat = 0;
    float *d_pmMat = 0;
    float *d_pdstMat = 0;

    bool ok = deviceCopyCallOk(cudaHostAlloc((void **)&psrcMat, imageBytes, cudaHostAllocMapped), "cudaHostAlloc(src)")
           && deviceCopyCallOk(cudaHostAlloc((void **)&pmMat, matrixBytes, cudaHostAllocMapped), "cudaHostAlloc(M)")
           && deviceCopyCallOk(cudaHostAlloc((void **)&pdstMat, imageBytes, cudaHostAllocMapped), "cudaHostAlloc(dst)")
           && deviceCopyCallOk(cudaMalloc((void **)&d_psrcMat, imageBytes), "cudaMalloc(src)")
           && deviceCopyCallOk(cudaMalloc((void **)&d_pdstMat, imageBytes), "cudaMalloc(dst)")
           && deviceCopyCallOk(cudaMalloc((void **)&d_pmMat, matrixBytes), "cudaMalloc(M)");

    if (ok)
    {
        //configure src and m mats
        for(int i = 0; i < src_rows*src_cols; i++)
        {
            psrcMat[i] = (float)i;
        }
        for(int i = 0; i < m_rows*m_cols; i++)
        {
            pmMat[i] = 0.1234;
        }

        //Additional Variables for kernels
        float scaleFactor = 0.0021;
        int minDistance = -10;
        int blocks = src_rows; // one block per image row

        double perfStart = getMillis();
        for(int i = 0; i < iter && ok; i++)
        {
            //Copy from host to device
            ok = deviceCopyCallOk(cudaMemcpy(d_psrcMat, psrcMat, imageBytes, cudaMemcpyHostToDevice), "cudaMemcpy(src H2D)")
              && deviceCopyCallOk(cudaMemcpy(d_pmMat, pmMat, matrixBytes, cudaMemcpyHostToDevice), "cudaMemcpy(M H2D)");
            if (ok)
            {
                //Run Kernel
                cudaCalcXYZ<<<blocks,1>>>(d_pdstMat, d_psrcMat, d_pmMat, src_rows, src_cols, scaleFactor, minDistance);
                //Copy from device to host; this blocking copy also waits for the kernel
                ok = deviceCopyCallOk(cudaGetLastError(), "cudaCalcXYZ launch")
                  && deviceCopyCallOk(cudaMemcpy(pdstMat, d_pdstMat, imageBytes, cudaMemcpyDeviceToHost), "cudaMemcpy(dst D2H)");
            }
        }
        double perfStop = getMillis();

        if (ok)
        {
            double perfDelta = perfStop - perfStart;
            cout << "Ran " << iter << " iterations totaling " << perfDelta << "ms" << endl;
            cout << " Average time per iteration: " << (perfDelta/(float)iter) << "ms" << endl;
        }
    }

    // Release whatever was successfully allocated.
    if (d_psrcMat)
    {
        deviceCopyCallOk(cudaFree(d_psrcMat), "cudaFree(src)");
    }
    if (d_pmMat)
    {
        deviceCopyCallOk(cudaFree(d_pmMat), "cudaFree(M)");
    }
    if (d_pdstMat)
    {
        deviceCopyCallOk(cudaFree(d_pdstMat), "cudaFree(dst)");
    }
    if (psrcMat)
    {
        deviceCopyCallOk(cudaFreeHost(psrcMat), "cudaFreeHost(src)");
    }
    if (pmMat)
    {
        deviceCopyCallOk(cudaFreeHost(pmMat), "cudaFreeHost(M)");
    }
    if (pdstMat)
    {
        deviceCopyCallOk(cudaFreeHost(pdstMat), "cudaFreeHost(dst)");
    }
}
//Timing functions for performance measurements
/**
 * Returns the current CLOCK_MONOTONIC reading in microseconds.
 *
 * Both the seconds and the nanoseconds fields of the clock are included, so
 * the difference between two readings is a valid elapsed time even when the
 * readings fall in different seconds.
 */
double getMicros()
{
    timespec ts;
    clock_gettime(CLOCK_MONOTONIC, &ts);
    const double secondsAsMicros = (double)ts.tv_sec * 1000000.0;
    const double nanosAsMicros = (double)ts.tv_nsec / 1000.0;
    return secondsAsMicros + nanosAsMicros;
}
/**
 * Returns the current CLOCK_MONOTONIC reading in milliseconds, combining the
 * whole-second and nanosecond fields of the clock.
 */
double getMillis()
{
    timespec now;
    clock_gettime(CLOCK_MONOTONIC, &now);

    // Convert each field to milliseconds separately, then add them.
    const double wholeSeconds = (double)now.tv_sec;
    const double nanoseconds = (double)now.tv_nsec;
    const double secondsAsMillis = wholeSeconds * 1000.0;
    const double nanosAsMillis = nanoseconds / 1000000.0;
    return secondsAsMillis + nanosAsMillis;
}
我已经看过Cuda zero-copy performance的帖子了,但我觉得这没有关系,原因如下:GPU和CPU有一个物理统一的内存架构。
谢谢
【问题讨论】:
-
Stack Overflow 不是一个讨论论坛,这个问题不太适合这个地方。如果您有一个具体的独立问题,并带有一个简短、完整的代码示例来说明您的问题,请将其编辑到您的问题中。将谷歌驱动器链接放入代码会适得其反。如果链接断开,那么这个问题就没用了。问题和答案作为永久记录存在,可以帮助您和未来的访问者遇到相同的问题或问题。我投票结束了这个问题。
-
感谢您的建议,我将删除“讨论”请求,并更明确地提出核心问题,因为我的问题具体是:基于我提供的 2 种方法,“如何在物理统一的内存架构下有效地使用零拷贝?”