正如评论中已经建议的那样,此内核可能不是您认为的性能限制器。至少,您没有提供支持该想法的数据。但是,仍然可以提出一些建议,以改善此内核的运行时间。
我将假设 GLfloat 等价于 float。在这种情况下,特别是因为这个内核的主要输出 (device_particleCoordinates) 是 float 类型的数据,因此值得怀疑以 double 精度完成的任何中间计算是否能带来多少好处。您可以尝试将所有运算转换为 float 算术。
GPU 代码中的除法运算可能很昂贵。对于浮点运算,除以常数可以替换为乘以该常数的倒数。
您的加载和存储操作访问的是交替(间隔)的内存位置。可以通过向量加载/存储来提高效率。如评论中所述,这对基础数据的对齐方式做了假设。
这是一个修改内核的示例(未经测试),展示了这些想法:
// Revised kernel: all intermediate math in float, multiplies by reciprocal
// constants instead of dividing, and uses vectorized (float2) loads/stores
// for the interleaved x/y arrays.
// NOTE(review): the float2 casts assume each GLfloat array is 8-byte aligned
// and holds interleaved x,y pairs -- confirm at the allocation site.
// (gravity is accepted but not used by the visible computation.)
__global__
void updateParticle1(const int num_particles, const double time, const double gravity,
GLfloat* device_particleCoordinates, GLfloat* device_particleStartCoordinates,
GLfloat* device_particleAcceleration, GLint* device_particleCreatedTime)
{
// One thread per particle; grid tail is cut off by the bounds check below.
int threadId = threadIdx.x + blockIdx.x * blockDim.x;
if (threadId < num_particles)
{
// (int) truncation mimics the original integer division; * 0.001f
// replaces the division by 1000.
float particleLifetime = (int)((((float)time) - (float)device_particleCreatedTime[threadId]) * (0.001f));
// Single 8-byte load fetches both x and y acceleration components.
float2 dpA = *(reinterpret_cast<float2 *>(device_particleAcceleration)+threadId);
// 0.0001f folds the original 0.5 * (...) / 5000.0 into one constant.
float spl2 = 0.0001f * particleLifetime*particleLifetime;
float distanceX = dpA.x * spl2;
float distanceY = dpA.y * spl2;
// Vectorized load of the start position pair.
float2 dpC = *(reinterpret_cast<float2 *>(device_particleStartCoordinates)+threadId);
dpC.x += distanceX;
dpC.y += distanceY;
// Single vectorized store writes both output coordinates.
*(reinterpret_cast<float2 *>(device_particleCoordinates)+threadId) = dpC;
}
}
根据我的测试,这些更改将使大约 100 万个粒子的内核执行时间从大约 69us (updateParticle) 减少到大约 54us (updateParticle1):
$ cat t388.cu
#include <cstdio>
#include <GL/gl.h>
const int ppt = 4;  // particles processed per thread in updateParticle3
// Original baseline kernel, kept unmodified for timing comparison against the
// optimized versions below. Known inefficiencies (addressed by
// updateParticle1/2/3):
//  - intermediate math in double although the output is GLfloat
//  - two divisions per thread (by 1000 and by 5000.0)
//  - x/y loaded and stored at stride-2 scalar locations
// (gravity is accepted but not used by the visible computation.)
__global__
void updateParticle(const int num_particles, const double time, const double gravity,
GLfloat* device_particleCoordinates, GLfloat* device_particleStartCoordinates,
GLfloat* device_particleAcceleration, GLint* device_particleCreatedTime)
{
int threadId = threadIdx.x + blockIdx.x * blockDim.x;
if (threadId < num_particles)
{
// Integer division truncates the lifetime toward zero.
int particleLifetime = (time - device_particleCreatedTime[threadId]) / 1000;
double distanceX = 0.5 * device_particleAcceleration[threadId * 2 + 0] * (particleLifetime * particleLifetime) / 5000.0;
double distanceY = 0.5 * device_particleAcceleration[threadId * 2 + 1] * (particleLifetime * particleLifetime) / 5000.0;
device_particleCoordinates[threadId * 2 + 0] = device_particleStartCoordinates[threadId * 2 + 0] + distanceX;
device_particleCoordinates[threadId * 2 + 1] = device_particleStartCoordinates[threadId * 2 + 1] + distanceY;
}
}
// Float-only rework of updateParticle: the divisions by 1000 and by 5000.0
// are folded into reciprocal constants (0.001f and 0.0001f), and the
// interleaved x/y arrays are accessed with vectorized float2 loads/stores.
// Assumes the GLfloat buffers are 8-byte aligned interleaved x,y pairs.
__global__
void updateParticle1(const int num_particles, const double time, const double gravity,
GLfloat* device_particleCoordinates, GLfloat* device_particleStartCoordinates,
GLfloat* device_particleAcceleration, GLint* device_particleCreatedTime)
{
    int gid = threadIdx.x + blockIdx.x * blockDim.x;
    if (gid >= num_particles) return;  // guard the grid tail
    // (int) truncation reproduces the original integer division by 1000.
    float lifetime = (int)((((float)time) - (float)device_particleCreatedTime[gid]) * (0.001f));
    float scale = 0.0001f * lifetime*lifetime;  // == 0.5 / 5000.0 factor
    // One 8-byte load per array fetches both the x and y components.
    float2 acc = reinterpret_cast<float2 *>(device_particleAcceleration)[gid];
    float2 pos = reinterpret_cast<float2 *>(device_particleStartCoordinates)[gid];
    pos.x += acc.x * scale;
    pos.y += acc.y * scale;
    // Single vectorized store writes both output coordinates.
    reinterpret_cast<float2 *>(device_particleCoordinates)[gid] = pos;
}
// Identical computation to updateParticle1, but the three input pointers are
// qualified const __restrict__, permitting the compiler to route those reads
// through the read-only data cache. Assumes 8-byte-aligned interleaved x,y
// pairs in every GLfloat buffer.
__global__
void updateParticle2(const int num_particles, const double time, const double gravity,
GLfloat * __restrict__ device_particleCoordinates, const GLfloat * __restrict__ device_particleStartCoordinates,
const GLfloat * __restrict__ device_particleAcceleration, const GLint * __restrict__ device_particleCreatedTime)
{
    int gid = threadIdx.x + blockIdx.x * blockDim.x;
    if (gid >= num_particles) return;  // guard the grid tail
    // (int) truncation reproduces the original integer division by 1000.
    float lifetime = (int)((((float)time) - (float)device_particleCreatedTime[gid]) * (0.001f));
    float scale = 0.0001f * lifetime*lifetime;  // == 0.5 / 5000.0 factor
    const float2 acc = reinterpret_cast<const float2 *>(device_particleAcceleration)[gid];
    float2 pos = reinterpret_cast<const float2 *>(device_particleStartCoordinates)[gid];
    pos.x += acc.x * scale;
    pos.y += acc.y * scale;
    reinterpret_cast<float2 *>(device_particleCoordinates)[gid] = pos;
}
// Multi-particle-per-thread variant: each thread processes up to ppt
// particles, striding by the total grid size each iteration.
//
// Fix: the original loop carried no bounds check, so any launch whose grid
// did not exactly tile num_particles (ppt * gridDim.x * blockDim.x !=
// num_particles) read and wrote past the end of every buffer. The guard
// below makes the kernel safe for any launch configuration while leaving
// the exact-fit launch used in main() byte-for-byte equivalent in effect.
__global__
void updateParticle3(const int num_particles, const double time, const double gravity,
GLfloat * __restrict__ device_particleCoordinates, const GLfloat * __restrict__ device_particleStartCoordinates,
const GLfloat * __restrict__ device_particleAcceleration, const GLint * __restrict__ device_particleCreatedTime)
{
    int threadId = threadIdx.x + blockIdx.x * blockDim.x;
    const int stride = gridDim.x * blockDim.x;  // loop-invariant, hoisted
    for (int i = 0; i < ppt; i++)
    {
        if (threadId < num_particles)
        {
            // (int) truncation mimics the original integer division by 1000.
            float particleLifetime = (int)((((float)time) - (float)device_particleCreatedTime[threadId]) * (0.001f));
            // Vectorized loads of the interleaved x,y pairs (assumes 8-byte alignment).
            float2 dpA = *(reinterpret_cast<const float2 *>(device_particleAcceleration)+threadId);
            float spl2 = 0.0001f * particleLifetime*particleLifetime;  // == 0.5 / 5000.0
            float2 dpC = *(reinterpret_cast<const float2 *>(device_particleStartCoordinates)+threadId);
            dpC.x += dpA.x * spl2;
            dpC.y += dpA.y * spl2;
            *(reinterpret_cast<float2 *>(device_particleCoordinates)+threadId) = dpC;
        }
        threadId += stride;
    }
}
// Benchmark driver: 1M particles, each kernel launched twice so the second
// (warmed-up) run gives representative nvprof timings.
//
// Fixes over the original: kernel-launch and asynchronous execution errors
// are now checked and reported (previously any failure passed silently),
// the device allocations are freed, and the exit code reflects success.
int main(){
    const int num_p = 1048576;  // power of two; exactly tiled by ppt*256 below
    float *dpC = 0, *dpSC = 0, *dpA = 0;
    int *dpCT = 0;
    // Coordinate/acceleration buffers hold interleaved x,y pairs: 2 floats per particle.
    cudaMalloc(&dpC, num_p*2*sizeof(dpC[0]));
    cudaMalloc(&dpSC, num_p*2*sizeof(dpSC[0]));
    cudaMalloc(&dpA, num_p*2*sizeof(dpA[0]));
    cudaMalloc(&dpCT, num_p*sizeof(dpCT[0]));
    updateParticle<<<(num_p+255)/256, 256>>>(num_p, 1.0, 1.0, dpC, dpSC, dpA, dpCT);
    updateParticle1<<<(num_p+255)/256, 256>>>(num_p, 1.0, 1.0, dpC, dpSC, dpA, dpCT);
    updateParticle2<<<(num_p+255)/256, 256>>>(num_p, 1.0, 1.0, dpC, dpSC, dpA, dpCT);
    updateParticle3<<<num_p/(ppt*256), 256>>>(num_p, 1.0, 1.0, dpC, dpSC, dpA, dpCT);
    updateParticle<<<(num_p+255)/256, 256>>>(num_p, 1.0, 1.0, dpC, dpSC, dpA, dpCT);
    updateParticle1<<<(num_p+255)/256, 256>>>(num_p, 1.0, 1.0, dpC, dpSC, dpA, dpCT);
    updateParticle2<<<(num_p+255)/256, 256>>>(num_p, 1.0, 1.0, dpC, dpSC, dpA, dpCT);
    updateParticle3<<<num_p/(ppt*256), 256>>>(num_p, 1.0, 1.0, dpC, dpSC, dpA, dpCT);
    // cudaGetLastError catches launch-configuration errors; the synchronize
    // surfaces asynchronous faults from the kernels themselves.
    cudaError_t err = cudaGetLastError();
    if (err == cudaSuccess) err = cudaDeviceSynchronize();
    if (err != cudaSuccess)
        fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(err));
    cudaFree(dpCT);
    cudaFree(dpA);
    cudaFree(dpSC);
    cudaFree(dpC);
    return (err == cudaSuccess) ? 0 : 1;
}
$ nvcc -arch=sm_60 -o t388 t388.cu
$ nvprof ./t388
==32419== NVPROF is profiling process 32419, command: ./t388
==32419== Profiling application: ./t388
==32419== Profiling result:
Type Time(%) Time Calls Avg Min Max Name
GPU activities: 30.11% 141.41us 2 70.703us 68.991us 72.416us updateParticle(int, double, double, float*, float*, float*, int*)
23.53% 110.50us 2 55.247us 54.816us 55.679us updateParticle2(int, double, double, float*, float const *, float const *, int const *)
23.31% 109.47us 2 54.735us 54.335us 55.136us updateParticle3(int, double, double, float*, float const *, float const *, int const *)
23.06% 108.29us 2 54.144us 53.952us 54.336us updateParticle1(int, double, double, float*, float*, float*, int*)
API calls: 97.56% 291.86ms 4 72.966ms 273.40us 291.01ms cudaMalloc
1.53% 4.5808ms 384 11.929us 313ns 520.98us cuDeviceGetAttribute
0.49% 1.4735ms 4 368.37us 226.07us 580.91us cuDeviceTotalMem
0.22% 670.21us 4 167.55us 89.800us 369.11us cuDeviceGetName
0.13% 392.94us 1 392.94us 392.94us 392.94us cudaDeviceSynchronize
0.05% 150.44us 8 18.804us 10.502us 67.034us cudaLaunchKernel
0.01% 21.862us 4 5.4650us 4.0570us 7.0660us cuDeviceGetPCIBusId
0.00% 10.010us 8 1.2510us 512ns 2.9310us cuDeviceGet
0.00% 6.6950us 3 2.2310us 435ns 3.8940us cuDeviceGetCount
0.00% 2.3460us 4 586ns 486ns 727ns cuDeviceGetUuid
$
用const ... __restrict__ (updateParticle2) 装饰指针似乎不会为这个测试用例提供任何额外的好处。每个线程计算 4 个粒子 (updateParticle3) 而不是 1 个似乎也对处理时间没有显着影响。
特斯拉 P100、CUDA 10.0、CentOS 7.5