【Title】: SLI for multiple GPUs
【Posted】: 2012-06-22 21:08:47
【Question】:

I'm new to CUDA programming and I'm working on a problem that requires multiple GPUs in a single machine. I understand that for better graphics performance, multiple GPUs are combined via SLI. However, for CUDA programming, do I need to combine the GPUs via SLI as well?

【Comments】:

    Tags: cuda gpu sli


    【Solution 1】:

    No, you generally don't want SLI if you plan to use the GPUs for compute rather than pure graphics applications. You will be able to access both GPUs as discrete devices from your CUDA program. Note that you will need to explicitly divide the work between the GPUs.

    I don't have an explanation of why SLI isn't suitable for compute applications, but it's what I've read on the Nvidia forums and heard from others on IRC channels.
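
    To make this concrete, here is a minimal sketch (not from the original answer) of driving two GPUs as independent devices with the CUDA runtime API: each call to cudaSetDevice selects which GPU subsequent calls apply to, and the host explicitly splits the array in half between them. The kernel and the even split are arbitrary choices for illustration, and error checking is omitted for brevity:

    #include <cuda_runtime.h>
    #include <stdio.h>
    #include <stdlib.h>

    __global__ void scale(float *data, int n)
    {
        int i = blockDim.x * blockIdx.x + threadIdx.x;
        if (i < n) data[i] *= 2.0f;
    }

    int main(void)
    {
        const int n = 1 << 20;
        const int half = n / 2;
        float *h = (float *)malloc(n * sizeof(float));
        for (int i = 0; i < n; ++i) h[i] = (float)i;

        int deviceCount = 0;
        cudaGetDeviceCount(&deviceCount);
        if (deviceCount < 2) { printf("this sketch expects 2 GPUs\n"); return 1; }

        float *d[2];
        for (int dev = 0; dev < 2; ++dev)
        {
            cudaSetDevice(dev);              // all following calls target this GPU
            cudaMalloc((void **)&d[dev], half * sizeof(float));
            cudaMemcpy(d[dev], h + dev * half, half * sizeof(float),
                       cudaMemcpyHostToDevice);
            // kernel launches are asynchronous, so both GPUs end up working concurrently
            scale<<<(half + 255) / 256, 256>>>(d[dev], half);
        }

        for (int dev = 0; dev < 2; ++dev)    // gather both halves back to the host
        {
            cudaSetDevice(dev);
            cudaDeviceSynchronize();
            cudaMemcpy(h + dev * half, d[dev], half * sizeof(float),
                       cudaMemcpyDeviceToHost);
            cudaFree(d[dev]);
        }

        printf("h[0]=%g  h[n-1]=%g\n", h[0], h[n - 1]);
        free(h);
        return 0;
    }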

    【Comments】:

    • This is the better answer. See my comment on Roger Dahl's answer for why SLI isn't useful for compute.
    【Solution 2】:

    You can use CUDA on multiple GPUs without SLI, and even across different CUDA architectures, but you have to write extra code to divide the work and synchronize the divided sub-work. Below is a simple program that load-balances the sample vectorAdd kernel across 3 GPUs (one Pascal-architecture GT1030 plus two Kepler-architecture K420s; they work together from the same task pool without problems). Note that the "LoadBalancerX.h" header it includes is a small load-balancing helper, not part of the CUDA toolkit:

    /**
     * Copyright 1993-2015 NVIDIA Corporation.  All rights reserved.
     *
     * Please refer to the NVIDIA end user license agreement (EULA) associated
     * with this source code for terms and conditions that govern your use of
     * this software. Any use, reproduction, disclosure, or distribution of
     * this software and related documentation outside the terms of the EULA
     * is strictly prohibited.
     *
     */
    
    /**
     * Vector addition: C = A + B.
     *
     * This sample is a very basic sample that implements element by element
     * vector addition. It is the same as the sample illustrating Chapter 2
     * of the programming guide with some additions like error checking.
     */
    
    #include <stdio.h>
    
    // For the CUDA runtime routines (prefixed with "cuda_")
    #include <cuda_runtime.h>
    
    #include <helper_cuda.h>
    
    // for load balancing between 3 different GPUs
    #include "LoadBalancerX.h"
    
    /**
     * CUDA Kernel Device code
     *
     * Computes the vector addition of A and B into C. The 3 vectors have the same
     * number of elements numElements.
     */
    __global__ void
    vectorAdd(const float *A, const float *B, float *C, int numElements)
    {
        int i = blockDim.x * blockIdx.x + threadIdx.x;
    
        if (i < numElements)
        {
            C[i] = A[i] + B[i];
        }
    }
    
    
    #include<iostream>
    #include<map>
    int
    main(void)
    {
    
        int numElements = 1500000;
        int numElementsPerGrain = 50000;
        size_t size = numElements * sizeof(float);
    
        float *h_A; cudaMallocHost((void**)&h_A,size);
        float *h_B; cudaMallocHost((void**)&h_B,size);
        float *h_C; cudaMallocHost((void**)&h_C,size);
    
    
        for (int i = 0; i < numElements; ++i)
        {
            h_A[i] = rand()/(float)RAND_MAX;
            h_B[i] = rand()/(float)RAND_MAX;
        }
    
    
    
        /*
         * default tutorial vecAdd logic
    
        cudaMemcpy(d_A, h_A, size, cudaMemcpyHostToDevice);
        cudaMemcpy(d_B, h_B, size, cudaMemcpyHostToDevice);
    
    
        int threadsPerBlock = 256;
        int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock;
    
        vectorAdd<<<blocksPerGrid, threadsPerBlock>>>(d_A, d_B, d_C, numElements);
        cudaGetLastError();
    
    
        cudaMemcpy(h_C, d_C, size, cudaMemcpyDeviceToHost);
        */
    
        /* load-balanced 3-GPU version setup */
        class GrainState
        {
        public:
            int offset;
            int range;
            std::map<int,float *> d_A;
            std::map<int,float *> d_B;
            std::map<int,float *> d_C;
            std::map<int,cudaStream_t> stream;
            ~GrainState(){
                for(auto a:d_A)
                    cudaFree(a.second);
                for(auto b:d_B)
                    cudaFree(b.second);
                for(auto c:d_C)
                    cudaFree(c.second);
                for(auto s:stream)
                    cudaStreamDestroy(s.second);
            }
        };
    
        class DeviceState
        {
        public:
            int gpuId;
            int amIgpu;
        };
    
        LoadBalanceLib::LoadBalancerX<DeviceState,GrainState> lb;
        lb.addDevice(LoadBalanceLib::ComputeDevice<DeviceState>({0,1})); // 1st cuda gpu in computer
        lb.addDevice(LoadBalanceLib::ComputeDevice<DeviceState>({1,1})); // 2nd cuda gpu in computer
        lb.addDevice(LoadBalanceLib::ComputeDevice<DeviceState>({2,1})); // 3rd cuda gpu in computer
        //lb.addDevice(LoadBalanceLib::ComputeDevice<DeviceState>({3,0})); // CPU single core
    
        for(int i=0;i<numElements;i+=numElementsPerGrain)
        {
            lb.addWork(LoadBalanceLib::GrainOfWork<DeviceState,GrainState>(
                    [&,i](DeviceState gpu, GrainState& grain){
                        if(gpu.amIgpu)
                        {
                            cudaSetDevice(gpu.gpuId);
                            cudaStreamCreate(&grain.stream[gpu.gpuId]);
                            cudaMalloc((void **)&grain.d_A[gpu.gpuId], numElementsPerGrain*sizeof(float));
                            cudaMalloc((void **)&grain.d_B[gpu.gpuId], numElementsPerGrain*sizeof(float));
                            cudaMalloc((void **)&grain.d_C[gpu.gpuId], numElementsPerGrain*sizeof(float));
                        }
                    },
                    [&,i](DeviceState gpu, GrainState& grain){
                        if(gpu.amIgpu)
                        {
                            cudaSetDevice(gpu.gpuId);
                            cudaMemcpyAsync(grain.d_A[gpu.gpuId], h_A+i, numElementsPerGrain*sizeof(float), cudaMemcpyHostToDevice,grain.stream[gpu.gpuId]);
                            cudaMemcpyAsync(grain.d_B[gpu.gpuId], h_B+i, numElementsPerGrain*sizeof(float), cudaMemcpyHostToDevice,grain.stream[gpu.gpuId]);
                        }
                    },
                    [&,i](DeviceState gpu, GrainState& grain){
                        if(gpu.amIgpu)
                        {
                            int threadsPerBlock = 1000;
                            int blocksPerGrid = numElementsPerGrain / threadsPerBlock;
                            // pass the grain size, not the remaining-array size: this grain's
                            // buffers only hold numElementsPerGrain elements
                            vectorAdd<<<blocksPerGrid, threadsPerBlock, 0, grain.stream[gpu.gpuId]>>>(grain.d_A[gpu.gpuId], grain.d_B[gpu.gpuId], grain.d_C[gpu.gpuId], numElementsPerGrain);
                        }
                        else
                        {
                            for(int j=0;j<numElementsPerGrain;j++)
                            {
                                const int index = j+i;
                                h_C[index]=h_A[index]+h_B[index];
                            }
                        }
                    },
                    [&,i](DeviceState gpu, GrainState& grain){
                        if(gpu.amIgpu)
                        {
                           cudaMemcpyAsync(h_C+i, grain.d_C[gpu.gpuId], numElementsPerGrain*sizeof(float), cudaMemcpyDeviceToHost,grain.stream[gpu.gpuId]);
                        }
                    },
                    [&,i](DeviceState gpu, GrainState& grain){
                        if(gpu.amIgpu)
                        {
                            cudaStreamSynchronize(grain.stream[gpu.gpuId]);
                        }
                    }
            ));
        }
    
        /* load-balance setup end*/
    
        // run 100 times
        size_t nanoseconds=0;
    
        for(int i=0;i<100;i++)
        {
            nanoseconds += lb.run();
    
        }
    
        // 12 bytes moved per element: two 4-byte float reads (A and B) plus one 4-byte write (C)
        std::cout<<nanoseconds/100.0<<"ns  ("<<((numElements*12.0/(nanoseconds/100.0)))<<"GB/s)"<<std::endl;
    
    
        std::cout<<"??"<<std::endl;
    
        for (int i = 0; i < numElements; i+=numElementsPerGrain)
        {
            std::cout<<h_A[i]<<" + "<<h_B[i]<<" = "<<h_C[i]<<std::endl;
        }
        auto z = lb.getRelativePerformancesOfDevices();
        std::cout<<"work distribution to devices:"<<std::endl;
        for(auto zz:z)
        {
            std::cout<<zz<<"% ";
        }
        std::cout<<std::endl;
        cudaFreeHost(h_A);
        cudaFreeHost(h_B);
        cudaFreeHost(h_C);
    
        return 0;
    }
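
    A note on the design of the example above: each grain of work supplies five callbacks that the load balancer runs in order: per-grain setup (stream and buffer allocation on whichever device the grain lands on), the host-to-device input copy, the compute step (the CUDA kernel on a GPU, or a plain loop if the commented-out CPU device were enabled), the device-to-host output copy, and a stream synchronization. Splitting the array into many small grains is what lets the balancer shift grains toward the faster device over repeated runs.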
    

    【Comments】:
