【Title】: SLI for multiple GPUs
【Posted】: 2012-06-22 21:08:47
【Question】:

I'm new to CUDA programming and I'm working on a problem that requires multiple GPUs in a single machine. I understand that for better graphics performance, multiple GPUs are combined via SLI. However, for CUDA programming, do I need to combine the GPUs via SLI as well?

【Comments】:

    Tags: cuda gpu sli


    【Solution 1】:

    No, you generally don't want SLI if you plan to use the GPUs for compute rather than pure graphics applications. You will be able to access both GPUs as discrete devices from your CUDA program. Note that you will need to explicitly divide the work between the GPUs.

    I don't have an explanation of why SLI isn't suitable for compute applications, but it's what I've read on the Nvidia forums and heard from others on IRC channels.
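
    To make this concrete, here is a minimal sketch (not from the original answer) of driving two GPUs as independent devices with the CUDA runtime API: each call to cudaSetDevice selects which GPU subsequent calls apply to, and the host explicitly splits the array in half between them. The kernel and the even split are arbitrary choices for illustration, and error checking is omitted for brevity:

    #include <cuda_runtime.h>
    #include <stdio.h>
    #include <stdlib.h>

    __global__ void scale(float *data, int n)
    {
        int i = blockDim.x * blockIdx.x + threadIdx.x;
        if (i < n) data[i] *= 2.0f;
    }

    int main(void)
    {
        const int n = 1 << 20;
        const int half = n / 2;
        float *h = (float *)malloc(n * sizeof(float));
        for (int i = 0; i < n; ++i) h[i] = (float)i;

        int deviceCount = 0;
        cudaGetDeviceCount(&deviceCount);
        if (deviceCount < 2) { printf("this sketch expects 2 GPUs\n"); return 1; }

        float *d[2];
        for (int dev = 0; dev < 2; ++dev)
        {
            cudaSetDevice(dev);              // all following calls target this GPU
            cudaMalloc((void **)&d[dev], half * sizeof(float));
            cudaMemcpy(d[dev], h + dev * half, half * sizeof(float),
                       cudaMemcpyHostToDevice);
            // kernel launches are asynchronous, so both GPUs end up working concurrently
            scale<<<(half + 255) / 256, 256>>>(d[dev], half);
        }

        for (int dev = 0; dev < 2; ++dev)    // gather both halves back to the host
        {
            cudaSetDevice(dev);
            cudaDeviceSynchronize();
            cudaMemcpy(h + dev * half, d[dev], half * sizeof(float),
                       cudaMemcpyDeviceToHost);
            cudaFree(d[dev]);
        }

        printf("h[0]=%g  h[n-1]=%g\n", h[0], h[n - 1]);
        free(h);
        return 0;
    }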

    【Comments】:

    • This is the better answer. See my comment on Roger Dahl's answer for why SLI isn't useful for compute.
    【Solution 2】:

    You can use CUDA on multiple GPUs without SLI, and even across different CUDA architectures, but you have to write extra code to divide the work and synchronize the divided sub-work. Below is a simple program that load-balances the sample vectorAdd kernel across 3 GPUs (one Pascal-architecture GT1030 plus two Kepler-architecture K420s; they work together from the same task pool without problems). Note that the "LoadBalancerX.h" header it includes is a small load-balancing helper, not part of the CUDA toolkit:

    /**
     * Copyright 1993-2015 NVIDIA Corporation.  All rights reserved.
     *
     * Please refer to the NVIDIA end user license agreement (EULA) associated
     * with this source code for terms and conditions that govern your use of
     * this software. Any use, reproduction, disclosure, or distribution of
     * this software and related documentation outside the terms of the EULA
     * is strictly prohibited.
     *
     */
    
    /**
     * Vector addition: C = A + B.
     *
     * This sample is a very basic sample that implements element by element
     * vector addition. It is the same as the sample illustrating Chapter 2
     * of the programming guide with some additions like error checking.
     */
    
    #include <stdio.h>
    
    // For the CUDA runtime routines (prefixed with "cuda_")
    #include <cuda_runtime.h>
    
    #include <helper_cuda.h>
    
    // for load balancing between 3 different GPUs
    #include "LoadBalancerX.h"
    
    /**
     * CUDA Kernel Device code
     *
     * Computes the vector addition of A and B into C. The 3 vectors have the same
     * number of elements numElements.
     */
    __global__ void
    vectorAdd(const float *A, const float *B, float *C, int numElements)
    {
        int i = blockDim.x * blockIdx.x + threadIdx.x;
    
        if (i < numElements)
        {
            C[i] = A[i] + B[i];
        }
    }
    
    
    #include<iostream>
    #include<map>
    int
    main(void)
    {
    
        int numElements = 1500000;
        int numElementsPerGrain = 50000;
        size_t size = numElements * sizeof(float);
    
        float *h_A; cudaMallocHost((void**)&h_A,size);
        float *h_B; cudaMallocHost((void**)&h_B,size);
        float *h_C; cudaMallocHost((void**)&h_C,size);
    
    
        for (int i = 0; i < numElements; ++i)
        {
            h_A[i] = rand()/(float)RAND_MAX;
            h_B[i] = rand()/(float)RAND_MAX;
        }
    
    
    
        /*
         * default tutorial vecAdd logic
    
        cudaMemcpy(d_A, h_A, size, cudaMemcpyHostToDevice);
        cudaMemcpy(d_B, h_B, size, cudaMemcpyHostToDevice);
    
    
        int threadsPerBlock = 256;
        int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock;
    
        vectorAdd<<<blocksPerGrid, threadsPerBlock>>>(d_A, d_B, d_C, numElements);
        cudaGetLastError();
    
    
        cudaMemcpy(h_C, d_C, size, cudaMemcpyDeviceToHost);
        */
    
        /* load-balanced 3-GPU version setup */
        class GrainState
        {
        public:
            int offset;
            int range;
            std::map<int,float *> d_A;
            std::map<int,float *> d_B;
            std::map<int,float *> d_C;
            std::map<int,cudaStream_t> stream;
            ~GrainState(){
                for(auto a:d_A)
                    cudaFree(a.second);
                for(auto b:d_B)
                    cudaFree(b.second);
                for(auto c:d_C)
                    cudaFree(c.second);
                for(auto s:stream)
                    cudaStreamDestroy(s.second);
            }
        };
    
        class DeviceState
        {
        public:
            int gpuId;
            int amIgpu;
        };
    
        LoadBalanceLib::LoadBalancerX<DeviceState,GrainState> lb;
        lb.addDevice(LoadBalanceLib::ComputeDevice<DeviceState>({0,1})); // 1st cuda gpu in computer
        lb.addDevice(LoadBalanceLib::ComputeDevice<DeviceState>({1,1})); // 2nd cuda gpu in computer
        lb.addDevice(LoadBalanceLib::ComputeDevice<DeviceState>({2,1})); // 3rd cuda gpu in computer
        //lb.addDevice(LoadBalanceLib::ComputeDevice<DeviceState>({3,0})); // CPU single core
    
        for(int i=0;i<numElements;i+=numElementsPerGrain)
        {
            lb.addWork(LoadBalanceLib::GrainOfWork<DeviceState,GrainState>(
                    [&,i](DeviceState gpu, GrainState& grain){
                        if(gpu.amIgpu)
                        {
                            cudaSetDevice(gpu.gpuId);
                            cudaStreamCreate(&grain.stream[gpu.gpuId]);
                            cudaMalloc((void **)&grain.d_A[gpu.gpuId], numElementsPerGrain*sizeof(float));
                            cudaMalloc((void **)&grain.d_B[gpu.gpuId], numElementsPerGrain*sizeof(float));
                            cudaMalloc((void **)&grain.d_C[gpu.gpuId], numElementsPerGrain*sizeof(float));
                        }
                    },
                    [&,i](DeviceState gpu, GrainState& grain){
                        if(gpu.amIgpu)
                        {
                            cudaSetDevice(gpu.gpuId);
                            cudaMemcpyAsync(grain.d_A[gpu.gpuId], h_A+i, numElementsPerGrain*sizeof(float), cudaMemcpyHostToDevice,grain.stream[gpu.gpuId]);
                            cudaMemcpyAsync(grain.d_B[gpu.gpuId], h_B+i, numElementsPerGrain*sizeof(float), cudaMemcpyHostToDevice,grain.stream[gpu.gpuId]);
                        }
                    },
                    [&,i](DeviceState gpu, GrainState& grain){
                        if(gpu.amIgpu)
                        {
                            int threadsPerBlock = 1000;
                            int blocksPerGrid = numElementsPerGrain / threadsPerBlock;
                            // pass the grain size, not the remaining-array size: this grain's
                            // buffers only hold numElementsPerGrain elements
                            vectorAdd<<<blocksPerGrid, threadsPerBlock, 0, grain.stream[gpu.gpuId]>>>(grain.d_A[gpu.gpuId], grain.d_B[gpu.gpuId], grain.d_C[gpu.gpuId], numElementsPerGrain);
                        }
                        else
                        {
                            for(int j=0;j<numElementsPerGrain;j++)
                            {
                                const int index = j+i;
                                h_C[index]=h_A[index]+h_B[index];
                            }
                        }
                    },
                    [&,i](DeviceState gpu, GrainState& grain){
                        if(gpu.amIgpu)
                        {
                           cudaMemcpyAsync(h_C+i, grain.d_C[gpu.gpuId], numElementsPerGrain*sizeof(float), cudaMemcpyDeviceToHost,grain.stream[gpu.gpuId]);
                        }
                    },
                    [&,i](DeviceState gpu, GrainState& grain){
                        if(gpu.amIgpu)
                        {
                            cudaStreamSynchronize(grain.stream[gpu.gpuId]);
                        }
                    }
            ));
        }
    
        /* load-balance setup end*/
    
        // run 100 times
        size_t nanoseconds=0;
    
        for(int i=0;i<100;i++)
        {
            nanoseconds += lb.run();
    
        }
    
        // 12 bytes moved per element: two 4-byte float reads (A and B) plus one 4-byte write (C)
        std::cout<<nanoseconds/100.0<<"ns  ("<<((numElements*12.0/(nanoseconds/100.0)))<<"GB/s)"<<std::endl;
    
    
        std::cout<<"??"<<std::endl;
    
        for (int i = 0; i < numElements; i+=numElementsPerGrain)
        {
            std::cout<<h_A[i]<<" + "<<h_B[i]<<" = "<<h_C[i]<<std::endl;
        }
        auto z = lb.getRelativePerformancesOfDevices();
        std::cout<<"work distribution to devices:"<<std::endl;
        for(auto zz:z)
        {
            std::cout<<zz<<"% ";
        }
        std::cout<<std::endl;
        cudaFreeHost(h_A);
        cudaFreeHost(h_B);
        cudaFreeHost(h_C);
    
        return 0;
    }
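
    A note on the design of the example above: each grain of work supplies five callbacks that the load balancer runs in order: per-grain setup (stream and buffer allocation on whichever device the grain lands on), the host-to-device input copy, the compute step (the CUDA kernel on a GPU, or a plain loop if the commented-out CPU device were enabled), the device-to-host output copy, and a stream synchronization. Splitting the array into many small grains is what lets the balancer shift grains toward the faster device over repeated runs.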
    

    【Comments】:
