【问题标题】:Alea CUDA DeviceMapModule much slower than handwritten kernel（Alea CUDA DeviceMapModule 比手写内核慢得多）
【发布时间】:2016-09-10 17:58:49
【问题描述】:

我尝试使用 DeviceMapModule 和 DeviceMap2Module 进行简单的映射操作。 令我惊讶的是,它比手动编写内核慢了大约 2 倍。 对于手写内核,我没有做任何特别的事情。我刚刚复制了 Getting Started Square 内核并修改它来做 sigmoid。

下面是两个版本的代码:

首先是慢速版本:

/// Sigmoid activation layer built on Alea's DeviceMapModule helpers.
/// Forward applies the logistic sigmoid elementwise; Backward multiplies the
/// incoming gradient by the sigmoid derivative, in place over the activation buffer.
type SigmoidModule(size) =
    inherit Module(size, size) // Note Module is my own abstract class that defines the forward/backward interface.

    // Sigmoid derivative expressed via the stored activation s (= sigmoid(x)),
    // scaled by the incoming gradient y: (1 - s) * s * y.
    [<ReflectedDefinition; AOTCompile>]
    let sigmoid' s y = (1.0f-s) * s * y

    // Logistic sigmoid 1 / (1 + e^(-x)); __nv_expf is the CUDA libdevice
    // single-precision exponential intrinsic.
    [<ReflectedDefinition; AOTCompile>]
    let sigmoid x = 1.0f / (1.0f + __nv_expf (-x))

    // Alea map modules compiled from the quoted functions above; each performs
    // an elementwise device-side map over a flat float32 buffer.
    let forwardMap = new DeviceMapModule<float32, float32>(GPUModuleTarget.DefaultWorker, <@ sigmoid @>)
    let backwardMap = new DeviceMap2Module<float32, float32, float32>(
                        GPUModuleTarget.DefaultWorker, <@ sigmoid' @>)

    // Output/activation buffer; lazily reallocated whenever the row count
    // (batch size) of the incoming matrix changes.
    let mutable activation = Matrix.ZeroCreate 1 size

    // Forward pass: activation <- sigmoid(fwd), elementwise over
    // fwd.Cols * fwd.Rows values. Returns the shared activation matrix.
    override m.Forward (fwd:Matrix) = 
        assert(fwd.Cols = size)
        if activation.Rows <> fwd.Rows then
            activation.Data.Dispose()
            activation <- Matrix.ZeroCreate fwd.Rows fwd.Cols

        forwardMap.Map(activation.Data.Ptr, fwd.Data.Ptr, fwd.Cols * fwd.Rows)
        activation

    // Backward pass: activation <- (1 - activation) * activation * dy.
    // The output and first input pointers alias the same buffer, so the
    // activation is overwritten in place with the gradient.
    override m.Backward (dy:Matrix) = 
        assert(dy.Cols = size)
        assert(activation.Rows = dy.Rows)

        backwardMap.Map(activation.Data.Ptr, activation.Data.Ptr, dy.Data.Ptr, dy.Cols * dy.Rows)
        activation

    // Release the compiled GPU modules and the device buffer.
    interface IDisposable with
        member m.Dispose() = 
            forwardMap.Dispose()
            backwardMap.Dispose()
            activation.Data.Dispose()

现在是快速版本:

/// Hand-written-kernel variant of the sigmoid layer (the "fast version").
/// Fix: the pasted code had lost all member indentation; under F#'s offside
/// rule the class members must be indented inside the `type` definition to
/// compile. Indentation restored; tokens otherwise unchanged.
type SigmoidModuleFast(size) =
    inherit Module(size, size)

    // Host-side derivative helper; not referenced by the kernels below
    // (kept for parity with the DeviceMapModule version of this class).
    let sigmoid' s y = (1.0f-s) * s * y
    let worker = Worker.Default

    // Forward kernel: each thread starts at its global index and advances by
    // the total thread count until n elements are covered, writing
    // outputs[i] = sigmoid(inputs[i]).
    [<ReflectedDefinition; AOTCompile>]
    static let sigmoidKernel (outputs:deviceptr<float32>) (inputs:deviceptr<float32>) n =
        let start = blockIdx.x * blockDim.x + threadIdx.x
        let stride = gridDim.x * blockDim.x
        let sigmoid x = 1.0f / (1.0f + __nv_expf (-x))

        let mutable i = start 
        while i < n do
            outputs.[i] <- sigmoid(inputs.[i])
            i <- i + stride

    // Backward kernel: outputs[i] = (1 - input[i]) * input[i] * dy[i],
    // i.e. sigmoid derivative (from the stored activation) times the gradient.
    [<ReflectedDefinition; AOTCompile>]
    static let sigmoidPrimeKernel (outputs:deviceptr<float32>) (input:deviceptr<float32>) (dy:deviceptr<float32>) n =
        let start = blockIdx.x * blockDim.x + threadIdx.x
        let stride = gridDim.x * blockDim.x

        let mutable i = start 
        while i < n do
            let s = input.[i]
            let y = dy.[i]
            outputs.[i] <- (1.0f-s) * s * y
            i <- i + stride

    // Activation buffer and cached launch parameters.
    // NOTE(review): lp is only recomputed when activation.Rows changes, and its
    // initial value LaunchParam(1, size) assumes size <= 1024 threads per
    // block — confirm the first Forward call always triggers the recompute path.
    let mutable activation = Matrix.ZeroCreate 1 size
    let mutable lp = LaunchParam(1, size)

    // Forward pass: reallocate the buffer and recompute launch parameters when
    // the batch size changes, then launch the sigmoid kernel over all elements.
    override m.Forward (fwd:Matrix) = 
        assert(fwd.Cols = size)
        if activation.Rows <> fwd.Rows then
            activation.Data.Dispose()
            activation <- Matrix.ZeroCreate fwd.Rows fwd.Cols
            let threads = fwd.Rows * fwd.Cols
            if threads < 1024 then
                lp <- LaunchParam(1, threads)
            else
                // Cap the grid at 16 blocks per SM; the kernel's stride loop
                // covers any remaining elements.
                let blockSize = 256
                let numSm = worker.Device.Attributes.MULTIPROCESSOR_COUNT
                let gridSize = Math.Min(16 * numSm, divup threads blockSize)
                lp <- new LaunchParam(gridSize, blockSize)

        worker.Launch <@ sigmoidKernel @> lp activation.Data.Ptr fwd.Data.Ptr (fwd.Cols*fwd.Rows)
        activation

    // Backward pass: overwrite the activation in place with the gradient
    // (output pointer aliases the first input pointer).
    override m.Backward (dy:Matrix) = 
        assert(dy.Cols = size)
        assert(activation.Rows = dy.Rows)

        worker.Launch <@ sigmoidPrimeKernel @> lp activation.Data.Ptr activation.Data.Ptr dy.Data.Ptr (dy.Cols*dy.Rows)
        activation

    // Release the device buffer (kernels are static; no modules to dispose).
    interface IDisposable with
        member m.Dispose() = 
            activation.Data.Dispose()

以下是关于我的系统的一些信息:

  • 名称 Intel(R) Core(TM) i5-4590 CPU @ 3.30GHz
  • 架构 x64
  • 频率 3,300 MHz
  • 核心数 4
  • 页面大小 4,096
  • 总物理内存 8,050.00 MB
  • 可用物理内存 902.00 MB
  • 混合图形已启用 False
  • 版本名称 Windows 10 专业版
  • 版本号 10.0.10586
  • Nsight 版本 5.1.0.16062
  • Visual Studio 14.0 版

  • GPU:英伟达 GTX 980 Ti

  • .NET CLR:.NET 4.6。

【问题讨论】:

    标签: aleagpu


    【解决方案1】:

    我不是 GPU 编程方面的专家,但有基本的了解。我在评估 Alea GPU 时看到了这个问题。

    NVidia GTX 980 Ti 有 2816 个内核。 blockSize 为 256,网格大小为 2816 / 256 = 11。

    Alea Community/Basic Developer 库最多允许 2688 个内核。如果 Alea 库将 blockSize 设置为 256(这是一个猜测),则 gridSize 为 10.5。

    会不会是把工作一分为二?第一次运行饱和了 2688 核心限制,然后其余运行在 2816 - 2688 = 128 核心上?

    【讨论】:

      猜你喜欢
      • 1970-01-01
      • 2012-11-27
      • 1970-01-01
      • 1970-01-01
      • 2011-01-24
      • 1970-01-01
      • 2012-04-26
      • 2014-11-07
      • 2018-05-21
      相关资源
      最近更新 更多