【发布时间】:2016-09-10 17:58:49
【问题描述】:
我尝试使用 DeviceMapModule 和 DeviceMap2Module 进行简单的映射操作。 令我惊讶的是,它比手动编写内核慢了大约 2 倍。 对于手写内核,我没有做任何特别的事情。我刚刚复制了 Getting Started Square 内核并修改它来做 sigmoid。
下面是两个版本的代码:
首先是慢速版本:
// Sigmoid layer built on Alea GPU's DeviceMapModule/DeviceMap2Module helpers.
// The forward/backward device functions are captured as F# quotations
// (<@ ... @>) and compiled by the library; [<ReflectedDefinition; AOTCompile>]
// makes the function bodies available for that compilation.
type SigmoidModule(size) =
    inherit Module(size, size) // Note Module is my own abstract class that defines the forward/backward interface.

    // Sigmoid derivative combined with the incoming gradient:
    // s is the stored forward activation, y the upstream gradient dy.
    [<ReflectedDefinition; AOTCompile>]
    let sigmoid' s y = (1.0f-s) * s * y

    // Element-wise sigmoid; __nv_expf is the CUDA single-precision exp intrinsic.
    [<ReflectedDefinition; AOTCompile>]
    let sigmoid x = 1.0f / (1.0f + __nv_expf (-x))

    // Library-generated map kernels built from the quoted device functions above.
    let forwardMap = new DeviceMapModule<float32, float32>(GPUModuleTarget.DefaultWorker, <@ sigmoid @>)
    let backwardMap = new DeviceMap2Module<float32, float32, float32>(
                          GPUModuleTarget.DefaultWorker, <@ sigmoid' @>)

    // Output buffer reused across calls; reallocated only when the batch size
    // (row count) changes — see Forward.
    let mutable activation = Matrix.ZeroCreate 1 size

    // Computes activation = sigmoid(fwd) on the device and returns the
    // (module-owned) activation buffer.
    override m.Forward (fwd:Matrix) =
        assert(fwd.Cols = size)
        // Reallocate the output buffer only when the row count changed,
        // disposing the old device allocation first.
        if activation.Rows <> fwd.Rows then
            activation.Data.Dispose()
            activation <- Matrix.ZeroCreate fwd.Rows fwd.Cols
        forwardMap.Map(activation.Data.Ptr, fwd.Data.Ptr, fwd.Cols * fwd.Rows)
        activation

    // Computes the input gradient in place: activation is passed as both the
    // output and the first input, so the stored activation is overwritten
    // with sigmoid'(activation) * dy.
    override m.Backward (dy:Matrix) =
        assert(dy.Cols = size)
        assert(activation.Rows = dy.Rows)
        backwardMap.Map(activation.Data.Ptr, activation.Data.Ptr, dy.Data.Ptr, dy.Cols * dy.Rows)
        activation

    // Releases the compiled map modules and the device buffer.
    interface IDisposable with
        member m.Dispose() =
            forwardMap.Dispose()
            backwardMap.Dispose()
            activation.Data.Dispose()
现在是快速版本:
// Hand-written-kernel sigmoid layer (the "fast" version): instead of the
// library's DeviceMapModule, it launches explicitly written CUDA kernels via
// worker.Launch with tuned launch parameters.
// Fix vs. original: removed the unused local binding `sigmoid'` — the
// backward kernel inlines the formula directly, so the binding was dead code
// (unlike the slow version, where it is captured in a quotation).
type SigmoidModuleFast(size) =
    inherit Module(size, size)

    // Shared default worker; not disposed here because Worker.Default is a
    // library-owned singleton (presumably — confirm against Alea docs).
    let worker = Worker.Default

    // Forward kernel: grid-stride loop computing outputs.[i] = sigmoid(inputs.[i]).
    [<ReflectedDefinition; AOTCompile>]
    static let sigmoidKernel (outputs:deviceptr<float32>) (inputs:deviceptr<float32>) n =
        let start = blockIdx.x * blockDim.x + threadIdx.x
        let stride = gridDim.x * blockDim.x
        // __nv_expf is the CUDA single-precision exp intrinsic.
        let sigmoid x = 1.0f / (1.0f + __nv_expf (-x))
        let mutable i = start
        while i < n do
            outputs.[i] <- sigmoid(inputs.[i])
            i <- i + stride

    // Backward kernel: grid-stride loop computing the sigmoid-derivative
    // product outputs.[i] = (1 - s) * s * dy, where s is the stored activation.
    [<ReflectedDefinition; AOTCompile>]
    static let sigmoidPrimeKernel (outputs:deviceptr<float32>) (input:deviceptr<float32>) (dy:deviceptr<float32>) n =
        let start = blockIdx.x * blockDim.x + threadIdx.x
        let stride = gridDim.x * blockDim.x
        let mutable i = start
        while i < n do
            let s = input.[i]
            let y = dy.[i]
            outputs.[i] <- (1.0f-s) * s * y
            i <- i + stride

    // Output buffer reused across calls; reallocated when the batch size changes.
    let mutable activation = Matrix.ZeroCreate 1 size
    // Launch parameters recomputed in Forward and reused by Backward
    // (valid because both kernels process the same element count).
    let mutable lp = LaunchParam(1, size)

    // Computes activation = sigmoid(fwd) on the device and returns the
    // (module-owned) activation buffer.
    override m.Forward (fwd:Matrix) =
        assert(fwd.Cols = size)
        if activation.Rows <> fwd.Rows then
            activation.Data.Dispose()
            activation <- Matrix.ZeroCreate fwd.Rows fwd.Cols
        let threads = fwd.Rows * fwd.Cols
        if threads < 1024 then
            // Small problem: a single block is enough.
            lp <- LaunchParam(1, threads)
        else
            // Cap the grid at 16 blocks per SM; the grid-stride loop covers
            // any remaining elements.
            let blockSize = 256
            let numSm = worker.Device.Attributes.MULTIPROCESSOR_COUNT
            let gridSize = Math.Min(16 * numSm, divup threads blockSize)
            lp <- new LaunchParam(gridSize, blockSize)
        worker.Launch <@ sigmoidKernel @> lp activation.Data.Ptr fwd.Data.Ptr (fwd.Cols*fwd.Rows)
        activation

    // Computes the input gradient in place: activation serves as both output
    // and first input, so it is overwritten with sigmoid'(activation) * dy.
    // Reuses the launch parameters chosen by the preceding Forward call.
    override m.Backward (dy:Matrix) =
        assert(dy.Cols = size)
        assert(activation.Rows = dy.Rows)
        worker.Launch <@ sigmoidPrimeKernel @> lp activation.Data.Ptr activation.Data.Ptr dy.Data.Ptr (dy.Cols*dy.Rows)
        activation

    // Releases the device buffer; the shared worker is intentionally left alive.
    interface IDisposable with
        member m.Dispose() =
            activation.Data.Dispose()
以下是关于我的系统的一些信息:
- 名称 Intel(R) Core(TM) i5-4590 CPU @ 3.30GHz
- 架构 x64
- 频率 3,300 MHz
- 核心数 4
- 页面大小 4,096
- 总物理内存 8,050.00 MB
- 可用物理内存 902.00 MB
- 混合图形已启用 False
- 版本名称 Windows 10 专业版
- 版本号 10.0.10586
- Nsight 版本 5.1.0.16062
- Visual Studio 版本 14.0
GPU:英伟达 GTX 980 Ti
- .NET CLR:.NET 4.6
【问题讨论】:
标签: aleagpu