从 Python Numba CUDA 内核调用的加速 FFT答案

【问题标题】：accelerated FFT to be invoked from Python Numba CUDA kernel从 Python Numba CUDA 内核调用的加速 FFT
【发布时间】：2019-11-08 10:59:43
【问题描述】：

我需要计算 256 个元素的 float64 信号的傅立叶变换。要求是这样的，我需要从 cuda.jitted 部分中调用这些 FFT，并且必须在 25 微秒内完成。唉，cuda.jit 编译的函数不允许调用外部库 => 我自己写的。唉，我的单核代码仍然太慢（在 Quadro P4000 上约为 250 微秒）。有没有更好的办法？

我创建了一个单核 FFT 函数，它可以提供正确的结果，但是速度太慢了 10 倍。我不明白如何充分利用多核。

---fft.py

from numba import cuda, boolean, void, int32, float32, float64, complex128
import math, sys, cmath

def _transform_radix2(vector, inverse, out):    
    n = len(vector) 
    levels = int32(math.log(float32(n))/math.log(float32(2)))

    assert 2**levels==n # error: Length is not a power of 2 

    #uncomment either Numba.Cuda or Numpy memory allocation, (intelligent conditional compileation??)               
    exptable = cuda.local.array(1024, dtype=complex128)   
    #exptable = np.zeros(1024, np.complex128)

    assert (n // 2) <= len(exptable)  # error: FFT length > MAXFFTSIZE

    coef = complex128((2j if inverse else -2j) * math.pi / n)   
    for i in range(n // 2):                       
        exptable[i] = cmath.exp(i * coef)       

    for i in range(n):
        x = i   
        y = 0
        for j in range(levels):
            y = (y << 1) | (x & 1)
            x >>= 1
        out[i] = vector[y]      

    size = 2
    while size <= n:
        halfsize = size // 2
        tablestep = n // size
        for i in range(0, n, size):
            k = 0
            for j in range(i, i + halfsize):
                temp = out[j + halfsize] * exptable[k]    
                out[j + halfsize] = out[j] - temp
                out[j] += temp
                k += tablestep
        size *= 2

    scale=float64(n if inverse else 1)
    for i in range(n):
        out[i]=out[i]/scale   # the inverse requires a scaling

# now create the Numba.cuda version to be called by a GPU
gtransform_radix2 = cuda.jit(device=True)(_transform_radix2)

---test.py

from numba import cuda, void, float64, complex128, boolean
import cupy as cp
import numpy as np
import timeit  
import fft

@cuda.jit(void(float64[:],boolean, complex128[:]))    
def fftbench(y, inverse, FT):
  Y  = cuda.local.array(256, dtype=complex128)

  for i in range(len(y)):
    Y[i]=complex128(y[i])    
  fft.gtransform_radix2(Y, False, FT)


str='\nbest [%2d/%2d] iterations, min:[%9.3f], max:[%9.3f], mean:[%9.3f], std:[%9.3f] usec'
a=[127.734375 ,130.87890625 ,132.1953125  ,129.62109375 ,118.6015625
 ,110.2890625  ,106.55078125 ,104.8203125  ,106.1875     ,109.328125
 ,113.5        ,118.6640625  ,125.71875    ,127.625      ,120.890625
 ,114.04296875 ,112.0078125  ,112.71484375 ,110.18359375 ,104.8828125
 ,104.47265625 ,106.65625    ,109.53515625 ,110.73828125 ,111.2421875
 ,112.28125    ,112.38671875 ,112.7734375  ,112.7421875  ,113.1328125
 ,113.24609375 ,113.15625    ,113.66015625 ,114.19921875 ,114.5
 ,114.5546875  ,115.09765625 ,115.2890625  ,115.7265625  ,115.41796875
 ,115.73828125 ,116.         ,116.55078125 ,116.5625     ,116.33984375
 ,116.63671875 ,117.015625   ,117.25       ,117.41015625 ,117.6640625
 ,117.859375   ,117.91015625 ,118.38671875 ,118.51171875 ,118.69921875
 ,118.80859375 ,118.67578125 ,118.78125    ,118.49609375 ,119.0078125
 ,119.09375    ,119.15234375 ,119.33984375 ,119.31640625 ,119.6640625
 ,119.890625   ,119.80078125 ,119.69140625 ,119.65625    ,119.83984375
 ,119.9609375  ,120.15625    ,120.2734375  ,120.47265625 ,120.671875
 ,120.796875   ,120.4609375  ,121.1171875  ,121.35546875 ,120.94921875
 ,120.984375   ,121.35546875 ,120.87109375 ,120.8359375  ,121.2265625
 ,121.2109375  ,120.859375   ,121.17578125 ,121.60546875 ,121.84375
 ,121.5859375  ,121.6796875  ,121.671875   ,121.78125    ,121.796875
 ,121.8828125  ,121.9921875  ,121.8984375  ,122.1640625  ,121.9375
 ,122.         ,122.3515625  ,122.359375   ,122.1875     ,122.01171875
 ,121.91015625 ,122.11328125 ,122.1171875  ,122.6484375  ,122.81640625
 ,122.33984375 ,122.265625   ,122.78125    ,122.44921875 ,122.34765625
 ,122.59765625 ,122.63671875 ,122.6796875  ,122.6171875  ,122.34375
 ,122.359375   ,122.7109375  ,122.83984375 ,122.546875   ,122.25390625
 ,122.06640625 ,122.578125   ,122.7109375  ,122.83203125 ,122.5390625
 ,122.2421875  ,122.06640625 ,122.265625   ,122.13671875 ,121.8046875
 ,121.87890625 ,121.88671875 ,122.2265625  ,121.63671875 ,121.14453125
 ,120.84375    ,120.390625   ,119.875      ,119.34765625 ,119.0390625
 ,118.4609375  ,117.828125   ,117.1953125  ,116.9921875  ,116.046875
 ,115.16015625 ,114.359375   ,113.1875     ,110.390625   ,108.41796875
 ,111.90234375 ,117.296875   ,127.0234375  ,147.58984375 ,158.625
 ,129.8515625  ,120.96484375 ,124.90234375 ,130.17578125 ,136.47265625
 ,143.9296875  ,150.24609375 ,141.         ,117.71484375 ,109.80859375
 ,115.24609375 ,118.44140625 ,120.640625   ,120.9921875  ,111.828125
 ,101.6953125  ,111.21484375 ,114.91015625 ,115.2265625  ,118.21875
 ,125.3359375  ,139.44140625 ,139.76953125 ,135.84765625 ,137.3671875
 ,141.67578125 ,139.53125    ,136.44921875 ,135.08203125 ,135.7890625
 ,137.58203125 ,138.7265625  ,154.33203125 ,172.01171875 ,152.24609375
 ,129.8046875  ,125.59375    ,125.234375   ,127.32421875 ,132.8984375
 ,147.98828125 ,152.328125   ,153.7734375  ,155.09765625 ,156.66796875
 ,159.0546875  ,151.83203125 ,138.91796875 ,138.0546875  ,140.671875
 ,143.48046875 ,143.99609375 ,146.875      ,146.7578125  ,141.15234375
 ,141.5        ,140.76953125 ,140.8828125  ,145.5625     ,150.78125
 ,148.89453125 ,150.02734375 ,150.70703125 ,152.24609375 ,148.47265625
 ,131.95703125 ,125.40625    ,123.265625   ,123.57421875 ,129.859375
 ,135.6484375  ,144.51171875 ,155.05078125 ,158.4453125  ,140.8125
 ,100.08984375 ,104.29296875 ,128.55078125 ,139.9921875  ,143.38671875
 ,143.69921875 ,137.734375   ,124.48046875 ,116.73828125 ,114.84765625
 ,113.85546875 ,117.45703125 ,122.859375   ,125.8515625  ,133.22265625
 ,139.484375   ,135.75       ,122.69921875 ,115.7734375  ,116.9375
 ,127.57421875] 
y1 =cp.zeros(len(a), cp.complex128) 
FT1=cp.zeros(len(a), cp.complex128)

for i in range(len(a)):
  y1[i]=a[i]  #convert to complex to feed the FFT

r=1000
series=sorted(timeit.repeat("fftbench(y1, False, FT1)",      number=1, repeat=r, globals=globals()))
series=series[0:r-5]
print(str % (len(series), r, 1e6*np.min(series), 1e6*np.max(series), 1e6*np.mean(series), 1e6*np.std(series)));



a faster implementation t<<25usec

【问题讨论】：

标签： python cuda fft jit numba

【解决方案1】：

您的算法的缺点是即使在 GPU 上它也可以在单核上运行。

为了了解如何在 Nvidia GPGPU 上设计算法，我建议查看： the CUDA C Programming guide 和 the numba documentation 在 python 中应用代码。

此外，要了解您的代码有什么问题，我建议使用 Nvidia profiler。

答案的以下部分将解释如何在您的示例中应用基础知识。

运行多个线程

为了提高性能，您首先需要启动多个可以并行运行的线程，CUDA 处理线程如下：

线程被分组为由 n 个线程组成的块 (n
具有相同块的每个线程都可以同步并可以访问称为“共享内存”的（快速）公共内存空间。
您可以在“网格”中并行运行多个块，但您将失去同步机制。

运行多个线程的语法如下：

fftbench[griddim, blockdim](y1, False, FT1)

为了简化，我将只使用一个大小为 256 的块：

fftbench[1, 256](y1, False, FT1)

内存

为了提高 GPU 性能，查看数据的存储位置很重要，它们是三个主要空间：

全局内存：它是 GPU 的“RAM”，速度慢且延迟高，当您将所有阵列发送到 GPU 时，这是放置它们的位置。
共享内存：这是一个有点快的访问内存，一个块的所有线程都可以访问同一个共享内存。
本地内存：物理上与全局内存相同，但每个线程都访问自己的本地内存。

通常，如果您使用多次相同的数据，您应该尝试将它们存储在共享内存中，以防止来自全局内存的延迟。

在您的代码中，您可以将exptable 存储在共享内存中：

exptable = cuda.shared.array(1024, dtype=complex128)

如果n 不是太大，您可能希望使用工作而不是使用out：

working = cuda.shared.array(256, dtype=complex128)

为每个线程分配任务

当然，如果你不改变你的函数，所有线程都会做同样的工作，它只会减慢你的程序。

在这个例子中，我们将把每个线程分配给数组的一个单元格。为此，我们必须通过一个块获取线程的唯一 ID：

idx = cuda.threadIdx.x

现在我们将能够加速 for 循环，让我们一一处理：

exptable = cuda.shared.array(1024, dtype=complex128)   
...
for i in range(n // 2):                       
    exptable[i] = cmath.exp(i * coef)

目标是：我们希望前 n/2 个线程填充这个数组，然后所有线程都可以使用它。

所以在这种情况下，只需用线程 idx 上的条件替换 for 循环：

if idx < n // 2:
    exptable[idx] = cmath.exp(idx * coef)

最后两个循环更简单，每个线程将处理数组的一个单元：

for i in range(n):
    x = i   
    y = 0
    for j in range(levels):
        y = (y << 1) | (x & 1)
        x >>= 1
    out[i] = vector[y]

成为

x = idx   
y = 0
for j in range(levels):
    y = (y << 1) | (x & 1)
    x >>= 1
working[idx] = vector[y]

和

for i in range(n):
    out[i]=out[i]/scale   # the inverse requires a scaling

成为

out[idx]=working[idx]/scale   # the inverse requires a scaling

我使用共享数组working，但如果你想使用全局内存，你可以将它替换为out。

现在，让我们看看 while 循环，我们说过我们希望每个线程只处理数组的一个单元格。所以我们可以尝试将里面的两个for循环并行化。

    ...
    for i in range(0, n, size):
        k = 0
        for j in range(i, i + halfsize):
            temp = out[j + halfsize] * exptable[k]    
            out[j + halfsize] = out[j] - temp
            out[j] += temp
            k += tablestep
    ...

为简化起见，我将只使用一半的线程，我们将取前 128 个线程并确定j 如下：

    ...
    if idx < 128:
        j = (idx%halfsize) + size*(idx//halfsize)
    ...

k 是：

    k = tablestep*(idx%halfsize)

所以我们得到了循环：

size = 2
while size <= n:
    halfsize = size // 2
    tablestep = n // size

    if idx < 128:
        j = (idx%halfsize) + size*(idx//halfsize)            
        k = tablestep*(idx%halfsize)
        temp = working[j + halfsize] * exptable[k]
        working[j + halfsize] = working[j] - temp
        working[j] += temp
    size *= 2

同步

最后但同样重要的是，我们需要同步所有这些线程。事实上，如果我们不同步，程序将无法运行。 GPU 上的线程可能不会同时运行，因此当数据由一个线程生成并由另一个线程使用时，您可能会遇到问题，例如：

exptable[0] 在 thread_0 填充存储其值之前由 thread_2 使用
working[j + halfsize] 在存储到temp 之前被另一个线程修改了

为了防止这种情况，我们可以使用以下功能：

cuda.syncthreads()

同一块中的所有线程将在执行其余代码之前完成这一行。

在本例中，您需要在 working 初始化之后和 while 循环的每次迭代之后两点进行同步。

那么您的代码如下所示：

def _transform_radix2(vector, inverse, out):    
  n = len(vector) 
  levels = int32(math.log(float32(n))/math.log(float32(2)))

  assert 2**levels==n # error: Length is not a power of 2 

  exptable = cuda.shared.array(1024, dtype=complex128)
  working = cuda.shared.array(256, dtype=complex128)

  assert (n // 2) <= len(exptable)  # error: FFT length > MAXFFTSIZE

  coef = complex128((2j if inverse else -2j) * math.pi / n)   
  if idx < n // 2:
    exptable[idx] = cmath.exp(idx * coef)    

  x = idx   
  y = 0
  for j in range(levels):
    y = (y << 1) | (x & 1)
    x >>= 1
  working[idx] = vector[y]    
  cuda.syncthreads()

  size = 2
  while size <= n:
    halfsize = size // 2
    tablestep = n // size

    if idx < 128:
      j = (idx%halfsize) + size*(idx//halfsize)            
      k = tablestep*(idx%halfsize)
      temp = working[j + halfsize] * exptable[k]
      working[j + halfsize] = working[j] - temp
      working[j] += temp
    size *= 2
    cuda.syncthreads()

  scale=float64(n if inverse else 1)
  out[idx]=working[idx]/scale   # the inverse requires a scaling

我觉得您的问题是介绍一些有关 GPGPU 计算基础知识的好方法，我尝试以教学的方式回答它。最终的代码远非完美，还可以进行很多优化，如果您想了解更多关于 GPU 优化的信息，我强烈建议您阅读此Programming guide。

【讨论】：

那么，加速是多少？