GPU 上的 NumPy N 数组交集答案

【问题标题】：NumPy N-Array Intersection on GPUGPU 上的 NumPy N 数组交集
【发布时间】：2018-09-08 19:00:50
【问题描述】：

我的代码中的一个瓶颈是找到 N 个数组的索引交集；数百万次。使用 np.intersect1d 进行简单的 NumPy 计算，但运行数百万次需要付出代价。

一个例子：

arr1 = [0,1,2,3,4]
arr2 = [0,3,4]
arr3 = [3,4]

交点是[3,4]

我想利用 GPU 线程，但在实现方面遇到了困难...... 欢迎使用 CUDA、OpenCL、Numba 和/或其他解决方案。

这是python代码：

import functools, datetime
import numpy as np


def run():
    """
    Create fake-data variable `grouped_data` which is a list of 100k entries. 
    Each element has 3 numpy arrays that are UNIQUE AND SORTED.

    Goal: iterate through `grouped_data` to find intersecting values per element.
    Ie, length of output equals length of input, `grouped_data`.
    In each element, these common values will be used to slice another numpy 
    array which is not included here.

    *Question*: how can this be moved to the GPU?  I'd like to leverage GPU threads.
    CUDA, OpenCL, Numba and/or `other` solutions welcome.
    """
    grouped_data = create_data()                            # 9% of runtime
    overlap = loop_through_intersections(grouped_data)      # 91% of runtime



def create_data():
    """ Return `grouped_data`, list of 100k entries. Each element has 3 numpy arrays 
    kern profiler shows this function takes ~ 9% of runtime """
    array = np.array(range(2000))

    grouped_data = []
    for i in range(100000):
        ar1 = array[::np.random.randint(1,9)]
        ar2 = array[::np.random.randint(1,9)]
        ar3 = array[::np.random.randint(1,9)]

        grouped_data.append( [ar1, ar2, ar3] )

    return grouped_data


def loop_through_intersections(grouped_data):
    """  for each element in grouped_data (3 numpy arrays), find the intersecting values 
    kern profiler shows this function takes ~ 91% of runtime 
    """
    overlap = []
    for f in grouped_data:
        overlap.append( functools.reduce(intersect1d, f) )
    return overlap


def intersect1d(ar1, ar2):
    """
    Find the intersection of two arrays.
    Return the sorted, unique values that are in both of the input arrays.

    Taken from NumPy.   https://github.com/numpy/numpy/blob/v1.14.0/numpy/lib/arraysetops.py#L297-L338
    """
    aux = np.concatenate((ar1, ar2))
    aux.sort()
    return aux[:-1][aux[1:] == aux[:-1]]




####################################################
#            Runtime takes ~6s 
####################################################

st = datetime.datetime.now()
run();  print datetime.datetime.now() - st

我也愿意转换输入。例如，我可以将列表 grouped_data 转换为矩阵。

欢迎所有 GPU 解决方案。

更新 - CUDA 尝试

第一次更新，我将数据转换为矩阵（而不是列表列表）以将数组传递给 GPU。

第二次更新，为简单起见，样本数据现在要小得多。

第三次更新，我正在学习CUDA并写了一个简单的内核，但是行为出乎意料...

我的内核每个输出列应该有 1 个线程。对于第一个线程（值 0），取输入矩阵列 0、1、2 并找到值交集。如果连续都是 1，则将输出行设置为 1，否则什么也不做。

目前的输出是意外的，我不知道为什么。有什么想法吗？？

import numpy as np

import pycuda.driver as drv
import pycuda.autoinit
from pycuda.compiler import SourceModule
from pycuda import driver, compiler, gpuarray, tools


def create_data(rows, iterations):
    np.random.seed(42)
    array = np.array(range(rows))

    grouped_data = []
    for i in range(iterations):
        for j in range(3):
            index = np.zeros(rows, dtype=bool)
            index[ array[::np.random.randint(1,4)] ] = True        
            grouped_data.append( index )

    #matrix = np.array( np.array(grouped_data).T, order='F')
    matrix = np.array( np.array(grouped_data).T, dtype=np.float32)
    return matrix



def get_kernel_code(rows, iterations):

    kernel_code = """
    __global__ void MyKernel(int rows, float matrix[MATRIX_ROWS][MATRIX_COLS], float output[OUTPUT_ROWS][OUTPUT_COLS])
    {
        const int thread = blockIdx.x * blockDim.x + threadIdx.x;


        if (thread < rows){

            int col1 = thread*3;
            int col2 = thread*3+1;
            int col3 = thread*3+2;

            for (int i=0; i<rows; i++) {
                if (matrix[i][col1]==1 && matrix[i][col1]==matrix[i][col2] && matrix[i][col2]==matrix[i][col3]) {
                    output[i][thread] = 1; }
            }
        }

    }
    """

    kernel_code = kernel_code.replace('MATRIX_ROWS', str(rows) )
    kernel_code = kernel_code.replace('MATRIX_COLS', str(iterations*3) )
    kernel_code = kernel_code.replace('OUTPUT_ROWS', str(rows) )
    kernel_code = kernel_code.replace('OUTPUT_COLS', str(iterations) )

    return kernel_code



def cuda_attempt(rows, iterations):
    """
    Create data, use gpuarray, get pycuda result. 
    """

    # Setup data
    kernel_code = get_kernel_code(rows, iterations)
    np.random.seed(42)
    matrix = create_data(rows, iterations).astype(np.float32)


    # Transfer host (CPU) memory to device (GPU) memory
    input = gpuarray.to_gpu(matrix)
    output = gpuarray.empty((rows, iterations), np.float32)


    # Compile the kernel code 
    mod = compiler.SourceModule(kernel_code)
    intersect = mod.get_function("MyKernel")


    # Define Thread & Block Size
    number_threads = output.shape[1]
    number_blocks = 1

    intersect(
        np.int32(rows), input, output,
        block=(number_blocks,number_threads,1)
        )

    gpu_output = output.get()
    print '\n output col0 which is the intersection of first 3 input columns\n', gpu_output[:, :1]
    print '\n should be \n', np.array([1, 0,0,1,0,0,1,0,0,1], dtype=float)


    old = input.get()
    print '\n Matrix Input for 1st Grouping of 3 \n', old[:, 0:3]

    return



cuda_attempt(rows=10, iterations=2)

【问题讨论】：

标签： arrays numpy cuda gpu intersection

【解决方案1】：

以下是一个解决方案。良好的学习经历。

GPU 代码比 Numba 快 5 倍。没有我想要的那么好... 我仍然可以优化块和网格大小，但现在要离开了。

import numpy as np
import datetime
from numba import njit

from pycuda import driver, compiler, gpuarray, tools
import pycuda.autoinit 



def compare(rows, iterations):
    """
    Run CPU & GPU Version.  Compare output.
    Creates binary matrix called a_cpu which represents a dataset.
    The goal is to take 3 columns at a time and if all are 1, pass 1
    to the output matrix.
    """

    np.random.seed(42)
    a_cpu = np.random.randint(0,2, (rows, iterations*3)).astype(np.float32)


    st = datetime.datetime.now()
    cpu = np.zeros((rows, iterations), dtype=int)
    iterate_over_matrix(a_cpu, iterations, rows, cpu)
    print '\n\t CPU runtime: ', datetime.datetime.now() - st


    st = datetime.datetime.now()
    gpu = cuda_attempt(rows, iterations, a_cpu)
    print '\n\t GPU runtime: ', datetime.datetime.now() - st

    print "cpu.sum(): {:,}".format(cpu.sum())
    print "gpu.sum(): {:,}".format(int(gpu.sum()))



def get_kernel_code(iterations):

    kernel_code = """
    __global__ void MatrixMulKernel(int ROWS, float *A, float *C)
    {
        const int wC = %(C_SIZE)s;

        const int blockId = blockIdx.y * gridDim.x + blockIdx.x;
        const int thread = blockId * blockDim.x + threadIdx.x;


        if ( thread < (ROWS * wC) ) {
            float Aele = A[3*thread] * A[3*thread +1] * A[3*thread +2];
            C[thread] = Aele;
        }
    }
    """

    kernel_code = kernel_code % { 
        'A_SIZE': 3*iterations,
        'C_SIZE': iterations,
        }

    return kernel_code



def cuda_attempt(rows, iterations, a_cpu):
    """
    Create data, use gpuarray, get pycuda result. 
    """

    a_gpu = gpuarray.to_gpu(a_cpu) 
    c_gpu = gpuarray.empty((rows, iterations), np.float32)


    kernel_code = get_kernel_code(iterations)
    mod = compiler.SourceModule(kernel_code)
    matrixmul = mod.get_function("MatrixMulKernel")


    # 2D Grid of 1D Blocks
    needed_threads = rows * iterations
    threads = 1024
    number_blocks = needed_threads // threads + 1
    number_blocks = int(np.sqrt(number_blocks)) + 1

    assert (number_blocks <= 65535), "number of blocks exceeds allowed limit in 1 dimension"


    grid = (number_blocks, number_blocks)
    block = (threads, 1, 1)

    matrixmul(
        np.int32(rows), a_gpu, c_gpu, 
        grid = grid,
        block = block,
        )

    return c_gpu.get()



#===============================================================================
#                    CPU CALCULATTIONS
#===============================================================================

@njit
def iterate_over_matrix(matrix, iterations, rows, bools):
    for i in range(iterations):
        arr = matrix[:, i*3:(i*3+3)]
        check_intersection(bools[:, i], arr[:, 0], arr[:, 1], arr[:, 2], rows)



@njit
def check_intersection(index, ar1, ar2, ar3, rows):
    for i in range(rows):
        if ar1[i] == ar2[i] == ar3[i] == True:
            index[i] = True





#===============================================================================
#                    RUN
#===============================================================================


rows=5
iterations=2


rows=2000
iterations=100000


compare(rows, iterations)

【讨论】：

我无法理解您的输出。你能澄清一下吗？