【问题标题】:NumPy N-Array Intersection on GPUGPU 上的 NumPy N 数组交集
【发布时间】:2018-09-08 19:00:50
【问题描述】:

我的代码中的一个瓶颈是找到 N 个数组的索引交集;数百万次。使用 np.intersect1d 进行简单的 NumPy 计算,但运行数百万次需要付出代价。

一个例子:

arr1 = [0,1,2,3,4]
arr2 = [0,3,4]
arr3 = [3,4]

交点是[3,4]

我想利用 GPU 线程,但在实现方面遇到了困难...... 欢迎使用 CUDA、OpenCL、Numba 和/或其他解决方案。

这是python代码:

import functools, datetime
import numpy as np


def run():
    """
    Create fake-data variable `grouped_data` which is a list of 100k entries. 
    Each element has 3 numpy arrays that are UNIQUE AND SORTED.

    Goal: iterate through `grouped_data` to find intersecting values per element.
    Ie, length of output equals length of input, `grouped_data`.
    In each element, these common values will be used to slice another numpy 
    array which is not included here.

    *Question*: how can this be moved to the GPU?  I'd like to leverage GPU threads.
    CUDA, OpenCL, Numba and/or `other` solutions welcome.
    """
    grouped_data = create_data()                            # 9% of runtime
    overlap = loop_through_intersections(grouped_data)      # 91% of runtime



def create_data():
    """ Return `grouped_data`, list of 100k entries. Each element has 3 numpy arrays 
    kern profiler shows this function takes ~ 9% of runtime """
    array = np.array(range(2000))

    grouped_data = []
    for i in range(100000):
        ar1 = array[::np.random.randint(1,9)]
        ar2 = array[::np.random.randint(1,9)]
        ar3 = array[::np.random.randint(1,9)]

        grouped_data.append( [ar1, ar2, ar3] )

    return grouped_data


def loop_through_intersections(grouped_data):
    """  for each element in grouped_data (3 numpy arrays), find the intersecting values 
    kern profiler shows this function takes ~ 91% of runtime 
    """
    overlap = []
    for f in grouped_data:
        overlap.append( functools.reduce(intersect1d, f) )
    return overlap


def intersect1d(ar1, ar2):
    """
    Find the intersection of two arrays.
    Return the sorted, unique values that are in both of the input arrays.

    Taken from NumPy.   https://github.com/numpy/numpy/blob/v1.14.0/numpy/lib/arraysetops.py#L297-L338
    """
    aux = np.concatenate((ar1, ar2))
    aux.sort()
    return aux[:-1][aux[1:] == aux[:-1]]




####################################################
#            Runtime takes ~6s 
####################################################

st = datetime.datetime.now()
run();  print datetime.datetime.now() - st

我也愿意转换输入。例如,我可以将列表 grouped_data 转换为矩阵。

欢迎所有 GPU 解决方案。

**

更新 - CUDA 尝试

**

第一次更新,我将数据转换为矩阵(而不是列表列表)以将数组传递给 GPU。

第二次更新,为简单起见,样本数据现在要小得多。

第三次更新,我正在学习CUDA并写了一个简单的内核,但是行为出乎意料...

我的内核每个输出列应该有 1 个线程。 对于第一个线程(值 0),取输入矩阵列 0、1、2 并找到值交集。如果连续都是 1,则将输出行设置为 1,否则什么也不做。

目前的输出是意外的,我不知道为什么。 有什么想法吗??

import numpy as np

import pycuda.driver as drv
import pycuda.autoinit
from pycuda.compiler import SourceModule
from pycuda import driver, compiler, gpuarray, tools


def create_data(rows, iterations):
    np.random.seed(42)
    array = np.array(range(rows))

    grouped_data = []
    for i in range(iterations):
        for j in range(3):
            index = np.zeros(rows, dtype=bool)
            index[ array[::np.random.randint(1,4)] ] = True        
            grouped_data.append( index )

    #matrix = np.array( np.array(grouped_data).T, order='F')
    matrix = np.array( np.array(grouped_data).T, dtype=np.float32)
    return matrix



def get_kernel_code(rows, iterations):

    kernel_code = """
    __global__ void MyKernel(int rows, float matrix[MATRIX_ROWS][MATRIX_COLS], float output[OUTPUT_ROWS][OUTPUT_COLS])
    {
        const int thread = blockIdx.x * blockDim.x + threadIdx.x;


        if (thread < rows){

            int col1 = thread*3;
            int col2 = thread*3+1;
            int col3 = thread*3+2;

            for (int i=0; i<rows; i++) {
                if (matrix[i][col1]==1 && matrix[i][col1]==matrix[i][col2] && matrix[i][col2]==matrix[i][col3]) {
                    output[i][thread] = 1; }
            }
        }

    }
    """

    kernel_code = kernel_code.replace('MATRIX_ROWS', str(rows) )
    kernel_code = kernel_code.replace('MATRIX_COLS', str(iterations*3) )
    kernel_code = kernel_code.replace('OUTPUT_ROWS', str(rows) )
    kernel_code = kernel_code.replace('OUTPUT_COLS', str(iterations) )

    return kernel_code



def cuda_attempt(rows, iterations):
    """
    Create data, use gpuarray, get pycuda result. 
    """

    # Setup data
    kernel_code = get_kernel_code(rows, iterations)
    np.random.seed(42)
    matrix = create_data(rows, iterations).astype(np.float32)


    # Transfer host (CPU) memory to device (GPU) memory
    input = gpuarray.to_gpu(matrix)
    output = gpuarray.empty((rows, iterations), np.float32)


    # Compile the kernel code 
    mod = compiler.SourceModule(kernel_code)
    intersect = mod.get_function("MyKernel")


    # Define Thread & Block Size
    number_threads = output.shape[1]
    number_blocks = 1

    intersect(
        np.int32(rows), input, output,
        block=(number_blocks,number_threads,1)
        )

    gpu_output = output.get()
    print '\n output col0 which is the intersection of first 3 input columns\n', gpu_output[:, :1]
    print '\n should be \n', np.array([1, 0,0,1,0,0,1,0,0,1], dtype=float)


    old = input.get()
    print '\n Matrix Input for 1st Grouping of 3 \n', old[:, 0:3]

    return



cuda_attempt(rows=10, iterations=2)

【问题讨论】:

    标签: arrays numpy cuda gpu intersection


    【解决方案1】:

    以下是一个解决方案。良好的学习经历。

    GPU 代码比 Numba 快 5 倍。没有我想要的那么好... 我仍然可以优化块和网格大小,但现在要离开了。

    import numpy as np
    import datetime
    from numba import njit
    
    from pycuda import driver, compiler, gpuarray, tools
    import pycuda.autoinit 
    
    
    
    def compare(rows, iterations):
        """
        Run CPU & GPU Version.  Compare output.
        Creates binary matrix called a_cpu which represents a dataset.
        The goal is to take 3 columns at a time and if all are 1, pass 1
        to the output matrix.
        """
    
        np.random.seed(42)
        a_cpu = np.random.randint(0,2, (rows, iterations*3)).astype(np.float32)
    
    
        st = datetime.datetime.now()
        cpu = np.zeros((rows, iterations), dtype=int)
        iterate_over_matrix(a_cpu, iterations, rows, cpu)
        print '\n\t CPU runtime: ', datetime.datetime.now() - st
    
    
        st = datetime.datetime.now()
        gpu = cuda_attempt(rows, iterations, a_cpu)
        print '\n\t GPU runtime: ', datetime.datetime.now() - st
    
        print "cpu.sum(): {:,}".format(cpu.sum())
        print "gpu.sum(): {:,}".format(int(gpu.sum()))
    
    
    
    def get_kernel_code(iterations):
    
        kernel_code = """
        __global__ void MatrixMulKernel(int ROWS, float *A, float *C)
        {
            const int wC = %(C_SIZE)s;
    
            const int blockId = blockIdx.y * gridDim.x + blockIdx.x;
            const int thread = blockId * blockDim.x + threadIdx.x;
    
    
            if ( thread < (ROWS * wC) ) {
                float Aele = A[3*thread] * A[3*thread +1] * A[3*thread +2];
                C[thread] = Aele;
            }
        }
        """
    
        kernel_code = kernel_code % { 
            'A_SIZE': 3*iterations,
            'C_SIZE': iterations,
            }
    
        return kernel_code
    
    
    
    def cuda_attempt(rows, iterations, a_cpu):
        """
        Create data, use gpuarray, get pycuda result. 
        """
    
        a_gpu = gpuarray.to_gpu(a_cpu) 
        c_gpu = gpuarray.empty((rows, iterations), np.float32)
    
    
        kernel_code = get_kernel_code(iterations)
        mod = compiler.SourceModule(kernel_code)
        matrixmul = mod.get_function("MatrixMulKernel")
    
    
        # 2D Grid of 1D Blocks
        needed_threads = rows * iterations
        threads = 1024
        number_blocks = needed_threads // threads + 1
        number_blocks = int(np.sqrt(number_blocks)) + 1
    
        assert (number_blocks <= 65535), "number of blocks exceeds allowed limit in 1 dimension"
    
    
        grid = (number_blocks, number_blocks)
        block = (threads, 1, 1)
    
        matrixmul(
            np.int32(rows), a_gpu, c_gpu, 
            grid = grid,
            block = block,
            )
    
        return c_gpu.get()
    
    
    
    #===============================================================================
    #                    CPU CALCULATTIONS
    #===============================================================================
    
    @njit
    def iterate_over_matrix(matrix, iterations, rows, bools):
        for i in range(iterations):
            arr = matrix[:, i*3:(i*3+3)]
            check_intersection(bools[:, i], arr[:, 0], arr[:, 1], arr[:, 2], rows)
    
    
    
    @njit
    def check_intersection(index, ar1, ar2, ar3, rows):
        for i in range(rows):
            if ar1[i] == ar2[i] == ar3[i] == True:
                index[i] = True
    
    
    
    
    
    #===============================================================================
    #                    RUN
    #===============================================================================
    
    
    rows=5
    iterations=2
    
    
    rows=2000
    iterations=100000
    
    
    compare(rows, iterations)
    

    【讨论】:

    • 我无法理解您的输出。你能澄清一下吗?
    猜你喜欢
    • 2015-06-27
    • 1970-01-01
    • 1970-01-01
    • 2018-12-03
    • 1970-01-01
    • 1970-01-01
    • 2021-04-16
    • 2019-03-29
    • 1970-01-01
    相关资源
    最近更新 更多