【问题标题】:Is there a way to retrieve indices of particular values using Pycuda?有没有办法使用 Pycuda 检索特定值的索引?
【发布时间】:2015-10-06 02:51:27
【问题描述】:

我有一个大小为 1000x1000x1000 的 3D numpy 数组。我正在寻找整个数组中值 1 的索引。对于像我这样的较大数据集, np.nonzero(array) 非常慢。我想知道是否有办法通过 pycuda 做到这一点。或者有没有其他更有效的方法。

【问题讨论】:

    标签: python numpy cuda pycuda


    【解决方案1】:

    我之前没有用过PyCuda,但是因为找到了good example on how to use thrust in PyCuda,所以想出了下面的解决方案。

    在内部，它使用 thrust::counting_iterator 和 thrust::copy_if 来查找等于 1 的元素的索引。

    虽然这可能更快,但您的整个问题存在严重缺陷: 您有一个包含 10 亿 (1000000000) 个元素的数组,使用 32 位整数时需要 4 GB 内存。 您还需要另一个 4 GB 的输出数组。 即使您的 GPU 有这么多 RAM,也需要将输入数据复制到 GPU,这需要一些时间。

    如果您的数组主要由零条目组成,您最好使用sparse matrix format 并且只存储非零条目。 这将节省内存,您根本不必搜索非零条目。

    find_indices_thrust.py

    # find_indices_thrust.py
    #
    # Build (at runtime, via codepy) a small CUDA extension that uses
    # thrust::copy_if over a counting iterator to collect the indices of
    # all elements equal to 1 in a GPU array, then call it from Python.
    import pycuda
    import pycuda.autoinit
    import pycuda.gpuarray as gpuarray
    import numpy as np

    from codepy.cgen import *
    from codepy.bpl import BoostPythonModule
    from codepy.cuda import CudaModule

    # Make a host module, compiled for CPU (Boost.Python glue).
    host_mod = BoostPythonModule()

    # Make a device module, compiled with NVCC; it is linked into host_mod.
    nvcc_mod = CudaModule(host_mod)

    # ---- Device module code ----
    # Headers the NVCC-compiled translation unit needs.
    nvcc_includes = [
        'thrust/copy.h',
        'thrust/device_vector.h',
        'thrust/iterator/counting_iterator.h',
        'thrust/functional.h',
        'cuda.h',
        'stdint.h',
        ]
    nvcc_mod.add_to_preamble([Include(x) for x in nvcc_includes])

    # find_indices(input_ptr, input_length, output_ptr, output_length):
    # stream-compacts the indices [0, input_length) for which
    # input[i] == 1 into output, and stores the count via output_length.
    # The stencil overload of thrust::copy_if reads the predicate input
    # (_1 == 1) from the data array while copying from the counting
    # iterator, so what lands in output are the *indices*, not the values.
    nvcc_function = FunctionBody(
        FunctionDeclaration(Value('void', 'find_indices'),
                            [Value('CUdeviceptr', 'input_ptr'),
                             Value('uint32_t', 'input_length'),
                             Value('CUdeviceptr', 'output_ptr'),
                             Value('uint32_t*', 'output_length')]),
        Block([Statement('thrust::device_ptr<uint32_t> thrust_input_ptr((uint32_t*)input_ptr)'),
               Statement('thrust::device_ptr<uint32_t> thrust_output_ptr((uint32_t*)output_ptr)'),
               Statement('using namespace thrust::placeholders'),
               Statement('*output_length = thrust::copy_if(thrust::counting_iterator<uint32_t>(0), thrust::counting_iterator<uint32_t>(input_length), thrust_input_ptr, thrust_output_ptr, _1==1)-thrust_output_ptr')]))

    # Registering the function on nvcc_mod also declares it in host_mod,
    # so the host entry point below can call it.
    nvcc_mod.add_function(nvcc_function)

    # ---- Host module code ----
    host_includes = [
        'boost/python/extract.hpp',
        ]
    host_mod.add_to_preamble([Include(x) for x in host_includes])

    host_namespaces = [
        'using namespace boost::python',
        ]
    host_mod.add_to_preamble([Statement(x) for x in host_namespaces])

    # host_entry(gpu_input_array, gpu_output_array) -> int
    # Unpacks length and raw device pointers from the PyCUDA GPUArray
    # objects, invokes the Thrust routine, and returns the number of
    # matching indices written to the output array.
    # NOTE(review): shape[0] assumes a 1-D GPUArray; a 3-D volume must be
    # flattened (ravel) before being handed to this entry point.
    host_statements = [
        #Extract information from PyCUDA GPUArray
        #Get length
        'tuple shape = extract<tuple>(gpu_input_array.attr("shape"))',
        'int input_length = extract<int>(shape[0])',
        #Get input data pointer
        'CUdeviceptr input_ptr = extract<CUdeviceptr>(gpu_input_array.attr("ptr"))',
        #Get output data pointer
        'CUdeviceptr output_ptr = extract<CUdeviceptr>(gpu_output_array.attr("ptr"))',
        #Call Thrust routine, compiled into the CudaModule
        'uint32_t output_size',
        'find_indices(input_ptr, input_length, output_ptr, &output_size)',
        #Return result
        'return output_size',
        ]

    host_mod.add_function(
        FunctionBody(
            FunctionDeclaration(Value('int', 'host_entry'),
                                [Value('object', 'gpu_input_array'), Value('object', 'gpu_output_array')]),
            Block([Statement(x) for x in host_statements])))

    # Print out generated code, to see what we're actually compiling.
    print("---------------------- Host code ----------------------")
    print(host_mod.generate())
    print("--------------------- Device code ---------------------")
    print(nvcc_mod.generate())
    print("-------------------------------------------------------")

    # ---- Compile both modules (gcc for host, nvcc for device) ----
    import codepy.jit, codepy.toolchain
    gcc_toolchain = codepy.toolchain.guess_toolchain()
    nvcc_toolchain = codepy.toolchain.guess_nvcc_toolchain()

    module = nvcc_mod.compile(gcc_toolchain, nvcc_toolchain, debug=True)

    # ---- Demo ----
    length = 100
    input_array = np.array(np.random.rand(length) * 5, dtype=np.uint32)

    print("---------------------- INPUT -----------------------")
    print(input_array)
    gpu_input_array = gpuarray.to_gpu(input_array)
    # Allocate the output directly on the device; copying a host array of
    # zeros up with to_gpu would be a pointless host->device transfer.
    gpu_output_array = gpuarray.zeros(length, dtype=np.uint32)

    # Call the compiled GPU function; it returns how many indices matched.
    output_size = module.host_entry(gpu_input_array, gpu_output_array)
    print("----------------------- OUTPUT ------------------------")
    # Fix: original used Python 2 print-statement syntax here
    # ("print gpu_output_array[...]"), a SyntaxError under Python 3 and
    # inconsistent with every other print() call in this script.
    print(gpu_output_array[:output_size])
    print("-------------------------------------------------------")
    

    生成的代码

    ---------------------- Host code ----------------------
    #include <boost/python.hpp>
    #include <cuda.h>
    void find_indices(CUdeviceptr input_ptr, uint32_t input_length, CUdeviceptr output_ptr, uint32_t* output_length);
    #include <boost/python/extract.hpp>
    using namespace boost::python;
    
    namespace private_namespace_6f5e74fc4bebe20d5478de66e2226656
    {
      int host_entry(object gpu_input_array, object gpu_output_array)
      {
        tuple shape = extract<tuple>(gpu_input_array.attr("shape"));
        int input_length = extract<int>(shape[0]);
        CUdeviceptr input_ptr = extract<CUdeviceptr>(gpu_input_array.attr("ptr"));
        CUdeviceptr output_ptr = extract<CUdeviceptr>(gpu_output_array.attr("ptr"));
        uint32_t output_size;
        find_indices(input_ptr, input_length, output_ptr, &output_size);
        return output_size;
      }
    }
    
    using namespace private_namespace_6f5e74fc4bebe20d5478de66e2226656;
    
    BOOST_PYTHON_MODULE(module)
    {
      boost::python::def("host_entry", &host_entry);
    }
    --------------------- Device code ---------------------
    #include <thrust/copy.h>
    #include <thrust/device_vector.h>
    #include <thrust/iterator/counting_iterator.h>
    #include <thrust/functional.h>
    #include <cuda.h>
    #include <stdint.h>
    
    void find_indices(CUdeviceptr input_ptr, uint32_t input_length, CUdeviceptr output_ptr, uint32_t* output_length)
    {
      thrust::device_ptr<uint32_t> thrust_input_ptr((uint32_t*)input_ptr);
      thrust::device_ptr<uint32_t> thrust_output_ptr((uint32_t*)output_ptr);
      using namespace thrust::placeholders;
      *output_length = thrust::copy_if(thrust::counting_iterator<uint32_t>(0), thrust::counting_iterator<uint32_t>(input_length), thrust_input_ptr, thrust_output_ptr, _1==1)-thrust_output_ptr;
    }
    -------------------------------------------------------
    

    演示输出

    ---------------------- INPUT -----------------------
    [1 2 3 0 3 3 1 2 1 2 0 4 4 3 2 0 4 2 3 0 2 3 1 4 3 4 3 4 3 2 4 3 2 4 2 0 3
    0 3 4 3 0 0 4 4 2 0 3 3 1 3 4 2 0 0 4 0 4 3 2 3 2 1 1 4 3 0 4 3 1 1 1 3 2
    0 0 3 4 3 3 4 2 2 3 4 1 1 3 2 2 2 2 3 2 0 2 4 3 2 0]
    ----------------------- OUTPUT ------------------------
    [ 0  6  8 22 49 62 63 69 70 71 85 86]
    -------------------------------------------------------
    

    【讨论】:

      猜你喜欢
      • 2021-08-11
      • 2016-09-29
      • 2019-08-19
      • 1970-01-01
      • 2022-08-19
      • 2020-08-15
      • 1970-01-01
      • 2021-12-09
      • 2021-04-30
      相关资源
      最近更新 更多