【问题标题】:PyCUDA using struct properly(在 PyCUDA 中正确使用结构体)
【发布时间】:2015-03-21 23:14:57
【问题描述】:

我正在尝试在我的 PyCUDA 代码中使用结构体(struct),但遇到了越界(out of range)错误。我尝试按照这篇教程(this)来做,但无法让它适用于我的情况。

问题很可能出在指针使用不当上,例如该教程指出,分配显存时必须按指针的大小(memsize)而不是数据的大小来分配。希望这里有人能给我一些提示...

示例代码:

#!/usr/bin/env python
#-*- coding:utf-8 -*-

import numpy as np
import pycuda.driver as cuda
import pycuda.tools as tools
import pycuda.autoinit

from mako.template import Template
from pycuda.compiler import SourceModule

# Mako template for the CUDA source; rendered with the context built in
# channelFlow.__init__ (keys: nx, ny, ns, w, ex, ey).
#
# * ``Dist`` gets one ``float *dist<s>`` member per direction s in [0, ns).
# * ``initialize`` walks y along the launch's x dimension and x along its y
#   dimension, stepping by blockDim * gridDim, and only writes cells with
#   0 < x < nx-1 and 0 < y < ny-1.
#
# The loops use ``range`` (not ``xrange``) so the template also renders under
# Python 3, and the backslash in the ``\\vec`` comment is escaped because a
# bare ``\v`` in a non-raw Python string is the vertical-tab escape.
src_template = Template(
"""
    struct Dist {
        %for s in range(ns):
        float *dist${s};
        %endfor
    };

    // return linear index based on x,y coordinate
    __device__ int get_index(int xcoord, int ycoord)
    {
        return ycoord + xcoord * ${ny};
    };

    __global__ void initialize(float *rho, float *ux, float *uy, Dist *ftmp)
    {
        int idx;
        float dens, velx, vely, vv, ev;

        for (int y = threadIdx.x + blockIdx.x * blockDim.x; 
                 y < ${ny}; 
                 y += blockDim.x * gridDim.x)   
        {   
            for (int x = threadIdx.y + blockIdx.y * blockDim.y; 
                 x < ${nx}; 
                 x += blockDim.y * gridDim.y) 
            {
                if ((x > 0) && (x < ${nx-1}) && (y > 0) && (y < ${ny-1}))
                {
                    idx = get_index(x,y);
                    dens = rho[idx]; velx = ux[idx]; vely = uy[idx];
                    vv = velx*velx + vely*vely;

                    %for s in range(ns):
                    // s = ${s}; \\vec{e}[${s}] = [${ex[s]},${ey[s]}]
                    ev = ${float(ex[s])}f*velx + ${float(ey[s])}f*vely;
                    ftmp->dist${s}[idx] = ${w[s]}f*dens*(1.0f+3.0f*ev+4.5f*ev*ev-1.5f*vv);
                    %endfor
                }
            }
        }
    }
"""
)

class channelFlow:
    """Host-side driver that prepares buffers and launches ``initialize``.

    ``__init__`` stores the grid size, the launch configuration, the
    nine-direction constants (``w``, ``ex``, ``ey``) and the template
    context, then allocates the host arrays and the device buffers.
    ``run`` renders and compiles the kernel source, uploads ``rho``, ``ux``
    and ``uy`` and launches the kernel once.
    """

    @staticmethod
    def _launch_dims(nx, ny):
        """Return ``(blocksize, gridsize)`` for an nx-by-ny grid.

        The launch's x dimension follows ``ny`` and its y dimension follows
        ``nx``, matching how the kernel derives y from ``threadIdx.x`` and
        x from ``threadIdx.y``. Each block dimension is capped at 32.
        """
        block = (min(ny, 32), min(nx, 32), 1)   # threads per block
        # Floor division keeps the counts integral on Python 2 and 3 alike;
        # the kernel's strided loops pick up any remainder.
        grid = (ny // block[0], nx // block[1], 1)  # blocks per grid
        return block, grid

    def __init__(self, nx, ny):
        """Allocate host and device storage for an nx-by-ny grid.

        Args:
            nx: number of cells along x.
            ny: number of cells along y.
        """
        self.nx, self.ny = nx, ny
        self.blocksize, self.gridsize = self._launch_dims(nx, ny)

        # Direction weights and integer direction components; these appear
        # to be the usual D2Q9 lattice-Boltzmann set.
        self.ns = 9
        self.w = np.array([4./9, 1./9, 1./9, 1./9, 1./9, 1./36, 1./36, 1./36, 1./36])
        self.ex = np.array([0, 1, -1, 0, 0, 1, -1, -1, 1])
        self.ey = np.array([0, 0, 0, 1, -1, 1, 1, -1, -1])

        # Values substituted into the kernel template at render time.
        self.ctx = {
            'nx': self.nx, 'ny': self.ny, 'ns': self.ns,
            'w': self.w, 'ex': self.ex, 'ey': self.ey,
        }

        dtype = np.float32
        self.ftmp = np.zeros((self.nx, self.ny, self.ns), dtype=dtype)
        self.rho = np.zeros((self.nx, self.ny), dtype=dtype)
        self.ux = np.zeros((self.nx, self.ny), dtype=dtype)
        self.uy = np.zeros((self.nx, self.ny), dtype=dtype)

        # NOTE(review): the kernel declares its fourth argument as ``Dist *``,
        # a struct of ``float *`` members, but this is one flat buffer sized
        # for the float data and it is never filled with device pointers.
        # Presumably this is the cause of the out-of-range error discussed in
        # the question; it is left as posted.
        self.ftmp_gpu = cuda.mem_alloc(self.ftmp.nbytes)
        self.rho_gpu = cuda.mem_alloc(self.rho.nbytes)
        self.ux_gpu = cuda.mem_alloc(self.ux.nbytes)
        self.uy_gpu = cuda.mem_alloc(self.uy.nbytes)

    def run(self):
        """Render and compile the kernel, upload the fields and launch it."""
        src = src_template.render(**self.ctx)
        module = SourceModule(src)
        initialize = module.get_function('initialize')

        # Initial state: density 1 everywhere, zero velocity.
        self.rho[:, :] = 1.
        self.ux[:, :] = 0.
        self.uy[:, :] = 0.

        for device_buf, host_arr in ((self.rho_gpu, self.rho),
                                     (self.ux_gpu, self.ux),
                                     (self.uy_gpu, self.uy)):
            cuda.memcpy_htod(device_buf, host_arr)

        initialize(
            self.rho_gpu, self.ux_gpu, self.uy_gpu,
            self.ftmp_gpu,
            block=self.blocksize, grid=self.gridsize
            )

if __name__ == "__main__":
    # Script entry point: build a 64x64 case and launch the kernel once.
    sim = channelFlow(64, 64)
    sim.run()

【问题讨论】:

    标签: python c cuda indexoutofboundsexception pycuda


    【解决方案1】:

    借助 GPUStruct 这个 Python 模块(可在此处获取:here),并修正对指针的不当使用,我得以在 PyCUDA 中正确实现“由数组构成的结构体”。例如,将:

    ftmp->dist${s}[idx] = ${w[s]}f*dens*(1.0f+3.0f*ev+4.5f*ev*ev-1.5f*vv);
    

    改为:

    float *ftmp${s}_ptr = ftmp->dist${s};
    ftmp${s}_ptr[idx] = ${w[s]}f*dens*(1.0f+3.0f*ev+4.5f*ev*ev-1.5f*vv);
    

    下面是修改后的完整代码,展示了 GPUStruct 的具体用法:

    #!/usr/bin/env python
    #-*- coding:utf-8 -*-
    
    import numpy as np
    import pycuda.driver as cuda
    import pycuda.tools as tools
    import pycuda.autoinit
    
    from gpu_struct import GPUStruct
    from mako.template import Template
    from pycuda.compiler import SourceModule
    
    # Mako template for the CUDA source; rendered with the context built in
    # channelFlow.__init__ (keys: nx, ny, ns, w, ex, ey).
    #
    # * ``Dist`` gets one ``float *dist<s>`` member per direction s in [0, ns).
    # * ``initialize`` walks y along the launch's x dimension and x along its
    #   y dimension, stepping by blockDim * gridDim, and only writes cells
    #   with 0 < x < nx-1 and 0 < y < ny-1. Each struct member is first
    #   copied into a local ``float *`` and then indexed.
    #
    # The loops use ``range`` (not ``xrange``) so the template also renders
    # under Python 3, and the backslash in the ``\\vec`` comment is escaped
    # because a bare ``\v`` in a non-raw Python string is the vertical-tab
    # escape.
    src_template = Template(
    """
        struct Dist {
            %for s in range(ns):
            float *dist${s};
            %endfor
        };

        // return linear index based on x,y coordinate
        __device__ int get_index(int xcoord, int ycoord)
        {
            return ycoord + xcoord * ${ny};
        };

        __global__ void initialize(float *rho, float *ux, float *uy, Dist *ftmp)
        {
            int idx;
            float dens, velx, vely, vv, ev;

            for (int y = threadIdx.x + blockIdx.x * blockDim.x; 
                     y < ${ny}; 
                     y += blockDim.x * gridDim.x)   
            {   
                for (int x = threadIdx.y + blockIdx.y * blockDim.y; 
                     x < ${nx}; 
                     x += blockDim.y * gridDim.y) 
                {
                    if ((x > 0) && (x < ${nx-1}) && (y > 0) && (y < ${ny-1}))
                    {
                        idx = get_index(x,y);
                        dens = rho[idx]; velx = ux[idx]; vely = uy[idx];
                        vv = velx*velx + vely*vely;

                        %for s in range(ns):
                        // s = ${s}; \\vec{e}[${s}] = [${ex[s]},${ey[s]}]
                        float *ftmp${s}_ptr1 = ftmp->dist${s};
                        ev = ${float(ex[s])}f*velx + ${float(ey[s])}f*vely;
                        ftmp${s}_ptr1[idx] = ${w[s]}f*dens*(1.0f+3.0f*ev+4.5f*ev*ev-1.5f*vv);
                        %endfor
                    }
                }
            }
        }
    """
    )
    
    class channelFlow:
        """Host-side driver that prepares buffers and launches ``initialize``.

        ``__init__`` stores the grid size, the launch configuration, the
        nine-direction constants (``w``, ``ex``, ``ey``) and the template
        context, then allocates the host arrays, a ``GPUStruct`` describing
        the kernel's ``Dist`` argument, and the remaining device buffers.
        ``run`` renders and compiles the kernel source, uploads the data,
        launches the kernel once, then reads ``rho`` back and prints it.
        """

        @staticmethod
        def _launch_dims(nx, ny):
            """Return ``(blocksize, gridsize)`` for an nx-by-ny grid.

            The launch's x dimension follows ``ny`` and its y dimension
            follows ``nx``, matching how the kernel derives y from
            ``threadIdx.x`` and x from ``threadIdx.y``. Each block dimension
            is capped at 32.
            """
            block = (min(ny, 32), min(nx, 32), 1)   # threads per block
            # Floor division keeps the counts integral on Python 2 and 3
            # alike; the kernel's strided loops pick up any remainder.
            grid = (ny // block[0], nx // block[1], 1)  # blocks per grid
            return block, grid

        def __init__(self, nx, ny):
            """Allocate host and device storage for an nx-by-ny grid.

            Args:
                nx: number of cells along x.
                ny: number of cells along y.
            """
            self.nx, self.ny = nx, ny
            self.blocksize, self.gridsize = self._launch_dims(nx, ny)

            # Direction weights and integer direction components; these
            # appear to be the usual D2Q9 lattice-Boltzmann set.
            self.ns = 9
            self.w = np.array([4./9, 1./9, 1./9, 1./9, 1./9, 1./36, 1./36, 1./36, 1./36])
            self.ex = np.array([0, 1, -1, 0, 0, 1, -1, -1, 1])
            self.ey = np.array([0, 0, 0, 1, -1, 1, 1, -1, -1])

            # Values substituted into the kernel template at render time.
            self.ctx = {
                'nx': self.nx, 'ny': self.ny, 'ns': self.ns,
                'w': self.w, 'ex': self.ex, 'ey': self.ey,
            }

            dtype = np.float32
            self.ftmp = np.zeros((self.nx, self.ny, self.ns), dtype=dtype)
            self.rho = np.zeros((self.nx, self.ny), dtype=dtype)
            self.ux = np.zeros((self.nx, self.ny), dtype=dtype)
            self.uy = np.zeros((self.nx, self.ny), dtype=dtype)

            # One (dtype, name, value) entry per direction, generated in the
            # same order as the template's ``dist<s>`` members so the two
            # layouts stay in step with ``ns``.
            # NOTE(review): the leading '*' presumably marks a pointer
            # member, and each value is a strided view of ``ftmp`` -- confirm
            # how GPUStruct handles both.
            self.ftmp_gpu = GPUStruct([
                (np.float32, '*dist%d' % s, self.ftmp[:, :, s])
                for s in range(self.ns)
            ])
            self.rho_gpu = cuda.mem_alloc(self.rho.nbytes)
            self.ux_gpu = cuda.mem_alloc(self.ux.nbytes)
            self.uy_gpu = cuda.mem_alloc(self.uy.nbytes)

        def run(self):
            """Compile the kernel, upload the inputs, launch it, print ``rho``."""
            src = src_template.render(**self.ctx)
            module = SourceModule(src)
            initialize = module.get_function('initialize')

            # Initial state: density 1 everywhere, zero velocity.
            self.rho[:, :] = 1.
            self.ux[:, :] = 0.
            self.uy[:, :] = 0.

            self.ftmp_gpu.copy_to_gpu()
            for device_buf, host_arr in ((self.rho_gpu, self.rho),
                                         (self.ux_gpu, self.ux),
                                         (self.uy_gpu, self.uy)):
                cuda.memcpy_htod(device_buf, host_arr)

            initialize(
                self.rho_gpu, self.ux_gpu, self.uy_gpu,
                self.ftmp_gpu.get_ptr(),
                block=self.blocksize, grid=self.gridsize
                )

            # Read the density buffer back from the device and show it.
            self.dens = np.zeros_like(self.rho)
            cuda.memcpy_dtoh(self.dens, self.rho_gpu)
            # Parenthesised form prints the same on Python 2 and Python 3.
            print(self.dens)
    
    if __name__ == "__main__":
        # Script entry point: build a 64x64 case and launch the kernel once.
        sim = channelFlow(64, 64)
        sim.run()
    

    【讨论】:

      猜你喜欢
      • 2023-03-29
      • 2015-05-28
      • 2017-04-15
      • 1970-01-01
      • 1970-01-01
      • 2013-05-21
      • 1970-01-01
      • 1970-01-01
      • 1970-01-01
      相关资源
      最近更新 更多