我可能会帮助您处理内存,即在您的迭代过程中不必从 CPU 复制到 GPU。我正在使用 euler timestepping 随着时间的推移发展一个系统,我将所有数据保存在我的 GPU 上的方式如下所示。
但是,这样做的问题是,一旦启动了第一个内核,cpu 就会继续执行它之后的行。 IE。边界内核在时间演化步骤之前启动。
我需要的是一种同步事物的方法。我尝试过使用 strm.synchronize() (请参阅我的代码),但它并不总是有效。如果您对此有任何想法,我将非常感谢您的意见!谢谢!
def curveShorten(dist,timestep,maxit):
"""
iterates the function image on a 2d grid through an euler anisotropic
diffusion operator with timestep=timestep maxit number of times
"""
image = 1*dist
forme = image.shape
if(np.size(forme)>2):
sys.exit('Only works on gray images')
aSize = forme[0]*forme[1]
xdim = np.int32(forme[0])
ydim = np.int32(forme[1])
image[0,:] = image[1,:]
image[xdim-1,:] = image[xdim-2,:]
image[:,ydim-1] = image[:,ydim-2]
image[:,0] = image[:,1]
#np arrays i need to store things on the CPU, image is the initial
#condition and final is the final state
image = image.reshape(aSize,order= 'C').astype(np.float32)
final = np.zeros(aSize).astype(np.float32)
#allocating memory to GPUs
image_gpu = drv.mem_alloc(image.nbytes)
final_gpu = drv.mem_alloc(final.nbytes)
#sending data to each memory location
drv.memcpy_htod(image_gpu,image) #host to device copying
drv.memcpy_htod(final_gpu,final)
#block size: B := dim1*dim2*dim3=1024
#gird size : dim1*dimr2*dim3 = ceiling(aSize/B)
blockX = int(1024)
multiplier = aSize/float(1024)
if(aSize/float(1024) > int(aSize/float(1024)) ):
gridX = int(multiplier + 1)
else:
gridX = int(multiplier)
strm1 = drv.Stream(1)
ev1 = drv.Event()
strm2 = drv.Stream()
for k in range(0,maxit):
Kern_diffIteration(image_gpu,final_gpu,ydim, xdim, np.float32(timestep), block=(blockX,1,1), grid=(gridX,1,1),stream=strm1)
strm1.synchronize()
if(strm1.is_done()==1):
Kern_boundaryX0(final_gpu,ydim,xdim,block=(blockX,1,1), grid=(gridX,1,1))
Kern_boundaryX1(final_gpu,ydim,xdim,block=(blockX,1,1), grid=(gridX,1,1))#,stream=strm1)
Kern_boundaryY0(final_gpu,ydim,xdim,block=(blockX,1,1), grid=(gridX,1,1))#,stream=strm2)
Kern_boundaryY1(final_gpu,ydim,xdim,block=(blockX,1,1), grid=(gridX,1,1))#,stream=strm1)
if(strm1.is_done()==1):
drv.memcpy_dtod(image_gpu, final_gpu, final.nbytes)
#Kern_copy(image_gpu,final_gpu,ydim,xdim,block=(blockX,1,1), grid=(gridX,1,1),stream=strm1)
drv.memcpy_dtoh(final,final_gpu) #device to host copying
#final_gpu.free()
#image_gpu.free()
return final.reshape(forme,order='C')