【发布时间】:2018-01-12 21:46:30
【问题描述】:
不知何故,当我在下面的代码中修改 d_updated_water_flow_map 时,d_terrain_height_map 也被修改了 / 而不是。
更改两个数组的分配顺序可以解决问题,但我认为这只是掩盖了问题的根本原因。
cudaCheck(cudaMalloc((void **)&d_water_flow_map, SIZE * 4));
cudaCheck(cudaMalloc((void **)&d_updated_water_flow_map, SIZE * 4)); // changing this array also changes d_terrain_height_map
cudaCheck(cudaMalloc((void **)&d_terrain_height_map, SIZE));
我正在将内核编译成一个 DLL,并从 Blender 3D python 解释器中的 python 文件下面调用它。所有值都是 32 位浮点数。
cu_include.h
#pragma once
#ifdef MATHLIBRARY_EXPORTS
#define MATHLIBRARY_API __declspec(dllexport)
#else
#define MATHLIBRARY_API __declspec(dllimport)
#endif
extern "C" __declspec(dllexport)
void init(float *t_height_map,
float *w_height_map,
float *s_height_map,
int SIZE_X,
int SIZE_Y);
extern "C" __declspec(dllexport)
void run_hydro_erosion(int cycles,
float t_step,
float min_tilt_angle,
float SEDIMENT_CAP,
float DISSOLVE_CONST,
float DEPOSIT_CONST,
int SIZE_X,
int SIZE_Y,
float PIPE_LENGTH,
float ADJACENT_LENGTH,
float TIME_STEP,
float MIN_TILT_ANGLE);
extern "C" __declspec(dllexport)
void free_mem();
extern "C" __declspec(dllexport)
void procedural_rain(float *water_height_map, float *rain_map, int SIZE_X, int SIZE_Y);
erosion_kernel.dll
#include "cu_include.h"
// includes, system
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>
#include <time.h>
#include <iostream>
#include <algorithm>
#include <random>
// includes CUDA
#include <cuda_runtime.h>
using namespace std;
#define FLOW_RIGHT 0
#define FLOW_UP 1
#define FLOW_LEFT 2
#define FLOW_DOWN 3
#define X_VEL 0
#define Y_VEL 1
#define LEFT_CELL row, col - 1
#define RIGHT_CELL row, col + 1
#define ABOVE_CELL row - 1, col
#define BELOW_CELL row + 1, col
// CUDA API error checking macro
#define T 1024
#define M 1536
#define blockSize 1024
#define cudaCheck(error) \
if (error != cudaSuccess) { \
printf("Fatal error: %s at %s:%d\n", \
cudaGetErrorString(error), \
__FILE__, __LINE__); \
exit(1); \
}
__global__ void update_water_flow(float *water_height_map, float *water_flow_map, float *d_updated_water_flow_map, int SIZE_X, int SIZE_Y)
{
int index = blockIdx.x * blockDim.x + threadIdx.x;
int col = index % SIZE_X;
int row = index / SIZE_X;
index = row * (SIZE_X * 4) + col * 4; // 3D index
d_updated_water_flow_map[index + FLOW_RIGHT] = 0;
d_updated_water_flow_map[index + FLOW_UP] = 0;
d_updated_water_flow_map[index + FLOW_LEFT] = 0;
d_updated_water_flow_map[index + FLOW_DOWN] = 0;
}
static float *terrain_height_map;
static float *water_height_map;
static float *sediment_height_map;
void init(float *t_height_map,
float *w_height_map,
float *s_height_map,
int SIZE_X,
int SIZE_Y)
{
/* set vars HOST*/
terrain_height_map = t_height_map;
water_height_map = w_height_map;
sediment_height_map = s_height_map;
}
void run_hydro_erosion(int cycles,
float t_step,
float min_tilt_angle,
float SEDIMENT_CAP,
float DISSOLVE_CONST,
float DEPOSIT_CONST,
int SIZE_X,
int SIZE_Y,
float PIPE_LENGTH,
float ADJACENT_LENGTH,
float TIME_STEP,
float MIN_TILT_ANGLE)
{
int numBlocks = (SIZE_X * SIZE_Y + (blockSize - 1)) / blockSize;
int SIZE = SIZE_X * SIZE_Y * sizeof(float);
float *d_terrain_height_map, *d_updated_terrain_height_map;
float *d_water_height_map, *d_updated_water_height_map;
float *d_sediment_height_map, *d_updated_sediment_height_map;
float *d_suspended_sediment_level;
float *d_updated_suspended_sediment_level;
float *d_water_flow_map;
float *d_updated_water_flow_map;
float *d_prev_water_height_map;
float *d_water_velocity_vec;
float *d_rain_map;
cudaCheck(cudaMalloc(&d_water_height_map, SIZE));
cudaCheck(cudaMalloc(&d_updated_water_height_map, SIZE));
cudaCheck(cudaMalloc(&d_prev_water_height_map, SIZE));
cudaCheck(cudaMalloc(&d_water_flow_map, SIZE * 4));
cudaCheck(cudaMalloc(&d_updated_water_flow_map, SIZE * 4)); // changing this array also changes d_terrain_height_map
cudaCheck(cudaMalloc(&d_terrain_height_map, SIZE));
cudaCheck(cudaMalloc(&d_updated_terrain_height_map, SIZE));
cudaCheck(cudaMalloc(&d_sediment_height_map, SIZE));
cudaCheck(cudaMalloc(&d_updated_sediment_height_map, SIZE));
cudaCheck(cudaMalloc(&d_suspended_sediment_level, SIZE));
cudaCheck(cudaMalloc(&d_updated_suspended_sediment_level, SIZE));
cudaCheck(cudaMalloc(&d_rain_map, SIZE));
cudaCheck(cudaMalloc(&d_water_velocity_vec, SIZE * 2));
cudaCheck(cudaMemcpy(d_terrain_height_map, terrain_height_map, SIZE, cudaMemcpyHostToDevice));
cudaCheck(cudaMemcpy(d_water_height_map, water_height_map, SIZE, cudaMemcpyHostToDevice));
cudaCheck(cudaMemcpy(d_sediment_height_map, sediment_height_map, SIZE, cudaMemcpyHostToDevice));
cout << "init terrain_height_map" << endl;
for (int i = 0; i < SIZE_X * SIZE_Y; i++) {
cout << terrain_height_map[i] << ", ";
if (i % SIZE_X == 0 && i != 0) cout << endl;
}
/* launch the kernel on the GPU */
float *temp;
while (cycles--) {
update_water_flow << < numBlocks, blockSize >> >(d_water_height_map, d_water_flow_map, d_updated_water_flow_map, SIZE_X, SIZE_Y);
temp = d_water_flow_map;
d_water_flow_map = d_updated_water_flow_map;
d_updated_water_flow_map = temp;
}
cudaCheck(cudaMemcpy(terrain_height_map, d_terrain_height_map, SIZE, cudaMemcpyDeviceToHost));
cout << "updated terrain" << endl;
for (int i = 0; i < SIZE_X * SIZE_Y; i++) {
cout << terrain_height_map[i] << ", ";
if (i % SIZE_X == 0 && i != 0) cout << endl;
}
}
Python 文件
import bpy
import numpy
import ctypes
import random
width = 4
height = 4
size_x = width
size_y = height
N = size_x * size_y
scrpt_cycles = 1
kernel_cycles = 1
time_step = 0.005
pipe_length = 1.0
adjacent_length = 1.0
min_tilt_angle = 10
sediment_cap = 0.01
dissolve_const = 0.01
deposit_const = 0.01
# initialize arrays
ter_height_map = numpy.ones((N), dtype=numpy.float32)
water_height_map = numpy.zeros((N), dtype=numpy.float32)
sed_height_map = numpy.zeros((N), dtype=numpy.float32)
rain_map = numpy.ones((N), dtype=numpy.float32)
# load terrain height from image
for i in range(0, len(ter_height_map)):
ter_height_map[i] = 1
# import DLL
E = ctypes.cdll.LoadLibrary("E:/Programming/CUDA/erosion/Release/erosion_kernel.dll")
# initialize device memory
E.init( ctypes.c_void_p(ter_height_map.ctypes.data),
ctypes.c_void_p(water_height_map.ctypes.data),
ctypes.c_void_p(sed_height_map.ctypes.data),
ctypes.c_int(size_x),
ctypes.c_int(size_y))
# run erosion
while(scrpt_cycles):
scrpt_cycles = scrpt_cycles - 1
E.run_hydro_erosion(ctypes.c_int(kernel_cycles),
ctypes.c_float(time_step),
ctypes.c_float(min_tilt_angle),
ctypes.c_float(sediment_cap),
ctypes.c_float(dissolve_const),
ctypes.c_float(deposit_const),
ctypes.c_int(size_x),
ctypes.c_int(size_y),
ctypes.c_float(pipe_length),
ctypes.c_float(adjacent_length),
ctypes.c_float(time_step),
ctypes.c_float(min_tilt_angle))
错误的输出:
预期输出(在我注释掉 update_water_flow 之后):
//update_water_flow << < numBlocks, blockSize >> >(d_water_height_map, d_water_flow_map, d_updated_water_flow_map, SIZE_X, SIZE_Y);
显卡:GTX460M
【问题讨论】:
-
如果没有minimal reproducible example,这个问题就是浪费大家的时间,包括你的。
-
@talonmies 我添加了一个有效的最小示例!对此感到抱歉..
-
这在 CUDA 中是非法的:
__device__ float *d_updated_water_flow_map; ... cudaCheck(cudaMalloc(&d_updated_water_flow_map, SIZE * 4));并且您现在发布的代码版本与之前发布的版本(d_updated_water_flow_map作为普通主机)之间存在显着差异堆栈变量)。您不能在主机代码中获取__device__变量的地址(即使在cudaMalloc的调用/参数中),这不是创建__device__指针变量的正确方法。 -
@RobertCrovella 我将代码改回原始设计。我正在尝试不同的东西,却忘记将变量改回原来的样子。我测试了,问题仍然存在。修改d_updated_water_flow_map还是修改d_terrain_height_map
-
使用
cuda-memcheck在搅拌机之外运行您的代码。我可以通过从你的 python 脚本中删除import bpy然后运行 cuda-memcheck python test.py来做到这一点。当我这样做时,cuda-memcheck会在你的内核中报告无效的内存访问错误。