【发布时间】:2018-11-19 13:18:54
【问题描述】:
我编写了一个基于体素化的光线追踪器,它按预期工作,但速度很慢。
目前光线追踪器代码如下:
#version 430
// Normalized position of this fragment, ranging from (-1, -1) to (1, 1).
in vec2 f_coord;
// Color written for this fragment.
out vec4 fragment_color;
// Element type of the voxel buffer (the leaves of the tree).
// NOTE(review): none of these fields are read by the shader code shown here;
// the exact meaning of each vec4 (e.g. what the w components hold) should be
// confirmed against the code that fills the buffer.
struct Voxel
{
vec4 position;
vec4 normal;
vec4 color;
};
// One interior node of the octree.
struct Node
{
// Indices of the 8 children of this node. main() treats a value <= 0 as an
// empty child and skips it. The child slot order matches the corner table
// built in main(): slot = 4*x + 2*y + z, where x, y, z are 0 or 1.
int children[8];
};
// Shader storage block at binding 0 holding the voxel array.
layout(std430, binding = 0) buffer voxel_buffer
{
// Last layer of the tree, the leaves.
Voxel voxels[];
};
// Shader storage block at binding 1 holding a single unsigned counter.
// NOTE(review): `index` is not used by the shader code shown here; presumably
// it is the number of entries written to `voxels` - confirm against the
// voxelization pass.
layout(std430, binding = 1) buffer buffer_index
{
uint index;
};
// Shader storage block at binding 2 holding the octree nodes.
// main() starts its traversal at tree[0].
layout(std430, binding = 2) buffer tree_buffer
{
// Tree structure: every entry stores the indices of its 8 children.
Node tree[];
};
// Shader storage block at binding 3 holding a single unsigned counter.
// NOTE(review): `t_index` is not used by the shader code shown here;
// presumably it is the number of nodes allocated in `tree` - confirm against
// the tree-building pass.
layout(std430, binding = 3) buffer tree_index
{
uint t_index;
};
uniform vec3 camera_pos; // Position of the camera.
uniform float aspect_ratio; // Aspect ratio of the window; main() divides the ray's y component by it.
// Dimension of the voxelization cube. main() places the root corner at
// vec3(-cube_dim) and uses cube_dim as the size of the root's children, so this
// behaves as the half side length of the cube.
uniform float cube_dim;
uniform int voxel_resolution; // Side length of the cube in voxels; main() derives the tree height from its log2.
// Tolerance added on every side of a box by inBoxBounds().
#define EPSILON 0.01
// Tells whether `position` lies inside the axis-aligned cube whose minimum
// corner is `corner` and whose side length is `size`.
// The cube is grown by EPSILON on every side and both comparisons are strict,
// so a point is accepted when each of its coordinates, measured from `corner`,
// lies in the open interval (-EPSILON, size + EPSILON).
bool inBoxBounds(vec3 corner, float size, vec3 position)
{
    // Express the point in the cube's own coordinate system.
    vec3 local = position - corner;

    // Component-wise tests against the lower and upper faces.
    bvec3 aboveMin = greaterThan(local, vec3(-EPSILON));
    bvec3 belowMax = lessThan(local, vec3(size + EPSILON));

    // The point is inside only if no coordinate falls outside.
    return all(aboveMin) && all(belowMax);
}
// Distance along a ray to an axis-aligned cube, or infinity if the cube is
// not hit.
//   origin  - start point of the ray
//   dir     - direction of the ray (normalized here, need not be unit length)
//   corner0 - minimum corner of the cube
//   size    - side length of the cube
// The ray is intersected with the six planes bounding the cube; the result is
// the smallest non-negative plane distance whose hit point passes
// inBoxBounds() for this cube.
float boxIntersection(vec3 origin, vec3 dir, vec3 corner0, float size)
{
    dir = normalize(dir);

    // Opposite corner of the cube.
    vec3 corner1 = corner0 + vec3(size);

    // Ray parameters at which the ray crosses the three planes through each
    // corner (x, y, z order for corner0, then for corner1).
    vec3 atCorner0 = (corner0 - origin) / dir;
    vec3 atCorner1 = (corner1 - origin) / dir;
    float planeT[6] = float[6](atCorner0.x, atCorner0.y, atCorner0.z,
                               atCorner1.x, atCorner1.y, atCorner1.z);

    // By default the cube is infinitely far away, i.e. not hit.
    float nearest = 1.f/0.f;
    for (int k = 0; k < 6; ++k)
    {
        float candidate = planeT[k];

        // A plane behind the ray origin cannot be reached; treat it as
        // infinitely far away.
        if (candidate < 0.0)
        {
            candidate = 1.f/0.f;
        }

        // Keep the candidate only if the point where the ray meets the plane
        // actually lies on the cube.
        if (inBoxBounds(corner0, size, origin + dir*candidate))
        {
            nearest = min(candidate, nearest);
        }
    }
    return nearest;
}
// Explicit traversal stack used by main(), stored as three parallel arrays.
// Entry k describes one pending node: nodes[k] is its index in `tree`,
// levels[k] its depth, and positions[k] the minimum corner of its cube.
// Capacity of the stack, in entries.
#define MAX_TREE_HEIGHT 11
int nodes[MAX_TREE_HEIGHT];
int levels[MAX_TREE_HEIGHT];
vec3 positions[MAX_TREE_HEIGHT];
// Number of entries currently on the stack (index of the next free slot).
int sp=0;
// Pushes one pending node onto the traversal stack.
//   node   - index of the node in `tree`
//   level  - depth of the node in the tree
//   corner - minimum corner of the node's cube
// The stack holds MAX_TREE_HEIGHT entries. main() may push several children
// for every node it pops, so the stack can fill up on deep trees; when it is
// full the entry is dropped instead of being written past the end of the
// arrays (an out-of-bounds write is undefined behaviour in GLSL).
void push(int node, int level, vec3 corner)
{
    if (sp >= MAX_TREE_HEIGHT)
    {
        return;
    }
    nodes[sp] = node;
    levels[sp] = level;
    positions[sp] = corner;
    sp++;
}
// Fragment entry point: walks the octree along the ray through this fragment
// and writes the number of traversal iterations as a grey level
// (count / 100) to fragment_color.
//
// The traversal is depth-first with an explicit stack. Each iteration pops a
// node, intersects the ray with the cubes of its non-empty children, sorts
// the children by hit distance and pushes the ones that are hit, farthest
// first, so that the nearest child is popped next. It stops when the stack is
// empty or when a node on the level just above the leaves has been expanded.
void main()
{
    int count = 0; // number of iterations of the algorithm

    // z component of the ray direction for the fixed 40 degree angle.
    float focal = 1.f/tan(radians(40.0));

    vec3 r = vec3(f_coord.x, f_coord.y, focal); // direction of the ray
    r.y /= aspect_ratio; // correct the direction for the window's aspect ratio
    vec3 dir = r;
    r += vec3(0,0,-focal) + camera_pos; // ray origin, relative to the camera position

    fragment_color = vec4(0);

    // Height of the tree. findMSB gives the exact floor(log2(n)) for a
    // positive integer; int(log2(float)) can come out one too low if the
    // floating-point logarithm is slightly inexact.
    int max_level = findMSB(voxel_resolution);

    push(0, 0, vec3(-cube_dim)); // seed the stack with the root node

    // State of the node currently being expanded.
    int level = 0;
    int node = 0;
    vec3 corner;
    do
    {
        // Pop the next node from the stack.
        sp--;
        node = nodes[sp];
        level = levels[sp];
        corner = positions[sp];

        // Side length of this node's children: cube_dim / 2^level, computed
        // with an exact integer power of two.
        float size = cube_dim / float(1 << level);

        // Minimum corners of the 8 children, slot = 4*x + 2*y + z.
        vec3 corners[8] = vec3[8](
            corner,                     corner+vec3(0,0,size),
            corner+vec3(0,size,0),      corner+vec3(0,size,size),
            corner+vec3(size,0,0),      corner+vec3(size,0,size),
            corner+vec3(size,size,0),   corner+vec3(size,size,size));

        // Distance to every child; empty children (index <= 0) and children
        // the ray misses are at infinity.
        float coeffs[8];
        for (int child = 0; child < 8; child++)
        {
            coeffs[child] = tree[node].children[child] > 0
                ? boxIntersection(r, dir, corners[child], size)
                : 1.f/0.f;
        }

        // Sort the children from closest to farthest, keeping the child slot
        // and the corner in step with the distance.
        int indices[8] = int[8](0,1,2,3,4,5,6,7);
        for (int i = 0; i < 8; i++)
        {
            for (int j = i + 1; j < 8; j++)
            {
                if (coeffs[j] < coeffs[i])
                {
                    float swap = coeffs[i];
                    coeffs[i] = coeffs[j];
                    coeffs[j] = swap;
                    int iSwap = indices[i];
                    indices[i] = indices[j];
                    indices[j] = iSwap;
                    vec3 vSwap = corners[i];
                    corners[i] = corners[j];
                    corners[j] = vSwap;
                }
            }
        }

        // Push the children that are hit, farthest first. The counter must be
        // signed: with an unsigned counter `i >= 0` is always true, so the
        // loop would never end and would index the arrays out of bounds after
        // wrapping around.
        for (int i = 7; i >= 0; i--)
        {
            if (!isinf(coeffs[i]))
            {
                push(tree[node].children[indices[i]],
                     level+1, corners[i]);
            }
        }
        count++;
    } while (level < (max_level-1) && sp > 0);

    // Output the iteration count as a grey level.
    fragment_color = vec4(count)/100;
}
由于可能不完全清楚这是做什么的,让我解释一下。
我们从一个大立方体开始,检测光线与该立方体(包围盒)是否相交。如果光线击中它,我们就继续测试光线与组成它的 8 个子立方体是否相交。
如果击中了其中任何一个子立方体,我们再检测光线与组成该子立方体的 8 个更小的立方体是否相交,如此逐层向下。
在 2D 中,这将如下所示:
在这种情况下,我们有 4 层,我们首先检查大框,然后是红色的,然后是绿色的,最后是蓝色的。
将光线追踪步骤执行的次数以颜色的形式输出(这正是我提供的代码片段所做的事情),
结果如下图:
如您所见,大多数情况下着色器的迭代次数不会超过 100 次。
但是,这个着色器在 GTX 1070 上执行一次平均需要 200 000 微秒(约 0.2 秒)。
由于问题不是执行次数,我的问题很可能出在线程执行上。
有谁知道我可以如何优化这段代码? 最大的瓶颈似乎是堆栈的使用。
如果我在不向堆栈压入数据的情况下运行相同的代码(这会产生错误的输出),运行速度会提高到原来的 10 倍左右。
【问题讨论】:
-
为什么不将网格存储在 3D 纹理中并直接使用 3D 纹理坐标?这样就不需要任何堆栈,也不需要空间细分之类的东西,请参阅「How to best write a voxel engine in C with performance in mind」;我的 GLSL 实现是从我的另一个光线追踪器移植而来的,见「Reflection and refraction impossible without recursive ray tracing?」。也可以看看「What techniques were used to reduce the required re-rendering in 3D programs?」
-
那种方案我已经实现过了。这里的问题在于数据的可扩展性和渐近复杂度。如果使用 3D 纹理,其中大部分体素是空的(浪费空间)。因此,对于边长为 512 个体素的场景,你最终需要存储 512^3*size_of_encoding 的数据,而其中绝大部分是不需要的。其次,除非你巧妙地构建 mipmap,否则遍历的渐近复杂度与纹理边长成线性关系(例如 512),而我的方法的摊销成本是 log(n),并且只使用实际需要的数据。
-
@Spektre 给你一个直观的概念:我的 GPU 无法分配边长为 1024 的 3D 纹理,但可以处理边长为 2048 的八叉树;边长为 512 的 3D 纹理大约消耗 2 GB 显存,而八叉树方案只需要约 200 MB。
-
我建议你尝试把它分成多个阶段(多遍)运行:每次迭代后把结果写出,然后重新运行着色器,并在顶点阶段跳过已经命中的像素。你目前是不是只绘制了一个屏幕大小的矩形?你可以为每个像素使用一个点图元,并在后续各遍中把已完成像素的位置设置到相机后方,使其被剔除。
-
@Makogan 我不确定它是否相关,但您可以在没有调用堆栈的情况下实现路径跟踪。我最近做了以下事情:stackoverflow.com/a/62057711/7330813
标签: c++ opengl optimization glsl gpu