在 GLSL 中优化光线追踪着色器答案

【问题标题】：Optimizing a raytracing shader in GLSL在 GLSL 中优化光线追踪着色器
【发布时间】：2018-11-19 13:18:54
【问题描述】：

我编写了一个基于体素化的光线追踪器，它按预期工作，但速度很慢。

目前光线追踪器代码如下：

#version 430 
// Normalized fragment position, ranging from (-1, -1) to (1, 1).
in vec2 f_coord;

// Color written for this fragment.
out vec4 fragment_color;

// One entry of the voxel buffer.
// NOTE(review): the field meanings are taken from their names only; the
// shader code visible here never reads them.
struct Voxel
{
    vec4 position;
    vec4 normal;
    vec4 color;
};

// One node of the octree stored in tree_buffer.
struct Node
{
    // The 8 children of this node. main() uses each value as an index into
    // tree[] and treats a value that is not greater than 0 as an empty child.
    int children[8];
};

layout(std430, binding = 0) buffer voxel_buffer
{
    // Last layer of the tree, the leaves.
    Voxel voxels[];
};
layout(std430, binding = 1) buffer buffer_index
{
    // Not referenced by the shader code shown here.
    // NOTE(review): presumably a counter for voxels[] - confirm against the host code.
    uint index;
};
layout(std430, binding = 2) buffer tree_buffer
{
    // Octree nodes; main() starts the traversal at tree[0].
    Node tree[];
};
layout(std430, binding = 3) buffer tree_index
{
    // Not referenced by the shader code shown here.
    // NOTE(review): presumably a counter for tree[] - confirm against the host code.
    uint t_index;
};

uniform vec3 camera_pos; // Position of the camera; added to the ray origin in main().
uniform float aspect_ratio; // Aspect ratio of the window; main() divides the ray's y component by it.
uniform float cube_dim; // Dimension of the voxelization cube: main() puts the root corner at vec3(-cube_dim) and gives the root's children an edge length of cube_dim.
uniform int voxel_resolution; // Side length of the cube in voxels; main() derives the tree height as log2 of this value.

#define EPSILON 0.01
// Returns true when `position` lies inside the axis-aligned cube whose minimum
// corner is `corner` and whose edge length is `size`. Every face is widened by
// EPSILON, and both comparisons are strict.
bool inBoxBounds(vec3 corner, float size, vec3 position)
{
    // Express the point in the cube's own coordinate system (origin at `corner`).
    vec3 local = position - corner;

    // Component-wise tests; a single failing axis puts the point outside the cube.
    bvec3 aboveLower = greaterThan(local, vec3(-EPSILON));
    bvec3 belowUpper = lessThan(local, vec3(size+EPSILON));

    return all(aboveLower) && all(belowUpper);
}

// Distance along the ray from `origin` to the axis-aligned cube with minimum
// corner `corner0` and edge length `size`, or +infinity when the ray does not
// hit the cube. `dir` does not need to be normalized; it is normalized here.
float boxIntersection(vec3 origin, vec3 dir, vec3 corner0, float size)
{
    float infinity = 1.f/0.f;
    vec3 unitDir = normalize(dir);
    // Corner diagonally opposite to corner0.
    vec3 corner1 = corner0 + vec3(size,size,size);

    // Ray parameters at which the ray crosses the three planes through
    // corner0 and the three planes through corner1.
    vec3 tMinPlanes = (corner0 - origin)/unitDir;
    vec3 tMaxPlanes = (corner1 - origin)/unitDir;
    float planeT[6] = float[6](tMinPlanes.x, tMinPlanes.y, tMinPlanes.z,
                               tMaxPlanes.x, tMaxPlanes.y, tMaxPlanes.z);

    // Until a valid crossing is found the cube counts as unreachable.
    float nearest = infinity;
    for(int plane=0; plane<6; plane++)
    {
        float candidate = planeT[plane];
        // Crossings behind the origin cannot be reached by travelling forward.
        if(candidate < 0)
        {
            candidate = infinity;
        }
        // A crossing only counts when the crossing point lies on the cube
        // itself; keep the closest such crossing.
        if(inBoxBounds(corner0,size,origin+unitDir*candidate))
        {
            nearest = min(candidate,nearest);
        }
    }

    return nearest;
}

// Capacity of the traversal stack below.
#define MAX_TREE_HEIGHT 11
// Explicit traversal stack kept as three parallel arrays: entry i is the
// triple (nodes[i], levels[i], positions[i]).
int nodes[MAX_TREE_HEIGHT];      // node value of the entry (main() uses it as an index into tree[])
int levels[MAX_TREE_HEIGHT];     // tree level of the entry
vec3 positions[MAX_TREE_HEIGHT]; // corner of the entry's cube
int sp=0;                        // number of entries currently on the stack

// Pushes the entry (node, level, corner) on top of the traversal stack.
// When the stack is already full the entry is dropped instead of being written
// past the end of the arrays, which would be an out-of-bounds write.
// NOTE(review): main() can push several entries per pop, so a deep traversal
// may need more than MAX_TREE_HEIGHT slots - confirm the required capacity.
void push(int node, int level, vec3 corner)
{
    if(sp >= MAX_TREE_HEIGHT)
    {
        return;
    }
    nodes[sp] = node;
    levels[sp] = level;
    positions[sp] = corner;
    sp++;
}

// Fragment entry point. Builds the view ray for this fragment, walks the
// octree along it using the explicit stack (closest child first), and writes
// the number of traversal iterations divided by 100 as the fragment color.
void main()
{   
    int count = 0; // number of traversal iterations executed so far

    // z component of the ray direction for the fixed 40 degree angle.
    float focal = 1.f/tan(radians(40.f));
    vec3 r = vec3(f_coord.x, f_coord.y, focal);
    r.y/=aspect_ratio; // correct the direction for the window's aspect ratio
    vec3 dir = r; // direction of the ray
    r += vec3(0,0,-focal) + camera_pos; // r becomes the ray origin, offset by the camera position

    fragment_color = vec4(0);
    int max_level = int(log2(float(voxel_resolution))); // height of the tree
    push(0,0,vec3(-cube_dim)); // start the traversal at the root node

    // Entry currently being processed.
    int level=0;
    int node=0;
    vec3 corner;

    do
    {
        // Pop the next entry from the stack.
        sp--;
        node = nodes[sp];
        level = levels[sp];
        corner = positions[sp];

        // Edge length of this node's children.
        float size = cube_dim / exp2(float(level));
        // Minimum corners of the 8 children, in the same order as Node.children.
        vec3 corners[] =
            {corner,                        corner+vec3(0,0,size),
            corner+vec3(0, size,0),         corner+vec3(0,size,size),
            corner+vec3(size,0,0),          corner+vec3(size,0,size),
            corner+vec3(size,size,0),       corner+vec3(size,size,size)};

        // Distance to every child; empty children (value not greater than 0)
        // get infinity so they are never pushed.
        float coeffs[8];
        for(int child=0; child<8; child++)
        {
            coeffs[child] = tree[node].children[child]>0?
                boxIntersection(r, dir, corners[child], size) : 1.f/0.f;
        }

        // Sort the children from closest to farthest. coeffs and corners are
        // permuted together, and indices[i] records which child slot ended up
        // at sorted position i.
        int indices[8] = {0,1,2,3,4,5,6,7};
        for(int i=0; i<8; i++)
        {
            for(int j=i+1; j<8; j++)
            {
                if((coeffs[j] < coeffs[i]))
                {
                    float swap = coeffs[i];
                    coeffs[i] = coeffs[j];
                    coeffs[j] = swap;

                    int iSwap = indices[i];
                    indices[i] = indices[j];
                    indices[j] = iSwap;

                    vec3 vSwap = corners[i];
                    corners[i] = corners[j];
                    corners[j] = vSwap;
                }
            }
        }

        // Push the children that were hit, farthest first, so that the closest
        // one is popped next. The counter must be signed: with an unsigned
        // counter the condition i>=0 is always true and the index wraps around
        // after 0.
        for(int i=7; i>=0; i--)
        {
            if(!isinf(coeffs[i]))
            {
                push(tree[node].children[indices[i]],
                    level+1, corners[i]);
            }
        }
        count++;
    // Stop once the popped entry is at level max_level-1 or the stack is empty.
    }while(level < (max_level-1) && sp>0);

    // Visualize the iteration count as the output color.
    fragment_color = vec4(count)/100;
}

由于可能不完全清楚这是做什么的，让我解释一下。

我们从一个大立方体开始检查光线盒的交点。如果我们击中它，我们会测试与组成它的 8 个立方体的交集。

如果我们击中任何一个，我们会检查与组成该立方体的 8 个立方体的交点。

在 2D 中，这将如下所示：

在这种情况下，我们有 4 层，我们首先检查大框，然后是红色的，然后是绿色的，最后是蓝色的。

将光线追踪步骤执行的次数输出为颜色（这正是我提供的代码片段所做的）

结果如下图：

如您所见，大多数情况下着色器的迭代次数不会超过 100 次。

但是，这个着色器在 gtx 1070 中执行平均需要 200 000 微秒。

由于问题不是执行次数，我的问题很可能出在线程执行上。

有谁知道我可以如何优化这段代码？最大的瓶颈似乎是堆栈的使用。

如果我在不推入堆栈的情况下运行相同的代码（生成错误的输出），运行时会提高 10 倍

【问题讨论】：

为什么不将网格存储在 3D 纹理中并直接使用 3D 纹理坐标？不需要任何堆栈和空间细分周围的东西，请参阅How to best write a voxel engine in C with performance in mind，我的 GLSL 尝试是从我的这个Reflection and refraction impossible without recursive ray tracing? ray-tracer 进行的。也看看What techniques were used to reduce the required re-rendering in 3D programs?
我已经实现过那种方案。这里的问题是数据的可扩展性和渐近复杂度。如果使用 3D 纹理，其中大部分是空的（浪费空间）。所以对于边长为 512 个体素的场景，最终需要 512^3*size_of_encoding 的存储空间，而其中绝大部分数据是不需要的。其次，除非巧妙地创建 mip_maps，否则渐近复杂度与纹理边长（例如 512）成线性关系，而我的方法摊销成本为 log(n)，并且只存储所需的数据。
@Spektre 给你一个概念：我的 GPU 无法分配边长为 1024 的 3D 纹理，但可以处理边长为 2048 的八叉树；边长为 512 的 3D 纹理大约消耗 2 GB 的 VRAM，而八叉树方案只需约 200 MB。
我建议你尝试运行这个多阶段。在每次迭代后写入结果，然后重新运行着色器并跳过在顶点阶段命中的像素。您目前是否只有一个屏幕大小的顶点矩形？您可以为每个像素使用一个点，并在后续通道中为相机后面的完成像素设置位置。
@Makogan 我不确定它是否相关，但您可以在没有调用堆栈的情况下实现路径跟踪。我最近做了以下事情：stackoverflow.com/a/62057711/7330813

标签： c++ opengl optimization glsl gpu

【解决方案1】：

您似乎在八叉树的每个级别中测试与大多数体素的射线相交。并在每个级别对它们进行排序（按一定距离）。我提出了另一种方法。

如果光线与边界框（八叉树的第 0 级）相交，它会穿过边界框的两个面；也可能恰好经过角点或边，这些属于需要特殊处理的“边界”情况。

可以像here 一样找到 3D 射线平面的交点。可以通过测试点是否在面部的两个三角形之一的内部来确定交点是否在面部（四边形）内，例如 here。

获取距离相机最远的交点 I0。再令 r 为从 I0 指向相机方向的射线单位向量。

找到I0 坐标的最深体素。这是离相机最远的体素。

现在我们想要光线在该体素中穿过另一个面的出口坐标 I0e。虽然您可以再次对所有 6 个面进行计算，但如果您的体素与 X、Y、Z 轴对齐，并且您在与八叉树相同的坐标系中定义射线，那么计算会大大简化。

通过射线的r 单位向量：I1 = I0e + r/1000 对I0e 应用一点位移（例如，最小体素大小的 1/1000）。找到这些I1 的体素。这是体素射线交叉点排序列表中的下一个体素。

重复查找I1e 然后I2 然后I2e 然后I3 等等，直到边界框退出。交叉体素列表已排序。

可以根据存储信息的方式优化八叉树的使用：存储所有可能的节点，或仅存储已使用的节点；节点可以直接带有数据，或者只是指向另一个存放数据的容器的“指针”。这属于另一个问题的范畴。

【讨论】：

这会比我使用的交叉点检查慢

【解决方案2】：

首先突出的是您的框相交函数。查看inigo quilez' procedural box function 以获得更快的版本。由于您的 boxsize 在所有轴上都是统一的，并且您不需要 outNormal，因此您可以获得更轻的版本。从本质上讲，使用数学而不是测试每个盒子平面的蛮力方法。

此外，请尽量避免临时存储。例如，可以根据需要为每个八叉树框计算corners 数组。当然，有了上面的建议，这些都会被改成box center。

由于nodes、levels 和positions 总是一起访问，因此请尝试将它们并置在一个新的单一结构中并将它们作为一个单元访问。

稍后会看更多...

【讨论】：

【解决方案3】：

GPU 上的线程执行可能是大规模并行的，但这并不意味着所有线程彼此独立运行。线程组执行完全相同的指令，唯一的区别是输入数据。这意味着分支和循环不能使线程在执行中发散，因此也不能让它们提前终止。

您的示例显示了最极端的边缘情况：当一组线程中所有完成的工作很可能只与一个线程相关时。

为了缓解这种情况，您应该尝试减少组中（或全部）线程的执行长度（在您的情况下为迭代）的差异。这可以通过设置每个着色器通道的迭代次数限制并仅重新安排需要更多迭代的线程/像素来完成。

【讨论】：

GLSL 并不是为此而设计的。正如我在评论中建议的那样，您可以例如为图像中的每个像素使用一个点顶点，并在后续的渲染通道中把已经完成的点移到裁剪平面之外。希望这样片段能被正确调度。
@Makogan 你试过后能告诉我吗（如果你试过的话）？