【发布时间】:2014-10-27 22:40:40
【问题描述】:
我正在尝试在 OpenCL 中实现 LU 分解的简单版本。首先,我在 C++ 中实现了一个顺序版本,并构造了方法来验证我的结果(即乘法方法)。接下来,我在内核中实现了我的算法,并使用手动验证的输入(即 5x5 矩阵)对其进行了测试。这很好用。
但是,当我在大于 5x5 的随机生成矩阵上运行我的算法时,我得到了奇怪的结果。我已经清理了我的代码,手动检查了计算,但我无法弄清楚我的内核哪里出了问题。我开始认为这可能与floats 和计算的稳定性有关。我的意思是误差范围被传播并变得越来越大。我很清楚我可以交换行以获得最大的枢轴值等,但有时误差幅度会很大。在任何情况下,我都希望结果 - 尽管是错误的 - 与顺序算法相同。我需要一些帮助来确定我可能做错了什么。
我使用的是一维数组,因此使用二维矩阵寻址的过程如下:
A(row, col) = A[row * matrix_width + col].
关于结果,我可能会补充一点,我决定将 L 和 U 矩阵合并为一个。所以给定 L 和 U:
L: U:
1 0 0 A B C
X 1 0 0 D E
Y Z 1 0 0 F
我将它们显示为:
A:
A B C
X D E
Y Z F
内核如下:
参数source是我要分解的原始矩阵。
参数destin 是目的地。 matrix_size 是矩阵的总大小(对于 3x3 矩阵来说是 9),matrix_width 是宽度(对于 3x3 矩阵来说是 3)。
__kernel void matrix(
__global float * source,
__global float * destin,
unsigned int matrix_size,
unsigned int matrix_width
)
{
unsigned int index = get_global_id(0);
int col_idx = index % matrix_width;
int row_idx = index / matrix_width;
if (index >= matrix_size)
return;
// First of all, copy our value to the destination.
destin[index] = source[index];
// Iterate over all the pivots.
for(int piv_idx = 0; piv_idx < matrix_width; piv_idx++)
{
// We have to be the row below the pivot row
// And we have to be the column of the pivot
// or right of that column.
if(col_idx < piv_idx || row_idx <= piv_idx)
return;
// Calculate the divisor.
float pivot_value = destin[(piv_idx * matrix_width) + piv_idx];
float below_pivot_value = destin[(row_idx * matrix_width) + piv_idx];
float divisor = below_pivot_value/ pivot_value;
// Get the value in the pivot row on this column.
float pivot_row_value = destin[(piv_idx * matrix_width) + col_idx];
float current_value = destin[index];
destin[index] = current_value - (pivot_row_value * divisor);
// Write the divisor to the memory (we won't use these values anymore!)
// if we are the value under the pivot.
barrier(CLK_GLOBAL_MEM_FENCE);
if(col_idx == piv_idx)
{
int divisor_location = (row_idx * matrix_width) + piv_idx;
destin[divisor_location] = divisor;
}
barrier(CLK_GLOBAL_MEM_FENCE);
}
}
这是顺序版本:
// Decomposes a matrix into L and U but in the same matrix.
float * decompose(float* A, int matrix_width)
{
int total_length = matrix_width*matrix_width;
float *U = new float[total_length];
for (int i = 0; i < total_length; i++)
{
U[i] = A[i];
}
for (int row = 0; row < matrix_width; row++)
{
int pivot_idx = row;
float pivot_val = U[pivot_idx * matrix_width + pivot_idx];
for (int r = row + 1; r < matrix_width; r++)
{
float below_pivot = U[r*matrix_width + pivot_idx];
float divisor = below_pivot / pivot_val;
for (int row_idx = pivot_idx; row_idx < matrix_width; row_idx++)
{
float value = U[row * matrix_width + row_idx];
U[r*matrix_width + row_idx] = U[r*matrix_width + row_idx] - (value * divisor);
}
U[r * matrix_width + pivot_idx] = divisor;
}
}
return U;
}
我得到的示例输出如下:
Workgroup size: 1
Array dimension: 6
Original unfactorized:
| 176.000000 | 133.000000 | 431.000000 | 839.000000 | 739.000000 | 450.000000 |
| 507.000000 | 718.000000 | 670.000000 | 753.000000 | 122.000000 | 941.000000 |
| 597.000000 | 449.000000 | 596.000000 | 742.000000 | 491.000000 | 212.000000 |
| 159.000000 | 944.000000 | 797.000000 | 717.000000 | 822.000000 | 219.000000 |
| 266.000000 | 755.000000 | 33.000000 | 231.000000 | 824.000000 | 785.000000 |
| 724.000000 | 408.000000 | 652.000000 | 863.000000 | 663.000000 | 113.000000 |
Sequential:
| 176.000000 | 133.000000 | 431.000000 | 839.000000 | 739.000000 | 450.000000 |
| 2.880682 | 334.869324 | -571.573853 | -1663.892090 | -2006.823730 | -355.306763 |
| 3.392045 | -0.006397 | -869.627747 | -2114.569580 | -2028.558716 | -1316.693359 |
| 0.903409 | 2.460203 | -2.085742 | -357.893066 | 860.526367 | -2059.689209 |
| 1.511364 | 1.654343 | -0.376231 | -2.570729 | 4476.049805 | -5097.599121 |
| 4.113636 | -0.415427 | 1.562076 | -0.065806 | 0.003290 | 52.263515 |
Sequential multiplied matching with original?:
1
GPU:
| 176.000000 | 133.000000 | 431.000000 | 839.000000 | 739.000000 | 450.000000 |
| 2.880682 | 334.869293 | -571.573914 | -1663.892212 | -2006.823975 | -355.306885 |
| 3.392045 | -0.006397 | -869.627808 | -2114.569580 | -2028.558716 | -1316.693359 |
| 0.903409 | 2.460203 | -2.085742 | -357.892578 | 5091.575684 | -2059.688965 |
| 1.511364 | 1.654343 | -0.376232 | -2.570732 | 16116.155273 | -5097.604980 |
| 4.113636 | -0.415427 | -0.737347 | 2.005755 | -3.655331 | -237.480438 |
GPU multiplied matching with original?:
Values differ: 5053.05 -- 822
0
Values differ: 5091.58 -- 860.526
Correct solution? 0
编辑
好的,我想我明白为什么它以前不起作用了。原因是我只在每个工作组上同步。当我使用等于矩阵中项目数的工作组大小来调用我的内核时,它总是正确的,因为这样障碍就会正常工作。但是,我决定采用 cmets 中提到的方法。将多个内核排入队列并等待每个内核完成,然后再启动下一个内核。然后,这将映射到矩阵每一行的迭代,并将其与枢轴元素相乘。这确保我不会修改或读取当时内核正在修改的元素。
同样,这适用于小矩阵。所以我认为我假设它只是同步是错误的。根据 Baiz 的要求,我在此处发布了调用内核的整个 main:
int main(int argc, char *argv[])
{
try {
if (argc != 5) {
std::ostringstream oss;
oss << "Usage: " << argv[0] << " <kernel_file> <kernel_name> <workgroup_size> <array width>";
throw std::runtime_error(oss.str());
}
// Read in arguments.
std::string kernel_file(argv[1]);
std::string kernel_name(argv[2]);
unsigned int workgroup_size = atoi(argv[3]);
unsigned int array_dimension = atoi(argv[4]);
int total_matrix_length = array_dimension * array_dimension;
// Print parameters
std::cout << "Workgroup size: " << workgroup_size << std::endl;
std::cout << "Array dimension: " << array_dimension << std::endl;
// Create matrix to work on.
// Create a random array.
int matrix_width = sqrt(total_matrix_length);
float* input_matrix = new float[total_matrix_length];
input_matrix = randomMatrix(total_matrix_length);
/// Debugging
//float* input_matrix = new float[9];
//int matrix_width = 3;
//total_matrix_length = matrix_width * matrix_width;
//input_matrix[0] = 10; input_matrix[1] = -7; input_matrix[2] = 0;
//input_matrix[3] = -3; input_matrix[4] = 2; input_matrix[5] = 6;
//input_matrix[6] = 5; input_matrix[7] = -1; input_matrix[8] = 5;
// Allocate memory on the host and populate source
float *gpu_result = new float[total_matrix_length];
// OpenCL initialization
std::vector<cl::Platform> platforms;
std::vector<cl::Device> devices;
cl::Platform::get(&platforms);
platforms[0].getDevices(CL_DEVICE_TYPE_GPU, &devices);
cl::Context context(devices);
cl::CommandQueue queue(context, devices[0], CL_QUEUE_PROFILING_ENABLE);
// Load the kernel source.
std::string file_text;
std::ifstream file_stream(kernel_file.c_str());
if (!file_stream) {
std::ostringstream oss;
oss << "There is no file called " << kernel_file;
throw std::runtime_error(oss.str());
}
file_text.assign(std::istreambuf_iterator<char>(file_stream), std::istreambuf_iterator<char>());
// Compile the kernel source.
std::string source_code = file_text;
std::pair<const char *, size_t> source(source_code.c_str(), source_code.size());
cl::Program::Sources sources;
sources.push_back(source);
cl::Program program(context, sources);
try {
program.build(devices);
}
catch (cl::Error& e) {
std::string msg;
program.getBuildInfo<std::string>(devices[0], CL_PROGRAM_BUILD_LOG, &msg);
std::cerr << "Your kernel failed to compile" << std::endl;
std::cerr << "-----------------------------" << std::endl;
std::cerr << msg;
throw(e);
}
// Allocate memory on the device
cl::Buffer source_buf(context, CL_MEM_READ_ONLY, total_matrix_length*sizeof(float));
cl::Buffer dest_buf(context, CL_MEM_WRITE_ONLY, total_matrix_length*sizeof(float));
// Create the actual kernel.
cl::Kernel kernel(program, kernel_name.c_str());
// transfer source data from the host to the device
queue.enqueueWriteBuffer(source_buf, CL_TRUE, 0, total_matrix_length*sizeof(float), input_matrix);
for (int pivot_idx = 0; pivot_idx < matrix_width; pivot_idx++)
{
// set the kernel arguments
kernel.setArg<cl::Memory>(0, source_buf);
kernel.setArg<cl::Memory>(1, dest_buf);
kernel.setArg<cl_uint>(2, total_matrix_length);
kernel.setArg<cl_uint>(3, matrix_width);
kernel.setArg<cl_int>(4, pivot_idx);
// execute the code on the device
std::cout << "Enqueueing new kernel for " << pivot_idx << std::endl;
cl::Event evt;
queue.enqueueNDRangeKernel(kernel, cl::NullRange, cl::NDRange(total_matrix_length), cl::NDRange(workgroup_size), 0, &evt);
evt.wait();
std::cout << "Iteration " << pivot_idx << " done" << std::endl;
}
// transfer destination data from the device to the host
queue.enqueueReadBuffer(dest_buf, CL_TRUE, 0, total_matrix_length*sizeof(float), gpu_result);
// Calculate sequentially.
float* sequential = decompose(input_matrix, matrix_width);
// Print out the results.
std::cout << "Sequential:\n";
printMatrix(total_matrix_length, sequential);
// Print out the results.
std::cout << "GPU:\n";
printMatrix(total_matrix_length, gpu_result);
std::cout << "Correct solution? " << equalMatrices(gpu_result, sequential, total_matrix_length);
// compute the data throughput in GB/s
//float throughput = (2.0*total_matrix_length*sizeof(float)) / t; // t is in nano seconds
//std::cout << "Achieved throughput: " << throughput << std::endl;
// Cleanup
// Deallocate memory
delete[] gpu_result;
delete[] input_matrix;
delete[] sequential;
return 0;
}
catch (cl::Error& e) {
std::cerr << e.what() << ": " << jc::readable_status(e.err());
return 3;
}
catch (std::exception& e) {
std::cerr << e.what() << std::endl;
return 2;
}
catch (...) {
std::cerr << "Unexpected error. Aborting!\n" << std::endl;
return 1;
}
}
【问题讨论】:
-
我添加了一个可能的解决方案。如果可行,请告诉我。
标签: c++ algorithm matrix opencl