正如@faivvy 在他的回答中指出的那样,您可以尝试完全取消 for 循环
但是,另一种方法(能正确处理负数 a)是进行循环展开,我把这个函数命名为 fnUnroll。如果您不熟悉循环展开,其思路是减少迭代次数,并用多个相互独立的累加器并行求和。
正如评论中提到的,不需要在每次迭代中都乘以 b,可以在最后统一相乘。我添加了另一个名为 fnUnrollNoMult 的函数来演示这一点。
#include <chrono>
#include <cstdlib>
#include <iostream>
// Reference implementation: returns b * (1^2 + 2^2 + ... + (4a)^2).
//
// a: quarter of the number of terms; when a * 4 is not positive the loop
//    body never runs and the result is 0.
// b: factor applied to every squared term.
//
// The terms and the running total are computed in unsigned arithmetic.
// With plain int, b * i * i and the accumulation overflow for fairly small
// inputs (for example a = 1000, b = 2), which is undefined behaviour for
// signed types. Unsigned arithmetic wraps modulo 2^N instead, and the final
// conversion back to int is modular on mainstream compilers (guaranteed
// since C++20), so the wrapped value is returned deterministically.
// Note: a * 4 itself is still assumed to fit in an int.
int fn(int a, int b) {
    const unsigned factor = static_cast<unsigned>(b);
    unsigned sum = 0;
    for (int i = a * 4; i > 0; i--) {
        const unsigned term = static_cast<unsigned>(i);
        sum += factor * term * term;
    }
    return static_cast<int>(sum);
}
// Loop-unrolled variant of fn: returns b * (1^2 + 2^2 + ... + (4a)^2),
// using four independent accumulators so the four partial sums do not
// depend on each other inside the loop.
//
// a: quarter of the number of terms; a non-positive a * 4 yields 0, the
//    same result fn produces for such input.
// b: factor applied to every squared term.
//
// Arithmetic is done on unsigned values so that overflow wraps instead of
// being undefined behaviour (signed overflow is UB). The conversion back to
// int is modular on mainstream compilers (guaranteed since C++20).
// Note: a * 4 itself is still assumed to fit in an int.
int fnUnroll(int a, int b) {
    const int count = a * 4;
    if (count <= 0) {
        return 0;  // No terms to sum.
    }

    const unsigned limit = static_cast<unsigned>(count);
    const unsigned factor = static_cast<unsigned>(b);

    // Set up some number of accumulators; four were picked here.
    unsigned sum0 = 0;
    unsigned sum1 = 0;
    unsigned sum2 = 0;
    unsigned sum3 = 0;

    // Sum 4 values per iteration, as long as a full group i .. i+3 fits.
    unsigned i = 1;
    for ( ; i + 3 <= limit; i += 4) {
        sum0 += factor * i * i;
        sum1 += factor * (i + 1) * (i + 1);
        sum2 += factor * (i + 2) * (i + 2);
        sum3 += factor * (i + 3) * (i + 3);
    }

    // Handle the remainder (if any). Because the limit is a multiple of four
    // this loop currently executes zero times; it is kept, with the correct
    // term b * i * i and an inclusive bound, so the function stays right if
    // the term count ever stops being a multiple of the unroll factor.
    for ( ; i <= limit; i++) {
        sum0 += factor * i * i;
    }

    // Combine the accumulators.
    return static_cast<int>(sum0 + sum1 + sum2 + sum3);
}
// Loop-unrolled variant of fn that keeps b out of the loops: it sums the
// squares 1^2 + 2^2 + ... + (4a)^2 with four independent accumulators and
// multiplies the total by b once at the end.
//
// a: quarter of the number of terms; a non-positive a * 4 yields 0, the
//    same result fn produces for such input.
// b: factor applied to the sum of squares.
//
// Arithmetic is done on unsigned values so that overflow wraps instead of
// being undefined behaviour (signed overflow is UB). Since wrapping
// arithmetic is modular, multiplying by b at the end gives the same wrapped
// result as multiplying each term. The conversion back to int is modular on
// mainstream compilers (guaranteed since C++20).
// Note: a * 4 itself is still assumed to fit in an int.
int fnUnrollNoMult(int a, int b) {
    const int count = a * 4;
    if (count <= 0) {
        return 0;  // No terms to sum.
    }

    const unsigned limit = static_cast<unsigned>(count);

    unsigned sum0 = 0;
    unsigned sum1 = 0;
    unsigned sum2 = 0;
    unsigned sum3 = 0;

    // b is deliberately absent from the loops.
    unsigned i = 1;
    for ( ; i + 3 <= limit; i += 4) {
        sum0 += i * i;
        sum1 += (i + 1) * (i + 1);
        sum2 += (i + 2) * (i + 2);
        sum3 += (i + 3) * (i + 3);
    }

    // Remainder (if any). With a limit that is a multiple of four this loop
    // executes zero times; it is kept, with the correct term i * i and an
    // inclusive bound, so the function stays right if the term count ever
    // stops being a multiple of the unroll factor.
    for ( ; i <= limit; i++) {
        sum0 += i * i;
    }

    // Handle b here, once.
    return static_cast<int>(static_cast<unsigned>(b) * (sum0 + sum1 + sum2 + sum3));
}
// Entry point. Parses a and b from the command line, cross-checks the three
// implementations against each other on a 100 x 100 grid of small inputs,
// then runs each implementation once on (a, b), printing its result and the
// elapsed wall-clock time in nanoseconds.
//
// Returns 1 on wrong argument count or on the first mismatch, 0 otherwise.
int main(int argc, char** argv) {
    // Exactly two operands are required: a and b.
    if (argc != 3) {
        std::cout << "Usage: " << argv[0] << " <int> <int>\n";
        return 1;
    }

    const int a = std::atoi(argv[1]);
    const int b = std::atoi(argv[2]);

    // Correctness demonstration: both unrolled variants must agree with fn.
    for (int x = 0; x < 100; ++x) {
        for (int y = 0; y < 100; ++y) {
            const int expected = fn(x, y);
            if (expected == fnUnroll(x, y) && expected == fnUnrollNoMult(x, y)) {
                continue;
            }
            std::cout << "Not equal: " << x << ", " << y << std::endl;
            return 1;
        }
    }

    // Benchmark: one timed call per implementation.
    using Clock = std::chrono::high_resolution_clock;
    using Nanos = std::chrono::nanoseconds;

    {
        const auto begin = Clock::now();
        const int value = fn(a, b);
        const auto end = Clock::now();
        const auto elapsed = std::chrono::duration_cast<Nanos>(end - begin).count();
        std::cout << "fn value: " << value << std::endl;
        std::cout << "fn nanos: " << elapsed << std::endl;
    }

    {
        const auto begin = Clock::now();
        const int value = fnUnroll(a, b);
        const auto end = Clock::now();
        const auto elapsed = std::chrono::duration_cast<Nanos>(end - begin).count();
        std::cout << "fnUnroll value: " << value << std::endl;
        std::cout << "fnUnroll nanos: " << elapsed << std::endl;
    }

    {
        const auto begin = Clock::now();
        const int value = fnUnrollNoMult(a, b);
        const auto end = Clock::now();
        const auto elapsed = std::chrono::duration_cast<Nanos>(end - begin).count();
        std::cout << "fnUnrollNoMult value: " << value << std::endl;
        std::cout << "fnUnrollNoMult nanos: " << elapsed << std::endl;
    }

    return 0;
}
该程序需要两个参数,分别代表 a 和 b。我用 g++ -std=c++14 foo.cpp -O3 编译了该程序,并针对几个不同的 a 值得到了以下结果:
./a.out 1 2
fn value: 60
fn nanos: 373
fnUnroll value: 60
fnUnroll nanos: 209
fnUnrollNoMult value: 60
fnUnrollNoMult nanos: 157
./a.out 1000 2
fn value: -267004960
fn nanos: 3509
fnUnroll value: -267004960
fnUnroll nanos: 2820
fnUnrollNoMult value: -267004960
fnUnrollNoMult nanos: 1568
./a.out 1000000 2
fn value: -619707648
fn nanos: 3137685
fnUnroll value: -619707648
fnUnroll nanos: 2387840
fnUnrollNoMult value: -619707648
fnUnrollNoMult nanos: 1220519