avx_mathfun 中的 exp 函数使用范围缩减与 Chebyshev 近似多项式相结合,与 AVX 指令并行计算 8 exp-s。尽可能使用正确的编译器设置来确保 addps 和 mulps 与 FMA 指令融合。
将 avx_mathfun 中的原始 exp 代码改编为可移植(跨不同编译器)C / AVX2 内在代码非常简单。原始代码使用 gcc 样式对齐属性和巧妙的宏。改用标准_mm256_set1_ps() 的修改代码位于小测试代码和表格下方。修改后的代码需要AVX2。
以下代码用于简单测试:
int main(){
int i;
float xv[8];
float yv[8];
__m256 x = _mm256_setr_ps(1.0f, 2.0f, 3.0f ,4.0f ,5.0f, 6.0f, 7.0f, 8.0f);
__m256 y = exp256_ps(x);
_mm256_store_ps(xv,x);
_mm256_store_ps(yv,y);
for (i=0;i<8;i++){
printf("i = %i, x = %e, y = %e \n",i,xv[i],yv[i]);
}
return 0;
}
输出好像没问题:
i = 0, x = 1.000000e+00, y = 2.718282e+00
i = 1, x = 2.000000e+00, y = 7.389056e+00
i = 2, x = 3.000000e+00, y = 2.008554e+01
i = 3, x = 4.000000e+00, y = 5.459815e+01
i = 4, x = 5.000000e+00, y = 1.484132e+02
i = 5, x = 6.000000e+00, y = 4.034288e+02
i = 6, x = 7.000000e+00, y = 1.096633e+03
i = 7, x = 8.000000e+00, y = 2.980958e+03
修改后的代码(AVX2)为:
#include <stdio.h>
#include <immintrin.h>
/* gcc -O3 -m64 -Wall -mavx2 -march=broadwell expc.c */
__m256 exp256_ps(__m256 x) {
/* Modified code. The original code is here: https://github.com/reyoung/avx_mathfun
AVX implementation of exp
Based on "sse_mathfun.h", by Julien Pommier
http://gruntthepeon.free.fr/ssemath/
Copyright (C) 2012 Giovanni Garberoglio
Interdisciplinary Laboratory for Computational Science (LISC)
Fondazione Bruno Kessler and University of Trento
via Sommarive, 18
I-38123 Trento (Italy)
This software is provided 'as-is', without any express or implied
warranty. In no event will the authors be held liable for any damages
arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it
freely, subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must not
claim that you wrote the original software. If you use this software
in a product, an acknowledgment in the product documentation would be
appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be
misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
(this is the zlib license)
*/
/*
To increase the compatibility across different compilers the original code is
converted to plain AVX2 intrinsics code without ingenious macro's,
gcc style alignment attributes etc. The modified code requires AVX2
*/
__m256 exp_hi = _mm256_set1_ps(88.3762626647949f);
__m256 exp_lo = _mm256_set1_ps(-88.3762626647949f);
__m256 cephes_LOG2EF = _mm256_set1_ps(1.44269504088896341);
__m256 cephes_exp_C1 = _mm256_set1_ps(0.693359375);
__m256 cephes_exp_C2 = _mm256_set1_ps(-2.12194440e-4);
__m256 cephes_exp_p0 = _mm256_set1_ps(1.9875691500E-4);
__m256 cephes_exp_p1 = _mm256_set1_ps(1.3981999507E-3);
__m256 cephes_exp_p2 = _mm256_set1_ps(8.3334519073E-3);
__m256 cephes_exp_p3 = _mm256_set1_ps(4.1665795894E-2);
__m256 cephes_exp_p4 = _mm256_set1_ps(1.6666665459E-1);
__m256 cephes_exp_p5 = _mm256_set1_ps(5.0000001201E-1);
__m256 tmp = _mm256_setzero_ps(), fx;
__m256i imm0;
__m256 one = _mm256_set1_ps(1.0f);
x = _mm256_min_ps(x, exp_hi);
x = _mm256_max_ps(x, exp_lo);
/* express exp(x) as exp(g + n*log(2)) */
fx = _mm256_mul_ps(x, cephes_LOG2EF);
fx = _mm256_add_ps(fx, _mm256_set1_ps(0.5f));
tmp = _mm256_floor_ps(fx);
__m256 mask = _mm256_cmp_ps(tmp, fx, _CMP_GT_OS);
mask = _mm256_and_ps(mask, one);
fx = _mm256_sub_ps(tmp, mask);
tmp = _mm256_mul_ps(fx, cephes_exp_C1);
__m256 z = _mm256_mul_ps(fx, cephes_exp_C2);
x = _mm256_sub_ps(x, tmp);
x = _mm256_sub_ps(x, z);
z = _mm256_mul_ps(x,x);
__m256 y = cephes_exp_p0;
y = _mm256_mul_ps(y, x);
y = _mm256_add_ps(y, cephes_exp_p1);
y = _mm256_mul_ps(y, x);
y = _mm256_add_ps(y, cephes_exp_p2);
y = _mm256_mul_ps(y, x);
y = _mm256_add_ps(y, cephes_exp_p3);
y = _mm256_mul_ps(y, x);
y = _mm256_add_ps(y, cephes_exp_p4);
y = _mm256_mul_ps(y, x);
y = _mm256_add_ps(y, cephes_exp_p5);
y = _mm256_mul_ps(y, z);
y = _mm256_add_ps(y, x);
y = _mm256_add_ps(y, one);
/* build 2^n */
imm0 = _mm256_cvttps_epi32(fx);
imm0 = _mm256_add_epi32(imm0, _mm256_set1_epi32(0x7f));
imm0 = _mm256_slli_epi32(imm0, 23);
__m256 pow2n = _mm256_castsi256_ps(imm0);
y = _mm256_mul_ps(y, pow2n);
return y;
}
int main(){
int i;
float xv[8];
float yv[8];
__m256 x = _mm256_setr_ps(1.0f, 2.0f, 3.0f ,4.0f ,5.0f, 6.0f, 7.0f, 8.0f);
__m256 y = exp256_ps(x);
_mm256_store_ps(xv,x);
_mm256_store_ps(yv,y);
for (i=0;i<8;i++){
printf("i = %i, x = %e, y = %e \n",i,xv[i],yv[i]);
}
return 0;
}
作为
@Peter Cordes points out,
应该可以将
_mm256_floor_ps(fx + 0.5f) 替换为
_mm256_round_ps(fx)。此外,
mask = _mm256_cmp_ps(tmp, fx, _CMP_GT_OS); 和接下来的两行似乎是多余的。
通过将
cephes_exp_C1 和
cephes_exp_C2 组合成
inv_LOG2EF,可以进一步优化。
这导致下面的代码没有经过彻底的测试!
#include <stdio.h>
#include <immintrin.h>
#include <math.h>
/* gcc -O3 -m64 -Wall -mavx2 -march=broadwell expc.c -lm */
__m256 exp256_ps(__m256 x) {
/* Modified code from this source: https://github.com/reyoung/avx_mathfun
AVX implementation of exp
Based on "sse_mathfun.h", by Julien Pommier
http://gruntthepeon.free.fr/ssemath/
Copyright (C) 2012 Giovanni Garberoglio
Interdisciplinary Laboratory for Computational Science (LISC)
Fondazione Bruno Kessler and University of Trento
via Sommarive, 18
I-38123 Trento (Italy)
This software is provided 'as-is', without any express or implied
warranty. In no event will the authors be held liable for any damages
arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it
freely, subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must not
claim that you wrote the original software. If you use this software
in a product, an acknowledgment in the product documentation would be
appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be
misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
(this is the zlib license)
*/
/*
To increase the compatibility across different compilers the original code is
converted to plain AVX2 intrinsics code without ingenious macro's,
gcc style alignment attributes etc.
Moreover, the part "express exp(x) as exp(g + n*log(2))" has been significantly simplified.
This modified code is not thoroughly tested!
*/
__m256 exp_hi = _mm256_set1_ps(88.3762626647949f);
__m256 exp_lo = _mm256_set1_ps(-88.3762626647949f);
__m256 cephes_LOG2EF = _mm256_set1_ps(1.44269504088896341f);
__m256 inv_LOG2EF = _mm256_set1_ps(0.693147180559945f);
__m256 cephes_exp_p0 = _mm256_set1_ps(1.9875691500E-4);
__m256 cephes_exp_p1 = _mm256_set1_ps(1.3981999507E-3);
__m256 cephes_exp_p2 = _mm256_set1_ps(8.3334519073E-3);
__m256 cephes_exp_p3 = _mm256_set1_ps(4.1665795894E-2);
__m256 cephes_exp_p4 = _mm256_set1_ps(1.6666665459E-1);
__m256 cephes_exp_p5 = _mm256_set1_ps(5.0000001201E-1);
__m256 fx;
__m256i imm0;
__m256 one = _mm256_set1_ps(1.0f);
x = _mm256_min_ps(x, exp_hi);
x = _mm256_max_ps(x, exp_lo);
/* express exp(x) as exp(g + n*log(2)) */
fx = _mm256_mul_ps(x, cephes_LOG2EF);
fx = _mm256_round_ps(fx, _MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC);
__m256 z = _mm256_mul_ps(fx, inv_LOG2EF);
x = _mm256_sub_ps(x, z);
z = _mm256_mul_ps(x,x);
__m256 y = cephes_exp_p0;
y = _mm256_mul_ps(y, x);
y = _mm256_add_ps(y, cephes_exp_p1);
y = _mm256_mul_ps(y, x);
y = _mm256_add_ps(y, cephes_exp_p2);
y = _mm256_mul_ps(y, x);
y = _mm256_add_ps(y, cephes_exp_p3);
y = _mm256_mul_ps(y, x);
y = _mm256_add_ps(y, cephes_exp_p4);
y = _mm256_mul_ps(y, x);
y = _mm256_add_ps(y, cephes_exp_p5);
y = _mm256_mul_ps(y, z);
y = _mm256_add_ps(y, x);
y = _mm256_add_ps(y, one);
/* build 2^n */
imm0 = _mm256_cvttps_epi32(fx);
imm0 = _mm256_add_epi32(imm0, _mm256_set1_epi32(0x7f));
imm0 = _mm256_slli_epi32(imm0, 23);
__m256 pow2n = _mm256_castsi256_ps(imm0);
y = _mm256_mul_ps(y, pow2n);
return y;
}
int main(){
int i;
float xv[8];
float yv[8];
__m256 x = _mm256_setr_ps(11.0f, -12.0f, 13.0f ,-14.0f ,15.0f, -16.0f, 17.0f, -18.0f);
__m256 y = exp256_ps(x);
_mm256_store_ps(xv,x);
_mm256_store_ps(yv,y);
/* compare exp256_ps with the double precision exp from math.h,
print the relative error */
printf("i x y = exp256_ps(x) double precision exp relative error\n\n");
for (i=0;i<8;i++){
printf("i = %i x =%16.9e y =%16.9e exp_dbl =%16.9e rel_err =%16.9e\n",
i,xv[i],yv[i],exp((double)(xv[i])),
((double)(yv[i])-exp((double)(xv[i])))/exp((double)(xv[i])) );
}
return 0;
}
下表通过将 exp256_ps 与来自 math.h 的双精度 exp 进行比较,给出了某些点的准确度印象。
相对误差在最后一列。
i x y = exp256_ps(x) double precision exp relative error
i = 0 x = 1.000000000e+00 y = 2.718281746e+00 exp_dbl = 2.718281828e+00 rel_err =-3.036785947e-08
i = 1 x =-2.000000000e+00 y = 1.353352815e-01 exp_dbl = 1.353352832e-01 rel_err =-1.289636419e-08
i = 2 x = 3.000000000e+00 y = 2.008553696e+01 exp_dbl = 2.008553692e+01 rel_err = 1.672817689e-09
i = 3 x =-4.000000000e+00 y = 1.831563935e-02 exp_dbl = 1.831563889e-02 rel_err = 2.501162103e-08
i = 4 x = 5.000000000e+00 y = 1.484131622e+02 exp_dbl = 1.484131591e+02 rel_err = 2.108215155e-08
i = 5 x =-6.000000000e+00 y = 2.478752285e-03 exp_dbl = 2.478752177e-03 rel_err = 4.380257261e-08
i = 6 x = 7.000000000e+00 y = 1.096633179e+03 exp_dbl = 1.096633158e+03 rel_err = 1.849522682e-08
i = 7 x =-8.000000000e+00 y = 3.354626242e-04 exp_dbl = 3.354626279e-04 rel_err =-1.101575118e-08