在 cmets 中经过一些思考和讨论后,我认为这是最有效的版本,至少当源和目标数据都在 RAM 中时。不需要AVX2,AVX1就够了。
主要思想是,与存储相比,现代 CPU 可以执行两倍的加载微操作,并且在许多 CPU 上,使用vinsertf128 将内容加载到较高一半的向量中的成本与常规的 16 字节加载相同。与您的宏相比,此版本不再需要这些相对昂贵的(大多数 CPU 上的 3 个延迟周期)vperm2f128 shuffle。
struct Matrix4x4
{
__m256d r0, r1, r2, r3;
};
inline void loadTransposed( Matrix4x4& mat, const double* rsi, size_t stride = 8 )
{
// Load top half of the matrix into low half of 4 registers
__m256d t0 = _mm256_castpd128_pd256( _mm_loadu_pd( rsi ) ); // 00, 01
__m256d t1 = _mm256_castpd128_pd256( _mm_loadu_pd( rsi + 2 ) ); // 02, 03
rsi += stride;
__m256d t2 = _mm256_castpd128_pd256( _mm_loadu_pd( rsi ) ); // 10, 11
__m256d t3 = _mm256_castpd128_pd256( _mm_loadu_pd( rsi + 2 ) ); // 12, 13
rsi += stride;
// Load bottom half of the matrix into high half of these registers
t0 = _mm256_insertf128_pd( t0, _mm_loadu_pd( rsi ), 1 ); // 00, 01, 20, 21
t1 = _mm256_insertf128_pd( t1, _mm_loadu_pd( rsi + 2 ), 1 );// 02, 03, 22, 23
rsi += stride;
t2 = _mm256_insertf128_pd( t2, _mm_loadu_pd( rsi ), 1 ); // 10, 11, 30, 31
t3 = _mm256_insertf128_pd( t3, _mm_loadu_pd( rsi + 2 ), 1 );// 12, 13, 32, 33
// Transpose 2x2 blocks in registers.
// Due to the tricky way we loaded stuff, that's enough to transpose the complete 4x4 matrix.
mat.r0 = _mm256_unpacklo_pd( t0, t2 ); // 00, 10, 20, 30
mat.r1 = _mm256_unpackhi_pd( t0, t2 ); // 01, 11, 21, 31
mat.r2 = _mm256_unpacklo_pd( t1, t3 ); // 02, 12, 22, 32
mat.r3 = _mm256_unpackhi_pd( t1, t3 ); // 03, 13, 23, 33
}
inline void store( const Matrix4x4& mat, double* rdi, size_t stride = 8 )
{
_mm256_storeu_pd( rdi, mat.r0 );
_mm256_storeu_pd( rdi + stride, mat.r1 );
_mm256_storeu_pd( rdi + stride * 2, mat.r2 );
_mm256_storeu_pd( rdi + stride * 3, mat.r3 );
}
// Transpose 8x8 matrix of double values
void transpose8x8( double* rdi, const double* rsi )
{
Matrix4x4 block;
// Top-left corner
loadTransposed( block, rsi );
store( block, rdi );
#if 1
// Using another instance of the block to support in-place transpose
Matrix4x4 block2;
loadTransposed( block, rsi + 4 ); // top right block
loadTransposed( block2, rsi + 8 * 4 ); // bottom left block
store( block2, rdi + 4 );
store( block, rdi + 8 * 4 );
#else
// Flip the #if if you can guarantee ( rsi != rdi )
// Performance is about the same, but this version uses 4 less vector registers,
// slightly more efficient when some registers need to be backed up / restored.
assert( rsi != rdi );
loadTransposed( block, rsi + 4 );
store( block, rdi + 8 * 4 );
loadTransposed( block, rsi + 8 * 4 );
store( block, rdi + 4 );
#endif
// Bottom-right corner
loadTransposed( block, rsi + 8 * 4 + 4 );
store( block, rdi + 8 * 4 + 4 );
}
为了完整起见,这里有一个版本,它使用的代码与您的宏非常相似,加载次数减少了两倍,存储次数相同,并且洗牌次数更多。尚未进行基准测试,但我希望它会稍微慢一些。
struct Matrix4x4
{
__m256d r0, r1, r2, r3;
};
inline void load( Matrix4x4& mat, const double* rsi, size_t stride = 8 )
{
mat.r0 = _mm256_loadu_pd( rsi );
mat.r1 = _mm256_loadu_pd( rsi + stride );
mat.r2 = _mm256_loadu_pd( rsi + stride * 2 );
mat.r3 = _mm256_loadu_pd( rsi + stride * 3 );
}
inline void store( const Matrix4x4& mat, double* rdi, size_t stride = 8 )
{
_mm256_storeu_pd( rdi, mat.r0 );
_mm256_storeu_pd( rdi + stride, mat.r1 );
_mm256_storeu_pd( rdi + stride * 2, mat.r2 );
_mm256_storeu_pd( rdi + stride * 3, mat.r3 );
}
inline void transpose( Matrix4x4& m4 )
{
// These unpack instructions transpose lanes within 2x2 blocks of the matrix
const __m256d t0 = _mm256_unpacklo_pd( m4.r0, m4.r1 );
const __m256d t1 = _mm256_unpacklo_pd( m4.r2, m4.r3 );
const __m256d t2 = _mm256_unpackhi_pd( m4.r0, m4.r1 );
const __m256d t3 = _mm256_unpackhi_pd( m4.r2, m4.r3 );
// Produce the transposed matrix by combining these blocks
m4.r0 = _mm256_permute2f128_pd( t0, t1, 0x20 );
m4.r1 = _mm256_permute2f128_pd( t2, t3, 0x20 );
m4.r2 = _mm256_permute2f128_pd( t0, t1, 0x31 );
m4.r3 = _mm256_permute2f128_pd( t2, t3, 0x31 );
}
// Transpose 8x8 matrix with double values
void transpose8x8( double* rdi, const double* rsi )
{
Matrix4x4 block;
// Top-left corner
load( block, rsi );
transpose( block );
store( block, rdi );
// Using another instance of the block to support in-place transpose, with very small overhead
Matrix4x4 block2;
load( block, rsi + 4 ); // top right block
load( block2, rsi + 8 * 4 ); // bottom left block
transpose( block2 );
store( block2, rdi + 4 );
transpose( block );
store( block, rdi + 8 * 4 );
// Bottom-right corner
load( block, rsi + 8 * 4 + 4 );
transpose( block );
store( block, rdi + 8 * 4 + 4 );
}