The vreinterpret_X_Y pattern is used to take an existing register and "cast" the type to another form so it can be passed to a different intrinsic. For example, this code loads two 16-bit signed shorts as one 32-bit unsigned integer in a single load, but I then have to use vreinterpret_s16_u32 because I don't actually want to treat the data as a uint32x2_t; rather, I want it to be an int16x4_t, which is exactly the same size in bytes (i.e. they both map to an __n64 value).
// ptr is an input pointer to two int16_t values
uint32x2_t vInt16 = vld1_dup_u32( reinterpret_cast<const uint32_t*>(ptr) );
int32x4_t vInt = vmovl_s16( vreinterpret_s16_u32(vInt16) );
Note: vreinterpret_X_Y does exactly what _mm_castX_Y does for SSE. Namely, nothing. It emits no code; it just makes the compiler happier about the type change. Notably, Visual Studio's ARM C++ compiler doesn't really do much type checking here anyway, since everything is treated as an __n64 or __n128 type. As such, vreinterpret_X_Y is mostly a code-portability concern.
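For comparison, here is a minimal SSE sketch of the same idea (illustrative variable names, <emmintrin.h> assumed): the cast changes only the type the compiler sees, not the register contents.
// SSE: _mm_castX_Y is likewise a no-op at the instruction level
__m128i vi = _mm_set1_epi32( 0x3F800000 );   // the bit pattern of 1.0f in each lane
__m128  vf = _mm_castsi128_ps( vi );         // no instruction emitted; same bits, now typed as four floats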
The table-lookup intrinsics, however, are a bit special. You have to load up a uint8x8x2_t type, and you can't just cast an existing variable to it.
Note: This also applies to the vtbx, vtrn, vzip, vuzp, vld2+, and vst2+ intrinsics.
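These intrinsics produce or consume the multi-register struct types (uint8x8x2_t, float32x4x2_t, and so on) directly, so you populate or read their .val[] members rather than casting. A minimal sketch, with made-up variable names and <arm_neon.h> assumed:
float32x4_t a = vdupq_n_f32( 1.f );
float32x4_t b = vdupq_n_f32( 2.f );
float32x4x2_t zipped = vzipq_f32( a, b );   // two result registers come back in one struct
float32x4_t lo = zipped.val[0];             // interleaved low halves
float32x4_t hi = zipped.val[1];             // interleaved high halves

float buf[8];
vst2q_f32( buf, zipped );                   // interleaving store consumes the whole struct
float32x4x2_t pair = vld2q_f32( buf );      // de-interleaving load fills both registers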
For example, in DirectXMath I implemented the general ARM-NEON version of XMVectorSwizzle using two vtbl2_u8 lookups:
// DirectXMathVector.inl
inline XMVECTOR XM_CALLCONV XMVectorSwizzle(FXMVECTOR V,
    uint32_t E0, uint32_t E1, uint32_t E2, uint32_t E3)
{
    assert( (E0 < 4) && (E1 < 4) && (E2 < 4) && (E3 < 4) );

    static const uint32_t ControlElement[ 4 ] =
    {
        0x03020100, // XM_SWIZZLE_X
        0x07060504, // XM_SWIZZLE_Y
        0x0B0A0908, // XM_SWIZZLE_Z
        0x0F0E0D0C, // XM_SWIZZLE_W
    };

    // Build the two-register lookup table from the low and high halves of V
    uint8x8x2_t tbl;
    tbl.val[0] = vreinterpret_u8_f32( vget_low_f32(V) );
    tbl.val[1] = vreinterpret_u8_f32( vget_high_f32(V) );

    // Byte-index control words for the first two output elements
    uint32x2_t idx = vcreate_u32( ((uint64_t)ControlElement[E0])
                                | (((uint64_t)ControlElement[E1]) << 32) );
    const uint8x8_t rL = vtbl2_u8( tbl, vreinterpret_u8_u32(idx) );

    // ...and for the last two output elements
    idx = vcreate_u32( ((uint64_t)ControlElement[E2])
                     | (((uint64_t)ControlElement[E3]) << 32) );
    const uint8x8_t rH = vtbl2_u8( tbl, vreinterpret_u8_u32(idx) );

    return vcombine_f32( vreinterpret_f32_u8(rL), vreinterpret_f32_u8(rH) );
}
Similarly, I used vtbl4_u8 for XMVectorPermute.
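I won't reproduce the library code verbatim here, but a sketch of that vtbl4_u8 approach looks roughly like this: the control table grows to eight entries, with byte indices 16-31 selecting from the second vector.
// Sketch of the vtbl4_u8 approach for XMVectorPermute (not the verbatim library code)
inline XMVECTOR XM_CALLCONV XMVectorPermute(FXMVECTOR V1, FXMVECTOR V2,
    uint32_t PermuteX, uint32_t PermuteY, uint32_t PermuteZ, uint32_t PermuteW)
{
    assert( (PermuteX < 8) && (PermuteY < 8) && (PermuteZ < 8) && (PermuteW < 8) );

    static const uint32_t ControlElement[ 8 ] =
    {
        0x03020100, // XM_PERMUTE_0X
        0x07060504, // XM_PERMUTE_0Y
        0x0B0A0908, // XM_PERMUTE_0Z
        0x0F0E0D0C, // XM_PERMUTE_0W
        0x13121110, // XM_PERMUTE_1X
        0x17161514, // XM_PERMUTE_1Y
        0x1B1A1918, // XM_PERMUTE_1Z
        0x1F1E1D1C, // XM_PERMUTE_1W
    };

    // Four-register lookup table spanning both input vectors
    uint8x8x4_t tbl;
    tbl.val[0] = vreinterpret_u8_f32( vget_low_f32(V1) );
    tbl.val[1] = vreinterpret_u8_f32( vget_high_f32(V1) );
    tbl.val[2] = vreinterpret_u8_f32( vget_low_f32(V2) );
    tbl.val[3] = vreinterpret_u8_f32( vget_high_f32(V2) );

    uint32x2_t idx = vcreate_u32( ((uint64_t)ControlElement[PermuteX])
                                | (((uint64_t)ControlElement[PermuteY]) << 32) );
    const uint8x8_t rL = vtbl4_u8( tbl, vreinterpret_u8_u32(idx) );

    idx = vcreate_u32( ((uint64_t)ControlElement[PermuteZ])
                     | (((uint64_t)ControlElement[PermuteW]) << 32) );
    const uint8x8_t rH = vtbl4_u8( tbl, vreinterpret_u8_u32(idx) );

    return vcombine_f32( vreinterpret_f32_u8(rL), vreinterpret_f32_u8(rH) );
}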
Note that while vtbl is quite powerful, it is a bit involved to use. For the "common" swizzle and permute patterns, I implemented template forms of XMVectorSwizzle and XMVectorPermute so I can specialize the cases that don't need a full table lookup:
// General swizzle template
template<uint32_t SwizzleX, uint32_t SwizzleY, uint32_t SwizzleZ, uint32_t SwizzleW>
inline XMVECTOR XMVectorSwizzle(FXMVECTOR V)
{
    static_assert(SwizzleX <= 3, "SwizzleX template parameter out of range");
    static_assert(SwizzleY <= 3, "SwizzleY template parameter out of range");
    static_assert(SwizzleZ <= 3, "SwizzleZ template parameter out of range");
    static_assert(SwizzleW <= 3, "SwizzleW template parameter out of range");
    return XMVectorSwizzle( V, SwizzleX, SwizzleY, SwizzleZ, SwizzleW );
}
// Specialized swizzles
template<> inline XMVECTOR XMVectorSwizzle<0,1,2,3>(FXMVECTOR V)
{ return V; }
template<> inline XMVECTOR XMVectorSwizzle<0,0,0,0>(FXMVECTOR V)
{ return vdupq_lane_f32( vget_low_f32(V), 0); }
template<> inline XMVECTOR XMVectorSwizzle<1,1,1,1>(FXMVECTOR V)
{ return vdupq_lane_f32( vget_low_f32(V), 1); }
template<> inline XMVECTOR XMVectorSwizzle<2,2,2,2>(FXMVECTOR V)
{ return vdupq_lane_f32( vget_high_f32(V), 0); }
template<> inline XMVECTOR XMVectorSwizzle<3,3,3,3>(FXMVECTOR V)
{ return vdupq_lane_f32( vget_high_f32(V), 1); }
template<> inline XMVECTOR XMVectorSwizzle<1,0,3,2>(FXMVECTOR V)
{ return vrev64q_f32(V); }
template<> inline XMVECTOR XMVectorSwizzle<0,1,0,1>(FXMVECTOR V)
{ float32x2_t vt = vget_low_f32(V); return vcombine_f32( vt, vt ); }
template<> inline XMVECTOR XMVectorSwizzle<2,3,2,3>(FXMVECTOR V)
{ float32x2_t vt = vget_high_f32(V); return vcombine_f32( vt, vt ); }
template<> inline XMVECTOR XMVectorSwizzle<1,0,1,0>(FXMVECTOR V)
{ float32x2_t vt = vrev64_f32( vget_low_f32(V) ); return vcombine_f32( vt, vt ); }
template<> inline XMVECTOR XMVectorSwizzle<3,2,3,2>(FXMVECTOR V)
{ float32x2_t vt = vrev64_f32( vget_high_f32(V) ); return vcombine_f32( vt, vt ); }
template<> inline XMVECTOR XMVectorSwizzle<0,1,3,2>(FXMVECTOR V)
{ return vcombine_f32( vget_low_f32(V), vrev64_f32( vget_high_f32(V) ) ); }
template<> inline XMVECTOR XMVectorSwizzle<1,0,2,3>(FXMVECTOR V)
{ return vcombine_f32( vrev64_f32( vget_low_f32(V) ), vget_high_f32(V) ); }
template<> inline XMVECTOR XMVectorSwizzle<2,3,1,0>(FXMVECTOR V)
{ return vcombine_f32( vget_high_f32(V), vrev64_f32( vget_low_f32(V) ) ); }
template<> inline XMVECTOR XMVectorSwizzle<3,2,0,1>(FXMVECTOR V)
{ return vcombine_f32( vrev64_f32( vget_high_f32(V) ), vget_low_f32(V) ); }
template<> inline XMVECTOR XMVectorSwizzle<3,2,1,0>(FXMVECTOR V)
{ return vcombine_f32( vrev64_f32( vget_high_f32(V) ), vrev64_f32( vget_low_f32(V) ) ); }
template<> inline XMVECTOR XMVectorSwizzle<0,0,2,2>(FXMVECTOR V)
{ return vtrnq_f32(V,V).val[0]; }
template<> inline XMVECTOR XMVectorSwizzle<1,1,3,3>(FXMVECTOR V)
{ return vtrnq_f32(V,V).val[1]; }
template<> inline XMVECTOR XMVectorSwizzle<0,0,1,1>(FXMVECTOR V)
{ return vzipq_f32(V,V).val[0]; }
template<> inline XMVECTOR XMVectorSwizzle<2,2,3,3>(FXMVECTOR V)
{ return vzipq_f32(V,V).val[1]; }
template<> inline XMVECTOR XMVectorSwizzle<0,2,0,2>(FXMVECTOR V)
{ return vuzpq_f32(V,V).val[0]; }
template<> inline XMVECTOR XMVectorSwizzle<1,3,1,3>(FXMVECTOR V)
{ return vuzpq_f32(V,V).val[1]; }
template<> inline XMVECTOR XMVectorSwizzle<1,2,3,0>(FXMVECTOR V)
{ return vextq_f32(V, V, 1); }
template<> inline XMVECTOR XMVectorSwizzle<2,3,0,1>(FXMVECTOR V)
{ return vextq_f32(V, V, 2); }
template<> inline XMVECTOR XMVectorSwizzle<3,0,1,2>(FXMVECTOR V)
{ return vextq_f32(V, V, 3); }