If you have a variable number of inputs, you cannot get constant time. You have to iterate over the values at least once, right?
In any case, you can use intrinsics to minimize the number of operations. You have not specified a target architecture or integer size, so I assume AVX2 and a 64-bit integer as the output. Also, for convenience, I assume the inputs are 64 bit.
If your inputs are smaller integer types than the output, you have to add some zero extension.
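For reference, here is a minimal scalar sketch of what the vectorized code below computes (the name accum_scalar is mine; zero-based bit positions assumed):

#include <cstddef>
#include <cstdint>

std::uint64_t accum_scalar(const std::uint64_t* bitpositions, std::size_t n)
{
    std::uint64_t mask = 0;
    for(std::size_t i = 0; i < n; ++i)
        mask |= std::uint64_t(1) << bitpositions[i]; // set the bit at each given position
    return mask;
}

The vectorized version below does the same thing, handling four positions per iteration.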
#include <immintrin.h>
#include <array>
#include <cstdint>
#include <cstdio>
std::uint64_t accum(const std::uint64_t* bitpositions, std::size_t n)
{
// 2 x 64 bit integers set to 1
const __m128i ones2 = _mm_set1_epi64x(1);
// 4 x 64 bit integers set to 1
const __m256i ones4 = _mm256_broadcastsi128_si256(ones2);
// 4 x 64 bit integers serving as partial bit masks
__m256i accum4 = _mm256_setzero_si256();
std::size_t i;
for(i = 0; i + 4 <= n; i += 4) {
// may be replaced with aligned load
__m256i positions = _mm256_loadu_si256((const __m256i*)(bitpositions + i));
// vectorized (1 << position) bit shift
__m256i shifted = _mm256_sllv_epi64(ones4, positions);
// add new bits to accumulator
accum4 = _mm256_or_si256(accum4, shifted);
}
// reduce 4 to 2 64 bit integers
__m128i accum2 = _mm256_castsi256_si128(accum4);
__m128i high2 = _mm256_extracti128_si256(accum4, 1);
if(i + 2 <= n) {
// zero or one iteration with 2 64 bit integers
__m128i positions = _mm_loadu_si128((const __m128i*)(bitpositions + i));
__m128i shifted = _mm_sllv_epi64(ones2, positions);
accum2 = _mm_or_si128(accum2, shifted);
i += 2;
}
// high2 folded in with delay to account for extract latency
accum2 = _mm_or_si128(accum2, high2);
// reduce to 1 64 bit integer
__m128i high1 = _mm_unpackhi_epi64(accum2, accum2);
accum2 = _mm_or_si128(accum2, high1);
std::uint64_t accum1 = static_cast<std::uint64_t>(_mm_cvtsi128_si64(accum2));
if(i < n)
accum1 |= std::uint64_t(1) << bitpositions[i];
return accum1;
}
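A minimal usage sketch, compiled together with the code above (the sample values are mine; positions are zero-based):

int main()
{
    const std::array<std::uint64_t, 5> positions = {0, 3, 5, 9, 63};
    const std::uint64_t mask = accum(positions.data(), positions.size());
    std::printf("%016llx\n", static_cast<unsigned long long>(mask)); // 8000000000000229
}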
Edit
I just noticed that your sample input uses 1-based indexing. So bit 1 would be set by the value 1, and an input value of 0 would presumably be undefined behavior. I suggest switching to zero-based indexing. But if you insist on that notation, just add a _mm256_sub_epi64(positions, ones4) or _mm_sub_epi64(positions, ones2) before the shift.
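As a sketch, the body of the main loop would then look like this (only the subtraction is new):

// 1-based positions: subtract 1 before shifting
__m256i positions = _mm256_loadu_si256((const __m256i*)(bitpositions + i));
__m256i zero_based = _mm256_sub_epi64(positions, ones4);
__m256i shifted = _mm256_sllv_epi64(ones4, zero_based);
accum4 = _mm256_or_si256(accum4, shifted);

The scalar tail would likewise shift by bitpositions[i] - 1.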
Smaller input size
Here is a version for byte-sized input integers.
std::uint64_t accum(const std::uint8_t* bitpositions, std::size_t n)
{
const __m128i ones2 = _mm_set1_epi64x(1);
const __m256i ones4 = _mm256_broadcastsi128_si256(ones2);
__m256i accum4 = _mm256_setzero_si256();
std::size_t i;
for(i = 0; i + 4 <= n; i += 4) {
/*
* As far as I can see, there is no point in loading a full 128 or 256 bit
* vector. To zero-extend more values, we would need to use many shuffle
* instructions and those have a lower throughput than repeated
* 32 bit loads
*/
__m128i positions = _mm_cvtsi32_si128(*(const int*)(bitpositions + i));
__m256i extended = _mm256_cvtepu8_epi64(positions);
__m256i shifted = _mm256_sllv_epi64(ones4, extended);
accum4 = _mm256_or_si256(accum4, shifted);
}
__m128i accum2 = _mm256_castsi256_si128(accum4);
__m128i high2 = _mm256_extracti128_si256(accum4, 1);
accum2 = _mm_or_si128(accum2, high2);
/*
* Until AVX512, there is no single instruction to load 2 bytes into a vector
* register. So we don't bother. Instead, the scalar code below will run up
* to 3 times
*/
__m128i high1 = _mm_unpackhi_epi64(accum2, accum2);
accum2 = _mm_or_si128(accum2, high1);
std::uint64_t accum1 = static_cast<std::uint64_t>(_mm_cvtsi128_si64(accum2));
/*
* We use a separate accumulator to avoid the long dependency chain through
* the reduction above
*/
std::uint64_t tail = 0;
/*
* Compilers create a ton of code if we give them a simple loop because they
* think they can vectorize. So we unroll the loop, even if it is stupid
*/
if(i + 2 <= n) {
tail = std::uint64_t(1) << bitpositions[i++];
tail |= std::uint64_t(1) << bitpositions[i++];
}
if(i < n)
tail |= std::uint64_t(1) << bitpositions[i];
return accum1 | tail;
}
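Both versions need AVX2 enabled at compile time, e.g. with GCC or Clang (the file name is just an example):

g++ -O2 -mavx2 accum.cpp

With MSVC the equivalent option is /arch:AVX2.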