【问题标题】:Efficient way to set first N or last N bits of __m256i to 1, the rest to 0(将 __m256i 的前 N 位或后 N 位设置为 1、其余位设置为 0 的高效方法)
【发布时间】:2017-09-03 15:16:36
【问题描述】:

如何使用 AVX2 高效地将 __m256i 中的

  1. 前 N 位
  2. 后 N 位

设置为 1,并将其余位设置为 0?

这是两个相互独立的操作,分别用于处理位范围的头部和尾部,因为位范围可能在某个 __m256i 值的中间开始或结束。位范围中占满整个 __m256i 值的部分,则使用全 0 或全 1 的掩码处理。

【问题讨论】:

    标签: c++ bit-manipulation vectorization x86-64 avx2


    【解决方案1】:

    AVX2 移位指令 vpsllvd 和 vpsrlvd 有一个很好的特性:当移位计数大于或等于 32 时,ymm 寄存器中对应的整数结果为零。换句话说:与 x86 标量移位指令不同,这里的移位计数不会被掩码截断。

    因此代码相当简单:

    /*
    gcc -O3 -m64 -Wall -mavx2 -march=broadwell avx2_bit_mask.c
    */
    #include <immintrin.h>
    #include <inttypes.h>
    #include <stdint.h>
    #include <stdio.h>
    
    /*
     * Build a 256-bit mask whose n most significant bits are 1 and whose
     * remaining bits are 0.
     *
     * Every 32-bit lane receives the left-shift count sat16(lane_top - n),
     * where lane_top is the number of bits from the top of the register down
     * to the bottom of that lane. The variable shift yields 0 for counts >= 32,
     * so lanes lying entirely below the mask become 0, while a count of 0
     * leaves a lane fully set.
     *
     * The subtraction saturates per 16-bit element, so every n in 256..65535
     * produces an all-ones mask; from 65536 on only the low 16 bits of n
     * take part in the computation.
     */
    __m256i bit_mask_avx2_msb(unsigned int n)
    {
        const __m256i lane_top = _mm256_set_epi32(32, 64, 96, 128, 160, 192, 224, 256);
        const __m256i count    = _mm256_set1_epi32(n);
        const __m256i left     = _mm256_subs_epu16(lane_top, count);

        return _mm256_sllv_epi32(_mm256_set1_epi32(-1), left);
    }
    
    
    /*
     * Build a 256-bit mask whose n least significant bits are 1 and whose
     * remaining bits are 0.
     *
     * Every 32-bit lane receives the right-shift count sat16(lane_top - n),
     * where lane_top is the number of bits from the bottom of the register up
     * to the top of that lane. The variable shift yields 0 for counts >= 32,
     * so lanes lying entirely above the mask become 0, while a count of 0
     * leaves a lane fully set.
     *
     * The subtraction saturates per 16-bit element, so every n in 256..65535
     * produces an all-ones mask; from 65536 on only the low 16 bits of n
     * take part in the computation.
     */
    __m256i bit_mask_avx2_lsb(unsigned int n)
    {
        const __m256i lane_top = _mm256_set_epi32(256, 224, 192, 160, 128, 96, 64, 32);
        const __m256i count    = _mm256_set1_epi32(n);
        const __m256i right    = _mm256_subs_epu16(lane_top, count);

        return _mm256_srlv_epi32(_mm256_set1_epi32(-1), right);
    }
    
    
    /*
     * Print a 256-bit value to stdout as four 16-digit upper-case hex words,
     * highest 64 bits first, followed by a newline.
     *
     * Always returns 0.
     */
    int print_avx2_hex(__m256i ymm)
    {
        /*
         * uint64_t keeps the buffer exactly 32 bytes wide. With
         * `unsigned long` the buffer is only 16 bytes on targets where long
         * is 32-bit, and the 32-byte store below would overrun it.
         */
        uint64_t x[4];

        _mm256_storeu_si256((__m256i *)x, ymm);
        printf("%016" PRIX64 " %016" PRIX64 " %016" PRIX64 " %016" PRIX64 "\n",
               x[3], x[2], x[1], x[0]);

        return 0;
    }
    
    
    /*
     * Demo driver: prints the low-bit mask and then the high-bit mask for
     * every n from 0 through 258, one mask per line.
     */
    int main(void)
    {
        /* %3u matches the unsigned counter; the printed text is unchanged. */
        for (unsigned int n = 0; n < 259; n++) {
            printf("bit_mask_avx2_lsb(%3u) ", n);
            print_avx2_hex(bit_mask_avx2_lsb(n));
        }
        printf("\n");

        for (unsigned int n = 0; n < 259; n++) {
            printf("bit_mask_avx2_msb(%3u) ", n);
            print_avx2_hex(bit_mask_avx2_msb(n));
        }
        printf("\n");

        return 0;
    }
    

    结果是:

    $ ./a.out
    bit_mask_avx2_lsb(  0) 0000000000000000 0000000000000000 0000000000000000 0000000000000000
    bit_mask_avx2_lsb(  1) 0000000000000000 0000000000000000 0000000000000000 0000000000000001
    bit_mask_avx2_lsb(  2) 0000000000000000 0000000000000000 0000000000000000 0000000000000003
    bit_mask_avx2_lsb(  3) 0000000000000000 0000000000000000 0000000000000000 0000000000000007
    bit_mask_avx2_lsb(  4) 0000000000000000 0000000000000000 0000000000000000 000000000000000F
    bit_mask_avx2_lsb(  5) 0000000000000000 0000000000000000 0000000000000000 000000000000001F
    bit_mask_avx2_lsb(  6) 0000000000000000 0000000000000000 0000000000000000 000000000000003F
    bit_mask_avx2_lsb(  7) 0000000000000000 0000000000000000 0000000000000000 000000000000007F
    bit_mask_avx2_lsb(  8) 0000000000000000 0000000000000000 0000000000000000 00000000000000FF
    bit_mask_avx2_lsb(  9) 0000000000000000 0000000000000000 0000000000000000 00000000000001FF
    bit_mask_avx2_lsb( 10) 0000000000000000 0000000000000000 0000000000000000 00000000000003FF
    bit_mask_avx2_lsb( 11) 0000000000000000 0000000000000000 0000000000000000 00000000000007FF
    ...
    bit_mask_avx2_lsb(124) 0000000000000000 0000000000000000 0FFFFFFFFFFFFFFF FFFFFFFFFFFFFFFF
    bit_mask_avx2_lsb(125) 0000000000000000 0000000000000000 1FFFFFFFFFFFFFFF FFFFFFFFFFFFFFFF
    bit_mask_avx2_lsb(126) 0000000000000000 0000000000000000 3FFFFFFFFFFFFFFF FFFFFFFFFFFFFFFF
    bit_mask_avx2_lsb(127) 0000000000000000 0000000000000000 7FFFFFFFFFFFFFFF FFFFFFFFFFFFFFFF
    bit_mask_avx2_lsb(128) 0000000000000000 0000000000000000 FFFFFFFFFFFFFFFF FFFFFFFFFFFFFFFF
    bit_mask_avx2_lsb(129) 0000000000000000 0000000000000001 FFFFFFFFFFFFFFFF FFFFFFFFFFFFFFFF
    bit_mask_avx2_lsb(130) 0000000000000000 0000000000000003 FFFFFFFFFFFFFFFF FFFFFFFFFFFFFFFF
    bit_mask_avx2_lsb(131) 0000000000000000 0000000000000007 FFFFFFFFFFFFFFFF FFFFFFFFFFFFFFFF
    bit_mask_avx2_lsb(132) 0000000000000000 000000000000000F FFFFFFFFFFFFFFFF FFFFFFFFFFFFFFFF
    ...
    bit_mask_avx2_lsb(248) 00FFFFFFFFFFFFFF FFFFFFFFFFFFFFFF FFFFFFFFFFFFFFFF FFFFFFFFFFFFFFFF
    bit_mask_avx2_lsb(249) 01FFFFFFFFFFFFFF FFFFFFFFFFFFFFFF FFFFFFFFFFFFFFFF FFFFFFFFFFFFFFFF
    bit_mask_avx2_lsb(250) 03FFFFFFFFFFFFFF FFFFFFFFFFFFFFFF FFFFFFFFFFFFFFFF FFFFFFFFFFFFFFFF
    bit_mask_avx2_lsb(251) 07FFFFFFFFFFFFFF FFFFFFFFFFFFFFFF FFFFFFFFFFFFFFFF FFFFFFFFFFFFFFFF
    bit_mask_avx2_lsb(252) 0FFFFFFFFFFFFFFF FFFFFFFFFFFFFFFF FFFFFFFFFFFFFFFF FFFFFFFFFFFFFFFF
    bit_mask_avx2_lsb(253) 1FFFFFFFFFFFFFFF FFFFFFFFFFFFFFFF FFFFFFFFFFFFFFFF FFFFFFFFFFFFFFFF
    bit_mask_avx2_lsb(254) 3FFFFFFFFFFFFFFF FFFFFFFFFFFFFFFF FFFFFFFFFFFFFFFF FFFFFFFFFFFFFFFF
    bit_mask_avx2_lsb(255) 7FFFFFFFFFFFFFFF FFFFFFFFFFFFFFFF FFFFFFFFFFFFFFFF FFFFFFFFFFFFFFFF
    bit_mask_avx2_lsb(256) FFFFFFFFFFFFFFFF FFFFFFFFFFFFFFFF FFFFFFFFFFFFFFFF FFFFFFFFFFFFFFFF
    bit_mask_avx2_lsb(257) FFFFFFFFFFFFFFFF FFFFFFFFFFFFFFFF FFFFFFFFFFFFFFFF FFFFFFFFFFFFFFFF
    bit_mask_avx2_lsb(258) FFFFFFFFFFFFFFFF FFFFFFFFFFFFFFFF FFFFFFFFFFFFFFFF FFFFFFFFFFFFFFFF
    
    
    
    bit_mask_avx2_msb(  0) 0000000000000000 0000000000000000 0000000000000000 0000000000000000
    bit_mask_avx2_msb(  1) 8000000000000000 0000000000000000 0000000000000000 0000000000000000
    bit_mask_avx2_msb(  2) C000000000000000 0000000000000000 0000000000000000 0000000000000000
    bit_mask_avx2_msb(  3) E000000000000000 0000000000000000 0000000000000000 0000000000000000
    bit_mask_avx2_msb(  4) F000000000000000 0000000000000000 0000000000000000 0000000000000000
    bit_mask_avx2_msb(  5) F800000000000000 0000000000000000 0000000000000000 0000000000000000
    bit_mask_avx2_msb(  6) FC00000000000000 0000000000000000 0000000000000000 0000000000000000
    bit_mask_avx2_msb(  7) FE00000000000000 0000000000000000 0000000000000000 0000000000000000
    bit_mask_avx2_msb(  8) FF00000000000000 0000000000000000 0000000000000000 0000000000000000
    bit_mask_avx2_msb(  9) FF80000000000000 0000000000000000 0000000000000000 0000000000000000
    bit_mask_avx2_msb( 10) FFC0000000000000 0000000000000000 0000000000000000 0000000000000000
    bit_mask_avx2_msb( 11) FFE0000000000000 0000000000000000 0000000000000000 0000000000000000
    ...
    bit_mask_avx2_msb(124) FFFFFFFFFFFFFFFF FFFFFFFFFFFFFFF0 0000000000000000 0000000000000000
    bit_mask_avx2_msb(125) FFFFFFFFFFFFFFFF FFFFFFFFFFFFFFF8 0000000000000000 0000000000000000
    bit_mask_avx2_msb(126) FFFFFFFFFFFFFFFF FFFFFFFFFFFFFFFC 0000000000000000 0000000000000000
    bit_mask_avx2_msb(127) FFFFFFFFFFFFFFFF FFFFFFFFFFFFFFFE 0000000000000000 0000000000000000
    bit_mask_avx2_msb(128) FFFFFFFFFFFFFFFF FFFFFFFFFFFFFFFF 0000000000000000 0000000000000000
    bit_mask_avx2_msb(129) FFFFFFFFFFFFFFFF FFFFFFFFFFFFFFFF 8000000000000000 0000000000000000
    bit_mask_avx2_msb(130) FFFFFFFFFFFFFFFF FFFFFFFFFFFFFFFF C000000000000000 0000000000000000
    bit_mask_avx2_msb(131) FFFFFFFFFFFFFFFF FFFFFFFFFFFFFFFF E000000000000000 0000000000000000
    bit_mask_avx2_msb(132) FFFFFFFFFFFFFFFF FFFFFFFFFFFFFFFF F000000000000000 0000000000000000
    ...
    bit_mask_avx2_msb(248) FFFFFFFFFFFFFFFF FFFFFFFFFFFFFFFF FFFFFFFFFFFFFFFF FFFFFFFFFFFFFF00
    bit_mask_avx2_msb(249) FFFFFFFFFFFFFFFF FFFFFFFFFFFFFFFF FFFFFFFFFFFFFFFF FFFFFFFFFFFFFF80
    bit_mask_avx2_msb(250) FFFFFFFFFFFFFFFF FFFFFFFFFFFFFFFF FFFFFFFFFFFFFFFF FFFFFFFFFFFFFFC0
    bit_mask_avx2_msb(251) FFFFFFFFFFFFFFFF FFFFFFFFFFFFFFFF FFFFFFFFFFFFFFFF FFFFFFFFFFFFFFE0
    bit_mask_avx2_msb(252) FFFFFFFFFFFFFFFF FFFFFFFFFFFFFFFF FFFFFFFFFFFFFFFF FFFFFFFFFFFFFFF0
    bit_mask_avx2_msb(253) FFFFFFFFFFFFFFFF FFFFFFFFFFFFFFFF FFFFFFFFFFFFFFFF FFFFFFFFFFFFFFF8
    bit_mask_avx2_msb(254) FFFFFFFFFFFFFFFF FFFFFFFFFFFFFFFF FFFFFFFFFFFFFFFF FFFFFFFFFFFFFFFC
    bit_mask_avx2_msb(255) FFFFFFFFFFFFFFFF FFFFFFFFFFFFFFFF FFFFFFFFFFFFFFFF FFFFFFFFFFFFFFFE
    bit_mask_avx2_msb(256) FFFFFFFFFFFFFFFF FFFFFFFFFFFFFFFF FFFFFFFFFFFFFFFF FFFFFFFFFFFFFFFF
    bit_mask_avx2_msb(257) FFFFFFFFFFFFFFFF FFFFFFFFFFFFFFFF FFFFFFFFFFFFFFFF FFFFFFFFFFFFFFFF
    bit_mask_avx2_msb(258) FFFFFFFFFFFFFFFF FFFFFFFFFFFFFFFF FFFFFFFFFFFFFFFF FFFFFFFFFFFFFFFF
    

    对于 256 < n ≤ 65535 的 n 值,由于 _mm256_subs_epu16() 采用 16 位饱和运算,代码仍然可以正常工作(所有位均为 1)。对于 n=65536,位掩码(输出值)为零。可以修改代码,使得在 256 ≤ n ≤ INT_MAX 的范围内所有位都被设置为 1。这可以通过将 shift = _mm256_subs_epu16(cnst32_256,shift); 替换为以下代码来实现:

        /* On entry `shift` still holds n broadcast to every 32-bit lane.
           The three steps compute max(cnst32_256 - n, 0) per lane for n in 0..INT_MAX. */
        /* Signed compare: a lane is all ones where cnst32_256 > n, otherwise zero. */
        __m256i mask       = _mm256_cmpgt_epi32(cnst32_256,shift);
        /* Plain 32-bit difference; zero or negative in lanes where n >= cnst32_256. */
                shift      = _mm256_sub_epi32(cnst32_256,shift);
        /* Clear the lanes whose difference is not positive. */
                shift      = _mm256_and_si256(shift,mask);
    

    这三个内在函数大致模拟了 _mm256_subs_epu32(cnst32_256,shift) 的效果,而这个内在函数实际上并不存在。

    【讨论】:

    • 非常酷。这仅比适用于字节掩码(而不是位掩码)的滑动窗口未对齐加载技术稍微贵一点。在某些情况下甚至可能更可取。
    猜你喜欢
    • 2023-03-06
    • 1970-01-01
    • 2014-05-13
    • 1970-01-01
    • 2013-08-26
    • 2021-05-29
    • 2013-01-03
    • 1970-01-01
    • 1970-01-01
    相关资源
    最近更新 更多