很遗憾,SSE 没有提供直接存储打包的 24 位整数的好方法,所以我们需要自己打包像素数据。
24bpp 像素每个像素占用 3 个字节,而 XMM 寄存器为 16 个字节,这意味着我们需要一次处理 16 个像素(共 3×16 = 48 个字节,恰好是三个完整的 XMM 寄存器),这样就不必担心只存储 XMM 寄存器的一部分。
首先我们需要加载一个 16bpp 数据的向量,然后将其转换成一对 32bpp 数据的向量。我通过将数据解包到一个 uint32 的向量中来做到这一点,然后移动和屏蔽这个向量以提取红色、绿色和蓝色通道。将它们组合在一起是转换为 32bpp 的最后一步。如果速度更快,可以用链接问题中的代码替换,我还没有测量我的解决方案的性能。
一旦我们将 16 个像素转换为 32bpp 像素的向量,就需要将这些向量打包在一起并写入结果数组。我选择单独屏蔽每个像素并使用_mm_bsrli_si128 和_mm_bslli_si128 将其移动到三个结果向量中的每一个中的最终位置。将这些像素中的每一个再次“或”在一起,即可得到打包数据,并将其写入结果数组。
我已经测试过这段代码可以工作,但我还没有进行任何性能测量,如果有更快的方法可以做到这一点,我不会感到惊讶,特别是如果你允许自己使用 SSE2 之外的东西。
这会将红色通道的 24bpp 数据写入 MSB。
#include <inttypes.h>
#include <stdio.h>
#include <stdlib.h>
#include <x86intrin.h>
#define SSE_ALIGN 16
/*
 * Demo: convert a buffer of RGB565 (16bpp) pixels into packed RGB888
 * (24bpp) using only SSE2 intrinsics.
 *
 * 24bpp pixels are 3 bytes each, so 16 pixels = 48 bytes = exactly three
 * full XMM stores; processing 16 pixels per iteration means we never have
 * to store a partial XMM register.  Each 16-byte output chunk is built by
 * masking each 24bpp pixel individually, byte-shifting it into its final
 * position with _mm_bslli_si128/_mm_bsrli_si128, and OR-ing the pieces
 * together.  After widening, each 32bpp lane is 0x00RRGGBB with the 5/6/5
 * source bits placed in the top bits of each 8-bit channel, so the red
 * channel ends up in the most significant payload byte.
 *
 * Returns EXIT_SUCCESS, or EXIT_FAILURE if allocation fails.
 */
int main(int argc, char *argv[]) {
    (void)argc;  // unused
    (void)argv;  // unused
    // Create a small test buffer.
    // We process 16 pixels at a time, so size must be a multiple of 16.
    size_t buf_size = 64;
    uint16_t *rgb565buf = aligned_alloc(SSE_ALIGN, buf_size * sizeof(uint16_t));
    // Buffer to hold the data after translation to 24bpp (3 bytes/pixel).
    // Note: aligned_alloc requires the size to be a multiple of the
    // alignment; 64*2 = 128 and 64*3 = 192 both satisfy that for 16.
    uint8_t *rgb888buf = aligned_alloc(SSE_ALIGN, buf_size * 3 * sizeof(uint8_t));
    // Fix: the original dereferenced these without checking for failure
    // and never freed either buffer.
    if (rgb565buf == NULL || rgb888buf == NULL) {
        fprintf(stderr, "aligned_alloc failed\n");
        free(rgb565buf);  // free(NULL) is a harmless no-op
        free(rgb888buf);
        return EXIT_FAILURE;
    }
    // Fill the source with recognizable ramps in each channel.
    for (size_t i = 0; i < buf_size; i++) {
        uint8_t r = 0x1F & (i + 10);
        uint8_t g = 0x3F & i;
        uint8_t b = 0x1F & (i + 20);
        rgb565buf[i] = (r << 11) | (g << 5) | b;
    }
    // Masks for extracting RGB channels (applied after the per-channel
    // shifts below; each keeps the channel in the top bits of its byte).
    const __m128i mask_r = _mm_set1_epi32(0x00F80000);
    const __m128i mask_g = _mm_set1_epi32(0x0000FC00);
    const __m128i mask_b = _mm_set1_epi32(0x000000F8);
    // Masks for extracting 24bpp pixels for the first 128b write
    const __m128i mask_0_1st = _mm_set_epi32(0, 0, 0, 0x00FFFFFF);
    const __m128i mask_0_2nd = _mm_set_epi32(0, 0, 0x0000FFFF, 0xFF000000);
    const __m128i mask_0_3rd = _mm_set_epi32(0, 0x000000FF, 0xFFFF0000, 0         );
    const __m128i mask_0_4th = _mm_set_epi32(0, 0xFFFFFF00, 0,          0         );
    const __m128i mask_0_5th = _mm_set_epi32(0x00FFFFFF, 0, 0,          0         );
    const __m128i mask_0_6th = _mm_set_epi32(0xFF000000, 0, 0,          0         );
    // Masks for the second write
    const __m128i mask_1_6th  = _mm_set_epi32(0, 0, 0,          0x0000FFFF);
    const __m128i mask_1_7th  = _mm_set_epi32(0, 0, 0x000000FF, 0xFFFF0000);
    const __m128i mask_1_8th  = _mm_set_epi32(0, 0, 0xFFFFFF00, 0         );
    const __m128i mask_1_9th  = _mm_set_epi32(0, 0x00FFFFFF, 0, 0         );
    const __m128i mask_1_10th = _mm_set_epi32(0x0000FFFF, 0xFF000000, 0, 0);
    const __m128i mask_1_11th = _mm_set_epi32(0xFFFF0000, 0, 0,          0);
    // Masks for the third write
    const __m128i mask_2_11th = _mm_set_epi32(0, 0, 0,          0x000000FF);
    const __m128i mask_2_12th = _mm_set_epi32(0, 0, 0,          0xFFFFFF00);
    const __m128i mask_2_13th = _mm_set_epi32(0, 0, 0x00FFFFFF, 0         );
    const __m128i mask_2_14th = _mm_set_epi32(0, 0x0000FFFF, 0xFF000000, 0);
    const __m128i mask_2_15th = _mm_set_epi32(0x000000FF, 0xFFFF0000, 0, 0);
    const __m128i mask_2_16th = _mm_set_epi32(0xFFFFFF00, 0, 0,          0);
    // Convert the RGB565 data into RGB888 data
    __m128i *packed_rgb888_buf = (__m128i*)rgb888buf;
    for (size_t i = 0; i < buf_size; i += 16) {
        // Need to do 16 pixels at a time -> least number of 24bpp pixels that fit evenly in XMM register
        __m128i rgb565pix0_raw = _mm_load_si128((__m128i *)(&rgb565buf[i]));
        __m128i rgb565pix1_raw = _mm_load_si128((__m128i *)(&rgb565buf[i+8]));
        // Extend the 16b ints to 32b ints (zero-extend via unpack with 0)
        __m128i rgb565pix0lo_32b = _mm_unpacklo_epi16(rgb565pix0_raw, _mm_setzero_si128());
        __m128i rgb565pix0hi_32b = _mm_unpackhi_epi16(rgb565pix0_raw, _mm_setzero_si128());
        // Shift each color channel into the correct position and mask off the other bits:
        // R: bits 11..15 << 8 -> bits 19..23; G: bits 5..10 << 5 -> bits 10..15;
        // B: bits 0..4 << 3 -> bits 3..7.
        __m128i rgb888pix0lo_r = _mm_and_si128(mask_r, _mm_slli_epi32(rgb565pix0lo_32b, 8)); // Block 0 low pixels
        __m128i rgb888pix0lo_g = _mm_and_si128(mask_g, _mm_slli_epi32(rgb565pix0lo_32b, 5));
        __m128i rgb888pix0lo_b = _mm_and_si128(mask_b, _mm_slli_epi32(rgb565pix0lo_32b, 3));
        __m128i rgb888pix0hi_r = _mm_and_si128(mask_r, _mm_slli_epi32(rgb565pix0hi_32b, 8)); // Block 0 high pixels
        __m128i rgb888pix0hi_g = _mm_and_si128(mask_g, _mm_slli_epi32(rgb565pix0hi_32b, 5));
        __m128i rgb888pix0hi_b = _mm_and_si128(mask_b, _mm_slli_epi32(rgb565pix0hi_32b, 3));
        // Combine each color channel into a single vector of four 32bpp pixels
        __m128i rgb888pix0lo_32b = _mm_or_si128(rgb888pix0lo_r, _mm_or_si128(rgb888pix0lo_g, rgb888pix0lo_b));
        __m128i rgb888pix0hi_32b = _mm_or_si128(rgb888pix0hi_r, _mm_or_si128(rgb888pix0hi_g, rgb888pix0hi_b));
        // Same thing as above for the next block of pixels
        __m128i rgb565pix1lo_32b = _mm_unpacklo_epi16(rgb565pix1_raw, _mm_setzero_si128());
        __m128i rgb565pix1hi_32b = _mm_unpackhi_epi16(rgb565pix1_raw, _mm_setzero_si128());
        __m128i rgb888pix1lo_r = _mm_and_si128(mask_r, _mm_slli_epi32(rgb565pix1lo_32b, 8)); // Block 1 low pixels
        __m128i rgb888pix1lo_g = _mm_and_si128(mask_g, _mm_slli_epi32(rgb565pix1lo_32b, 5));
        __m128i rgb888pix1lo_b = _mm_and_si128(mask_b, _mm_slli_epi32(rgb565pix1lo_32b, 3));
        __m128i rgb888pix1hi_r = _mm_and_si128(mask_r, _mm_slli_epi32(rgb565pix1hi_32b, 8)); // Block 1 high pixels
        __m128i rgb888pix1hi_g = _mm_and_si128(mask_g, _mm_slli_epi32(rgb565pix1hi_32b, 5));
        __m128i rgb888pix1hi_b = _mm_and_si128(mask_b, _mm_slli_epi32(rgb565pix1hi_32b, 3));
        __m128i rgb888pix1lo_32b = _mm_or_si128(rgb888pix1lo_r, _mm_or_si128(rgb888pix1lo_g, rgb888pix1lo_b));
        __m128i rgb888pix1hi_32b = _mm_or_si128(rgb888pix1hi_r, _mm_or_si128(rgb888pix1hi_g, rgb888pix1hi_b));
        // At this point, rgb888pix_32b contains the pixel data in 32bpp format, need to compress it to 24bpp.
        // Use the _mm_bs*li_si128(__m128i, int) intrinsics to shift each 24bpp pixel into its final position,
        // then mask off the other pixels and combine the result together with OR.
        __m128i pix_0_1st = _mm_and_si128(mask_0_1st, rgb888pix0lo_32b                      ); // First 4 pixels
        __m128i pix_0_2nd = _mm_and_si128(mask_0_2nd, _mm_bsrli_si128(rgb888pix0lo_32b, 1 ));
        __m128i pix_0_3rd = _mm_and_si128(mask_0_3rd, _mm_bsrli_si128(rgb888pix0lo_32b, 2 ));
        __m128i pix_0_4th = _mm_and_si128(mask_0_4th, _mm_bsrli_si128(rgb888pix0lo_32b, 3 ));
        __m128i pix_0_5th = _mm_and_si128(mask_0_5th, _mm_bslli_si128(rgb888pix0hi_32b, 12)); // Second 4 pixels
        __m128i pix_0_6th = _mm_and_si128(mask_0_6th, _mm_bslli_si128(rgb888pix0hi_32b, 11));
        // Combine each piece of 24bpp pixel data into a single 128b variable
        __m128i pix128_0 = _mm_or_si128(_mm_or_si128(_mm_or_si128(pix_0_1st, pix_0_2nd), pix_0_3rd),
                                        _mm_or_si128(_mm_or_si128(pix_0_4th, pix_0_5th), pix_0_6th));
        _mm_store_si128(packed_rgb888_buf, pix128_0);
        // Repeat the same for the second 128b write
        __m128i pix_1_6th  = _mm_and_si128(mask_1_6th,  _mm_bsrli_si128(rgb888pix0hi_32b, 5 ));
        __m128i pix_1_7th  = _mm_and_si128(mask_1_7th,  _mm_bsrli_si128(rgb888pix0hi_32b, 6 ));
        __m128i pix_1_8th  = _mm_and_si128(mask_1_8th,  _mm_bsrli_si128(rgb888pix0hi_32b, 7 ));
        __m128i pix_1_9th  = _mm_and_si128(mask_1_9th,  _mm_bslli_si128(rgb888pix1lo_32b, 8 )); // Third 4 pixels
        __m128i pix_1_10th = _mm_and_si128(mask_1_10th, _mm_bslli_si128(rgb888pix1lo_32b, 7 ));
        __m128i pix_1_11th = _mm_and_si128(mask_1_11th, _mm_bslli_si128(rgb888pix1lo_32b, 6 ));
        __m128i pix128_1 = _mm_or_si128(_mm_or_si128(_mm_or_si128(pix_1_6th, pix_1_7th), pix_1_8th ),
                                        _mm_or_si128(_mm_or_si128(pix_1_9th, pix_1_10th), pix_1_11th));
        _mm_store_si128(packed_rgb888_buf+1, pix128_1);
        // And again for the third 128b write
        __m128i pix_2_11th = _mm_and_si128(mask_2_11th, _mm_bsrli_si128(rgb888pix1lo_32b, 10));
        __m128i pix_2_12th = _mm_and_si128(mask_2_12th, _mm_bsrli_si128(rgb888pix1lo_32b, 11));
        __m128i pix_2_13th = _mm_and_si128(mask_2_13th, _mm_bslli_si128(rgb888pix1hi_32b, 4)); // Fourth 4 pixels
        __m128i pix_2_14th = _mm_and_si128(mask_2_14th, _mm_bslli_si128(rgb888pix1hi_32b, 3));
        __m128i pix_2_15th = _mm_and_si128(mask_2_15th, _mm_bslli_si128(rgb888pix1hi_32b, 2));
        __m128i pix_2_16th = _mm_and_si128(mask_2_16th, _mm_bslli_si128(rgb888pix1hi_32b, 1));
        __m128i pix128_2 = _mm_or_si128(_mm_or_si128(_mm_or_si128(pix_2_11th, pix_2_12th), pix_2_13th),
                                        _mm_or_si128(_mm_or_si128(pix_2_14th, pix_2_15th), pix_2_16th));
        _mm_store_si128(packed_rgb888_buf+2, pix128_2);
        // Update pointer for next iteration (3 stores = 48 bytes = 16 pixels)
        packed_rgb888_buf += 3;
    }
    // Verify: recompute each expected channel value and print both.
    // Output byte order is B,G,R (little-endian 0x00RRGGBB), so index
    // 3*i+2 is red.  Fix: loop index is size_t (was int vs size_t compare).
    for (size_t i = 0; i < buf_size; i++) {
        uint8_t r565 = (i + 10) & 0x1F;
        uint8_t g565 = i & 0x3F;
        uint8_t b565 = (i + 20) & 0x1F;
        printf("%2zu] RGB = (%02x,%02x,%02x), should be (%02x,%02x,%02x)\n", i, rgb888buf[3*i+2],
               rgb888buf[3*i+1], rgb888buf[3*i], r565 << 3, g565 << 2, b565 << 3);
    }
    free(rgb565buf);
    free(rgb888buf);
    return EXIT_SUCCESS;
}
编辑:这是将 32bpp 像素数据压缩为 24bpp 的第二种方法。我没有测试它是否更快,尽管我会假设是因为它执行的指令更少,并且不需要在最后运行 OR 树。但是,它的工作原理一目了然。
在此版本中,使用移位和混洗(shuffle)的组合将每个像素块整体移动到位,而不是单独屏蔽和移动每个像素。用于将 16bpp 转换为 32bpp 的方法不变。
首先,我定义了一个辅助函数,用于在 __m128i 的每一半中左移低位 uint32。
/*
 * Shift the low dword of each 64-bit lane left by one byte, leaving the
 * high dword of each lane untouched.  The low dword is widened to 64 bits
 * before shifting, so its top byte would spill into bits 32..39 of the
 * lane; that is harmless for 0x00RRGGBB pixels, whose top byte is zero.
 */
__m128i bslli_low_dword_once(__m128i x) {
    // Keep only the high dword of each qword lane (dword elements 1 and 3).
    const __m128i hi_dword_mask = _mm_set_epi32(0xFFFFFFFF, 0, 0xFFFFFFFF, 0);
    __m128i kept = _mm_and_si128(x, hi_dword_mask);
    // _mm_mul_epu32 reads just the low dword of each qword lane and yields
    // a 64-bit product, so multiplying by 2^8 is a left shift by 8 bits.
    __m128i shifted = _mm_mul_epu32(x, _mm_set1_epi32(1 << 8));
    return _mm_or_si128(kept, shifted);
}
那么唯一的其他更改是将 32bpp 数据打包成 24bpp 的代码。
// At this point, rgb888pix_32b contains the pixel data in 32bpp format, need to compress it to 24bpp.
// Each of the four input vectors holds four 0x00RRGGBB pixels; the three
// stores below emit the same 48 packed bytes as the mask/OR version.
// Per-vector recipe: bslli_low_dword_once + _mm_srli_epi64 squeezes out the
// zero pad byte inside each qword (leaving 6 packed bytes per qword), then
// _mm_shufflelo_epi16 rotates the low qword so its 2-byte gap sits at the
// front, and a whole-register byte shift makes the 12 pixel bytes contiguous.
//
// First store: pixels 0-3 (block0lo) in bytes 0-11, pixel 4 and the low
// byte of pixel 5 (block0hi) in bytes 12-15.
__m128i pix_0_block0lo = bslli_low_dword_once(rgb888pix0lo_32b);
pix_0_block0lo = _mm_srli_epi64(pix_0_block0lo, 8);
pix_0_block0lo = _mm_shufflelo_epi16(pix_0_block0lo, _MM_SHUFFLE(2, 1, 0, 3));
pix_0_block0lo = _mm_bsrli_si128(pix_0_block0lo, 2);
// Move pixels 4-5 into the high qword, compact, then left-shift so pixel 4
// lands at byte 12 and pixel 5's blue byte at byte 15.
__m128i pix_0_block0hi = _mm_unpacklo_epi64(_mm_setzero_si128(), rgb888pix0hi_32b);
pix_0_block0hi = bslli_low_dword_once(pix_0_block0hi);
pix_0_block0hi = _mm_bslli_si128(pix_0_block0hi, 3);
__m128i pix128_0 = _mm_or_si128(pix_0_block0lo, pix_0_block0hi);
_mm_store_si128(packed_rgb888_buf, pix128_0);
// Do the same basic thing for the next 128b chunk of pixel data:
// remainder of pixel 5 plus pixels 6-7 in bytes 0-7, pixels 8-9 and the
// first two bytes of pixel 10 in bytes 8-15.
__m128i pix_1_block0hi = bslli_low_dword_once(rgb888pix0hi_32b);
pix_1_block0hi = _mm_srli_epi64(pix_1_block0hi, 8);
pix_1_block0hi = _mm_shufflelo_epi16(pix_1_block0hi, _MM_SHUFFLE(2, 1, 0, 3));
pix_1_block0hi = _mm_bsrli_si128(pix_1_block0hi, 6);
__m128i pix_1_block1lo = bslli_low_dword_once(rgb888pix1lo_32b);
pix_1_block1lo = _mm_srli_epi64(pix_1_block1lo, 8);
pix_1_block1lo = _mm_shufflelo_epi16(pix_1_block1lo, _MM_SHUFFLE(2, 1, 0, 3));
pix_1_block1lo = _mm_bslli_si128(pix_1_block1lo, 6);
__m128i pix128_1 = _mm_or_si128(pix_1_block0hi, pix_1_block1lo);
_mm_store_si128(packed_rgb888_buf+1, pix128_1);
// And again for the final chunk: last byte of pixel 10 plus pixel 11 in
// bytes 0-3 (no srli/shuffle needed -- the bsrli by 11 extracts them
// directly from the once-shifted vector), pixels 12-15 in bytes 4-15.
__m128i pix_2_block1lo = bslli_low_dword_once(rgb888pix1lo_32b);
pix_2_block1lo = _mm_bsrli_si128(pix_2_block1lo, 11);
__m128i pix_2_block1hi = bslli_low_dword_once(rgb888pix1hi_32b);
pix_2_block1hi = _mm_srli_epi64(pix_2_block1hi, 8);
pix_2_block1hi = _mm_shufflelo_epi16(pix_2_block1hi, _MM_SHUFFLE(2, 1, 0, 3));
pix_2_block1hi = _mm_bslli_si128(pix_2_block1hi, 2);
__m128i pix128_2 = _mm_or_si128(pix_2_block1lo, pix_2_block1hi);
_mm_store_si128(packed_rgb888_buf+2, pix128_2);