这个答案假设目标平台不支持浮点,或者非常慢的浮点支持(可能通过仿真)。
正如在 cmets 中所指出的,计数前导零 (CLZ) 指令可用于提供通过浮点操作数的指数部分提供的快速 log2 功能。 CLZ 也可以在不通过内部函数提供功能的平台上以合理的效率进行仿真,如下所示。
可以从查找表 (LUT) 中提取对几位有用的初始近似值,就像在浮点情况下一样,它可以通过牛顿迭代进一步细化。对于 32 位整数平方根,一到两次迭代通常就足够了。下面的 ISO-C99 代码显示了工作示例性实施,包括详尽的测试。
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <math.h>
uint8_t clz (uint32_t a); // count leading zeros
uint32_t umul_16_16 (uint16_t a, uint16_t b); // 16x16 bit multiply
uint16_t udiv_32_16 (uint32_t x, uint16_t y); // 32/16 bit division
/* LUT for initial square root approximation */
static const uint16_t sqrt_tab[32] =
{
0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
0x85ff, 0x8cff, 0x94ff, 0x9aff, 0xa1ff, 0xa7ff, 0xadff, 0xb3ff,
0xb9ff, 0xbeff, 0xc4ff, 0xc9ff, 0xceff, 0xd3ff, 0xd8ff, 0xdcff,
0xe1ff, 0xe6ff, 0xeaff, 0xeeff, 0xf3ff, 0xf7ff, 0xfbff, 0xffff
};
/* table lookup for initial guess followed by division-based Newton iteration */
uint16_t my_isqrt (uint32_t x)
{
uint16_t q, lz, y, i, xh;
if (x == 0) return x; // early out, code below can't handle zero
// initial guess based on leading 5 bits of argument normalized to 2.30
lz = clz (x);
i = ((x << (lz & ~1)) >> 27);
y = sqrt_tab[i] >> (lz >> 1);
xh = x >> 16; // use for overflow check on divisions
// first Newton iteration, guard against overflow in division
q = 0xffff;
if (xh < y) q = udiv_32_16 (x, y);
y = (q + y) >> 1;
if (lz < 10) {
// second Newton iteration, guard against overflow in division
q = 0xffff;
if (xh < y) q = udiv_32_16 (x, y);
y = (q + y) >> 1;
}
if (umul_16_16 (y, y) > x) y--; // adjust quotient if too large
return y; // (uint16_t)sqrt((double)x)
}
static const uint8_t clz_tab[32] =
{
31, 22, 30, 21, 18, 10, 29, 2, 20, 17, 15, 13, 9, 6, 28, 1,
23, 19, 11, 3, 16, 14, 7, 24, 12, 4, 8, 25, 5, 26, 27, 0
};
/* count leading zeros (for non-zero argument); a machine instruction on many architectures */
uint8_t clz (uint32_t a)
{
a |= a >> 16;
a |= a >> 8;
a |= a >> 4;
a |= a >> 2;
a |= a >> 1;
return clz_tab [0x07c4acdd * a >> 27];
}
/* 16x16->32 bit unsigned multiply; machine instruction on many architectures */
uint32_t umul_16_16 (uint16_t a, uint16_t b)
{
return (uint32_t)a * b;
}
/* 32/16->16 bit division. Note: Will overflow if x[31:16] >= y */
uint16_t udiv_32_16 (uint32_t x, uint16_t y)
{
uint16_t r = x / y;
return r;
}
int main (void)
{
uint32_t x;
uint16_t res, ref;
printf ("testing 32-bit integer square root\n");
x = 0;
do {
ref = (uint16_t)sqrt((double)x);
res = my_isqrt (x);
if (res != ref) {
printf ("error: x=%08x res=%08x ref=%08x\n", x, res, ref);
printf ("exhaustive test FAILED\n");
return EXIT_FAILURE;
}
x++;
} while (x);
printf ("exhaustive test PASSED\n");
return EXIT_SUCCESS;
}