add support for BITS > 32

on x86_64 desktop, it's a little faster to use BITS=56; on MacOS (/llvm), it's _much_ faster (~10%). Change-Id: I47c66ab7488341d8d1696d9301954b86b241b36d
2025-10-27 08:33:03 +01:00 · 2013-03-16 00:06:12 +01:00
parent 9b3db89473
commit 744930dbe2
1 changed files with 36 additions and 11 deletions
--- a/src/utils/bit_reader.h
+++ b/src/utils/bit_reader.h
@@ -59,7 +59,7 @@ extern "C" {
 // The right-justify strategy tends to use less shifts and is often faster.

 //------------------------------------------------------------------------------
-// BITS can be either 32, 24, 16 or 8.
+// BITS can be any multiple of 8 from 8 to 56 (inclusive).
 // Pick values that fit natural register size.

 #if !defined(WEBP_REFERENCE_IMPLEMENTATION)
@@ -68,6 +68,8 @@ extern "C" {

 #if defined(__i386__) || defined(_M_IX86)      // x86 32bit
 #define BITS 16
+#elif defined(__x86_64__) || defined(_M_X64)   // x86 64bit
+#define BITS 56
 #elif defined(__arm__) || defined(_M_ARM)      // ARM
 #define BITS 24
 #else                      // reasonable default
@@ -84,9 +86,15 @@ extern "C" {
 //------------------------------------------------------------------------------
 // Derived types and constants

-#if (BITS == 32)
-typedef uint64_t bit_t;   // natural register type
-typedef uint32_t lbit_t;  // natural type for memory I/O
+// bit_t = natural register type
+// lbit_t = natural type for memory I/O
+
+#if (BITS > 32)
+typedef uint64_t bit_t;
+typedef uint64_t lbit_t;
+#elif (BITS == 32)
+typedef uint64_t bit_t;
+typedef uint32_t lbit_t;
 #elif (BITS == 24)
 typedef uint32_t bit_t;
 typedef uint32_t lbit_t;
@@ -148,19 +156,36 @@ static WEBP_INLINE void VP8LoadNewBytes(VP8BitReader* const br) {
    lbit_t in_bits = *(lbit_t*)br->buf_;
    br->buf_ += (BITS) >> 3;
 #if !defined(__BIG_ENDIAN__)
-#if (BITS == 32) || (BITS == 24)
+#if (BITS > 32)
+// gcc 4.3 has builtin functions for swap32/swap64
+#if defined(__GNUC__) && \
+           (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 3))
+    bits = (bit_t)__builtin_bswap64(in_bits);
+#elif defined(_MSC_VER)
+    bits = (bit_t)_byteswap_uint64(in_bits);
+#elif defined(__x86_64__)
+    __asm__ volatile("bswapq %0" : "=r"(bits) : "0"(in_bits));
+#else  // generic code for swapping 64-bit values (suggested by bdb@)
+    bits = (bit_t)in_bits;
+    bits = ((bits & 0xffffffff00000000ull) >> 32) |
+           ((bits & 0x00000000ffffffffull) << 32);
+    bits = ((bits & 0xffff0000ffff0000ull) >> 16) |
+           ((bits & 0x0000ffff0000ffffull) << 16);
+    bits = ((bits & 0xff00ff00ff00ff00ull) >> 8) |
+           ((bits & 0x00ff00ff00ff00ffull) << 8);
+#endif
+    bits >>= 64 - BITS;
+#elif (BITS >= 24)
 #if defined(__i386__) || defined(__x86_64__)
    __asm__ volatile("bswap %k0" : "=r"(in_bits) : "0"(in_bits));
    bits = (bit_t)in_bits;   // 24b/32b -> 32b/64b zero-extension
 #elif defined(_MSC_VER)
-    bits = _byteswap_ulong(in_bits);
+    bits = (bit_t)_byteswap_ulong(in_bits);
 #else
    bits = (bit_t)(in_bits >> 24) | ((in_bits >> 8) & 0xff00)
         | ((in_bits << 8) & 0xff0000)  | (in_bits << 24);
 #endif  // x86
-#if (BITS == 24)
-    bits >>= 8;
-#endif
+    bits >>= (32 - BITS);
 #elif (BITS == 16)
    // gcc will recognize a 'rorw $8, ...' here:
    bits = (bit_t)(in_bits >> 8) | ((in_bits & 0xff) << 8);
@@ -248,7 +273,7 @@ static WEBP_INLINE int VP8GetBit(VP8BitReader* const br, int prob) {
 }

 static WEBP_INLINE int VP8GetSigned(VP8BitReader* const br, int v) {
-  const bit_t split = (br->range_ >> 1);
+  const range_t split = (br->range_ >> 1);
  const int bit = VP8BitUpdate(br, split);
  VP8Shift(br);
  return bit ? -v : v;