VP8LFillBitWindow: enable fast path for 32-bit builds

Also reduce the load size from 64 to 32 bits, as the top 32 bits are
being shifted away in the operation.

The change is neutral speed-wise on x86_64, as is the change in load size
on x86, but it gives a slight improvement on 32-bit ARM.
x86 is improved ~13%, 32-bit ARM ~3.7%.
aarch64 is untested but will likely benefit as well.

Change-Id: Ibcb02a70f46f2651105d7ab571afe352673bef48
This commit is contained in:
James Zern 2014-07-04 13:40:53 -07:00
parent 4f7f52b2a1
commit 6422e683af

View File

@@ -109,6 +109,13 @@ int32_t VP8GetSignedValue(VP8BitReader* const br, int bits) {
#define WBITS 32  // Minimum number of bytes needed after VP8LFillBitWindow.
#define LOG8_WBITS 4  // Number of bytes needed to store WBITS bits.
#if !defined(WEBP_FORCE_ALIGNED) && \
(defined(__arm__) || defined(_M_ARM) || defined(__aarch64__) || \
defined(__i386__) || defined(_M_IX86) || \
defined(__x86_64__) || defined(_M_X64))
#define VP8L_USE_UNALIGNED_LOAD
#endif
static const uint32_t kBitMask[VP8L_MAX_NUM_BIT_READ + 1] = {
  0,
  0x000001, 0x000003, 0x000007, 0x00000f,
@@ -176,17 +183,16 @@ static void ShiftBytes(VP8LBitReader* const br) {
 void VP8LFillBitWindow(VP8LBitReader* const br) {
   if (br->bit_pos_ >= WBITS) {
-    // TODO(jzern): 1) this might be of benefit in 32-bit builds too, along with
-    //    reducing the load size.
-    //    2) given the fixed read size it may be possible to force
-    //    alignment in this block.
-#if !defined(WEBP_FORCE_ALIGNED) && (defined(__x86_64__) || defined(_M_X64))
+    // TODO(jzern): given the fixed read size it may be possible to force
+    // alignment in this block.
+#if defined(VP8L_USE_UNALIGNED_LOAD)
     if (br->pos_ + sizeof(br->val_) < br->len_) {
       br->val_ >>= WBITS;
       br->bit_pos_ -= WBITS;
       // The expression below needs a little-endian arch to work correctly.
       // This gives a large speedup for decoding speed.
-      br->val_ |= *(const vp8l_val_t*)(br->buf_ + br->pos_) << (LBITS - WBITS);
+      br->val_ |= (vp8l_val_t)*(const uint32_t*)(br->buf_ + br->pos_) <<
+                  (LBITS - WBITS);
       br->pos_ += LOG8_WBITS;
       return;
     }