diff --git a/src/utils/bit_reader.c b/src/utils/bit_reader.c
index 25d02b74..87b4f462 100644
--- a/src/utils/bit_reader.c
+++ b/src/utils/bit_reader.c
@@ -109,6 +109,13 @@ int32_t VP8GetSignedValue(VP8BitReader* const br, int bits) {
 #define WBITS 32      // Minimum number of bytes needed after VP8LFillBitWindow.
 #define LOG8_WBITS 4  // Number of bytes needed to store WBITS bits.
 
+#if !defined(WEBP_FORCE_ALIGNED) && \
+    (defined(__arm__) || defined(_M_ARM) || defined(__aarch64__) || \
+     defined(__i386__) || defined(_M_IX86) || \
+     defined(__x86_64__) || defined(_M_X64))
+#define VP8L_USE_UNALIGNED_LOAD
+#endif
+
 static const uint32_t kBitMask[VP8L_MAX_NUM_BIT_READ + 1] = {
   0,
   0x000001, 0x000003, 0x000007, 0x00000f,
@@ -176,17 +183,16 @@ static void ShiftBytes(VP8LBitReader* const br) {
 
 void VP8LFillBitWindow(VP8LBitReader* const br) {
   if (br->bit_pos_ >= WBITS) {
-    // TODO(jzern): 1) this might be of benefit in 32-bit builds too, along with
-    //              reducing the load size.
-    //              2) given the fixed read size it may be possible to force
-    //              alignment in this block.
-#if !defined(WEBP_FORCE_ALIGNED) && (defined(__x86_64__) || defined(_M_X64))
+    // TODO(jzern): given the fixed read size it may be possible to force
+    //              alignment in this block.
+#if defined(VP8L_USE_UNALIGNED_LOAD)
     if (br->pos_ + sizeof(br->val_) < br->len_) {
       br->val_ >>= WBITS;
       br->bit_pos_ -= WBITS;
       // The expression below needs a little-endian arch to work correctly.
       // This gives a large speedup for decoding speed.
-      br->val_ |= *(const vp8l_val_t*)(br->buf_ + br->pos_) << (LBITS - WBITS);
+      br->val_ |= (vp8l_val_t)*(const uint32_t*)(br->buf_ + br->pos_) <<
+                  (LBITS - WBITS);
       br->pos_ += LOG8_WBITS;
       return;
     }
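For reference, the refill pattern the second hunk changes can be sketched in isolation as below. This is a minimal, self-contained approximation, not the library code: the struct, the SKETCH_* macros, and FillBitWindowSketch only mirror VP8LBitReader, WBITS, and LBITS, and memcpy stands in for the patch's direct *(const uint32_t*) cast as a portable way to express the unaligned load. The little-endian assumption noted in the original comment still applies.

#include <stddef.h>
#include <stdint.h>
#include <string.h>

/* Standalone sketch of the fast refill path (hypothetical names). */
typedef struct {
  uint64_t val_;        /* prefetched bit window (LBITS = 64 bits wide) */
  const uint8_t* buf_;  /* input byte stream */
  size_t len_;          /* length of buf_ in bytes */
  size_t pos_;          /* next byte to load from buf_ */
  int bit_pos_;         /* bits of val_ already consumed */
} BitReaderSketch;

#define SKETCH_LBITS 64
#define SKETCH_WBITS 32

static void FillBitWindowSketch(BitReaderSketch* const br) {
  if (br->bit_pos_ >= SKETCH_WBITS &&
      br->pos_ + sizeof(br->val_) < br->len_) {
    uint32_t chunk;
    br->val_ >>= SKETCH_WBITS;        /* drop the 32 consumed bits */
    br->bit_pos_ -= SKETCH_WBITS;
    /* Unaligned 32-bit load; memcpy keeps the sketch portable and is
     * typically lowered to a single load on the architectures the patch
     * enables. Interpreting the bytes this way matches the cast only on
     * little-endian targets. */
    memcpy(&chunk, br->buf_ + br->pos_, sizeof(chunk));
    /* Widen to 64 bits before shifting into the upper half of the window. */
    br->val_ |= (uint64_t)chunk << (SKETCH_LBITS - SKETCH_WBITS);
    br->pos_ += sizeof(chunk);
  }
}

The (vp8l_val_t) cast added in the patch matters for the same reason the sketch widens chunk to 64 bits: with the 64-bit window this path uses, LBITS - WBITS is 32, so shifting an unwidened 32-bit load by that amount would be undefined behavior. The narrower load therefore has to be promoted before it is merged into the upper half of the window.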