diff --git a/src/dsp/dec.c b/src/dsp/dec.c
index 9ae7b6fa..758c6a57 100644
--- a/src/dsp/dec.c
+++ b/src/dsp/dec.c
@@ -426,11 +426,16 @@ static void HE8uv(uint8_t *dst) {    // horizontal
 }
 
 // helper for chroma-DC predictions
-static WEBP_INLINE void Put8x8uv(uint64_t v, uint8_t* dst) {
+static WEBP_INLINE void Put8x8uv(uint8_t value, uint8_t* dst) {
   int j;
+#ifndef WEBP_REFERENCE_IMPLEMENTATION   // writes 8 bytes per uint64_t store
+  const uint64_t v = (uint64_t)value * 0x0101010101010101ULL;
   for (j = 0; j < 8; ++j) {
     *(uint64_t*)(dst + j * BPS) = v;
   }
+#else   // WEBP_REFERENCE_IMPLEMENTATION
+  for (j = 0; j < 8; ++j) memset(dst + j * BPS, value, 8);
+#endif  // WEBP_REFERENCE_IMPLEMENTATION
 }
 
 static void DC8uv(uint8_t *dst) {     // DC
@@ -439,7 +444,7 @@ static void DC8uv(uint8_t *dst) {     // DC
   for (i = 0; i < 8; ++i) {
     dc0 += dst[i - BPS] + dst[-1 + i * BPS];
   }
-  Put8x8uv((uint64_t)((dc0 >> 4) * 0x0101010101010101ULL), dst);
+  Put8x8uv(dc0 >> 4, dst);
 }
 
 static void DC8uvNoLeft(uint8_t *dst) {   // DC with no left samples
@@ -448,7 +453,7 @@ static void DC8uvNoLeft(uint8_t *dst) {   // DC with no left samples
   for (i = 0; i < 8; ++i) {
     dc0 += dst[i - BPS];
   }
-  Put8x8uv((uint64_t)((dc0 >> 3) * 0x0101010101010101ULL), dst);
+  Put8x8uv(dc0 >> 3, dst);
 }
 
 static void DC8uvNoTop(uint8_t *dst) {  // DC with no top samples
@@ -457,11 +462,11 @@ static void DC8uvNoTop(uint8_t *dst) {  // DC with no top samples
   for (i = 0; i < 8; ++i) {
     dc0 += dst[-1 + i * BPS];
   }
-  Put8x8uv((uint64_t)((dc0 >> 3) * 0x0101010101010101ULL), dst);
+  Put8x8uv(dc0 >> 3, dst);
 }
 
 static void DC8uvNoTopLeft(uint8_t *dst) {    // DC with nothing
-  Put8x8uv(0x8080808080808080ULL, dst);
+  Put8x8uv(0x80, dst);   // Put8x8uv() copies this byte to every position
 }
 
 //------------------------------------------------------------------------------
diff --git a/src/dsp/lossless.c b/src/dsp/lossless.c
index f951b897..db8f7584 100644
--- a/src/dsp/lossless.c
+++ b/src/dsp/lossless.c
@@ -1079,20 +1079,27 @@ static void CopyOrSwap(const uint32_t* src, int num_pixels, uint8_t* dst,
     const uint32_t* const src_end = src + num_pixels;
     while (src < src_end) {
       uint32_t argb = *src++;
+
+#if !defined(WEBP_REFERENCE_IMPLEMENTATION)
 #if !defined(__BIG_ENDIAN__) && (defined(__i386__) || defined(__x86_64__))
       __asm__ volatile("bswap %0" : "=r"(argb) : "0"(argb));
       *(uint32_t*)dst = argb;
-      dst += sizeof(argb);
 #elif !defined(__BIG_ENDIAN__) && defined(_MSC_VER)
       argb = _byteswap_ulong(argb);
       *(uint32_t*)dst = argb;
-      dst += sizeof(argb);
 #else
-      *dst++ = (argb >> 24) & 0xff;
-      *dst++ = (argb >> 16) & 0xff;
-      *dst++ = (argb >>  8) & 0xff;
-      *dst++ = (argb >>  0) & 0xff;
+      dst[0] = (argb >> 24) & 0xff;
+      dst[1] = (argb >> 16) & 0xff;
+      dst[2] = (argb >>  8) & 0xff;
+      dst[3] = (argb >>  0) & 0xff;
 #endif
+#else   // WEBP_REFERENCE_IMPLEMENTATION
+      dst[0] = (argb >> 24) & 0xff;   // most significant byte first
+      dst[1] = (argb >> 16) & 0xff;
+      dst[2] = (argb >>  8) & 0xff;
+      dst[3] = (argb >>  0) & 0xff;
+#endif  // !WEBP_REFERENCE_IMPLEMENTATION
+      dst += sizeof(argb);   // advance once here; shared by every branch
     }
   } else {
     memcpy(dst, src, num_pixels * sizeof(*src));
diff --git a/src/utils/bit_reader.h b/src/utils/bit_reader.h
index daf271ef..f1ded6f3 100644
--- a/src/utils/bit_reader.h
+++ b/src/utils/bit_reader.h
@@ -56,20 +56,29 @@ extern "C" {
 // -> we're back to height active 'value_' bits (marked 'v') and BITS cached
 // bits (marked 'B')
 //
-// The right-justify strategy tends to use less shifts, so let's use it:
-
-#define USE_RIGHT_JUSTIFY
+// The right-justify strategy tends to use fewer shifts and is often faster.
 
 //------------------------------------------------------------------------------
 // BITS can be either 32, 24, 16 or 8.
 // Pick values that fit natural register size.
 
+#if !defined(WEBP_REFERENCE_IMPLEMENTATION)
+
+#define USE_RIGHT_JUSTIFY
+
 #if defined(__i386__) || defined(_M_IX86)      // x86 32bit
 #define BITS 16
 #elif defined(__arm__) || defined(_M_ARM)     // ARM
-#define BITS 8
+#define BITS 24
 #else                      // reasonable default
-#define BITS 32
+#define BITS 24
+#endif  // per-architecture BITS
+
+#else     // reference choices
+
+#define USE_RIGHT_JUSTIFY
+#define BITS 8     // smallest of the allowed sizes (32, 24, 16 or 8)
+
 #endif
 
 //------------------------------------------------------------------------------