diff --git a/makefile.unix b/makefile.unix
index be3ca08b..6aea2365 100644
--- a/makefile.unix
+++ b/makefile.unix
@@ -200,9 +200,14 @@ HDRS = \
     src/dec/webpi.h \
     src/dsp/dsp.h \
     src/dsp/lossless.h \
+    src/dsp/neon.h \
     src/dsp/yuv.h \
+    src/enc/backward_references.h \
     src/enc/cost.h \
+    src/enc/histogram.h \
     src/enc/vp8enci.h \
+    src/enc/vp8li.h \
+    src/mux/muxi.h \
     src/utils/alpha_processing.h \
     src/utils/bit_reader.h \
     src/utils/bit_writer.h \
@@ -215,6 +220,7 @@ HDRS = \
     src/utils/random.h \
     src/utils/rescaler.h \
     src/utils/thread.h \
+    src/utils/utils.h \
     src/webp/format_constants.h \
     $(HDRS_INSTALLED) \
 
diff --git a/src/dsp/Makefile.am b/src/dsp/Makefile.am
index 284e8f4d..24fb75d7 100644
--- a/src/dsp/Makefile.am
+++ b/src/dsp/Makefile.am
@@ -17,10 +17,11 @@ COMMON_SOURCES += dec_neon.c
 COMMON_SOURCES += dec_sse2.c
 COMMON_SOURCES += dsp.h
 COMMON_SOURCES += lossless.c
+COMMON_SOURCES += lossless.h
 COMMON_SOURCES += lossless_mips32.c
 COMMON_SOURCES += lossless_neon.c
 COMMON_SOURCES += lossless_sse2.c
-COMMON_SOURCES += lossless.h
+COMMON_SOURCES += neon.h
 COMMON_SOURCES += upsampling.c
 COMMON_SOURCES += upsampling_mips32.c
 COMMON_SOURCES += upsampling_neon.c
diff --git a/src/dsp/dec_neon.c b/src/dsp/dec_neon.c
index 2850b465..4d8ea724 100644
--- a/src/dsp/dec_neon.c
+++ b/src/dsp/dec_neon.c
@@ -23,8 +23,7 @@
 // (probably similar to gcc.gnu.org/bugzilla/show_bug.cgi?id=48183)
 #define WORK_AROUND_GCC
 
-#include <arm_neon.h>
-
+#include "./neon.h"
 #include "../dec/vp8i.h"
 
 #define QRegs "q0", "q1", "q2", "q3",                                          \
@@ -1217,16 +1216,15 @@ static void TransformDC(const int16_t* in, uint8_t* dst) {
 
 //------------------------------------------------------------------------------
 
-#define STORE_WHT(dst, col, row01, row23) do {           \
-  *dst = vgetq_lane_s32(row01.val[0], col); (dst) += 16; \
-  *dst = vgetq_lane_s32(row01.val[1], col); (dst) += 16; \
-  *dst = vgetq_lane_s32(row23.val[0], col); (dst) += 16; \
-  *dst = vgetq_lane_s32(row23.val[1], col); (dst) += 16; \
+#define STORE_WHT(dst, col, rows) do {                  \
+  *dst = vgetq_lane_s32(rows.val[0], col); (dst) += 16; \
+  *dst = vgetq_lane_s32(rows.val[1], col); (dst) += 16; \
+  *dst = vgetq_lane_s32(rows.val[2], col); (dst) += 16; \
+  *dst = vgetq_lane_s32(rows.val[3], col); (dst) += 16; \
 } while (0)
 
 static void TransformWHT(const int16_t* in, int16_t* out) {
-  int32x4x2_t tmp0;  // tmp[0..7]
-  int32x4x2_t tmp1;  // tmp[8..15]
+  int32x4x4_t tmp;
 
   {
     // Load the source.
@@ -1238,47 +1236,37 @@ static void TransformWHT(const int16_t* in, int16_t* out) {
     const int32x4_t a1 = vaddl_s16(in04_07, in08_11);  // in[4..7] + in[8..11]
     const int32x4_t a2 = vsubl_s16(in04_07, in08_11);  // in[4..7] - in[8..11]
     const int32x4_t a3 = vsubl_s16(in00_03, in12_15);  // in[0..3] - in[12..15]
-    tmp0.val[0] = vaddq_s32(a0, a1);
-    tmp0.val[1] = vaddq_s32(a3, a2);
-    tmp1.val[0] = vsubq_s32(a0, a1);
-    tmp1.val[1] = vsubq_s32(a3, a2);
+    tmp.val[0] = vaddq_s32(a0, a1);
+    tmp.val[1] = vaddq_s32(a3, a2);
+    tmp.val[2] = vsubq_s32(a0, a1);
+    tmp.val[3] = vsubq_s32(a3, a2);
+    // Arrange the temporary results column-wise.
+    tmp = Transpose4x4(tmp);
   }
 
-  tmp0 = vzipq_s32(tmp0.val[0], tmp0.val[1]);  // 0,  4, 1,  5 |  2,  6,  3,  7
-  tmp1 = vzipq_s32(tmp1.val[0], tmp1.val[1]);  // 8, 12, 9, 13 | 10, 14, 11, 15
-
   {
-    // Arrange the temporary results column-wise.
-    const int32x4_t tmp_0_4_8_12 =
-        vcombine_s32(vget_low_s32(tmp0.val[0]), vget_low_s32(tmp1.val[0]));
-    const int32x4_t tmp_2_6_10_14 =
-        vcombine_s32(vget_low_s32(tmp0.val[1]), vget_low_s32(tmp1.val[1]));
-    const int32x4_t tmp_1_5_9_13 =
-        vcombine_s32(vget_high_s32(tmp0.val[0]), vget_high_s32(tmp1.val[0]));
-    const int32x4_t tmp_3_7_11_15 =
-        vcombine_s32(vget_high_s32(tmp0.val[1]), vget_high_s32(tmp1.val[1]));
-    const int32x4_t three = vdupq_n_s32(3);
-    const int32x4_t dc = vaddq_s32(tmp_0_4_8_12, three);  // add rounder
-    const int32x4_t a0 = vaddq_s32(dc, tmp_3_7_11_15);
-    const int32x4_t a1 = vaddq_s32(tmp_1_5_9_13, tmp_2_6_10_14);
-    const int32x4_t a2 = vsubq_s32(tmp_1_5_9_13, tmp_2_6_10_14);
-    const int32x4_t a3 = vsubq_s32(dc, tmp_3_7_11_15);
+    const int32x4_t kCst3 = vdupq_n_s32(3);
+    const int32x4_t dc = vaddq_s32(tmp.val[0], kCst3);  // add rounder
+    const int32x4_t a0 = vaddq_s32(dc, tmp.val[3]);
+    const int32x4_t a1 = vaddq_s32(tmp.val[1], tmp.val[2]);
+    const int32x4_t a2 = vsubq_s32(tmp.val[1], tmp.val[2]);
+    const int32x4_t a3 = vsubq_s32(dc, tmp.val[3]);
 
-    tmp0.val[0] = vaddq_s32(a0, a1);
-    tmp0.val[1] = vaddq_s32(a3, a2);
-    tmp1.val[0] = vsubq_s32(a0, a1);
-    tmp1.val[1] = vsubq_s32(a3, a2);
+    tmp.val[0] = vaddq_s32(a0, a1);
+    tmp.val[1] = vaddq_s32(a3, a2);
+    tmp.val[2] = vsubq_s32(a0, a1);
+    tmp.val[3] = vsubq_s32(a3, a2);
 
     // right shift the results by 3.
-    tmp0.val[0] = vshrq_n_s32(tmp0.val[0], 3);
-    tmp0.val[1] = vshrq_n_s32(tmp0.val[1], 3);
-    tmp1.val[0] = vshrq_n_s32(tmp1.val[0], 3);
-    tmp1.val[1] = vshrq_n_s32(tmp1.val[1], 3);
+    tmp.val[0] = vshrq_n_s32(tmp.val[0], 3);
+    tmp.val[1] = vshrq_n_s32(tmp.val[1], 3);
+    tmp.val[2] = vshrq_n_s32(tmp.val[2], 3);
+    tmp.val[3] = vshrq_n_s32(tmp.val[3], 3);
 
-    STORE_WHT(out, 0, tmp0, tmp1);
-    STORE_WHT(out, 1, tmp0, tmp1);
-    STORE_WHT(out, 2, tmp0, tmp1);
-    STORE_WHT(out, 3, tmp0, tmp1);
+    STORE_WHT(out, 0, tmp);
+    STORE_WHT(out, 1, tmp);
+    STORE_WHT(out, 2, tmp);
+    STORE_WHT(out, 3, tmp);
   }
 }
 
diff --git a/src/dsp/enc_neon.c b/src/dsp/enc_neon.c
index 170c70b3..1e478825 100644
--- a/src/dsp/enc_neon.c
+++ b/src/dsp/enc_neon.c
@@ -18,8 +18,8 @@
 #define USE_INTRINSICS   // use intrinsics when possible
 
 #include <assert.h>
-#include <arm_neon.h>
 
+#include "./neon.h"
 #include "../enc/vp8enci.h"
 
 //------------------------------------------------------------------------------
@@ -474,38 +474,6 @@ static void FTransform(const uint8_t* src, const uint8_t* ref,
 
 #endif
 
-static WEBP_INLINE int32x4x4_t Transpose4x4(const int32x4x4_t rows) {
-  uint64x2x2_t row01, row23;
-
-  row01.val[0] = vreinterpretq_u64_s32(rows.val[0]);
-  row01.val[1] = vreinterpretq_u64_s32(rows.val[1]);
-  row23.val[0] = vreinterpretq_u64_s32(rows.val[2]);
-  row23.val[1] = vreinterpretq_u64_s32(rows.val[3]);
-  // Transpose 64-bit values (there's no vswp equivalent)
-  {
-    const uint64x1_t row0h = vget_high_u64(row01.val[0]);
-    const uint64x1_t row2l = vget_low_u64(row23.val[0]);
-    const uint64x1_t row1h = vget_high_u64(row01.val[1]);
-    const uint64x1_t row3l = vget_low_u64(row23.val[1]);
-    row01.val[0] = vcombine_u64(vget_low_u64(row01.val[0]), row2l);
-    row23.val[0] = vcombine_u64(row0h, vget_high_u64(row23.val[0]));
-    row01.val[1] = vcombine_u64(vget_low_u64(row01.val[1]), row3l);
-    row23.val[1] = vcombine_u64(row1h, vget_high_u64(row23.val[1]));
-  }
-  {
-    const int32x4x2_t out01 = vtrnq_s32(vreinterpretq_s32_u64(row01.val[0]),
-                                        vreinterpretq_s32_u64(row01.val[1]));
-    const int32x4x2_t out23 = vtrnq_s32(vreinterpretq_s32_u64(row23.val[0]),
-                                        vreinterpretq_s32_u64(row23.val[1]));
-    int32x4x4_t out;
-    out.val[0] = out01.val[0];
-    out.val[1] = out01.val[1];
-    out.val[2] = out23.val[0];
-    out.val[3] = out23.val[1];
-    return out;
-  }
-}
-
 #define LOAD_LANE_16b(VALUE, LANE) do {             \
   (VALUE) = vld1_lane_s16(src, (VALUE), (LANE));    \
   src += stride;                                    \
diff --git a/src/dsp/neon.h b/src/dsp/neon.h
new file mode 100644
index 00000000..96c089ec
--- /dev/null
+++ b/src/dsp/neon.h
@@ -0,0 +1,49 @@
+// Copyright 2014 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+//  NEON common code.
+
+#ifndef WEBP_DSP_NEON_H_
+#define WEBP_DSP_NEON_H_
+
+#include <arm_neon.h>
+
+static WEBP_INLINE int32x4x4_t Transpose4x4(const int32x4x4_t rows) {
+  uint64x2x2_t row01, row23;
+
+  row01.val[0] = vreinterpretq_u64_s32(rows.val[0]);
+  row01.val[1] = vreinterpretq_u64_s32(rows.val[1]);
+  row23.val[0] = vreinterpretq_u64_s32(rows.val[2]);
+  row23.val[1] = vreinterpretq_u64_s32(rows.val[3]);
+  // Transpose 64-bit values (there's no vswp equivalent)
+  {
+    const uint64x1_t row0h = vget_high_u64(row01.val[0]);
+    const uint64x1_t row2l = vget_low_u64(row23.val[0]);
+    const uint64x1_t row1h = vget_high_u64(row01.val[1]);
+    const uint64x1_t row3l = vget_low_u64(row23.val[1]);
+    row01.val[0] = vcombine_u64(vget_low_u64(row01.val[0]), row2l);
+    row23.val[0] = vcombine_u64(row0h, vget_high_u64(row23.val[0]));
+    row01.val[1] = vcombine_u64(vget_low_u64(row01.val[1]), row3l);
+    row23.val[1] = vcombine_u64(row1h, vget_high_u64(row23.val[1]));
+  }
+  {
+    const int32x4x2_t out01 = vtrnq_s32(vreinterpretq_s32_u64(row01.val[0]),
+                                        vreinterpretq_s32_u64(row01.val[1]));
+    const int32x4x2_t out23 = vtrnq_s32(vreinterpretq_s32_u64(row23.val[0]),
+                                        vreinterpretq_s32_u64(row23.val[1]));
+    int32x4x4_t out;
+    out.val[0] = out01.val[0];
+    out.val[1] = out01.val[1];
+    out.val[2] = out23.val[0];
+    out.val[3] = out23.val[1];
+    return out;
+  }
+}
+
+#endif  // WEBP_DSP_NEON_H_