diff --git a/src/dsp/enc_neon.c b/src/dsp/enc_neon.c
index a3507925..9af6599e 100644
--- a/src/dsp/enc_neon.c
+++ b/src/dsp/enc_neon.c
@@ -21,6 +21,7 @@
 #include <arm_neon.h>
 
 #include "../enc/vp8enci.h"
+
 //------------------------------------------------------------------------------
 // Transforms (Paragraph 14.4)
 
@@ -251,8 +252,102 @@ static void ITransform(const uint8_t* ref,
   }
 }
 
+// Load all 4x4 pixels into a single uint32x4_t variable.
+static uint8x16_t Load4x4(const uint8_t* src) {
+  uint32x4_t out = { 0, 0, 0, 0 };
+  out = vld1q_lane_u32((const uint32_t*)(src + 0 * BPS), out, 0);
+  out = vld1q_lane_u32((const uint32_t*)(src + 1 * BPS), out, 1);
+  out = vld1q_lane_u32((const uint32_t*)(src + 2 * BPS), out, 2);
+  out = vld1q_lane_u32((const uint32_t*)(src + 3 * BPS), out, 3);
+  return vreinterpretq_u8_u32(out);
+}
+
 // Forward transform.
 
+#if 0  // #ifdef USE_INTRINSICS
+
+static WEBP_INLINE void Transpose4x4_S16(const int16x4_t A, const int16x4_t B,
+                                         const int16x4_t C, const int16x4_t D,
+                                         int16x8_t* const out01,
+                                         int16x8_t* const out32) {
+  const int16x4x2_t AB = vtrn_s16(A, B);
+  const int16x4x2_t CD = vtrn_s16(C, D);
+  const int32x2x2_t tmp02 = vtrn_s32(vreinterpret_s32_s16(AB.val[0]),
+                                     vreinterpret_s32_s16(CD.val[0]));
+  const int32x2x2_t tmp13 = vtrn_s32(vreinterpret_s32_s16(AB.val[1]),
+                                     vreinterpret_s32_s16(CD.val[1]));
+  *out01 = vreinterpretq_s16_s64(
+      vcombine_s64(vreinterpret_s64_s32(tmp02.val[0]),
+                   vreinterpret_s64_s32(tmp13.val[0])));
+  *out32 = vreinterpretq_s16_s64(
+      vcombine_s64(vreinterpret_s64_s32(tmp13.val[1]),
+                   vreinterpret_s64_s32(tmp02.val[1])));
+}
+
+static WEBP_INLINE int16x8_t DiffU8ToS16(const uint8x8_t a,
+                                         const uint8x8_t b) {
+  return vreinterpretq_s16_u16(vsubl_u8(a, b));
+}
+
+static void FTransform(const uint8_t* src, const uint8_t* ref,
+                       int16_t* out) {
+  int16x8_t d0d1, d3d2;   // working 4x4 int16 variables
+  {
+    const uint8x16_t S0 = Load4x4(src);
+    const uint8x16_t R0 = Load4x4(ref);
+    const int16x8_t D0D1 = DiffU8ToS16(vget_low_u8(S0), vget_low_u8(R0));
+    const int16x8_t D2D3 = DiffU8ToS16(vget_high_u8(S0), vget_high_u8(R0));
+    const int16x4_t D0 = vget_low_s16(D0D1);
+    const int16x4_t D1 = vget_high_s16(D0D1);
+    const int16x4_t D2 = vget_low_s16(D2D3);
+    const int16x4_t D3 = vget_high_s16(D2D3);
+    Transpose4x4_S16(D0, D1, D2, D3, &d0d1, &d3d2);
+  }
+  {    // 1rst pass
+    const int32x4_t kCst937 = vdupq_n_s32(937);
+    const int32x4_t kCst1812 = vdupq_n_s32(1812);
+    const int16x8_t a0a1 = vaddq_s16(d0d1, d3d2);   // d0+d3 | d1+d2   (=a0|a1)
+    const int16x8_t a3a2 = vsubq_s16(d0d1, d3d2);   // d0-d3 | d1-d2   (=a3|a2)
+    const int16x8_t a0a1_2 = vshlq_n_s16(a0a1, 3);
+    const int16x4_t tmp0 = vadd_s16(vget_low_s16(a0a1_2),
+                                    vget_high_s16(a0a1_2));
+    const int16x4_t tmp2 = vsub_s16(vget_low_s16(a0a1_2),
+                                    vget_high_s16(a0a1_2));
+    const int32x4_t a3_2217 = vmull_n_s16(vget_low_s16(a3a2), 2217);
+    const int32x4_t a2_2217 = vmull_n_s16(vget_high_s16(a3a2), 2217);
+    const int32x4_t a2_p_a3 = vmlal_n_s16(a2_2217, vget_low_s16(a3a2), 5352);
+    const int32x4_t a3_m_a2 = vmlsl_n_s16(a3_2217, vget_high_s16(a3a2), 5352);
+    const int16x4_t tmp1 = vshrn_n_s32(vaddq_s32(a2_p_a3, kCst1812), 9);
+    const int16x4_t tmp3 = vshrn_n_s32(vaddq_s32(a3_m_a2, kCst937), 9);
+    Transpose4x4_S16(tmp0, tmp1, tmp2, tmp3, &d0d1, &d3d2);
+  }
+  {    // 2nd pass
+    // the (1<<16) addition is for the replacement: a3!=0  <-> 1-(a3==0)
+    const int32x4_t kCst12000 = vdupq_n_s32(12000 + (1 << 16));
+    const int32x4_t kCst51000 = vdupq_n_s32(51000);
+    const int16x8_t a0a1 = vaddq_s16(d0d1, d3d2);   // d0+d3 | d1+d2   (=a0|a1)
+    const int16x8_t a3a2 = vsubq_s16(d0d1, d3d2);   // d0-d3 | d1-d2   (=a3|a2)
+    const int16x4_t a0_k7 = vadd_s16(vget_low_s16(a0a1), vdup_n_s16(7));
+    const int16x4_t out0 = vshr_n_s16(vadd_s16(a0_k7, vget_high_s16(a0a1)), 4);
+    const int16x4_t out2 = vshr_n_s16(vsub_s16(a0_k7, vget_high_s16(a0a1)), 4);
+    const int32x4_t a3_2217 = vmull_n_s16(vget_low_s16(a3a2), 2217);
+    const int32x4_t a2_2217 = vmull_n_s16(vget_high_s16(a3a2), 2217);
+    const int32x4_t a2_p_a3 = vmlal_n_s16(a2_2217, vget_low_s16(a3a2), 5352);
+    const int32x4_t a3_m_a2 = vmlsl_n_s16(a3_2217, vget_high_s16(a3a2), 5352);
+    const int16x4_t tmp1 = vaddhn_s32(a2_p_a3, kCst12000);
+    const int16x4_t out3 = vaddhn_s32(a3_m_a2, kCst51000);
+    const int16x4_t a3_eq_0 =
+        vreinterpret_s16_u16(vceq_s16(vget_low_s16(a3a2), vdup_n_s16(0)));
+    const int16x4_t out1 = vadd_s16(tmp1, a3_eq_0);
+    vst1_s16(out +  0, out0);
+    vst1_s16(out +  4, out1);
+    vst1_s16(out +  8, out2);
+    vst1_s16(out + 12, out3);
+  }
+}
+
+#else
+
 // adapted from vp8/encoder/arm/neon/shortfdct_neon.asm
 static const int16_t kCoeff16[] = {
   5352,  5352,  5352, 5352, 2217,  2217,  2217, 2217
@@ -377,6 +472,8 @@ static void FTransform(const uint8_t* src, const uint8_t* ref,
   );
 }
 
+#endif
+
 static WEBP_INLINE int32x4x4_t Transpose4x4(const int32x4x4_t rows) {
   uint64x2x2_t row01, row23;
 
@@ -737,31 +834,16 @@ static int SSE8x8(const uint8_t* a, const uint8_t* b) {
   return SumToInt(sum);
 }
 
-#define LOAD_LANE_32b(src, VALUE, LANE) \
-    (VALUE) = vld1q_lane_u32((const uint32_t*)(src), (VALUE), (LANE))
-
 static int SSE4x4(const uint8_t* a, const uint8_t* b) {
-  uint32x4_t a0 = { 0, 0, 0, 0 };
-  uint32x4_t b0 = { 0, 0, 0, 0 };
-  // Load all 4x4 pixels into a single uint32x4_t variable.
-  LOAD_LANE_32b(a + 0 * BPS, a0, 0);
-  LOAD_LANE_32b(a + 1 * BPS, a0, 1);
-  LOAD_LANE_32b(a + 2 * BPS, a0, 2);
-  LOAD_LANE_32b(a + 3 * BPS, a0, 3);
-  LOAD_LANE_32b(b + 0 * BPS, b0, 0);
-  LOAD_LANE_32b(b + 1 * BPS, b0, 1);
-  LOAD_LANE_32b(b + 2 * BPS, b0, 2);
-  LOAD_LANE_32b(b + 3 * BPS, b0, 3);
-  {
-    const uint8x16_t abs_diff = vabdq_u8(vreinterpretq_u8_u32(a0),
-                                         vreinterpretq_u8_u32(b0));
-    const uint16x8_t prod_l = vmull_u8(vget_low_u8(abs_diff),
-                                       vget_low_u8(abs_diff));
-    const uint16x8_t prod_h = vmull_u8(vget_high_u8(abs_diff),
-                                       vget_high_u8(abs_diff));
-    const uint32x4_t sum = vpaddlq_u16(vaddq_u16(prod_h, prod_l));
-    return SumToInt(sum);
-  }
+  const uint8x16_t a0 = Load4x4(a);
+  const uint8x16_t b0 = Load4x4(b);
+  const uint8x16_t abs_diff = vabdq_u8(a0, b0);
+  const uint16x8_t prod_l = vmull_u8(vget_low_u8(abs_diff),
+                                     vget_low_u8(abs_diff));
+  const uint16x8_t prod_h = vmull_u8(vget_high_u8(abs_diff),
+                                     vget_high_u8(abs_diff));
+  const uint32x4_t sum = vpaddlq_u16(vaddq_u16(prod_h, prod_l));
+  return SumToInt(sum);
 }
 
 #undef LOAD_LANE_32b