From fe395f0e4dc6a94c681a030ccaf37cbd9af59c5f Mon Sep 17 00:00:00 2001
From: James Zern
Date: Wed, 22 Oct 2014 20:43:36 +0200
Subject: [PATCH] dec_neon: add LD4 intra predictor

based on SSE2 version, ~55% faster

Change-Id: I782282ffc31dcf238890b3ba0decccf1d793dad0
---
 src/dsp/dec_neon.c | 20 ++++++++++++++++++++
 1 file changed, 20 insertions(+)

diff --git a/src/dsp/dec_neon.c b/src/dsp/dec_neon.c
index 3ea8e74c..cffa5441 100644
--- a/src/dsp/dec_neon.c
+++ b/src/dsp/dec_neon.c
@@ -1277,6 +1277,25 @@ static void VE4(uint8_t* dst) {    // vertical
   }
 }
 
+static void LD4(uint8_t* dst) {   // Down-left
+  // Note using the same shift trick as VE4() is slower here.
+  const uint8x8_t ABCDEFGH = vld1_u8(dst - BPS + 0);
+  const uint8x8_t BCDEFGH0 = vld1_u8(dst - BPS + 1);
+  const uint8x8_t CDEFGH00 = vld1_u8(dst - BPS + 2);
+  const uint8x8_t CDEFGHH0 = vset_lane_u8(dst[-BPS + 7], CDEFGH00, 6);
+  const uint8x8_t avg1 = vhadd_u8(ABCDEFGH, CDEFGHH0);
+  const uint8x8_t avg2 = vrhadd_u8(avg1, BCDEFGH0);
+  const uint64x1_t avg2_u64 = vreinterpret_u64_u8(avg2);
+  const uint32x2_t r0 = vreinterpret_u32_u8(avg2);
+  const uint32x2_t r1 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 8));
+  const uint32x2_t r2 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 16));
+  const uint32x2_t r3 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 24));
+  vst1_lane_u32((uint32_t*)(dst + 0 * BPS), r0, 0);
+  vst1_lane_u32((uint32_t*)(dst + 1 * BPS), r1, 0);
+  vst1_lane_u32((uint32_t*)(dst + 2 * BPS), r2, 0);
+  vst1_lane_u32((uint32_t*)(dst + 3 * BPS), r3, 0);
+}
+
 #endif   // WEBP_USE_NEON
 
 //------------------------------------------------------------------------------
@@ -1309,5 +1328,6 @@ WEBP_TSAN_IGNORE_FUNCTION void VP8DspInitNEON(void) {
   VP8SimpleHFilter16i = SimpleHFilter16i;
 
   VP8PredLuma4[2] = VE4;
+  VP8PredLuma4[6] = LD4;
 #endif   // WEBP_USE_NEON
 }
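
Note (not part of the patch): the down-left prediction rule that LD4() vectorizes can be written in scalar C as below. This is a minimal illustrative sketch only; the names LD4_ScalarSketch, AVG3_ and DST_ are made up here and BPS is assumed to be the row stride used by the dsp code. Each output pixel is the rounded three-tap average (a + 2*b + c + 2) >> 2 of three consecutive above/above-right samples, with the last sample H repeated; the vhadd_u8/vrhadd_u8 pair in the NEON code produces the same rounded value.

#include <stdint.h>

#define BPS 32   /* assumed row stride of the prediction buffer */
#define AVG3_(a, b, c) (((a) + 2 * (b) + (c) + 2) >> 2)
#define DST_(x, y) dst[(x) + (y) * BPS]

static void LD4_ScalarSketch(uint8_t* dst) {   /* Down-left, scalar sketch */
  /* Eight samples from the row above: A..H = dst[-BPS + 0..7]. */
  const int A = dst[0 - BPS], B = dst[1 - BPS], C = dst[2 - BPS], D = dst[3 - BPS];
  const int E = dst[4 - BPS], F = dst[5 - BPS], G = dst[6 - BPS], H = dst[7 - BPS];
  /* Each anti-diagonal of the 4x4 block receives one smoothed sample. */
  DST_(0, 0)                                        = AVG3_(A, B, C);
  DST_(1, 0) = DST_(0, 1)                           = AVG3_(B, C, D);
  DST_(2, 0) = DST_(1, 1) = DST_(0, 2)              = AVG3_(C, D, E);
  DST_(3, 0) = DST_(2, 1) = DST_(1, 2) = DST_(0, 3) = AVG3_(D, E, F);
               DST_(3, 1) = DST_(2, 2) = DST_(1, 3) = AVG3_(E, F, G);
                            DST_(3, 2) = DST_(2, 3) = AVG3_(F, G, H);
                                         DST_(3, 3) = AVG3_(G, H, H);  /* H repeated */
}

In the NEON version, avg2 holds these seven smoothed values in lanes 0..6; each output row y then takes the 32-bit word starting at lane y, which is what the 8/16/24-bit shifts of avg2_u64 extract.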