From 32de385eca6506286ffbe87a75006ac20c60f665 Mon Sep 17 00:00:00 2001
From: James Zern <jzern@google.com>
Date: Tue, 21 Oct 2014 18:06:08 +0200
Subject: [PATCH] dec_neon: add VE4 intra predictor

based on SSE2 version, ~59% faster

Change-Id: Iaa2181eb51bd975de0e9fe5c7b66ed18188f0e3b
---
 src/dsp/dec_neon.c | 21 +++++++++++++++++++++
 1 file changed, 21 insertions(+)

diff --git a/src/dsp/dec_neon.c b/src/dsp/dec_neon.c
index 39be3d3a..3ea8e74c 100644
--- a/src/dsp/dec_neon.c
+++ b/src/dsp/dec_neon.c
@@ -1258,6 +1258,25 @@ static void TransformAC3(const int16_t* in, uint8_t* dst) {
 }
 #undef MUL
 
+//------------------------------------------------------------------------------
+// 4x4
+
+static void VE4(uint8_t* dst) {    // vertical: 4 rows <- smoothed top row
+  // NB: avoid vld1_u64 here as an alignment hint may be added -> SIGBUS.
+  const uint64x1_t A0 = vreinterpret_u64_u8(vld1_u8(dst - BPS - 1));  // top row
+  const uint64x1_t A1 = vshr_n_u64(A0, 8);   // drop the lowest byte
+  const uint64x1_t A2 = vshr_n_u64(A0, 16);  // drop the two lowest bytes
+  const uint8x8_t ABCDEFG = vreinterpret_u8_u64(A0);
+  const uint8x8_t BCDEFG_ = vreinterpret_u8_u64(A1);
+  const uint8x8_t CDEFG__ = vreinterpret_u8_u64(A2);
+  const uint8x8_t b = vhadd_u8(ABCDEFG, CDEFG__);  // (A + C) >> 1
+  const uint8x8_t avg = vrhadd_u8(b, BCDEFG_);     // (A + 2B + C + 2) >> 2
+  int i;
+  for (i = 0; i < 4; ++i) {  // write avg[0..3] to each of the 4 rows
+    vst1_lane_u32((uint32_t*)(dst + i * BPS), vreinterpret_u32_u8(avg), 0);
+  }
+}
+
 #endif   // WEBP_USE_NEON
 
 //------------------------------------------------------------------------------
@@ -1288,5 +1307,7 @@ WEBP_TSAN_IGNORE_FUNCTION void VP8DspInitNEON(void) {
   VP8SimpleHFilter16 = SimpleHFilter16;
   VP8SimpleVFilter16i = SimpleVFilter16i;
   VP8SimpleHFilter16i = SimpleHFilter16i;
+
+  VP8PredLuma4[2] = VE4;
 #endif   // WEBP_USE_NEON
 }