From 3d49871dbe000f16140e9df454156abab571fef0 Mon Sep 17 00:00:00 2001
From: skal <pascal.massimino@gmail.com>
Date: Wed, 9 Apr 2014 18:40:02 +0200
Subject: [PATCH] NEON functions for lossless coding

Verified OK, but right now they don't seem faster.
So they are disabled behind a USE_INTRINSICS flag (off for now)

Change-Id: I72a1c4fa3798f98c1e034f7ca781914c36d3392c
---
 src/dsp/lossless_neon.c | 190 ++++++++++++++++++++++++++++++++++++++++
 1 file changed, 190 insertions(+)

diff --git a/src/dsp/lossless_neon.c b/src/dsp/lossless_neon.c
index d182c5ad..77caf793 100644
--- a/src/dsp/lossless_neon.c
+++ b/src/dsp/lossless_neon.c
@@ -19,6 +19,10 @@
 
 #include "./lossless.h"
 
+// Right now, some intrinsics function seem slower, so we disable it.
+// Uncomment the following for testing:
+// #define USE_INTRINSICS
+
 //------------------------------------------------------------------------------
 // Colorspace conversion functions
 
@@ -140,6 +144,176 @@ static void ConvertBGRAToRGB(const uint32_t* src,
 
 #endif   // gcc-4.8
 
+//------------------------------------------------------------------------------
+
+#ifdef USE_INTRINSICS
+
+static WEBP_INLINE uint32_t Average2(const uint32_t* const a,
+                                     const uint32_t* const b) {
+  const uint64x1_t a0 = { *a }, b0 = { *b };
+  const uint8x8_t a1 = vreinterpret_u8_u64(a0);
+  const uint8x8_t b1 = vreinterpret_u8_u64(b0);
+  const uint8x8_t avg = vhadd_u8(a1, b1);
+  uint32_t ret;
+  vst1_lane_u32(&ret, vreinterpret_u32_u8(avg), 0);
+  return ret;
+}
+
+static WEBP_INLINE uint32_t Average3(const uint32_t* const a,
+                                     const uint32_t* const b,
+                                     const uint32_t* const c) {
+  const uint64x1_t a0 = { *a }, b0 = { *b }, c0 = { *c };
+  const uint8x8_t a1 = vreinterpret_u8_u64(a0);
+  const uint8x8_t b1 = vreinterpret_u8_u64(b0);
+  const uint8x8_t c1 = vreinterpret_u8_u64(c0);
+  const uint8x8_t avg1 = vhadd_u8(a1, c1);
+  const uint8x8_t avg2 = vhadd_u8(avg1, b1);
+  uint32_t ret;
+  vst1_lane_u32(&ret, vreinterpret_u32_u8(avg2), 0);
+  return ret;
+}
+
+static WEBP_INLINE uint32_t Average4(const uint32_t* const a,
+                                     const uint32_t* const b,
+                                     const uint32_t* const c,
+                                     const uint32_t* const d) {
+  const uint64x1_t a0 = { *a }, b0 = { *b }, c0 = { *c }, d0 = { *d };
+  const uint8x8_t a1 = vreinterpret_u8_u64(a0);
+  const uint8x8_t b1 = vreinterpret_u8_u64(b0);
+  const uint8x8_t c1 = vreinterpret_u8_u64(c0);
+  const uint8x8_t d1 = vreinterpret_u8_u64(d0);
+  const uint8x8_t avg1 = vhadd_u8(a1, b1);
+  const uint8x8_t avg2 = vhadd_u8(c1, d1);
+  const uint8x8_t avg3 = vhadd_u8(avg1, avg2);
+  uint32_t ret;
+  vst1_lane_u32(&ret, vreinterpret_u32_u8(avg3), 0);
+  return ret;
+}
+
+static uint32_t Predictor5(uint32_t left, const uint32_t* const top) {
+  return Average3(&left, top + 0, top + 1);
+}
+
+static uint32_t Predictor6(uint32_t left, const uint32_t* const top) {
+  return Average2(&left, top - 1);
+}
+
+static uint32_t Predictor7(uint32_t left, const uint32_t* const top) {
+  return Average2(&left, top + 0);
+}
+
+static uint32_t Predictor8(uint32_t left, const uint32_t* const top) {
+  (void)left;
+  return Average2(top - 1, top + 0);
+}
+
+static uint32_t Predictor9(uint32_t left, const uint32_t* const top) {
+  (void)left;
+  return Average2(top + 0, top + 1);
+}
+
+static uint32_t Predictor10(uint32_t left, const uint32_t* const top) {
+  return Average4(&left, top - 1, top + 0, top + 1);
+}
+
+//------------------------------------------------------------------------------
+
+static WEBP_INLINE uint32_t Select(const uint32_t* const c0,
+                                   const uint32_t* const c1,
+                                   const uint32_t* const c2) {
+  const uint64x1_t C0 = { *c0, 0 }, C1 = { *c1, 0 }, C2 = { *c2, 0 };
+  const uint8x8_t p0 = vreinterpret_u8_u64(C0);
+  const uint8x8_t p1 = vreinterpret_u8_u64(C1);
+  const uint8x8_t p2 = vreinterpret_u8_u64(C2);
+  const uint8x8_t bc = vabd_u8(p1, p2);   // |b-c|
+  const uint8x8_t ac = vabd_u8(p0, p2);   // |a-c|
+  const int16x4_t sum_bc = vreinterpret_s16_u16(vpaddl_u8(bc));
+  const int16x4_t sum_ac = vreinterpret_s16_u16(vpaddl_u8(ac));
+  const int32x2_t diff = vpaddl_s16(vsub_s16(sum_bc, sum_ac));
+  int32_t pa_minus_pb;
+  vst1_lane_s32(&pa_minus_pb, diff, 0);
+  return (pa_minus_pb <= 0) ? *c0 : *c1;
+}
+
+static uint32_t Predictor11(uint32_t left, const uint32_t* const top) {
+  return Select(top + 0, &left, top - 1);
+}
+
+static WEBP_INLINE uint32_t ClampedAddSubtractFull(const uint32_t* const c0,
+                                                   const uint32_t* const c1,
+                                                   const uint32_t* const c2) {
+  const uint64x1_t C0 = { *c0, 0 }, C1 = { *c1, 0 }, C2 = { *c2, 0 };
+  const uint8x8_t p0 = vreinterpret_u8_u64(C0);
+  const uint8x8_t p1 = vreinterpret_u8_u64(C1);
+  const uint8x8_t p2 = vreinterpret_u8_u64(C2);
+  const uint16x8_t sum0 = vaddl_u8(p0, p1);                // add and widen
+  const uint16x8_t sum1 = vqsubq_u16(sum0, vmovl_u8(p2));  // widen and subtract
+  const uint8x8_t out = vqmovn_u16(sum1);                  // narrow and clamp
+  uint32_t ret;
+  vst1_lane_u32(&ret, vreinterpret_u32_u8(out), 0);
+  return ret;
+}
+
+static uint32_t Predictor12(uint32_t left, const uint32_t* const top) {
+  return ClampedAddSubtractFull(&left, top + 0, top - 1);
+}
+
+static WEBP_INLINE uint32_t ClampedAddSubtractHalf(const uint32_t* const c0,
+                                                   const uint32_t* const c1,
+                                                   const uint32_t* const c2) {
+  const uint64x1_t C0 = { *c0, 0 }, C1 = { *c1, 0 }, C2 = { *c2, 0 };
+  const uint8x8_t p0 = vreinterpret_u8_u64(C0);
+  const uint8x8_t p1 = vreinterpret_u8_u64(C1);
+  const uint8x8_t p2 = vreinterpret_u8_u64(C2);
+  const uint8x8_t avg = vhadd_u8(p0, p1);                  // Average(c0,c1)
+  const uint8x8_t ab = vshr_n_u8(vqsub_u8(avg, p2), 1);    // (a-b)>>1 saturated
+  const uint8x8_t ba = vshr_n_u8(vqsub_u8(p2, avg), 1);    // (b-a)>>1 saturated
+  const uint8x8_t out = vqsub_u8(vqadd_u8(avg, ab), ba);
+  uint32_t ret;
+  vst1_lane_u32(&ret, vreinterpret_u32_u8(out), 0);
+  return ret;
+}
+
+static uint32_t Predictor13(uint32_t left, const uint32_t* const top) {
+  return ClampedAddSubtractHalf(&left, top + 0, top - 1);
+}
+
+//------------------------------------------------------------------------------
+// Subtract-Green Transform
+
+// 255 = byte will be zero'd
+static const uint8_t kGreenShuffle[8] = { 1, 255, 1, 255, 5, 255, 5, 255  };
+
+static void SubtractGreenFromBlueAndRed(uint32_t* argb_data, int num_pixels) {
+  const uint32_t* const end = argb_data + (num_pixels & ~3);
+  const uint8x8_t shuffle = vld1_u8(kGreenShuffle);
+  for (; argb_data < end; argb_data += 4) {
+    const uint8x16_t argb = vld1q_u8((uint8_t*)argb_data);
+    const uint8x16_t greens =
+        vcombine_u8(vtbl1_u8(vget_low_u8(argb), shuffle),
+                    vtbl1_u8(vget_high_u8(argb), shuffle));
+    vst1q_u8((uint8_t*)argb_data, vsubq_u8(argb, greens));
+  }
+  // fallthrough and finish off with plain-C
+  VP8LSubtractGreenFromBlueAndRed_C(argb_data, num_pixels & 3);
+}
+
+static void AddGreenToBlueAndRed(uint32_t* argb_data, int num_pixels) {
+  const uint32_t* const end = argb_data + (num_pixels & ~3);
+  const uint8x8_t shuffle = vld1_u8(kGreenShuffle);
+  for (; argb_data < end; argb_data += 4) {
+    const uint8x16_t argb = vld1q_u8((uint8_t*)argb_data);
+    const uint8x16_t greens =
+        vcombine_u8(vtbl1_u8(vget_low_u8(argb), shuffle),
+                    vtbl1_u8(vget_high_u8(argb), shuffle));
+    vst1q_u8((uint8_t*)argb_data, vaddq_u8(argb, greens));
+  }
+  // fallthrough and finish off with plain-C
+  VP8LAddGreenToBlueAndRed_C(argb_data, num_pixels & 3);
+}
+
+#endif   // USE_INTRINSICS
+
 #endif   // WEBP_USE_NEON
 
 //------------------------------------------------------------------------------
@@ -151,6 +325,22 @@ void VP8LDspInitNEON(void) {
   VP8LConvertBGRAToRGBA = ConvertBGRAToRGBA;
   VP8LConvertBGRAToBGR = ConvertBGRAToBGR;
   VP8LConvertBGRAToRGB = ConvertBGRAToRGB;
+
+#ifdef USE_INTRINSICS
+  VP8LPredictors[5] = Predictor5;
+  VP8LPredictors[6] = Predictor6;
+  VP8LPredictors[7] = Predictor7;
+  VP8LPredictors[8] = Predictor8;
+  VP8LPredictors[9] = Predictor9;
+  VP8LPredictors[10] = Predictor10;
+  VP8LPredictors[11] = Predictor11;
+  VP8LPredictors[12] = Predictor12;
+  VP8LPredictors[13] = Predictor13;
+
+  VP8LSubtractGreenFromBlueAndRed = SubtractGreenFromBlueAndRed;
+  VP8LAddGreenToBlueAndRed = AddGreenToBlueAndRed;
+#endif
+
 #endif   // WEBP_USE_NEON
 }