From 869eaf6c602dc041f3d1a8bb0bc0e29386d1ae89 Mon Sep 17 00:00:00 2001
From: skal <pascal.massimino@gmail.com>
Date: Mon, 7 Apr 2014 18:02:25 +0200
Subject: [PATCH]  ~30% encoding speedup: use NEON for QuantizeBlock()

also revamped the signature to avoid having to pass the 'first' parameter

Change-Id: Ief9af1747dcfb5db0700b595d0073cebd57542a5
---
 src/dsp/dsp.h        |  2 +-
 src/dsp/enc.c        |  5 ++--
 src/dsp/enc_mips32.c | 17 ++++-------
 src/dsp/enc_neon.c   | 71 +++++++++++++++++++++++++++++++++++++++++++-
 src/dsp/enc_sse2.c   |  9 +++---
 src/enc/quant.c      |  7 +++--
 6 files changed, 87 insertions(+), 24 deletions(-)

diff --git a/src/dsp/dsp.h b/src/dsp/dsp.h
index 4f0666a1..41d48a88 100644
--- a/src/dsp/dsp.h
+++ b/src/dsp/dsp.h
@@ -88,7 +88,7 @@ extern VP8BlockCopy VP8Copy4x4;
 // Quantization
 struct VP8Matrix;   // forward declaration
 typedef int (*VP8QuantizeBlock)(int16_t in[16], int16_t out[16],
-                                int n, const struct VP8Matrix* const mtx);
+                                const struct VP8Matrix* const mtx);
 extern VP8QuantizeBlock VP8EncQuantizeBlock;
 
 // specific to 2nd transform:
diff --git a/src/dsp/enc.c b/src/dsp/enc.c
index a38135e8..66f3a6a9 100644
--- a/src/dsp/enc.c
+++ b/src/dsp/enc.c
@@ -600,9 +600,10 @@ static const uint8_t kZigzag[16] = {
 
 // Simple quantization
 static int QuantizeBlock(int16_t in[16], int16_t out[16],
-                         int n, const VP8Matrix* const mtx) {
+                         const VP8Matrix* const mtx) {
   int last = -1;
-  for (; n < 16; ++n) {
+  int n;
+  for (n = 0; n < 16; ++n) {
     const int j = kZigzag[n];
     const int sign = (in[j] < 0);
     const uint32_t coeff = (sign ? -in[j] : in[j]) + mtx->sharpen_[j];
diff --git a/src/dsp/enc_mips32.c b/src/dsp/enc_mips32.c
index a03441ba..15ef0112 100755
--- a/src/dsp/enc_mips32.c
+++ b/src/dsp/enc_mips32.c
@@ -186,8 +186,7 @@ static void ITransformMIPS32(const uint8_t* ref, const int16_t* in,
   "sh           %[level],       "#N"(%[pout])                       \n\t"
 
 static int QuantizeBlockMIPS32(int16_t in[16], int16_t out[16],
-                               int n, const VP8Matrix* const mtx) {
-  int last;
+                               const VP8Matrix* const mtx) {
   int temp0, temp1, temp2, temp3, temp4, temp5;
   int sign, coeff, level, i;
   int max_level = MAX_LEVEL;
@@ -201,9 +200,7 @@ static int QuantizeBlockMIPS32(int16_t in[16], int16_t out[16],
   const uint32_t* ppbias    = &mtx->bias_[0];
 
   __asm__ volatile(
-    "bnez         %[n],           1f                               \n\t"
     QUANTIZE_ONE( 0,  0,  0)
-  "1:                                                              \n\t"
     QUANTIZE_ONE( 2,  4,  2)
     QUANTIZE_ONE( 8, 16,  4)
     QUANTIZE_ONE(16, 32,  6)
@@ -225,7 +222,7 @@ static int QuantizeBlockMIPS32(int16_t in[16], int16_t out[16],
       [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
       [sign]"=&r"(sign), [coeff]"=&r"(coeff),
       [level]"=&r"(level)
-    : [n]"r"(n), [pout]"r"(pout), [ppin]"r"(ppin),
+    : [pout]"r"(pout), [ppin]"r"(ppin),
       [ppiq]"r"(ppiq), [max_level]"r"(max_level),
       [ppbias]"r"(ppbias), [ppzthresh]"r"(ppzthresh),
       [ppsharpen]"r"(ppsharpen), [ppq]"r"(ppq)
@@ -233,14 +230,10 @@ static int QuantizeBlockMIPS32(int16_t in[16], int16_t out[16],
   );
 
   // moved out from macro to increase possibility for earlier breaking
-  last = -1;
-  for (i = 15; i >= n; i--) {
-    if (out[i]) {
-      last = i;
-      break;
-    }
+  for (i = 15; i >= 0; i--) {
+    if (out[i]) return 1;
   }
-  return (last >= 0);
+  return 0;
 }
 
 #undef QUANTIZE_ONE
diff --git a/src/dsp/enc_neon.c b/src/dsp/enc_neon.c
index 280caa8d..a3507925 100644
--- a/src/dsp/enc_neon.c
+++ b/src/dsp/enc_neon.c
@@ -17,10 +17,10 @@
 
 #define USE_INTRINSICS   // use intrinsics when possible
 
+#include <assert.h>
 #include <arm_neon.h>
 
 #include "../enc/vp8enci.h"
-
 //------------------------------------------------------------------------------
 // Transforms (Paragraph 14.4)
 
@@ -766,6 +766,72 @@ static int SSE4x4(const uint8_t* a, const uint8_t* b) {
 
 #undef LOAD_LANE_32b
 
+//------------------------------------------------------------------------------
+
+// Compilation with gcc4.6.3 is problematic for now. Disable this function then.
+#if (__GNUC__ <= 4 && __GNUC_MINOR__ < 8)
+#define SKIP_QUANTIZE
+#endif
+
+#if !defined(SKIP_QUANTIZE)
+
+static int16x8_t Quantize(int16_t* const in,
+                          const VP8Matrix* const mtx, int offset) {
+  const uint16x8_t sharp = vld1q_u16(&mtx->sharpen_[offset]);
+  const uint16x8_t q = vld1q_u16(&mtx->q_[offset]);
+  const uint16x8_t iq = vld1q_u16(&mtx->iq_[offset]);
+  const uint32x4_t bias0 = vld1q_u32(&mtx->bias_[offset + 0]);
+  const uint32x4_t bias1 = vld1q_u32(&mtx->bias_[offset + 4]);
+
+  const int16x8_t a = vld1q_s16(in + offset);                // in
+  const uint16x8_t b = vreinterpretq_u16_s16(vabsq_s16(a));  // coeff = abs(in)
+  const int16x8_t sign = vshrq_n_s16(a, 15);                 // sign
+  const uint16x8_t c = vaddq_u16(b, sharp);                  // + sharpen
+  const uint32x4_t m0 = vmull_u16(vget_low_u16(c), vget_low_u16(iq));
+  const uint32x4_t m1 = vmull_u16(vget_high_u16(c), vget_high_u16(iq));
+  const uint32x4_t m2 = vhaddq_u32(m0, bias0);
+  const uint32x4_t m3 = vhaddq_u32(m1, bias1);     // (coeff * iQ + bias) >> 1
+  const uint16x8_t c0 = vcombine_u16(vshrn_n_u32(m2, 16),
+                                     vshrn_n_u32(m3, 16));   // QFIX=17 = 16+1
+  const uint16x8_t c1 = vminq_u16(c0, vdupq_n_u16(MAX_LEVEL));
+  const int16x8_t c2 = veorq_s16(vreinterpretq_s16_u16(c1), sign);
+  const int16x8_t c3 = vsubq_s16(c2, sign);                  // restore sign
+  const int16x8_t c4 = vmulq_s16(c3, vreinterpretq_s16_u16(q));
+  vst1q_s16(in + offset, c4);
+  assert(QFIX == 17);  // this function can't work as is if QFIX != 16+1
+  return c3;
+}
+
+static const uint8_t kShuffles[4][8] = {
+ { 0,   1,  2,  3,  8,  9, 16, 17 },
+ { 10, 11,  4,  5,  6,  7, 12, 13 },
+ { 18, 19, 24, 25, 26, 27, 20, 21 },
+ { 14, 15, 22, 23, 28, 29, 30, 31 }
+};
+
+static int QuantizeBlock(int16_t in[16], int16_t out[16],
+                         const VP8Matrix* const mtx) {
+  const int16x8_t out0 = Quantize(in, mtx, 0);
+  const int16x8_t out1 = Quantize(in, mtx, 8);
+  const uint8x8x4_t all_out = {{
+      vreinterpret_u8_s16(vget_low_s16(out0)),
+      vreinterpret_u8_s16(vget_high_s16(out0)),
+      vreinterpret_u8_s16(vget_low_s16(out1)),
+      vreinterpret_u8_s16(vget_high_s16(out1)) }};
+  // Zigzag reordering
+  vst1_u8((uint8_t*)(out +  0), vtbl4_u8(all_out, vld1_u8(kShuffles[0])));
+  vst1_u8((uint8_t*)(out +  4), vtbl4_u8(all_out, vld1_u8(kShuffles[1])));
+  vst1_u8((uint8_t*)(out +  8), vtbl4_u8(all_out, vld1_u8(kShuffles[2])));
+  vst1_u8((uint8_t*)(out + 12), vtbl4_u8(all_out, vld1_u8(kShuffles[3])));
+  // test zeros
+  if (*(uint64_t*)(out +  0) != 0) return 1;
+  if (*(uint64_t*)(out +  4) != 0) return 1;
+  if (*(uint64_t*)(out +  8) != 0) return 1;
+  if (*(uint64_t*)(out + 12) != 0) return 1;
+  return 0;
+}
+#endif   // SKIP_QUANTIZE
+
 #endif   // WEBP_USE_NEON
 
 //------------------------------------------------------------------------------
@@ -788,5 +854,8 @@ void VP8EncDspInitNEON(void) {
   VP8SSE8x8 = SSE8x8;
   VP8SSE4x4 = SSE4x4;
 #endif
+#if !defined(SKIP_QUANTIZE)
+  VP8EncQuantizeBlock = QuantizeBlock;
+#endif
 #endif   // WEBP_USE_NEON
 }
diff --git a/src/dsp/enc_sse2.c b/src/dsp/enc_sse2.c
index 0e9fd181..ecebb4b3 100644
--- a/src/dsp/enc_sse2.c
+++ b/src/dsp/enc_sse2.c
@@ -805,7 +805,7 @@ static int Disto16x16(const uint8_t* const a, const uint8_t* const b,
 
 #define QFIX2 0
 static WEBP_INLINE int DoQuantizeBlock(int16_t in[16], int16_t out[16],
-                                       int n, int shift,
+                                       int shift,
                                        const uint16_t* const sharpen,
                                        const VP8Matrix* const mtx) {
   const __m128i max_coeff_2047 = _mm_set1_epi16(MAX_LEVEL);
@@ -916,18 +916,17 @@ static WEBP_INLINE int DoQuantizeBlock(int16_t in[16], int16_t out[16],
   }
 
   // detect if all 'out' values are zeroes or not
-  if (n) packed_out = _mm_srli_si128(packed_out, 1);   // ignore DC for n == 1
   return (_mm_movemask_epi8(_mm_cmpeq_epi8(packed_out, zero)) != 0xffff);
 }
 
 static int QuantizeBlock(int16_t in[16], int16_t out[16],
-                         int n, const VP8Matrix* const mtx) {
-  return DoQuantizeBlock(in, out, n, 0, &mtx->sharpen_[0], mtx);
+                         const VP8Matrix* const mtx) {
+  return DoQuantizeBlock(in, out, 0, &mtx->sharpen_[0], mtx);
 }
 
 static int QuantizeBlockWHT(int16_t in[16], int16_t out[16],
                             const VP8Matrix* const mtx) {
-  return DoQuantizeBlock(in, out, 0, 0, &mtx->sharpen_[0], mtx);
+  return DoQuantizeBlock(in, out, 0, &mtx->sharpen_[0], mtx);
 }
 
 #endif   // WEBP_USE_SSE2
diff --git a/src/enc/quant.c b/src/enc/quant.c
index 609e3910..c8cdc160 100644
--- a/src/enc/quant.c
+++ b/src/enc/quant.c
@@ -746,7 +746,8 @@ static int ReconstructIntra16(VP8EncIterator* const it,
     }
   } else {
     for (n = 0; n < 16; ++n) {
-      nz |= VP8EncQuantizeBlock(tmp[n], rd->y_ac_levels[n], 1, &dqm->y1_) << n;
+      tmp[n][0] = 0;  // so that nz is correct below
+      nz |= VP8EncQuantizeBlock(tmp[n], rd->y_ac_levels[n], &dqm->y1_) << n;
     }
   }
 
@@ -777,7 +778,7 @@ static int ReconstructIntra4(VP8EncIterator* const it,
     nz = TrellisQuantizeBlock(enc, tmp, levels, ctx, 3, &dqm->y1_,
                               dqm->lambda_trellis_i4_);
   } else {
-    nz = VP8EncQuantizeBlock(tmp, levels, 0, &dqm->y1_);
+    nz = VP8EncQuantizeBlock(tmp, levels, &dqm->y1_);
   }
   VP8ITransform(ref, tmp, yuv_out, 0);
   return nz;
@@ -812,7 +813,7 @@ static int ReconstructUV(VP8EncIterator* const it, VP8ModeScore* const rd,
     }
   } else {
     for (n = 0; n < 8; ++n) {
-      nz |= VP8EncQuantizeBlock(tmp[n], rd->uv_levels[n], 0, &dqm->uv_) << n;
+      nz |= VP8EncQuantizeBlock(tmp[n], rd->uv_levels[n], &dqm->uv_) << n;
     }
   }