From 6679f8996f2098d36ea34d9be761aff202929723 Mon Sep 17 00:00:00 2001
From: skal <pascal.massimino@gmail.com>
Date: Tue, 3 Jun 2014 06:44:04 +0200
Subject: [PATCH] Optimize VP8SetResidualCoeffs.

Brings down WebP lossy encoding timings by 5%

Change-Id: Ia4a2fab0a887aaaf7841ce6d9ee16270d3e15489
---
 src/dsp/enc_sse2.c | 29 +++++++++++++++++++++++++++++
 src/dsp/lossless.h | 36 +-----------------------------------
 src/enc/cost.c     | 42 ++++++++++++++++++++++++++++++++++--------
 src/enc/cost.h     |  8 +++++++-
 src/enc/webpenc.c  |  1 +
 src/utils/utils.h  | 35 +++++++++++++++++++++++++++++++++++
 6 files changed, 107 insertions(+), 44 deletions(-)

diff --git a/src/dsp/enc_sse2.c b/src/dsp/enc_sse2.c
index ecebb4b3..d4ffd208 100644
--- a/src/dsp/enc_sse2.c
+++ b/src/dsp/enc_sse2.c
@@ -17,7 +17,9 @@
 #include <stdlib.h>  // for abs()
 #include <emmintrin.h>
 
+#include "../enc/cost.h"
 #include "../enc/vp8enci.h"
+#include "../utils/utils.h"
 
 //------------------------------------------------------------------------------
 // Quite useful macro for debugging. Left here for convenience.
@@ -929,6 +931,33 @@ static int QuantizeBlockWHT(int16_t in[16], int16_t out[16],
   return DoQuantizeBlock(in, out, 0, &mtx->sharpen_[0], mtx);
 }
 
+// Sets res->coeffs and res->last: last non-zero index >= first, or -1.
+void VP8SetResidualCoeffsSSE2(const int16_t* const coeffs,
+                              VP8Residual* const res);
+
+void VP8SetResidualCoeffsSSE2(const int16_t* const coeffs,
+                              VP8Residual* const res) {
+  const __m128i c0 = _mm_loadu_si128((const __m128i*)coeffs);
+  const __m128i c1 = _mm_loadu_si128((const __m128i*)(coeffs + 8));
+  // Use SSE to compare 8 values with a single instruction.
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i m0 = _mm_cmpeq_epi16(c0, zero);
+  const __m128i m1 = _mm_cmpeq_epi16(c1, zero);
+  // Get the comparison results as a bitmask, consisting of two times 16 bits:
+  // two identical bits for each result. Concatenate both bitmasks (shifting as
+  // unsigned to avoid signed overflow) into a single 32 bit value. Negate it
+  // to flag the entries that are not equal to zero. Finally, mask out the
+  // least significant bits according to res->first.
+  const uint32_t mask =
+      ~(((uint32_t)_mm_movemask_epi8(m1) << 16) | _mm_movemask_epi8(m0)) &
+      -(1U << (res->first << 1));
+  // The position of the most significant non-zero bit indicates the position of
+  // the last non-zero value. Divide the result by two because _mm_movemask_epi8
+  // operates on 8 bit values instead of 16 bit values.
+  res->last = mask ? (BitsLog2Floor(mask) >> 1) : -1;
+  res->coeffs = coeffs;
+}
+
 #endif   // WEBP_USE_SSE2
 
 //------------------------------------------------------------------------------
diff --git a/src/dsp/lossless.h b/src/dsp/lossless.h
index b99cc09b..e4da705f 100644
--- a/src/dsp/lossless.h
+++ b/src/dsp/lossless.h
@@ -19,6 +19,7 @@
 #include "../webp/decode.h"
 
 #include "../enc/histogram.h"
+#include "../utils/utils.h"
 
 #ifdef __cplusplus
 extern "C" {
@@ -169,41 +170,6 @@ extern VP8LHistogramAddFunc VP8LHistogramAdd;
 // -----------------------------------------------------------------------------
 // PrefixEncode()
 
-// use GNU builtins where available.
-#if defined(__GNUC__) && \
-    ((__GNUC__ == 3 && __GNUC_MINOR__ >= 4) || __GNUC__ >= 4)
-static WEBP_INLINE int BitsLog2Floor(uint32_t n) {
-  return 31 ^ __builtin_clz(n);
-}
-#elif defined(_MSC_VER) && _MSC_VER > 1310 && \
-      (defined(_M_X64) || defined(_M_IX86))
-#include <intrin.h>
-#pragma intrinsic(_BitScanReverse)
-
-static WEBP_INLINE int BitsLog2Floor(uint32_t n) {
-  unsigned long first_set_bit;
-  _BitScanReverse(&first_set_bit, n);
-  return first_set_bit;
-}
-#else
-// Returns (int)floor(log2(n)). n must be > 0.
-static WEBP_INLINE int BitsLog2Floor(uint32_t n) {
-  int log = 0;
-  uint32_t value = n;
-  int i;
-
-  for (i = 4; i >= 0; --i) {
-    const int shift = (1 << i);
-    const uint32_t x = value >> shift;
-    if (x != 0) {
-      value = x;
-      log += shift;
-    }
-  }
-  return log;
-}
-#endif
-
 static WEBP_INLINE int VP8LBitsLog2Ceiling(uint32_t n) {
   const int log_floor = BitsLog2Floor(n);
   if (n == (n & ~(n - 1)))  // zero or a power of two.
diff --git a/src/enc/cost.c b/src/enc/cost.c
index 9d6a490d..73c157dc 100644
--- a/src/enc/cost.c
+++ b/src/enc/cost.c
@@ -13,6 +13,12 @@
 
 #include "./cost.h"
 
+#if defined(WEBP_USE_SSE2)
+#include <emmintrin.h>
+#endif  // WEBP_USE_SSE2
+
+#include "../utils/utils.h"
+
 //------------------------------------------------------------------------------
 // Boolean-cost cost table
 
@@ -536,15 +542,13 @@ extern int VP8GetResidualCostMIPS32(int ctx0, const VP8Residual* const res);
 VP8GetResidualCostFunc VP8GetResidualCost;
 
 void VP8GetResidualCostInit(void) {
-  if (VP8GetResidualCost == NULL) {
-    VP8GetResidualCost = GetResidualCost;
-    if (VP8GetCPUInfo != NULL) {
+  VP8GetResidualCost = GetResidualCost;
+  if (VP8GetCPUInfo != NULL) {
 #if defined(WEBP_USE_MIPS32)
-      if (VP8GetCPUInfo(kMIPS32)) {
-        VP8GetResidualCost = VP8GetResidualCostMIPS32;
-      }
-#endif
+    if (VP8GetCPUInfo(kMIPS32)) {
+      VP8GetResidualCost = VP8GetResidualCostMIPS32;
     }
+#endif
   }
 }
 
@@ -560,7 +564,8 @@ void VP8InitResidual(int first, int coeff_type,
   res->first = first;
 }
 
-void VP8SetResidualCoeffs(const int16_t* const coeffs, VP8Residual* const res) {
+static void SetResidualCoeffs(const int16_t* const coeffs,
+                              VP8Residual* const res) {
   int n;
   res->last = -1;
   for (n = 15; n >= res->first; --n) {
@@ -572,6 +577,27 @@ void VP8SetResidualCoeffs(const int16_t* const coeffs, VP8Residual* const res) {
   res->coeffs = coeffs;
 }
 
+//------------------------------------------------------------------------------
+// init function
+
+#if defined(WEBP_USE_SSE2)
+extern void VP8SetResidualCoeffsSSE2(const int16_t* const coeffs,
+                                     VP8Residual* const res);
+#endif  // WEBP_USE_SSE2
+
+VP8SetResidualCoeffsFunc VP8SetResidualCoeffs;
+
+void VP8SetResidualCoeffsInit(void) {
+  VP8SetResidualCoeffs = SetResidualCoeffs;  // plain-C default
+  if (VP8GetCPUInfo != NULL) {
+#if defined(WEBP_USE_SSE2)
+    if (VP8GetCPUInfo(kSSE2)) {  // switch to the SSE2 version when reported
+      VP8SetResidualCoeffs = VP8SetResidualCoeffsSSE2;
+    }
+#endif
+  }
+}
+
 //------------------------------------------------------------------------------
 // Mode costs
 
diff --git a/src/enc/cost.h b/src/enc/cost.h
index 71fca855..5d107569 100644
--- a/src/enc/cost.h
+++ b/src/enc/cost.h
@@ -37,7 +37,13 @@ typedef struct {
 
 void VP8InitResidual(int first, int coeff_type,
                      VP8Encoder* const enc, VP8Residual* const res);
-void VP8SetResidualCoeffs(const int16_t* const coeffs, VP8Residual* const res);
+
+typedef void (*VP8SetResidualCoeffsFunc)(const int16_t* const coeffs,
+                                         VP8Residual* const res);
+extern VP8SetResidualCoeffsFunc VP8SetResidualCoeffs;
+
+extern void VP8SetResidualCoeffsInit(void);  // must be called first
+
 int VP8RecordCoeffs(int ctx, const VP8Residual* const res);
 
 // approximate cost per level:
diff --git a/src/enc/webpenc.c b/src/enc/webpenc.c
index 6275f45d..7aeb8411 100644
--- a/src/enc/webpenc.c
+++ b/src/enc/webpenc.c
@@ -253,6 +253,7 @@ static VP8Encoder* InitVP8Encoder(const WebPConfig* const config,
   ResetFilterHeader(enc);
   ResetBoundaryPredictions(enc);
   VP8GetResidualCostInit();
+  VP8SetResidualCoeffsInit();
   VP8EncInitAlpha(enc);
 
   // lower quality means smaller output -> we modulate a little the page
diff --git a/src/utils/utils.h b/src/utils/utils.h
index 90efcfcd..f2c498a9 100644
--- a/src/utils/utils.h
+++ b/src/utils/utils.h
@@ -77,6 +77,41 @@ static WEBP_INLINE void PutLE32(uint8_t* const data, uint32_t val) {
   PutLE16(data + 2, (int)(val >> 16));
 }
 
+// Returns (int)floor(log2(n)). n must be > 0.
+// use GNU builtins where available.
+#if defined(__GNUC__) && \
+    ((__GNUC__ == 3 && __GNUC_MINOR__ >= 4) || __GNUC__ >= 4)
+static WEBP_INLINE int BitsLog2Floor(uint32_t n) {
+  return 31 ^ __builtin_clz(n);  // same as 31 - clz(n) for clz(n) in [0, 31]
+}
+#elif defined(_MSC_VER) && _MSC_VER > 1310 && \
+      (defined(_M_X64) || defined(_M_IX86))
+#include <intrin.h>
+#pragma intrinsic(_BitScanReverse)
+
+static WEBP_INLINE int BitsLog2Floor(uint32_t n) {
+  unsigned long first_set_bit;  // _BitScanReverse() takes an unsigned long*
+  _BitScanReverse(&first_set_bit, n);
+  return first_set_bit;
+}
+#else
+static WEBP_INLINE int BitsLog2Floor(uint32_t n) {
+  int log = 0;  // accumulated position of the highest set bit
+  uint32_t value = n;
+  int i;
+
+  for (i = 4; i >= 0; --i) {  // binary search with shifts 16, 8, 4, 2, 1
+    const int shift = (1 << i);
+    const uint32_t x = value >> shift;
+    if (x != 0) {  // bits remain at position >= shift: keep the upper part
+      value = x;
+      log += shift;
+    }
+  }
+  return log;
+}
+#endif
+
 //------------------------------------------------------------------------------
 
 #ifdef __cplusplus