From 6679f8996f2098d36ea34d9be761aff202929723 Mon Sep 17 00:00:00 2001
From: skal
Date: Tue, 3 Jun 2014 06:44:04 +0200
Subject: [PATCH] Optimize VP8SetResidualCoeffs.

Brings down WebP lossy encoding timings by 5%

Change-Id: Ia4a2fab0a887aaaf7841ce6d9ee16270d3e15489
---
 src/dsp/enc_sse2.c | 29 +++++++++++++++++++++++++++++
 src/dsp/lossless.h | 36 +-----------------------------------
 src/enc/cost.c     | 42 ++++++++++++++++++++++++++++++++++--------
 src/enc/cost.h     |  8 +++++++-
 src/enc/webpenc.c  |  1 +
 src/utils/utils.h  | 35 +++++++++++++++++++++++++++++++++++
 6 files changed, 107 insertions(+), 44 deletions(-)

diff --git a/src/dsp/enc_sse2.c b/src/dsp/enc_sse2.c
index ecebb4b3..d4ffd208 100644
--- a/src/dsp/enc_sse2.c
+++ b/src/dsp/enc_sse2.c
@@ -17,7 +17,9 @@
 #include <stdlib.h>  // for abs()
 #include <emmintrin.h>
 
+#include "../enc/cost.h"
 #include "../enc/vp8enci.h"
+#include "../utils/utils.h"
 
 //------------------------------------------------------------------------------
 // Quite useful macro for debugging. Left here for convenience.
@@ -929,6 +931,33 @@ static int QuantizeBlockWHT(int16_t in[16], int16_t out[16],
   return DoQuantizeBlock(in, out, 0, &mtx->sharpen_[0], mtx);
 }
 
+// Forward declaration.
+void VP8SetResidualCoeffsSSE2(const int16_t* const coeffs,
+                              VP8Residual* const res);
+
+void VP8SetResidualCoeffsSSE2(const int16_t* const coeffs,
+                              VP8Residual* const res) {
+  const __m128i c0 = _mm_loadu_si128((const __m128i*)coeffs);
+  const __m128i c1 = _mm_loadu_si128((const __m128i*)(coeffs + 8));
+  // Use SSE to compare 8 values with a single instruction.
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i m0 = _mm_cmpeq_epi16(c0, zero);
+  const __m128i m1 = _mm_cmpeq_epi16(c1, zero);
+  // Get the comparison results as a bitmask, consisting of two times 16 bits:
+  // two identical bits for each result. Concatenate both bitmasks (shifting as
+  // unsigned, to stay clear of the sign bit) into a single 32 bit value.
+  // Negate it to locate the entries that are not equal to zero. Finally, mask
+  // out least significant bits according to res->first.
+  const uint32_t mask =
+      ~(((uint32_t)_mm_movemask_epi8(m1) << 16) | _mm_movemask_epi8(m0)) &
+      -(1U << (res->first << 1));
+  // The position of the most significant non-zero bit indicates the position of
+  // the last non-zero value. Divide the result by two because _mm_movemask_epi8
+  // operates on 8 bit values instead of 16 bit values.
+  res->last = mask ? (BitsLog2Floor(mask) >> 1) : -1;
+  res->coeffs = coeffs;
+}
+
 #endif  // WEBP_USE_SSE2
 
 //------------------------------------------------------------------------------
diff --git a/src/dsp/lossless.h b/src/dsp/lossless.h
index b99cc09b..e4da705f 100644
--- a/src/dsp/lossless.h
+++ b/src/dsp/lossless.h
@@ -19,6 +19,7 @@
 #include "../webp/decode.h"
 
 #include "../enc/histogram.h"
+#include "../utils/utils.h"
 
 #ifdef __cplusplus
 extern "C" {
@@ -169,41 +170,6 @@ extern VP8LHistogramAddFunc VP8LHistogramAdd;
 // -----------------------------------------------------------------------------
 // PrefixEncode()
 
-// use GNU builtins where available.
-#if defined(__GNUC__) && \
-    ((__GNUC__ == 3 && __GNUC_MINOR__ >= 4) || __GNUC__ >= 4)
-static WEBP_INLINE int BitsLog2Floor(uint32_t n) {
-  return 31 ^ __builtin_clz(n);
-}
-#elif defined(_MSC_VER) && _MSC_VER > 1310 && \
-      (defined(_M_X64) || defined(_M_IX86))
-#include <intrin.h>
-#pragma intrinsic(_BitScanReverse)
-
-static WEBP_INLINE int BitsLog2Floor(uint32_t n) {
-  unsigned long first_set_bit;
-  _BitScanReverse(&first_set_bit, n);
-  return first_set_bit;
-}
-#else
-// Returns (int)floor(log2(n)). n must be > 0.
-static WEBP_INLINE int BitsLog2Floor(uint32_t n) {
-  int log = 0;
-  uint32_t value = n;
-  int i;
-
-  for (i = 4; i >= 0; --i) {
-    const int shift = (1 << i);
-    const uint32_t x = value >> shift;
-    if (x != 0) {
-      value = x;
-      log += shift;
-    }
-  }
-  return log;
-}
-#endif
-
 static WEBP_INLINE int VP8LBitsLog2Ceiling(uint32_t n) {
   const int log_floor = BitsLog2Floor(n);
   if (n == (n & ~(n - 1)))  // zero or a power of two.
diff --git a/src/enc/cost.c b/src/enc/cost.c
index 9d6a490d..73c157dc 100644
--- a/src/enc/cost.c
+++ b/src/enc/cost.c
@@ -13,6 +13,12 @@
 
 #include "./cost.h"
 
+#if defined(WEBP_USE_SSE2)
+#include <emmintrin.h>
+#endif  // WEBP_USE_SSE2
+
+#include "../utils/utils.h"
+
 //------------------------------------------------------------------------------
 // Boolean-cost cost table
 
@@ -536,15 +542,13 @@ extern int VP8GetResidualCostMIPS32(int ctx0, const VP8Residual* const res);
 VP8GetResidualCostFunc VP8GetResidualCost;
 
 void VP8GetResidualCostInit(void) {
-  if (VP8GetResidualCost == NULL) {
-    VP8GetResidualCost = GetResidualCost;
-    if (VP8GetCPUInfo != NULL) {
+  VP8GetResidualCost = GetResidualCost;
+  if (VP8GetCPUInfo != NULL) {
 #if defined(WEBP_USE_MIPS32)
-      if (VP8GetCPUInfo(kMIPS32)) {
-        VP8GetResidualCost = VP8GetResidualCostMIPS32;
-      }
-#endif
+    if (VP8GetCPUInfo(kMIPS32)) {
+      VP8GetResidualCost = VP8GetResidualCostMIPS32;
     }
+#endif
   }
 }
 
@@ -560,7 +564,8 @@ void VP8InitResidual(int first, int coeff_type,
   res->first = first;
 }
 
-void VP8SetResidualCoeffs(const int16_t* const coeffs, VP8Residual* const res) {
+static void SetResidualCoeffs(const int16_t* const coeffs,
+                              VP8Residual* const res) {
   int n;
   res->last = -1;
   for (n = 15; n >= res->first; --n) {
@@ -572,6 +577,27 @@ void VP8SetResidualCoeffs(const int16_t* const coeffs, VP8Residual* const res) {
   res->coeffs = coeffs;
 }
 
+//------------------------------------------------------------------------------
+// init function
+
+#if defined(WEBP_USE_SSE2)
+extern void VP8SetResidualCoeffsSSE2(const int16_t* const coeffs,
+                                     VP8Residual* const res);
+#endif  // WEBP_USE_SSE2
+
+VP8SetResidualCoeffsFunc VP8SetResidualCoeffs;
+
+void VP8SetResidualCoeffsInit(void) {
+  VP8SetResidualCoeffs = SetResidualCoeffs;
+  if (VP8GetCPUInfo != NULL) {
+#if defined(WEBP_USE_SSE2)
+    if (VP8GetCPUInfo(kSSE2)) {
+      VP8SetResidualCoeffs = VP8SetResidualCoeffsSSE2;
+    }
+#endif
+  }
+}
+
 //------------------------------------------------------------------------------
 // Mode costs
 
diff --git a/src/enc/cost.h b/src/enc/cost.h
index 71fca855..5d107569 100644
--- a/src/enc/cost.h
+++ b/src/enc/cost.h
@@ -37,7 +37,13 @@ typedef struct {
 
 void VP8InitResidual(int first, int coeff_type,
                      VP8Encoder* const enc, VP8Residual* const res);
-void VP8SetResidualCoeffs(const int16_t* const coeffs, VP8Residual* const res);
+
+typedef void (*VP8SetResidualCoeffsFunc)(const int16_t* const coeffs,
+                                         VP8Residual* const res);
+extern VP8SetResidualCoeffsFunc VP8SetResidualCoeffs;
+
+extern void VP8SetResidualCoeffsInit(void);  // must be called first
+
 int VP8RecordCoeffs(int ctx, const VP8Residual* const res);
 
 // approximate cost per level:
diff --git a/src/enc/webpenc.c b/src/enc/webpenc.c
index 6275f45d..7aeb8411 100644
--- a/src/enc/webpenc.c
+++ b/src/enc/webpenc.c
@@ -253,6 +253,7 @@ static VP8Encoder* InitVP8Encoder(const WebPConfig* const config,
   ResetFilterHeader(enc);
   ResetBoundaryPredictions(enc);
   VP8GetResidualCostInit();
+  VP8SetResidualCoeffsInit();
   VP8EncInitAlpha(enc);
 
   // lower quality means smaller output -> we modulate a little the page
diff --git a/src/utils/utils.h b/src/utils/utils.h
index 90efcfcd..f2c498a9 100644
--- a/src/utils/utils.h
+++ b/src/utils/utils.h
@@ -77,6 +77,41 @@ static WEBP_INLINE void PutLE32(uint8_t* const data, uint32_t val) {
   PutLE16(data + 2, (int)(val >> 16));
 }
 
+// Returns (int)floor(log2(n)). n must be > 0.
+// use GNU builtins where available.
+#if defined(__GNUC__) && \
+    ((__GNUC__ == 3 && __GNUC_MINOR__ >= 4) || __GNUC__ >= 4)
+static WEBP_INLINE int BitsLog2Floor(uint32_t n) {
+  return 31 ^ __builtin_clz(n);
+}
+#elif defined(_MSC_VER) && _MSC_VER > 1310 && \
+      (defined(_M_X64) || defined(_M_IX86))
+#include <intrin.h>
+#pragma intrinsic(_BitScanReverse)
+
+static WEBP_INLINE int BitsLog2Floor(uint32_t n) {
+  unsigned long first_set_bit;
+  _BitScanReverse(&first_set_bit, n);
+  return first_set_bit;
+}
+#else
+static WEBP_INLINE int BitsLog2Floor(uint32_t n) {
+  int log = 0;
+  uint32_t value = n;
+  int i;
+
+  for (i = 4; i >= 0; --i) {
+    const int shift = (1 << i);
+    const uint32_t x = value >> shift;
+    if (x != 0) {
+      value = x;
+      log += shift;
+    }
+  }
+  return log;
+}
+#endif
+
 //------------------------------------------------------------------------------
 
 #ifdef __cplusplus