From 8074b89eb30bb4edafa998f63e726d8fb8fa6af2 Mon Sep 17 00:00:00 2001 From: Pascal Massimino Date: Fri, 13 Jan 2017 16:35:42 +0100 Subject: [PATCH] introduce a generic GetCoeffs() function pointer We can now switch at run-time between two variants of the critical GetCoeffs() function. The standard one uses a fast variant of VP8GetBit(). However, some platforms have slow instructions that make the standard VP8GetBit() slow. GetCoeffs() is the right level of branching to switch to GetCoeffsAlt(), which avoids these slow instructions in some infrequent cases. The next patch will upgrade VP8GetBit() to use clz, once this one is proven to be speed-neutral. Change-Id: Ia6cef5de9de6131574d2202bbc0bea8559c9b693 --- src/dec/vp8.c | 56 ++++++++++++++++++++++++++++++++++++-- src/utils/bit_reader_inl.h | 32 ++++++++++++++++++++++ 2 files changed, 86 insertions(+), 2 deletions(-) diff --git a/src/dec/vp8.c b/src/dec/vp8.c index 61682540..9b54389f 100644 --- a/src/dec/vp8.c +++ b/src/dec/vp8.c @@ -26,6 +26,16 @@ int WebPGetDecoderVersion(void) { return (DEC_MAJ_VERSION << 16) | (DEC_MIN_VERSION << 8) | DEC_REV_VERSION; } +//------------------------------------------------------------------------------ +// Signature and pointer-to-function for GetCoeffs() variants below. 
+ +typedef int (*GetCoeffsFunc)(VP8BitReader* const br, + const VP8BandProbas* const prob[], + int ctx, const quant_t dq, int n, int16_t* out); +static volatile GetCoeffsFunc GetCoeffs = NULL; + +static void InitGetCoeffs(void); + //------------------------------------------------------------------------------ // VP8Decoder @@ -51,6 +61,7 @@ VP8Decoder* VP8New(void) { WebPGetWorkerInterface()->Init(&dec->worker_); dec->ready_ = 0; dec->num_parts_minus_one_ = 0; + InitGetCoeffs(); } return dec; } @@ -422,8 +433,9 @@ static int GetLargeValue(VP8BitReader* const br, const uint8_t* const p) { } // Returns the position of the last non-zero coeff plus one -static int GetCoeffs(VP8BitReader* const br, const VP8BandProbas* const prob[], - int ctx, const quant_t dq, int n, int16_t* out) { +static int GetCoeffsFast(VP8BitReader* const br, + const VP8BandProbas* const prob[], + int ctx, const quant_t dq, int n, int16_t* out) { const uint8_t* p = prob[n]->probas_[ctx]; for (; n < 16; ++n) { if (!VP8GetBit(br, p[0])) { @@ -449,6 +461,46 @@ static int GetCoeffs(VP8BitReader* const br, const VP8BandProbas* const prob[], return 16; } +// This version of GetCoeffs() uses VP8GetBitAlt() which is an alternate version +// of VP8GetBit() targeting specific platforms. 
+static int GetCoeffsAlt(VP8BitReader* const br, + const VP8BandProbas* const prob[], + int ctx, const quant_t dq, int n, int16_t* out) { + const uint8_t* p = prob[n]->probas_[ctx]; + for (; n < 16; ++n) { + if (!VP8GetBitAlt(br, p[0])) { + return n; // previous coeff was last non-zero coeff + } + while (!VP8GetBitAlt(br, p[1])) { // sequence of zero coeffs + p = prob[++n]->probas_[0]; + if (n == 16) return 16; + } + { // non zero coeff + const VP8ProbaArray* const p_ctx = &prob[n + 1]->probas_[0]; + int v; + if (!VP8GetBitAlt(br, p[2])) { + v = 1; + p = p_ctx[1]; + } else { + v = GetLargeValue(br, p); + p = p_ctx[2]; + } + out[kZigzag[n]] = VP8GetSigned(br, v) * dq[n > 0]; + } + } + return 16; +} + +WEBP_TSAN_IGNORE_FUNCTION static void InitGetCoeffs(void) { + if (GetCoeffs == NULL) { + if (VP8GetCPUInfo != NULL && VP8GetCPUInfo(kSlowSSSE3)) { + GetCoeffs = GetCoeffsAlt; + } else { + GetCoeffs = GetCoeffsFast; + } + } +} + static WEBP_INLINE uint32_t NzCodeBits(uint32_t nz_coeffs, int nz, int dc_nz) { nz_coeffs <<= 2; nz_coeffs |= (nz > 3) ? 3 : (nz > 1) ? 2 : dc_nz; diff --git a/src/utils/bit_reader_inl.h b/src/utils/bit_reader_inl.h index 003f6a88..db8aca9a 100644 --- a/src/utils/bit_reader_inl.h +++ b/src/utils/bit_reader_inl.h @@ -25,6 +25,7 @@ #include "../dsp/dsp.h" #include "./bit_reader.h" #include "./endian_inl.h" +#include "./utils.h" #ifdef __cplusplus extern "C" { @@ -163,6 +164,37 @@ int VP8GetSigned(VP8BitReader* const br, int v) { } } +static WEBP_INLINE int VP8GetBitAlt(VP8BitReader* const br, int prob) { + // Don't move this declaration! It makes a big speed difference to store + // 'range' *before* calling VP8LoadNewBytes(), even if this function doesn't + // alter br->range_ value. 
+ range_t range = br->range_; + if (br->bits_ < 0) { + VP8LoadNewBytes(br); + } + { + const int pos = br->bits_; + const range_t split = (range * prob) >> 8; + const range_t value = (range_t)(br->value_ >> pos); + int bit; // Don't use 'const int bit = (value > split);', it's slower. + if (value > split) { + range -= split + 1; + br->value_ -= (bit_t)(split + 1) << pos; + bit = 1; + } else { + range = split; + bit = 0; + } + if (range <= (range_t)0x7e) { + const int shift = kVP8Log2Range[range]; + range = kVP8NewRange[range]; + br->bits_ -= shift; + } + br->range_ = range; + return bit; + } +} + #ifdef __cplusplus } // extern "C" #endif