introduce a generic GetCoeffs() function pointer

We can switch at run-time between the standard GetCoeffs() critical
function, that uses a fast variant of VP8GetBit().
However, some platforms have slow instructions that make standard
VP8GetBit() slow. GetCoeffs() is the right level of branching to
switch to GetCoeffsAlt() that avoids these slow instructions in some
not-frequent cases.

Next patch will upgrade VP8GetBit() to use clz, after this one
is proved to be neutral speed-wise.

Change-Id: Ia6cef5de9de6131574d2202bbc0bea8559c9b693
This commit is contained in:
Pascal Massimino 2017-01-13 16:35:42 +01:00
parent db013a8d5c
commit 8074b89eb3
2 changed files with 86 additions and 2 deletions

View File

@ -26,6 +26,16 @@ int WebPGetDecoderVersion(void) {
return (DEC_MAJ_VERSION << 16) | (DEC_MIN_VERSION << 8) | DEC_REV_VERSION; return (DEC_MAJ_VERSION << 16) | (DEC_MIN_VERSION << 8) | DEC_REV_VERSION;
} }
//------------------------------------------------------------------------------
// Signature and pointer-to-function for GetCoeffs() variants below.
typedef int (*GetCoeffsFunc)(VP8BitReader* const br,
const VP8BandProbas* const prob[],
int ctx, const quant_t dq, int n, int16_t* out);
static volatile GetCoeffsFunc GetCoeffs = NULL;
static void InitGetCoeffs(void);
//------------------------------------------------------------------------------ //------------------------------------------------------------------------------
// VP8Decoder // VP8Decoder
@ -51,6 +61,7 @@ VP8Decoder* VP8New(void) {
WebPGetWorkerInterface()->Init(&dec->worker_); WebPGetWorkerInterface()->Init(&dec->worker_);
dec->ready_ = 0; dec->ready_ = 0;
dec->num_parts_minus_one_ = 0; dec->num_parts_minus_one_ = 0;
InitGetCoeffs();
} }
return dec; return dec;
} }
@ -422,8 +433,9 @@ static int GetLargeValue(VP8BitReader* const br, const uint8_t* const p) {
} }
// Returns the position of the last non-zero coeff plus one // Returns the position of the last non-zero coeff plus one
static int GetCoeffs(VP8BitReader* const br, const VP8BandProbas* const prob[], static int GetCoeffsFast(VP8BitReader* const br,
int ctx, const quant_t dq, int n, int16_t* out) { const VP8BandProbas* const prob[],
int ctx, const quant_t dq, int n, int16_t* out) {
const uint8_t* p = prob[n]->probas_[ctx]; const uint8_t* p = prob[n]->probas_[ctx];
for (; n < 16; ++n) { for (; n < 16; ++n) {
if (!VP8GetBit(br, p[0])) { if (!VP8GetBit(br, p[0])) {
@ -449,6 +461,46 @@ static int GetCoeffs(VP8BitReader* const br, const VP8BandProbas* const prob[],
return 16; return 16;
} }
// This version of GetCoeffs() uses VP8GetBitAlt() which is an alternate version
// of VP8GetBitAlt() targeting specific platforms.
static int GetCoeffsAlt(VP8BitReader* const br,
const VP8BandProbas* const prob[],
int ctx, const quant_t dq, int n, int16_t* out) {
const uint8_t* p = prob[n]->probas_[ctx];
for (; n < 16; ++n) {
if (!VP8GetBitAlt(br, p[0])) {
return n; // previous coeff was last non-zero coeff
}
while (!VP8GetBitAlt(br, p[1])) { // sequence of zero coeffs
p = prob[++n]->probas_[0];
if (n == 16) return 16;
}
{ // non zero coeff
const VP8ProbaArray* const p_ctx = &prob[n + 1]->probas_[0];
int v;
if (!VP8GetBitAlt(br, p[2])) {
v = 1;
p = p_ctx[1];
} else {
v = GetLargeValue(br, p);
p = p_ctx[2];
}
out[kZigzag[n]] = VP8GetSigned(br, v) * dq[n > 0];
}
}
return 16;
}
WEBP_TSAN_IGNORE_FUNCTION static void InitGetCoeffs(void) {
if (GetCoeffs == NULL) {
if (VP8GetCPUInfo != NULL && VP8GetCPUInfo(kSlowSSSE3)) {
GetCoeffs = GetCoeffsAlt;
} else {
GetCoeffs = GetCoeffsFast;
}
}
}
static WEBP_INLINE uint32_t NzCodeBits(uint32_t nz_coeffs, int nz, int dc_nz) { static WEBP_INLINE uint32_t NzCodeBits(uint32_t nz_coeffs, int nz, int dc_nz) {
nz_coeffs <<= 2; nz_coeffs <<= 2;
nz_coeffs |= (nz > 3) ? 3 : (nz > 1) ? 2 : dc_nz; nz_coeffs |= (nz > 3) ? 3 : (nz > 1) ? 2 : dc_nz;

View File

@ -25,6 +25,7 @@
#include "../dsp/dsp.h" #include "../dsp/dsp.h"
#include "./bit_reader.h" #include "./bit_reader.h"
#include "./endian_inl.h" #include "./endian_inl.h"
#include "./utils.h"
#ifdef __cplusplus #ifdef __cplusplus
extern "C" { extern "C" {
@ -163,6 +164,37 @@ int VP8GetSigned(VP8BitReader* const br, int v) {
} }
} }
static WEBP_INLINE int VP8GetBitAlt(VP8BitReader* const br, int prob) {
// Don't move this declaration! It makes a big speed difference to store
// 'range' *before* calling VP8LoadNewBytes(), even if this function doesn't
// alter br->range_ value.
range_t range = br->range_;
if (br->bits_ < 0) {
VP8LoadNewBytes(br);
}
{
const int pos = br->bits_;
const range_t split = (range * prob) >> 8;
const range_t value = (range_t)(br->value_ >> pos);
int bit; // Don't use 'const int bit = (value > split);", it's slower.
if (value > split) {
range -= split + 1;
br->value_ -= (bit_t)(split + 1) << pos;
bit = 1;
} else {
range = split;
bit = 0;
}
if (range <= (range_t)0x7e) {
const int shift = kVP8Log2Range[range];
range = kVP8NewRange[range];
br->bits_ -= shift;
}
br->range_ = range;
return bit;
}
}
#ifdef __cplusplus #ifdef __cplusplus
} // extern "C" } // extern "C"
#endif #endif