4-5% faster decoding, optimized byte loads in arithmetic decoder.

Bits are now loaded 32 bits at a time (and the loads are often aligned).
The new code is also rather 64-bit friendly.

Change-Id: If7f67dbe5e37696efbeb6d579d9d8482350b79ee
This commit is contained in:
Pascal Massimino 2012-01-29 17:38:37 -08:00
parent 631117ea5e
commit 01b6380656
2 changed files with 105 additions and 46 deletions

View File

@ -15,6 +15,8 @@
extern "C" { extern "C" {
#endif #endif
#define MK(X) (((bit_t)(X) << (BITS)) | (MASK))
//------------------------------------------------------------------------------ //------------------------------------------------------------------------------
// VP8BitReader // VP8BitReader
@ -23,11 +25,11 @@ void VP8InitBitReader(VP8BitReader* const br,
assert(br); assert(br);
assert(start); assert(start);
assert(start <= end); assert(start <= end);
br->range_ = 255 - 1; br->range_ = MK(255 - 1);
br->buf_ = start; br->buf_ = start;
br->buf_end_ = end; br->buf_end_ = end;
br->value_ = 0; br->value_ = 0;
br->missing_ = 8; br->missing_ = 8; // to load the very first 8bits
br->eof_ = 0; br->eof_ = 0;
} }
@ -43,19 +45,39 @@ const uint8_t kVP8Log2Range[128] = {
0 0
}; };
// range = ((range + 1) << kVP8Log2Range[range]) - 1 // range = (range << kVP8Log2Range[range]) + trailing 1's
const uint8_t kVP8NewRange[128] = { const bit_t kVP8NewRange[128] = {
127, 127, 191, 127, 159, 191, 223, 127, 143, 159, 175, 191, 207, 223, 239, MK(127), MK(127), MK(191), MK(127), MK(159), MK(191), MK(223), MK(127),
127, 135, 143, 151, 159, 167, 175, 183, 191, 199, 207, 215, 223, 231, 239, MK(143), MK(159), MK(175), MK(191), MK(207), MK(223), MK(239), MK(127),
247, 127, 131, 135, 139, 143, 147, 151, 155, 159, 163, 167, 171, 175, 179, MK(135), MK(143), MK(151), MK(159), MK(167), MK(175), MK(183), MK(191),
183, 187, 191, 195, 199, 203, 207, 211, 215, 219, 223, 227, 231, 235, 239, MK(199), MK(207), MK(215), MK(223), MK(231), MK(239), MK(247), MK(127),
243, 247, 251, 127, 129, 131, 133, 135, 137, 139, 141, 143, 145, 147, 149, MK(131), MK(135), MK(139), MK(143), MK(147), MK(151), MK(155), MK(159),
151, 153, 155, 157, 159, 161, 163, 165, 167, 169, 171, 173, 175, 177, 179, MK(163), MK(167), MK(171), MK(175), MK(179), MK(183), MK(187), MK(191),
181, 183, 185, 187, 189, 191, 193, 195, 197, 199, 201, 203, 205, 207, 209, MK(195), MK(199), MK(203), MK(207), MK(211), MK(215), MK(219), MK(223),
211, 213, 215, 217, 219, 221, 223, 225, 227, 229, 231, 233, 235, 237, 239, MK(227), MK(231), MK(235), MK(239), MK(243), MK(247), MK(251), MK(127),
241, 243, 245, 247, 249, 251, 253, 127 MK(129), MK(131), MK(133), MK(135), MK(137), MK(139), MK(141), MK(143),
MK(145), MK(147), MK(149), MK(151), MK(153), MK(155), MK(157), MK(159),
MK(161), MK(163), MK(165), MK(167), MK(169), MK(171), MK(173), MK(175),
MK(177), MK(179), MK(181), MK(183), MK(185), MK(187), MK(189), MK(191),
MK(193), MK(195), MK(197), MK(199), MK(201), MK(203), MK(205), MK(207),
MK(209), MK(211), MK(213), MK(215), MK(217), MK(219), MK(221), MK(223),
MK(225), MK(227), MK(229), MK(231), MK(233), MK(235), MK(237), MK(239),
MK(241), MK(243), MK(245), MK(247), MK(249), MK(251), MK(253), MK(127)
}; };
#undef MK
// Slow-path loader used near the end of the input: consumes at most one byte
// per call. When no byte is left, it only raises the 'eof_' flag and leaves
// 'value_' and 'missing_' untouched.
void VP8LoadFinalBytes(VP8BitReader* const br) {
  assert(br && br->buf_);
  if (br->buf_ >= br->buf_end_) {
    br->eof_ = 1;   // input exhausted
    return;
  }
  {
    // The new byte is placed so that its top bit lands at position
    // (BITS - 1 + missing_) of 'value_'.
    const int shift = (BITS) - 8 + br->missing_;
    const bit_t next_byte = *br->buf_++;
    br->value_ |= next_byte << shift;
  }
  br->missing_ -= 8;   // one byte's worth of bits was just supplied
}
//------------------------------------------------------------------------------ //------------------------------------------------------------------------------
// Higher-level calls // Higher-level calls

View File

@ -19,6 +19,19 @@
extern "C" { extern "C" {
#endif #endif
// Number of bits fetched from the input buffer by one bulk load.
#define BITS 32 // can be 32, 16 or 8
// Value with the low 'BITS' bits all set, expressed in the register type.
#define MASK ((((bit_t)1) << (BITS)) - 1)
// bit_t is the type used for the decoder's 'range_' and 'value_' fields;
// lbit_t is the type used to read 'BITS' bits from memory in one access.
// In every configuration below, bit_t is wider than 'BITS' bits.
#if (BITS == 32)
typedef uint64_t bit_t; // natural register type
typedef uint32_t lbit_t; // natural type for memory I/O
#elif (BITS == 16)
typedef uint32_t bit_t;
typedef uint16_t lbit_t;
#else
// Remaining case (BITS == 8): byte-sized loads into a 32-bit register type.
typedef uint32_t bit_t;
typedef uint8_t lbit_t;
#endif
//------------------------------------------------------------------------------ //------------------------------------------------------------------------------
// Bitreader and code-tree reader // Bitreader and code-tree reader
@ -29,9 +42,9 @@ struct VP8BitReader {
int eof_; // true if input is exhausted int eof_; // true if input is exhausted
// boolean decoder // boolean decoder
uint32_t range_; // current range minus 1. In [127, 254] interval. bit_t range_; // current range minus 1. In [127, 254] interval.
uint32_t value_; // current value bit_t value_; // current value
int missing_; // number of missing bits in value_ (8bit) int missing_; // number of missing bits in value_ (8bit)
}; };
// Initialize the bit reader and the boolean decoder. // Initialize the bit reader and the boolean decoder.
@ -49,56 +62,80 @@ int32_t VP8GetSignedValue(VP8BitReader* const br, int num_bits);
// Read a bit with proba 'prob'. Speed-critical function! // Read a bit with proba 'prob'. Speed-critical function!
extern const uint8_t kVP8Log2Range[128]; extern const uint8_t kVP8Log2Range[128];
extern const uint8_t kVP8NewRange[128]; extern const bit_t kVP8NewRange[128];
static WEBP_INLINE uint32_t VP8GetByte(VP8BitReader* const br) {
assert(br); void VP8LoadFinalBytes(VP8BitReader* const br); // special case for the tail
if (br->buf_ < br->buf_end_) {
assert(br->buf_); static WEBP_INLINE void VP8LoadNewBytes(VP8BitReader* const br) {
return *br->buf_++; assert(br && br->buf_);
// Read 'BITS' bits at a time if possible.
if (br->buf_ + sizeof(lbit_t) <= br->buf_end_) {
// convert memory type to register type (with some zero'ing!)
bit_t bits;
lbit_t in_bits = *(lbit_t*)br->buf_;
br->buf_ += (BITS) >> 3;
#if !defined(__BIG_ENDIAN__) // TODO(skal): what about PPC?
#if (BITS == 32)
#if defined(__i386__) || defined(__x86_64__)
__asm__ volatile("bswap %k0" : "=r"(in_bits) : "0"(in_bits));
bits = (bit_t)in_bits; // 32b -> 64b zero-extension
#elif defined(_MSC_VER)
bits = _byteswap_ulong(in_bits);
#else
bits = (bit_t)(in_bits >> 24) | ((in_bits >> 8) & 0xff00)
| ((in_bits << 8) & 0xff0000) | (in_bits << 24);
#endif // x86
#elif (BITS == 16)
// gcc will recognize a 'rorw $8, ...' here:
bits = (bit_t)(in_bits >> 8) | ((in_bits & 0xff) << 8);
#endif
#endif // LITTLE_ENDIAN
br->value_ |= bits << br->missing_;
br->missing_ -= (BITS);
} else {
VP8LoadFinalBytes(br); // no need to be inlined
} }
br->eof_ = 1;
return 0xff;
} }
static WEBP_INLINE uint32_t VP8BitUpdate( static WEBP_INLINE int VP8BitUpdate(VP8BitReader* const br, bit_t split) {
VP8BitReader* const br, uint32_t split) { const bit_t value_split = split | (MASK);
uint32_t bit; if (br->missing_ > 0) { // Make sure we have a least BITS bits in 'value_'
const uint32_t value_split = (split + 1) << 8; VP8LoadNewBytes(br);
// Make sure we have at least 8 bits in 'value_'
if (br->missing_ > 0) {
br->value_ |= VP8GetByte(br) << br->missing_;
br->missing_ -= 8;
} }
bit = (br->value_ >= value_split); if (br->value_ > value_split) {
if (bit) { br->range_ -= value_split + 1;
br->range_ -= split + 1; br->value_ -= value_split + 1;
br->value_ -= value_split; return 1;
} else { } else {
br->range_ = split; br->range_ = value_split;
return 0;
} }
return bit;
} }
static WEBP_INLINE void VP8Shift(VP8BitReader* const br) { static WEBP_INLINE void VP8Shift(VP8BitReader* const br) {
// range_ is in [0..127] interval here. // range_ is in [0..127] interval here.
const int shift = kVP8Log2Range[br->range_]; const int idx = br->range_ >> (BITS);
br->range_ = kVP8NewRange[br->range_]; const int shift = kVP8Log2Range[idx];
br->range_ = kVP8NewRange[idx];
br->value_ <<= shift; br->value_ <<= shift;
br->missing_ += shift; br->missing_ += shift;
} }
static WEBP_INLINE uint32_t VP8GetBit(VP8BitReader* const br, int prob) { static WEBP_INLINE int VP8GetBit(VP8BitReader* const br, int prob) {
const uint32_t split = (br->range_ * prob) >> 8; // It's important to avoid generating a 64bit x 64bit multiply here.
const uint32_t bit = VP8BitUpdate(br, split); // We just need an 8b x 8b after all.
if (br->range_ < 0x7f) { const bit_t split =
(bit_t)((uint32_t)(br->range_ >> (BITS)) * prob) << ((BITS) - 8);
const int bit = VP8BitUpdate(br, split);
if (br->range_ <= (((bit_t)0x7e << (BITS)) | (MASK))) {
VP8Shift(br); VP8Shift(br);
} }
return bit; return bit;
} }
static WEBP_INLINE int VP8GetSigned(VP8BitReader* const br, int v) { static WEBP_INLINE int VP8GetSigned(VP8BitReader* const br, int v) {
const uint32_t split = br->range_ >> 1; const bit_t split = (br->range_ >> 1);
const uint32_t bit = VP8BitUpdate(br, split); const int bit = VP8BitUpdate(br, split);
VP8Shift(br); VP8Shift(br);
return bit ? -v : v; return bit ? -v : v;
} }