Remove ReadOneBit() and ReadSymbolUnsafe()

Simplify and re-organize the VP8L bit-reader functions
(e.g.: the 40-bit look-ahead code wasn't helping much)

Speed-up with LBITS=64, on arm7-a:

=> before:
./dwebp_justify_24_neon -v bryce_ll.webp
Time to decode picture: 11.393s
File bryce_ll.webp can be decoded (dimensions: 11158 x 2156).
...

=> after (LBITS=64):	Time to decode picture: 9.953s

Making the VP8L bit-reader work in 32-bit mode is going to be
harder, because we need to be able to read two symbols
at a time, each with a maximum length of 15 bits.

Change-Id: I89746fb103b87b5e2fd40a3208a6fbc584b88297
This commit is contained in:
skal 2013-02-20 00:13:23 +01:00
parent b7490f8553
commit 1667bded67
3 changed files with 53 additions and 93 deletions

View File

@ -149,29 +149,21 @@ static WEBP_INLINE int PlaneCodeToDistance(int xsize, int plane_code) {
//------------------------------------------------------------------------------ //------------------------------------------------------------------------------
// Decodes the next Huffman code from bit-stream. // Decodes the next Huffman code from bit-stream.
// FillBitWindow(br) needs to be called at minimum every second call // FillBitWindow(br) needs to be called at minimum every second call
// to ReadSymbolUnsafe. // to ReadSymbol, in order to pre-fetch enough bits.
static int ReadSymbolUnsafe(const HuffmanTree* tree, VP8LBitReader* const br) {
const HuffmanTreeNode* node = tree->root_;
assert(node != NULL);
while (!HuffmanTreeNodeIsLeaf(node)) {
node = HuffmanTreeNextNode(node, VP8LReadOneBitUnsafe(br));
}
return node->symbol_;
}
static WEBP_INLINE int ReadSymbol(const HuffmanTree* tree, static WEBP_INLINE int ReadSymbol(const HuffmanTree* tree,
VP8LBitReader* const br) { VP8LBitReader* const br) {
const int read_safe = (br->pos_ + 8 > br->len_); const HuffmanTreeNode* node = tree->root_;
if (!read_safe) { int num_bits = 0;
return ReadSymbolUnsafe(tree, br); uint32_t bits;
} else { bits = VP8LPrefetchBits(br);
const HuffmanTreeNode* node = tree->root_; assert(node != NULL);
assert(node != NULL); while (!HuffmanTreeNodeIsLeaf(node)) {
while (!HuffmanTreeNodeIsLeaf(node)) { node = HuffmanTreeNextNode(node, bits & 1);
node = HuffmanTreeNextNode(node, VP8LReadOneBit(br)); bits >>= 1;
} ++num_bits;
return node->symbol_;
} }
VP8LDiscardBits(br, num_bits);
return node->symbol_;
} }
static int ReadHuffmanCodeLengths( static int ReadHuffmanCodeLengths(

View File

@ -113,6 +113,10 @@ int32_t VP8GetSignedValue(VP8BitReader* const br, int bits) {
#define MAX_NUM_BIT_READ 25 #define MAX_NUM_BIT_READ 25
#define LBITS 64 // Number of bits prefetched.
#define WBITS 32 // Minimum number of bits needed after VP8LFillBitWindow.
#define LOG8_WBITS 4 // Number of bytes needed to store WBITS bits.
static const uint32_t kBitMask[MAX_NUM_BIT_READ] = { static const uint32_t kBitMask[MAX_NUM_BIT_READ] = {
0, 1, 3, 7, 15, 31, 63, 127, 255, 511, 1023, 2047, 4095, 8191, 16383, 32767, 0, 1, 3, 7, 15, 31, 63, 127, 255, 511, 1023, 2047, 4095, 8191, 16383, 32767,
65535, 131071, 262143, 524287, 1048575, 2097151, 4194303, 8388607, 16777215 65535, 131071, 262143, 524287, 1048575, 2097151, 4194303, 8388607, 16777215
@ -134,7 +138,7 @@ void VP8LInitBitReader(VP8LBitReader* const br,
br->eos_ = 0; br->eos_ = 0;
br->error_ = 0; br->error_ = 0;
for (i = 0; i < sizeof(br->val_) && i < br->len_; ++i) { for (i = 0; i < sizeof(br->val_) && i < br->len_; ++i) {
br->val_ |= ((uint64_t)br->buf_[br->pos_]) << (8 * i); br->val_ |= ((vp8l_val_t)br->buf_[br->pos_]) << (8 * i);
++br->pos_; ++br->pos_;
} }
} }
@ -149,91 +153,56 @@ void VP8LBitReaderSetBuffer(VP8LBitReader* const br,
br->len_ = len; br->len_ = len;
} }
// If not at EOS, reload up to LBITS byte-by-byte
static void ShiftBytes(VP8LBitReader* const br) { static void ShiftBytes(VP8LBitReader* const br) {
while (br->bit_pos_ >= 8 && br->pos_ < br->len_) { while (br->bit_pos_ >= 8 && br->pos_ < br->len_) {
br->val_ >>= 8; br->val_ >>= 8;
br->val_ |= ((uint64_t)br->buf_[br->pos_]) << 56; br->val_ |= ((vp8l_val_t)br->buf_[br->pos_]) << (LBITS - 8);
++br->pos_; ++br->pos_;
br->bit_pos_ -= 8; br->bit_pos_ -= 8;
} }
} }
void VP8LFillBitWindow(VP8LBitReader* const br) { void VP8LFillBitWindow(VP8LBitReader* const br) {
if (br->bit_pos_ >= 32) { if (br->bit_pos_ >= WBITS) {
#if defined(__x86_64__) || defined(_M_X64) #if (defined(__x86_64__) || defined(_M_X64))
if (br->pos_ + 8 < br->len_) { if (br->pos_ + sizeof(br->val_) < br->len_) {
br->val_ >>= 32; br->val_ >>= WBITS;
br->bit_pos_ -= WBITS;
// The expression below needs a little-endian arch to work correctly. // The expression below needs a little-endian arch to work correctly.
// This gives a large speedup for decoding speed. // This gives a large speedup for decoding speed.
br->val_ |= *(const uint64_t *)(br->buf_ + br->pos_) << 32; br->val_ |= *(const vp8l_val_t*)(br->buf_ + br->pos_) << (LBITS - WBITS);
br->pos_ += 4; br->pos_ += LOG8_WBITS;
br->bit_pos_ -= 32; return;
} else {
// Slow path.
ShiftBytes(br);
} }
#else
// Always the slow path.
ShiftBytes(br);
#endif #endif
} ShiftBytes(br); // Slow path.
if (br->pos_ == br->len_ && br->bit_pos_ == 64) { if (br->pos_ == br->len_ && br->bit_pos_ == LBITS) {
br->eos_ = 1;
}
}
uint32_t VP8LReadOneBit(VP8LBitReader* const br) {
const uint32_t val = (uint32_t)((br->val_ >> br->bit_pos_) & 1);
// Flag an error at end_of_stream.
if (!br->eos_) {
++br->bit_pos_;
if (br->bit_pos_ >= 32) {
ShiftBytes(br);
}
// After this last bit is read, check if eos needs to be flagged.
if (br->pos_ == br->len_ && br->bit_pos_ == 64) {
br->eos_ = 1; br->eos_ = 1;
} }
} else {
br->error_ = 1;
} }
return val;
} }
uint32_t VP8LReadBits(VP8LBitReader* const br, int n_bits) { uint32_t VP8LReadBits(VP8LBitReader* const br, int n_bits) {
uint32_t val = 0;
assert(n_bits >= 0); assert(n_bits >= 0);
// Flag an error if end_of_stream or n_bits is more than allowed limit. // Flag an error if end_of_stream or n_bits is more than allowed limit.
if (!br->eos_ && n_bits < MAX_NUM_BIT_READ) { if (!br->eos_ && n_bits < MAX_NUM_BIT_READ) {
const uint32_t val =
(uint32_t)(br->val_ >> br->bit_pos_) & kBitMask[n_bits];
const int new_bits = br->bit_pos_ + n_bits;
br->bit_pos_ = new_bits;
// If this read is going to cross the read buffer, set the eos flag. // If this read is going to cross the read buffer, set the eos flag.
if (br->pos_ == br->len_) { if (br->pos_ == br->len_) {
if ((br->bit_pos_ + n_bits) >= 64) { if (new_bits >= LBITS) {
br->eos_ = 1; br->eos_ = 1;
if ((br->bit_pos_ + n_bits) > 64) return val;
}
}
val = (uint32_t)((br->val_ >> br->bit_pos_) & kBitMask[n_bits]);
br->bit_pos_ += n_bits;
if (br->bit_pos_ >= 40) {
if (br->pos_ + 5 < br->len_) {
br->val_ >>= 40;
br->val_ |=
(((uint64_t)br->buf_[br->pos_ + 0]) << 24) |
(((uint64_t)br->buf_[br->pos_ + 1]) << 32) |
(((uint64_t)br->buf_[br->pos_ + 2]) << 40) |
(((uint64_t)br->buf_[br->pos_ + 3]) << 48) |
(((uint64_t)br->buf_[br->pos_ + 4]) << 56);
br->pos_ += 5;
br->bit_pos_ -= 40;
}
if (br->bit_pos_ >= 8) {
ShiftBytes(br);
} }
} }
ShiftBytes(br);
return val;
} else { } else {
br->error_ = 1; br->error_ = 1;
return 0;
} }
return val;
} }
//------------------------------------------------------------------------------ //------------------------------------------------------------------------------

View File

@ -258,14 +258,16 @@ static WEBP_INLINE int VP8GetSigned(VP8BitReader* const br, int v) {
// ----------------------------------------------------------------------------- // -----------------------------------------------------------------------------
// Bitreader for lossless format // Bitreader for lossless format
typedef uint64_t vp8l_val_t; // right now, this bit-reader can only use 64bit.
typedef struct { typedef struct {
uint64_t val_; vp8l_val_t val_; // pre-fetched bits
const uint8_t* buf_; const uint8_t* buf_; // input byte buffer
size_t len_; size_t len_; // buffer length
size_t pos_; size_t pos_; // byte position in buf_
int bit_pos_; int bit_pos_; // current bit-reading position in val_
int eos_; int eos_; // bitstream is finished
int error_; int error_; // an error occurred (buffer overflow attempt...)
} VP8LBitReader; } VP8LBitReader;
void VP8LInitBitReader(VP8LBitReader* const br, void VP8LInitBitReader(VP8LBitReader* const br,
@ -281,17 +283,14 @@ void VP8LBitReaderSetBuffer(VP8LBitReader* const br,
// Flags eos if this read attempt is going to cross the read buffer. // Flags eos if this read attempt is going to cross the read buffer.
uint32_t VP8LReadBits(VP8LBitReader* const br, int n_bits); uint32_t VP8LReadBits(VP8LBitReader* const br, int n_bits);
// Reads one bit from Read Buffer. Flags an error in case end_of_stream. // Return the prefetched bits, so they can be looked up.
// Flags eos after reading last bit from the buffer. static WEBP_INLINE uint32_t VP8LPrefetchBits(VP8LBitReader* const br) {
uint32_t VP8LReadOneBit(VP8LBitReader* const br); return (uint32_t)(br->val_ >> br->bit_pos_);
}
// VP8LReadOneBitUnsafe is faster than VP8LReadOneBit, but it can be called only // Discard 'num_bits' bits from the cache.
// 32 times after the last VP8LFillBitWindow. Any subsequent calls static WEBP_INLINE void VP8LDiscardBits(VP8LBitReader* const br, int num_bits) {
// (without VP8LFillBitWindow) will return invalid data. br->bit_pos_ += num_bits;
static WEBP_INLINE uint32_t VP8LReadOneBitUnsafe(VP8LBitReader* const br) {
const uint32_t val = (uint32_t)((br->val_ >> br->bit_pos_) & 1);
++br->bit_pos_;
return val;
} }
// Advances the Read buffer by 4 bytes to make room for reading next 32 bits. // Advances the Read buffer by 4 bytes to make room for reading next 32 bits.