VP8LFillBitWindow: enable fast path for 32-bit builds

also reduce the load size from 64 to 32 bits as the top 32 bits are
being shifted away in the operation.

on x86_64, which already used the fast path, the change is neutral
speed-wise. the reduced load size on its own is also neutral on x86, but
gives a slight improvement on 32-bit arm.
overall, x86 is improved ~13% and 32-bit arm ~3.7%.
aarch64 is untested but will likely benefit as well.

Change-Id: Ibcb02a70f46f2651105d7ab571afe352673bef48
This commit is contained in:
James Zern 2014-07-04 13:40:53 -07:00
parent 4f7f52b2a1
commit 6422e683af

View File

@ -109,6 +109,13 @@ int32_t VP8GetSignedValue(VP8BitReader* const br, int bits) {
#define WBITS 32 // Minimum number of bytes needed after VP8LFillBitWindow. #define WBITS 32 // Minimum number of bytes needed after VP8LFillBitWindow.
#define LOG8_WBITS 4 // Number of bytes needed to store WBITS bits. #define LOG8_WBITS 4 // Number of bytes needed to store WBITS bits.
#if !defined(WEBP_FORCE_ALIGNED) && \
(defined(__arm__) || defined(_M_ARM) || defined(__aarch64__) || \
defined(__i386__) || defined(_M_IX86) || \
defined(__x86_64__) || defined(_M_X64))
#define VP8L_USE_UNALIGNED_LOAD
#endif
static const uint32_t kBitMask[VP8L_MAX_NUM_BIT_READ + 1] = { static const uint32_t kBitMask[VP8L_MAX_NUM_BIT_READ + 1] = {
0, 0,
0x000001, 0x000003, 0x000007, 0x00000f, 0x000001, 0x000003, 0x000007, 0x00000f,
@ -176,17 +183,16 @@ static void ShiftBytes(VP8LBitReader* const br) {
void VP8LFillBitWindow(VP8LBitReader* const br) { void VP8LFillBitWindow(VP8LBitReader* const br) {
if (br->bit_pos_ >= WBITS) { if (br->bit_pos_ >= WBITS) {
// TODO(jzern): 1) this might be of benefit in 32-bit builds too, along with // TODO(jzern): given the fixed read size it may be possible to force
// reducing the load size.
// 2) given the fixed read size it may be possible to force
// alignment in this block. // alignment in this block.
#if !defined(WEBP_FORCE_ALIGNED) && (defined(__x86_64__) || defined(_M_X64)) #if defined(VP8L_USE_UNALIGNED_LOAD)
if (br->pos_ + sizeof(br->val_) < br->len_) { if (br->pos_ + sizeof(br->val_) < br->len_) {
br->val_ >>= WBITS; br->val_ >>= WBITS;
br->bit_pos_ -= WBITS; br->bit_pos_ -= WBITS;
// The expression below needs a little-endian arch to work correctly. // The expression below needs a little-endian arch to work correctly.
// This gives a large speedup for decoding speed. // This gives a large speedup for decoding speed.
br->val_ |= *(const vp8l_val_t*)(br->buf_ + br->pos_) << (LBITS - WBITS); br->val_ |= (vp8l_val_t)*(const uint32_t*)(br->buf_ + br->pos_) <<
(LBITS - WBITS);
br->pos_ += LOG8_WBITS; br->pos_ += LOG8_WBITS;
return; return;
} }