From c13fecf908a612a646e378802b1b108a9719cc71 Mon Sep 17 00:00:00 2001 From: skal Date: Thu, 5 Sep 2013 08:53:36 +0200 Subject: [PATCH] remove the PACK() bit-packing tricks was too smart for its own good :) This is more ARM-friendly, since it removes a mult. Change-Id: If146034c8efa2e71e3eaaf1230cb553884a42ebb --- src/dec/frame.c | 16 +++++----- src/dec/vp8.c | 81 +++++++++++++++++++------------------------------ src/dec/vp8i.h | 4 +-- 3 files changed, 43 insertions(+), 58 deletions(-) diff --git a/src/dec/frame.c b/src/dec/frame.c index 9baac8e1..9051b567 100644 --- a/src/dec/frame.c +++ b/src/dec/frame.c @@ -595,6 +595,7 @@ void VP8ReconstructBlock(const VP8Decoder* const dec) { // predict and add residuals if (block->is_i4x4_) { // 4x4 uint32_t* const top_right = (uint32_t*)(y_dst - BPS + 16); + uint32_t bits = (block->non_zero_ & 0xffff) | (block->non_zero_ac_ << 16); if (dec->mb_y_ > 0) { if (dec->mb_x_ >= dec->mb_w_ - 1) { // on rightmost border @@ -607,25 +608,26 @@ void VP8ReconstructBlock(const VP8Decoder* const dec) { top_right[BPS] = top_right[2 * BPS] = top_right[3 * BPS] = top_right[0]; // predict and add residuals for all 4x4 blocks in turn. - for (n = 0; n < 16; n++) { + for (n = 0; n < 16; ++n, bits <<= 1) { uint8_t* const dst = y_dst + kScan[n]; VP8PredLuma4[block->imodes_[n]](dst); - if (block->non_zero_ac_ & (1 << n)) { + if (bits & (1UL << 31)) { VP8Transform(coeffs + n * 16, dst, 0); - } else if (block->non_zero_ & (1 << n)) { // only DC is present + } else if (bits & (1UL << 15)) { // only DC is present VP8TransformDC(coeffs + n * 16, dst); } } } else { // 16x16 const int pred_func = CheckMode(dec->mb_x_, dec->mb_y_, block->imodes_[0]); + uint32_t bits = (block->non_zero_ & 0xffff) | (block->non_zero_ac_ << 16); VP8PredLuma16[pred_func](y_dst); - if (block->non_zero_ & 0xffff) { - for (n = 0; n < 16; n++) { + if (bits & 0xffff) { + for (n = 0; n < 16; ++n, bits <<= 1) { uint8_t* const dst = y_dst + kScan[n]; - if (block->non_zero_ac_ & (1 << n)) { + if (bits & (1UL << 31)) { VP8Transform(coeffs + n * 16, dst, 0); - } else if (block->non_zero_ & (1 << n)) { // only DC is present + } else if (bits & (1UL << 15)) { // only DC is present VP8TransformDC(coeffs + n * 16, dst); } } diff --git a/src/dec/vp8.c b/src/dec/vp8.c index c91d2bcd..59efb71c 100644 --- a/src/dec/vp8.c +++ b/src/dec/vp8.c @@ -505,28 +505,6 @@ static int GetCoeffs(VP8BitReader* const br, ProbaArray prob, return 16; } -// Alias-safe way of converting 4bytes to 32bits. -typedef union { - uint8_t i8[4]; - uint32_t i32; -} PackedNz; - -// Table to unpack four bits into four bytes -static const PackedNz kUnpackTab[16] = { - {{0, 0, 0, 0}}, {{1, 0, 0, 0}}, {{0, 1, 0, 0}}, {{1, 1, 0, 0}}, - {{0, 0, 1, 0}}, {{1, 0, 1, 0}}, {{0, 1, 1, 0}}, {{1, 1, 1, 0}}, - {{0, 0, 0, 1}}, {{1, 0, 0, 1}}, {{0, 1, 0, 1}}, {{1, 1, 0, 1}}, - {{0, 0, 1, 1}}, {{1, 0, 1, 1}}, {{0, 1, 1, 1}}, {{1, 1, 1, 1}} }; - -// Macro to pack four LSB of four bytes into four bits. -#if defined(__PPC__) || defined(_M_PPC) || defined(_ARCH_PPC) || \ - defined(__BIG_ENDIAN__) -#define PACK_CST 0x08040201U -#else -#define PACK_CST 0x01020408U -#endif -#define PACK(X, S) ((((X).i32 * PACK_CST) & 0xff000000) >> (S)) - static int ParseResiduals(VP8Decoder* const dec, VP8MB* const mb, VP8BitReader* const token_br) { uint32_t out_t_nz, out_l_nz; @@ -536,8 +514,7 @@ static int ParseResiduals(VP8Decoder* const dec, VP8MBData* const block = dec->mb_data_; int16_t* dst = block->coeffs_; VP8MB* const left_mb = dec->mb_info_ - 1; - PackedNz nz_ac, nz_dc; - PackedNz tnz, lnz; + uint8_t tnz, lnz; uint32_t non_zero_ac = 0; uint32_t non_zero_dc = 0; int x, y, ch; @@ -557,48 +534,55 @@ static int ParseResiduals(VP8Decoder* const dec, ac_prob = (ProbaArray)dec->proba_.coeffs_[3]; } - tnz = kUnpackTab[mb->nz_ & 0xf]; - lnz = kUnpackTab[left_mb->nz_ & 0xf]; + tnz = mb->nz_ & 0x0f; + lnz = left_mb->nz_ & 0x0f; for (y = 0; y < 4; ++y) { - int l = lnz.i8[y]; + int l = lnz & 1; + uint32_t nz_dc = 0, nz_ac = 0; for (x = 0; x < 4; ++x) { - const int ctx = l + tnz.i8[x]; + const int ctx = l + (tnz & 1); const int nz = GetCoeffs(token_br, ac_prob, ctx, q->y1_mat_, first, dst); - tnz.i8[x] = l = (nz > 0); - nz_dc.i8[x] = (dst[0] != 0); - nz_ac.i8[x] = (nz > 1); + l = (nz > 0); + tnz = (tnz >> 1) | (l << 7); + nz_dc = (nz_dc << 1) | (dst[0] != 0); + nz_ac = (nz_ac << 1) | (nz > 1); dst += 16; } - lnz.i8[y] = l; - non_zero_dc |= PACK(nz_dc, 24 - y * 4); - non_zero_ac |= PACK(nz_ac, 24 - y * 4); + tnz >>= 4; + lnz = (lnz >> 1) | (l << 7); + non_zero_dc = (non_zero_dc << 4) | nz_dc; + non_zero_ac = (non_zero_ac << 4) | nz_ac; } - out_t_nz = PACK(tnz, 24); - out_l_nz = PACK(lnz, 24); + out_t_nz = tnz; + out_l_nz = lnz >> 4; - tnz = kUnpackTab[mb->nz_ >> 4]; - lnz = kUnpackTab[left_mb->nz_ >> 4]; for (ch = 0; ch < 4; ch += 2) { + uint32_t nz_dc = 0, nz_ac = 0; + tnz = mb->nz_ >> (4 + ch); + lnz = left_mb->nz_ >> (4 + ch); for (y = 0; y < 2; ++y) { - int l = lnz.i8[ch + y]; + int l = lnz & 1; for (x = 0; x < 2; ++x) { - const int ctx = l + tnz.i8[ch + x]; + const int ctx = l + (tnz & 1); const int nz = GetCoeffs(token_br, (ProbaArray)dec->proba_.coeffs_[2], ctx, q->uv_mat_, 0, dst); - tnz.i8[ch + x] = l = (nz > 0); - nz_dc.i8[y * 2 + x] = (dst[0] != 0); - nz_ac.i8[y * 2 + x] = (nz > 1); + l = (nz > 0); + tnz = (tnz >> 1) | (l << 3); + nz_dc = (nz_dc << 1) | (dst[0] != 0); + nz_ac = (nz_ac << 1) | (nz > 1); dst += 16; } - lnz.i8[ch + y] = l; + tnz >>= 2; + lnz = (lnz >> 1) | (l << 5); } - non_zero_dc |= PACK(nz_dc, 8 - ch * 2); - non_zero_ac |= PACK(nz_ac, 8 - ch * 2); + // Note: we don't really need the per-4x4 details for U/V blocks. + non_zero_dc |= (nz_dc & 0x0f) << (16 + 2 * ch); + non_zero_ac |= (nz_ac & 0x0f) << (16 + 2 * ch); + out_t_nz |= (tnz << 4) << ch; + out_l_nz |= (lnz & 0xf0) << ch; } - out_t_nz |= PACK(tnz, 20); - out_l_nz |= PACK(lnz, 20); mb->nz_ = out_t_nz; left_mb->nz_ = out_l_nz; @@ -606,7 +590,6 @@ static int ParseResiduals(VP8Decoder* const dec, block->non_zero_ = non_zero_ac | non_zero_dc; return !block->non_zero_; // will be used for further optimization } -#undef PACK //------------------------------------------------------------------------------ // Main loop diff --git a/src/dec/vp8i.h b/src/dec/vp8i.h index 80df889d..308b61d6 100644 --- a/src/dec/vp8i.h +++ b/src/dec/vp8i.h @@ -177,8 +177,8 @@ typedef struct { uint8_t imodes_[16]; // one 16x16 mode (#0) or sixteen 4x4 modes uint8_t uvmode_; // chroma prediction mode // bit-wise info about the content of each sub-4x4 blocks: there are 16 bits - // for luma (bits #0->#15), then 4 bits for chroma-u (#16->#19) and 4 bits for - // chroma-v (#20->#23), each corresponding to one 4x4 block in decoding order. + // for luma (bits #15->#0), then 4 bits for chroma-u (#19->#16) and 4 bits for + // chroma-v (#23->#20), each corresponding to one 4x4 block in decoding order. // If the bit is set, the 4x4 block contains some non-zero coefficients. uint32_t non_zero_; uint32_t non_zero_ac_;