From c13fecf908a612a646e378802b1b108a9719cc71 Mon Sep 17 00:00:00 2001
From: skal <pascal.massimino@gmail.com>
Date: Thu, 5 Sep 2013 08:53:36 +0200
Subject: [PATCH] remove the PACK() bit-packing tricks

The multiply-based PACK() bit-packing was too smart for its own good :)
Plain shifts and masks are more ARM-friendly, since they remove a multiply.

Change-Id: If146034c8efa2e71e3eaaf1230cb553884a42ebb
---
 src/dec/frame.c | 16 +++++-----
 src/dec/vp8.c   | 81 +++++++++++++++++++------------------------------
 src/dec/vp8i.h  |  4 +--
 3 files changed, 43 insertions(+), 58 deletions(-)

diff --git a/src/dec/frame.c b/src/dec/frame.c
index 9baac8e1..9051b567 100644
--- a/src/dec/frame.c
+++ b/src/dec/frame.c
@@ -595,6 +595,7 @@ void VP8ReconstructBlock(const VP8Decoder* const dec) {
     // predict and add residuals
     if (block->is_i4x4_) {   // 4x4
       uint32_t* const top_right = (uint32_t*)(y_dst - BPS + 16);
+      uint32_t bits = (block->non_zero_ & 0xffff) | (block->non_zero_ac_ << 16);
 
       if (dec->mb_y_ > 0) {
         if (dec->mb_x_ >= dec->mb_w_ - 1) {    // on rightmost border
@@ -607,25 +608,26 @@ void VP8ReconstructBlock(const VP8Decoder* const dec) {
       top_right[BPS] = top_right[2 * BPS] = top_right[3 * BPS] = top_right[0];
 
       // predict and add residuals for all 4x4 blocks in turn.
-      for (n = 0; n < 16; n++) {
+      for (n = 0; n < 16; ++n, bits <<= 1) {
         uint8_t* const dst = y_dst + kScan[n];
         VP8PredLuma4[block->imodes_[n]](dst);
-        if (block->non_zero_ac_ & (1 << n)) {
+        if (bits & (1UL << 31)) {
           VP8Transform(coeffs + n * 16, dst, 0);
-        } else if (block->non_zero_ & (1 << n)) {  // only DC is present
+        } else if (bits & (1UL << 15)) {  // only DC is present
           VP8TransformDC(coeffs + n * 16, dst);
         }
       }
     } else {    // 16x16
       const int pred_func = CheckMode(dec->mb_x_, dec->mb_y_,
                                       block->imodes_[0]);
+      uint32_t bits = (block->non_zero_ & 0xffff) | (block->non_zero_ac_ << 16);
       VP8PredLuma16[pred_func](y_dst);
-      if (block->non_zero_ & 0xffff) {
-        for (n = 0; n < 16; n++) {
+      if (bits & 0xffff) {
+        for (n = 0; n < 16; ++n, bits <<= 1) {
           uint8_t* const dst = y_dst + kScan[n];
-          if (block->non_zero_ac_ & (1 << n)) {
+          if (bits & (1UL << 31)) {
             VP8Transform(coeffs + n * 16, dst, 0);
-          } else if (block->non_zero_ & (1 << n)) {  // only DC is present
+          } else if (bits & (1UL << 15)) {  // only DC is present
             VP8TransformDC(coeffs + n * 16, dst);
           }
         }
diff --git a/src/dec/vp8.c b/src/dec/vp8.c
index c91d2bcd..59efb71c 100644
--- a/src/dec/vp8.c
+++ b/src/dec/vp8.c
@@ -505,28 +505,6 @@ static int GetCoeffs(VP8BitReader* const br, ProbaArray prob,
   return 16;
 }
 
-// Alias-safe way of converting 4bytes to 32bits.
-typedef union {
-  uint8_t  i8[4];
-  uint32_t i32;
-} PackedNz;
-
-// Table to unpack four bits into four bytes
-static const PackedNz kUnpackTab[16] = {
-  {{0, 0, 0, 0}},  {{1, 0, 0, 0}},  {{0, 1, 0, 0}},  {{1, 1, 0, 0}},
-  {{0, 0, 1, 0}},  {{1, 0, 1, 0}},  {{0, 1, 1, 0}},  {{1, 1, 1, 0}},
-  {{0, 0, 0, 1}},  {{1, 0, 0, 1}},  {{0, 1, 0, 1}},  {{1, 1, 0, 1}},
-  {{0, 0, 1, 1}},  {{1, 0, 1, 1}},  {{0, 1, 1, 1}},  {{1, 1, 1, 1}} };
-
-// Macro to pack four LSB of four bytes into four bits.
-#if defined(__PPC__) || defined(_M_PPC) || defined(_ARCH_PPC) || \
-    defined(__BIG_ENDIAN__)
-#define PACK_CST 0x08040201U
-#else
-#define PACK_CST 0x01020408U
-#endif
-#define PACK(X, S) ((((X).i32 * PACK_CST) & 0xff000000) >> (S))
-
 static int ParseResiduals(VP8Decoder* const dec,
                           VP8MB* const mb, VP8BitReader* const token_br) {
   uint32_t out_t_nz, out_l_nz;
@@ -536,8 +514,7 @@ static int ParseResiduals(VP8Decoder* const dec,
   VP8MBData* const block = dec->mb_data_;
   int16_t* dst = block->coeffs_;
   VP8MB* const left_mb = dec->mb_info_ - 1;
-  PackedNz nz_ac, nz_dc;
-  PackedNz tnz, lnz;
+  uint8_t tnz, lnz;
   uint32_t non_zero_ac = 0;
   uint32_t non_zero_dc = 0;
   int x, y, ch;
@@ -557,48 +534,55 @@ static int ParseResiduals(VP8Decoder* const dec,
     ac_prob = (ProbaArray)dec->proba_.coeffs_[3];
   }
 
-  tnz = kUnpackTab[mb->nz_ & 0xf];
-  lnz = kUnpackTab[left_mb->nz_ & 0xf];
+  tnz = mb->nz_ & 0x0f;
+  lnz = left_mb->nz_ & 0x0f;
   for (y = 0; y < 4; ++y) {
-    int l = lnz.i8[y];
+    int l = lnz & 1;
+    uint32_t nz_dc = 0, nz_ac = 0;
     for (x = 0; x < 4; ++x) {
-      const int ctx = l + tnz.i8[x];
+      const int ctx = l + (tnz & 1);
       const int nz = GetCoeffs(token_br, ac_prob, ctx,
                                q->y1_mat_, first, dst);
-      tnz.i8[x] = l = (nz > 0);
-      nz_dc.i8[x] = (dst[0] != 0);
-      nz_ac.i8[x] = (nz > 1);
+      l = (nz > 0);
+      tnz = (tnz >> 1) | (l << 7);
+      nz_dc = (nz_dc << 1) | (dst[0] != 0);
+      nz_ac = (nz_ac << 1) | (nz > 1);
       dst += 16;
     }
-    lnz.i8[y] = l;
-    non_zero_dc |= PACK(nz_dc, 24 - y * 4);
-    non_zero_ac |= PACK(nz_ac, 24 - y * 4);
+    tnz >>= 4;
+    lnz = (lnz >> 1) | (l << 7);
+    non_zero_dc = (non_zero_dc << 4) | nz_dc;
+    non_zero_ac = (non_zero_ac << 4) | nz_ac;
   }
-  out_t_nz = PACK(tnz, 24);
-  out_l_nz = PACK(lnz, 24);
+  out_t_nz = tnz;
+  out_l_nz = lnz >> 4;
 
-  tnz = kUnpackTab[mb->nz_ >> 4];
-  lnz = kUnpackTab[left_mb->nz_ >> 4];
   for (ch = 0; ch < 4; ch += 2) {
+    uint32_t nz_dc = 0, nz_ac = 0;
+    tnz = mb->nz_ >> (4 + ch);
+    lnz = left_mb->nz_ >> (4 + ch);
     for (y = 0; y < 2; ++y) {
-      int l = lnz.i8[ch + y];
+      int l = lnz & 1;
       for (x = 0; x < 2; ++x) {
-        const int ctx = l + tnz.i8[ch + x];
+        const int ctx = l + (tnz & 1);
         const int nz =
             GetCoeffs(token_br, (ProbaArray)dec->proba_.coeffs_[2],
                       ctx, q->uv_mat_, 0, dst);
-        tnz.i8[ch + x] = l = (nz > 0);
-        nz_dc.i8[y * 2 + x] = (dst[0] != 0);
-        nz_ac.i8[y * 2 + x] = (nz > 1);
+        l = (nz > 0);
+        tnz = (tnz >> 1) | (l << 3);
+        nz_dc = (nz_dc << 1) | (dst[0] != 0);
+        nz_ac = (nz_ac << 1) | (nz > 1);
         dst += 16;
       }
-      lnz.i8[ch + y] = l;
+      tnz >>= 2;
+      lnz = (lnz >> 1) | (l << 5);
     }
-    non_zero_dc |= PACK(nz_dc, 8 - ch * 2);
-    non_zero_ac |= PACK(nz_ac, 8 - ch * 2);
+    // Note: we don't really need the per-4x4 details for U/V blocks.
+    non_zero_dc |= (nz_dc & 0x0f) << (16 + 2 * ch);
+    non_zero_ac |= (nz_ac & 0x0f) << (16 + 2 * ch);
+    out_t_nz |= (tnz << 4) << ch;
+    out_l_nz |= (lnz & 0xf0) << ch;
   }
-  out_t_nz |= PACK(tnz, 20);
-  out_l_nz |= PACK(lnz, 20);
   mb->nz_ = out_t_nz;
   left_mb->nz_ = out_l_nz;
 
@@ -606,7 +590,6 @@ static int ParseResiduals(VP8Decoder* const dec,
   block->non_zero_ = non_zero_ac | non_zero_dc;
   return !block->non_zero_;   // will be used for further optimization
 }
-#undef PACK
 
 //------------------------------------------------------------------------------
 // Main loop
diff --git a/src/dec/vp8i.h b/src/dec/vp8i.h
index 80df889d..308b61d6 100644
--- a/src/dec/vp8i.h
+++ b/src/dec/vp8i.h
@@ -177,8 +177,8 @@ typedef struct {
   uint8_t imodes_[16];    // one 16x16 mode (#0) or sixteen 4x4 modes
   uint8_t uvmode_;        // chroma prediction mode
   // bit-wise info about the content of each sub-4x4 blocks: there are 16 bits
-  // for luma (bits #0->#15), then 4 bits for chroma-u (#16->#19) and 4 bits for
-  // chroma-v (#20->#23), each corresponding to one 4x4 block in decoding order.
+  // for luma (bits #15->#0), then 4 bits for chroma-u (#19->#16) and 4 bits for
+  // chroma-v (#23->#20), each corresponding to one 4x4 block in decoding order.
   // If the bit is set, the 4x4 block contains some non-zero coefficients.
   uint32_t non_zero_;
   uint32_t non_zero_ac_;