Special-case sparse transform

If the non-zero coefficients of a luma 4x4 block are confined to the first three (in[0], in[1] and in[4]), use a simplified transform; U/V blocks keep the regular one. Change-Id: I78a1252704228d21720d4bc1221252c84338d9c8
2026-01-09 13:49:18 +01:00 · 2013-10-08 22:05:38 +02:00
parent 00125196f3
commit f9bbc2a034
6 changed files with 159 additions and 72 deletions
--- a/src/dec/frame.c
+++ b/src/dec/frame.c
@@ -544,6 +544,34 @@ static void Copy32b(uint8_t* dst, uint8_t* src) {
  memcpy(dst, src, 4);
 }

+static void DoTransform(uint32_t bits, const int16_t* const src,
+                        uint8_t* const dst) {
+  switch (bits >> 30) {
+    case 3:
+      VP8Transform(src, dst, 0);
+      break;
+    case 2:
+      VP8TransformAC3(src, dst);
+      break;
+    case 1:
+      VP8TransformDC(src, dst);
+      break;
+    default:
+      break;
+  }
+}
+
+static void DoUVTransform(uint32_t bits, const int16_t* const src,
+                          uint8_t* const dst) {
+  if (bits & 0xff) {    // any non-zero coeff at all?
+    if (bits & 0xaa) {  // any non-zero AC coefficient?
+      VP8TransformUV(src, dst);   // note we don't use the AC3 variant for U/V
+    } else {
+      VP8TransformDCUV(src, dst);
+    }
+  }
+}
+
 void VP8ReconstructBlock(const VP8Decoder* const dec) {
  int j;
  uint8_t* const y_dst = dec->yuv_b_ + Y_OFF;
@@ -578,6 +606,7 @@ void VP8ReconstructBlock(const VP8Decoder* const dec) {
    // bring top samples into the cache
    VP8TopSamples* const top_yuv = dec->yuv_t_ + dec->mb_x_;
    const int16_t* const coeffs = block->coeffs_;
+    uint32_t bits = block->non_zero_y_;
    int n;

    if (dec->mb_y_ > 0) {
@@ -595,7 +624,6 @@ void VP8ReconstructBlock(const VP8Decoder* const dec) {
    // predict and add residuals
    if (block->is_i4x4_) {   // 4x4
      uint32_t* const top_right = (uint32_t*)(y_dst - BPS + 16);
-      uint32_t bits = (block->non_zero_ & 0xffff) | (block->non_zero_ac_ << 16);

      if (dec->mb_y_ > 0) {
        if (dec->mb_x_ >= dec->mb_w_ - 1) {    // on rightmost border
@@ -608,53 +636,29 @@ void VP8ReconstructBlock(const VP8Decoder* const dec) {
      top_right[BPS] = top_right[2 * BPS] = top_right[3 * BPS] = top_right[0];

      // predict and add residuals for all 4x4 blocks in turn.
-      for (n = 0; n < 16; ++n, bits <<= 1) {
+      for (n = 0; n < 16; ++n, bits <<= 2) {
        uint8_t* const dst = y_dst + kScan[n];
        VP8PredLuma4[block->imodes_[n]](dst);
-        if (bits & (1UL << 31)) {
-          VP8Transform(coeffs + n * 16, dst, 0);
-        } else if (bits & (1UL << 15)) {  // only DC is present
-          VP8TransformDC(coeffs + n * 16, dst);
-        }
+        DoTransform(bits, coeffs + n * 16, dst);
      }
    } else {    // 16x16
      const int pred_func = CheckMode(dec->mb_x_, dec->mb_y_,
                                      block->imodes_[0]);
-      uint32_t bits = (block->non_zero_ & 0xffff) | (block->non_zero_ac_ << 16);
      VP8PredLuma16[pred_func](y_dst);
-      if (bits & 0xffff) {
-        for (n = 0; n < 16; ++n, bits <<= 1) {
-          uint8_t* const dst = y_dst + kScan[n];
-          if (bits & (1UL << 31)) {
-            VP8Transform(coeffs + n * 16, dst, 0);
-          } else if (bits & (1UL << 15)) {  // only DC is present
-            VP8TransformDC(coeffs + n * 16, dst);
-          }
+      if (bits != 0) {
+        for (n = 0; n < 16; ++n, bits <<= 2) {
+          DoTransform(bits, coeffs + n * 16, y_dst + kScan[n]);
        }
      }
    }
    {
      // Chroma
+      const uint32_t bits_uv = block->non_zero_uv_;
      const int pred_func = CheckMode(dec->mb_x_, dec->mb_y_, block->uvmode_);
      VP8PredChroma8[pred_func](u_dst);
      VP8PredChroma8[pred_func](v_dst);
-
-      if (block->non_zero_ & 0x0f0000) {   // chroma-U
-        const int16_t* const u_coeffs = coeffs + 16 * 16;
-        if (block->non_zero_ac_ & 0x0f0000) {
-          VP8TransformUV(u_coeffs, u_dst);
-        } else {
-          VP8TransformDCUV(u_coeffs, u_dst);
-        }
-      }
-      if (block->non_zero_ & 0xf00000) {   // chroma-V
-        const int16_t* const v_coeffs = coeffs + 20 * 16;
-        if (block->non_zero_ac_ & 0xf00000) {
-          VP8TransformUV(v_coeffs, v_dst);
-        } else {
-          VP8TransformDCUV(v_coeffs, v_dst);
-        }
-      }
+      DoUVTransform(bits_uv >> 0, coeffs + 16 * 16, u_dst);
+      DoUVTransform(bits_uv >> 8, coeffs + 20 * 16, v_dst);
    }

    // stash away top samples for next block
--- a/src/dec/vp8.c
+++ b/src/dec/vp8.c
@@ -509,8 +509,8 @@ static int ParseResiduals(VP8Decoder* const dec,
  int16_t* dst = block->coeffs_;
  VP8MB* const left_mb = dec->mb_info_ - 1;
  uint8_t tnz, lnz;
-  uint32_t non_zero_ac = 0;
-  uint32_t non_zero_dc = 0;
+  uint32_t non_zero_y = 0;
+  uint32_t non_zero_uv = 0;
  int x, y, ch;
  uint32_t out_t_nz, out_l_nz;
  int first;
@@ -539,26 +539,27 @@ static int ParseResiduals(VP8Decoder* const dec,
  lnz = left_mb->nz_ & 0x0f;
  for (y = 0; y < 4; ++y) {
    int l = lnz & 1;
-    uint32_t nz_dc = 0, nz_ac = 0;
+    uint32_t nz_coeffs = 0;
    for (x = 0; x < 4; ++x) {
      const int ctx = l + (tnz & 1);
      const int nz = GetCoeffs(token_br, ac_proba, ctx, q->y1_mat_, first, dst);
      l = (nz > first);
      tnz = (tnz >> 1) | (l << 7);
-      nz_dc = (nz_dc << 1) | (dst[0] != 0);
-      nz_ac = (nz_ac << 1) | (nz > 1);
+      nz_coeffs <<= 2;
+      if (nz > 3) nz_coeffs |= 3;
+      else if (nz > 1) nz_coeffs |= 2;
+      else if (dst[0] != 0) nz_coeffs |= 1;
      dst += 16;
    }
    tnz >>= 4;
    lnz = (lnz >> 1) | (l << 7);
-    non_zero_dc = (non_zero_dc << 4) | nz_dc;
-    non_zero_ac = (non_zero_ac << 4) | nz_ac;
+    non_zero_y = (non_zero_y << 8) | nz_coeffs;
  }
  out_t_nz = tnz;
  out_l_nz = lnz >> 4;

  for (ch = 0; ch < 4; ch += 2) {
-    uint32_t nz_dc = 0, nz_ac = 0;
+    uint32_t nz_coeffs = 0;
    tnz = mb->nz_ >> (4 + ch);
    lnz = left_mb->nz_ >> (4 + ch);
    for (y = 0; y < 2; ++y) {
@@ -568,25 +569,26 @@ static int ParseResiduals(VP8Decoder* const dec,
        const int nz = GetCoeffs(token_br, bands[2], ctx, q->uv_mat_, 0, dst);
        l = (nz > 0);
        tnz = (tnz >> 1) | (l << 3);
-        nz_dc = (nz_dc << 1) | (dst[0] != 0);
-        nz_ac = (nz_ac << 1) | (nz > 1);
+        nz_coeffs <<= 2;
+        if (nz > 3) nz_coeffs |= 3;
+        else if (nz > 1) nz_coeffs |= 2;
+        else if (dst[0] != 0) nz_coeffs |= 1;
        dst += 16;
      }
      tnz >>= 2;
      lnz = (lnz >> 1) | (l << 5);
    }
    // Note: we don't really need the per-4x4 details for U/V blocks.
-    non_zero_dc |= (nz_dc & 0x0f) << (16 + 2 * ch);
-    non_zero_ac |= (nz_ac & 0x0f) << (16 + 2 * ch);
+    non_zero_uv |= nz_coeffs << (4 * ch);
    out_t_nz |= (tnz << 4) << ch;
    out_l_nz |= (lnz & 0xf0) << ch;
  }
  mb->nz_ = out_t_nz;
  left_mb->nz_ = out_l_nz;

-  block->non_zero_ac_ = non_zero_ac;
-  block->non_zero_ = non_zero_ac | non_zero_dc;
-  return !block->non_zero_;   // will be used for further optimization
+  block->non_zero_y_ = non_zero_y;
+  block->non_zero_uv_ = non_zero_uv;
+  return !(non_zero_y | non_zero_uv);  // will be used for further optimization
 }

 //------------------------------------------------------------------------------
@@ -621,8 +623,8 @@ int VP8DecodeMB(VP8Decoder* const dec, VP8BitReader* const token_br) {
    if (!block->is_i4x4_) {
      left->nz_dc_ = mb->nz_dc_ = 0;
    }
-    block->non_zero_ = 0;
-    block->non_zero_ac_ = 0;
+    block->non_zero_y_ = 0;
+    block->non_zero_uv_ = 0;
  }

  if (dec->filter_type_ > 0) {  // store filter info
--- a/src/dec/vp8i.h
+++ b/src/dec/vp8i.h
@@ -184,12 +184,15 @@ typedef struct {
  uint8_t is_i4x4_;       // true if intra4x4
  uint8_t imodes_[16];    // one 16x16 mode (#0) or sixteen 4x4 modes
  uint8_t uvmode_;        // chroma prediction mode
-  // bit-wise info about the content of each sub-4x4 blocks: there are 16 bits
-  // for luma (bits #15->#0), then 4 bits for chroma-u (#19->#16) and 4 bits for
-  // chroma-v (#23->#20), each corresponding to one 4x4 block in decoding order.
-  // If the bit is set, the 4x4 block contains some non-zero coefficients.
-  uint32_t non_zero_;
-  uint32_t non_zero_ac_;
+  // bit-wise info about the content of each sub-4x4 blocks (in decoding order).
+  // Each of the 4x4 blocks for y/u/v is associated with a 2b code according to:
+  //   code=0 -> no coefficient
+  //   code=1 -> only DC
+  //   code=2 -> only in[0], in[1] and in[4] may be non-zero (AC3 transform)
+  //   code=3 -> other coefficients may be non-zero too (full transform)
+  // This allows calling specialized transform functions.
+  uint32_t non_zero_y_;
+  uint32_t non_zero_uv_;
 } VP8MBData;

 // Persistent information needed by the parallel processing
--- a/src/dsp/dec.c
+++ b/src/dsp/dec.c
@@ -61,6 +61,14 @@ static WEBP_INLINE uint8_t clip_8b(int v) {
 #define STORE(x, y, v) \
  dst[x + y * BPS] = clip_8b(dst[x + y * BPS] + ((v) >> 3))

+#define STORE2(y, dc, d, c) do {    \
+  const int DC = (dc);              \
+  STORE(0, y, DC + (d));            \
+  STORE(1, y, DC + (c));            \
+  STORE(2, y, DC - (c));            \
+  STORE(3, y, DC - (d));            \
+} while (0)
+
 static const int kC1 = 20091 + (1 << 16);
 static const int kC2 = 35468;
 #define MUL(a, b) (((a) * (b)) >> 16)
@@ -103,7 +111,21 @@ static void TransformOne(const int16_t* in, uint8_t* dst) {
    dst += BPS;
  }
 }
+
+// Simplified transform when only in[0], in[1] and in[4] are non-zero
+static void TransformAC3(const int16_t* in, uint8_t* dst) {
+  const int a = in[0] + 4;
+  const int c4 = MUL(in[4], kC2);
+  const int d4 = MUL(in[4], kC1);
+  const int c1 = MUL(in[1], kC2);
+  const int d1 = MUL(in[1], kC1);
+  STORE2(0, a + d4, d1, c1);
+  STORE2(1, a + c4, d1, c1);
+  STORE2(2, a - c4, d1, c1);
+  STORE2(3, a - d4, d1, c1);
+}
 #undef MUL
+#undef STORE2

 static void TransformTwo(const int16_t* in, uint8_t* dst, int do_two) {
  TransformOne(in, dst);
@@ -679,6 +701,7 @@ static void HFilter8i(uint8_t* u, uint8_t* v, int stride,
 //------------------------------------------------------------------------------

 VP8DecIdct2 VP8Transform;
+VP8DecIdct VP8TransformAC3;
 VP8DecIdct VP8TransformUV;
 VP8DecIdct VP8TransformDC;
 VP8DecIdct VP8TransformDCUV;
@@ -706,6 +729,7 @@ void VP8DspInit(void) {
  VP8TransformUV = TransformUV;
  VP8TransformDC = TransformDC;
  VP8TransformDCUV = TransformDCUV;
+  VP8TransformAC3 = TransformAC3;

  VP8VFilter16 = VFilter16;
  VP8HFilter16 = HFilter16;
--- a/src/dsp/dec_sse2.c
+++ b/src/dsp/dec_sse2.c
@@ -20,6 +20,10 @@ extern "C" {

 #if defined(WEBP_USE_SSE2)

+// The SSE2 version of the 3-coeff sparse transform does not seem to be faster
+// than the plain-C one, so it is disabled by default. Uncomment to enable:
+// #define USE_TRANSFORM_AC3
+
 #include <emmintrin.h>
 #include "../dec/vp8i.h"

@@ -201,16 +205,16 @@ static void TransformSSE2(const int16_t* in, uint8_t* dst, int do_two) {
    __m128i dst0, dst1, dst2, dst3;
    if (do_two) {
      // Load eight bytes/pixels per line.
-      dst0 = _mm_loadl_epi64((__m128i*)&dst[0 * BPS]);
-      dst1 = _mm_loadl_epi64((__m128i*)&dst[1 * BPS]);
-      dst2 = _mm_loadl_epi64((__m128i*)&dst[2 * BPS]);
-      dst3 = _mm_loadl_epi64((__m128i*)&dst[3 * BPS]);
+      dst0 = _mm_loadl_epi64((__m128i*)(dst + 0 * BPS));
+      dst1 = _mm_loadl_epi64((__m128i*)(dst + 1 * BPS));
+      dst2 = _mm_loadl_epi64((__m128i*)(dst + 2 * BPS));
+      dst3 = _mm_loadl_epi64((__m128i*)(dst + 3 * BPS));
    } else {
      // Load four bytes/pixels per line.
-      dst0 = _mm_cvtsi32_si128(*(int*)&dst[0 * BPS]);
-      dst1 = _mm_cvtsi32_si128(*(int*)&dst[1 * BPS]);
-      dst2 = _mm_cvtsi32_si128(*(int*)&dst[2 * BPS]);
-      dst3 = _mm_cvtsi32_si128(*(int*)&dst[3 * BPS]);
+      dst0 = _mm_cvtsi32_si128(*(int*)(dst + 0 * BPS));
+      dst1 = _mm_cvtsi32_si128(*(int*)(dst + 1 * BPS));
+      dst2 = _mm_cvtsi32_si128(*(int*)(dst + 2 * BPS));
+      dst3 = _mm_cvtsi32_si128(*(int*)(dst + 3 * BPS));
    }
    // Convert to 16b.
    dst0 = _mm_unpacklo_epi8(dst0, zero);
@@ -230,20 +234,66 @@ static void TransformSSE2(const int16_t* in, uint8_t* dst, int do_two) {
    // Store the results.
    if (do_two) {
      // Store eight bytes/pixels per line.
-      _mm_storel_epi64((__m128i*)&dst[0 * BPS], dst0);
-      _mm_storel_epi64((__m128i*)&dst[1 * BPS], dst1);
-      _mm_storel_epi64((__m128i*)&dst[2 * BPS], dst2);
-      _mm_storel_epi64((__m128i*)&dst[3 * BPS], dst3);
+      _mm_storel_epi64((__m128i*)(dst + 0 * BPS), dst0);
+      _mm_storel_epi64((__m128i*)(dst + 1 * BPS), dst1);
+      _mm_storel_epi64((__m128i*)(dst + 2 * BPS), dst2);
+      _mm_storel_epi64((__m128i*)(dst + 3 * BPS), dst3);
    } else {
      // Store four bytes/pixels per line.
-      *((int32_t *)&dst[0 * BPS]) = _mm_cvtsi128_si32(dst0);
-      *((int32_t *)&dst[1 * BPS]) = _mm_cvtsi128_si32(dst1);
-      *((int32_t *)&dst[2 * BPS]) = _mm_cvtsi128_si32(dst2);
-      *((int32_t *)&dst[3 * BPS]) = _mm_cvtsi128_si32(dst3);
+      *(int*)(dst + 0 * BPS) = _mm_cvtsi128_si32(dst0);
+      *(int*)(dst + 1 * BPS) = _mm_cvtsi128_si32(dst1);
+      *(int*)(dst + 2 * BPS) = _mm_cvtsi128_si32(dst2);
+      *(int*)(dst + 3 * BPS) = _mm_cvtsi128_si32(dst3);
    }
  }
 }

+#if defined(USE_TRANSFORM_AC3)
+#define MUL(a, b) (((a) * (b)) >> 16)
+static void TransformAC3SSE2(const int16_t* in, uint8_t* dst) {
+  static const int kC1 = 20091 + (1 << 16);
+  static const int kC2 = 35468;
+  const __m128i A = _mm_set1_epi16(in[0] + 4);
+  const __m128i c4 = _mm_set1_epi16(MUL(in[4], kC2));
+  const __m128i d4 = _mm_set1_epi16(MUL(in[4], kC1));
+  const int c1 = MUL(in[1], kC2);
+  const int d1 = MUL(in[1], kC1);
+  const __m128i CD = _mm_set_epi16(0, 0, 0, 0, -d1, -c1, c1, d1);
+  const __m128i B = _mm_adds_epi16(A, CD);
+  const __m128i m0 = _mm_adds_epi16(B, d4);
+  const __m128i m1 = _mm_adds_epi16(B, c4);
+  const __m128i m2 = _mm_subs_epi16(B, c4);
+  const __m128i m3 = _mm_subs_epi16(B, d4);
+  const __m128i zero = _mm_setzero_si128();
+  // Load the source pixels.
+  __m128i dst0 = _mm_cvtsi32_si128(*(int*)(dst + 0 * BPS));
+  __m128i dst1 = _mm_cvtsi32_si128(*(int*)(dst + 1 * BPS));
+  __m128i dst2 = _mm_cvtsi32_si128(*(int*)(dst + 2 * BPS));
+  __m128i dst3 = _mm_cvtsi32_si128(*(int*)(dst + 3 * BPS));
+  // Convert to 16b.
+  dst0 = _mm_unpacklo_epi8(dst0, zero);
+  dst1 = _mm_unpacklo_epi8(dst1, zero);
+  dst2 = _mm_unpacklo_epi8(dst2, zero);
+  dst3 = _mm_unpacklo_epi8(dst3, zero);
+  // Add the inverse transform.
+  dst0 = _mm_adds_epi16(dst0, _mm_srai_epi16(m0, 3));
+  dst1 = _mm_adds_epi16(dst1, _mm_srai_epi16(m1, 3));
+  dst2 = _mm_adds_epi16(dst2, _mm_srai_epi16(m2, 3));
+  dst3 = _mm_adds_epi16(dst3, _mm_srai_epi16(m3, 3));
+  // Unsigned saturate to 8b.
+  dst0 = _mm_packus_epi16(dst0, dst0);
+  dst1 = _mm_packus_epi16(dst1, dst1);
+  dst2 = _mm_packus_epi16(dst2, dst2);
+  dst3 = _mm_packus_epi16(dst3, dst3);
+  // Store the results.
+  *(int*)(dst + 0 * BPS) = _mm_cvtsi128_si32(dst0);
+  *(int*)(dst + 1 * BPS) = _mm_cvtsi128_si32(dst1);
+  *(int*)(dst + 2 * BPS) = _mm_cvtsi128_si32(dst2);
+  *(int*)(dst + 3 * BPS) = _mm_cvtsi128_si32(dst3);
+}
+#undef MUL
+#endif   // USE_TRANSFORM_AC3
+
 //------------------------------------------------------------------------------
 // Loop Filter (Paragraph 15)

@@ -888,6 +938,9 @@ extern void VP8DspInitSSE2(void);
 void VP8DspInitSSE2(void) {
 #if defined(WEBP_USE_SSE2)
  VP8Transform = TransformSSE2;
+#if defined(USE_TRANSFORM_AC3)
+  VP8TransformAC3 = TransformAC3SSE2;
+#endif

  VP8VFilter16 = VFilter16SSE2;
  VP8HFilter16 = HFilter16SSE2;
--- a/src/dsp/dsp.h
+++ b/src/dsp/dsp.h
@@ -103,6 +103,7 @@ typedef void (*VP8DecIdct)(const int16_t* coeffs, uint8_t* dst);
 // when doing two transforms, coeffs is actually int16_t[2][16].
 typedef void (*VP8DecIdct2)(const int16_t* coeffs, uint8_t* dst, int do_two);
 extern VP8DecIdct2 VP8Transform;
+extern VP8DecIdct VP8TransformAC3;
 extern VP8DecIdct VP8TransformUV;
 extern VP8DecIdct VP8TransformDC;
 extern VP8DecIdct VP8TransformDCUV;