FastMBAnalyze() for quick i16/i4 decision

The decision is based on the variance between DC values of each sub-4x4 block. This heuristic is rather ok for predicting whether the 2nd transform (intra-16) is going to help or not. The decision threshold varies with quality (=quantization). It's only used for -m 0 and -m 1, where no full RD-opt is performed. It actually makes these modes quite faster, with RD curve much closer to the -m 2 mode. Change-Id: I15f972db97ba4082cbd1dfd16bee3eb2eca701a8
2024-11-20 04:18:26 +01:00 · 2016-07-15 11:21:08 -07:00 · 2016-07-15 11:21:08 -07:00 · 5b60db5c9d
commit 5b60db5c9d
parent 567e697776
5 changed files with 89 additions and 9 deletions
--- a/src/dsp/dsp.h
+++ b/src/dsp/dsp.h
@ -185,6 +185,11 @@ typedef int (*VP8WMetric)(const uint8_t* pix, const uint8_t* ref,
 // 4 by 4 symmetric matrix.
 extern VP8WMetric VP8TDisto4x4, VP8TDisto16x16;

+// Compute the average (DC) of four 4x4 blocks.
+// Each sub-4x4 block #i sum is stored in dc[i].
+typedef void (*VP8MeanMetric)(const uint8_t* ref, uint32_t dc[4]);
+extern VP8MeanMetric VP8Mean16x4;
+
 typedef void (*VP8BlockCopy)(const uint8_t* src, uint8_t* dst);
 extern VP8BlockCopy VP8Copy4x4;
 extern VP8BlockCopy VP8Copy16x8;
--- a/src/dsp/enc.c
+++ b/src/dsp/enc.c
@ -551,6 +551,20 @@ static int SSE4x4(const uint8_t* a, const uint8_t* b) {
  return GetSSE(a, b, 4, 4);
 }

+static void Mean16x4(const uint8_t* ref, uint32_t dc[4]) {
+  int k, x, y;
+  for (k = 0; k < 4; ++k) {
+    uint32_t avg = 0;
+    for (y = 0; y < 4; ++y) {
+      for (x = 0; x < 4; ++x) {
+        avg += ref[x + y * BPS];
+      }
+    }
+    dc[k] = avg;
+    ref += 4;   // go to next 4x4 block.
+  }
+}
+
 //------------------------------------------------------------------------------
 // Texture distortion
 //
@ -757,6 +771,7 @@ VP8Metric VP8SSE16x8;
 VP8Metric VP8SSE4x4;
 VP8WMetric VP8TDisto4x4;
 VP8WMetric VP8TDisto16x16;
+VP8MeanMetric VP8Mean16x4;
 VP8QuantizeBlock VP8EncQuantizeBlock;
 VP8Quantize2Blocks VP8EncQuantize2Blocks;
 VP8QuantizeBlockWHT VP8EncQuantizeBlockWHT;
@ -795,6 +810,7 @@ WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspInit(void) {
  VP8SSE4x4 = SSE4x4;
  VP8TDisto4x4 = Disto4x4;
  VP8TDisto16x16 = Disto16x16;
+  VP8Mean16x4 = Mean16x4;
  VP8EncQuantizeBlock = QuantizeBlock;
  VP8EncQuantize2Blocks = Quantize2Blocks;
  VP8EncQuantizeBlockWHT = QuantizeBlock;
--- a/src/dsp/enc_sse2.c
+++ b/src/dsp/enc_sse2.c
@ -1045,6 +1045,37 @@ static int SSE4x4(const uint8_t* a, const uint8_t* b) {
  return (tmp[3] + tmp[2] + tmp[1] + tmp[0]);
 }

+//------------------------------------------------------------------------------
+
+static void Mean16x4(const uint8_t* ref, uint32_t dc[4]) {
+  const __m128i mask = _mm_set1_epi16(0x00ff);
+  const __m128i a0 = _mm_loadu_si128((const __m128i*)&ref[BPS * 0]);
+  const __m128i a1 = _mm_loadu_si128((const __m128i*)&ref[BPS * 1]);
+  const __m128i a2 = _mm_loadu_si128((const __m128i*)&ref[BPS * 2]);
+  const __m128i a3 = _mm_loadu_si128((const __m128i*)&ref[BPS * 3]);
+  const __m128i b0 = _mm_srli_epi16(a0, 8);     // hi byte
+  const __m128i b1 = _mm_srli_epi16(a1, 8);
+  const __m128i b2 = _mm_srli_epi16(a2, 8);
+  const __m128i b3 = _mm_srli_epi16(a3, 8);
+  const __m128i c0 = _mm_and_si128(a0, mask);   // lo byte
+  const __m128i c1 = _mm_and_si128(a1, mask);
+  const __m128i c2 = _mm_and_si128(a2, mask);
+  const __m128i c3 = _mm_and_si128(a3, mask);
+  const __m128i d0 = _mm_add_epi32(b0, c0);
+  const __m128i d1 = _mm_add_epi32(b1, c1);
+  const __m128i d2 = _mm_add_epi32(b2, c2);
+  const __m128i d3 = _mm_add_epi32(b3, c3);
+  const __m128i e0 = _mm_add_epi32(d0, d1);
+  const __m128i e1 = _mm_add_epi32(d2, d3);
+  const __m128i f0 = _mm_add_epi32(e0, e1);
+  uint16_t tmp[8];
+  _mm_storeu_si128((__m128i*)tmp, f0);
+  dc[0] = tmp[0] + tmp[1];
+  dc[1] = tmp[2] + tmp[3];
+  dc[2] = tmp[4] + tmp[5];
+  dc[3] = tmp[6] + tmp[7];
+}
+
 //------------------------------------------------------------------------------
 // Texture distortion
 //
@ -1331,6 +1362,7 @@ WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspInitSSE2(void) {
  VP8SSE4x4 = SSE4x4;
  VP8TDisto4x4 = Disto4x4;
  VP8TDisto16x16 = Disto16x16;
+  VP8Mean16x4 = Mean16x4;
 }

 #else  // !WEBP_USE_SSE2
--- a/src/enc/analysis.c
+++ b/src/enc/analysis.c
@ -262,6 +262,29 @@ static int MBAnalyzeBestIntra16Mode(VP8EncIterator* const it) {
  return best_alpha;
 }

+static int FastMBAnalyze(VP8EncIterator* const it) {
+  // Empirical cut-off value, should be around 16 (~=block size). We use the
+  // [8-17] range and favor intra4 at high quality, intra16 for low quality.
+  const int q = (int)it->enc_->config_->quality;
+  const uint32_t kThreshold = 8 + (17 - 8) * q / 100;
+  int k;
+  uint32_t dc[16], m, m2;
+  for (k = 0; k < 16; k += 4) {
+    VP8Mean16x4(it->yuv_in_ + Y_OFF_ENC + k * BPS, &dc[k]);
+  }
+  for (m = 0, m2 = 0, k = 0; k < 16; ++k) {
+    m += dc[k];
+    m2 += dc[k] * dc[k];
+  }
+  if (kThreshold * m2 < m * m) {
+    VP8SetIntra16Mode(it, 0);   // DC16
+  } else {
+    const uint8_t modes[16] = { 0 };  // DC4
+    VP8SetIntra4Mode(it, modes);
+  }
+  return 0;
+}
+
 static int MBAnalyzeBestIntra4Mode(VP8EncIterator* const it,
                                   int best_alpha) {
  uint8_t modes[16];
@ -339,14 +362,18 @@ static void MBAnalyze(VP8EncIterator* const it,
  VP8SetSkip(it, 0);         // not skipped
  VP8SetSegment(it, 0);      // default segment, spec-wise.

+  if (enc->method_ <= 1) {
+    best_alpha = FastMBAnalyze(it);
+  } else {
    best_alpha = MBAnalyzeBestIntra16Mode(it);
    if (enc->method_ >= 5) {
      // We go and make a fast decision for intra4/intra16.
-    // It's usually not a good and definitive pick, but helps seeding the stats
-    // about level bit-cost.
+      // It's usually not a good and definitive pick, but helps seeding the
+      // stats about level bit-cost.
      // TODO(skal): improve criterion.
      best_alpha = MBAnalyzeBestIntra4Mode(it, best_alpha);
    }
+  }
  best_uv_alpha = MBAnalyzeBestUVMode(it);

  // Final susceptibility mix
@ -448,7 +475,7 @@ int VP8EncAnalyze(VP8Encoder* const enc) {
  const int do_segments =
      enc->config_->emulate_jpeg_size ||   // We need the complexity evaluation.
      (enc->segment_hdr_.num_segments_ > 1) ||
-      (enc->method_ == 0);  // for method 0, we need preds_[] to be filled.
+      (enc->method_ <= 1);  // for method 0 - 1, we need preds_[] to be filled.
  if (do_segments) {
    const int last_row = enc->mb_h_;
    // We give a little more than a half work to the main thread.
--- a/src/enc/webpenc.c
+++ b/src/enc/webpenc.c
@ -75,7 +75,7 @@ static void ResetBoundaryPredictions(VP8Encoder* const enc) {
 //-------------------+---+---+---+---+---+---+---+
 // dynamic proba     | ~ | x | x | x | x | x | x |
 //-------------------+---+---+---+---+---+---+---+
-// fast mode analysis|   |   |   |   | x | x | x |
+// fast mode analysis|[x]|[x]|   |   | x | x | x |
 //-------------------+---+---+---+---+---+---+---+
 // basic rd-opt      |   |   |   | x | x | x | x |
 //-------------------+---+---+---+---+---+---+---+