From 5b60db5c9db7f78d6cd97612a5a6202b9a254e56 Mon Sep 17 00:00:00 2001 From: skal Date: Fri, 15 Jul 2016 11:21:08 -0700 Subject: [PATCH] FastMBAnalyze() for quick i16/i4 decision The decision is based on the variance between DC values of each sub-4x4 block. This heuristic is rather ok for predicting whether the 2nd transform (intra-16) is going to help or not. The decision threshold varies with quality (=quantization). It's only used for -m 0 and -m 1, where no full RD-opt is performed. It actually makes these modes quite faster, with RD curve much closer to the -m 2 mode. Change-Id: I15f972db97ba4082cbd1dfd16bee3eb2eca701a8 --- src/dsp/dsp.h | 5 +++++ src/dsp/enc.c | 16 ++++++++++++++++ src/dsp/enc_sse2.c | 32 ++++++++++++++++++++++++++++++++ src/enc/analysis.c | 43 +++++++++++++++++++++++++++++++++++-------- src/enc/webpenc.c | 2 +- 5 files changed, 89 insertions(+), 9 deletions(-) diff --git a/src/dsp/dsp.h b/src/dsp/dsp.h index 1faac27b..d2e59ed5 100644 --- a/src/dsp/dsp.h +++ b/src/dsp/dsp.h @@ -185,6 +185,11 @@ typedef int (*VP8WMetric)(const uint8_t* pix, const uint8_t* ref, // 4 by 4 symmetric matrix. extern VP8WMetric VP8TDisto4x4, VP8TDisto16x16; +// Compute the average (DC) of four 4x4 blocks. +// Each sub-4x4 block #i sum is stored in dc[i]. +typedef void (*VP8MeanMetric)(const uint8_t* ref, uint32_t dc[4]); +extern VP8MeanMetric VP8Mean16x4; + typedef void (*VP8BlockCopy)(const uint8_t* src, uint8_t* dst); extern VP8BlockCopy VP8Copy4x4; extern VP8BlockCopy VP8Copy16x8; diff --git a/src/dsp/enc.c b/src/dsp/enc.c index 64a47bf7..110ef189 100644 --- a/src/dsp/enc.c +++ b/src/dsp/enc.c @@ -551,6 +551,20 @@ static int SSE4x4(const uint8_t* a, const uint8_t* b) { return GetSSE(a, b, 4, 4); } +static void Mean16x4(const uint8_t* ref, uint32_t dc[4]) { + int k, x, y; + for (k = 0; k < 4; ++k) { + uint32_t avg = 0; + for (y = 0; y < 4; ++y) { + for (x = 0; x < 4; ++x) { + avg += ref[x + y * BPS]; + } + } + dc[k] = avg; + ref += 4; // go to next 4x4 block. + } +} + //------------------------------------------------------------------------------ // Texture distortion // @@ -757,6 +771,7 @@ VP8Metric VP8SSE16x8; VP8Metric VP8SSE4x4; VP8WMetric VP8TDisto4x4; VP8WMetric VP8TDisto16x16; +VP8MeanMetric VP8Mean16x4; VP8QuantizeBlock VP8EncQuantizeBlock; VP8Quantize2Blocks VP8EncQuantize2Blocks; VP8QuantizeBlockWHT VP8EncQuantizeBlockWHT; @@ -795,6 +810,7 @@ WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspInit(void) { VP8SSE4x4 = SSE4x4; VP8TDisto4x4 = Disto4x4; VP8TDisto16x16 = Disto16x16; + VP8Mean16x4 = Mean16x4; VP8EncQuantizeBlock = QuantizeBlock; VP8EncQuantize2Blocks = Quantize2Blocks; VP8EncQuantizeBlockWHT = QuantizeBlock; diff --git a/src/dsp/enc_sse2.c b/src/dsp/enc_sse2.c index 238aa436..0d3cb2de 100644 --- a/src/dsp/enc_sse2.c +++ b/src/dsp/enc_sse2.c @@ -1045,6 +1045,37 @@ static int SSE4x4(const uint8_t* a, const uint8_t* b) { return (tmp[3] + tmp[2] + tmp[1] + tmp[0]); } +//------------------------------------------------------------------------------ + +static void Mean16x4(const uint8_t* ref, uint32_t dc[4]) { + const __m128i mask = _mm_set1_epi16(0x00ff); + const __m128i a0 = _mm_loadu_si128((const __m128i*)&ref[BPS * 0]); + const __m128i a1 = _mm_loadu_si128((const __m128i*)&ref[BPS * 1]); + const __m128i a2 = _mm_loadu_si128((const __m128i*)&ref[BPS * 2]); + const __m128i a3 = _mm_loadu_si128((const __m128i*)&ref[BPS * 3]); + const __m128i b0 = _mm_srli_epi16(a0, 8); // hi byte + const __m128i b1 = _mm_srli_epi16(a1, 8); + const __m128i b2 = _mm_srli_epi16(a2, 8); + const __m128i b3 = _mm_srli_epi16(a3, 8); + const __m128i c0 = _mm_and_si128(a0, mask); // lo byte + const __m128i c1 = _mm_and_si128(a1, mask); + const __m128i c2 = _mm_and_si128(a2, mask); + const __m128i c3 = _mm_and_si128(a3, mask); + const __m128i d0 = _mm_add_epi32(b0, c0); + const __m128i d1 = _mm_add_epi32(b1, c1); + const __m128i d2 = _mm_add_epi32(b2, c2); + const __m128i d3 = _mm_add_epi32(b3, c3); + const __m128i e0 = _mm_add_epi32(d0, d1); + const __m128i e1 = _mm_add_epi32(d2, d3); + const __m128i f0 = _mm_add_epi32(e0, e1); + uint16_t tmp[8]; + _mm_storeu_si128((__m128i*)tmp, f0); + dc[0] = tmp[0] + tmp[1]; + dc[1] = tmp[2] + tmp[3]; + dc[2] = tmp[4] + tmp[5]; + dc[3] = tmp[6] + tmp[7]; +} + //------------------------------------------------------------------------------ // Texture distortion // @@ -1331,6 +1362,7 @@ WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspInitSSE2(void) { VP8SSE4x4 = SSE4x4; VP8TDisto4x4 = Disto4x4; VP8TDisto16x16 = Disto16x16; + VP8Mean16x4 = Mean16x4; } #else // !WEBP_USE_SSE2 diff --git a/src/enc/analysis.c b/src/enc/analysis.c index b55128fd..e81b3a72 100644 --- a/src/enc/analysis.c +++ b/src/enc/analysis.c @@ -262,6 +262,29 @@ static int MBAnalyzeBestIntra16Mode(VP8EncIterator* const it) { return best_alpha; } +static int FastMBAnalyze(VP8EncIterator* const it) { + // Empirical cut-off value, should be around 16 (~=block size). We use the + // [8-17] range and favor intra4 at high quality, intra16 for low quality. + const int q = (int)it->enc_->config_->quality; + const uint32_t kThreshold = 8 + (17 - 8) * q / 100; + int k; + uint32_t dc[16], m, m2; + for (k = 0; k < 16; k += 4) { + VP8Mean16x4(it->yuv_in_ + Y_OFF_ENC + k * BPS, &dc[k]); + } + for (m = 0, m2 = 0, k = 0; k < 16; ++k) { + m += dc[k]; + m2 += dc[k] * dc[k]; + } + if (kThreshold * m2 < m * m) { + VP8SetIntra16Mode(it, 0); // DC16 + } else { + const uint8_t modes[16] = { 0 }; // DC4 + VP8SetIntra4Mode(it, modes); + } + return 0; +} + static int MBAnalyzeBestIntra4Mode(VP8EncIterator* const it, int best_alpha) { uint8_t modes[16]; @@ -339,13 +362,17 @@ static void MBAnalyze(VP8EncIterator* const it, VP8SetSkip(it, 0); // not skipped VP8SetSegment(it, 0); // default segment, spec-wise. - best_alpha = MBAnalyzeBestIntra16Mode(it); - if (enc->method_ >= 5) { - // We go and make a fast decision for intra4/intra16. - // It's usually not a good and definitive pick, but helps seeding the stats - // about level bit-cost. - // TODO(skal): improve criterion. - best_alpha = MBAnalyzeBestIntra4Mode(it, best_alpha); + if (enc->method_ <= 1) { + best_alpha = FastMBAnalyze(it); + } else { + best_alpha = MBAnalyzeBestIntra16Mode(it); + if (enc->method_ >= 5) { + // We go and make a fast decision for intra4/intra16. + // It's usually not a good and definitive pick, but helps seeding the + // stats about level bit-cost. + // TODO(skal): improve criterion. + best_alpha = MBAnalyzeBestIntra4Mode(it, best_alpha); + } } best_uv_alpha = MBAnalyzeBestUVMode(it); @@ -448,7 +475,7 @@ int VP8EncAnalyze(VP8Encoder* const enc) { const int do_segments = enc->config_->emulate_jpeg_size || // We need the complexity evaluation. (enc->segment_hdr_.num_segments_ > 1) || - (enc->method_ == 0); // for method 0, we need preds_[] to be filled. + (enc->method_ <= 1); // for method 0 - 1, we need preds_[] to be filled. if (do_segments) { const int last_row = enc->mb_h_; // We give a little more than a half work to the main thread. diff --git a/src/enc/webpenc.c b/src/enc/webpenc.c index a7d04ea2..bccd803c 100644 --- a/src/enc/webpenc.c +++ b/src/enc/webpenc.c @@ -75,7 +75,7 @@ static void ResetBoundaryPredictions(VP8Encoder* const enc) { //-------------------+---+---+---+---+---+---+---+ // dynamic proba | ~ | x | x | x | x | x | x | //-------------------+---+---+---+---+---+---+---+ -// fast mode analysis| | | | | x | x | x | +// fast mode analysis|[x]|[x]| | | x | x | x | //-------------------+---+---+---+---+---+---+---+ // basic rd-opt | | | | x | x | x | x | //-------------------+---+---+---+---+---+---+---+