mirror of
				https://github.com/webmproject/libwebp.git
				synced 2025-10-31 10:25:46 +01:00 
			
		
		
		
	FastMBAnalyze() for quick i16/i4 decision
The decision is based on the variance between the DC values of each sub-4x4 block. This heuristic is reasonably good at predicting whether the 2nd transform (intra-16) is going to help or not. The decision threshold varies with quality (= quantization). It is only used for -m 0 and -m 1, where no full RD-opt is performed. It makes these modes noticeably faster, with an RD curve much closer to that of the -m 2 mode. Change-Id: I15f972db97ba4082cbd1dfd16bee3eb2eca701a8
This commit is contained in:
		| @@ -185,6 +185,11 @@ typedef int (*VP8WMetric)(const uint8_t* pix, const uint8_t* ref, | |||||||
| // 4 by 4 symmetric matrix. | // 4 by 4 symmetric matrix. | ||||||
| extern VP8WMetric VP8TDisto4x4, VP8TDisto16x16; | extern VP8WMetric VP8TDisto4x4, VP8TDisto16x16; | ||||||
|  |  | ||||||
|  | // Compute the average (DC) of four 4x4 blocks. | ||||||
|  | // Each sub-4x4 block #i sum is stored in dc[i]. | ||||||
|  | typedef void (*VP8MeanMetric)(const uint8_t* ref, uint32_t dc[4]); | ||||||
|  | extern VP8MeanMetric VP8Mean16x4; | ||||||
|  |  | ||||||
| typedef void (*VP8BlockCopy)(const uint8_t* src, uint8_t* dst); | typedef void (*VP8BlockCopy)(const uint8_t* src, uint8_t* dst); | ||||||
| extern VP8BlockCopy VP8Copy4x4; | extern VP8BlockCopy VP8Copy4x4; | ||||||
| extern VP8BlockCopy VP8Copy16x8; | extern VP8BlockCopy VP8Copy16x8; | ||||||
|   | |||||||
| @@ -551,6 +551,20 @@ static int SSE4x4(const uint8_t* a, const uint8_t* b) { | |||||||
|   return GetSSE(a, b, 4, 4); |   return GetSSE(a, b, 4, 4); | ||||||
| } | } | ||||||
|  |  | ||||||
|  | static void Mean16x4(const uint8_t* ref, uint32_t dc[4]) { | ||||||
|  |   int k, x, y; | ||||||
|  |   for (k = 0; k < 4; ++k) { | ||||||
|  |     uint32_t avg = 0; | ||||||
|  |     for (y = 0; y < 4; ++y) { | ||||||
|  |       for (x = 0; x < 4; ++x) { | ||||||
|  |         avg += ref[x + y * BPS]; | ||||||
|  |       } | ||||||
|  |     } | ||||||
|  |     dc[k] = avg; | ||||||
|  |     ref += 4;   // go to next 4x4 block. | ||||||
|  |   } | ||||||
|  | } | ||||||
|  |  | ||||||
| //------------------------------------------------------------------------------ | //------------------------------------------------------------------------------ | ||||||
| // Texture distortion | // Texture distortion | ||||||
| // | // | ||||||
| @@ -757,6 +771,7 @@ VP8Metric VP8SSE16x8; | |||||||
| VP8Metric VP8SSE4x4; | VP8Metric VP8SSE4x4; | ||||||
| VP8WMetric VP8TDisto4x4; | VP8WMetric VP8TDisto4x4; | ||||||
| VP8WMetric VP8TDisto16x16; | VP8WMetric VP8TDisto16x16; | ||||||
|  | VP8MeanMetric VP8Mean16x4; | ||||||
| VP8QuantizeBlock VP8EncQuantizeBlock; | VP8QuantizeBlock VP8EncQuantizeBlock; | ||||||
| VP8Quantize2Blocks VP8EncQuantize2Blocks; | VP8Quantize2Blocks VP8EncQuantize2Blocks; | ||||||
| VP8QuantizeBlockWHT VP8EncQuantizeBlockWHT; | VP8QuantizeBlockWHT VP8EncQuantizeBlockWHT; | ||||||
| @@ -795,6 +810,7 @@ WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspInit(void) { | |||||||
|   VP8SSE4x4 = SSE4x4; |   VP8SSE4x4 = SSE4x4; | ||||||
|   VP8TDisto4x4 = Disto4x4; |   VP8TDisto4x4 = Disto4x4; | ||||||
|   VP8TDisto16x16 = Disto16x16; |   VP8TDisto16x16 = Disto16x16; | ||||||
|  |   VP8Mean16x4 = Mean16x4; | ||||||
|   VP8EncQuantizeBlock = QuantizeBlock; |   VP8EncQuantizeBlock = QuantizeBlock; | ||||||
|   VP8EncQuantize2Blocks = Quantize2Blocks; |   VP8EncQuantize2Blocks = Quantize2Blocks; | ||||||
|   VP8EncQuantizeBlockWHT = QuantizeBlock; |   VP8EncQuantizeBlockWHT = QuantizeBlock; | ||||||
|   | |||||||
| @@ -1045,6 +1045,37 @@ static int SSE4x4(const uint8_t* a, const uint8_t* b) { | |||||||
|   return (tmp[3] + tmp[2] + tmp[1] + tmp[0]); |   return (tmp[3] + tmp[2] + tmp[1] + tmp[0]); | ||||||
| } | } | ||||||
|  |  | ||||||
|  | //------------------------------------------------------------------------------ | ||||||
|  |  | ||||||
|  | static void Mean16x4(const uint8_t* ref, uint32_t dc[4]) { | ||||||
|  |   const __m128i mask = _mm_set1_epi16(0x00ff); | ||||||
|  |   const __m128i a0 = _mm_loadu_si128((const __m128i*)&ref[BPS * 0]); | ||||||
|  |   const __m128i a1 = _mm_loadu_si128((const __m128i*)&ref[BPS * 1]); | ||||||
|  |   const __m128i a2 = _mm_loadu_si128((const __m128i*)&ref[BPS * 2]); | ||||||
|  |   const __m128i a3 = _mm_loadu_si128((const __m128i*)&ref[BPS * 3]); | ||||||
|  |   const __m128i b0 = _mm_srli_epi16(a0, 8);     // hi byte | ||||||
|  |   const __m128i b1 = _mm_srli_epi16(a1, 8); | ||||||
|  |   const __m128i b2 = _mm_srli_epi16(a2, 8); | ||||||
|  |   const __m128i b3 = _mm_srli_epi16(a3, 8); | ||||||
|  |   const __m128i c0 = _mm_and_si128(a0, mask);   // lo byte | ||||||
|  |   const __m128i c1 = _mm_and_si128(a1, mask); | ||||||
|  |   const __m128i c2 = _mm_and_si128(a2, mask); | ||||||
|  |   const __m128i c3 = _mm_and_si128(a3, mask); | ||||||
|  |   const __m128i d0 = _mm_add_epi32(b0, c0); | ||||||
|  |   const __m128i d1 = _mm_add_epi32(b1, c1); | ||||||
|  |   const __m128i d2 = _mm_add_epi32(b2, c2); | ||||||
|  |   const __m128i d3 = _mm_add_epi32(b3, c3); | ||||||
|  |   const __m128i e0 = _mm_add_epi32(d0, d1); | ||||||
|  |   const __m128i e1 = _mm_add_epi32(d2, d3); | ||||||
|  |   const __m128i f0 = _mm_add_epi32(e0, e1); | ||||||
|  |   uint16_t tmp[8]; | ||||||
|  |   _mm_storeu_si128((__m128i*)tmp, f0); | ||||||
|  |   dc[0] = tmp[0] + tmp[1]; | ||||||
|  |   dc[1] = tmp[2] + tmp[3]; | ||||||
|  |   dc[2] = tmp[4] + tmp[5]; | ||||||
|  |   dc[3] = tmp[6] + tmp[7]; | ||||||
|  | } | ||||||
|  |  | ||||||
| //------------------------------------------------------------------------------ | //------------------------------------------------------------------------------ | ||||||
| // Texture distortion | // Texture distortion | ||||||
| // | // | ||||||
| @@ -1331,6 +1362,7 @@ WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspInitSSE2(void) { | |||||||
|   VP8SSE4x4 = SSE4x4; |   VP8SSE4x4 = SSE4x4; | ||||||
|   VP8TDisto4x4 = Disto4x4; |   VP8TDisto4x4 = Disto4x4; | ||||||
|   VP8TDisto16x16 = Disto16x16; |   VP8TDisto16x16 = Disto16x16; | ||||||
|  |   VP8Mean16x4 = Mean16x4; | ||||||
| } | } | ||||||
|  |  | ||||||
| #else  // !WEBP_USE_SSE2 | #else  // !WEBP_USE_SSE2 | ||||||
|   | |||||||
| @@ -262,6 +262,29 @@ static int MBAnalyzeBestIntra16Mode(VP8EncIterator* const it) { | |||||||
|   return best_alpha; |   return best_alpha; | ||||||
| } | } | ||||||
|  |  | ||||||
|  | static int FastMBAnalyze(VP8EncIterator* const it) { | ||||||
|  |   // Empirical cut-off value, should be around 16 (~=block size). We use the | ||||||
|  |   // [8-17] range and favor intra4 at high quality, intra16 for low quality. | ||||||
|  |   const int q = (int)it->enc_->config_->quality; | ||||||
|  |   const uint32_t kThreshold = 8 + (17 - 8) * q / 100; | ||||||
|  |   int k; | ||||||
|  |   uint32_t dc[16], m, m2; | ||||||
|  |   for (k = 0; k < 16; k += 4) { | ||||||
|  |     VP8Mean16x4(it->yuv_in_ + Y_OFF_ENC + k * BPS, &dc[k]); | ||||||
|  |   } | ||||||
|  |   for (m = 0, m2 = 0, k = 0; k < 16; ++k) { | ||||||
|  |     m += dc[k]; | ||||||
|  |     m2 += dc[k] * dc[k]; | ||||||
|  |   } | ||||||
|  |   if (kThreshold * m2 < m * m) { | ||||||
|  |     VP8SetIntra16Mode(it, 0);   // DC16 | ||||||
|  |   } else { | ||||||
|  |     const uint8_t modes[16] = { 0 };  // DC4 | ||||||
|  |     VP8SetIntra4Mode(it, modes); | ||||||
|  |   } | ||||||
|  |   return 0; | ||||||
|  | } | ||||||
|  |  | ||||||
| static int MBAnalyzeBestIntra4Mode(VP8EncIterator* const it, | static int MBAnalyzeBestIntra4Mode(VP8EncIterator* const it, | ||||||
|                                    int best_alpha) { |                                    int best_alpha) { | ||||||
|   uint8_t modes[16]; |   uint8_t modes[16]; | ||||||
| @@ -339,13 +362,17 @@ static void MBAnalyze(VP8EncIterator* const it, | |||||||
|   VP8SetSkip(it, 0);         // not skipped |   VP8SetSkip(it, 0);         // not skipped | ||||||
|   VP8SetSegment(it, 0);      // default segment, spec-wise. |   VP8SetSegment(it, 0);      // default segment, spec-wise. | ||||||
|  |  | ||||||
|   best_alpha = MBAnalyzeBestIntra16Mode(it); |   if (enc->method_ <= 1) { | ||||||
|   if (enc->method_ >= 5) { |     best_alpha = FastMBAnalyze(it); | ||||||
|     // We go and make a fast decision for intra4/intra16. |   } else { | ||||||
|     // It's usually not a good and definitive pick, but helps seeding the stats |     best_alpha = MBAnalyzeBestIntra16Mode(it); | ||||||
|     // about level bit-cost. |     if (enc->method_ >= 5) { | ||||||
|     // TODO(skal): improve criterion. |       // We go and make a fast decision for intra4/intra16. | ||||||
|     best_alpha = MBAnalyzeBestIntra4Mode(it, best_alpha); |       // It's usually not a good and definitive pick, but helps seeding the | ||||||
|  |       // stats about level bit-cost. | ||||||
|  |       // TODO(skal): improve criterion. | ||||||
|  |       best_alpha = MBAnalyzeBestIntra4Mode(it, best_alpha); | ||||||
|  |     } | ||||||
|   } |   } | ||||||
|   best_uv_alpha = MBAnalyzeBestUVMode(it); |   best_uv_alpha = MBAnalyzeBestUVMode(it); | ||||||
|  |  | ||||||
| @@ -448,7 +475,7 @@ int VP8EncAnalyze(VP8Encoder* const enc) { | |||||||
|   const int do_segments = |   const int do_segments = | ||||||
|       enc->config_->emulate_jpeg_size ||   // We need the complexity evaluation. |       enc->config_->emulate_jpeg_size ||   // We need the complexity evaluation. | ||||||
|       (enc->segment_hdr_.num_segments_ > 1) || |       (enc->segment_hdr_.num_segments_ > 1) || | ||||||
|       (enc->method_ == 0);  // for method 0, we need preds_[] to be filled. |       (enc->method_ <= 1);  // for method 0 - 1, we need preds_[] to be filled. | ||||||
|   if (do_segments) { |   if (do_segments) { | ||||||
|     const int last_row = enc->mb_h_; |     const int last_row = enc->mb_h_; | ||||||
|     // We give a little more than a half work to the main thread. |     // We give a little more than a half work to the main thread. | ||||||
|   | |||||||
| @@ -75,7 +75,7 @@ static void ResetBoundaryPredictions(VP8Encoder* const enc) { | |||||||
| //-------------------+---+---+---+---+---+---+---+ | //-------------------+---+---+---+---+---+---+---+ | ||||||
| // dynamic proba     | ~ | x | x | x | x | x | x | | // dynamic proba     | ~ | x | x | x | x | x | x | | ||||||
| //-------------------+---+---+---+---+---+---+---+ | //-------------------+---+---+---+---+---+---+---+ | ||||||
| // fast mode analysis|   |   |   |   | x | x | x | | // fast mode analysis|[x]|[x]|   |   | x | x | x | | ||||||
| //-------------------+---+---+---+---+---+---+---+ | //-------------------+---+---+---+---+---+---+---+ | ||||||
| // basic rd-opt      |   |   |   | x | x | x | x | | // basic rd-opt      |   |   |   | x | x | x | x | | ||||||
| //-------------------+---+---+---+---+---+---+---+ | //-------------------+---+---+---+---+---+---+---+ | ||||||
|   | |||||||
		Reference in New Issue
	
	Block a user