fix -m 2 mode-cost evaluation (causing partition0 overflow)

The mode's bits were not taken into account, which is OK in most cases.
But for super-large images with 'easy' content, their overhead starts
to matter a lot, and we were failing to optimize for these.
Now, these mode bits have their own associated lambda values, limiting
the jerkiness. We also limit (for -m 2 only) the individual number of bits
to a value that will prevent the partition #0 overflow.

Removed the I4_PENALTY constant, which was a rather crude approximation.
Replaced it with a q-dependent expression.

fixes issue #289

Change-Id: I956ae2d2308c339adc4706d52722f0bb61ccf18c
This commit is contained in:
Pascal Massimino 2016-03-11 20:34:45 +01:00
parent 4562e83dc2
commit e88c4ca013
3 changed files with 58 additions and 25 deletions

View File

@ -30,8 +30,6 @@
#define SNS_TO_DQ 0.9 // Scaling constant between the sns value and the QP #define SNS_TO_DQ 0.9 // Scaling constant between the sns value and the QP
// power-law modulation. Must be strictly less than 1. // power-law modulation. Must be strictly less than 1.
#define I4_PENALTY 14000 // Rate-penalty for quick i4/i16 decision
// number of non-zero coeffs below which we consider the block very flat // number of non-zero coeffs below which we consider the block very flat
// (and apply a penalty to complex predictions) // (and apply a penalty to complex predictions)
#define FLATNESS_LIMIT_I16 10 // I16 mode #define FLATNESS_LIMIT_I16 10 // I16 mode
@ -236,6 +234,8 @@ static int ExpandMatrix(VP8Matrix* const m, int type) {
return (sum + 8) >> 4; return (sum + 8) >> 4;
} }
static void CheckLambdaValue(int* const v) { if (*v < 1) *v = 1; }
static void SetupMatrices(VP8Encoder* enc) { static void SetupMatrices(VP8Encoder* enc) {
int i; int i;
const int tlambda_scale = const int tlambda_scale =
@ -245,7 +245,7 @@ static void SetupMatrices(VP8Encoder* enc) {
for (i = 0; i < num_segments; ++i) { for (i = 0; i < num_segments; ++i) {
VP8SegmentInfo* const m = &enc->dqm_[i]; VP8SegmentInfo* const m = &enc->dqm_[i];
const int q = m->quant_; const int q = m->quant_;
int q4, q16, quv; int q_i4, q_i16, q_uv;
m->y1_.q_[0] = kDcTable[clip(q + enc->dq_y1_dc_, 0, 127)]; m->y1_.q_[0] = kDcTable[clip(q + enc->dq_y1_dc_, 0, 127)];
m->y1_.q_[1] = kAcTable[clip(q, 0, 127)]; m->y1_.q_[1] = kAcTable[clip(q, 0, 127)];
@ -255,21 +255,33 @@ static void SetupMatrices(VP8Encoder* enc) {
m->uv_.q_[0] = kDcTable[clip(q + enc->dq_uv_dc_, 0, 117)]; m->uv_.q_[0] = kDcTable[clip(q + enc->dq_uv_dc_, 0, 117)];
m->uv_.q_[1] = kAcTable[clip(q + enc->dq_uv_ac_, 0, 127)]; m->uv_.q_[1] = kAcTable[clip(q + enc->dq_uv_ac_, 0, 127)];
q4 = ExpandMatrix(&m->y1_, 0); q_i4 = ExpandMatrix(&m->y1_, 0);
q16 = ExpandMatrix(&m->y2_, 1); q_i16 = ExpandMatrix(&m->y2_, 1);
quv = ExpandMatrix(&m->uv_, 2); q_uv = ExpandMatrix(&m->uv_, 2);
m->lambda_i4_ = (3 * q4 * q4) >> 7; m->lambda_i4_ = (3 * q_i4 * q_i4) >> 7;
m->lambda_i16_ = (3 * q16 * q16); m->lambda_i16_ = (3 * q_i16 * q_i16);
m->lambda_uv_ = (3 * quv * quv) >> 6; m->lambda_uv_ = (3 * q_uv * q_uv) >> 6;
m->lambda_mode_ = (1 * q4 * q4) >> 7; m->lambda_mode_ = (1 * q_i4 * q_i4) >> 7;
m->lambda_trellis_i4_ = (7 * q4 * q4) >> 3; m->lambda_trellis_i4_ = (7 * q_i4 * q_i4) >> 3;
m->lambda_trellis_i16_ = (q16 * q16) >> 2; m->lambda_trellis_i16_ = (q_i16 * q_i16) >> 2;
m->lambda_trellis_uv_ = (quv *quv) << 1; m->lambda_trellis_uv_ = (q_uv * q_uv) << 1;
m->tlambda_ = (tlambda_scale * q4) >> 5; m->tlambda_ = (tlambda_scale * q_i4) >> 5;
// none of these constants should be < 1
CheckLambdaValue(&m->lambda_i4_);
CheckLambdaValue(&m->lambda_i16_);
CheckLambdaValue(&m->lambda_uv_);
CheckLambdaValue(&m->lambda_mode_);
CheckLambdaValue(&m->lambda_trellis_i4_);
CheckLambdaValue(&m->lambda_trellis_i16_);
CheckLambdaValue(&m->lambda_trellis_uv_);
CheckLambdaValue(&m->tlambda_);
m->min_disto_ = 10 * m->y1_.q_[0]; // quantization-aware min disto m->min_disto_ = 10 * m->y1_.q_[0]; // quantization-aware min disto
m->max_edge_ = 0; m->max_edge_ = 0;
m->i4_penalty_ = 1000 * q_i4 * q_i4;
} }
} }
@ -1127,19 +1139,29 @@ static void RefineUsingDistortion(VP8EncIterator* const it,
int try_both_modes, int refine_uv_mode, int try_both_modes, int refine_uv_mode,
VP8ModeScore* const rd) { VP8ModeScore* const rd) {
score_t best_score = MAX_COST; score_t best_score = MAX_COST;
score_t score_i4 = (score_t)I4_PENALTY;
int16_t tmp_levels[16][16];
uint8_t modes_i4[16];
int nz = 0; int nz = 0;
int mode; int mode;
int is_i16 = try_both_modes || (it->mb_->type_ == 1); int is_i16 = try_both_modes || (it->mb_->type_ == 1);
const VP8SegmentInfo* const dqm = &it->enc_->dqm_[it->mb_->segment_];
// Some empiric constants, of approximate order of magnitude.
const int lambda_d_i16 = 106;
const int lambda_d_i4 = 11;
const int lambda_d_uv = 120;
score_t score_i4 = dqm->i4_penalty_;
score_t i4_bit_sum = 0;
const score_t bit_limit = it->enc_->mb_header_limit_;
if (is_i16) { // First, evaluate Intra16 distortion if (is_i16) { // First, evaluate Intra16 distortion
int best_mode = -1; int best_mode = -1;
const uint8_t* const src = it->yuv_in_ + Y_OFF_ENC; const uint8_t* const src = it->yuv_in_ + Y_OFF_ENC;
for (mode = 0; mode < NUM_PRED_MODES; ++mode) { for (mode = 0; mode < NUM_PRED_MODES; ++mode) {
const uint8_t* const ref = it->yuv_p_ + VP8I16ModeOffsets[mode]; const uint8_t* const ref = it->yuv_p_ + VP8I16ModeOffsets[mode];
const score_t score = VP8SSE16x16(src, ref); const score_t score = VP8SSE16x16(src, ref) * RD_DISTO_MULT
+ VP8FixedCostsI16[mode] * lambda_d_i16;
if (mode > 0 && VP8FixedCostsI16[mode] > bit_limit) {
continue;
}
if (score < best_score) { if (score < best_score) {
best_mode = mode; best_mode = mode;
best_score = score; best_score = score;
@ -1159,25 +1181,28 @@ static void RefineUsingDistortion(VP8EncIterator* const it,
int best_i4_mode = -1; int best_i4_mode = -1;
score_t best_i4_score = MAX_COST; score_t best_i4_score = MAX_COST;
const uint8_t* const src = it->yuv_in_ + Y_OFF_ENC + VP8Scan[it->i4_]; const uint8_t* const src = it->yuv_in_ + Y_OFF_ENC + VP8Scan[it->i4_];
const uint16_t* const mode_costs = GetCostModeI4(it, rd->modes_i4);
VP8MakeIntra4Preds(it); VP8MakeIntra4Preds(it);
for (mode = 0; mode < NUM_BMODES; ++mode) { for (mode = 0; mode < NUM_BMODES; ++mode) {
const uint8_t* const ref = it->yuv_p_ + VP8I4ModeOffsets[mode]; const uint8_t* const ref = it->yuv_p_ + VP8I4ModeOffsets[mode];
const score_t score = VP8SSE4x4(src, ref); const score_t score = VP8SSE4x4(src, ref) * RD_DISTO_MULT
+ mode_costs[mode] * lambda_d_i4;
if (score < best_i4_score) { if (score < best_i4_score) {
best_i4_mode = mode; best_i4_mode = mode;
best_i4_score = score; best_i4_score = score;
} }
} }
modes_i4[it->i4_] = best_i4_mode; i4_bit_sum += mode_costs[best_i4_mode];
rd->modes_i4[it->i4_] = best_i4_mode;
score_i4 += best_i4_score; score_i4 += best_i4_score;
if (score_i4 >= best_score) { if (score_i4 >= best_score || i4_bit_sum > bit_limit) {
// Intra4 won't be better than Intra16. Bail out and pick Intra16. // Intra4 won't be better than Intra16. Bail out and pick Intra16.
is_i16 = 1; is_i16 = 1;
break; break;
} else { // reconstruct partial block inside yuv_out2_ buffer } else { // reconstruct partial block inside yuv_out2_ buffer
uint8_t* const tmp_dst = it->yuv_out2_ + Y_OFF_ENC + VP8Scan[it->i4_]; uint8_t* const tmp_dst = it->yuv_out2_ + Y_OFF_ENC + VP8Scan[it->i4_];
nz |= ReconstructIntra4(it, tmp_levels[it->i4_], nz |= ReconstructIntra4(it, rd->y_ac_levels[it->i4_],
src, tmp_dst, best_i4_mode) << it->i4_; src, tmp_dst, best_i4_mode) << it->i4_;
} }
} while (VP8IteratorRotateI4(it, it->yuv_out2_ + Y_OFF_ENC)); } while (VP8IteratorRotateI4(it, it->yuv_out2_ + Y_OFF_ENC));
@ -1185,8 +1210,7 @@ static void RefineUsingDistortion(VP8EncIterator* const it,
// Final reconstruction, depending on which mode is selected. // Final reconstruction, depending on which mode is selected.
if (!is_i16) { if (!is_i16) {
VP8SetIntra4Mode(it, modes_i4); VP8SetIntra4Mode(it, rd->modes_i4);
memcpy(rd->y_ac_levels, tmp_levels, sizeof(tmp_levels));
SwapOut(it); SwapOut(it);
best_score = score_i4; best_score = score_i4;
} else { } else {
@ -1200,7 +1224,8 @@ static void RefineUsingDistortion(VP8EncIterator* const it,
const uint8_t* const src = it->yuv_in_ + U_OFF_ENC; const uint8_t* const src = it->yuv_in_ + U_OFF_ENC;
for (mode = 0; mode < NUM_PRED_MODES; ++mode) { for (mode = 0; mode < NUM_PRED_MODES; ++mode) {
const uint8_t* const ref = it->yuv_p_ + VP8UVModeOffsets[mode]; const uint8_t* const ref = it->yuv_p_ + VP8UVModeOffsets[mode];
const score_t score = VP8SSE16x8(src, ref); const score_t score = VP8SSE16x8(src, ref) * RD_DISTO_MULT
+ VP8FixedCostsUV[mode] * lambda_d_uv;
if (score < best_uv_score) { if (score < best_uv_score) {
best_mode = mode; best_mode = mode;
best_uv_score = score; best_uv_score = score;

View File

@ -196,6 +196,9 @@ typedef struct {
int lambda_i16_, lambda_i4_, lambda_uv_; int lambda_i16_, lambda_i4_, lambda_uv_;
int lambda_mode_, lambda_trellis_, tlambda_; int lambda_mode_, lambda_trellis_, tlambda_;
int lambda_trellis_i16_, lambda_trellis_i4_, lambda_trellis_uv_; int lambda_trellis_i16_, lambda_trellis_i4_, lambda_trellis_uv_;
// lambda values for distortion-based evaluation
score_t i4_penalty_; // penalty for using Intra4
} VP8SegmentInfo; } VP8SegmentInfo;
// Handy transient struct to accumulate score and info during RD-optimization // Handy transient struct to accumulate score and info during RD-optimization
@ -391,6 +394,7 @@ struct VP8Encoder {
int method_; // 0=fastest, 6=best/slowest. int method_; // 0=fastest, 6=best/slowest.
VP8RDLevel rd_opt_level_; // Deduced from method_. VP8RDLevel rd_opt_level_; // Deduced from method_.
int max_i4_header_bits_; // partition #0 safeness factor int max_i4_header_bits_; // partition #0 safeness factor
int mb_header_limit_; // rough limit for header bits per MB
int thread_level_; // derived from config->thread_level int thread_level_; // derived from config->thread_level
int do_search_; // derived from config->target_XXX int do_search_; // derived from config->target_XXX
int use_tokens_; // if true, use token buffer int use_tokens_; // if true, use token buffer

View File

@ -105,6 +105,10 @@ static void MapConfigToTools(VP8Encoder* const enc) {
256 * 16 * 16 * // upper bound: up to 16bit per 4x4 block 256 * 16 * 16 * // upper bound: up to 16bit per 4x4 block
(limit * limit) / (100 * 100); // ... modulated with a quadratic curve. (limit * limit) / (100 * 100); // ... modulated with a quadratic curve.
// partition0 = 512k max.
enc->mb_header_limit_ =
(score_t)256 * 510 * 8 * 1024 / (enc->mb_w_ * enc->mb_h_);
enc->thread_level_ = config->thread_level; enc->thread_level_ = config->thread_level;
enc->do_search_ = (config->target_size > 0 || config->target_PSNR > 0); enc->do_search_ = (config->target_size > 0 || config->target_PSNR > 0);