5-10% encoding speedup with faster trellis (-m 6)

Mostly by:
- storing a single RD score instead of cost and distortion separately
- evaluating the terminal cost only once
- getting some invariants out of the loops
- more consts behind fewer variables

Change-Id: I79451f3fd1143d6537200fb8b90d0ba252809f8c
2025-06-30 16:14:29 +02:00 · 2014-03-03 22:07:06 +01:00 · 2014-03-03 22:07:06 +01:00 · 5aeeb087d6
commit 5aeeb087d6
parent e3dd9243cb
1 changed files with 44 additions and 49 deletions
--- a/src/enc/quant.c
+++ b/src/enc/quant.c
@ -517,11 +517,10 @@ static void AddScore(VP8ModeScore* const dst, const VP8ModeScore* const src) {
 // Trellis
 typedef struct {
-  int prev;        // best previous
+  int prev;               // best previous node
-  int level;       // level
+  int level;              // level
-  int sign;        // sign of coeff_i
+  int sign;               // sign of coeff_i
-  score_t cost;    // bit cost
+  score_t score;          // partial RD score
  score_t error;   // distortion = sum of (|coeff_i| - level_i * Q_i)^2
  const uint16_t* costs;  // shortcut to cost tables
 } Node;
@ -582,8 +581,8 @@ static int TrellisQuantizeBlock(const VP8EncIterator* const it,
    // initialize source node.
    n = first - 1;
    for (m = -MIN_DELTA; m <= MAX_DELTA; ++m) {
-      NODE(n, m).cost = (ctx0 == 0) ? VP8BitCost(1, last_proba) : 0;
+      const score_t rate = (ctx0 == 0) ? VP8BitCost(1, last_proba) : 0;
-      NODE(n, m).error = max_error;
+      NODE(n, m).score = RDScoreTrellis(lambda, rate, max_error);
      NODE(n, m).costs = costs[VP8EncBands[first]][ctx0];
    }
  }
@ -604,63 +603,59 @@ static int TrellisQuantizeBlock(const VP8EncIterator* const it,
    // test all alternate level values around level0.
    for (m = -MIN_DELTA; m <= MAX_DELTA; ++m) {
      Node* const cur = &NODE(n, m);
      int delta_error, new_error;
      score_t cur_score = MAX_COST;
      int level = level0 + m;
      const int ctx = (level > 2) ? 2 : level;
      const int band = VP8EncBands[n + 1];
-      int last_pos_cost;   // extra cost if last coeff's position is < 15
+      score_t base_score, last_pos_cost;
      cur->score = MAX_COST;
      if (level > MAX_LEVEL || level < 0) {   // node is dead?
        continue;
      }
      cur->sign = sign;
      cur->level = level;
      cur->costs = costs[band][ctx];
      if (level > MAX_LEVEL || level < 0) {   // node is dead?
        cur->cost = MAX_COST;
        continue;
      }
      last_pos_cost = (n < 15) ? VP8BitCost(0, probas[band][ctx][0])
                               : 0;
-      // Compute delta_error = how much coding this level will
+      // Compute extra rate cost if last coeff's position is < 15
-      // subtract as distortion to max_error
+      last_pos_cost = (n < 15) ? VP8BitCost(0, probas[band][ctx][0]) : 0;
-      new_error = coeff0 - level * Q;
+
-      delta_error =
+      {
-          kWeightTrellis[j] * (coeff0 * coeff0 - new_error * new_error);
+        // Compute delta_error = how much coding this level will
        // subtract to max_error as distortion.
        // Here, distortion = sum of (|coeff_i| - level_i * Q_i)^2
        const int new_error = coeff0 - level * Q;
        const int delta_error =
            kWeightTrellis[j] * (new_error * new_error - coeff0 * coeff0);
        base_score = RDScoreTrellis(lambda, 0, delta_error);
      }
      // Inspect all possible non-dead predecessors. Retain only the best one.
      for (p = -MIN_DELTA; p <= MAX_DELTA; ++p) {
        const Node* const prev = &NODE(n - 1, p);
-        const uint16_t* const tcost = prev->costs;
+        if (prev->score < MAX_COST) {   // skip dead node
-        const score_t total_error = prev->error - delta_error;
+          // Base cost of both terminal / non-terminal hypothesis
-        score_t cost, score;
+          const score_t cost = VP8LevelCost(prev->costs, level);
-
+          // Examine node assuming it's a non-terminal one.
-        if (prev->cost >= MAX_COST) {   // dead node?
+          score_t score =
-          continue;
+              base_score + prev->score + RDScoreTrellis(lambda, cost, 0);
-        }
+          if (score < cur->score) {
-
+            cur->score  = score;
-        // Base cost of both terminal/non-terminal
+            cur->prev  = p;
        cost = prev->cost + VP8LevelCost(tcost, level);
        // Examine node assuming it's a non-terminal one.
        score = RDScoreTrellis(lambda, cost, total_error);
        if (score < cur_score) {
          cur_score = score;
          cur->cost  = cost;
          cur->error = total_error;
          cur->prev  = p;
        }
        // Now, record best terminal node (and thus best entry in the graph).
        if (level) {
          score = RDScoreTrellis(lambda, cost + last_pos_cost, total_error);
          if (score < best_score) {
            best_score = score;
            best_path[0] = n;   // best eob position
            best_path[1] = m;   // best level
            best_path[2] = p;   // best predecessor
          }
        }
      }
      // Now, record best terminal node (and thus best entry in the graph).
      if (cur->level != 0) {
        const score_t last_pos_score =
            RDScoreTrellis(lambda, last_pos_cost, 0);
        const score_t score = cur->score + last_pos_score;
        if (score < best_score) {
          best_score = score;
          best_path[0] = n;                     // best eob position
          best_path[1] = cur->level - level0;   // best node index ('m')
          best_path[2] = cur->prev;             // best predecessor
        }
      }
    }
  }