1-2% faster encoding by removing an indirection in GetResidualCost()

The MIPS code for cost is not updated yet, which is why I keep Residual::*cost around for now. It should be removed in favor of *costs later.

Change-Id: Id1d09a8c37ea8c5b34ad5eb8811d6a3ec6c4d89f
2025-12-24 05:56:27 +01:00 · 2015-02-19 08:44:35 +01:00
parent eddb7e70be
commit 2382050748
6 changed files with 25 additions and 12 deletions
--- a/src/dsp/cost.c
+++ b/src/dsp/cost.c
@@ -323,7 +323,8 @@ static int GetResidualCost(int ctx0, const VP8Residual* const res) {
  int n = res->first;
  // should be prob[VP8EncBands[n]], but it's equivalent for n=0 or 1
  const int p0 = res->prob[n][ctx0][0];
-  const uint16_t* t = res->cost[n][ctx0];
+  CostArrayPtr const costs = res->costs;
  const uint16_t* t = costs[n][ctx0];
  // bit_cost(1, p0) is already incorporated in t[] tables, but only if ctx != 0
  // (as required by the syntax). For ctx0 == 0, we need to add it here or it'll
  // be missing during the loop.
@@ -334,10 +335,9 @@ static int GetResidualCost(int ctx0, const VP8Residual* const res) {
  }
  for (; n < res->last; ++n) {
    const int v = abs(res->coeffs[n]);
    const int b = VP8EncBands[n + 1];
    const int ctx = (v >= 2) ? 2 : v;
    cost += VP8LevelCost(t, v);
-    t = res->cost[b][ctx];
+    t = costs[n + 1][ctx];
  }
  // Last coefficient is always non-zero
  {
--- a/src/dsp/cost_sse2.c
+++ b/src/dsp/cost_sse2.c
@@ -51,7 +51,8 @@ static int GetResidualCostSSE2(int ctx0, const VP8Residual* const res) {
  int n = res->first;
  // should be prob[VP8EncBands[n]], but it's equivalent for n=0 or 1
  const int p0 = res->prob[n][ctx0][0];
-  const uint16_t* t = res->cost[n][ctx0];
+  CostArrayPtr const costs = res->costs;
  const uint16_t* t = costs[n][ctx0];
  // bit_cost(1, p0) is already incorporated in t[] tables, but only if ctx != 0
  // (as required by the syntax). For ctx0 == 0, we need to add it here or it'll
  // be missing during the loop.
@@ -87,9 +88,8 @@ static int GetResidualCostSSE2(int ctx0, const VP8Residual* const res) {
    const int ctx = ctxs[n];
    const int level = levels[n];
    const int flevel = abs_levels[n];   // full level
    const int b = VP8EncBands[n + 1];
    cost += VP8LevelFixedCosts[flevel] + t[level];  // simplified VP8LevelCost()
-    t = res->cost[b][ctx];
+    t = costs[n + 1][ctx];
  }
  // Last coefficient is always non-zero
  {
--- a/src/enc/cost.c
+++ b/src/enc/cost.c
@@ -63,6 +63,7 @@ void VP8CalculateLevelCosts(VP8Proba* const proba) {
  if (!proba->dirty_) return;  // nothing to do.
  for (ctype = 0; ctype < NUM_TYPES; ++ctype) {
    int n;
    for (band = 0; band < NUM_BANDS; ++band) {
      for (ctx = 0; ctx < NUM_CTX; ++ctx) {
        const uint8_t* const p = proba->coeffs_[ctype][band][ctx];
@@ -78,6 +79,12 @@ void VP8CalculateLevelCosts(VP8Proba* const proba) {
        // actually constant.
      }
    }
    for (n = 0; n < 16; ++n) {    // replicate bands. We don't need to sentinel.
      for (ctx = 0; ctx < NUM_CTX; ++ctx) {
        proba->remapped_costs_[ctype][n][ctx] =
            proba->level_cost_[ctype][VP8EncBands[n]][ctx];
      }
    }
  }
  proba->dirty_ = 0;
 }
@@ -202,6 +209,7 @@ void VP8InitResidual(int first, int coeff_type,
  res->prob  = enc->proba_.coeffs_[coeff_type];
  res->stats = enc->proba_.stats_[coeff_type];
  res->cost  = enc->proba_.level_cost_[coeff_type];
  res->costs = enc->proba_.remapped_costs_[coeff_type];
  res->first = first;
 }
--- a/src/enc/cost.h
+++ b/src/enc/cost.h
@@ -31,9 +31,10 @@ struct VP8Residual {
  const int16_t* coeffs;
  int coeff_type;
-  ProbaArray* prob;
+  ProbaArray*   prob;
-  StatsArray* stats;
+  StatsArray*   stats;
-  CostArray*  cost;
+  CostArray*    cost;    // TODO(skal): remove in favor of *costs
  CostArrayPtr  costs;
 };
 void VP8InitResidual(int first, int coeff_type,
--- a/src/enc/quant.c
+++ b/src/enc/quant.c
@@ -550,7 +550,8 @@ static int TrellisQuantizeBlock(const VP8Encoder* const enc,
                                const VP8Matrix* const mtx,
                                int lambda) {
  const ProbaArray* const probas = enc->proba_.coeffs_[coeff_type];
-  const CostArray* const costs = enc->proba_.level_cost_[coeff_type];
+  CostArrayPtr const costs =
      (CostArrayPtr)enc->proba_.remapped_costs_[coeff_type];
  const int first = (coeff_type == 0) ? 1 : 0;
  Node nodes[16][NUM_NODES];
  ScoreState score_states[2][NUM_NODES];
@@ -587,7 +588,7 @@ static int TrellisQuantizeBlock(const VP8Encoder* const enc,
    for (m = -MIN_DELTA; m <= MAX_DELTA; ++m) {
      const score_t rate = (ctx0 == 0) ? VP8BitCost(1, last_proba) : 0;
      ss_cur[m].score = RDScoreTrellis(lambda, rate, 0);
-      ss_cur[m].costs = costs[VP8EncBands[first]][ctx0];
+      ss_cur[m].costs = costs[first][ctx0];
    }
  }
@@ -621,7 +622,7 @@ static int TrellisQuantizeBlock(const VP8Encoder* const enc,
      int best_prev = 0;   // default, in case
      ss_cur[m].score = MAX_COST;
-      ss_cur[m].costs = costs[band][ctx];
+      ss_cur[m].costs = costs[n + 1][ctx];
      if (level > MAX_LEVEL || level < 0) {   // node is dead?
        continue;
      }
--- a/src/enc/vp8enci.h
+++ b/src/enc/vp8enci.h
@@ -151,6 +151,8 @@ typedef uint32_t proba_t;   // 16b + 16b
 typedef uint8_t ProbaArray[NUM_CTX][NUM_PROBAS];
 typedef proba_t StatsArray[NUM_CTX][NUM_PROBAS];
 typedef uint16_t CostArray[NUM_CTX][MAX_VARIABLE_LEVEL + 1];
 typedef const uint16_t* (*CostArrayPtr)[NUM_CTX];   // for easy casting
 typedef const uint16_t* CostArrayMap[16][NUM_CTX];
 typedef double LFStats[NUM_MB_SEGMENTS][MAX_LF_LEVELS];  // filter stats
 typedef struct VP8Encoder VP8Encoder;
@@ -170,6 +172,7 @@ typedef struct {
  ProbaArray coeffs_[NUM_TYPES][NUM_BANDS];      // 1056 bytes
  StatsArray stats_[NUM_TYPES][NUM_BANDS];       // 4224 bytes
  CostArray level_cost_[NUM_TYPES][NUM_BANDS];   // 13056 bytes
  CostArrayMap remapped_costs_[NUM_TYPES];       // 1536 bytes
  int dirty_;               // if true, need to call VP8CalculateLevelCosts()
  int use_skip_proba_;      // Note: we always use skip_proba for now.
  int nb_skip_;             // number of skipped blocks