From 390c8b316dcce1c026061383f8871b4330d49721 Mon Sep 17 00:00:00 2001
From: Pascal Massimino <pascal.massimino@gmail.com>
Date: Wed, 26 Feb 2014 05:52:24 -0800
Subject: [PATCH] lossy encoding: ~3% speed-up

incorporate the 'not-last coefficient' bit cost (ctx > 0 only) in the per-level cost table

also: correct trellis-quant cost evaluation at nodes
(output is a little bit different now). Method 6 is ~4% faster.

Change-Id: Ic48bd6d33f9193838216e7dc3a9f9c5508a1fbe8
---
 src/enc/cost.c  |  5 +++--
 src/enc/frame.c | 13 +++++++------
 src/enc/quant.c | 24 ++++++++++--------------
 3 files changed, 20 insertions(+), 22 deletions(-)

diff --git a/src/enc/cost.c b/src/enc/cost.c
index 09699f80..3155c084 100644
--- a/src/enc/cost.c
+++ b/src/enc/cost.c
@@ -360,9 +360,10 @@ void VP8CalculateLevelCosts(VP8Proba* const proba) {
       for (ctx = 0; ctx < NUM_CTX; ++ctx) {
         const uint8_t* const p = proba->coeffs_[ctype][band][ctx];
         uint16_t* const table = proba->level_cost_[ctype][band][ctx];
-        const int cost_base = VP8BitCost(1, p[1]);
+        const int cost0 = (ctx > 0) ? VP8BitCost(1, p[0]) : 0;
+        const int cost_base = VP8BitCost(1, p[1]) + cost0;
         int v;
-        table[0] = VP8BitCost(0, p[1]);
+        table[0] = VP8BitCost(0, p[1]) + cost0;
         for (v = 1; v <= MAX_VARIABLE_LEVEL; ++v) {
           table[v] = cost_base + VariableLevelCost(v, p);
         }
diff --git a/src/enc/frame.c b/src/enc/frame.c
index 2582244c..12595426 100644
--- a/src/enc/frame.c
+++ b/src/enc/frame.c
@@ -199,8 +199,9 @@ static int RecordCoeffs(int ctx, const VP8Residual* const res) {
         Record((v >= 3 + (8 << 3)), s + 10);
       }
 #else
-      if (v > MAX_VARIABLE_LEVEL)
+      if (v > MAX_VARIABLE_LEVEL) {
         v = MAX_VARIABLE_LEVEL;
+      }
 
       {
         const int bits = VP8LevelCodes[v - 1][1];
@@ -339,22 +340,22 @@ static void SetResidualCoeffs(const int16_t* const coeffs,
 static int GetResidualCost(int ctx0, const VP8Residual* const res) {
   int n = res->first;
   // should be prob[VP8EncBands[n]], but it's equivalent for n=0 or 1
-  int p0 = res->prob[n][ctx0][0];
+  const int p0 = res->prob[n][ctx0][0];
   const uint16_t* t = res->cost[n][ctx0];
-  int cost;
+  // bit_cost(1, p0) is already incorporated in t[] tables, but only if ctx != 0
+  // (as required by the syntax). For ctx0 == 0, we need to add it here or it'll
+  // be missing during the loop.
+  int cost = (ctx0 == 0) ? VP8BitCost(1, p0) : 0;
 
   if (res->last < 0) {
     return VP8BitCost(0, p0);
   }
-  cost = VP8BitCost(1, p0);
   for (; n < res->last; ++n) {
     const int v = abs(res->coeffs[n]);
     const int b = VP8EncBands[n + 1];
     const int ctx = (v >= 2) ? 2 : v;
     cost += VP8LevelCost(t, v);
     t = res->cost[b][ctx];
-    // the masking trick is faster than "if (v) cost += ..." with clang
-    cost += (v ? ~0U : 0) & VP8BitCost(1, res->prob[b][ctx][0]);
   }
   // Last coefficient is always non-zero
   {
diff --git a/src/enc/quant.c b/src/enc/quant.c
index 876d7d29..d20ccde6 100644
--- a/src/enc/quant.c
+++ b/src/enc/quant.c
@@ -548,7 +548,7 @@ static int TrellisQuantizeBlock(const VP8EncIterator* const it,
                                 int ctx0, int coeff_type,
                                 const VP8Matrix* const mtx,
                                 int lambda) {
-  ProbaArray* const last_costs = it->enc_->proba_.coeffs_[coeff_type];
+  ProbaArray* const probas = it->enc_->proba_.coeffs_[coeff_type];
   CostArray* const costs = it->enc_->proba_.level_cost_[coeff_type];
   const int first = (coeff_type == 0) ? 1 : 0;
   Node nodes[17][NUM_NODES];
@@ -562,7 +562,7 @@ static int TrellisQuantizeBlock(const VP8EncIterator* const it,
     score_t cost;
     score_t max_error;
     const int thresh = mtx->q_[1] * mtx->q_[1] / 4;
-    const int last_proba = last_costs[VP8EncBands[first]][ctx0][0];
+    const int last_proba = probas[VP8EncBands[first]][ctx0][0];
 
     // compute maximal distortion.
     max_error = 0;
@@ -583,7 +583,7 @@ static int TrellisQuantizeBlock(const VP8EncIterator* const it,
     // initialize source node.
     n = first - 1;
     for (m = -MIN_DELTA; m <= MAX_DELTA; ++m) {
-      NODE(n, m).cost = 0;
+      NODE(n, m).cost = (ctx0 == 0) ? VP8BitCost(1, last_proba) : 0;
       NODE(n, m).error = max_error;
       NODE(n, m).ctx = ctx0;
     }
@@ -608,7 +608,7 @@ static int TrellisQuantizeBlock(const VP8EncIterator* const it,
       int delta_error, new_error;
       score_t cur_score = MAX_COST;
       int level = level0 + m;
-      int last_proba;
+      int last_pos_cost;   // extra cost if last coeff's position is < 15
 
       cur->sign = sign;
       cur->level = level;
@@ -617,7 +617,9 @@ static int TrellisQuantizeBlock(const VP8EncIterator* const it,
         cur->cost = MAX_COST;
         continue;
       }
-      last_proba = last_costs[VP8EncBands[n + 1]][cur->ctx][0];
+      last_pos_cost =
+          (n < 15) ? VP8BitCost(0, probas[VP8EncBands[n + 1]][cur->ctx][0])
+                   : 0;
 
       // Compute delta_error = how much coding this level will
       // subtract as distortion to max_error
@@ -631,20 +633,16 @@ static int TrellisQuantizeBlock(const VP8EncIterator* const it,
         const int prev_ctx = prev->ctx;
         const uint16_t* const tcost = costs[VP8EncBands[n]][prev_ctx];
         const score_t total_error = prev->error - delta_error;
-        score_t cost, base_cost, score;
+        score_t cost, score;
 
         if (prev->cost >= MAX_COST) {   // dead node?
           continue;
         }
 
         // Base cost of both terminal/non-terminal
-        base_cost = prev->cost + VP8LevelCost(tcost, level);
+        cost = prev->cost + VP8LevelCost(tcost, level);
 
         // Examine node assuming it's a non-terminal one.
-        cost = base_cost;
-        if (level && n < 15) {
-          cost += VP8BitCost(1, last_proba);
-        }
         score = RDScoreTrellis(lambda, cost, total_error);
         if (score < cur_score) {
           cur_score = score;
@@ -655,9 +653,7 @@ static int TrellisQuantizeBlock(const VP8EncIterator* const it,
 
         // Now, record best terminal node (and thus best entry in the graph).
         if (level) {
-          cost = base_cost;
-          if (n < 15) cost += VP8BitCost(0, last_proba);
-          score = RDScoreTrellis(lambda, cost, total_error);
+          score = RDScoreTrellis(lambda, cost + last_pos_cost, total_error);
           if (score < best_score) {
             best_score = score;
             best_path[0] = n;   // best eob position