From 390c8b316dcce1c026061383f8871b4330d49721 Mon Sep 17 00:00:00 2001 From: Pascal Massimino Date: Wed, 26 Feb 2014 05:52:24 -0800 Subject: [PATCH] lossy encoding: ~3% speed-up incorporate non-last cost in per-level cost table also: correct trellis-quant cost evaluation at nodes (output a little bit different now). Method 6 is ~4% faster. Change-Id: Ic48bd6d33f9193838216e7dc3a9f9c5508a1fbe8 --- src/enc/cost.c | 5 +++-- src/enc/frame.c | 13 +++++++------ src/enc/quant.c | 24 ++++++++++-------------- 3 files changed, 20 insertions(+), 22 deletions(-) diff --git a/src/enc/cost.c b/src/enc/cost.c index 09699f80..3155c084 100644 --- a/src/enc/cost.c +++ b/src/enc/cost.c @@ -360,9 +360,10 @@ void VP8CalculateLevelCosts(VP8Proba* const proba) { for (ctx = 0; ctx < NUM_CTX; ++ctx) { const uint8_t* const p = proba->coeffs_[ctype][band][ctx]; uint16_t* const table = proba->level_cost_[ctype][band][ctx]; - const int cost_base = VP8BitCost(1, p[1]); + const int cost0 = (ctx > 0) ? VP8BitCost(1, p[0]) : 0; + const int cost_base = VP8BitCost(1, p[1]) + cost0; int v; - table[0] = VP8BitCost(0, p[1]); + table[0] = VP8BitCost(0, p[1]) + cost0; for (v = 1; v <= MAX_VARIABLE_LEVEL; ++v) { table[v] = cost_base + VariableLevelCost(v, p); } diff --git a/src/enc/frame.c b/src/enc/frame.c index 2582244c..12595426 100644 --- a/src/enc/frame.c +++ b/src/enc/frame.c @@ -199,8 +199,9 @@ static int RecordCoeffs(int ctx, const VP8Residual* const res) { Record((v >= 3 + (8 << 3)), s + 10); } #else - if (v > MAX_VARIABLE_LEVEL) + if (v > MAX_VARIABLE_LEVEL) { v = MAX_VARIABLE_LEVEL; + } { const int bits = VP8LevelCodes[v - 1][1]; @@ -339,22 +340,22 @@ static void SetResidualCoeffs(const int16_t* const coeffs, static int GetResidualCost(int ctx0, const VP8Residual* const res) { int n = res->first; // should be prob[VP8EncBands[n]], but it's equivalent for n=0 or 1 - int p0 = res->prob[n][ctx0][0]; + const int p0 = res->prob[n][ctx0][0]; const uint16_t* t = res->cost[n][ctx0]; - int cost; + // bit_cost(1, p0) is already incorporated in t[] tables, but only if ctx != 0 + // (as required by the syntax). For ctx0 == 0, we need to add it here or it'll + // be missing during the loop. + int cost = (ctx0 == 0) ? VP8BitCost(1, p0) : 0; if (res->last < 0) { return VP8BitCost(0, p0); } - cost = VP8BitCost(1, p0); for (; n < res->last; ++n) { const int v = abs(res->coeffs[n]); const int b = VP8EncBands[n + 1]; const int ctx = (v >= 2) ? 2 : v; cost += VP8LevelCost(t, v); t = res->cost[b][ctx]; - // the masking trick is faster than "if (v) cost += ..." with clang - cost += (v ? ~0U : 0) & VP8BitCost(1, res->prob[b][ctx][0]); } // Last coefficient is always non-zero { diff --git a/src/enc/quant.c b/src/enc/quant.c index 876d7d29..d20ccde6 100644 --- a/src/enc/quant.c +++ b/src/enc/quant.c @@ -548,7 +548,7 @@ static int TrellisQuantizeBlock(const VP8EncIterator* const it, int ctx0, int coeff_type, const VP8Matrix* const mtx, int lambda) { - ProbaArray* const last_costs = it->enc_->proba_.coeffs_[coeff_type]; + ProbaArray* const probas = it->enc_->proba_.coeffs_[coeff_type]; CostArray* const costs = it->enc_->proba_.level_cost_[coeff_type]; const int first = (coeff_type == 0) ? 1 : 0; Node nodes[17][NUM_NODES]; @@ -562,7 +562,7 @@ static int TrellisQuantizeBlock(const VP8EncIterator* const it, score_t cost; score_t max_error; const int thresh = mtx->q_[1] * mtx->q_[1] / 4; - const int last_proba = last_costs[VP8EncBands[first]][ctx0][0]; + const int last_proba = probas[VP8EncBands[first]][ctx0][0]; // compute maximal distortion. max_error = 0; @@ -583,7 +583,7 @@ static int TrellisQuantizeBlock(const VP8EncIterator* const it, // initialize source node. n = first - 1; for (m = -MIN_DELTA; m <= MAX_DELTA; ++m) { - NODE(n, m).cost = 0; + NODE(n, m).cost = (ctx0 == 0) ? VP8BitCost(1, last_proba) : 0; NODE(n, m).error = max_error; NODE(n, m).ctx = ctx0; } @@ -608,7 +608,7 @@ static int TrellisQuantizeBlock(const VP8EncIterator* const it, int delta_error, new_error; score_t cur_score = MAX_COST; int level = level0 + m; - int last_proba; + int last_pos_cost; // extra cost if last coeff's position is < 15 cur->sign = sign; cur->level = level; @@ -617,7 +617,9 @@ static int TrellisQuantizeBlock(const VP8EncIterator* const it, cur->cost = MAX_COST; continue; } - last_proba = last_costs[VP8EncBands[n + 1]][cur->ctx][0]; + last_pos_cost = + (n < 15) ? VP8BitCost(0, probas[VP8EncBands[n + 1]][cur->ctx][0]) + : 0; // Compute delta_error = how much coding this level will // subtract as distortion to max_error @@ -631,20 +633,16 @@ static int TrellisQuantizeBlock(const VP8EncIterator* const it, const int prev_ctx = prev->ctx; const uint16_t* const tcost = costs[VP8EncBands[n]][prev_ctx]; const score_t total_error = prev->error - delta_error; - score_t cost, base_cost, score; + score_t cost, score; if (prev->cost >= MAX_COST) { // dead node? continue; } // Base cost of both terminal/non-terminal - base_cost = prev->cost + VP8LevelCost(tcost, level); + cost = prev->cost + VP8LevelCost(tcost, level); // Examine node assuming it's a non-terminal one. - cost = base_cost; - if (level && n < 15) { - cost += VP8BitCost(1, last_proba); - } score = RDScoreTrellis(lambda, cost, total_error); if (score < cur_score) { cur_score = score; @@ -655,9 +653,7 @@ static int TrellisQuantizeBlock(const VP8EncIterator* const it, // Now, record best terminal node (and thus best entry in the graph). if (level) { - cost = base_cost; - if (n < 15) cost += VP8BitCost(0, last_proba); - score = RDScoreTrellis(lambda, cost, total_error); + score = RDScoreTrellis(lambda, cost + last_pos_cost, total_error); if (score < best_score) { best_score = score; best_path[0] = n; // best eob position