From 238205074817d91c288d13f7843bf2214bd913ac Mon Sep 17 00:00:00 2001 From: Pascal Massimino Date: Thu, 19 Feb 2015 08:44:35 +0100 Subject: [PATCH] 1-2% faster encoding by removing an indirection in GetResidualCost() The MIPS code for cost is not updated yet; that's why I keep Residual::*cost around for now. Should be removed in favor of *costs later. Change-Id: Id1d09a8c37ea8c5b34ad5eb8811d6a3ec6c4d89f --- src/dsp/cost.c | 6 +++--- src/dsp/cost_sse2.c | 6 +++--- src/enc/cost.c | 8 ++++++++ src/enc/cost.h | 7 ++++--- src/enc/quant.c | 7 ++++--- src/enc/vp8enci.h | 3 +++ 6 files changed, 25 insertions(+), 12 deletions(-) diff --git a/src/dsp/cost.c b/src/dsp/cost.c index ebd75013..fe72d26e 100644 --- a/src/dsp/cost.c +++ b/src/dsp/cost.c @@ -323,7 +323,8 @@ static int GetResidualCost(int ctx0, const VP8Residual* const res) { int n = res->first; // should be prob[VP8EncBands[n]], but it's equivalent for n=0 or 1 const int p0 = res->prob[n][ctx0][0]; - const uint16_t* t = res->cost[n][ctx0]; + CostArrayPtr const costs = res->costs; + const uint16_t* t = costs[n][ctx0]; // bit_cost(1, p0) is already incorporated in t[] tables, but only if ctx != 0 // (as required by the syntax). For ctx0 == 0, we need to add it here or it'll // be missing during the loop. @@ -334,10 +335,9 @@ static int GetResidualCost(int ctx0, const VP8Residual* const res) { } for (; n < res->last; ++n) { const int v = abs(res->coeffs[n]); - const int b = VP8EncBands[n + 1]; const int ctx = (v >= 2) ? 
2 : v; cost += VP8LevelCost(t, v); - t = res->cost[b][ctx]; + t = costs[n + 1][ctx]; } // Last coefficient is always non-zero { diff --git a/src/dsp/cost_sse2.c b/src/dsp/cost_sse2.c index 540d6820..624c4e26 100644 --- a/src/dsp/cost_sse2.c +++ b/src/dsp/cost_sse2.c @@ -51,7 +51,8 @@ static int GetResidualCostSSE2(int ctx0, const VP8Residual* const res) { int n = res->first; // should be prob[VP8EncBands[n]], but it's equivalent for n=0 or 1 const int p0 = res->prob[n][ctx0][0]; - const uint16_t* t = res->cost[n][ctx0]; + CostArrayPtr const costs = res->costs; + const uint16_t* t = costs[n][ctx0]; // bit_cost(1, p0) is already incorporated in t[] tables, but only if ctx != 0 // (as required by the syntax). For ctx0 == 0, we need to add it here or it'll // be missing during the loop. @@ -87,9 +88,8 @@ static int GetResidualCostSSE2(int ctx0, const VP8Residual* const res) { const int ctx = ctxs[n]; const int level = levels[n]; const int flevel = abs_levels[n]; // full level - const int b = VP8EncBands[n + 1]; cost += VP8LevelFixedCosts[flevel] + t[level]; // simplified VP8LevelCost() - t = res->cost[b][ctx]; + t = costs[n + 1][ctx]; } // Last coefficient is always non-zero { diff --git a/src/enc/cost.c b/src/enc/cost.c index df2372b0..17dbf67f 100644 --- a/src/enc/cost.c +++ b/src/enc/cost.c @@ -63,6 +63,7 @@ void VP8CalculateLevelCosts(VP8Proba* const proba) { if (!proba->dirty_) return; // nothing to do. for (ctype = 0; ctype < NUM_TYPES; ++ctype) { + int n; for (band = 0; band < NUM_BANDS; ++band) { for (ctx = 0; ctx < NUM_CTX; ++ctx) { const uint8_t* const p = proba->coeffs_[ctype][band][ctx]; @@ -78,6 +79,12 @@ void VP8CalculateLevelCosts(VP8Proba* const proba) { // actually constant. } } + for (n = 0; n < 16; ++n) { // replicate bands. We don't need to sentinel. 
+ for (ctx = 0; ctx < NUM_CTX; ++ctx) { + proba->remapped_costs_[ctype][n][ctx] = + proba->level_cost_[ctype][VP8EncBands[n]][ctx]; + } + } } proba->dirty_ = 0; } @@ -202,6 +209,7 @@ void VP8InitResidual(int first, int coeff_type, res->prob = enc->proba_.coeffs_[coeff_type]; res->stats = enc->proba_.stats_[coeff_type]; res->cost = enc->proba_.level_cost_[coeff_type]; + res->costs = enc->proba_.remapped_costs_[coeff_type]; res->first = first; } diff --git a/src/enc/cost.h b/src/enc/cost.h index 714cc43a..01810637 100644 --- a/src/enc/cost.h +++ b/src/enc/cost.h @@ -31,9 +31,10 @@ struct VP8Residual { const int16_t* coeffs; int coeff_type; - ProbaArray* prob; - StatsArray* stats; - CostArray* cost; + ProbaArray* prob; + StatsArray* stats; + CostArray* cost; // TODO(skal): remove in favor of *costs + CostArrayPtr costs; }; void VP8InitResidual(int first, int coeff_type, diff --git a/src/enc/quant.c b/src/enc/quant.c index d403ddc7..2db605d5 100644 --- a/src/enc/quant.c +++ b/src/enc/quant.c @@ -550,7 +550,8 @@ static int TrellisQuantizeBlock(const VP8Encoder* const enc, const VP8Matrix* const mtx, int lambda) { const ProbaArray* const probas = enc->proba_.coeffs_[coeff_type]; - const CostArray* const costs = enc->proba_.level_cost_[coeff_type]; + CostArrayPtr const costs = + (CostArrayPtr)enc->proba_.remapped_costs_[coeff_type]; const int first = (coeff_type == 0) ? 1 : 0; Node nodes[16][NUM_NODES]; ScoreState score_states[2][NUM_NODES]; @@ -587,7 +588,7 @@ static int TrellisQuantizeBlock(const VP8Encoder* const enc, for (m = -MIN_DELTA; m <= MAX_DELTA; ++m) { const score_t rate = (ctx0 == 0) ? 
VP8BitCost(1, last_proba) : 0; ss_cur[m].score = RDScoreTrellis(lambda, rate, 0); - ss_cur[m].costs = costs[VP8EncBands[first]][ctx0]; + ss_cur[m].costs = costs[first][ctx0]; } } @@ -621,7 +622,7 @@ static int TrellisQuantizeBlock(const VP8Encoder* const enc, int best_prev = 0; // default, in case ss_cur[m].score = MAX_COST; - ss_cur[m].costs = costs[band][ctx]; + ss_cur[m].costs = costs[n + 1][ctx]; if (level > MAX_LEVEL || level < 0) { // node is dead? continue; } diff --git a/src/enc/vp8enci.h b/src/enc/vp8enci.h index 3c733c95..16a4443f 100644 --- a/src/enc/vp8enci.h +++ b/src/enc/vp8enci.h @@ -151,6 +151,8 @@ typedef uint32_t proba_t; // 16b + 16b typedef uint8_t ProbaArray[NUM_CTX][NUM_PROBAS]; typedef proba_t StatsArray[NUM_CTX][NUM_PROBAS]; typedef uint16_t CostArray[NUM_CTX][MAX_VARIABLE_LEVEL + 1]; +typedef const uint16_t* (*CostArrayPtr)[NUM_CTX]; // for easy casting +typedef const uint16_t* CostArrayMap[16][NUM_CTX]; typedef double LFStats[NUM_MB_SEGMENTS][MAX_LF_LEVELS]; // filter stats typedef struct VP8Encoder VP8Encoder; @@ -170,6 +172,7 @@ typedef struct { ProbaArray coeffs_[NUM_TYPES][NUM_BANDS]; // 1056 bytes StatsArray stats_[NUM_TYPES][NUM_BANDS]; // 4224 bytes CostArray level_cost_[NUM_TYPES][NUM_BANDS]; // 13056 bytes + CostArrayMap remapped_costs_[NUM_TYPES]; // 1536 bytes int dirty_; // if true, need to call VP8CalculateLevelCosts() int use_skip_proba_; // Note: we always use skip_proba for now. int nb_skip_; // number of skipped blocks