1-2% faster encoding by removing an indirection in GetResidualCost()

The MIPS code for cost is not updated yet, which is why I keep Residual::*cost
around for now. It should be removed in favor of *costs later.

Change-Id: Id1d09a8c37ea8c5b34ad5eb8811d6a3ec6c4d89f
This commit is contained in:
Pascal Massimino 2015-02-19 08:44:35 +01:00
parent eddb7e70be
commit 2382050748
6 changed files with 25 additions and 12 deletions

View File

@ -323,7 +323,8 @@ static int GetResidualCost(int ctx0, const VP8Residual* const res) {
int n = res->first; int n = res->first;
// should be prob[VP8EncBands[n]], but it's equivalent for n=0 or 1 // should be prob[VP8EncBands[n]], but it's equivalent for n=0 or 1
const int p0 = res->prob[n][ctx0][0]; const int p0 = res->prob[n][ctx0][0];
const uint16_t* t = res->cost[n][ctx0]; CostArrayPtr const costs = res->costs;
const uint16_t* t = costs[n][ctx0];
// bit_cost(1, p0) is already incorporated in t[] tables, but only if ctx != 0 // bit_cost(1, p0) is already incorporated in t[] tables, but only if ctx != 0
// (as required by the syntax). For ctx0 == 0, we need to add it here or it'll // (as required by the syntax). For ctx0 == 0, we need to add it here or it'll
// be missing during the loop. // be missing during the loop.
@ -334,10 +335,9 @@ static int GetResidualCost(int ctx0, const VP8Residual* const res) {
} }
for (; n < res->last; ++n) { for (; n < res->last; ++n) {
const int v = abs(res->coeffs[n]); const int v = abs(res->coeffs[n]);
const int b = VP8EncBands[n + 1];
const int ctx = (v >= 2) ? 2 : v; const int ctx = (v >= 2) ? 2 : v;
cost += VP8LevelCost(t, v); cost += VP8LevelCost(t, v);
t = res->cost[b][ctx]; t = costs[n + 1][ctx];
} }
// Last coefficient is always non-zero // Last coefficient is always non-zero
{ {

View File

@ -51,7 +51,8 @@ static int GetResidualCostSSE2(int ctx0, const VP8Residual* const res) {
int n = res->first; int n = res->first;
// should be prob[VP8EncBands[n]], but it's equivalent for n=0 or 1 // should be prob[VP8EncBands[n]], but it's equivalent for n=0 or 1
const int p0 = res->prob[n][ctx0][0]; const int p0 = res->prob[n][ctx0][0];
const uint16_t* t = res->cost[n][ctx0]; CostArrayPtr const costs = res->costs;
const uint16_t* t = costs[n][ctx0];
// bit_cost(1, p0) is already incorporated in t[] tables, but only if ctx != 0 // bit_cost(1, p0) is already incorporated in t[] tables, but only if ctx != 0
// (as required by the syntax). For ctx0 == 0, we need to add it here or it'll // (as required by the syntax). For ctx0 == 0, we need to add it here or it'll
// be missing during the loop. // be missing during the loop.
@ -87,9 +88,8 @@ static int GetResidualCostSSE2(int ctx0, const VP8Residual* const res) {
const int ctx = ctxs[n]; const int ctx = ctxs[n];
const int level = levels[n]; const int level = levels[n];
const int flevel = abs_levels[n]; // full level const int flevel = abs_levels[n]; // full level
const int b = VP8EncBands[n + 1];
cost += VP8LevelFixedCosts[flevel] + t[level]; // simplified VP8LevelCost() cost += VP8LevelFixedCosts[flevel] + t[level]; // simplified VP8LevelCost()
t = res->cost[b][ctx]; t = costs[n + 1][ctx];
} }
// Last coefficient is always non-zero // Last coefficient is always non-zero
{ {

View File

@ -63,6 +63,7 @@ void VP8CalculateLevelCosts(VP8Proba* const proba) {
if (!proba->dirty_) return; // nothing to do. if (!proba->dirty_) return; // nothing to do.
for (ctype = 0; ctype < NUM_TYPES; ++ctype) { for (ctype = 0; ctype < NUM_TYPES; ++ctype) {
int n;
for (band = 0; band < NUM_BANDS; ++band) { for (band = 0; band < NUM_BANDS; ++band) {
for (ctx = 0; ctx < NUM_CTX; ++ctx) { for (ctx = 0; ctx < NUM_CTX; ++ctx) {
const uint8_t* const p = proba->coeffs_[ctype][band][ctx]; const uint8_t* const p = proba->coeffs_[ctype][band][ctx];
@ -78,6 +79,12 @@ void VP8CalculateLevelCosts(VP8Proba* const proba) {
// actually constant. // actually constant.
} }
} }
for (n = 0; n < 16; ++n) { // replicate bands. We don't need to sentinel.
for (ctx = 0; ctx < NUM_CTX; ++ctx) {
proba->remapped_costs_[ctype][n][ctx] =
proba->level_cost_[ctype][VP8EncBands[n]][ctx];
}
}
} }
proba->dirty_ = 0; proba->dirty_ = 0;
} }
@ -202,6 +209,7 @@ void VP8InitResidual(int first, int coeff_type,
res->prob = enc->proba_.coeffs_[coeff_type]; res->prob = enc->proba_.coeffs_[coeff_type];
res->stats = enc->proba_.stats_[coeff_type]; res->stats = enc->proba_.stats_[coeff_type];
res->cost = enc->proba_.level_cost_[coeff_type]; res->cost = enc->proba_.level_cost_[coeff_type];
res->costs = enc->proba_.remapped_costs_[coeff_type];
res->first = first; res->first = first;
} }

View File

@ -31,9 +31,10 @@ struct VP8Residual {
const int16_t* coeffs; const int16_t* coeffs;
int coeff_type; int coeff_type;
ProbaArray* prob; ProbaArray* prob;
StatsArray* stats; StatsArray* stats;
CostArray* cost; CostArray* cost; // TODO(skal): remove in favor of *costs
CostArrayPtr costs;
}; };
void VP8InitResidual(int first, int coeff_type, void VP8InitResidual(int first, int coeff_type,

View File

@ -550,7 +550,8 @@ static int TrellisQuantizeBlock(const VP8Encoder* const enc,
const VP8Matrix* const mtx, const VP8Matrix* const mtx,
int lambda) { int lambda) {
const ProbaArray* const probas = enc->proba_.coeffs_[coeff_type]; const ProbaArray* const probas = enc->proba_.coeffs_[coeff_type];
const CostArray* const costs = enc->proba_.level_cost_[coeff_type]; CostArrayPtr const costs =
(CostArrayPtr)enc->proba_.remapped_costs_[coeff_type];
const int first = (coeff_type == 0) ? 1 : 0; const int first = (coeff_type == 0) ? 1 : 0;
Node nodes[16][NUM_NODES]; Node nodes[16][NUM_NODES];
ScoreState score_states[2][NUM_NODES]; ScoreState score_states[2][NUM_NODES];
@ -587,7 +588,7 @@ static int TrellisQuantizeBlock(const VP8Encoder* const enc,
for (m = -MIN_DELTA; m <= MAX_DELTA; ++m) { for (m = -MIN_DELTA; m <= MAX_DELTA; ++m) {
const score_t rate = (ctx0 == 0) ? VP8BitCost(1, last_proba) : 0; const score_t rate = (ctx0 == 0) ? VP8BitCost(1, last_proba) : 0;
ss_cur[m].score = RDScoreTrellis(lambda, rate, 0); ss_cur[m].score = RDScoreTrellis(lambda, rate, 0);
ss_cur[m].costs = costs[VP8EncBands[first]][ctx0]; ss_cur[m].costs = costs[first][ctx0];
} }
} }
@ -621,7 +622,7 @@ static int TrellisQuantizeBlock(const VP8Encoder* const enc,
int best_prev = 0; // default, in case int best_prev = 0; // default, in case
ss_cur[m].score = MAX_COST; ss_cur[m].score = MAX_COST;
ss_cur[m].costs = costs[band][ctx]; ss_cur[m].costs = costs[n + 1][ctx];
if (level > MAX_LEVEL || level < 0) { // node is dead? if (level > MAX_LEVEL || level < 0) { // node is dead?
continue; continue;
} }

View File

@ -151,6 +151,8 @@ typedef uint32_t proba_t; // 16b + 16b
typedef uint8_t ProbaArray[NUM_CTX][NUM_PROBAS]; typedef uint8_t ProbaArray[NUM_CTX][NUM_PROBAS];
typedef proba_t StatsArray[NUM_CTX][NUM_PROBAS]; typedef proba_t StatsArray[NUM_CTX][NUM_PROBAS];
typedef uint16_t CostArray[NUM_CTX][MAX_VARIABLE_LEVEL + 1]; typedef uint16_t CostArray[NUM_CTX][MAX_VARIABLE_LEVEL + 1];
typedef const uint16_t* (*CostArrayPtr)[NUM_CTX]; // for easy casting
typedef const uint16_t* CostArrayMap[16][NUM_CTX];
typedef double LFStats[NUM_MB_SEGMENTS][MAX_LF_LEVELS]; // filter stats typedef double LFStats[NUM_MB_SEGMENTS][MAX_LF_LEVELS]; // filter stats
typedef struct VP8Encoder VP8Encoder; typedef struct VP8Encoder VP8Encoder;
@ -170,6 +172,7 @@ typedef struct {
ProbaArray coeffs_[NUM_TYPES][NUM_BANDS]; // 1056 bytes ProbaArray coeffs_[NUM_TYPES][NUM_BANDS]; // 1056 bytes
StatsArray stats_[NUM_TYPES][NUM_BANDS]; // 4224 bytes StatsArray stats_[NUM_TYPES][NUM_BANDS]; // 4224 bytes
CostArray level_cost_[NUM_TYPES][NUM_BANDS]; // 13056 bytes CostArray level_cost_[NUM_TYPES][NUM_BANDS]; // 13056 bytes
CostArrayMap remapped_costs_[NUM_TYPES]; // 1536 bytes
int dirty_; // if true, need to call VP8CalculateLevelCosts() int dirty_; // if true, need to call VP8CalculateLevelCosts()
int use_skip_proba_; // Note: we always use skip_proba for now. int use_skip_proba_; // Note: we always use skip_proba for now.
int nb_skip_; // number of skipped blocks int nb_skip_; // number of skipped blocks