mirror of
https://github.com/webmproject/libwebp.git
synced 2024-12-30 15:28:23 +01:00
2-5% faster trellis with clang/MacOS
(and ~2-3% on ARM)
We don't need to store the cost/score for each node, only for the current and previous ones. This simplifies the code and saves some memory.
Also made the 'Node' structure tighter.
Change-Id: Ie3ad7d3b678992b396242f56e2ac387fe43852e6
This commit is contained in:
parent
80e218d43a
commit
d1b33ad58b
@ -515,15 +515,18 @@ static void AddScore(VP8ModeScore* const dst, const VP8ModeScore* const src) {
|
|||||||
//------------------------------------------------------------------------------
|
//------------------------------------------------------------------------------
|
||||||
// Performs trellis-optimized quantization.
|
// Performs trellis-optimized quantization.
|
||||||
|
|
||||||
// Trellis
|
// Trellis node
|
||||||
|
typedef struct {
|
||||||
|
int8_t prev; // best previous node
|
||||||
|
int8_t sign; // sign of coeff_i
|
||||||
|
int16_t level; // level
|
||||||
|
} Node;
|
||||||
|
|
||||||
|
// Score state
|
||||||
typedef struct {
|
typedef struct {
|
||||||
int prev; // best previous node
|
|
||||||
int level; // level
|
|
||||||
int sign; // sign of coeff_i
|
|
||||||
score_t score; // partial RD score
|
score_t score; // partial RD score
|
||||||
const uint16_t* costs; // shortcut to cost tables
|
const uint16_t* costs; // shortcut to cost tables
|
||||||
} Node;
|
} ScoreState;
|
||||||
|
|
||||||
// If a coefficient was quantized to a value Q (using a neutral bias),
|
// If a coefficient was quantized to a value Q (using a neutral bias),
|
||||||
// we test all alternate possibilities between [Q-MIN_DELTA, Q+MAX_DELTA]
|
// we test all alternate possibilities between [Q-MIN_DELTA, Q+MAX_DELTA]
|
||||||
@ -531,7 +534,8 @@ typedef struct {
|
|||||||
#define MIN_DELTA 0 // how much lower level to try
|
#define MIN_DELTA 0 // how much lower level to try
|
||||||
#define MAX_DELTA 1 // how much higher
|
#define MAX_DELTA 1 // how much higher
|
||||||
#define NUM_NODES (MIN_DELTA + 1 + MAX_DELTA)
|
#define NUM_NODES (MIN_DELTA + 1 + MAX_DELTA)
|
||||||
#define NODE(n, l) (nodes[(n) + 1][(l) + MIN_DELTA])
|
#define NODE(n, l) (nodes[(n)][(l) + MIN_DELTA])
|
||||||
|
#define SCORE_STATE(n, l) (score_states[n][(l) + MIN_DELTA])
|
||||||
|
|
||||||
static WEBP_INLINE void SetRDScore(int lambda, VP8ModeScore* const rd) {
|
static WEBP_INLINE void SetRDScore(int lambda, VP8ModeScore* const rd) {
|
||||||
// TODO: incorporate the "* 256" in the tables?
|
// TODO: incorporate the "* 256" in the tables?
|
||||||
@ -551,7 +555,10 @@ static int TrellisQuantizeBlock(const VP8Encoder* const enc,
|
|||||||
const ProbaArray* const probas = enc->proba_.coeffs_[coeff_type];
|
const ProbaArray* const probas = enc->proba_.coeffs_[coeff_type];
|
||||||
const CostArray* const costs = enc->proba_.level_cost_[coeff_type];
|
const CostArray* const costs = enc->proba_.level_cost_[coeff_type];
|
||||||
const int first = (coeff_type == 0) ? 1 : 0;
|
const int first = (coeff_type == 0) ? 1 : 0;
|
||||||
Node nodes[17][NUM_NODES];
|
Node nodes[16][NUM_NODES];
|
||||||
|
ScoreState score_states[2][NUM_NODES];
|
||||||
|
ScoreState* ss_cur = &SCORE_STATE(0, MIN_DELTA);
|
||||||
|
ScoreState* ss_prev = &SCORE_STATE(1, MIN_DELTA);
|
||||||
int best_path[3] = {-1, -1, -1}; // store best-last/best-level/best-previous
|
int best_path[3] = {-1, -1, -1}; // store best-last/best-level/best-previous
|
||||||
score_t best_score;
|
score_t best_score;
|
||||||
int n, m, p, last;
|
int n, m, p, last;
|
||||||
@ -580,11 +587,10 @@ static int TrellisQuantizeBlock(const VP8Encoder* const enc,
|
|||||||
best_score = RDScoreTrellis(lambda, cost, 0);
|
best_score = RDScoreTrellis(lambda, cost, 0);
|
||||||
|
|
||||||
// initialize source node.
|
// initialize source node.
|
||||||
n = first - 1;
|
|
||||||
for (m = -MIN_DELTA; m <= MAX_DELTA; ++m) {
|
for (m = -MIN_DELTA; m <= MAX_DELTA; ++m) {
|
||||||
const score_t rate = (ctx0 == 0) ? VP8BitCost(1, last_proba) : 0;
|
const score_t rate = (ctx0 == 0) ? VP8BitCost(1, last_proba) : 0;
|
||||||
NODE(n, m).score = RDScoreTrellis(lambda, rate, 0);
|
ss_cur[m].score = RDScoreTrellis(lambda, rate, 0);
|
||||||
NODE(n, m).costs = costs[VP8EncBands[first]][ctx0];
|
ss_cur[m].costs = costs[VP8EncBands[first]][ctx0];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -601,25 +607,34 @@ static int TrellisQuantizeBlock(const VP8Encoder* const enc,
|
|||||||
int level0 = QUANTDIV(coeff0, iQ, B);
|
int level0 = QUANTDIV(coeff0, iQ, B);
|
||||||
if (level0 > MAX_LEVEL) level0 = MAX_LEVEL;
|
if (level0 > MAX_LEVEL) level0 = MAX_LEVEL;
|
||||||
|
|
||||||
|
{ // Swap current and previous score states
|
||||||
|
ScoreState* const tmp = ss_cur;
|
||||||
|
ss_cur = ss_prev;
|
||||||
|
ss_prev = tmp;
|
||||||
|
}
|
||||||
|
|
||||||
// test all alternate level values around level0.
|
// test all alternate level values around level0.
|
||||||
for (m = -MIN_DELTA; m <= MAX_DELTA; ++m) {
|
for (m = -MIN_DELTA; m <= MAX_DELTA; ++m) {
|
||||||
Node* const cur = &NODE(n, m);
|
Node* const cur = &NODE(n, m);
|
||||||
int level = level0 + m;
|
int level = level0 + m;
|
||||||
const int ctx = (level > 2) ? 2 : level;
|
const int ctx = (level > 2) ? 2 : level;
|
||||||
const int band = VP8EncBands[n + 1];
|
const int band = VP8EncBands[n + 1];
|
||||||
score_t base_score, last_pos_cost;
|
score_t base_score, last_pos_score;
|
||||||
|
score_t best_cur_score = MAX_COST;
|
||||||
|
int best_prev = 0; // default, in case
|
||||||
|
|
||||||
cur->score = MAX_COST;
|
ss_cur[m].score = MAX_COST;
|
||||||
|
ss_cur[m].costs = costs[band][ctx];
|
||||||
if (level > MAX_LEVEL || level < 0) { // node is dead?
|
if (level > MAX_LEVEL || level < 0) { // node is dead?
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
cur->sign = sign;
|
|
||||||
cur->level = level;
|
|
||||||
cur->costs = costs[band][ctx];
|
|
||||||
cur->prev = 0; // default, in case
|
|
||||||
|
|
||||||
// Compute extra rate cost if last coeff's position is < 15
|
// Compute extra rate cost if last coeff's position is < 15
|
||||||
last_pos_cost = (n < 15) ? VP8BitCost(0, probas[band][ctx][0]) : 0;
|
{
|
||||||
|
const score_t last_pos_cost =
|
||||||
|
(n < 15) ? VP8BitCost(0, probas[band][ctx][0]) : 0;
|
||||||
|
last_pos_score = RDScoreTrellis(lambda, last_pos_cost, 0);
|
||||||
|
}
|
||||||
|
|
||||||
{
|
{
|
||||||
// Compute delta_error = how much coding this level will
|
// Compute delta_error = how much coding this level will
|
||||||
@ -633,29 +648,31 @@ static int TrellisQuantizeBlock(const VP8Encoder* const enc,
|
|||||||
|
|
||||||
// Inspect all possible non-dead predecessors. Retain only the best one.
|
// Inspect all possible non-dead predecessors. Retain only the best one.
|
||||||
for (p = -MIN_DELTA; p <= MAX_DELTA; ++p) {
|
for (p = -MIN_DELTA; p <= MAX_DELTA; ++p) {
|
||||||
const Node* const prev = &NODE(n - 1, p);
|
// Dead nodes (with ss_prev[p].score >= MAX_COST) are automatically
|
||||||
if (prev->score < MAX_COST) { // skip dead node
|
// eliminated since their score can't be better than the current best.
|
||||||
// Base cost of both terminal / non-terminal hypothesis
|
const score_t cost = VP8LevelCost(ss_prev[p].costs, level);
|
||||||
const score_t cost = VP8LevelCost(prev->costs, level);
|
// Examine node assuming it's a non-terminal one.
|
||||||
// Examine node assuming it's a non-terminal one.
|
const score_t score =
|
||||||
const score_t score =
|
base_score + ss_prev[p].score + RDScoreTrellis(lambda, cost, 0);
|
||||||
base_score + prev->score + RDScoreTrellis(lambda, cost, 0);
|
if (score < best_cur_score) {
|
||||||
if (score < cur->score) {
|
best_cur_score = score;
|
||||||
cur->score = score;
|
best_prev = p;
|
||||||
cur->prev = p;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
// Store best finding in current node.
|
||||||
|
cur->sign = sign;
|
||||||
|
cur->level = level;
|
||||||
|
cur->prev = best_prev;
|
||||||
|
ss_cur[m].score = best_cur_score;
|
||||||
|
|
||||||
// Now, record best terminal node (and thus best entry in the graph).
|
// Now, record best terminal node (and thus best entry in the graph).
|
||||||
if (cur->level != 0) {
|
if (level != 0) {
|
||||||
const score_t last_pos_score =
|
const score_t score = best_cur_score + last_pos_score;
|
||||||
RDScoreTrellis(lambda, last_pos_cost, 0);
|
|
||||||
const score_t score = cur->score + last_pos_score;
|
|
||||||
if (score < best_score) {
|
if (score < best_score) {
|
||||||
best_score = score;
|
best_score = score;
|
||||||
best_path[0] = n; // best eob position
|
best_path[0] = n; // best eob position
|
||||||
best_path[1] = cur->level - level0; // best node index ('m')
|
best_path[1] = m; // best node index
|
||||||
best_path[2] = cur->prev; // best predecessor
|
best_path[2] = best_prev; // best predecessor
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -160,6 +160,8 @@ extern const int VP8I4ModeOffsets[NUM_BMODES];
|
|||||||
#define I4TMP (6 * 16 * BPS + 8 * BPS + 8)
|
#define I4TMP (6 * 16 * BPS + 8 * BPS + 8)
|
||||||
|
|
||||||
typedef int64_t score_t; // type used for scores, rate, distortion
|
typedef int64_t score_t; // type used for scores, rate, distortion
|
||||||
|
// Note that MAX_COST is not the maximum allowed by sizeof(score_t),
|
||||||
|
// in order to allow overflowing computations.
|
||||||
#define MAX_COST ((score_t)0x7fffffffffffffLL)
|
#define MAX_COST ((score_t)0x7fffffffffffffLL)
|
||||||
|
|
||||||
#define QFIX 17
|
#define QFIX 17
|
||||||
|
Loading…
Reference in New Issue
Block a user