fix -m 2 mode-cost evaluation (causing partition0 overflow)

The mode's bits were not taken into account, which is OK in most cases.
But for super-large images with 'easy' content, their overhead starts
to matter a lot, and we were failing to optimize for these.
Now, these mode bits have their own associated lambda values, limiting
the jerkiness. We also limit (for -m 2 only) the individual number of bits
to a value that will prevent the partition #0 overflow.

Removed the I4_PENALTY constant, which was a rather crude approximation.
Replaced it with a q-dependent expression.

fixes issue #289

Change-Id: I956ae2d2308c339adc4706d52722f0bb61ccf18c
This commit is contained in:
Pascal Massimino 2016-03-11 20:34:45 +01:00
parent 4562e83dc2
commit e88c4ca013
3 changed files with 58 additions and 25 deletions

View File

@ -30,8 +30,6 @@
#define SNS_TO_DQ 0.9 // Scaling constant between the sns value and the QP #define SNS_TO_DQ 0.9 // Scaling constant between the sns value and the QP
// power-law modulation. Must be strictly less than 1. // power-law modulation. Must be strictly less than 1.
#define I4_PENALTY 14000 // Rate-penalty for quick i4/i16 decision
// number of non-zero coeffs below which we consider the block very flat // number of non-zero coeffs below which we consider the block very flat
// (and apply a penalty to complex predictions) // (and apply a penalty to complex predictions)
#define FLATNESS_LIMIT_I16 10 // I16 mode #define FLATNESS_LIMIT_I16 10 // I16 mode
@ -236,6 +234,8 @@ static int ExpandMatrix(VP8Matrix* const m, int type) {
return (sum + 8) >> 4; return (sum + 8) >> 4;
} }
static void CheckLambdaValue(int* const v) { if (*v < 1) *v = 1; }
static void SetupMatrices(VP8Encoder* enc) { static void SetupMatrices(VP8Encoder* enc) {
int i; int i;
const int tlambda_scale = const int tlambda_scale =
@ -245,7 +245,7 @@ static void SetupMatrices(VP8Encoder* enc) {
for (i = 0; i < num_segments; ++i) { for (i = 0; i < num_segments; ++i) {
VP8SegmentInfo* const m = &enc->dqm_[i]; VP8SegmentInfo* const m = &enc->dqm_[i];
const int q = m->quant_; const int q = m->quant_;
int q4, q16, quv; int q_i4, q_i16, q_uv;
m->y1_.q_[0] = kDcTable[clip(q + enc->dq_y1_dc_, 0, 127)]; m->y1_.q_[0] = kDcTable[clip(q + enc->dq_y1_dc_, 0, 127)];
m->y1_.q_[1] = kAcTable[clip(q, 0, 127)]; m->y1_.q_[1] = kAcTable[clip(q, 0, 127)];
@ -255,21 +255,33 @@ static void SetupMatrices(VP8Encoder* enc) {
m->uv_.q_[0] = kDcTable[clip(q + enc->dq_uv_dc_, 0, 117)]; m->uv_.q_[0] = kDcTable[clip(q + enc->dq_uv_dc_, 0, 117)];
m->uv_.q_[1] = kAcTable[clip(q + enc->dq_uv_ac_, 0, 127)]; m->uv_.q_[1] = kAcTable[clip(q + enc->dq_uv_ac_, 0, 127)];
q4 = ExpandMatrix(&m->y1_, 0); q_i4 = ExpandMatrix(&m->y1_, 0);
q16 = ExpandMatrix(&m->y2_, 1); q_i16 = ExpandMatrix(&m->y2_, 1);
quv = ExpandMatrix(&m->uv_, 2); q_uv = ExpandMatrix(&m->uv_, 2);
m->lambda_i4_ = (3 * q4 * q4) >> 7; m->lambda_i4_ = (3 * q_i4 * q_i4) >> 7;
m->lambda_i16_ = (3 * q16 * q16); m->lambda_i16_ = (3 * q_i16 * q_i16);
m->lambda_uv_ = (3 * quv * quv) >> 6; m->lambda_uv_ = (3 * q_uv * q_uv) >> 6;
m->lambda_mode_ = (1 * q4 * q4) >> 7; m->lambda_mode_ = (1 * q_i4 * q_i4) >> 7;
m->lambda_trellis_i4_ = (7 * q4 * q4) >> 3; m->lambda_trellis_i4_ = (7 * q_i4 * q_i4) >> 3;
m->lambda_trellis_i16_ = (q16 * q16) >> 2; m->lambda_trellis_i16_ = (q_i16 * q_i16) >> 2;
m->lambda_trellis_uv_ = (quv *quv) << 1; m->lambda_trellis_uv_ = (q_uv * q_uv) << 1;
m->tlambda_ = (tlambda_scale * q4) >> 5; m->tlambda_ = (tlambda_scale * q_i4) >> 5;
// none of these constants should be < 1
CheckLambdaValue(&m->lambda_i4_);
CheckLambdaValue(&m->lambda_i16_);
CheckLambdaValue(&m->lambda_uv_);
CheckLambdaValue(&m->lambda_mode_);
CheckLambdaValue(&m->lambda_trellis_i4_);
CheckLambdaValue(&m->lambda_trellis_i16_);
CheckLambdaValue(&m->lambda_trellis_uv_);
CheckLambdaValue(&m->tlambda_);
m->min_disto_ = 10 * m->y1_.q_[0]; // quantization-aware min disto m->min_disto_ = 10 * m->y1_.q_[0]; // quantization-aware min disto
m->max_edge_ = 0; m->max_edge_ = 0;
m->i4_penalty_ = 1000 * q_i4 * q_i4;
} }
} }
@ -1127,19 +1139,29 @@ static void RefineUsingDistortion(VP8EncIterator* const it,
int try_both_modes, int refine_uv_mode, int try_both_modes, int refine_uv_mode,
VP8ModeScore* const rd) { VP8ModeScore* const rd) {
score_t best_score = MAX_COST; score_t best_score = MAX_COST;
score_t score_i4 = (score_t)I4_PENALTY;
int16_t tmp_levels[16][16];
uint8_t modes_i4[16];
int nz = 0; int nz = 0;
int mode; int mode;
int is_i16 = try_both_modes || (it->mb_->type_ == 1); int is_i16 = try_both_modes || (it->mb_->type_ == 1);
const VP8SegmentInfo* const dqm = &it->enc_->dqm_[it->mb_->segment_];
// Some empiric constants, of approximate order of magnitude.
const int lambda_d_i16 = 106;
const int lambda_d_i4 = 11;
const int lambda_d_uv = 120;
score_t score_i4 = dqm->i4_penalty_;
score_t i4_bit_sum = 0;
const score_t bit_limit = it->enc_->mb_header_limit_;
if (is_i16) { // First, evaluate Intra16 distortion if (is_i16) { // First, evaluate Intra16 distortion
int best_mode = -1; int best_mode = -1;
const uint8_t* const src = it->yuv_in_ + Y_OFF_ENC; const uint8_t* const src = it->yuv_in_ + Y_OFF_ENC;
for (mode = 0; mode < NUM_PRED_MODES; ++mode) { for (mode = 0; mode < NUM_PRED_MODES; ++mode) {
const uint8_t* const ref = it->yuv_p_ + VP8I16ModeOffsets[mode]; const uint8_t* const ref = it->yuv_p_ + VP8I16ModeOffsets[mode];
const score_t score = VP8SSE16x16(src, ref); const score_t score = VP8SSE16x16(src, ref) * RD_DISTO_MULT
+ VP8FixedCostsI16[mode] * lambda_d_i16;
if (mode > 0 && VP8FixedCostsI16[mode] > bit_limit) {
continue;
}
if (score < best_score) { if (score < best_score) {
best_mode = mode; best_mode = mode;
best_score = score; best_score = score;
@ -1159,25 +1181,28 @@ static void RefineUsingDistortion(VP8EncIterator* const it,
int best_i4_mode = -1; int best_i4_mode = -1;
score_t best_i4_score = MAX_COST; score_t best_i4_score = MAX_COST;
const uint8_t* const src = it->yuv_in_ + Y_OFF_ENC + VP8Scan[it->i4_]; const uint8_t* const src = it->yuv_in_ + Y_OFF_ENC + VP8Scan[it->i4_];
const uint16_t* const mode_costs = GetCostModeI4(it, rd->modes_i4);
VP8MakeIntra4Preds(it); VP8MakeIntra4Preds(it);
for (mode = 0; mode < NUM_BMODES; ++mode) { for (mode = 0; mode < NUM_BMODES; ++mode) {
const uint8_t* const ref = it->yuv_p_ + VP8I4ModeOffsets[mode]; const uint8_t* const ref = it->yuv_p_ + VP8I4ModeOffsets[mode];
const score_t score = VP8SSE4x4(src, ref); const score_t score = VP8SSE4x4(src, ref) * RD_DISTO_MULT
+ mode_costs[mode] * lambda_d_i4;
if (score < best_i4_score) { if (score < best_i4_score) {
best_i4_mode = mode; best_i4_mode = mode;
best_i4_score = score; best_i4_score = score;
} }
} }
modes_i4[it->i4_] = best_i4_mode; i4_bit_sum += mode_costs[best_i4_mode];
rd->modes_i4[it->i4_] = best_i4_mode;
score_i4 += best_i4_score; score_i4 += best_i4_score;
if (score_i4 >= best_score) { if (score_i4 >= best_score || i4_bit_sum > bit_limit) {
// Intra4 won't be better than Intra16. Bail out and pick Intra16. // Intra4 won't be better than Intra16. Bail out and pick Intra16.
is_i16 = 1; is_i16 = 1;
break; break;
} else { // reconstruct partial block inside yuv_out2_ buffer } else { // reconstruct partial block inside yuv_out2_ buffer
uint8_t* const tmp_dst = it->yuv_out2_ + Y_OFF_ENC + VP8Scan[it->i4_]; uint8_t* const tmp_dst = it->yuv_out2_ + Y_OFF_ENC + VP8Scan[it->i4_];
nz |= ReconstructIntra4(it, tmp_levels[it->i4_], nz |= ReconstructIntra4(it, rd->y_ac_levels[it->i4_],
src, tmp_dst, best_i4_mode) << it->i4_; src, tmp_dst, best_i4_mode) << it->i4_;
} }
} while (VP8IteratorRotateI4(it, it->yuv_out2_ + Y_OFF_ENC)); } while (VP8IteratorRotateI4(it, it->yuv_out2_ + Y_OFF_ENC));
@ -1185,8 +1210,7 @@ static void RefineUsingDistortion(VP8EncIterator* const it,
// Final reconstruction, depending on which mode is selected. // Final reconstruction, depending on which mode is selected.
if (!is_i16) { if (!is_i16) {
VP8SetIntra4Mode(it, modes_i4); VP8SetIntra4Mode(it, rd->modes_i4);
memcpy(rd->y_ac_levels, tmp_levels, sizeof(tmp_levels));
SwapOut(it); SwapOut(it);
best_score = score_i4; best_score = score_i4;
} else { } else {
@ -1200,7 +1224,8 @@ static void RefineUsingDistortion(VP8EncIterator* const it,
const uint8_t* const src = it->yuv_in_ + U_OFF_ENC; const uint8_t* const src = it->yuv_in_ + U_OFF_ENC;
for (mode = 0; mode < NUM_PRED_MODES; ++mode) { for (mode = 0; mode < NUM_PRED_MODES; ++mode) {
const uint8_t* const ref = it->yuv_p_ + VP8UVModeOffsets[mode]; const uint8_t* const ref = it->yuv_p_ + VP8UVModeOffsets[mode];
const score_t score = VP8SSE16x8(src, ref); const score_t score = VP8SSE16x8(src, ref) * RD_DISTO_MULT
+ VP8FixedCostsUV[mode] * lambda_d_uv;
if (score < best_uv_score) { if (score < best_uv_score) {
best_mode = mode; best_mode = mode;
best_uv_score = score; best_uv_score = score;

View File

@ -196,6 +196,9 @@ typedef struct {
int lambda_i16_, lambda_i4_, lambda_uv_; int lambda_i16_, lambda_i4_, lambda_uv_;
int lambda_mode_, lambda_trellis_, tlambda_; int lambda_mode_, lambda_trellis_, tlambda_;
int lambda_trellis_i16_, lambda_trellis_i4_, lambda_trellis_uv_; int lambda_trellis_i16_, lambda_trellis_i4_, lambda_trellis_uv_;
// lambda values for distortion-based evaluation
score_t i4_penalty_; // penalty for using Intra4
} VP8SegmentInfo; } VP8SegmentInfo;
// Handy transient struct to accumulate score and info during RD-optimization // Handy transient struct to accumulate score and info during RD-optimization
@ -391,6 +394,7 @@ struct VP8Encoder {
int method_; // 0=fastest, 6=best/slowest. int method_; // 0=fastest, 6=best/slowest.
VP8RDLevel rd_opt_level_; // Deduced from method_. VP8RDLevel rd_opt_level_; // Deduced from method_.
int max_i4_header_bits_; // partition #0 safeness factor int max_i4_header_bits_; // partition #0 safeness factor
int mb_header_limit_; // rough limit for header bits per MB
int thread_level_; // derived from config->thread_level int thread_level_; // derived from config->thread_level
int do_search_; // derived from config->target_XXX int do_search_; // derived from config->target_XXX
int use_tokens_; // if true, use token buffer int use_tokens_; // if true, use token buffer

View File

@ -105,6 +105,10 @@ static void MapConfigToTools(VP8Encoder* const enc) {
256 * 16 * 16 * // upper bound: up to 16bit per 4x4 block 256 * 16 * 16 * // upper bound: up to 16bit per 4x4 block
(limit * limit) / (100 * 100); // ... modulated with a quadratic curve. (limit * limit) / (100 * 100); // ... modulated with a quadratic curve.
// partition0 = 512k max.
enc->mb_header_limit_ =
(score_t)256 * 510 * 8 * 1024 / (enc->mb_w_ * enc->mb_h_);
enc->thread_level_ = config->thread_level; enc->thread_level_ = config->thread_level;
enc->do_search_ = (config->target_size > 0 || config->target_PSNR > 0); enc->do_search_ = (config->target_size > 0 || config->target_PSNR > 0);