decoding speed-up (~1%)

- precompute the filtering strengths once and for all at the beginning
  instead of per-macroblock
- reduce size of VP8MB struct from 8 bytes to 4.
- remove VP8StoreBlock() accordingly: its row-cache copy moves into
  VP8ReconstructBlock() and its filter-info store into VP8DecodeMB()

Change-Id: Icf3d329473e21c464770be3d72a04c9ee4c321f2
This commit is contained in:
Pascal Massimino 2012-12-14 10:22:54 -08:00
parent bcec339b01
commit 0f57dcc31f
4 changed files with 68 additions and 69 deletions

View File

@ -97,53 +97,50 @@ static void FilterRow(const VP8Decoder* const dec) {
} }
//------------------------------------------------------------------------------ //------------------------------------------------------------------------------
// Precompute the filtering strength for each segment and each i4x4/i16x16 mode.
void VP8StoreBlock(VP8Decoder* const dec) { static void PrecomputeFilterStrengths(VP8Decoder* const dec) {
if (dec->filter_type_ > 0) { if (dec->filter_type_ > 0) {
VP8FInfo* const info = dec->f_info_ + dec->mb_x_; int s;
const int skip = dec->mb_info_[dec->mb_x_].skip_; const VP8FilterHeader* const hdr = &dec->filter_hdr_;
int level = dec->filter_levels_[dec->segment_]; for (s = 0; s < NUM_MB_SEGMENTS; ++s) {
if (dec->filter_hdr_.use_lf_delta_) { int i4x4;
// TODO(skal): only CURRENT is handled for now. // First, compute the initial level
level += dec->filter_hdr_.ref_lf_delta_[0]; int base_level;
if (dec->is_i4x4_) { if (dec->segment_hdr_.use_segment_) {
level += dec->filter_hdr_.mode_lf_delta_[0]; base_level = dec->segment_hdr_.filter_strength_[s];
} if (!dec->segment_hdr_.absolute_delta_) {
} base_level += hdr->level_;
level = (level < 0) ? 0 : (level > 63) ? 63 : level; }
info->f_level_ = level;
if (dec->filter_hdr_.sharpness_ > 0) {
if (dec->filter_hdr_.sharpness_ > 4) {
level >>= 2;
} else { } else {
level >>= 1; base_level = hdr->level_;
} }
if (level > 9 - dec->filter_hdr_.sharpness_) { for (i4x4 = 0; i4x4 <= 1; ++i4x4) {
level = 9 - dec->filter_hdr_.sharpness_; VP8FInfo* const info = &dec->fstrengths_[s][i4x4];
} int level = base_level;
} if (hdr->use_lf_delta_) {
// TODO(skal): only CURRENT is handled for now.
level += hdr->ref_lf_delta_[0];
if (i4x4) {
level += hdr->mode_lf_delta_[0];
}
}
level = (level < 0) ? 0 : (level > 63) ? 63 : level;
info->f_level_ = level;
info->f_ilevel_ = (level < 1) ? 1 : level; if (hdr->sharpness_ > 0) {
info->f_inner_ = (!skip || dec->is_i4x4_); if (hdr->sharpness_ > 4) {
} level >>= 2;
{ } else {
// Transfer samples to row cache level >>= 1;
int y; }
const int y_offset = dec->cache_id_ * 16 * dec->cache_y_stride_; if (level > 9 - hdr->sharpness_) {
const int uv_offset = dec->cache_id_ * 8 * dec->cache_uv_stride_; level = 9 - hdr->sharpness_;
uint8_t* const ydst = dec->cache_y_ + dec->mb_x_ * 16 + y_offset; }
uint8_t* const udst = dec->cache_u_ + dec->mb_x_ * 8 + uv_offset; }
uint8_t* const vdst = dec->cache_v_ + dec->mb_x_ * 8 + uv_offset; info->f_ilevel_ = (level < 1) ? 1 : level;
for (y = 0; y < 16; ++y) { info->f_inner_ = 0;
memcpy(ydst + y * dec->cache_y_stride_, }
dec->yuv_b_ + Y_OFF + y * BPS, 16);
}
for (y = 0; y < 8; ++y) {
memcpy(udst + y * dec->cache_uv_stride_,
dec->yuv_b_ + U_OFF + y * BPS, 8);
memcpy(vdst + y * dec->cache_uv_stride_,
dec->yuv_b_ + V_OFF + y * BPS, 8);
} }
} }
} }
@ -339,6 +336,7 @@ VP8StatusCode VP8EnterCritical(VP8Decoder* const dec, VP8Io* const io) {
dec->br_mb_y_ = dec->mb_h_; dec->br_mb_y_ = dec->mb_h_;
} }
} }
PrecomputeFilterStrengths(dec);
return VP8_STATUS_OK; return VP8_STATUS_OK;
} }
@ -551,6 +549,7 @@ static WEBP_INLINE void Copy32b(uint8_t* dst, uint8_t* src) {
} }
void VP8ReconstructBlock(VP8Decoder* const dec) { void VP8ReconstructBlock(VP8Decoder* const dec) {
int j;
uint8_t* const y_dst = dec->yuv_b_ + Y_OFF; uint8_t* const y_dst = dec->yuv_b_ + Y_OFF;
uint8_t* const u_dst = dec->yuv_b_ + U_OFF; uint8_t* const u_dst = dec->yuv_b_ + U_OFF;
uint8_t* const v_dst = dec->yuv_b_ + V_OFF; uint8_t* const v_dst = dec->yuv_b_ + V_OFF;
@ -558,7 +557,6 @@ void VP8ReconstructBlock(VP8Decoder* const dec) {
// Rotate in the left samples from previously decoded block. We move four // Rotate in the left samples from previously decoded block. We move four
// pixels at a time for alignment reason, and because of in-loop filter. // pixels at a time for alignment reason, and because of in-loop filter.
if (dec->mb_x_ > 0) { if (dec->mb_x_ > 0) {
int j;
for (j = -1; j < 16; ++j) { for (j = -1; j < 16; ++j) {
Copy32b(&y_dst[j * BPS - 4], &y_dst[j * BPS + 12]); Copy32b(&y_dst[j * BPS - 4], &y_dst[j * BPS + 12]);
} }
@ -567,7 +565,6 @@ void VP8ReconstructBlock(VP8Decoder* const dec) {
Copy32b(&v_dst[j * BPS - 4], &v_dst[j * BPS + 4]); Copy32b(&v_dst[j * BPS - 4], &v_dst[j * BPS + 4]);
} }
} else { } else {
int j;
for (j = 0; j < 16; ++j) { for (j = 0; j < 16; ++j) {
y_dst[j * BPS - 1] = 129; y_dst[j * BPS - 1] = 129;
} }
@ -670,6 +667,21 @@ void VP8ReconstructBlock(VP8Decoder* const dec) {
} }
} }
} }
// Transfer reconstructed samples from yuv_b_ cache to final destination.
{
const int y_offset = dec->cache_id_ * 16 * dec->cache_y_stride_;
const int uv_offset = dec->cache_id_ * 8 * dec->cache_uv_stride_;
uint8_t* const y_out = dec->cache_y_ + dec->mb_x_ * 16 + y_offset;
uint8_t* const u_out = dec->cache_u_ + dec->mb_x_ * 8 + uv_offset;
uint8_t* const v_out = dec->cache_v_ + dec->mb_x_ * 8 + uv_offset;
for (j = 0; j < 16; ++j) {
memcpy(y_out + j * dec->cache_y_stride_, y_dst + j * BPS, 16);
}
for (j = 0; j < 8; ++j) {
memcpy(u_out + j * dec->cache_uv_stride_, u_dst + j * BPS, 8);
memcpy(v_out + j * dec->cache_uv_stride_, v_dst + j * BPS, 8);
}
}
} }
//------------------------------------------------------------------------------ //------------------------------------------------------------------------------

View File

@ -425,9 +425,8 @@ static VP8StatusCode DecodeRemaining(WebPIDecoder* const idec) {
} }
return VP8_STATUS_SUSPENDED; return VP8_STATUS_SUSPENDED;
} }
// Reconstruct and emit samples.
VP8ReconstructBlock(dec); VP8ReconstructBlock(dec);
// Store data and save block's filtering params
VP8StoreBlock(dec);
// Release buffer only if there is only one partition // Release buffer only if there is only one partition
if (dec->num_parts_ == 1) { if (dec->num_parts_ == 1) {

View File

@ -236,20 +236,6 @@ static int ParseFilterHeader(VP8BitReader* br, VP8Decoder* const dec) {
} }
} }
dec->filter_type_ = (hdr->level_ == 0) ? 0 : hdr->simple_ ? 1 : 2; dec->filter_type_ = (hdr->level_ == 0) ? 0 : hdr->simple_ ? 1 : 2;
if (dec->filter_type_ > 0) { // precompute filter levels per segment
if (dec->segment_hdr_.use_segment_) {
int s;
for (s = 0; s < NUM_MB_SEGMENTS; ++s) {
int strength = dec->segment_hdr_.filter_strength_[s];
if (!dec->segment_hdr_.absolute_delta_) {
strength += hdr->level_;
}
dec->filter_levels_[s] = strength;
}
} else {
dec->filter_levels_[0] = hdr->level_;
}
}
return !br->eof_; return !br->eof_;
} }
@ -675,6 +661,12 @@ int VP8DecodeMB(VP8Decoder* const dec, VP8BitReader* const token_br) {
dec->non_zero_ac_ = 0; dec->non_zero_ac_ = 0;
} }
if (dec->filter_type_ > 0) { // store filter info
VP8FInfo* const finfo = dec->f_info_ + dec->mb_x_;
*finfo = dec->fstrengths_[dec->segment_][dec->is_i4x4_];
finfo->f_inner_ = (!info->skip_ || dec->is_i4x4_);
}
return (!token_br->eof_); return (!token_br->eof_);
} }
@ -698,10 +690,8 @@ static int ParseFrame(VP8Decoder* const dec, VP8Io* io) {
return VP8SetError(dec, VP8_STATUS_NOT_ENOUGH_DATA, return VP8SetError(dec, VP8_STATUS_NOT_ENOUGH_DATA,
"Premature end-of-file encountered."); "Premature end-of-file encountered.");
} }
// Reconstruct and emit samples.
VP8ReconstructBlock(dec); VP8ReconstructBlock(dec);
// Store data and save block's filtering params
VP8StoreBlock(dec);
} }
if (!VP8ProcessRow(dec, io)) { if (!VP8ProcessRow(dec, io)) {
return VP8SetError(dec, VP8_STATUS_USER_ABORT, "Output aborted."); return VP8SetError(dec, VP8_STATUS_USER_ABORT, "Output aborted.");

View File

@ -157,7 +157,7 @@ typedef struct { // filter specs
} VP8FInfo; } VP8FInfo;
typedef struct { // used for syntax-parsing typedef struct { // used for syntax-parsing
unsigned int nz_; // non-zero AC/DC coeffs unsigned int nz_:24; // non-zero AC/DC coeffs (24bit)
unsigned int dc_nz_:1; // non-zero DC coeffs unsigned int dc_nz_:1; // non-zero DC coeffs
unsigned int skip_:1; // block type unsigned int skip_:1; // block type
} VP8MB; } VP8MB;
@ -269,9 +269,9 @@ struct VP8Decoder {
uint32_t non_zero_ac_; uint32_t non_zero_ac_;
// Filtering side-info // Filtering side-info
int filter_type_; // 0=off, 1=simple, 2=complex int filter_type_; // 0=off, 1=simple, 2=complex
int filter_row_; // per-row flag int filter_row_; // per-row flag
uint8_t filter_levels_[NUM_MB_SEGMENTS]; // precalculated per-segment VP8FInfo fstrengths_[NUM_MB_SEGMENTS][2]; // precalculated per-segment/type
// extensions // extensions
const uint8_t* alpha_data_; // compressed alpha data (if present) const uint8_t* alpha_data_; // compressed alpha data (if present)
@ -312,8 +312,6 @@ VP8StatusCode VP8EnterCritical(VP8Decoder* const dec, VP8Io* const io);
int VP8ExitCritical(VP8Decoder* const dec, VP8Io* const io); int VP8ExitCritical(VP8Decoder* const dec, VP8Io* const io);
// Process the last decoded row (filtering + output) // Process the last decoded row (filtering + output)
int VP8ProcessRow(VP8Decoder* const dec, VP8Io* const io); int VP8ProcessRow(VP8Decoder* const dec, VP8Io* const io);
// Store a block, along with filtering params
void VP8StoreBlock(VP8Decoder* const dec);
// To be called at the start of a new scanline, to initialize predictors. // To be called at the start of a new scanline, to initialize predictors.
void VP8InitScanline(VP8Decoder* const dec); void VP8InitScanline(VP8Decoder* const dec);
// Decode one macroblock. Returns false if there is not enough data. // Decode one macroblock. Returns false if there is not enough data.