diff --git a/src/dec/frame.c b/src/dec/frame.c index b8a56fd0..ae34eb7b 100644 --- a/src/dec/frame.c +++ b/src/dec/frame.c @@ -252,10 +252,13 @@ static int FinishRow(VP8Decoder* const dec, VP8Io* const io) { int VP8ProcessRow(VP8Decoder* const dec, VP8Io* const io) { int ok = 1; VP8ThreadContext* const ctx = &dec->thread_ctx_; + const int filter_row = + (dec->filter_type_ > 0) && + (dec->mb_y_ >= dec->tl_mb_y_) && (dec->mb_y_ <= dec->br_mb_y_); if (!dec->use_threads_) { // ctx->id_ and ctx->f_info_ are already set ctx->mb_y_ = dec->mb_y_; - ctx->filter_row_ = dec->filter_row_; + ctx->filter_row_ = filter_row; ok = FinishRow(dec, io); } else { WebPWorker* const worker = &dec->worker_; @@ -266,8 +269,8 @@ int VP8ProcessRow(VP8Decoder* const dec, VP8Io* const io) { ctx->io_ = *io; ctx->id_ = dec->cache_id_; ctx->mb_y_ = dec->mb_y_; - ctx->filter_row_ = dec->filter_row_; - if (ctx->filter_row_) { // just swap filter info + ctx->filter_row_ = filter_row; + if (filter_row) { // just swap filter info VP8FInfo* const tmp = ctx->f_info_; ctx->f_info_ = dec->f_info_; dec->f_info_ = tmp; @@ -416,7 +419,7 @@ static int AllocateMemory(VP8Decoder* const dec) { mb_w * (dec->use_threads_ ? 2 : 1) * sizeof(VP8FInfo) : 0; const size_t yuv_size = YUV_SIZE * sizeof(*dec->yuv_b_); - const size_t mb_data_size = sizeof(*dec->mb_data_); + const size_t mb_data_size = mb_w * sizeof(*dec->mb_data_); const size_t cache_height = (16 * num_caches + kFilterExtraRows[dec->filter_type_]) * 3 / 2; const size_t cache_size = top_size * cache_height; @@ -491,8 +494,9 @@ static int AllocateMemory(VP8Decoder* const dec) { mem += alpha_size; assert(mem <= (uint8_t*)dec->mem_ + dec->mem_size_); - // note: left-info is initialized once for all. + // note: left/top-info is initialized once for all. memset(dec->mb_info_ - 1, 0, mb_info_size); + VP8InitScanline(dec); // initialize left too. // initialize top memset(dec->intra_t_, B_DC_PRED, intra_pred_mode_size); @@ -572,115 +576,118 @@ static void DoUVTransform(uint32_t bits, const int16_t* const src, } } -void VP8ReconstructBlock(const VP8Decoder* const dec) { +void VP8ReconstructBlocks(const VP8Decoder* const dec, int mb_y) { int j; + int mb_x; uint8_t* const y_dst = dec->yuv_b_ + Y_OFF; uint8_t* const u_dst = dec->yuv_b_ + U_OFF; uint8_t* const v_dst = dec->yuv_b_ + V_OFF; - const VP8MBData* const block = dec->mb_data_; + for (mb_x = 0; mb_x < dec->mb_w_; ++mb_x) { + const VP8MBData* const block = dec->mb_data_ + mb_x; - // Rotate in the left samples from previously decoded block. We move four - // pixels at a time for alignment reason, and because of in-loop filter. - if (dec->mb_x_ > 0) { - for (j = -1; j < 16; ++j) { - Copy32b(&y_dst[j * BPS - 4], &y_dst[j * BPS + 12]); - } - for (j = -1; j < 8; ++j) { - Copy32b(&u_dst[j * BPS - 4], &u_dst[j * BPS + 4]); - Copy32b(&v_dst[j * BPS - 4], &v_dst[j * BPS + 4]); - } - } else { - for (j = 0; j < 16; ++j) { - y_dst[j * BPS - 1] = 129; - } - for (j = 0; j < 8; ++j) { - u_dst[j * BPS - 1] = 129; - v_dst[j * BPS - 1] = 129; - } - // Init top-left sample on left column too - if (dec->mb_y_ > 0) { - y_dst[-1 - BPS] = u_dst[-1 - BPS] = v_dst[-1 - BPS] = 129; - } - } - { - // bring top samples into the cache - VP8TopSamples* const top_yuv = dec->yuv_t_ + dec->mb_x_; - const int16_t* const coeffs = block->coeffs_; - uint32_t bits = block->non_zero_y_; - int n; - - if (dec->mb_y_ > 0) { - memcpy(y_dst - BPS, top_yuv[0].y, 16); - memcpy(u_dst - BPS, top_yuv[0].u, 8); - memcpy(v_dst - BPS, top_yuv[0].v, 8); - } else if (dec->mb_x_ == 0) { - // we only need to do this init once at block (0,0). - // Afterward, it remains valid for the whole topmost row. - memset(y_dst - BPS - 1, 127, 16 + 4 + 1); - memset(u_dst - BPS - 1, 127, 8 + 1); - memset(v_dst - BPS - 1, 127, 8 + 1); - } - - // predict and add residuals - if (block->is_i4x4_) { // 4x4 - uint32_t* const top_right = (uint32_t*)(y_dst - BPS + 16); - - if (dec->mb_y_ > 0) { - if (dec->mb_x_ >= dec->mb_w_ - 1) { // on rightmost border - memset(top_right, top_yuv[0].y[15], sizeof(*top_right)); - } else { - memcpy(top_right, top_yuv[1].y, sizeof(*top_right)); - } + // Rotate in the left samples from previously decoded block. We move four + // pixels at a time for alignment reason, and because of in-loop filter. + if (mb_x > 0) { + for (j = -1; j < 16; ++j) { + Copy32b(&y_dst[j * BPS - 4], &y_dst[j * BPS + 12]); } - // replicate the top-right pixels below - top_right[BPS] = top_right[2 * BPS] = top_right[3 * BPS] = top_right[0]; - - // predict and add residuals for all 4x4 blocks in turn. - for (n = 0; n < 16; ++n, bits <<= 2) { - uint8_t* const dst = y_dst + kScan[n]; - VP8PredLuma4[block->imodes_[n]](dst); - DoTransform(bits, coeffs + n * 16, dst); + for (j = -1; j < 8; ++j) { + Copy32b(&u_dst[j * BPS - 4], &u_dst[j * BPS + 4]); + Copy32b(&v_dst[j * BPS - 4], &v_dst[j * BPS + 4]); } - } else { // 16x16 - const int pred_func = CheckMode(dec->mb_x_, dec->mb_y_, - block->imodes_[0]); - VP8PredLuma16[pred_func](y_dst); - if (bits != 0) { - for (n = 0; n < 16; ++n, bits <<= 2) { - DoTransform(bits, coeffs + n * 16, y_dst + kScan[n]); - } + } else { + for (j = 0; j < 16; ++j) { + y_dst[j * BPS - 1] = 129; + } + for (j = 0; j < 8; ++j) { + u_dst[j * BPS - 1] = 129; + v_dst[j * BPS - 1] = 129; + } + // Init top-left sample on left column too + if (mb_y > 0) { + y_dst[-1 - BPS] = u_dst[-1 - BPS] = v_dst[-1 - BPS] = 129; } } { - // Chroma - const uint32_t bits_uv = block->non_zero_uv_; - const int pred_func = CheckMode(dec->mb_x_, dec->mb_y_, block->uvmode_); - VP8PredChroma8[pred_func](u_dst); - VP8PredChroma8[pred_func](v_dst); - DoUVTransform(bits_uv >> 0, coeffs + 16 * 16, u_dst); - DoUVTransform(bits_uv >> 8, coeffs + 20 * 16, v_dst); - } + // bring top samples into the cache + VP8TopSamples* const top_yuv = dec->yuv_t_ + mb_x; + const int16_t* const coeffs = block->coeffs_; + uint32_t bits = block->non_zero_y_; + int n; - // stash away top samples for next block - if (dec->mb_y_ < dec->mb_h_ - 1) { - memcpy(top_yuv[0].y, y_dst + 15 * BPS, 16); - memcpy(top_yuv[0].u, u_dst + 7 * BPS, 8); - memcpy(top_yuv[0].v, v_dst + 7 * BPS, 8); + if (mb_y > 0) { + memcpy(y_dst - BPS, top_yuv[0].y, 16); + memcpy(u_dst - BPS, top_yuv[0].u, 8); + memcpy(v_dst - BPS, top_yuv[0].v, 8); + } else if (mb_x == 0) { + // we only need to do this init once at block (0,0). + // Afterward, it remains valid for the whole topmost row. + memset(y_dst - BPS - 1, 127, 16 + 4 + 1); + memset(u_dst - BPS - 1, 127, 8 + 1); + memset(v_dst - BPS - 1, 127, 8 + 1); + } + + // predict and add residuals + if (block->is_i4x4_) { // 4x4 + uint32_t* const top_right = (uint32_t*)(y_dst - BPS + 16); + + if (mb_y > 0) { + if (mb_x >= dec->mb_w_ - 1) { // on rightmost border + memset(top_right, top_yuv[0].y[15], sizeof(*top_right)); + } else { + memcpy(top_right, top_yuv[1].y, sizeof(*top_right)); + } + } + // replicate the top-right pixels below + top_right[BPS] = top_right[2 * BPS] = top_right[3 * BPS] = top_right[0]; + + // predict and add residuals for all 4x4 blocks in turn. + for (n = 0; n < 16; ++n, bits <<= 2) { + uint8_t* const dst = y_dst + kScan[n]; + VP8PredLuma4[block->imodes_[n]](dst); + DoTransform(bits, coeffs + n * 16, dst); + } + } else { // 16x16 + const int pred_func = CheckMode(mb_x, mb_y, + block->imodes_[0]); + VP8PredLuma16[pred_func](y_dst); + if (bits != 0) { + for (n = 0; n < 16; ++n, bits <<= 2) { + DoTransform(bits, coeffs + n * 16, y_dst + kScan[n]); + } + } + } + { + // Chroma + const uint32_t bits_uv = block->non_zero_uv_; + const int pred_func = CheckMode(mb_x, mb_y, block->uvmode_); + VP8PredChroma8[pred_func](u_dst); + VP8PredChroma8[pred_func](v_dst); + DoUVTransform(bits_uv >> 0, coeffs + 16 * 16, u_dst); + DoUVTransform(bits_uv >> 8, coeffs + 20 * 16, v_dst); + } + + // stash away top samples for next block + if (mb_y < dec->mb_h_ - 1) { + memcpy(top_yuv[0].y, y_dst + 15 * BPS, 16); + memcpy(top_yuv[0].u, u_dst + 7 * BPS, 8); + memcpy(top_yuv[0].v, v_dst + 7 * BPS, 8); + } } - } - // Transfer reconstructed samples from yuv_b_ cache to final destination. - { - const int y_offset = dec->cache_id_ * 16 * dec->cache_y_stride_; - const int uv_offset = dec->cache_id_ * 8 * dec->cache_uv_stride_; - uint8_t* const y_out = dec->cache_y_ + dec->mb_x_ * 16 + y_offset; - uint8_t* const u_out = dec->cache_u_ + dec->mb_x_ * 8 + uv_offset; - uint8_t* const v_out = dec->cache_v_ + dec->mb_x_ * 8 + uv_offset; - for (j = 0; j < 16; ++j) { - memcpy(y_out + j * dec->cache_y_stride_, y_dst + j * BPS, 16); - } - for (j = 0; j < 8; ++j) { - memcpy(u_out + j * dec->cache_uv_stride_, u_dst + j * BPS, 8); - memcpy(v_out + j * dec->cache_uv_stride_, v_dst + j * BPS, 8); + // Transfer reconstructed samples from yuv_b_ cache to final destination. + { + const int y_offset = dec->cache_id_ * 16 * dec->cache_y_stride_; + const int uv_offset = dec->cache_id_ * 8 * dec->cache_uv_stride_; + uint8_t* const y_out = dec->cache_y_ + mb_x * 16 + y_offset; + uint8_t* const u_out = dec->cache_u_ + mb_x * 8 + uv_offset; + uint8_t* const v_out = dec->cache_v_ + mb_x * 8 + uv_offset; + for (j = 0; j < 16; ++j) { + memcpy(y_out + j * dec->cache_y_stride_, y_dst + j * BPS, 16); + } + for (j = 0; j < 8; ++j) { + memcpy(u_out + j * dec->cache_uv_stride_, u_dst + j * BPS, 8); + memcpy(v_out + j * dec->cache_uv_stride_, v_dst + j * BPS, 8); + } } } } diff --git a/src/dec/idec.c b/src/dec/idec.c index c6c3b9ce..ce3290e3 100644 --- a/src/dec/idec.c +++ b/src/dec/idec.c @@ -451,16 +451,11 @@ static VP8StatusCode DecodeRemaining(WebPIDecoder* const idec) { VP8Io* const io = &idec->io_; assert(dec->ready_); - for (; dec->mb_y_ < dec->mb_h_; ++dec->mb_y_) { VP8BitReader* token_br = &dec->parts_[dec->mb_y_ & (dec->num_parts_ - 1)]; - if (dec->mb_x_ == 0) { - VP8InitScanline(dec); - } - for (; dec->mb_x_ < dec->mb_w_; dec->mb_x_++) { + for (; dec->mb_x_ < dec->mb_w_; ++dec->mb_x_) { MBContext context; SaveContext(dec, token_br, &context); - if (!VP8DecodeMB(dec, token_br)) { RestoreContext(&context, dec, token_br); // We shouldn't fail when MAX_MB data was available @@ -469,19 +464,20 @@ static VP8StatusCode DecodeRemaining(WebPIDecoder* const idec) { } return VP8_STATUS_SUSPENDED; } - // Reconstruct and emit samples. - VP8ReconstructBlock(dec); - // Release buffer only if there is only one partition if (dec->num_parts_ == 1) { idec->mem_.start_ = token_br->buf_ - idec->mem_.buf_; assert(idec->mem_.start_ <= idec->mem_.end_); } } + VP8InitScanline(dec); // Prepare for next scanline + + // Reconstruct the samples. + VP8ReconstructBlocks(dec, dec->mb_y_); + // Filter and emit the row. if (!VP8ProcessRow(dec, io)) { return IDecError(idec, VP8_STATUS_USER_ABORT); } - dec->mb_x_ = 0; } // Synchronize the thread and check for errors. if (!VP8ExitCritical(dec, io)) { diff --git a/src/dec/tree.c b/src/dec/tree.c index 82a58b88..89f95dbc 100644 --- a/src/dec/tree.c +++ b/src/dec/tree.c @@ -338,7 +338,7 @@ void VP8ResetProba(VP8Proba* const proba) { void VP8ParseIntraMode(VP8BitReader* const br, VP8Decoder* const dec) { uint8_t* const top = dec->intra_t_ + 4 * dec->mb_x_; uint8_t* const left = dec->intra_l_; - VP8MBData* const block = dec->mb_data_; + VP8MBData* const block = dec->mb_data_ + dec->mb_x_; block->is_i4x4_ = !VP8GetBit(br, 145); // decide for B_PRED first if (!block->is_i4x4_) { diff --git a/src/dec/vp8.c b/src/dec/vp8.c index 91337db4..975920dd 100644 --- a/src/dec/vp8.c +++ b/src/dec/vp8.c @@ -511,7 +511,7 @@ static int ParseResiduals(VP8Decoder* const dec, VP8BandProbas (* const bands)[NUM_BANDS] = dec->proba_.bands_; const VP8BandProbas* ac_proba; const VP8QuantMatrix* const q = &dec->dqm_[dec->segment_]; - VP8MBData* const block = dec->mb_data_; + VP8MBData* const block = dec->mb_data_ + dec->mb_x_; int16_t* dst = block->coeffs_; VP8MB* const left_mb = dec->mb_info_ - 1; uint8_t tnz, lnz; @@ -598,7 +598,7 @@ int VP8DecodeMB(VP8Decoder* const dec, VP8BitReader* const token_br) { VP8BitReader* const br = &dec->br_; VP8MB* const left = dec->mb_info_ - 1; VP8MB* const mb = dec->mb_info_ + dec->mb_x_; - VP8MBData* const block = dec->mb_data_; + VP8MBData* const block = dec->mb_data_ + dec->mb_x_; int skip; // Note: we don't save segment map (yet), as we don't expect @@ -641,24 +641,25 @@ void VP8InitScanline(VP8Decoder* const dec) { left->nz_ = 0; left->nz_dc_ = 0; memset(dec->intra_l_, B_DC_PRED, sizeof(dec->intra_l_)); - dec->filter_row_ = - (dec->filter_type_ > 0) && - (dec->mb_y_ >= dec->tl_mb_y_) && (dec->mb_y_ <= dec->br_mb_y_); + dec->mb_x_ = 0; } static int ParseFrame(VP8Decoder* const dec, VP8Io* io) { for (dec->mb_y_ = 0; dec->mb_y_ < dec->br_mb_y_; ++dec->mb_y_) { + // Parse bitstream for this row. VP8BitReader* const token_br = &dec->parts_[dec->mb_y_ & (dec->num_parts_ - 1)]; - VP8InitScanline(dec); - for (dec->mb_x_ = 0; dec->mb_x_ < dec->mb_w_; dec->mb_x_++) { + for (; dec->mb_x_ < dec->mb_w_; ++dec->mb_x_) { if (!VP8DecodeMB(dec, token_br)) { return VP8SetError(dec, VP8_STATUS_NOT_ENOUGH_DATA, "Premature end-of-file encountered."); } - // Reconstruct and emit samples. - VP8ReconstructBlock(dec); } + VP8InitScanline(dec); // Prepare for next scanline + + // Reconstruct the samples. + VP8ReconstructBlocks(dec, dec->mb_y_); + // Filter and emit the row. if (!VP8ProcessRow(dec, io)) { return VP8SetError(dec, VP8_STATUS_USER_ABORT, "Output aborted."); } diff --git a/src/dec/vp8i.h b/src/dec/vp8i.h index e2c08c4e..4b19343b 100644 --- a/src/dec/vp8i.h +++ b/src/dec/vp8i.h @@ -292,7 +292,6 @@ struct VP8Decoder { // Filtering side-info int filter_type_; // 0=off, 1=simple, 2=complex - int filter_row_; // per-row flag VP8FInfo fstrengths_[NUM_MB_SEGMENTS][2]; // precalculated per-segment/type // Alpha @@ -325,8 +324,8 @@ void VP8ParseQuant(VP8Decoder* const dec); // in frame.c int VP8InitFrame(VP8Decoder* const dec, VP8Io* io); -// Predict a block and add residual -void VP8ReconstructBlock(const VP8Decoder* const dec); +// Reconstruct a full row of blocks (prediction + residual adding) +void VP8ReconstructBlocks(const VP8Decoder* const dec, int mb_y); // Call io->setup() and finish setting up scan parameters. // After this call returns, one must always call VP8ExitCritical() with the // same parameters. Both functions should be used in pair. Returns VP8_STATUS_OK