From 0532149c8a7d618a81842f1135ceb2d734de55f2 Mon Sep 17 00:00:00 2001 From: skal Date: Tue, 15 Oct 2013 00:25:21 +0200 Subject: [PATCH] up to 20% faster multi-threaded decoding Mostly visible for large images. Reconstruction+filtering is now done in parallel to bitstream-parsing. Change-Id: I4cc4483d803b255f4d97a2fcd9158b1c291dd900 --- src/dec/frame.c | 57 +++++++++++++++++++++++++++++++++---------------- src/dec/idec.c | 4 +--- src/dec/vp8.c | 4 +--- src/dec/vp8i.h | 17 +++++++-------- 4 files changed, 49 insertions(+), 33 deletions(-) diff --git a/src/dec/frame.c b/src/dec/frame.c index ae34eb7b..fad92e6a 100644 --- a/src/dec/frame.c +++ b/src/dec/frame.c @@ -21,6 +21,9 @@ extern "C" { #define ALIGN_MASK (32 - 1) +static void ReconstructRow(const VP8Decoder* const dec, + const VP8ThreadContext* ctx); // TODO(skal): remove + //------------------------------------------------------------------------------ // Filtering @@ -41,9 +44,10 @@ static WEBP_INLINE int hev_thresh_from_level(int level, int keyframe) { static void DoFilter(const VP8Decoder* const dec, int mb_x, int mb_y) { const VP8ThreadContext* const ctx = &dec->thread_ctx_; + const int cache_id = ctx->id_; const int y_bps = dec->cache_y_stride_; VP8FInfo* const f_info = ctx->f_info_ + mb_x; - uint8_t* const y_dst = dec->cache_y_ + ctx->id_ * 16 * y_bps + mb_x * 16; + uint8_t* const y_dst = dec->cache_y_ + cache_id * 16 * y_bps + mb_x * 16; const int level = f_info->f_level_; const int ilevel = f_info->f_ilevel_; const int limit = 2 * level + ilevel; @@ -65,8 +69,8 @@ static void DoFilter(const VP8Decoder* const dec, int mb_x, int mb_y) { } } else { // complex const int uv_bps = dec->cache_uv_stride_; - uint8_t* const u_dst = dec->cache_u_ + ctx->id_ * 8 * uv_bps + mb_x * 8; - uint8_t* const v_dst = dec->cache_v_ + ctx->id_ * 8 * uv_bps + mb_x * 8; + uint8_t* const u_dst = dec->cache_u_ + cache_id * 8 * uv_bps + mb_x * 8; + uint8_t* const v_dst = dec->cache_v_ + cache_id * 8 * uv_bps + mb_x * 8; const int hev_thresh = hev_thresh_from_level(level, dec->frm_hdr_.key_frame_); if (mb_x > 0) { @@ -164,25 +168,29 @@ static void PrecomputeFilterStrengths(VP8Decoder* const dec) { static int FinishRow(VP8Decoder* const dec, VP8Io* const io) { int ok = 1; const VP8ThreadContext* const ctx = &dec->thread_ctx_; + const int cache_id = ctx->id_; const int extra_y_rows = kFilterExtraRows[dec->filter_type_]; const int ysize = extra_y_rows * dec->cache_y_stride_; const int uvsize = (extra_y_rows / 2) * dec->cache_uv_stride_; - const int y_offset = ctx->id_ * 16 * dec->cache_y_stride_; - const int uv_offset = ctx->id_ * 8 * dec->cache_uv_stride_; + const int y_offset = cache_id * 16 * dec->cache_y_stride_; + const int uv_offset = cache_id * 8 * dec->cache_uv_stride_; uint8_t* const ydst = dec->cache_y_ - ysize + y_offset; uint8_t* const udst = dec->cache_u_ - uvsize + uv_offset; uint8_t* const vdst = dec->cache_v_ - uvsize + uv_offset; - const int first_row = (ctx->mb_y_ == 0); - const int last_row = (ctx->mb_y_ >= dec->br_mb_y_ - 1); - int y_start = MACROBLOCK_VPOS(ctx->mb_y_); - int y_end = MACROBLOCK_VPOS(ctx->mb_y_ + 1); + const int mb_y = ctx->mb_y_; + const int is_first_row = (mb_y == 0); + const int is_last_row = (mb_y >= dec->br_mb_y_ - 1); + + ReconstructRow(dec, ctx); if (ctx->filter_row_) { FilterRow(dec); } if (io->put != NULL) { - if (!first_row) { + int y_start = MACROBLOCK_VPOS(mb_y); + int y_end = MACROBLOCK_VPOS(mb_y + 1); + if (!is_first_row) { y_start -= extra_y_rows; io->y = ydst; io->u = udst; @@ -193,7 +201,7 @@ static int FinishRow(VP8Decoder* const dec, VP8Io* const io) { io->v = dec->cache_v_ + uv_offset; } - if (!last_row) { + if (!is_last_row) { y_end -= extra_y_rows; } if (y_end > io->crop_bottom) { @@ -234,8 +242,8 @@ static int FinishRow(VP8Decoder* const dec, VP8Io* const io) { } } // rotate top samples if needed - if (ctx->id_ + 1 == dec->num_caches_) { - if (!last_row) { + if (cache_id + 1 == dec->num_caches_) { + if (!is_last_row) { memcpy(dec->cache_y_ - ysize, ydst + 16 * dec->cache_y_stride_, ysize); memcpy(dec->cache_u_ - uvsize, udst + 8 * dec->cache_uv_stride_, uvsize); memcpy(dec->cache_v_ - uvsize, vdst + 8 * dec->cache_uv_stride_, uvsize); @@ -270,6 +278,11 @@ int VP8ProcessRow(VP8Decoder* const dec, VP8Io* const io) { ctx->id_ = dec->cache_id_; ctx->mb_y_ = dec->mb_y_; ctx->filter_row_ = filter_row; + { + VP8MBData* const tmp = ctx->mb_data_; + ctx->mb_data_ = dec->mb_data_; + dec->mb_data_ = tmp; + } if (filter_row) { // just swap filter info VP8FInfo* const tmp = ctx->f_info_; ctx->f_info_ = dec->f_info_; @@ -419,7 +432,8 @@ static int AllocateMemory(VP8Decoder* const dec) { mb_w * (dec->use_threads_ ? 2 : 1) * sizeof(VP8FInfo) : 0; const size_t yuv_size = YUV_SIZE * sizeof(*dec->yuv_b_); - const size_t mb_data_size = mb_w * sizeof(*dec->mb_data_); + const size_t mb_data_size = + (dec->use_threads_ ? 2 : 1) * mb_w * sizeof(*dec->mb_data_); const size_t cache_height = (16 * num_caches + kFilterExtraRows[dec->filter_type_]) * 3 / 2; const size_t cache_size = top_size * cache_height; @@ -472,6 +486,10 @@ static int AllocateMemory(VP8Decoder* const dec) { mem += yuv_size; dec->mb_data_ = (VP8MBData*)mem; + dec->thread_ctx_.mb_data_ = (VP8MBData*)mem; + if (dec->use_threads_) { + dec->thread_ctx_.mb_data_ += mb_w; + } mem += mb_data_size; dec->cache_y_stride_ = 16 * mb_w; @@ -576,14 +594,17 @@ static void DoUVTransform(uint32_t bits, const int16_t* const src, } } -void VP8ReconstructBlocks(const VP8Decoder* const dec, int mb_y) { +static void ReconstructRow(const VP8Decoder* const dec, + const VP8ThreadContext* ctx) { int j; int mb_x; + const int mb_y = ctx->mb_y_; + const int cache_id = ctx->id_; uint8_t* const y_dst = dec->yuv_b_ + Y_OFF; uint8_t* const u_dst = dec->yuv_b_ + U_OFF; uint8_t* const v_dst = dec->yuv_b_ + V_OFF; for (mb_x = 0; mb_x < dec->mb_w_; ++mb_x) { - const VP8MBData* const block = dec->mb_data_ + mb_x; + const VP8MBData* const block = ctx->mb_data_ + mb_x; // Rotate in the left samples from previously decoded block. We move four // pixels at a time for alignment reason, and because of in-loop filter. @@ -676,8 +697,8 @@ void VP8ReconstructBlocks(const VP8Decoder* const dec, int mb_y) { } // Transfer reconstructed samples from yuv_b_ cache to final destination. { - const int y_offset = dec->cache_id_ * 16 * dec->cache_y_stride_; - const int uv_offset = dec->cache_id_ * 8 * dec->cache_uv_stride_; + const int y_offset = cache_id * 16 * dec->cache_y_stride_; + const int uv_offset = cache_id * 8 * dec->cache_uv_stride_; uint8_t* const y_out = dec->cache_y_ + mb_x * 16 + y_offset; uint8_t* const u_out = dec->cache_u_ + mb_x * 8 + uv_offset; uint8_t* const v_out = dec->cache_v_ + mb_x * 8 + uv_offset; diff --git a/src/dec/idec.c b/src/dec/idec.c index ce3290e3..a74dbfeb 100644 --- a/src/dec/idec.c +++ b/src/dec/idec.c @@ -472,9 +472,7 @@ static VP8StatusCode DecodeRemaining(WebPIDecoder* const idec) { } VP8InitScanline(dec); // Prepare for next scanline - // Reconstruct the samples. - VP8ReconstructBlocks(dec, dec->mb_y_); - // Filter and emit the row. + // Reconstruct, filter and emit the row. if (!VP8ProcessRow(dec, io)) { return IDecError(idec, VP8_STATUS_USER_ABORT); } diff --git a/src/dec/vp8.c b/src/dec/vp8.c index 975920dd..03022c14 100644 --- a/src/dec/vp8.c +++ b/src/dec/vp8.c @@ -657,9 +657,7 @@ static int ParseFrame(VP8Decoder* const dec, VP8Io* io) { } VP8InitScanline(dec); // Prepare for next scanline - // Reconstruct the samples. - VP8ReconstructBlocks(dec, dec->mb_y_); - // Filter and emit the row. + // Reconstruct, filter and emit the row. if (!VP8ProcessRow(dec, io)) { return VP8SetError(dec, VP8_STATUS_USER_ABORT, "Output aborted."); } diff --git a/src/dec/vp8i.h b/src/dec/vp8i.h index 4b19343b..b8dd2e8c 100644 --- a/src/dec/vp8i.h +++ b/src/dec/vp8i.h @@ -197,11 +197,12 @@ typedef struct { // Persistent information needed by the parallel processing typedef struct { - int id_; // cache row to process (in [0..2]) - int mb_y_; // macroblock position of the row - int filter_row_; // true if row-filtering is needed - VP8FInfo* f_info_; // filter strengths - VP8Io io_; // copy of the VP8Io to pass to put() + int id_; // cache row to process (in [0..2]) + int mb_y_; // macroblock position of the row + int filter_row_; // true if row-filtering is needed + VP8FInfo* f_info_; // filter strengths (swapped with dec->f_info_) + VP8MBData* mb_data_; // reconstruction data (swapped with dec->mb_data_) + VP8Io io_; // copy of the VP8Io to pass to put() } VP8ThreadContext; // Saved top samples, per macroblock. Fits into a cache-line. @@ -287,8 +288,8 @@ struct VP8Decoder { size_t mem_size_; // Per macroblock non-persistent infos. - int mb_x_, mb_y_; // current position, in macroblock units - VP8MBData* mb_data_; // reconstruction data + int mb_x_, mb_y_; // current position, in macroblock units + VP8MBData* mb_data_; // parsed reconstruction data // Filtering side-info int filter_type_; // 0=off, 1=simple, 2=complex @@ -324,8 +325,6 @@ void VP8ParseQuant(VP8Decoder* const dec); // in frame.c int VP8InitFrame(VP8Decoder* const dec, VP8Io* io); -// Reconstruct a full row of blocks (prediction + residual adding) -void VP8ReconstructBlocks(const VP8Decoder* const dec, int mb_y); // Call io->setup() and finish setting up scan parameters. // After this call returns, one must always call VP8ExitCritical() with the // same parameters. Both functions should be used in pair. Returns VP8_STATUS_OK