mirror of
https://github.com/webmproject/libwebp.git
synced 2025-02-13 07:22:52 +01:00
up to 20% faster multi-threaded decoding
Mostly visible for large images. Reconstruction and filtering are now done in parallel with bitstream parsing.
Change-Id: I4cc4483d803b255f4d97a2fcd9158b1c291dd900
This commit is contained in:
parent
38efdc2e9c
commit
0532149c8a
@ -21,6 +21,9 @@ extern "C" {
|
|||||||
|
|
||||||
#define ALIGN_MASK (32 - 1)
|
#define ALIGN_MASK (32 - 1)
|
||||||
|
|
||||||
|
static void ReconstructRow(const VP8Decoder* const dec,
|
||||||
|
const VP8ThreadContext* ctx); // TODO(skal): remove
|
||||||
|
|
||||||
//------------------------------------------------------------------------------
|
//------------------------------------------------------------------------------
|
||||||
// Filtering
|
// Filtering
|
||||||
|
|
||||||
@ -41,9 +44,10 @@ static WEBP_INLINE int hev_thresh_from_level(int level, int keyframe) {
|
|||||||
|
|
||||||
static void DoFilter(const VP8Decoder* const dec, int mb_x, int mb_y) {
|
static void DoFilter(const VP8Decoder* const dec, int mb_x, int mb_y) {
|
||||||
const VP8ThreadContext* const ctx = &dec->thread_ctx_;
|
const VP8ThreadContext* const ctx = &dec->thread_ctx_;
|
||||||
|
const int cache_id = ctx->id_;
|
||||||
const int y_bps = dec->cache_y_stride_;
|
const int y_bps = dec->cache_y_stride_;
|
||||||
VP8FInfo* const f_info = ctx->f_info_ + mb_x;
|
VP8FInfo* const f_info = ctx->f_info_ + mb_x;
|
||||||
uint8_t* const y_dst = dec->cache_y_ + ctx->id_ * 16 * y_bps + mb_x * 16;
|
uint8_t* const y_dst = dec->cache_y_ + cache_id * 16 * y_bps + mb_x * 16;
|
||||||
const int level = f_info->f_level_;
|
const int level = f_info->f_level_;
|
||||||
const int ilevel = f_info->f_ilevel_;
|
const int ilevel = f_info->f_ilevel_;
|
||||||
const int limit = 2 * level + ilevel;
|
const int limit = 2 * level + ilevel;
|
||||||
@ -65,8 +69,8 @@ static void DoFilter(const VP8Decoder* const dec, int mb_x, int mb_y) {
|
|||||||
}
|
}
|
||||||
} else { // complex
|
} else { // complex
|
||||||
const int uv_bps = dec->cache_uv_stride_;
|
const int uv_bps = dec->cache_uv_stride_;
|
||||||
uint8_t* const u_dst = dec->cache_u_ + ctx->id_ * 8 * uv_bps + mb_x * 8;
|
uint8_t* const u_dst = dec->cache_u_ + cache_id * 8 * uv_bps + mb_x * 8;
|
||||||
uint8_t* const v_dst = dec->cache_v_ + ctx->id_ * 8 * uv_bps + mb_x * 8;
|
uint8_t* const v_dst = dec->cache_v_ + cache_id * 8 * uv_bps + mb_x * 8;
|
||||||
const int hev_thresh =
|
const int hev_thresh =
|
||||||
hev_thresh_from_level(level, dec->frm_hdr_.key_frame_);
|
hev_thresh_from_level(level, dec->frm_hdr_.key_frame_);
|
||||||
if (mb_x > 0) {
|
if (mb_x > 0) {
|
||||||
@ -164,25 +168,29 @@ static void PrecomputeFilterStrengths(VP8Decoder* const dec) {
|
|||||||
static int FinishRow(VP8Decoder* const dec, VP8Io* const io) {
|
static int FinishRow(VP8Decoder* const dec, VP8Io* const io) {
|
||||||
int ok = 1;
|
int ok = 1;
|
||||||
const VP8ThreadContext* const ctx = &dec->thread_ctx_;
|
const VP8ThreadContext* const ctx = &dec->thread_ctx_;
|
||||||
|
const int cache_id = ctx->id_;
|
||||||
const int extra_y_rows = kFilterExtraRows[dec->filter_type_];
|
const int extra_y_rows = kFilterExtraRows[dec->filter_type_];
|
||||||
const int ysize = extra_y_rows * dec->cache_y_stride_;
|
const int ysize = extra_y_rows * dec->cache_y_stride_;
|
||||||
const int uvsize = (extra_y_rows / 2) * dec->cache_uv_stride_;
|
const int uvsize = (extra_y_rows / 2) * dec->cache_uv_stride_;
|
||||||
const int y_offset = ctx->id_ * 16 * dec->cache_y_stride_;
|
const int y_offset = cache_id * 16 * dec->cache_y_stride_;
|
||||||
const int uv_offset = ctx->id_ * 8 * dec->cache_uv_stride_;
|
const int uv_offset = cache_id * 8 * dec->cache_uv_stride_;
|
||||||
uint8_t* const ydst = dec->cache_y_ - ysize + y_offset;
|
uint8_t* const ydst = dec->cache_y_ - ysize + y_offset;
|
||||||
uint8_t* const udst = dec->cache_u_ - uvsize + uv_offset;
|
uint8_t* const udst = dec->cache_u_ - uvsize + uv_offset;
|
||||||
uint8_t* const vdst = dec->cache_v_ - uvsize + uv_offset;
|
uint8_t* const vdst = dec->cache_v_ - uvsize + uv_offset;
|
||||||
const int first_row = (ctx->mb_y_ == 0);
|
const int mb_y = ctx->mb_y_;
|
||||||
const int last_row = (ctx->mb_y_ >= dec->br_mb_y_ - 1);
|
const int is_first_row = (mb_y == 0);
|
||||||
int y_start = MACROBLOCK_VPOS(ctx->mb_y_);
|
const int is_last_row = (mb_y >= dec->br_mb_y_ - 1);
|
||||||
int y_end = MACROBLOCK_VPOS(ctx->mb_y_ + 1);
|
|
||||||
|
ReconstructRow(dec, ctx);
|
||||||
|
|
||||||
if (ctx->filter_row_) {
|
if (ctx->filter_row_) {
|
||||||
FilterRow(dec);
|
FilterRow(dec);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (io->put != NULL) {
|
if (io->put != NULL) {
|
||||||
if (!first_row) {
|
int y_start = MACROBLOCK_VPOS(mb_y);
|
||||||
|
int y_end = MACROBLOCK_VPOS(mb_y + 1);
|
||||||
|
if (!is_first_row) {
|
||||||
y_start -= extra_y_rows;
|
y_start -= extra_y_rows;
|
||||||
io->y = ydst;
|
io->y = ydst;
|
||||||
io->u = udst;
|
io->u = udst;
|
||||||
@ -193,7 +201,7 @@ static int FinishRow(VP8Decoder* const dec, VP8Io* const io) {
|
|||||||
io->v = dec->cache_v_ + uv_offset;
|
io->v = dec->cache_v_ + uv_offset;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (!last_row) {
|
if (!is_last_row) {
|
||||||
y_end -= extra_y_rows;
|
y_end -= extra_y_rows;
|
||||||
}
|
}
|
||||||
if (y_end > io->crop_bottom) {
|
if (y_end > io->crop_bottom) {
|
||||||
@ -234,8 +242,8 @@ static int FinishRow(VP8Decoder* const dec, VP8Io* const io) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
// rotate top samples if needed
|
// rotate top samples if needed
|
||||||
if (ctx->id_ + 1 == dec->num_caches_) {
|
if (cache_id + 1 == dec->num_caches_) {
|
||||||
if (!last_row) {
|
if (!is_last_row) {
|
||||||
memcpy(dec->cache_y_ - ysize, ydst + 16 * dec->cache_y_stride_, ysize);
|
memcpy(dec->cache_y_ - ysize, ydst + 16 * dec->cache_y_stride_, ysize);
|
||||||
memcpy(dec->cache_u_ - uvsize, udst + 8 * dec->cache_uv_stride_, uvsize);
|
memcpy(dec->cache_u_ - uvsize, udst + 8 * dec->cache_uv_stride_, uvsize);
|
||||||
memcpy(dec->cache_v_ - uvsize, vdst + 8 * dec->cache_uv_stride_, uvsize);
|
memcpy(dec->cache_v_ - uvsize, vdst + 8 * dec->cache_uv_stride_, uvsize);
|
||||||
@ -270,6 +278,11 @@ int VP8ProcessRow(VP8Decoder* const dec, VP8Io* const io) {
|
|||||||
ctx->id_ = dec->cache_id_;
|
ctx->id_ = dec->cache_id_;
|
||||||
ctx->mb_y_ = dec->mb_y_;
|
ctx->mb_y_ = dec->mb_y_;
|
||||||
ctx->filter_row_ = filter_row;
|
ctx->filter_row_ = filter_row;
|
||||||
|
{
|
||||||
|
VP8MBData* const tmp = ctx->mb_data_;
|
||||||
|
ctx->mb_data_ = dec->mb_data_;
|
||||||
|
dec->mb_data_ = tmp;
|
||||||
|
}
|
||||||
if (filter_row) { // just swap filter info
|
if (filter_row) { // just swap filter info
|
||||||
VP8FInfo* const tmp = ctx->f_info_;
|
VP8FInfo* const tmp = ctx->f_info_;
|
||||||
ctx->f_info_ = dec->f_info_;
|
ctx->f_info_ = dec->f_info_;
|
||||||
@ -419,7 +432,8 @@ static int AllocateMemory(VP8Decoder* const dec) {
|
|||||||
mb_w * (dec->use_threads_ ? 2 : 1) * sizeof(VP8FInfo)
|
mb_w * (dec->use_threads_ ? 2 : 1) * sizeof(VP8FInfo)
|
||||||
: 0;
|
: 0;
|
||||||
const size_t yuv_size = YUV_SIZE * sizeof(*dec->yuv_b_);
|
const size_t yuv_size = YUV_SIZE * sizeof(*dec->yuv_b_);
|
||||||
const size_t mb_data_size = mb_w * sizeof(*dec->mb_data_);
|
const size_t mb_data_size =
|
||||||
|
(dec->use_threads_ ? 2 : 1) * mb_w * sizeof(*dec->mb_data_);
|
||||||
const size_t cache_height = (16 * num_caches
|
const size_t cache_height = (16 * num_caches
|
||||||
+ kFilterExtraRows[dec->filter_type_]) * 3 / 2;
|
+ kFilterExtraRows[dec->filter_type_]) * 3 / 2;
|
||||||
const size_t cache_size = top_size * cache_height;
|
const size_t cache_size = top_size * cache_height;
|
||||||
@ -472,6 +486,10 @@ static int AllocateMemory(VP8Decoder* const dec) {
|
|||||||
mem += yuv_size;
|
mem += yuv_size;
|
||||||
|
|
||||||
dec->mb_data_ = (VP8MBData*)mem;
|
dec->mb_data_ = (VP8MBData*)mem;
|
||||||
|
dec->thread_ctx_.mb_data_ = (VP8MBData*)mem;
|
||||||
|
if (dec->use_threads_) {
|
||||||
|
dec->thread_ctx_.mb_data_ += mb_w;
|
||||||
|
}
|
||||||
mem += mb_data_size;
|
mem += mb_data_size;
|
||||||
|
|
||||||
dec->cache_y_stride_ = 16 * mb_w;
|
dec->cache_y_stride_ = 16 * mb_w;
|
||||||
@ -576,14 +594,17 @@ static void DoUVTransform(uint32_t bits, const int16_t* const src,
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
void VP8ReconstructBlocks(const VP8Decoder* const dec, int mb_y) {
|
static void ReconstructRow(const VP8Decoder* const dec,
|
||||||
|
const VP8ThreadContext* ctx) {
|
||||||
int j;
|
int j;
|
||||||
int mb_x;
|
int mb_x;
|
||||||
|
const int mb_y = ctx->mb_y_;
|
||||||
|
const int cache_id = ctx->id_;
|
||||||
uint8_t* const y_dst = dec->yuv_b_ + Y_OFF;
|
uint8_t* const y_dst = dec->yuv_b_ + Y_OFF;
|
||||||
uint8_t* const u_dst = dec->yuv_b_ + U_OFF;
|
uint8_t* const u_dst = dec->yuv_b_ + U_OFF;
|
||||||
uint8_t* const v_dst = dec->yuv_b_ + V_OFF;
|
uint8_t* const v_dst = dec->yuv_b_ + V_OFF;
|
||||||
for (mb_x = 0; mb_x < dec->mb_w_; ++mb_x) {
|
for (mb_x = 0; mb_x < dec->mb_w_; ++mb_x) {
|
||||||
const VP8MBData* const block = dec->mb_data_ + mb_x;
|
const VP8MBData* const block = ctx->mb_data_ + mb_x;
|
||||||
|
|
||||||
// Rotate in the left samples from previously decoded block. We move four
|
// Rotate in the left samples from previously decoded block. We move four
|
||||||
// pixels at a time for alignment reason, and because of in-loop filter.
|
// pixels at a time for alignment reason, and because of in-loop filter.
|
||||||
@ -676,8 +697,8 @@ void VP8ReconstructBlocks(const VP8Decoder* const dec, int mb_y) {
|
|||||||
}
|
}
|
||||||
// Transfer reconstructed samples from yuv_b_ cache to final destination.
|
// Transfer reconstructed samples from yuv_b_ cache to final destination.
|
||||||
{
|
{
|
||||||
const int y_offset = dec->cache_id_ * 16 * dec->cache_y_stride_;
|
const int y_offset = cache_id * 16 * dec->cache_y_stride_;
|
||||||
const int uv_offset = dec->cache_id_ * 8 * dec->cache_uv_stride_;
|
const int uv_offset = cache_id * 8 * dec->cache_uv_stride_;
|
||||||
uint8_t* const y_out = dec->cache_y_ + mb_x * 16 + y_offset;
|
uint8_t* const y_out = dec->cache_y_ + mb_x * 16 + y_offset;
|
||||||
uint8_t* const u_out = dec->cache_u_ + mb_x * 8 + uv_offset;
|
uint8_t* const u_out = dec->cache_u_ + mb_x * 8 + uv_offset;
|
||||||
uint8_t* const v_out = dec->cache_v_ + mb_x * 8 + uv_offset;
|
uint8_t* const v_out = dec->cache_v_ + mb_x * 8 + uv_offset;
|
||||||
|
@ -472,9 +472,7 @@ static VP8StatusCode DecodeRemaining(WebPIDecoder* const idec) {
|
|||||||
}
|
}
|
||||||
VP8InitScanline(dec); // Prepare for next scanline
|
VP8InitScanline(dec); // Prepare for next scanline
|
||||||
|
|
||||||
// Reconstruct the samples.
|
// Reconstruct, filter and emit the row.
|
||||||
VP8ReconstructBlocks(dec, dec->mb_y_);
|
|
||||||
// Filter and emit the row.
|
|
||||||
if (!VP8ProcessRow(dec, io)) {
|
if (!VP8ProcessRow(dec, io)) {
|
||||||
return IDecError(idec, VP8_STATUS_USER_ABORT);
|
return IDecError(idec, VP8_STATUS_USER_ABORT);
|
||||||
}
|
}
|
||||||
|
@ -657,9 +657,7 @@ static int ParseFrame(VP8Decoder* const dec, VP8Io* io) {
|
|||||||
}
|
}
|
||||||
VP8InitScanline(dec); // Prepare for next scanline
|
VP8InitScanline(dec); // Prepare for next scanline
|
||||||
|
|
||||||
// Reconstruct the samples.
|
// Reconstruct, filter and emit the row.
|
||||||
VP8ReconstructBlocks(dec, dec->mb_y_);
|
|
||||||
// Filter and emit the row.
|
|
||||||
if (!VP8ProcessRow(dec, io)) {
|
if (!VP8ProcessRow(dec, io)) {
|
||||||
return VP8SetError(dec, VP8_STATUS_USER_ABORT, "Output aborted.");
|
return VP8SetError(dec, VP8_STATUS_USER_ABORT, "Output aborted.");
|
||||||
}
|
}
|
||||||
|
@ -200,7 +200,8 @@ typedef struct {
|
|||||||
int id_; // cache row to process (in [0..2])
|
int id_; // cache row to process (in [0..2])
|
||||||
int mb_y_; // macroblock position of the row
|
int mb_y_; // macroblock position of the row
|
||||||
int filter_row_; // true if row-filtering is needed
|
int filter_row_; // true if row-filtering is needed
|
||||||
VP8FInfo* f_info_; // filter strengths
|
VP8FInfo* f_info_; // filter strengths (swapped with dec->f_info_)
|
||||||
|
VP8MBData* mb_data_; // reconstruction data (swapped with dec->mb_data_)
|
||||||
VP8Io io_; // copy of the VP8Io to pass to put()
|
VP8Io io_; // copy of the VP8Io to pass to put()
|
||||||
} VP8ThreadContext;
|
} VP8ThreadContext;
|
||||||
|
|
||||||
@ -288,7 +289,7 @@ struct VP8Decoder {
|
|||||||
|
|
||||||
// Per macroblock non-persistent infos.
|
// Per macroblock non-persistent infos.
|
||||||
int mb_x_, mb_y_; // current position, in macroblock units
|
int mb_x_, mb_y_; // current position, in macroblock units
|
||||||
VP8MBData* mb_data_; // reconstruction data
|
VP8MBData* mb_data_; // parsed reconstruction data
|
||||||
|
|
||||||
// Filtering side-info
|
// Filtering side-info
|
||||||
int filter_type_; // 0=off, 1=simple, 2=complex
|
int filter_type_; // 0=off, 1=simple, 2=complex
|
||||||
@ -324,8 +325,6 @@ void VP8ParseQuant(VP8Decoder* const dec);
|
|||||||
|
|
||||||
// in frame.c
|
// in frame.c
|
||||||
int VP8InitFrame(VP8Decoder* const dec, VP8Io* io);
|
int VP8InitFrame(VP8Decoder* const dec, VP8Io* io);
|
||||||
// Reconstruct a full row of blocks (prediction + residual adding)
|
|
||||||
void VP8ReconstructBlocks(const VP8Decoder* const dec, int mb_y);
|
|
||||||
// Call io->setup() and finish setting up scan parameters.
|
// Call io->setup() and finish setting up scan parameters.
|
||||||
// After this call returns, one must always call VP8ExitCritical() with the
|
// After this call returns, one must always call VP8ExitCritical() with the
|
||||||
// same parameters. Both functions should be used in pair. Returns VP8_STATUS_OK
|
// same parameters. Both functions should be used in pair. Returns VP8_STATUS_OK
|
||||||
|
Loading…
x
Reference in New Issue
Block a user