Add a second multi-thread method

method 1 grouping: [parse + reconstruction] // [filtering + output]
method 2 grouping: [parse] // [reconstruction+filtering + output]

Depending on some heuristics (see VP8ThreadMethod()), we
can pick one of the other when -mt flag (or option.use_threads)
is selected.

Conservatively, we always use method #2 for now until the heuristic
is refined (so, timing should be the same the before this patch)

+ replace 'use_threads' by 'mt_method'
+ define MIN_WIDTH_FOR_THREADS constant
+ fix comment alignment

Change-Id: I11a756dea9070d6e21b1a9481d357a1e8aa0663e
This commit is contained in:
skal
2013-10-15 23:58:31 +02:00
parent 7d6f2da075
commit 8a2fa099cc
5 changed files with 62 additions and 30 deletions

View File

@ -181,7 +181,9 @@ static int FinishRow(VP8Decoder* const dec, VP8Io* const io) {
const int is_first_row = (mb_y == 0);
const int is_last_row = (mb_y >= dec->br_mb_y_ - 1);
ReconstructRow(dec, ctx);
if (dec->mt_method_ == 2) {
ReconstructRow(dec, ctx);
}
if (ctx->filter_row_) {
FilterRow(dec);
@ -263,10 +265,11 @@ int VP8ProcessRow(VP8Decoder* const dec, VP8Io* const io) {
const int filter_row =
(dec->filter_type_ > 0) &&
(dec->mb_y_ >= dec->tl_mb_y_) && (dec->mb_y_ <= dec->br_mb_y_);
if (!dec->use_threads_) {
if (dec->mt_method_ == 0) {
// ctx->id_ and ctx->f_info_ are already set
ctx->mb_y_ = dec->mb_y_;
ctx->filter_row_ = filter_row;
ReconstructRow(dec, ctx);
ok = FinishRow(dec, io);
} else {
WebPWorker* const worker = &dec->worker_;
@ -278,17 +281,20 @@ int VP8ProcessRow(VP8Decoder* const dec, VP8Io* const io) {
ctx->id_ = dec->cache_id_;
ctx->mb_y_ = dec->mb_y_;
ctx->filter_row_ = filter_row;
{
if (dec->mt_method_ == 2) { // swap macroblock data
VP8MBData* const tmp = ctx->mb_data_;
ctx->mb_data_ = dec->mb_data_;
dec->mb_data_ = tmp;
} else {
// perform reconstruction directly in main thread
ReconstructRow(dec, ctx);
}
if (filter_row) { // just swap filter info
if (filter_row) { // swap filter info
VP8FInfo* const tmp = ctx->f_info_;
ctx->f_info_ = dec->f_info_;
dec->f_info_ = tmp;
}
WebPWorkerLaunch(worker);
WebPWorkerLaunch(worker); // (reconstruct)+filter in parallel
if (++dec->cache_id_ == dec->num_caches_) {
dec->cache_id_ = 0;
}
@ -357,7 +363,7 @@ VP8StatusCode VP8EnterCritical(VP8Decoder* const dec, VP8Io* const io) {
int VP8ExitCritical(VP8Decoder* const dec, VP8Io* const io) {
int ok = 1;
if (dec->use_threads_) {
if (dec->mt_method_ > 0) {
ok = WebPWorkerSync(&dec->worker_);
}
@ -397,7 +403,7 @@ int VP8ExitCritical(VP8Decoder* const dec, VP8Io* const io) {
// Initialize multi/single-thread worker
static int InitThreadContext(VP8Decoder* const dec) {
dec->cache_id_ = 0;
if (dec->use_threads_) {
if (dec->mt_method_ > 0) {
WebPWorker* const worker = &dec->worker_;
if (!WebPWorkerReset(worker)) {
return VP8SetError(dec, VP8_STATUS_OUT_OF_MEMORY,
@ -414,6 +420,28 @@ static int InitThreadContext(VP8Decoder* const dec) {
return 1;
}
int VP8GetThreadMethod(const WebPDecoderOptions* const options,
const WebPHeaderStructure* const headers,
int width, int height) {
if (options == NULL || options->use_threads == 0) {
return 0;
}
(void)headers;
(void)width;
(void)height;
assert(!headers->is_lossless);
#if defined(WEBP_USE_THREAD)
if (width < MIN_WIDTH_FOR_THREADS) return 0;
// TODO(skal): tune the heuristic further
#if 0
if (height < 2 * width) return 2;
#endif
return 2;
#else // !WEBP_USE_THREAD
return 0;
#endif
}
#undef MT_CACHE_LINES
#undef ST_CACHE_LINES
@ -429,11 +457,11 @@ static int AllocateMemory(VP8Decoder* const dec) {
const size_t mb_info_size = (mb_w + 1) * sizeof(VP8MB);
const size_t f_info_size =
(dec->filter_type_ > 0) ?
mb_w * (dec->use_threads_ ? 2 : 1) * sizeof(VP8FInfo)
mb_w * (dec->mt_method_ > 0 ? 2 : 1) * sizeof(VP8FInfo)
: 0;
const size_t yuv_size = YUV_SIZE * sizeof(*dec->yuv_b_);
const size_t mb_data_size =
(dec->use_threads_ ? 2 : 1) * mb_w * sizeof(*dec->mb_data_);
(dec->mt_method_ == 2 ? 2 : 1) * mb_w * sizeof(*dec->mb_data_);
const size_t cache_height = (16 * num_caches
+ kFilterExtraRows[dec->filter_type_]) * 3 / 2;
const size_t cache_size = top_size * cache_height;
@ -473,7 +501,7 @@ static int AllocateMemory(VP8Decoder* const dec) {
mem += f_info_size;
dec->thread_ctx_.id_ = 0;
dec->thread_ctx_.f_info_ = dec->f_info_;
if (dec->use_threads_) {
if (dec->mt_method_ > 0) {
// secondary cache line. The deblocking process need to make use of the
// filtering strength from previous macroblock row, while the new ones
// are being decoded in parallel. We'll just swap the pointers.
@ -487,7 +515,7 @@ static int AllocateMemory(VP8Decoder* const dec) {
dec->mb_data_ = (VP8MBData*)mem;
dec->thread_ctx_.mb_data_ = (VP8MBData*)mem;
if (dec->use_threads_) {
if (dec->mt_method_ == 2) {
dec->thread_ctx_.mb_data_ += mb_w;
}
mem += mb_data_size;