diff --git a/src/enc/analysis.c b/src/enc/analysis.c index 4ff3edd2..77b17ab8 100644 --- a/src/enc/analysis.c +++ b/src/enc/analysis.c @@ -384,32 +384,111 @@ static void ResetAllMBInfo(VP8Encoder* const enc) { // Default susceptibilities. enc->dqm_[0].alpha_ = 0; enc->dqm_[0].beta_ = 0; - // Note: we can't compute this alpha_ / uv_alpha_. + // Note: we can't compute this alpha_ / uv_alpha_ -> set to default value. + enc->alpha_ = 0; + enc->uv_alpha_ = 0; WebPReportProgress(enc->pic_, enc->percent_ + 20, &enc->percent_); } +// struct used to collect job result +typedef struct { + WebPWorker worker; + int alphas[MAX_ALPHA + 1]; + int alpha, uv_alpha; + VP8EncIterator it; + int delta_progress; +} SegmentJob; + +// main work call +static int DoSegmentsJob(SegmentJob* const job, VP8EncIterator* const it) { + int ok = 1; + if (!VP8IteratorIsDone(it)) { + uint8_t tmp[32 + ALIGN_CST]; + uint8_t* const scratch = (uint8_t*)DO_ALIGN(tmp); + do { + // Let's pretend we have perfect lossless reconstruction. 
+ VP8IteratorImport(it, scratch); + MBAnalyze(it, job->alphas, &job->alpha, &job->uv_alpha); + ok = VP8IteratorProgress(it, job->delta_progress); + } while (ok && VP8IteratorNext(it)); + } + return ok; +} + +static void MergeJobs(const SegmentJob* const src, SegmentJob* const dst) { + int i; + for (i = 0; i <= MAX_ALPHA; ++i) dst->alphas[i] += src->alphas[i]; + dst->alpha += src->alpha; + dst->uv_alpha += src->uv_alpha; +} + +// initialize the job struct with some TODOs +static void InitSegmentJob(VP8Encoder* const enc, SegmentJob* const job, + int start_row, int end_row) { + WebPWorkerInit(&job->worker); + job->worker.data1 = job; + job->worker.data2 = &job->it; + job->worker.hook = (WebPWorkerHook)DoSegmentsJob; + VP8IteratorInit(enc, &job->it); + VP8IteratorSetRow(&job->it, start_row); + VP8IteratorSetCountDown(&job->it, (end_row - start_row) * enc->mb_w_); + memset(job->alphas, 0, sizeof(job->alphas)); + job->alpha = 0; + job->uv_alpha = 0; + // only one of both jobs can record the progress, since we don't + // expect the user's hook to be multi-thread safe + job->delta_progress = (start_row == 0) ? 20 : 0; +} + +// main entry point int VP8EncAnalyze(VP8Encoder* const enc) { int ok = 1; const int do_segments = enc->config_->emulate_jpeg_size || // We need the complexity evaluation. (enc->segment_hdr_.num_segments_ > 1) || (enc->method_ == 0); // for method 0, we need preds_[] to be filled. - enc->alpha_ = 0; - enc->uv_alpha_ = 0; if (do_segments) { - int alphas[MAX_ALPHA + 1] = { 0 }; - VP8EncIterator it; - - VP8IteratorInit(enc, &it); - do { - VP8IteratorImport(&it); - MBAnalyze(&it, alphas, &enc->alpha_, &enc->uv_alpha_); - ok = VP8IteratorProgress(&it, 20); - // Let's pretend we have perfect lossless reconstruction. 
- } while (ok && VP8IteratorNext(&it, it.yuv_in_)); - enc->alpha_ /= enc->mb_w_ * enc->mb_h_; - enc->uv_alpha_ /= enc->mb_w_ * enc->mb_h_; - if (ok) AssignSegments(enc, alphas); + const int last_row = enc->mb_h_; + // We give a little more than a half work to the main thread. + const int split_row = (9 * last_row + 15) >> 4; + const int total_mb = last_row * enc->mb_w_; +#ifdef WEBP_USE_THREAD + const int kMinSplitRow = 2; // minimal rows needed for mt to be worth it + const int do_mt = (enc->thread_level_ > 0) && (split_row >= kMinSplitRow); +#else + const int do_mt = 0; +#endif + SegmentJob main_job; + if (do_mt) { + SegmentJob side_job; + // Note the use of '&' instead of '&&' because we must call the functions + // no matter what. + InitSegmentJob(enc, &main_job, 0, split_row); + InitSegmentJob(enc, &side_job, split_row, last_row); + // we don't need to call Reset() on main_job.worker, since we're calling + // WebPWorkerExecute() on it + ok &= WebPWorkerReset(&side_job.worker); + // launch the two jobs in parallel + if (ok) { + WebPWorkerLaunch(&side_job.worker); + WebPWorkerExecute(&main_job.worker); + ok &= WebPWorkerSync(&side_job.worker); + ok &= WebPWorkerSync(&main_job.worker); + } + WebPWorkerEnd(&side_job.worker); + if (ok) MergeJobs(&side_job, &main_job); // merge results together + } else { + // Even for single-thread case, we use the generic Worker tools. + InitSegmentJob(enc, &main_job, 0, last_row); + WebPWorkerExecute(&main_job.worker); + ok &= WebPWorkerSync(&main_job.worker); + } + WebPWorkerEnd(&main_job.worker); + if (ok) { + enc->alpha_ = main_job.alpha / total_mb; + enc->uv_alpha_ = main_job.uv_alpha / total_mb; + AssignSegments(enc, main_job.alphas); + } } else { // Use only one default segment. 
ResetAllMBInfo(enc); } diff --git a/src/enc/frame.c b/src/enc/frame.c index d71f72e8..4624d913 100644 --- a/src/enc/frame.c +++ b/src/enc/frame.c @@ -721,7 +721,7 @@ static int OneStatPass(VP8Encoder* const enc, float q, VP8RDLevel rd_opt, VP8IteratorInit(enc, &it); do { VP8ModeScore info; - VP8IteratorImport(&it); + VP8IteratorImport(&it, NULL); if (VP8Decimate(&it, &info, rd_opt)) { // Just record the number of skips and act like skip_proba is not used. enc->proba_.nb_skip_++; @@ -731,7 +731,8 @@ static int OneStatPass(VP8Encoder* const enc, float q, VP8RDLevel rd_opt, distortion += info.D; if (percent_delta && !VP8IteratorProgress(&it, percent_delta)) return 0; - } while (VP8IteratorNext(&it, it.yuv_out_) && --nb_mbs > 0); + VP8IteratorSaveBoundary(&it, it.yuv_out_); + } while (VP8IteratorNext(&it) && --nb_mbs > 0); size += FinalizeSkipProba(enc); size += FinalizeTokenProbas(&enc->proba_); size += enc->segment_hdr_.size_; @@ -877,7 +878,7 @@ int VP8EncLoop(VP8Encoder* const enc) { const int dont_use_skip = !enc->proba_.use_skip_proba_; const VP8RDLevel rd_opt = enc->rd_opt_level_; - VP8IteratorImport(&it); + VP8IteratorImport(&it, NULL); // Warning! order is important: first call VP8Decimate() and // *then* decide how to code the skip decision if there's one. 
if (!VP8Decimate(&it, &info, rd_opt) || dont_use_skip) { @@ -894,7 +895,8 @@ int VP8EncLoop(VP8Encoder* const enc) { VP8StoreFilterStats(&it); VP8IteratorExport(&it); ok = VP8IteratorProgress(&it, 20); - } while (ok && VP8IteratorNext(&it, it.yuv_out_)); + VP8IteratorSaveBoundary(&it, it.yuv_out_); + } while (ok && VP8IteratorNext(&it)); return PostLoopFinalize(&it, ok); } @@ -937,7 +939,7 @@ int VP8EncTokenLoop(VP8Encoder* const enc) { VP8TBufferClear(&enc->tokens_); do { VP8ModeScore info; - VP8IteratorImport(&it); + VP8IteratorImport(&it, NULL); if (--cnt < 0) { FinalizeTokenProbas(proba); VP8CalculateLevelCosts(proba); // refresh cost tables for rd-opt @@ -956,7 +958,8 @@ int VP8EncTokenLoop(VP8Encoder* const enc) { VP8IteratorExport(&it); ok = VP8IteratorProgress(&it, 20); } - } while (ok && VP8IteratorNext(&it, it.yuv_out_)); + VP8IteratorSaveBoundary(&it, it.yuv_out_); + } while (ok && VP8IteratorNext(&it)); } ok = ok && WebPReportProgress(enc->pic_, enc->percent_ + 20, &enc->percent_); if (ok) { diff --git a/src/enc/iterator.c b/src/enc/iterator.c index 781ce9fa..6ad14aa5 100644 --- a/src/enc/iterator.c +++ b/src/enc/iterator.c @@ -55,13 +55,21 @@ void VP8IteratorSetRow(VP8EncIterator* const it, int y) { void VP8IteratorReset(VP8EncIterator* const it) { VP8Encoder* const enc = it->enc_; VP8IteratorSetRow(it, 0); - it->count_down_ = enc->mb_w_ * enc->mb_h_; + VP8IteratorSetCountDown(it, enc->mb_w_ * enc->mb_h_); // default InitTop(it); InitLeft(it); memset(it->bit_count_, 0, sizeof(it->bit_count_)); it->do_trellis_ = 0; } +void VP8IteratorSetCountDown(VP8EncIterator* const it, int count_down) { + it->count_down_ = it->count_down0_ = count_down; +} + +int VP8IteratorIsDone(const VP8EncIterator* const it) { + return (it->count_down_ <= 0); +} + void VP8IteratorInit(VP8Encoder* const enc, VP8EncIterator* const it) { it->enc_ = enc; it->y_stride_ = enc->pic_->y_stride; @@ -81,9 +89,10 @@ void VP8IteratorInit(VP8Encoder* const enc, VP8EncIterator* const it) { int 
VP8IteratorProgress(const VP8EncIterator* const it, int delta) { VP8Encoder* const enc = it->enc_; if (delta && enc->pic_->progress_hook != NULL) { - const int percent = (enc->mb_h_ <= 1) + const int done = it->count_down0_ - it->count_down_; + const int percent = (it->count_down0_ <= 0) ? it->percent0_ - : it->percent0_ + delta * it->y_ / (enc->mb_h_ - 1); + : it->percent0_ + delta * done / it->count_down0_; return WebPReportProgress(enc->pic_, percent, &enc->percent_); } return 1; @@ -93,6 +102,8 @@ int VP8IteratorProgress(const VP8EncIterator* const it, int delta) { // Import the source samples into the cache. Takes care of replicating // boundary pixels if necessary. +static WEBP_INLINE int MinSize(int a, int b) { return (a < b) ? a : b; } + static void ImportBlock(const uint8_t* src, int src_stride, uint8_t* dst, int w, int h, int size) { int i; @@ -110,30 +121,55 @@ static void ImportBlock(const uint8_t* src, int src_stride, } } -void VP8IteratorImport(const VP8EncIterator* const it) { +static void ImportLine(const uint8_t* src, int src_stride, + uint8_t* dst, int len, int total_len) { + int i; + for (i = 0; i < len; ++i, src += src_stride) dst[i] = *src; + for (; i < total_len; ++i) dst[i] = dst[len - 1]; +} + +void VP8IteratorImport(VP8EncIterator* const it, uint8_t* tmp_32) { const VP8Encoder* const enc = it->enc_; const int x = it->x_, y = it->y_; const WebPPicture* const pic = enc->pic_; - const uint8_t* const ysrc = pic->y + (y * pic->y_stride + x) * 16; + const uint8_t* const ysrc = pic->y + (y * pic->y_stride + x) * 16; const uint8_t* const usrc = pic->u + (y * pic->uv_stride + x) * 8; const uint8_t* const vsrc = pic->v + (y * pic->uv_stride + x) * 8; - uint8_t* const ydst = it->yuv_in_ + Y_OFF; - uint8_t* const udst = it->yuv_in_ + U_OFF; - uint8_t* const vdst = it->yuv_in_ + V_OFF; - int w = (pic->width - x * 16); - int h = (pic->height - y * 16); + const int w = MinSize(pic->width - x * 16, 16); + const int h = MinSize(pic->height - y * 16, 16); + 
const int uv_w = (w + 1) >> 1; + const int uv_h = (h + 1) >> 1; - if (w > 16) w = 16; - if (h > 16) h = 16; + ImportBlock(ysrc, pic->y_stride, it->yuv_in_ + Y_OFF, w, h, 16); + ImportBlock(usrc, pic->uv_stride, it->yuv_in_ + U_OFF, uv_w, uv_h, 8); + ImportBlock(vsrc, pic->uv_stride, it->yuv_in_ + V_OFF, uv_w, uv_h, 8); - // Luma plane - ImportBlock(ysrc, pic->y_stride, ydst, w, h, 16); + if (tmp_32 == NULL) return; - { // U/V planes - const int uv_w = (w + 1) >> 1; - const int uv_h = (h + 1) >> 1; - ImportBlock(usrc, pic->uv_stride, udst, uv_w, uv_h, 8); - ImportBlock(vsrc, pic->uv_stride, vdst, uv_w, uv_h, 8); + // Import source (uncompressed) samples into boundary. + if (x == 0) { + InitLeft(it); + } else { + if (y == 0) { + it->y_left_[-1] = it->u_left_[-1] = it->v_left_[-1] = 127; + } else { + it->y_left_[-1] = ysrc[- 1 - pic->y_stride]; + it->u_left_[-1] = usrc[- 1 - pic->uv_stride]; + it->v_left_[-1] = vsrc[- 1 - pic->uv_stride]; + } + ImportLine(ysrc - 1, pic->y_stride, it->y_left_, h, 16); + ImportLine(usrc - 1, pic->uv_stride, it->u_left_, uv_h, 8); + ImportLine(vsrc - 1, pic->uv_stride, it->v_left_, uv_h, 8); + } + + it->y_top_ = tmp_32 + 0; + it->uv_top_ = tmp_32 + 16; + if (y == 0) { + memset(tmp_32, 127, 32 * sizeof(*tmp_32)); + } else { + ImportLine(ysrc - pic->y_stride, 1, tmp_32, w, 16); + ImportLine(usrc - pic->uv_stride, 1, tmp_32 + 16, uv_w, 8); + ImportLine(vsrc - pic->uv_stride, 1, tmp_32 + 16 + 8, uv_w, 8); } } @@ -251,40 +287,40 @@ void VP8IteratorBytesToNz(VP8EncIterator* const it) { //------------------------------------------------------------------------------ // Advance to the next position, doing the bookeeping. 
-int VP8IteratorNext(VP8EncIterator* const it, - const uint8_t* const block_to_save) { +void VP8IteratorSaveBoundary(VP8EncIterator* const it, + const uint8_t* const block_to_save) { VP8Encoder* const enc = it->enc_; - if (block_to_save != NULL) { - const int x = it->x_, y = it->y_; - const uint8_t* const ysrc = block_to_save + Y_OFF; - const uint8_t* const usrc = block_to_save + U_OFF; - if (x < enc->mb_w_ - 1) { // left - int i; - for (i = 0; i < 16; ++i) { - it->y_left_[i] = ysrc[15 + i * BPS]; - } - for (i = 0; i < 8; ++i) { - it->u_left_[i] = usrc[7 + i * BPS]; - it->v_left_[i] = usrc[15 + i * BPS]; - } - // top-left (before 'top'!) - it->y_left_[-1] = it->y_top_[15]; - it->u_left_[-1] = it->uv_top_[0 + 7]; - it->v_left_[-1] = it->uv_top_[8 + 7]; + const int x = it->x_, y = it->y_; + const uint8_t* const ysrc = block_to_save + Y_OFF; + const uint8_t* const usrc = block_to_save + U_OFF; + if (x < enc->mb_w_ - 1) { // left + int i; + for (i = 0; i < 16; ++i) { + it->y_left_[i] = ysrc[15 + i * BPS]; } - if (y < enc->mb_h_ - 1) { // top - memcpy(it->y_top_, ysrc + 15 * BPS, 16); - memcpy(it->uv_top_, usrc + 7 * BPS, 8 + 8); + for (i = 0; i < 8; ++i) { + it->u_left_[i] = usrc[7 + i * BPS]; + it->v_left_[i] = usrc[15 + i * BPS]; } + // top-left (before 'top'!) 
+ it->y_left_[-1] = it->y_top_[15]; + it->u_left_[-1] = it->uv_top_[0 + 7]; + it->v_left_[-1] = it->uv_top_[8 + 7]; } + if (y < enc->mb_h_ - 1) { // top + memcpy(it->y_top_, ysrc + 15 * BPS, 16); + memcpy(it->uv_top_, usrc + 7 * BPS, 8 + 8); + } +} +int VP8IteratorNext(VP8EncIterator* const it) { it->preds_ += 4; it->mb_ += 1; it->nz_ += 1; it->y_top_ += 16; it->uv_top_ += 16; it->x_ += 1; - if (it->x_ == enc->mb_w_) { + if (it->x_ == it->enc_->mb_w_) { VP8IteratorSetRow(it, ++it->y_); } return (0 < --it->count_down_); diff --git a/src/enc/quant.c b/src/enc/quant.c index ffeaec30..f26fafcd 100644 --- a/src/enc/quant.c +++ b/src/enc/quant.c @@ -367,16 +367,14 @@ const int VP8I4ModeOffsets[NUM_BMODES] = { }; void VP8MakeLuma16Preds(const VP8EncIterator* const it) { - const VP8Encoder* const enc = it->enc_; const uint8_t* const left = it->x_ ? it->y_left_ : NULL; - const uint8_t* const top = it->y_ ? enc->y_top_ + it->x_ * 16 : NULL; + const uint8_t* const top = it->y_ ? it->y_top_ : NULL; VP8EncPredLuma16(it->yuv_p_, left, top); } void VP8MakeChroma8Preds(const VP8EncIterator* const it) { - const VP8Encoder* const enc = it->enc_; const uint8_t* const left = it->x_ ? it->u_left_ : NULL; - const uint8_t* const top = it->y_ ? enc->uv_top_ + it->x_ * 16 : NULL; + const uint8_t* const top = it->y_ ? 
it->uv_top_ : NULL; VP8EncPredChroma8(it->yuv_p_, left, top); } diff --git a/src/enc/tree.c b/src/enc/tree.c index 5c204bdd..27f0babe 100644 --- a/src/enc/tree.c +++ b/src/enc/tree.c @@ -344,7 +344,7 @@ void VP8CodeIntraModes(VP8Encoder* const enc) { } } PutUVMode(bw, mb->uv_mode_); - } while (VP8IteratorNext(&it, NULL)); + } while (VP8IteratorNext(&it)); } //------------------------------------------------------------------------------ diff --git a/src/enc/vp8enci.h b/src/enc/vp8enci.h index a43e17c9..9f4cf27f 100644 --- a/src/enc/vp8enci.h +++ b/src/enc/vp8enci.h @@ -292,6 +292,7 @@ typedef struct { LFStats* lf_stats_; // filter stats (borrowed from enc_) int do_trellis_; // if true, perform extra level optimisation int count_down_; // number of mb still to be processed + int count_down0_; // starting counter value (for progress) int percent0_; // saved initial progress percent uint8_t* y_left_; // left luma samples (addressable from index -1 to 15). @@ -313,15 +314,22 @@ void VP8IteratorInit(VP8Encoder* const enc, VP8EncIterator* const it); void VP8IteratorReset(VP8EncIterator* const it); // reset iterator position to row 'y' void VP8IteratorSetRow(VP8EncIterator* const it, int y); -// import samples from source -void VP8IteratorImport(const VP8EncIterator* const it); +// set count down (=number of iterations to go) +void VP8IteratorSetCountDown(VP8EncIterator* const it, int count_down); +// return true if iteration is finished +int VP8IteratorIsDone(const VP8EncIterator* const it); +// Import uncompressed samples from source. +// If tmp_32 is not NULL, import boundary samples too. +// tmp_32 is a 32-bytes scratch buffer that must be aligned in memory. +void VP8IteratorImport(VP8EncIterator* const it, uint8_t* tmp_32); // export decimated samples void VP8IteratorExport(const VP8EncIterator* const it); -// go to next macroblock. Returns false if not finished. If *block_to_save is -// non-null, will save the boundary values to top_/left_ arrays. 
block_to_save -// can be it->yuv_out_ or it->yuv_in_. -int VP8IteratorNext(VP8EncIterator* const it, - const uint8_t* const block_to_save); +// go to next macroblock. Returns false when the iteration is finished. +int VP8IteratorNext(VP8EncIterator* const it); +// save the boundary values to top_/left_ arrays for next iterations. +// block_to_save can be it->yuv_out_ or it->yuv_in_. +void VP8IteratorSaveBoundary(VP8EncIterator* const it, + const uint8_t* const block_to_save); // Report progression based on macroblock rows. Return 0 for user-abort request. int VP8IteratorProgress(const VP8EncIterator* const it, int final_delta_percent);