From 733a7faae4a44bd62532d56c29461287878ab390 Mon Sep 17 00:00:00 2001 From: skal Date: Sat, 31 Aug 2013 23:38:11 +0200 Subject: [PATCH] enc->Iterator memory cleanup * move yuv_in_/out_* scratch buffers to iterator * add y_top_/uv_top_ shortcuts in iterator That's ~3k of stack size instead of heap. But it allows having several iterators work in parallel. Change-Id: I6a437c0f2ef1e5d398c1d6a2fd4974fa0869f0c1 --- src/enc/iterator.c | 27 +++++++++++++++------------ src/enc/vp8enci.h | 22 ++++++++++++---------- src/enc/webpenc.c | 13 +------------ 3 files changed, 28 insertions(+), 34 deletions(-) diff --git a/src/enc/iterator.c b/src/enc/iterator.c index cec8afa3..781ce9fa 100644 --- a/src/enc/iterator.c +++ b/src/enc/iterator.c @@ -47,6 +47,8 @@ void VP8IteratorSetRow(VP8EncIterator* const it, int y) { it->preds_ = enc->preds_ + y * 4 * enc->preds_w_; it->nz_ = enc->nz_; it->mb_ = enc->mb_info_ + y * enc->mb_w_; + it->y_top_ = enc->y_top_; + it->uv_top_ = enc->uv_top_; InitLeft(it); } @@ -64,11 +66,10 @@ void VP8IteratorInit(VP8Encoder* const enc, VP8EncIterator* const it) { it->enc_ = enc; it->y_stride_ = enc->pic_->y_stride; it->uv_stride_ = enc->pic_->uv_stride; - // TODO(later): for multithreading, these should be owned by 'it'. - it->yuv_in_ = enc->yuv_in_; - it->yuv_out_ = enc->yuv_out_; - it->yuv_out2_ = enc->yuv_out2_; - it->yuv_p_ = enc->yuv_p_; + it->yuv_in_ = (uint8_t*)DO_ALIGN(it->yuv_mem_); + it->yuv_out_ = it->yuv_in_ + YUV_SIZE; + it->yuv_out2_ = it->yuv_out_ + YUV_SIZE; + it->yuv_p_ = it->yuv_out2_ + YUV_SIZE; it->lf_stats_ = enc->lf_stats_; it->percent0_ = enc->percent_; it->y_left_ = (uint8_t*)DO_ALIGN(it->yuv_left_mem_ + 1); @@ -267,19 +268,21 @@ int VP8IteratorNext(VP8EncIterator* const it, it->v_left_[i] = usrc[15 + i * BPS]; } // top-left (before 'top'!) - it->y_left_[-1] = enc->y_top_[x * 16 + 15]; - it->u_left_[-1] = enc->uv_top_[x * 16 + 0 + 7]; - it->v_left_[-1] = enc->uv_top_[x * 16 + 8 + 7]; + it->y_left_[-1] = it->y_top_[15]; + it->u_left_[-1] = it->uv_top_[0 + 7]; + it->v_left_[-1] = it->uv_top_[8 + 7]; } if (y < enc->mb_h_ - 1) { // top - memcpy(enc->y_top_ + x * 16, ysrc + 15 * BPS, 16); - memcpy(enc->uv_top_ + x * 16, usrc + 7 * BPS, 8 + 8); + memcpy(it->y_top_, ysrc + 15 * BPS, 16); + memcpy(it->uv_top_, usrc + 7 * BPS, 8 + 8); } } it->preds_ += 4; it->mb_ += 1; it->nz_ += 1; + it->y_top_ += 16; + it->uv_top_ += 16; it->x_ += 1; if (it->x_ == enc->mb_w_) { VP8IteratorSetRow(it, ++it->y_); @@ -374,12 +377,12 @@ void VP8IteratorStartI4(VP8EncIterator* const it) { it->i4_boundary_[i] = it->y_left_[15 - i]; } for (i = 0; i < 16; ++i) { // top - it->i4_boundary_[17 + i] = enc->y_top_[it->x_ * 16 + i]; + it->i4_boundary_[17 + i] = it->y_top_[i]; } // top-right samples have a special case on the far right of the picture if (it->x_ < enc->mb_w_ - 1) { for (i = 16; i < 16 + 4; ++i) { - it->i4_boundary_[17 + i] = enc->y_top_[it->x_ * 16 + i]; + it->i4_boundary_[17 + i] = it->y_top_[i]; } } else { // else, replicate the last valid pixel four times for (i = 16; i < 16 + 4; ++i) { diff --git a/src/enc/vp8enci.h b/src/enc/vp8enci.h index e3a3ee8b..a43e17c9 100644 --- a/src/enc/vp8enci.h +++ b/src/enc/vp8enci.h @@ -74,7 +74,7 @@ typedef enum { // Rate-distortion optimization levels // The predicted blocks can be accessed using offsets to yuv_p_ and // the arrays VP8*ModeOffsets[]; // +----+ YUV Samples area. See VP8Scan[] for accessing the blocks. -// Y_OFF |YYYY| <- original samples (enc->yuv_in_) +// Y_OFF |YYYY| <- original samples ('yuv_in_') // |YYYY| // |YYYY| // |YYYY| @@ -272,10 +272,10 @@ typedef struct { typedef struct { int x_, y_; // current macroblock int y_stride_, uv_stride_; // respective strides - uint8_t* yuv_in_; // borrowed from enc_ (for now) - uint8_t* yuv_out_; // '' - uint8_t* yuv_out2_; // '' - uint8_t* yuv_p_; // '' + uint8_t* yuv_in_; // input samples + uint8_t* yuv_out_; // output samples + uint8_t* yuv_out2_; // secondary buffer swapped with yuv_out_. + uint8_t* yuv_p_; // scratch buffer for prediction VP8Encoder* enc_; // back-pointer VP8MBInfo* mb_; // current macroblock VP8BitWriter* bw_; // current bit-writer @@ -297,7 +297,13 @@ typedef struct { uint8_t* y_left_; // left luma samples (addressable from index -1 to 15). uint8_t* u_left_; // left u samples (addressable from index -1 to 7) uint8_t* v_left_; // left v samples (addressable from index -1 to 7) - uint8_t yuv_left_mem_[17 + 16 + 16 + 8 + ALIGN_CST]; // memory for *_left_ + + uint8_t* y_top_; // top luma samples at position 'x_' + uint8_t* uv_top_; // top u/v samples at position 'x_', packed as 16 bytes + + // memory for storing y/u/v_left_ and yuv_in_/out_* + uint8_t yuv_left_mem_[17 + 16 + 16 + 8 + ALIGN_CST]; // memory for *_left_ + uint8_t yuv_mem_[3 * YUV_SIZE + PRED_SIZE + ALIGN_CST]; // memory for yuv_* } VP8EncIterator; // in iterator.c @@ -441,10 +447,6 @@ struct VP8Encoder { VP8MBInfo* mb_info_; // contextual macroblock infos (mb_w_ + 1) uint8_t* preds_; // predictions modes: (4*mb_w+1) * (4*mb_h+1) uint32_t* nz_; // non-zero bit context: mb_w+1 - uint8_t* yuv_in_; // input samples - uint8_t* yuv_out_; // output samples - uint8_t* yuv_out2_; // secondary scratch out-buffer. swapped with yuv_out_. - uint8_t* yuv_p_; // scratch buffer for prediction uint8_t *y_top_; // top luma samples. uint8_t *uv_top_; // top u/v samples. // U and V are packed into 16 bytes (8 U + 8 V) diff --git a/src/enc/webpenc.c b/src/enc/webpenc.c index 4a56c841..7ba10aa3 100644 --- a/src/enc/webpenc.c +++ b/src/enc/webpenc.c @@ -176,7 +176,6 @@ static VP8Encoder* InitVP8Encoder(const WebPConfig* const config, const size_t preds_size = preds_w * preds_h * sizeof(uint8_t); const int top_stride = mb_w * 16; const size_t nz_size = (mb_w + 1) * sizeof(uint32_t) + ALIGN_CST; - const size_t cache_size = (3 * YUV_SIZE + PRED_SIZE) * sizeof(uint8_t); const size_t info_size = mb_w * mb_h * sizeof(VP8MBInfo); const size_t samples_size = 2 * top_stride * sizeof(uint8_t) // top-luma/u/v + ALIGN_CST; // align all @@ -186,7 +185,6 @@ static VP8Encoder* InitVP8Encoder(const WebPConfig* const config, uint8_t* mem; const uint64_t size = (uint64_t)sizeof(VP8Encoder) // main struct + ALIGN_CST // cache alignment - + cache_size // working caches + info_size // modes info + preds_size // prediction modes + samples_size // top/left samples @@ -197,14 +195,13 @@ static VP8Encoder* InitVP8Encoder(const WebPConfig* const config, printf("===================================\n"); printf("Memory used:\n" " encoder: %ld\n" - " block cache: %ld\n" " info: %ld\n" " preds: %ld\n" " top samples: %ld\n" " non-zero: %ld\n" " lf-stats: %ld\n" " total: %ld\n", - sizeof(VP8Encoder) + ALIGN_CST, cache_size, info_size, + sizeof(VP8Encoder) + ALIGN_CST, info_size, preds_size, samples_size, nz_size, lf_stats_size, size); printf("Transient object sizes:\n" " VP8EncIterator: %ld\n" @@ -231,14 +228,6 @@ static VP8Encoder* InitVP8Encoder(const WebPConfig* const config, enc->mb_w_ = mb_w; enc->mb_h_ = mb_h; enc->preds_w_ = preds_w; - enc->yuv_in_ = (uint8_t*)mem; - mem += YUV_SIZE; - enc->yuv_out_ = (uint8_t*)mem; - mem += YUV_SIZE; - enc->yuv_out2_ = (uint8_t*)mem; - mem += YUV_SIZE; - enc->yuv_p_ = (uint8_t*)mem; - mem += PRED_SIZE; enc->mb_info_ = (VP8MBInfo*)mem; mem += info_size; enc->preds_ = ((uint8_t*)mem) + 1 + enc->preds_w_;