enc->Iterator memory cleanup

* Move the yuv_in_/yuv_out_/yuv_out2_/yuv_p_ scratch buffers from the encoder into the iterator.
* Add y_top_/uv_top_ shortcuts to the iterator.

That's ~3k of stack instead of heap, but it allows several iterators to work in parallel.

Change-Id: I6a437c0f2ef1e5d398c1d6a2fd4974fa0869f0c1
2025-08-29 07:12:05 +02:00 · 2013-08-31 23:38:11 +02:00
parent e81fac86dd
commit 733a7faae4
3 changed files with 28 additions and 34 deletions
--- a/src/enc/iterator.c
+++ b/src/enc/iterator.c
@@ -47,6 +47,8 @@ void VP8IteratorSetRow(VP8EncIterator* const it, int y) {
  it->preds_ = enc->preds_ + y * 4 * enc->preds_w_;
  it->nz_ = enc->nz_;
  it->mb_ = enc->mb_info_ + y * enc->mb_w_;
+  it->y_top_ = enc->y_top_;
+  it->uv_top_ = enc->uv_top_;
  InitLeft(it);
 }

@@ -64,11 +66,10 @@ void VP8IteratorInit(VP8Encoder* const enc, VP8EncIterator* const it) {
  it->enc_ = enc;
  it->y_stride_  = enc->pic_->y_stride;
  it->uv_stride_ = enc->pic_->uv_stride;
-  // TODO(later): for multithreading, these should be owned by 'it'.
-  it->yuv_in_   = enc->yuv_in_;
-  it->yuv_out_  = enc->yuv_out_;
-  it->yuv_out2_ = enc->yuv_out2_;
-  it->yuv_p_    = enc->yuv_p_;
+  it->yuv_in_   = (uint8_t*)DO_ALIGN(it->yuv_mem_);
+  it->yuv_out_  = it->yuv_in_ + YUV_SIZE;
+  it->yuv_out2_ = it->yuv_out_ + YUV_SIZE;
+  it->yuv_p_    = it->yuv_out2_ + YUV_SIZE;
  it->lf_stats_ = enc->lf_stats_;
  it->percent0_ = enc->percent_;
  it->y_left_ = (uint8_t*)DO_ALIGN(it->yuv_left_mem_ + 1);
@@ -267,19 +268,21 @@ int VP8IteratorNext(VP8EncIterator* const it,
        it->v_left_[i] = usrc[15 + i * BPS];
      }
      // top-left (before 'top'!)
-      it->y_left_[-1] = enc->y_top_[x * 16 + 15];
-      it->u_left_[-1] = enc->uv_top_[x * 16 + 0 + 7];
-      it->v_left_[-1] = enc->uv_top_[x * 16 + 8 + 7];
+      it->y_left_[-1] = it->y_top_[15];
+      it->u_left_[-1] = it->uv_top_[0 + 7];
+      it->v_left_[-1] = it->uv_top_[8 + 7];
    }
    if (y < enc->mb_h_ - 1) {  // top
-      memcpy(enc->y_top_ + x * 16, ysrc + 15 * BPS, 16);
-      memcpy(enc->uv_top_ + x * 16, usrc + 7 * BPS, 8 + 8);
+      memcpy(it->y_top_, ysrc + 15 * BPS, 16);
+      memcpy(it->uv_top_, usrc + 7 * BPS, 8 + 8);
    }
  }

  it->preds_ += 4;
  it->mb_ += 1;
  it->nz_ += 1;
+  it->y_top_ += 16;
+  it->uv_top_ += 16;
  it->x_ += 1;
  if (it->x_ == enc->mb_w_) {
    VP8IteratorSetRow(it, ++it->y_);
@@ -374,12 +377,12 @@ void VP8IteratorStartI4(VP8EncIterator* const it) {
    it->i4_boundary_[i] = it->y_left_[15 - i];
  }
  for (i = 0; i < 16; ++i) {    // top
-    it->i4_boundary_[17 + i] = enc->y_top_[it->x_ * 16 + i];
+    it->i4_boundary_[17 + i] = it->y_top_[i];
  }
  // top-right samples have a special case on the far right of the picture
  if (it->x_ < enc->mb_w_ - 1) {
    for (i = 16; i < 16 + 4; ++i) {
-      it->i4_boundary_[17 + i] = enc->y_top_[it->x_ * 16 + i];
+      it->i4_boundary_[17 + i] = it->y_top_[i];
    }
  } else {    // else, replicate the last valid pixel four times
    for (i = 16; i < 16 + 4; ++i) {
--- a/src/enc/vp8enci.h
+++ b/src/enc/vp8enci.h
@@ -74,7 +74,7 @@ typedef enum {   // Rate-distortion optimization levels
 // The predicted blocks can be accessed using offsets to yuv_p_ and
 // the arrays VP8*ModeOffsets[];
 //         +----+      YUV Samples area. See VP8Scan[] for accessing the blocks.
-//  Y_OFF  |YYYY| <- original samples  (enc->yuv_in_)
+//  Y_OFF  |YYYY| <- original samples  ('yuv_in_')
 //         |YYYY|
 //         |YYYY|
 //         |YYYY|
@@ -272,10 +272,10 @@ typedef struct {
 typedef struct {
  int x_, y_;                      // current macroblock
  int y_stride_, uv_stride_;       // respective strides
-  uint8_t*      yuv_in_;           // borrowed from enc_ (for now)
-  uint8_t*      yuv_out_;          // ''
-  uint8_t*      yuv_out2_;         // ''
-  uint8_t*      yuv_p_;            // ''
+  uint8_t*      yuv_in_;           // input samples
+  uint8_t*      yuv_out_;          // output samples
+  uint8_t*      yuv_out2_;         // secondary buffer swapped with yuv_out_.
+  uint8_t*      yuv_p_;            // scratch buffer for prediction
  VP8Encoder*   enc_;              // back-pointer
  VP8MBInfo*    mb_;               // current macroblock
  VP8BitWriter* bw_;               // current bit-writer
@@ -297,7 +297,13 @@ typedef struct {
  uint8_t* y_left_;    // left luma samples (addressable from index -1 to 15).
  uint8_t* u_left_;    // left u samples (addressable from index -1 to 7)
  uint8_t* v_left_;    // left v samples (addressable from index -1 to 7)
-  uint8_t  yuv_left_mem_[17 + 16 + 16 + 8 + ALIGN_CST];  // memory for *_left_
+
+  uint8_t* y_top_;     // top luma samples at position 'x_'
+  uint8_t* uv_top_;    // top u/v samples at position 'x_', packed as 16 bytes
+
+  // memory for storing y/u/v_left_ and yuv_in_/out_*
+  uint8_t yuv_left_mem_[17 + 16 + 16 + 8 + ALIGN_CST];     // memory for *_left_
+  uint8_t yuv_mem_[3 * YUV_SIZE + PRED_SIZE + ALIGN_CST];  // memory for yuv_*
 } VP8EncIterator;

  // in iterator.c
@@ -441,10 +447,6 @@ struct VP8Encoder {
  VP8MBInfo* mb_info_;   // contextual macroblock infos (mb_w_ + 1)
  uint8_t*   preds_;     // predictions modes: (4*mb_w+1) * (4*mb_h+1)
  uint32_t*  nz_;        // non-zero bit context: mb_w+1
-  uint8_t*   yuv_in_;    // input samples
-  uint8_t*   yuv_out_;   // output samples
-  uint8_t*   yuv_out2_;  // secondary scratch out-buffer. swapped with yuv_out_.
-  uint8_t*   yuv_p_;     // scratch buffer for prediction
  uint8_t   *y_top_;     // top luma samples.
  uint8_t   *uv_top_;    // top u/v samples.
                         // U and V are packed into 16 bytes (8 U + 8 V)
--- a/src/enc/webpenc.c
+++ b/src/enc/webpenc.c
@@ -176,7 +176,6 @@ static VP8Encoder* InitVP8Encoder(const WebPConfig* const config,
  const size_t preds_size = preds_w * preds_h * sizeof(uint8_t);
  const int top_stride = mb_w * 16;
  const size_t nz_size = (mb_w + 1) * sizeof(uint32_t) + ALIGN_CST;
-  const size_t cache_size = (3 * YUV_SIZE + PRED_SIZE) * sizeof(uint8_t);
  const size_t info_size = mb_w * mb_h * sizeof(VP8MBInfo);
  const size_t samples_size = 2 * top_stride * sizeof(uint8_t)  // top-luma/u/v
                            + ALIGN_CST;                        // align all
@@ -186,7 +185,6 @@ static VP8Encoder* InitVP8Encoder(const WebPConfig* const config,
  uint8_t* mem;
  const uint64_t size = (uint64_t)sizeof(VP8Encoder)   // main struct
                      + ALIGN_CST                      // cache alignment
-                      + cache_size                     // working caches
                      + info_size                      // modes info
                      + preds_size                     // prediction modes
                      + samples_size                   // top/left samples
@@ -197,14 +195,13 @@ static VP8Encoder* InitVP8Encoder(const WebPConfig* const config,
  printf("===================================\n");
  printf("Memory used:\n"
         "             encoder: %ld\n"
-         "         block cache: %ld\n"
         "                info: %ld\n"
         "               preds: %ld\n"
         "         top samples: %ld\n"
         "            non-zero: %ld\n"
         "            lf-stats: %ld\n"
         "               total: %ld\n",
-         sizeof(VP8Encoder) + ALIGN_CST, cache_size, info_size,
+         sizeof(VP8Encoder) + ALIGN_CST, info_size,
         preds_size, samples_size, nz_size, lf_stats_size, size);
  printf("Transient object sizes:\n"
         "      VP8EncIterator: %ld\n"
@@ -231,14 +228,6 @@ static VP8Encoder* InitVP8Encoder(const WebPConfig* const config,
  enc->mb_w_ = mb_w;
  enc->mb_h_ = mb_h;
  enc->preds_w_ = preds_w;
-  enc->yuv_in_ = (uint8_t*)mem;
-  mem += YUV_SIZE;
-  enc->yuv_out_ = (uint8_t*)mem;
-  mem += YUV_SIZE;
-  enc->yuv_out2_ = (uint8_t*)mem;
-  mem += YUV_SIZE;
-  enc->yuv_p_ = (uint8_t*)mem;
-  mem += PRED_SIZE;
  enc->mb_info_ = (VP8MBInfo*)mem;
  mem += info_size;
  enc->preds_ = ((uint8_t*)mem) + 1 + enc->preds_w_;