From 0532149c8a7d618a81842f1135ceb2d734de55f2 Mon Sep 17 00:00:00 2001
From: skal <pascal.massimino@gmail.com>
Date: Tue, 15 Oct 2013 00:25:21 +0200
Subject: [PATCH] up to 20% faster multi-threaded decoding

The speed-up is mostly visible for large images.
Reconstruction and filtering are now done in parallel with bitstream parsing.

Change-Id: I4cc4483d803b255f4d97a2fcd9158b1c291dd900
---
 src/dec/frame.c | 57 +++++++++++++++++++++++++++++++++----------------
 src/dec/idec.c  |  4 +---
 src/dec/vp8.c   |  4 +---
 src/dec/vp8i.h  | 17 +++++++--------
 4 files changed, 49 insertions(+), 33 deletions(-)

diff --git a/src/dec/frame.c b/src/dec/frame.c
index ae34eb7b..fad92e6a 100644
--- a/src/dec/frame.c
+++ b/src/dec/frame.c
@@ -21,6 +21,9 @@ extern "C" {
 
 #define ALIGN_MASK (32 - 1)
 
+static void ReconstructRow(const VP8Decoder* const dec,
+                           const VP8ThreadContext* ctx);  // TODO(skal): remove
+
 //------------------------------------------------------------------------------
 // Filtering
 
@@ -41,9 +44,10 @@ static WEBP_INLINE int hev_thresh_from_level(int level, int keyframe) {
 
 static void DoFilter(const VP8Decoder* const dec, int mb_x, int mb_y) {
   const VP8ThreadContext* const ctx = &dec->thread_ctx_;
+  const int cache_id = ctx->id_;
   const int y_bps = dec->cache_y_stride_;
   VP8FInfo* const f_info = ctx->f_info_ + mb_x;
-  uint8_t* const y_dst = dec->cache_y_ + ctx->id_ * 16 * y_bps + mb_x * 16;
+  uint8_t* const y_dst = dec->cache_y_ + cache_id * 16 * y_bps + mb_x * 16;
   const int level = f_info->f_level_;
   const int ilevel = f_info->f_ilevel_;
   const int limit = 2 * level + ilevel;
@@ -65,8 +69,8 @@ static void DoFilter(const VP8Decoder* const dec, int mb_x, int mb_y) {
     }
   } else {    // complex
     const int uv_bps = dec->cache_uv_stride_;
-    uint8_t* const u_dst = dec->cache_u_ + ctx->id_ * 8 * uv_bps + mb_x * 8;
-    uint8_t* const v_dst = dec->cache_v_ + ctx->id_ * 8 * uv_bps + mb_x * 8;
+    uint8_t* const u_dst = dec->cache_u_ + cache_id * 8 * uv_bps + mb_x * 8;
+    uint8_t* const v_dst = dec->cache_v_ + cache_id * 8 * uv_bps + mb_x * 8;
     const int hev_thresh =
         hev_thresh_from_level(level, dec->frm_hdr_.key_frame_);
     if (mb_x > 0) {
@@ -164,25 +168,29 @@ static void PrecomputeFilterStrengths(VP8Decoder* const dec) {
 static int FinishRow(VP8Decoder* const dec, VP8Io* const io) {
   int ok = 1;
   const VP8ThreadContext* const ctx = &dec->thread_ctx_;
+  const int cache_id = ctx->id_;
   const int extra_y_rows = kFilterExtraRows[dec->filter_type_];
   const int ysize = extra_y_rows * dec->cache_y_stride_;
   const int uvsize = (extra_y_rows / 2) * dec->cache_uv_stride_;
-  const int y_offset = ctx->id_ * 16 * dec->cache_y_stride_;
-  const int uv_offset = ctx->id_ * 8 * dec->cache_uv_stride_;
+  const int y_offset = cache_id * 16 * dec->cache_y_stride_;
+  const int uv_offset = cache_id * 8 * dec->cache_uv_stride_;
   uint8_t* const ydst = dec->cache_y_ - ysize + y_offset;
   uint8_t* const udst = dec->cache_u_ - uvsize + uv_offset;
   uint8_t* const vdst = dec->cache_v_ - uvsize + uv_offset;
-  const int first_row = (ctx->mb_y_ == 0);
-  const int last_row = (ctx->mb_y_ >= dec->br_mb_y_ - 1);
-  int y_start = MACROBLOCK_VPOS(ctx->mb_y_);
-  int y_end = MACROBLOCK_VPOS(ctx->mb_y_ + 1);
+  const int mb_y = ctx->mb_y_;
+  const int is_first_row = (mb_y == 0);
+  const int is_last_row = (mb_y >= dec->br_mb_y_ - 1);
+
+  ReconstructRow(dec, ctx);
 
   if (ctx->filter_row_) {
     FilterRow(dec);
   }
 
   if (io->put != NULL) {
-    if (!first_row) {
+    int y_start = MACROBLOCK_VPOS(mb_y);
+    int y_end = MACROBLOCK_VPOS(mb_y + 1);
+    if (!is_first_row) {
       y_start -= extra_y_rows;
       io->y = ydst;
       io->u = udst;
@@ -193,7 +201,7 @@ static int FinishRow(VP8Decoder* const dec, VP8Io* const io) {
       io->v = dec->cache_v_ + uv_offset;
     }
 
-    if (!last_row) {
+    if (!is_last_row) {
       y_end -= extra_y_rows;
     }
     if (y_end > io->crop_bottom) {
@@ -234,8 +242,8 @@ static int FinishRow(VP8Decoder* const dec, VP8Io* const io) {
     }
   }
   // rotate top samples if needed
-  if (ctx->id_ + 1 == dec->num_caches_) {
-    if (!last_row) {
+  if (cache_id + 1 == dec->num_caches_) {
+    if (!is_last_row) {
       memcpy(dec->cache_y_ - ysize, ydst + 16 * dec->cache_y_stride_, ysize);
       memcpy(dec->cache_u_ - uvsize, udst + 8 * dec->cache_uv_stride_, uvsize);
       memcpy(dec->cache_v_ - uvsize, vdst + 8 * dec->cache_uv_stride_, uvsize);
@@ -270,6 +278,11 @@ int VP8ProcessRow(VP8Decoder* const dec, VP8Io* const io) {
       ctx->id_ = dec->cache_id_;
       ctx->mb_y_ = dec->mb_y_;
       ctx->filter_row_ = filter_row;
+      {
+        VP8MBData* const tmp = ctx->mb_data_;
+        ctx->mb_data_ = dec->mb_data_;
+        dec->mb_data_ = tmp;
+      }
       if (filter_row) {    // just swap filter info
         VP8FInfo* const tmp = ctx->f_info_;
         ctx->f_info_ = dec->f_info_;
@@ -419,7 +432,8 @@ static int AllocateMemory(VP8Decoder* const dec) {
           mb_w * (dec->use_threads_ ? 2 : 1) * sizeof(VP8FInfo)
         : 0;
   const size_t yuv_size = YUV_SIZE * sizeof(*dec->yuv_b_);
-  const size_t mb_data_size = mb_w * sizeof(*dec->mb_data_);
+  const size_t mb_data_size =
+      (dec->use_threads_ ? 2 : 1) * mb_w * sizeof(*dec->mb_data_);
   const size_t cache_height = (16 * num_caches
                             + kFilterExtraRows[dec->filter_type_]) * 3 / 2;
   const size_t cache_size = top_size * cache_height;
@@ -472,6 +486,10 @@ static int AllocateMemory(VP8Decoder* const dec) {
   mem += yuv_size;
 
   dec->mb_data_ = (VP8MBData*)mem;
+  dec->thread_ctx_.mb_data_ = (VP8MBData*)mem;
+  if (dec->use_threads_) {
+    dec->thread_ctx_.mb_data_ += mb_w;
+  }
   mem += mb_data_size;
 
   dec->cache_y_stride_ = 16 * mb_w;
@@ -576,14 +594,17 @@ static void DoUVTransform(uint32_t bits, const int16_t* const src,
   }
 }
 
-void VP8ReconstructBlocks(const VP8Decoder* const dec, int mb_y) {
+static void ReconstructRow(const VP8Decoder* const dec,
+                           const VP8ThreadContext* ctx) {
   int j;
   int mb_x;
+  const int mb_y = ctx->mb_y_;
+  const int cache_id = ctx->id_;
   uint8_t* const y_dst = dec->yuv_b_ + Y_OFF;
   uint8_t* const u_dst = dec->yuv_b_ + U_OFF;
   uint8_t* const v_dst = dec->yuv_b_ + V_OFF;
   for (mb_x = 0; mb_x < dec->mb_w_; ++mb_x) {
-    const VP8MBData* const block = dec->mb_data_ + mb_x;
+    const VP8MBData* const block = ctx->mb_data_ + mb_x;
 
     // Rotate in the left samples from previously decoded block. We move four
     // pixels at a time for alignment reason, and because of in-loop filter.
@@ -676,8 +697,8 @@ void VP8ReconstructBlocks(const VP8Decoder* const dec, int mb_y) {
     }
     // Transfer reconstructed samples from yuv_b_ cache to final destination.
     {
-      const int y_offset = dec->cache_id_ * 16 * dec->cache_y_stride_;
-      const int uv_offset = dec->cache_id_ * 8 * dec->cache_uv_stride_;
+      const int y_offset = cache_id * 16 * dec->cache_y_stride_;
+      const int uv_offset = cache_id * 8 * dec->cache_uv_stride_;
       uint8_t* const y_out = dec->cache_y_ + mb_x * 16 + y_offset;
       uint8_t* const u_out = dec->cache_u_ + mb_x * 8 + uv_offset;
       uint8_t* const v_out = dec->cache_v_ + mb_x * 8 + uv_offset;
diff --git a/src/dec/idec.c b/src/dec/idec.c
index ce3290e3..a74dbfeb 100644
--- a/src/dec/idec.c
+++ b/src/dec/idec.c
@@ -472,9 +472,7 @@ static VP8StatusCode DecodeRemaining(WebPIDecoder* const idec) {
     }
     VP8InitScanline(dec);   // Prepare for next scanline
 
-    // Reconstruct the samples.
-    VP8ReconstructBlocks(dec, dec->mb_y_);
-    // Filter and emit the row.
+    // Reconstruct, filter and emit the row.
     if (!VP8ProcessRow(dec, io)) {
       return IDecError(idec, VP8_STATUS_USER_ABORT);
     }
diff --git a/src/dec/vp8.c b/src/dec/vp8.c
index 975920dd..03022c14 100644
--- a/src/dec/vp8.c
+++ b/src/dec/vp8.c
@@ -657,9 +657,7 @@ static int ParseFrame(VP8Decoder* const dec, VP8Io* io) {
     }
     VP8InitScanline(dec);   // Prepare for next scanline
 
-    // Reconstruct the samples.
-    VP8ReconstructBlocks(dec, dec->mb_y_);
-    // Filter and emit the row.
+    // Reconstruct, filter and emit the row.
     if (!VP8ProcessRow(dec, io)) {
       return VP8SetError(dec, VP8_STATUS_USER_ABORT, "Output aborted.");
     }
diff --git a/src/dec/vp8i.h b/src/dec/vp8i.h
index 4b19343b..b8dd2e8c 100644
--- a/src/dec/vp8i.h
+++ b/src/dec/vp8i.h
@@ -197,11 +197,12 @@ typedef struct {
 
 // Persistent information needed by the parallel processing
 typedef struct {
-  int id_;            // cache row to process (in [0..2])
-  int mb_y_;          // macroblock position of the row
-  int filter_row_;    // true if row-filtering is needed
-  VP8FInfo* f_info_;  // filter strengths
-  VP8Io io_;          // copy of the VP8Io to pass to put()
+  int id_;              // cache row to process (in [0..2])
+  int mb_y_;            // macroblock position of the row
+  int filter_row_;      // true if row-filtering is needed
+  VP8FInfo* f_info_;    // filter strengths (swapped with dec->f_info_)
+  VP8MBData* mb_data_;  // reconstruction data (swapped with dec->mb_data_)
+  VP8Io io_;            // copy of the VP8Io to pass to put()
 } VP8ThreadContext;
 
 // Saved top samples, per macroblock. Fits into a cache-line.
@@ -287,8 +288,8 @@ struct VP8Decoder {
   size_t mem_size_;
 
   // Per macroblock non-persistent infos.
-  int mb_x_, mb_y_;       // current position, in macroblock units
-  VP8MBData* mb_data_;    // reconstruction data
+  int mb_x_, mb_y_;          // current position, in macroblock units
+  VP8MBData* mb_data_;       // parsed reconstruction data
 
   // Filtering side-info
   int filter_type_;                          // 0=off, 1=simple, 2=complex
@@ -324,8 +325,6 @@ void VP8ParseQuant(VP8Decoder* const dec);
 
 // in frame.c
 int VP8InitFrame(VP8Decoder* const dec, VP8Io* io);
-// Reconstruct a full row of blocks (prediction + residual adding)
-void VP8ReconstructBlocks(const VP8Decoder* const dec, int mb_y);
 // Call io->setup() and finish setting up scan parameters.
 // After this call returns, one must always call VP8ExitCritical() with the
 // same parameters. Both functions should be used in pair. Returns VP8_STATUS_OK