fancy chroma upscaling

When FANCY_UPSCALING is defined, use a smoothing filter for upscaling the U/V chroma fields. The filter used is a separable t[1 3 3 1] x [1 3 3 1] filter. It can be easily changed in macros MIX_*. The upscaling code resides in the thin shell between user and core decoding (in webp.c), and not in the core decoder. As such, this smoothing process can still be offloaded to the GPU in the future and is not an integral part of the decoding process. Coincidentally: changed the way data is transferred to the user. For profile 2 (no filtering), it used to be on a per-block basis. Now, for all profiles, we emit rows of pixels (between 8 and 24 in height) when they are ready. This makes the upscaling code much easier. Will update the test vectors' MD5 sums soon (as they'll be broken after this change). Change-Id: I2640ff12596cb8b843a4a376d7347447d9b9f778
2026-02-13 21:39:32 +01:00 · 2010-11-03 14:27:51 -07:00
parent 5a936a0a21
commit 6a37a2aaa9
5 changed files with 294 additions and 119 deletions
--- a/src/frame.c
+++ b/src/frame.c
@@ -31,8 +31,7 @@ int VP8InitFrame(VP8Decoder* const dec, VP8Io* io) {
  const int info_size = (mb_w + 1) * sizeof(VP8MB);
  const int yuv_size = YUV_SIZE * sizeof(*dec->yuv_b_);
  const int coeffs_size = 384 * sizeof(*dec->coeffs_);
-  const int cache_height = (dec->filter_type_ == 0) ? 0 :
-                           (16 + kFilterExtraRows[dec->filter_type_]) * 3 / 2;
+  const int cache_height = (16 + kFilterExtraRows[dec->filter_type_]) * 3 / 2;
  const int cache_size = top_size * cache_height;
  const int needed = intra_pred_mode_size
                   + top_size + info_size
@@ -74,14 +73,10 @@ int VP8InitFrame(VP8Decoder* const dec, VP8Io* io) {

  dec->cache_y_stride_ = 16 * mb_w;
  dec->cache_uv_stride_ = 8 * mb_w;
-  if (dec->filter_type_ == 0) {
-    dec->cache_y_ = NULL;
-    dec->cache_u_ = NULL;
-    dec->cache_v_ = NULL;
-  } else {
+  {
    const int extra_rows = kFilterExtraRows[dec->filter_type_];
    const int extra_y = extra_rows * dec->cache_y_stride_;
-    const int extra_uv =(extra_rows / 2) * dec->cache_uv_stride_;
+    const int extra_uv = (extra_rows / 2) * dec->cache_uv_stride_;
    dec->cache_y_ = ((uint8_t*)mem) + extra_y;
    dec->cache_u_ = dec->cache_y_ + 16 * dec->cache_y_stride_ + extra_uv;
    dec->cache_v_ = dec->cache_u_ + 8 * dec->cache_uv_stride_ + extra_uv;
@@ -97,22 +92,13 @@ int VP8InitFrame(VP8Decoder* const dec, VP8Io* io) {
  // prepare 'io'
  io->width = dec->pic_hdr_.width_;
  io->height = dec->pic_hdr_.height_;
-  io->mb_x = 0;
  io->mb_y = 0;
-  if (dec->filter_type_ == 0) {
-    io->y = dec->yuv_b_ + Y_OFF;
-    io->u = dec->yuv_b_ + U_OFF;
-    io->v = dec->yuv_b_ + V_OFF;
-    io->y_stride = BPS;
-    io->uv_stride = BPS;
-  } else {
-    io->y = dec->cache_y_;
-    io->u = dec->cache_u_;
-    io->v = dec->cache_v_;
-    io->y_stride = dec->cache_y_stride_;
-    io->uv_stride = dec->cache_uv_stride_;
-    io->mb_w = io->width;
-  }
+  io->y = dec->cache_y_;
+  io->u = dec->cache_u_;
+  io->v = dec->cache_v_;
+  io->y_stride = dec->cache_y_stride_;
+  io->uv_stride = dec->cache_uv_stride_;
+  io->fancy_upscaling = 0;    // default

  // Init critical function pointers and look-up tables.
  VP8DspInitTables();
@@ -177,32 +163,34 @@ static void DoFilter(VP8Decoder* const dec, int mb_x, int mb_y) {
  }
 }

-void VP8StoreBlock(VP8Decoder* const dec) {
-  VP8MB* const info = dec->mb_info_ + dec->mb_x_;
-  int level = dec->filter_levels_[dec->segment_];
-  if (dec->filter_hdr_.use_lf_delta_) {
-    // TODO(skal): only CURRENT is handled for now.
-    level += dec->filter_hdr_.ref_lf_delta_[0];
-    if (dec->is_i4x4_) {
-      level += dec->filter_hdr_.mode_lf_delta_[0];
+void VP8StoreBlock(VP8Decoder* const dec, VP8Io* const io) {
+  if (dec->filter_type_ > 0) {
+    VP8MB* const info = dec->mb_info_ + dec->mb_x_;
+    int level = dec->filter_levels_[dec->segment_];
+    if (dec->filter_hdr_.use_lf_delta_) {
+      // TODO(skal): only CURRENT is handled for now.
+      level += dec->filter_hdr_.ref_lf_delta_[0];
+      if (dec->is_i4x4_) {
+        level += dec->filter_hdr_.mode_lf_delta_[0];
+      }
    }
-  }
-  level = (level < 0) ? 0 : (level > 63) ? 63 : level;
-  info->f_level_ = level;
+    level = (level < 0) ? 0 : (level > 63) ? 63 : level;
+    info->f_level_ = level;

-  if (dec->filter_hdr_.sharpness_ > 0) {
-    if (dec->filter_hdr_.sharpness_ > 4) {
-      level >>= 2;
-    } else {
-      level >>= 1;
+    if (dec->filter_hdr_.sharpness_ > 0) {
+      if (dec->filter_hdr_.sharpness_ > 4) {
+        level >>= 2;
+      } else {
+        level >>= 1;
+      }
+      if (level > 9 - dec->filter_hdr_.sharpness_) {
+        level = 9 - dec->filter_hdr_.sharpness_;
+      }
    }
-    if (level > 9 - dec->filter_hdr_.sharpness_) {
-      level = 9 - dec->filter_hdr_.sharpness_;
-    }
-  }

-  info->f_ilevel_ = (level < 1) ? 1 : level;
-  info->f_inner_ = (!info->skip_ || dec->is_i4x4_);
+    info->f_ilevel_ = (level < 1) ? 1 : level;
+    info->f_inner_ = (!info->skip_ || dec->is_i4x4_);
+  }
  {
    // Transfer samples to row cache
    int y;
@@ -222,7 +210,7 @@ void VP8StoreBlock(VP8Decoder* const dec) {
  }
 }

-void VP8FilterRow(VP8Decoder* const dec, VP8Io* io) {
+void VP8FinishRow(VP8Decoder* const dec, VP8Io* io) {
  const int extra_y_rows = kFilterExtraRows[dec->filter_type_];
  const int ysize = extra_y_rows * dec->cache_y_stride_;
  const int uvsize = (extra_y_rows / 2) * dec->cache_uv_stride_;
@@ -231,9 +219,11 @@ void VP8FilterRow(VP8Decoder* const dec, VP8Io* io) {
  uint8_t* const ydst = dec->cache_y_ - ysize;
  uint8_t* const udst = dec->cache_u_ - uvsize;
  uint8_t* const vdst = dec->cache_v_ - uvsize;
-  int mb_x;
-  for (mb_x = 0; mb_x < dec->mb_w_; ++mb_x) {
-    DoFilter(dec, mb_x, dec->mb_y_);
+  if (dec->filter_type_ > 0) {
+    int mb_x;
+    for (mb_x = 0; mb_x < dec->mb_w_; ++mb_x) {
+      DoFilter(dec, mb_x, dec->mb_y_);
+    }
  }
  if (io->put) {
    int y_start = dec->mb_y_ * 16;
--- a/src/vp8.c
+++ b/src/vp8.c
@@ -492,18 +492,6 @@ static int ParseResiduals(VP8Decoder* const dec,
 //-----------------------------------------------------------------------------
 // Main loop

-static void SendBlock(VP8Decoder* const dec, VP8Io* io) {
-  if (io->put) {
-    io->mb_x = dec->mb_x_ * 16;
-    io->mb_y = dec->mb_y_ * 16;
-    io->mb_w = io->width - io->mb_x;
-    io->mb_h = io->height - io->mb_y;
-    if (io->mb_w > 16) io->mb_w = 16;
-    if (io->mb_h > 16) io->mb_h = 16;
-    io->put(io);
-  }
-}
-
 static int ParseFrame(VP8Decoder* const dec, VP8Io* io) {
  int ok = 1;
  VP8BitReader* const br = &dec->br_;
@@ -548,19 +536,13 @@ static int ParseFrame(VP8Decoder* const dec, VP8Io* io) {
      }
      VP8ReconstructBlock(dec);

-      // Store filter params
-      if (dec->filter_type_ > 0) {
-        VP8StoreBlock(dec);
-      } else {  // We're done. Send block to user at once.
-        SendBlock(dec, io);
-      }
+      // Store data and save block's filtering params
+      VP8StoreBlock(dec, io);
    }
    if (!ok) {
      break;
    }
-    if (dec->filter_type_ > 0) {   // filter a row
-      VP8FilterRow(dec, io);
-    }
+    VP8FinishRow(dec, io);
    if (dec->br_.eof_ || token_br->eof_) {
      ok = 0;
      break;
@@ -596,17 +578,23 @@ int VP8Decode(VP8Decoder* const dec, VP8Io* const io) {
    return VP8SetError(dec, 3, "Allocation failed");
  }

-  // set-up
-  if (io->setup) io->setup(io);

-  // Main decoding loop
-  if (!ParseFrame(dec, io)) {
+  if (io->setup && !io->setup(io)) {
    VP8Clear(dec);
-    return VP8SetError(dec, 3, "Frame decoding failed");
+    return VP8SetError(dec, 3, "Frame setup failed");
  }

-  // tear-down
-  if (io->teardown) io->teardown(io);
+  // Main decoding loop
+  {
+    const int ret = ParseFrame(dec, io);
+    if (io->teardown) {
+      io->teardown(io);
+    }
+    if (!ret) {
+      VP8Clear(dec);
+      return VP8SetError(dec, 3, "Frame decoding failed");
+    }
+  }

  dec->ready_ = 0;
  return 1;
--- a/src/vp8i.h
+++ b/src/vp8i.h
@@ -263,9 +263,10 @@ void VP8ParseQuant(VP8Decoder* const dec);
 int VP8InitFrame(VP8Decoder* const dec, VP8Io* io);
 // Predict a block and add residual
 void VP8ReconstructBlock(VP8Decoder* const dec);
-// Filtering
-void VP8StoreBlock(VP8Decoder* const dec);
-void VP8FilterRow(VP8Decoder* const dec, VP8Io* io);
+// Store a block, along with filtering params
+void VP8StoreBlock(VP8Decoder* const dec, VP8Io* io);
+// Finalize and transmit a complete row
+void VP8FinishRow(VP8Decoder* const dec, VP8Io* io);

 // in dsp.c
 typedef void (*VP8Idct)(const int16_t* coeffs, uint8_t* dst);
--- a/src/webp.c
+++ b/src/webp.c
@@ -17,6 +17,8 @@
 extern "C" {
 #endif

+#define FANCY_UPSCALING   // undefined to remove fancy upscaling support
+
 //-----------------------------------------------------------------------------
 // RIFF layout is:
 //   0ffset  tag
@@ -60,14 +62,111 @@ static uint32_t CheckRIFFHeader(const uint8_t** data_ptr,
 }

 //-----------------------------------------------------------------------------
+// Fancy upscaling

 typedef enum { MODE_RGB = 0, MODE_RGBA = 1,
               MODE_BGR = 2, MODE_BGRA = 3,
               MODE_YUV = 4 } CSP_MODE;

+#ifdef FANCY_UPSCALING
+
+// Given samples laid out in a square as:
+//  [a b]
+//  [c d]
+// we interpolate u/v as:
+//  ([9*a + 3*b + 3*c +   d    3*a + 9*b + 3*c +   d] + [8 8]) / 16
+//  ([3*a +   b + 9*c + 3*d      a + 3*b + 3*c + 9*d]   [8 8]) / 16
+#define MIX_ODD(a, b, c, d)        \
+  ((9 * (a) + 3 * ((b) + (c)) + (d) + 0x00080008u) >> 4)
+#define MIX_EVEN(a, b, c, d)       \
+  ((9 * (c) + 3 * ((d) + (a)) + (b) + 0x00080008u) >> 4)
+
+// We process u and v together stashed into 32bit (16bit each).
+// Note that we could store the pair (3*t_uv + uv, t_uv + 3*uv)
+// instead of (t_uv, uv), into a 64bit variable. Doing so, we could
+// simplify the MIXing a bit and save two multiplies. TODO(skal).
+#define LOAD_UV(u,v) ((u) | ((v) << 16))
+
+// Macro festival, so we can define all of rgb/bgr/rgba/bgra cases
+// for odd and even lines
+#define UPSCALE_FUNC(FUNC_NAME, MIX, FUNC, XSTEP)                        \
+static void FUNC_NAME(const uint8_t* cur_y,                              \
+                      const uint8_t* cur_u, const uint8_t* cur_v,        \
+                      const uint8_t* top_u, const uint8_t* top_v,        \
+                      int len, uint8_t* dst) {                           \
+  int x;                                                                 \
+  uint32_t tl_uv = LOAD_UV(top_u[0], top_v[0]);   /* top-left sample */  \
+  uint32_t l_uv  = LOAD_UV(cur_u[0], cur_v[0]);   /* left-sample */      \
+  uint32_t uv0 = MIX(tl_uv, tl_uv, l_uv, l_uv);                          \
+  FUNC(cur_y[0], uv0 & 0xff, (uv0 >> 16), dst);                          \
+  len -= 1;   /* first pixel is done. */                                 \
+  for (x = 1; x <= (len >> 1); ++x) {                                    \
+    const uint32_t t_uv = LOAD_UV(top_u[x], top_v[x]);  /* top sample */ \
+    const uint32_t uv   = LOAD_UV(cur_u[x], cur_v[x]);  /* sample */     \
+    const uint32_t uv0  = MIX(tl_uv, t_uv, l_uv, uv);                    \
+    const uint32_t uv1  = MIX(t_uv, tl_uv, uv, l_uv);                    \
+    FUNC(cur_y[2*x-1], uv0 & 0xff, (uv0 >> 16), dst + (2*x-1) * XSTEP);  \
+    FUNC(cur_y[2*x  ], uv1 & 0xff, (uv1 >> 16), dst + (2*x  ) * XSTEP);  \
+    tl_uv = t_uv;                                                        \
+    l_uv = uv;                                                           \
+  }                                                                      \
+  if (len & 1) {                                                         \
+    uv0 = MIX(tl_uv, tl_uv, l_uv, l_uv);                                 \
+    FUNC(cur_y[len], uv0 & 0xff, (uv0 >> 16), dst + len * XSTEP);        \
+  }                                                                      \
+}                                                                        \
+
+// All variants implemented.
+UPSCALE_FUNC(UpscaleEvenRgb,  MIX_EVEN, VP8YuvToRgb,  3)
+UPSCALE_FUNC(UpscaleOddRgb,   MIX_ODD,  VP8YuvToRgb,  3)
+UPSCALE_FUNC(UpscaleEvenBgr,  MIX_EVEN, VP8YuvToBgr,  3)
+UPSCALE_FUNC(UpscaleOddBgr,   MIX_ODD,  VP8YuvToBgr,  3)
+UPSCALE_FUNC(UpscaleEvenRgba, MIX_EVEN, VP8YuvToRgba, 4)
+UPSCALE_FUNC(UpscaleOddRgba,  MIX_ODD,  VP8YuvToRgba, 4)
+UPSCALE_FUNC(UpscaleEvenBgra, MIX_EVEN, VP8YuvToBgra, 4)
+UPSCALE_FUNC(UpscaleOddBgra,  MIX_ODD,  VP8YuvToBgra, 4)
+
+// Main driver function.
+static inline void UpscaleLine(const uint8_t* cur_y,
+                               const uint8_t* cur_u, const uint8_t* cur_v,
+                               const uint8_t* top_u, const uint8_t* top_v,
+                               int len, uint8_t* dst, int odd, CSP_MODE mode) {
+  if (odd) {
+    if (mode == MODE_RGB) {
+      UpscaleOddRgb(cur_y, cur_u, cur_v, top_u, top_v, len, dst);
+    } else if (mode == MODE_BGR) {
+      UpscaleOddBgr(cur_y, cur_u, cur_v, top_u, top_v, len, dst);
+    } else if (mode == MODE_RGBA) {
+      UpscaleOddRgba(cur_y, cur_u, cur_v, top_u, top_v, len, dst);
+    } else {
+      UpscaleOddBgra(cur_y, cur_u, cur_v, top_u, top_v, len, dst);
+    }
+  } else {
+    if (mode == MODE_RGB) {
+      UpscaleEvenRgb(cur_y, cur_u, cur_v, top_u, top_v, len, dst);
+    } else if (mode == MODE_BGR) {
+      UpscaleEvenBgr(cur_y, cur_u, cur_v, top_u, top_v, len, dst);
+    } else if (mode == MODE_RGBA) {
+      UpscaleEvenRgba(cur_y, cur_u, cur_v, top_u, top_v, len, dst);
+    } else {
+      UpscaleEvenBgra(cur_y, cur_u, cur_v, top_u, top_v, len, dst);
+    }
+  }
+}
+#undef LOAD_UV
+#undef UPSCALE_FUNC
+#undef MIX_ODD
+#undef MIX_EVEN
+
+#endif  // FANCY_UPSCALING
+
+//-----------------------------------------------------------------------------
+// Main conversion driver.
+
 typedef struct {
  uint8_t* output;      // rgb(a) or luma
  uint8_t *u, *v;
+  uint8_t *top_y, *top_u, *top_v;
  int stride;           // rgb(a) stride or luma stride
  int u_stride;
  int v_stride;
@@ -76,52 +175,139 @@ typedef struct {

 static void CustomPut(const VP8Io* io) {
  Params *p = (Params*)io->opaque;
-  const int mb_w = io->mb_w;
+  const int w = io->width;
  const int mb_h = io->mb_h;
-  int j;
+  const int uv_w = (w + 1) / 2;
+  assert(!(io->mb_y & 1));

  if (p->mode == MODE_YUV) {
-    uint8_t* const y_dst = p->output + io->mb_x + io->mb_y * p->stride;
-    uint8_t* u_dst;
-    uint8_t* v_dst;
-    int uv_w;
-
+    uint8_t* const y_dst = p->output + io->mb_y * p->stride;
+    uint8_t* const u_dst = p->u + (io->mb_y >> 1) * p->u_stride;
+    uint8_t* const v_dst = p->v + (io->mb_y >> 1) * p->v_stride;
+    int j;
    for (j = 0; j < mb_h; ++j) {
-      memcpy(y_dst + j * p->stride, io->y + j * io->y_stride, mb_w);
+      memcpy(y_dst + j * p->stride, io->y + j * io->y_stride, w);
    }
-    u_dst = p->u + (io->mb_x / 2) + (io->mb_y / 2) * p->u_stride;
-    v_dst = p->v + (io->mb_x / 2) + (io->mb_y / 2) * p->v_stride;
-    uv_w = (mb_w + 1) / 2;
    for (j = 0; j < (mb_h + 1) / 2; ++j) {
      memcpy(u_dst + j * p->u_stride, io->u + j * io->uv_stride, uv_w);
      memcpy(v_dst + j * p->v_stride, io->v + j * io->uv_stride, uv_w);
    }
  } else {
-    const int psize = (p->mode == MODE_RGB || p->mode == MODE_BGR) ? 3 : 4;
-    uint8_t* dst = p->output + psize * io->mb_x + io->mb_y * p->stride;
-    int i;
+    uint8_t* dst = p->output + io->mb_y * p->stride;
+    if (io->fancy_upscaling) {
+#ifdef FANCY_UPSCALING
+      const uint8_t* cur_y;
+      const uint8_t* cur_u = io->u;
+      const uint8_t* cur_v = io->v;
+      const uint8_t* top_u = p->top_u;
+      const uint8_t* top_v = p->top_v;
+      int y = io->mb_y;
+      int y_end = io->mb_y + io->mb_h - 1;
+      if (y > 0) {
+        // If mid-fly, we need to finish the previous line.
+        cur_y = p->top_y;
+        dst -= p->stride;
+        y -= 1;
+      } else {
+        // else we "replicate" the u/v sample of the first line
+        top_u = cur_u;
+        top_v = cur_v;
+        // and start with the top line
+        cur_y = io->y;
+      }
+      if (y_end >= io->height - 1) {
+        // for the very last rows, we can process them right now
+        y_end = io->height;
+      } else {
+        // we won't process the very last line this time,
+        // waiting for the next call instead.
+      }

-    for (j = 0; j < mb_h; ++j) {
-      const uint8_t* y_src = io->y + j * io->y_stride;
-      for (i = 0; i < mb_w; ++i) {
-        const int y = y_src[i];
-        const int u = io->u[(j / 2) * io->uv_stride + (i / 2)];
-        const int v = io->v[(j / 2) * io->uv_stride + (i / 2)];
-        if (p->mode == MODE_RGB) {
-          VP8YuvToRgb(y, u, v, dst + i * 3);
-        } else if (p->mode == MODE_BGR) {
-          VP8YuvToBgr(y, u, v, dst + i * 3);
-        } else if (p->mode == MODE_RGBA) {
-          VP8YuvToRgba(y, u, v, dst + i * 4);
+      // Loop over each output row.
+      for (; y < y_end; ++y) {
+        if (y & 1) {   // odd lines
+          UpscaleLine(cur_y, cur_u, cur_v, top_u, top_v, w, dst, 1, p->mode);
+        } else {       // even lines
+          UpscaleLine(cur_y, cur_u, cur_v, top_u, top_v, w, dst, 0, p->mode);
+          top_u = cur_u;
+          top_v = cur_v;
+          if (y < io->height - 2) {
+            cur_u += io->uv_stride;
+            cur_v += io->uv_stride;
+          }
+        }
+        dst += p->stride;
+        if (cur_y == p->top_y) {
+          cur_y = io->y;
        } else {
-          VP8YuvToBgra(y, u, v, dst + i * 4);
+          cur_y += io->y_stride;
        }
      }
-      dst += p->stride;
+      // Save the unfinished samples for next call (if we're not done yet).
+      if (y < io->height - 1) {
+        memcpy(p->top_y, cur_y, w * sizeof(*p->top_y));
+        memcpy(p->top_u, top_u, uv_w * sizeof(*p->top_u));
+        memcpy(p->top_v, top_v, uv_w * sizeof(*p->top_v));
+      }
+#else
+      assert(0);  // shouldn't happen.
+#endif
+    } else {
+      // Point-sampling U/V upscaler.
+      // Could be implemented with special MIX functions, too.
+      int j;
+      for (j = 0; j < mb_h; ++j) {
+        const uint8_t* y_src = io->y + j * io->y_stride;
+        int i;
+        for (i = 0; i < w; ++i) {
+          const int y = y_src[i];
+          const int u = io->u[(j / 2) * io->uv_stride + (i / 2)];
+          const int v = io->v[(j / 2) * io->uv_stride + (i / 2)];
+          if (p->mode == MODE_RGB) {
+            VP8YuvToRgb(y, u, v, dst + i * 3);
+          } else if (p->mode == MODE_BGR) {
+            VP8YuvToBgr(y, u, v, dst + i * 3);
+          } else if (p->mode == MODE_RGBA) {
+            VP8YuvToRgba(y, u, v, dst + i * 4);
+          } else {
+            VP8YuvToBgra(y, u, v, dst + i * 4);
+          }
+        }
+        dst += p->stride;
+      }
    }
  }
 }

+//-----------------------------------------------------------------------------
+
+static int CustomSetup(VP8Io* io) {
+#ifdef FANCY_UPSCALING
+  Params *p = (Params*)io->opaque;
+  p->top_y = p->top_u = p->top_v = NULL;
+  if (p->mode != MODE_YUV) {
+    const int uv_width = (io->width + 1) >> 1;
+    p->top_y = (uint8_t*)malloc(io->width + 2 * uv_width);
+    if (p->top_y == NULL) {
+      return 0;   // memory error.
+    }
+    p->top_u = p->top_y + io->width;
+    p->top_v = p->top_u + uv_width;
+    io->fancy_upscaling = 1;  // activate fancy upscaling
+  }
+#endif
+  return 1;
+}
+
+static void CustomTeardown(const VP8Io* io) {
+#ifdef FANCY_UPSCALING
+  Params *p = (Params*)io->opaque;
+  if (p->top_y) {
+    free(p->top_y);
+    p->top_y = p->top_u = p->top_v = NULL;
+  }
+#endif
+}

 //-----------------------------------------------------------------------------
 // "Into" variants
@@ -145,6 +331,8 @@ static uint8_t* DecodeInto(CSP_MODE mode,
  params->mode = mode;
  io.opaque = params;
  io.put = CustomPut;
+  io.setup = CustomSetup;
+  io.teardown = CustomTeardown;

  if (!VP8GetHeaders(dec, &io)) {
    VP8Delete(dec);
--- a/src/webp/decode_vp8.h
+++ b/src/webp/decode_vp8.h
@@ -40,28 +40,36 @@ extern "C" {
 typedef struct VP8Io VP8Io;
 struct VP8Io {
  // set by VP8GetHeaders()
-  int width, height;       // picture dimensions, in pixels
+  int width, height;         // picture dimensions, in pixels

  // set before calling put()
-  int mb_x, mb_y;            // position of the current sample (in pixels)
-  int mb_w, mb_h;            // size of the current sample (usually 16x16)
-  const uint8_t *y, *u, *v;  // samples to copy
-  int y_stride;              // stride for luma
-  int uv_stride;             // stride for chroma
+  int mb_y;                  // position of the current rows (in pixels)
+  int mb_h;                  // number of rows in the sample
+  const uint8_t *y, *u, *v;  // rows to copy (in yuv420 format)
+  int y_stride;              // row stride for luma
+  int uv_stride;             // row stride for chroma

  void* opaque;              // user data

-  // called when fresh samples are available (1 block of 16x16 pixels)
+  // called when fresh samples are available. Currently, samples are in
+  // YUV420 format, and can be up to width x 24 in size (depending on the
+  // in-loop filtering level, e.g.).
  void (*put)(const VP8Io* io);

-  // called just before starting to decode the blocks
-  void (*setup)(const VP8Io* io);
+  // called just before starting to decode the blocks.
+  // Should returns 0 in case of error.
+  int (*setup)(VP8Io* io);

-  // called just after block decoding is finished
+  // called just after block decoding is finished (or when an error occurred).
  void (*teardown)(const VP8Io* io);

+  // this is a recommendation for the user-side yuv->rgb converter. This flag
+  // is set when calling setup() hook and can be overwritten by it. It then
+  // can be taken into consideration during the put() method.
+  int fancy_upscaling;
+
  // Input buffer.
-   uint32_t data_size;
+  uint32_t data_size;
  const uint8_t* data;
 };