From d2603105113ae38edc1f1fcf80c4d4ce8688f183 Mon Sep 17 00:00:00 2001 From: Pascal Massimino Date: Mon, 20 Jun 2011 00:45:15 -0700 Subject: [PATCH] add Advanced Decoding Interface You can now use WebPDecBuffer, WebPBitstreamFeatures and WebPDecoderOptions to have better control over the decoding process (and the speed/quality tradeoff). WebPDecoderOptions allows you to: - turn fancy upsampler on/off - turn in-loop filter on/off - perform on-the-fly cropping - perform on-the-fly rescale (and more to come. Not all features are implemented yet). On-the-fly cropping and scaling allow you to save quite some memory (as the decoding operation will now scale with the output's size, not the input's). It saves some CPU too (since for instance, in-loop filtering is partially turned off where it doesn't matter, and some YUV->RGB conversion operations are omitted too). The scaler uses summed area, so is mainly meant to be used for downscaling (like: for generating thumbnails or previews). Incremental decoding works with these new options. More doc to come soon. 
dwebp is now using the new decoding interface, with the new flags: -nofancy -nofilter -crop top left width height -scale width height Change-Id: I08baf2fa291941686f4ef70a9cc2e4137874e85e --- Android.mk | 2 + Makefile.vc | 2 + README | 20 + examples/dwebp.c | 312 ++++++++++------ makefile.unix | 3 +- man/cwebp.1 | 6 +- man/dwebp.1 | 25 +- src/dec/Makefile.am | 2 +- src/dec/buffer.c | 201 ++++++++++ src/dec/frame.c | 142 +++++-- src/dec/idec.c | 207 +++++++---- src/dec/io.c | 845 ++++++++++++++++++++++++++++++++++++++++++ src/dec/vp8.c | 52 ++- src/dec/vp8i.h | 26 +- src/dec/webp.c | 671 ++++++++++----------------------- src/dec/webpi.h | 91 +++-- src/dec/yuv.h | 13 +- src/webp/decode.h | 197 +++++++++- src/webp/decode_vp8.h | 23 +- 19 files changed, 2052 insertions(+), 788 deletions(-) create mode 100644 src/dec/buffer.c create mode 100644 src/dec/io.c diff --git a/Android.mk b/Android.mk index c087446e..553aad61 100644 --- a/Android.mk +++ b/Android.mk @@ -12,6 +12,8 @@ LOCAL_SRC_FILES := \ src/dec/tree.c \ src/dec/vp8.c \ src/dec/webp.c \ + src/dec/io.c \ + src/dec/buffer.c \ src/dec/yuv.c \ src/enc/alpha.c \ src/enc/analysis.c \ diff --git a/Makefile.vc b/Makefile.vc index d35238bb..ad5f7d0c 100644 --- a/Makefile.vc +++ b/Makefile.vc @@ -122,6 +122,8 @@ X_OBJS= \ $(DIROBJ)\dec\tree.obj \ $(DIROBJ)\dec\vp8.obj \ $(DIROBJ)\dec\webp.obj \ + $(DIROBJ)\dec\io.obj \ + $(DIROBJ)\dec\buffer.obj \ $(DIROBJ)\dec\yuv.obj \ $(DIROBJ)\dec\idec.obj \ $(DIROBJ)\dec\alpha.obj \ diff --git a/README b/README index 1ecefb5e..37cab537 100644 --- a/README +++ b/README @@ -144,6 +144,7 @@ options: -pass ............ analysis pass number (1..10) -partitions ...... number of partitions to use (0..3) -crop .. crop picture with the given rectangle + -resize ........ resize picture (after any cropping) -map ............. print map of extra info. -d .......... dump the compressed output (PGM file). 
@@ -201,6 +202,25 @@ file test.webp decodes to exactly the same as test_ref.ppm by using: ./dwebp test.webp -ppm -o test.ppm diff test.ppm test_ref.ppm +The full list of options is available using -h: + +> dwebp -h +Usage: dwebp in_file [options] [-o out_file] + +Decodes the WebP image file to PNG format [Default] +Use following options to convert into alternate image formats: + -ppm ......... save the raw RGB samples as color PPM + -pgm ......... save the raw YUV samples as a grayscale PGM + file with IMC4 layout. + Other options are: + -version .... print version number and exit. + -nofancy ..... don't use the fancy YUV420 upscaler. + -nofilter .... disable in-loop filtering. + -crop ... crop output with the given rectangle + -scale .......... scale the output (*after* any cropping) + -h ....... this help message. + -v ....... verbose (e.g. print encoding/decoding times) + -noasm ....... disable all assembly optimizations. Encoding API: =========== diff --git a/examples/dwebp.c b/examples/dwebp.c index 8b955466..406b0e4c 100644 --- a/examples/dwebp.c +++ b/examples/dwebp.c @@ -5,8 +5,7 @@ // Additional IP Rights Grant: http://www.webmproject.org/license/additional/ // ----------------------------------------------------------------------------- // -// simple command-line example calling libwebpdecode to -// decode a WebP image into a PPM image. +// Command-line tool for decoding a WebP image // // Compile with: gcc -o dwebp dwebp.c -lwebpdecode // @@ -45,11 +44,18 @@ extern "C" { #endif -//----------------------------------------------------------------------------- - +static int verbose = 0; extern void* VP8DecGetCPUInfo; // opaque forward declaration. 
-static int verbose = 0; +//----------------------------------------------------------------------------- + +// Output types +typedef enum { + PNG = 0, + PPM, + PGM, + ALPHA_PLANE_ONLY // this is for experimenting only +} OutputFileFormat; #ifdef HAVE_WINCODEC_H @@ -69,7 +75,8 @@ static int verbose = 0; #define MAKE_REFGUID(x) &(x) #endif -static HRESULT CreateOutputStream(const char* out_file_name, IStream** ppStream) { +static HRESULT CreateOutputStream(const char* out_file_name, + IStream** ppStream) { HRESULT hr = S_OK; IFS(SHCreateStreamOnFileA(out_file_name, STGM_WRITE | STGM_CREATE, ppStream)); if (FAILED(hr)) @@ -117,8 +124,13 @@ static HRESULT WriteUsingWIC(const char* out_file_name, REFGUID container_guid, return hr; } -static int WritePNG(const char* out_file_name, unsigned char* rgb, int stride, - uint32_t width, uint32_t height, int has_alpha) { +static int WritePNG(const char* out_file_name, + const WebPDecBuffer* const buffer) { + const uint32_t width = buffer->width; + const uint32_t height = buffer->height; + unsigned char* const rgb = buffer->u.RGBA.rgba; + const int stride = buffer->u.RGBA.stride; + const int has_alpha = (buffer->colorspace == MODE_RGBA); assert(!has_alpha); // TODO(mikolaj) return SUCCEEDED(WriteUsingWIC(out_file_name, MAKE_REFGUID(GUID_ContainerFormatPng), rgb, stride, width, @@ -131,8 +143,12 @@ static void PNGAPI error_function(png_structp png, png_const_charp dummy) { longjmp(png_jmpbuf(png), 1); } -static int WritePNG(FILE* out_file, unsigned char* rgb, int stride, - png_uint_32 width, png_uint_32 height, int has_alpha) { +static int WritePNG(FILE* out_file, const WebPDecBuffer* const buffer) { + const uint32_t width = buffer->width; + const uint32_t height = buffer->height; + unsigned char* const rgb = buffer->u.RGBA.rgba; + const int stride = buffer->u.RGBA.stride; + const int has_alpha = (buffer->colorspace == MODE_RGBA); png_structp png; png_infop info; png_uint_32 y; @@ -169,8 +185,7 @@ static int WritePNG(FILE* 
out_file, unsigned char* rgb, int stride, typedef uint32_t png_uint_32; -static int WritePNG(FILE* out_file, unsigned char* rgb, int stride, - png_uint_32 width, png_uint_32 height, int has_alpha) { +static int WritePNG(FILE* out_file, const WebPDecBuffer* const buffer) { printf("PNG support not compiled. Please install the libpng development " "package before building.\n"); printf("You can run with -ppm flag to decode in PPM format.\n"); @@ -178,84 +193,157 @@ static int WritePNG(FILE* out_file, unsigned char* rgb, int stride, } #endif -static int WritePPM(FILE* fout, const unsigned char* rgb, - uint32_t width, uint32_t height) { - fprintf(fout, "P6\n%d %d\n255\n", width, height); - return (fwrite(rgb, width * height, 3, fout) == 3); -} - -static int WriteAlphaPlane(FILE* fout, const unsigned char* rgba, - uint32_t width, uint32_t height) { +static int WritePPM(FILE* fout, const WebPDecBuffer* const buffer) { + const uint32_t width = buffer->width; + const uint32_t height = buffer->height; + const unsigned char* const rgb = buffer->u.RGBA.rgba; + const int stride = buffer->u.RGBA.stride; uint32_t y; - fprintf(fout, "P5\n%d %d\n255\n", width, height); + fprintf(fout, "P6\n%d %d\n255\n", width, height); for (y = 0; y < height; ++y) { - const unsigned char* line = rgba + y * (width * 4); - uint32_t x; - for (x = 0; x < width; ++x) { - if (fputc(line[4 * x + 3], fout) == EOF) { - return 0; - } + if (fwrite(rgb + y * stride, width, 3, fout) != 3) { + return 0; } } return 1; } -static int WritePGM(FILE* fout, - unsigned char* y_plane, unsigned char *u, unsigned char* v, - int y_stride, int uv_stride, - uint32_t width, uint32_t height) { +static int WriteAlphaPlane(FILE* fout, const WebPDecBuffer* const buffer) { + const uint32_t width = buffer->width; + const uint32_t height = buffer->height; + const unsigned char* const a = buffer->u.YUVA.a; + const int a_stride = buffer->u.YUVA.a_stride; + uint32_t y; + assert(a != NULL); + fprintf(fout, "P5\n%d %d\n255\n", width, 
height); + for (y = 0; y < height; ++y) { + if (fwrite(a + y * a_stride, width, 1, fout) != 1) { + return 0; + } + } + return 1; +} + +static int WritePGM(FILE* fout, const WebPDecBuffer* const buffer) { + const int width = buffer->width; + const int height = buffer->height; + const WebPYUVABuffer* const yuv = &buffer->u.YUVA; // Save a grayscale PGM file using the IMC4 layout // (http://www.fourcc.org/yuv.php#IMC4). This is a very // convenient format for viewing the samples, esp. for // odd dimensions. int ok = 1; - unsigned int y; - const unsigned int uv_width = (width + 1) / 2; - const unsigned int uv_height = (height + 1) / 2; - const unsigned int out_stride = (width + 1) & ~1; - fprintf(fout, "P5\n%d %d\n255\n", out_stride, height + uv_height); + int y; + const int uv_width = (width + 1) / 2; + const int uv_height = (height + 1) / 2; + const int out_stride = (width + 1) & ~1; + const int a_height = yuv->a ? height : 0; + fprintf(fout, "P5\n%d %d\n255\n", out_stride, height + uv_height + a_height); for (y = 0; ok && y < height; ++y) { - ok &= (fwrite(y_plane + y * y_stride, width, 1, fout) == 1); + ok &= (fwrite(yuv->y + y * yuv->y_stride, width, 1, fout) == 1); if (width & 1) fputc(0, fout); // padding byte } for (y = 0; ok && y < uv_height; ++y) { - ok &= (fwrite(u + y * uv_stride, uv_width, 1, fout) == 1); - ok &= (fwrite(v + y * uv_stride, uv_width, 1, fout) == 1); + ok &= (fwrite(yuv->u + y * yuv->u_stride, uv_width, 1, fout) == 1); + ok &= (fwrite(yuv->v + y * yuv->v_stride, uv_width, 1, fout) == 1); + } + for (y = 0; ok && y < a_height; ++y) { + ok &= (fwrite(yuv->a + y * yuv->a_stride, width, 1, fout) == 1); + if (width & 1) fputc(0, fout); // padding byte } return ok; } -typedef enum { - PNG = 0, - PPM, - PGM, - ALPHA_PLANE_ONLY // this is for experimenting only -} OutputFileFormat; +static void SaveOutput(const WebPDecBuffer* const buffer, + OutputFileFormat format, const char* const out_file) { + FILE* fout = NULL; + int needs_open_file = 1; + int 
ok = 1; + Stopwatch stop_watch; + + if (verbose) + StopwatchReadAndReset(&stop_watch); + +#ifdef _WIN32 + needs_open_file = (format != PNG); +#endif + if (needs_open_file) { + fout = fopen(out_file, "wb"); + if (!fout) { + fprintf(stderr, "Error opening output file %s\n", out_file); + return; + } + } + + if (format == PNG) { +#ifdef HAVE_WINCODEC_H + ok &= WritePNG(out_file, buffer); +#else + ok &= WritePNG(fout, buffer); +#endif + } else if (format == PPM) { + ok &= WritePPM(fout, buffer); + } else if (format == PGM) { + ok &= WritePGM(fout, buffer); + } else if (format == ALPHA_PLANE_ONLY) { + ok &= WriteAlphaPlane(fout, buffer); + } + if (fout) { + fclose(fout); + } + if (ok) { + printf("Saved file %s\n", out_file); + if (verbose) { + const double time = StopwatchReadAndReset(&stop_watch); + printf("Time to write output: %.3fs\n", time); + } + } else { + fprintf(stderr, "Error writing file %s !!\n", out_file); + } +} static void Help(void) { - printf("Usage: dwebp " - "[in_file] [-h] [-v] [-ppm] [-pgm] [-version] [-o out_file]\n\n" + printf("Usage: dwebp in_file [options] [-o out_file]\n\n" "Decodes the WebP image file to PNG format [Default]\n" "Use following options to convert into alternate image formats:\n" - " -ppm: save the raw RGB samples as color PPM\n" - " -pgm: save the raw YUV samples as a grayscale PGM\n" - " file with IMC4 layout.\n" - " -version: print version number and exit.\n" - "Use -v for verbose (e.g. print encoding/decoding times)\n" - "Use -noasm to disable all assembly optimizations.\n" + " -ppm ......... save the raw RGB samples as color PPM\n" + " -pgm ......... save the raw YUV samples as a grayscale PGM\n" + " file with IMC4 layout.\n" + " Other options are:\n" + " -version .... print version number and exit.\n" + " -nofancy ..... don't use the fancy YUV420 upscaler.\n" + " -nofilter .... disable in-loop filtering.\n" + " -crop ... crop output with the given rectangle\n" + " -scale .......... 
scale the output (*after* any cropping)\n" +#ifdef WEBP_EXPERIMENTAL_FEATURES + " -alpha ....... only save the alpha plane.\n" +#endif + " -h ....... this help message.\n" + " -v ....... verbose (e.g. print encoding/decoding times)\n" + " -noasm ....... disable all assembly optimizations.\n" ); } +static const char* const kStatusMessages[] = { + "OK", "OUT_OF_MEMORY", "INVALID_PARAM", "BITSTREAM_ERROR", + "UNSUPPORTED_FEATURE", "SUSPENDED", "USER_ABORT", "NOT_ENOUGH_DATA" +}; + int main(int argc, const char *argv[]) { const char *in_file = NULL; const char *out_file = NULL; - int width, height, stride, uv_stride; - int has_alpha = 0; - uint8_t* out = NULL, *u = NULL, *v = NULL; + WebPDecoderConfig config; + WebPDecBuffer* const output_buffer = &config.output; + WebPBitstreamFeatures* const bitstream = &config.input; OutputFileFormat format = PNG; - Stopwatch stop_watch; int c; + + if (!WebPInitDecoderConfig(&config)) { + fprintf(stderr, "Library version mismatch!\n"); + return -1; + } + for (c = 1; c < argc; ++c) { if (!strcmp(argv[c], "-h") || !strcmp(argv[c], "-help")) { Help(); @@ -264,6 +352,10 @@ int main(int argc, const char *argv[]) { out_file = argv[++c]; } else if (!strcmp(argv[c], "-alpha")) { format = ALPHA_PLANE_ONLY; + } else if (!strcmp(argv[c], "-nofancy")) { + config.options.no_fancy_upsampling = 1; + } else if (!strcmp(argv[c], "-nofilter")) { + config.options.bypass_filtering = 1; } else if (!strcmp(argv[c], "-ppm")) { format = PPM; } else if (!strcmp(argv[c], "-version")) { @@ -273,6 +365,16 @@ int main(int argc, const char *argv[]) { return 0; } else if (!strcmp(argv[c], "-pgm")) { format = PGM; + } else if (!strcmp(argv[c], "-crop") && c < argc - 4) { + config.options.use_cropping = 1; + config.options.crop_left = strtol(argv[++c], NULL, 0); + config.options.crop_top = strtol(argv[++c], NULL, 0); + config.options.crop_width = strtol(argv[++c], NULL, 0); + config.options.crop_height = strtol(argv[++c], NULL, 0); + } else if (!strcmp(argv[c], 
"-scale") && c < argc - 2) { + config.options.use_scaling = 1; + config.options.scaled_width = strtol(argv[++c], NULL, 0); + config.options.scaled_height = strtol(argv[++c], NULL, 0); } else if (!strcmp(argv[c], "-v")) { verbose = 1; } else if (!strcmp(argv[c], "-noasm")) { @@ -293,10 +395,13 @@ int main(int argc, const char *argv[]) { } { + Stopwatch stop_watch; + VP8StatusCode status = VP8_STATUS_OK; + int ok; uint32_t data_size = 0; void* data = NULL; - int ok; FILE* const in = fopen(in_file, "rb"); + if (!in) { fprintf(stderr, "cannot open input file '%s'\n", in_file); return 1; @@ -308,101 +413,70 @@ int main(int argc, const char *argv[]) { ok = (fread(data, data_size, 1, in) == 1); fclose(in); if (!ok) { + fprintf(stderr, "Could not read %d bytes of data from file %s\n", + data_size, in_file); free(data); return -1; } if (verbose) StopwatchReadAndReset(&stop_watch); + + status = WebPGetFeatures((const uint8_t*)data, data_size, bitstream); + if (status != VP8_STATUS_OK) { + goto end; + } + switch (format) { case PNG: #ifdef _WIN32 - out = WebPDecodeBGR((const uint8_t*)data, data_size, &width, &height); - stride = 3 * width; - has_alpha = 0; + // TODO(mikolaj): no alpha for now + output_buffer->colorspace = MODE_BGR; #else - out = WebPDecodeRGBA((const uint8_t*)data, data_size, &width, &height); - stride = 4 * width; - has_alpha = 1; + output_buffer->colorspace = bitstream->has_alpha ? MODE_RGBA : MODE_RGB; #endif break; case PPM: - out = WebPDecodeRGB((const uint8_t*)data, data_size, &width, &height); + output_buffer->colorspace = MODE_RGB; // drops alpha for PPM break; case PGM: - out = WebPDecodeYUV((const uint8_t*)data, data_size, &width, &height, - &u, &v, &stride, &uv_stride); + output_buffer->colorspace = bitstream->has_alpha ? 
MODE_YUVA : MODE_YUV; break; case ALPHA_PLANE_ONLY: - out = WebPDecodeRGBA((const uint8_t*)data, data_size, &width, &height); + output_buffer->colorspace = MODE_YUVA; break; default: free(data); return -1; } + status = WebPDecode((const uint8_t*)data, data_size, &config); if (verbose) { const double time = StopwatchReadAndReset(&stop_watch); printf("Time to decode picture: %.3fs\n", time); } - + end: free(data); - } - - if (!out) { - fprintf(stderr, "Decoding of %s failed.\n", in_file); - return -1; + ok = (status == VP8_STATUS_OK); + if (!ok) { + fprintf(stderr, "Decoding of %s failed.\n", in_file); + fprintf(stderr, "Status: %d (%s)\n", status, kStatusMessages[status]); + return -1; + } } if (out_file) { - FILE* fout = NULL; - int needs_open_file = 0; - - printf("Decoded %s. Dimensions: %d x %d. Now saving...\n", in_file, width, height); - StopwatchReadAndReset(&stop_watch); -#ifdef _WIN32 - if (format != PNG) { - needs_open_file = 1; - } -#else - needs_open_file = 1; -#endif - if (needs_open_file) fout = fopen(out_file, "wb"); - if (!needs_open_file || fout) { - int ok = 1; - if (format == PNG) { -#ifdef HAVE_WINCODEC_H - ok &= WritePNG(out_file, out, stride, width, height, has_alpha); -#else - ok &= WritePNG(fout, out, stride, width, height, has_alpha); -#endif - } else if (format == PPM) { - ok &= WritePPM(fout, out, width, height); - } else if (format == PGM) { - ok &= WritePGM(fout, out, u, v, stride, uv_stride, width, height); - } else if (format == ALPHA_PLANE_ONLY) { - ok &= WriteAlphaPlane(fout, out, width, height); - } - if (fout) - fclose(fout); - if (ok) { - printf("Saved file %s\n", out_file); - if (verbose) { - const double time = StopwatchReadAndReset(&stop_watch); - printf("Time to write output: %.3fs\n", time); - } - } else { - fprintf(stderr, "Error writing file %s !!\n", out_file); - } - } else { - fprintf(stderr, "Error opening output file %s\n", out_file); - } + printf("Decoded %s. Dimensions: %d x %d%s. 
Now saving...\n", in_file, + output_buffer->width, output_buffer->height, + bitstream->has_alpha ? " (with alpha)" : ""); + SaveOutput(output_buffer, format, out_file); } else { - printf("File %s can be decoded (dimensions: %d x %d).\n", - in_file, width, height); + printf("File %s can be decoded (dimensions: %d x %d)%s.\n", + in_file, output_buffer->width, output_buffer->height, + bitstream->has_alpha ? " (with alpha)" : ""); printf("Nothing written; use -o flag to save the result as e.g. PNG.\n"); } - free(out); + WebPFreeDecBuffer(output_buffer); return 0; } diff --git a/makefile.unix b/makefile.unix index bf57f705..9c0dc757 100644 --- a/makefile.unix +++ b/makefile.unix @@ -56,7 +56,8 @@ OBJS = src/enc/webpenc.o src/enc/bit_writer.o src/enc/syntax.o \ src/enc/layer.o \ src/dec/bits.o src/dec/dsp.o src/dec/dsp_sse2.o src/dec/frame.o \ src/dec/webp.o src/dec/quant.o src/dec/tree.o src/dec/vp8.o \ - src/dec/yuv.o src/dec/idec.o src/dec/alpha.o src/dec/layer.o + src/dec/yuv.o src/dec/idec.o src/dec/alpha.o src/dec/layer.o \ + src/dec/io.o src/dec/buffer.o HDRS = src/webp/encode.h src/enc/vp8enci.h src/enc/bit_writer.h \ src/enc/cost.h src/dec/bits.h src/dec/vp8i.h src/dec/yuv.h OUTPUT = examples/cwebp examples/dwebp src/libwebp.a diff --git a/man/cwebp.1 b/man/cwebp.1 index 80bd90a8..b7f02b65 100644 --- a/man/cwebp.1 +++ b/man/cwebp.1 @@ -1,5 +1,5 @@ .\" Hey, EMACS: -*- nroff -*- -.TH CWEBP 1 "March 28, 2011" +.TH CWEBP 1 "June 20, 2011" .SH NAME cwebp \- compress an image file to a WebP file .SH SYNOPSIS @@ -102,8 +102,8 @@ options \fB\-size\fP or \fB\-psnr\fP. Maximum value is 10. .TP .B \-crop x_position y_position width height Crop the source to a rectangle with top-left corner at coordinates -(x_position, y_position) and size width x height. This cropping area must -be fully contained within the source rectangle. +(\fBx_position\fP, \fBy_position\fP) and size \fBwidth\fP x \fBheight\fP. +This cropping area must be fully contained within the source rectangle. 
.TP .B \-s width height Specify that the input file actually consists of raw Y'CbCr samples following diff --git a/man/dwebp.1 b/man/dwebp.1 index af6003d5..b0a923fb 100644 --- a/man/dwebp.1 +++ b/man/dwebp.1 @@ -1,5 +1,5 @@ .\" Hey, EMACS: -*- nroff -*- -.TH DWEBP 1 "March 28, 2011" +.TH DWEBP 1 "June 20, 2011" .SH NAME dwebp \- decompress a WebP file to an image file .SH SYNOPSIS @@ -32,6 +32,29 @@ Change the output format to PGM. The output consist of luma/chroma samples instead of RGB, using the ICM4 layout. This option is mainly for verification and debugging purpose. .TP +.B \-nofancy +Don't use the fancy upscaler for YUV420. This may lead to jaggy +edges (especially the red ones), but should be faster. +.TP +.B \-nofilter +Don't use the in-loop filtering process even if it is required by +the bitstream. This may produce visible blocks on the non-compliant output, +but will make the decoding faster. +.TP +.B \-crop x_position y_position width height +Crop the decoded picture to a rectangle with top-left corner at coordinates +(\fBx_position\fP, \fBy_position\fP) and size \fBwidth\fP x \fBheight\fP. +This cropping area must be fully contained within the source rectangle. +The top-left corner will be snapped to even coordinates if needed. +This option is meant to reduce the memory needed for cropping large images. +Note: the cropping is applied \fIbefore\fP any scaling. +.TP +.B \-scale width height +Rescale the decoded picture to dimension \fBwidth\fP x \fBheight\fP. This option is +mostly intended to reduce the memory needed to decode large images, +when only a small version is needed (thumbnail, preview, etc.). +Note: scaling is applied \fIafter\fP cropping. +.TP .B \-v Print extra information (decoding time in particular). 
.TP diff --git a/src/dec/Makefile.am b/src/dec/Makefile.am index 6fb27588..e859090c 100644 --- a/src/dec/Makefile.am +++ b/src/dec/Makefile.am @@ -2,7 +2,7 @@ AM_CPPFLAGS = -I$(top_srcdir)/src libwebpdecode_la_SOURCES = bits.h vp8i.h yuv.h bits.c dsp.c dsp_sse2.c frame.c \ quant.c tree.c vp8.c webp.c yuv.c idec.c alpha.c \ - layer.c + layer.c io.c buffer.c libwebpdecode_la_LDFLAGS = -version-info 0:0:0 libwebpdecode_la_CPPFLAGS = $(USE_EXPERIMENTAL_CODE) libwebpdecodeinclude_HEADERS = ../webp/decode.h ../webp/decode_vp8.h ../webp/types.h diff --git a/src/dec/buffer.c b/src/dec/buffer.c new file mode 100644 index 00000000..4ea82baa --- /dev/null +++ b/src/dec/buffer.c @@ -0,0 +1,201 @@ +// Copyright 2011 Google Inc. +// +// This code is licensed under the same terms as WebM: +// Software License Agreement: http://www.webmproject.org/license/software/ +// Additional IP Rights Grant: http://www.webmproject.org/license/additional/ +// ----------------------------------------------------------------------------- +// +// Everything about WebPDecBuffer +// +// Author: Skal (pascal.massimino@gmail.com) + +#include +#include "vp8i.h" +#include "webpi.h" + +#if defined(__cplusplus) || defined(c_plusplus) +extern "C" { +#endif + +//----------------------------------------------------------------------------- +// WebPDecBuffer + +static VP8StatusCode CheckDecBuffer(const WebPDecBuffer* const buffer) { + int ok = 1; + WEBP_CSP_MODE mode = buffer->colorspace; + const int width = buffer->width; + const int height = buffer->height; + if (mode >= MODE_YUV) { // YUV checks + const WebPYUVABuffer* const buf = &buffer->u.YUVA; + const int size = buf->y_stride * height; + const int u_size = buf->u_stride * ((height + 1) / 2); + const int v_size = buf->v_stride * ((height + 1) / 2); + const int a_size = buf->a_stride * height; + ok &= (size <= buf->y_size); + ok &= (u_size <= buf->u_size); + ok &= (v_size <= buf->v_size); + ok &= (a_size <= buf->a_size); + ok &= (buf->y_stride >= 
width); + ok &= (buf->u_stride >= (width + 1) / 2); + ok &= (buf->v_stride >= (width + 1) / 2); + if (buf->a) { + ok &= (buf->a_stride >= width); + } + } else { // RGB checks + const WebPRGBABuffer* const buf = &buffer->u.RGBA; + ok &= (buf->stride * height <= buf->size); + if (mode == MODE_RGB || mode == MODE_BGR) { + ok &= (buf->stride >= width * 3); + } else if (mode == MODE_RGBA || mode == MODE_BGRA) { + ok &= (buf->stride >= width * 4); + } + } + return ok ? VP8_STATUS_OK : VP8_STATUS_INVALID_PARAM; +} + +static VP8StatusCode AllocateBuffer(WebPDecBuffer* const buffer) { + const int w = buffer->width; + const int h = buffer->height; + + if (w <= 0 || h <= 0) { + return VP8_STATUS_INVALID_PARAM; + } + + if (!buffer->is_external_memory && buffer->memory == NULL) { + uint8_t* output; + WEBP_CSP_MODE mode = buffer->colorspace; + int stride; + int uv_stride = 0, a_stride = 0; + int uv_size = 0; + uint64_t size, a_size = 0, total_size; + // We need memory and it hasn't been allocated yet. + // => initialize output buffer, now that dimensions are known. + stride = (mode == MODE_RGB || mode == MODE_BGR) ? 3 * w + : (mode == MODE_RGBA || mode == MODE_BGRA) ? 
4 * w + : w; + size = (uint64_t)stride * h; + + if (mode >= MODE_YUV) { + uv_stride = (w + 1) / 2; + uv_size = (uint64_t)uv_stride * ((h + 1) / 2); + if (mode == MODE_YUVA) { + a_stride = w; + a_size = (uint64_t)a_stride * h; + } + } + total_size = size + 2 * uv_size + a_size; + + // Security/sanity checks + if (((size_t)total_size != total_size) || (total_size >= (1ULL << 40))) { + return VP8_STATUS_INVALID_PARAM; + } + + buffer->memory = output = (uint8_t*)malloc((size_t)total_size); + if (output == NULL) { + return VP8_STATUS_OUT_OF_MEMORY; + } + + if (mode >= MODE_YUV) { // YUVA initialization + WebPYUVABuffer* const buf = &buffer->u.YUVA; + buf->y = output; + buf->y_stride = stride; + buf->y_size = size; + buf->u = output + size; + buf->u_stride = uv_stride; + buf->u_size = uv_size; + buf->v = output + size + uv_size; + buf->v_stride = uv_stride; + buf->v_size = uv_size; + if (mode == MODE_YUVA) { + buf->a = output + size + 2 * uv_size; + } + buf->a_size = a_size; + buf->a_stride = a_stride; + } else { // RGBA initialization + WebPRGBABuffer* const buf = &buffer->u.RGBA; + buf->rgba = output; + buf->stride = stride; + buf->size = size; + } + } + return CheckDecBuffer(buffer); +} + +VP8StatusCode WebPAllocateDecBuffer(int w, int h, + const WebPDecoderOptions* const options, + WebPDecBuffer* const out) { + if (out == NULL || w <= 0 || h <= 0) { + return VP8_STATUS_INVALID_PARAM; + } + if (options != NULL) { // First, apply options if there is any. + if (options->use_cropping) { + const int cw = options->crop_width; + const int ch = options->crop_height; + const int x = options->crop_left & ~1; + const int y = options->crop_top & ~1; + if (x < 0 || y < 0 || cw <= 0 || ch <= 0 || x + cw > w || y + ch > h) { + return VP8_STATUS_INVALID_PARAM; // out of frame boundary. 
+ } + w = cw; + h = ch; + } + if (options->use_scaling) { + if (options->scaled_width <= 0 || options->scaled_height <= 0) { + return VP8_STATUS_INVALID_PARAM; + } + w = options->scaled_width; + h = options->scaled_height; + } + } + out->width = w; + out->height = h; + + // Then, allocate buffer for real + return AllocateBuffer(out); +} + +//----------------------------------------------------------------------------- +// constructors / destructors + +int WebPInitDecBufferInternal(WebPDecBuffer* const buffer, int version) { + if (version != WEBP_DECODER_ABI_VERSION) return 0; // version mismatch + if (!buffer) return 0; + memset(buffer, 0, sizeof(*buffer)); + return 1; +} + +void WebPFreeDecBuffer(WebPDecBuffer* const buffer) { + if (buffer) { + if (!buffer->is_external_memory) + free(buffer->memory); + buffer->memory = NULL; + } +} + +void WebPCopyDecBuffer(const WebPDecBuffer* const src, + WebPDecBuffer* const dst) { + if (src && dst) { + *dst = *src; + if (src->memory) { + dst->is_external_memory = 1; // dst buffer doesn't own the memory. + dst->memory = NULL; + } + } +} + +// Copy and transfer ownership from src to dst (beware of parameter order!) 
+void WebPGrabDecBuffer(WebPDecBuffer* const src, WebPDecBuffer* const dst) { + if (src && dst) { + *dst = *src; + if (src->memory) { + src->is_external_memory = 1; // src relinquishes ownership + src->memory = NULL; + } + } +} + +//----------------------------------------------------------------------------- + +#if defined(__cplusplus) || defined(c_plusplus) +} // extern "C" +#endif diff --git a/src/dec/frame.c b/src/dec/frame.c index 46d735f8..29a0f757 100644 --- a/src/dec/frame.c +++ b/src/dec/frame.c @@ -18,7 +18,7 @@ extern "C" { #define ALIGN_MASK (32 - 1) -//----------------------------------------------------------------------------- +//------------------------------------------------------------------------------ // Memory setup // kFilterExtraRows[] = How many extra lines are needed on the MB boundary @@ -101,15 +101,13 @@ int VP8InitFrame(VP8Decoder* const dec, VP8Io* io) { memset(dec->intra_t_, B_DC_PRED, intra_pred_mode_size); // prepare 'io' - io->width = dec->pic_hdr_.width_; - io->height = dec->pic_hdr_.height_; io->mb_y = 0; io->y = dec->cache_y_; io->u = dec->cache_u_; io->v = dec->cache_v_; io->y_stride = dec->cache_y_stride_; io->uv_stride = dec->cache_uv_stride_; - io->fancy_upscaling = 0; // default + io->fancy_upsampling = 0; // default io->a = NULL; // Init critical function pointers and look-up tables. 
@@ -119,7 +117,7 @@ int VP8InitFrame(VP8Decoder* const dec, VP8Io* io) { return 1; } -//----------------------------------------------------------------------------- +//------------------------------------------------------------------------------ // Filtering static inline int hev_thresh_from_level(int level, int keyframe) { @@ -130,7 +128,7 @@ static inline int hev_thresh_from_level(int level, int keyframe) { } } -static void DoFilter(VP8Decoder* const dec, int mb_x, int mb_y) { +static void DoFilter(const VP8Decoder* const dec, int mb_x, int mb_y) { VP8MB* const mb = dec->mb_info_ + mb_x; uint8_t* const y_dst = dec->cache_y_ + mb_x * 16; const int y_bps = dec->cache_y_stride_; @@ -178,6 +176,19 @@ static void DoFilter(VP8Decoder* const dec, int mb_x, int mb_y) { } } +void VP8FilterRow(const VP8Decoder* const dec) { + int mb_x; + assert(dec->filter_type_ > 0); + if (dec->mb_y_ < dec->tl_mb_y_ || dec->mb_y_ > dec->br_mb_y_) { + return; + } + for (mb_x = dec->tl_mb_x_; mb_x < dec->br_mb_x_; ++mb_x) { + DoFilter(dec, mb_x, dec->mb_y_); + } +} + +//------------------------------------------------------------------------------ + void VP8StoreBlock(VP8Decoder* const dec) { if (dec->filter_type_ > 0) { VP8MB* const info = dec->mb_info_ + dec->mb_x_; @@ -225,24 +236,31 @@ void VP8StoreBlock(VP8Decoder* const dec) { } } +//------------------------------------------------------------------------------ +// This function is called after a row of macroblocks is finished decoding. +// It also takes into account the following restrictions: +// * In case of in-loop filtering, we must hold off sending some of the bottom +// pixels as they are yet unfiltered. They will be when the next macroblock +// row is decoded. Meanwhile, we must preserve them by rotating them in the +// cache area. This doesn't hold for the very bottom row of the uncropped +// picture of course. +// * we must clip the remaining pixels against the cropping area. 
The VP8Io +// struct must have the following fields set correctly before calling put(): + +#define MACROBLOCK_VPOS(mb_y) ((mb_y) * 16) // vertical position of a MB + int VP8FinishRow(VP8Decoder* const dec, VP8Io* io) { const int extra_y_rows = kFilterExtraRows[dec->filter_type_]; const int ysize = extra_y_rows * dec->cache_y_stride_; const int uvsize = (extra_y_rows / 2) * dec->cache_uv_stride_; - const int first_row = (dec->mb_y_ == 0); - const int last_row = (dec->mb_y_ >= dec->mb_h_ - 1); uint8_t* const ydst = dec->cache_y_ - ysize; uint8_t* const udst = dec->cache_u_ - uvsize; uint8_t* const vdst = dec->cache_v_ - uvsize; - if (dec->filter_type_ > 0) { - int mb_x; - for (mb_x = 0; mb_x < dec->mb_w_; ++mb_x) { - DoFilter(dec, mb_x, dec->mb_y_); - } - } + const int first_row = (dec->mb_y_ == 0); + const int last_row = (dec->mb_y_ >= dec->br_mb_y_ - 1); + int y_start = MACROBLOCK_VPOS(dec->mb_y_); + int y_end = MACROBLOCK_VPOS(dec->mb_y_ + 1); if (io->put) { - int y_start = dec->mb_y_ * 16; - int y_end = y_start + 16; if (!first_row) { y_start -= extra_y_rows; io->y = ydst; @@ -253,14 +271,13 @@ int VP8FinishRow(VP8Decoder* const dec, VP8Io* io) { io->u = dec->cache_u_; io->v = dec->cache_v_; } + if (!last_row) { y_end -= extra_y_rows; } - if (y_end > io->height) { - y_end = io->height; + if (y_end > io->crop_bottom) { + y_end = io->crop_bottom; // make sure we don't overflow on last row. 
} - io->mb_y = y_start; - io->mb_h = y_end - y_start; io->a = NULL; #ifdef WEBP_EXPERIMENTAL_FEATURES if (dec->alpha_data_) { @@ -271,11 +288,33 @@ int VP8FinishRow(VP8Decoder* const dec, VP8Io* io) { } } #endif - if (!io->put(io)) { - return 0; + if (y_start < io->crop_top) { + const int delta_y = io->crop_top - y_start; + y_start = io->crop_top; + assert(!(delta_y & 1)); + io->y += dec->cache_y_stride_ * delta_y; + io->u += dec->cache_uv_stride_ * (delta_y >> 1); + io->v += dec->cache_uv_stride_ * (delta_y >> 1); + if (io->a) { + io->a += io->width * delta_y; + } + } + if (y_start < y_end) { + io->y += io->crop_left; + io->u += io->crop_left >> 1; + io->v += io->crop_left >> 1; + if (io->a) { + io->a += io->crop_left; + } + io->mb_y = y_start - io->crop_top; + io->mb_w = io->crop_right - io->crop_left; + io->mb_h = y_end - y_start; + if (!io->put(io)) { + return 0; + } } } - // rotate top samples + // rotate top samples if (!last_row) { memcpy(ydst, ydst + 16 * dec->cache_y_stride_, ysize); memcpy(udst, udst + 8 * dec->cache_uv_stride_, uvsize); @@ -284,7 +323,60 @@ int VP8FinishRow(VP8Decoder* const dec, VP8Io* io) { return 1; } -//----------------------------------------------------------------------------- +#undef MACROBLOCK_VPOS + +//------------------------------------------------------------------------------ +// Finish setting up the decoding parameter once user's setup() is called. + +VP8StatusCode VP8FinishFrameSetup(VP8Decoder* const dec, VP8Io* const io) { + // Call setup() first. This may trigger additional decoding features on 'io'. + if (io->setup && !io->setup(io)) { + VP8SetError(dec, VP8_STATUS_USER_ABORT, "Frame setup failed"); + return dec->status_; + } + + // Disable filtering per user request + if (io->bypass_filtering) { + dec->filter_type_ = 0; + } + // TODO(skal): filter type / strength / sharpness forcing + + // Define the area where we can skip in-loop filtering, in case of cropping. 
+  //
+  // 'Simple' filter reads two luma samples outside of the macroblock
+  // and filters one. It doesn't filter the chroma samples. Hence, we can
+  // avoid doing the in-loop filtering before crop_top/crop_left position.
+  // For the 'Complex' filter, 3 samples are read and up to 3 are filtered.
+  // Means: there's a dependency chain that goes all the way up to the
+  // top-left corner of the picture (MB #0). We must filter all the previous
+  // macroblocks.
+  // TODO(skal): add an 'approximate_decoding' option, that won't produce
+  // a 1:1 bit-exactness for complex filtering?
+  {
+    const int extra_pixels = kFilterExtraRows[dec->filter_type_];
+    if (dec->filter_type_ == 2) {
+      // For complex filter, we need to preserve the dependency chain.
+      dec->tl_mb_x_ = 0;
+      dec->tl_mb_y_ = 0;
+    } else {
+      // For simple filter, we can filter only the cropped region.
+      dec->tl_mb_y_ = io->crop_top >> 4;
+      dec->tl_mb_x_ = io->crop_left >> 4;
+    }
+    // We need some 'extra' pixels on the right/bottom.
+    dec->br_mb_y_ = (io->crop_bottom + 15 + extra_pixels) >> 4;
+    dec->br_mb_x_ = (io->crop_right + 15 + extra_pixels) >> 4;
+    if (dec->br_mb_x_ > dec->mb_w_) {
+      dec->br_mb_x_ = dec->mb_w_;
+    }
+    if (dec->br_mb_y_ > dec->mb_h_) {
+      dec->br_mb_y_ = dec->mb_h_;
+    }
+  }
+  return VP8_STATUS_OK;
+}
+
+//------------------------------------------------------------------------------
+// Main reconstruction function.
static const int kScan[16] = { @@ -431,7 +523,7 @@ void VP8ReconstructBlock(VP8Decoder* const dec) { } } -//----------------------------------------------------------------------------- +//------------------------------------------------------------------------------ #if defined(__cplusplus) || defined(c_plusplus) } // extern "C" diff --git a/src/dec/idec.c b/src/dec/idec.c index b33007bc..a2022219 100644 --- a/src/dec/idec.c +++ b/src/dec/idec.c @@ -15,7 +15,6 @@ #include "webpi.h" #include "vp8i.h" -#include "yuv.h" #if defined(__cplusplus) || defined(c_plusplus) extern "C" { @@ -56,12 +55,12 @@ typedef struct { struct WebPIDecoder { DecState state_; // current decoding state - int w_, h_; // width and height WebPDecParams params_; // Params to store output info VP8Decoder* dec_; VP8Io io_; - MemBuffer mem_; // memory buffer + MemBuffer mem_; // input memory buffer. + WebPDecBuffer output_; // output buffer (when no external one is supplied) }; // MB context to restore in case VP8DecodeMB() fails @@ -236,24 +235,23 @@ static VP8StatusCode IDecError(WebPIDecoder* idec, VP8StatusCode error) { // Header static VP8StatusCode DecodeHeader(WebPIDecoder* const idec) { - int width, height; - uint32_t curr_size, riff_header_size, bits; - WebPDecParams* params = &idec->params_; + uint32_t riff_header_size, bits; const uint8_t* data = idec->mem_.buf_ + idec->mem_.start_; + uint32_t curr_size = MemDataSize(&idec->mem_); + uint32_t chunk_size; - if (MemDataSize(&idec->mem_) < WEBP_HEADER_SIZE) { + if (curr_size < WEBP_HEADER_SIZE) { return VP8_STATUS_SUSPENDED; } - if (!WebPInitDecParams(data, idec->mem_.end_, &width, &height, params)) { + // Validate and Skip over RIFF header + chunk_size = WebPCheckRIFFHeader(&data, &curr_size); + if (chunk_size == 0 || + curr_size < VP8_HEADER_SIZE || + !VP8GetInfo(data, curr_size, chunk_size, NULL, NULL, NULL)) { return IDecError(idec, VP8_STATUS_BITSTREAM_ERROR); } - // Validate and Skip over RIFF header - curr_size = 
MemDataSize(&idec->mem_); - if (!WebPCheckRIFFHeader(&data, &curr_size)) { - return IDecError(idec, VP8_STATUS_BITSTREAM_ERROR); - } riff_header_size = idec->mem_.end_ - curr_size; bits = data[0] | (data[1] << 8) | (data[2] << 16); @@ -261,8 +259,6 @@ static VP8StatusCode DecodeHeader(WebPIDecoder* const idec) { idec->mem_.start_ += riff_header_size; assert(idec->mem_.start_ <= idec->mem_.end_); - idec->w_ = width; - idec->h_ = height; idec->io_.data_size -= riff_header_size; idec->io_.data = data; idec->state_ = STATE_PARTS0; @@ -298,14 +294,13 @@ static VP8StatusCode DecodePartition0(WebPIDecoder* const idec) { VP8Decoder* const dec = idec->dec_; VP8Io* const io = &idec->io_; const WebPDecParams* const params = &idec->params_; - const WEBP_CSP_MODE mode = params->mode; + WebPDecBuffer* const output = params->output; // Wait till we have enough data for the whole partition #0 if (MemDataSize(&idec->mem_) < idec->mem_.part0_size_) { return VP8_STATUS_SUSPENDED; } - io->opaque = &idec->params_; if (!VP8GetHeaders(dec, io)) { const VP8StatusCode status = dec->status_; if (status == VP8_STATUS_SUSPENDED || @@ -316,29 +311,26 @@ static VP8StatusCode DecodePartition0(WebPIDecoder* const idec) { return IDecError(idec, status); } - if (!WebPCheckDecParams(io, params)) { - return IDecError(idec, VP8_STATUS_INVALID_PARAM); + // Allocate/Verify output buffer now + dec->status_ = WebPAllocateDecBuffer(io->width, io->height, params->options, + output); + if (dec->status_ != VP8_STATUS_OK) { + return IDecError(idec, dec->status_); } - if (mode != MODE_YUV) { - VP8YUVInit(); - } - - // allocate memory and prepare everything. + // Allocate memory and prepare everything. 
if (!VP8InitFrame(dec, io)) { - return IDecError(idec, VP8_STATUS_OUT_OF_MEMORY); - } - if (io->setup && !io->setup(io)) { - return IDecError(idec, VP8_STATUS_USER_ABORT); + return IDecError(idec, dec->status_); } - // disable filtering per user request (_after_ setup() is called) - if (io->bypass_filtering) dec->filter_type_ = 0; + // Finish setting up the decoding parameter + if (VP8FinishFrameSetup(dec, io) != VP8_STATUS_OK) { + return IDecError(idec, dec->status_); + } if (!CopyParts0Data(idec)) { return IDecError(idec, VP8_STATUS_OUT_OF_MEMORY); } - idec->state_ = STATE_DATA; return VP8_STATUS_OK; } @@ -383,6 +375,9 @@ static VP8StatusCode DecodeRemaining(WebPIDecoder* const idec) { assert(idec->mem_.start_ <= idec->mem_.end_); } } + if (dec->filter_type_ > 0) { + VP8FilterRow(dec); + } if (!VP8FinishRow(dec, io)) { return IDecError(idec, VP8_STATUS_USER_ABORT); } @@ -410,7 +405,7 @@ static VP8StatusCode IDecode(WebPIDecoder* idec) { status = DecodePartition0(idec); } if (idec->state_ == STATE_DATA) { - return DecodeRemaining(idec); + status = DecodeRemaining(idec); } return status; } @@ -418,9 +413,11 @@ static VP8StatusCode IDecode(WebPIDecoder* idec) { //------------------------------------------------------------------------------ // Public functions -WebPIDecoder* WebPINew(WEBP_CSP_MODE mode) { +WebPIDecoder* WebPINewDecoder(WebPDecBuffer* const output_buffer) { WebPIDecoder* idec = (WebPIDecoder*)calloc(1, sizeof(WebPIDecoder)); - if (!idec) return NULL; + if (idec == NULL) { + return NULL; + } idec->dec_ = VP8New(); if (idec->dec_ == NULL) { @@ -430,53 +427,87 @@ WebPIDecoder* WebPINew(WEBP_CSP_MODE mode) { idec->state_ = STATE_HEADER; - WebPResetDecParams(&idec->params_); - idec->params_.mode = mode; - InitMemBuffer(&idec->mem_); + WebPInitDecBuffer(&idec->output_); VP8InitIo(&idec->io_); - WebPInitCustomIo(&idec->io_); + + WebPResetDecParams(&idec->params_); + idec->params_.output = output_buffer ? 
output_buffer : &idec->output_; + WebPInitCustomIo(&idec->params_, &idec->io_); // Plug the I/O functions. + + return idec; +} + +WebPIDecoder* WebPIDecode(const uint8_t* data, uint32_t data_size, + WebPDecoderConfig* const config) { + WebPIDecoder* idec; + + // Parse the bitstream's features, if requested: + if (data != NULL && data_size > 0 && config != NULL) { + if (WebPGetFeatures(data, data_size, &config->input) != VP8_STATUS_OK) { + return NULL; + } + } + // Create an instance of the incremental decoder + idec = WebPINewDecoder(config ? &config->output : NULL); + if (!idec) { + return NULL; + } + // Finish initialization + if (config != NULL) { + idec->params_.options = &config->options; + } return idec; } void WebPIDelete(WebPIDecoder* const idec) { if (!idec) return; VP8Delete(idec->dec_); - WebPClearDecParams(&idec->params_); ClearMemBuffer(&idec->mem_); + WebPFreeDecBuffer(&idec->output_); free(idec); } //------------------------------------------------------------------------------ +// Wrapper toward WebPINewDecoder + +WebPIDecoder* WebPINew(WEBP_CSP_MODE mode) { + WebPIDecoder* const idec = WebPINewDecoder(NULL); + if (!idec) return NULL; + idec->output_.colorspace = mode; + return idec; +} WebPIDecoder* WebPINewRGB(WEBP_CSP_MODE mode, uint8_t* output_buffer, int output_buffer_size, int output_stride) { WebPIDecoder* idec; - if (mode == MODE_YUV) return NULL; - idec = WebPINew(mode); - if (idec == NULL) return NULL; - idec->params_.output = output_buffer; - idec->params_.stride = output_stride; - idec->params_.output_size = output_buffer_size; - idec->params_.external_buffer = 1; + if (mode >= MODE_YUV) return NULL; + idec = WebPINewDecoder(NULL); + if (!idec) return NULL; + idec->output_.colorspace = mode; + idec->output_.is_external_memory = 1; + idec->output_.u.RGBA.rgba = output_buffer; + idec->output_.u.RGBA.stride = output_stride; + idec->output_.u.RGBA.size = output_buffer_size; return idec; } WebPIDecoder* WebPINewYUV(uint8_t* luma, int 
luma_size, int luma_stride, uint8_t* u, int u_size, int u_stride, uint8_t* v, int v_size, int v_stride) { - WebPIDecoder* idec = WebPINew(MODE_YUV); - if (idec == NULL) return NULL; - idec->params_.output = luma; - idec->params_.stride = luma_stride; - idec->params_.output_size = luma_size; - idec->params_.u = u; - idec->params_.u_stride = u_stride; - idec->params_.output_u_size = u_size; - idec->params_.v = v; - idec->params_.v_stride = v_stride; - idec->params_.output_v_size = v_size; - idec->params_.external_buffer = 1; + WebPIDecoder* const idec = WebPINewDecoder(NULL); + if (!idec) return NULL; + idec->output_.colorspace = MODE_YUV; + idec->output_.is_external_memory = 1; + idec->output_.u.YUVA.y = luma; + idec->output_.u.YUVA.y_stride = luma_stride; + idec->output_.u.YUVA.y_size = luma_size; + idec->output_.u.YUVA.u = u; + idec->output_.u.YUVA.u_stride = u_stride; + idec->output_.u.YUVA.u_size = u_size; + idec->output_.u.YUVA.v = v; + idec->output_.u.YUVA.v_stride = v_stride; + idec->output_.u.YUVA.v_size = v_size; return idec; } @@ -540,38 +571,54 @@ VP8StatusCode WebPIUpdate(WebPIDecoder* const idec, const uint8_t* data, //------------------------------------------------------------------------------ -uint8_t* WebPIDecGetRGB(const WebPIDecoder* const idec, int *last_y, int* width, - int* height, int* stride) { - if (!idec || !idec->dec_ || idec->params_.mode == MODE_YUV || - idec->state_ <= STATE_PARTS0) { +static const WebPDecBuffer* GetOutputBuffer(const WebPIDecoder* const idec) { + if (!idec || !idec->dec_ || idec->state_ <= STATE_PARTS0) { return NULL; } - - if (last_y) *last_y = idec->params_.last_y; - if (width) *width = idec->w_; - if (height) *height = idec->h_; - if (stride) *stride = idec->params_.stride; - return idec->params_.output; } -uint8_t* WebPIDecGetYUV(const WebPIDecoder* const idec, int *last_y, - uint8_t** u, uint8_t** v, int* width, int* height, - int *stride, int* uv_stride) { - if (!idec || !idec->dec_ || idec->params_.mode != 
MODE_YUV || - idec->state_ <= STATE_PARTS0) { +const WebPDecBuffer* WebPIDecGetSamples(const WebPIDecoder* const idec, + int* last_y) { + const WebPDecBuffer* const src = GetOutputBuffer(idec); + if (last_y) *last_y = idec->params_.last_y; + return src; +} + +uint8_t* WebPIDecGetRGB(const WebPIDecoder* const idec, int* last_y, + int* width, int* height, int* stride) { + const WebPDecBuffer* const src = GetOutputBuffer(idec); + if (!src) return NULL; + if (src->colorspace >= MODE_YUV) { return NULL; } if (last_y) *last_y = idec->params_.last_y; - if (u) *u = idec->params_.u; - if (v) *v = idec->params_.v; - if (width) *width = idec->w_; - if (height) *height = idec->h_; - if (stride) *stride = idec->params_.stride; - if (uv_stride) *uv_stride = idec->params_.u_stride; + if (width) *width = src->width; + if (height) *height = src->height; + if (stride) *stride = src->u.RGBA.stride; - return idec->params_.output; + return src->u.RGBA.rgba; +} + +uint8_t* WebPIDecGetYUV(const WebPIDecoder* const idec, int* last_y, + uint8_t** u, uint8_t** v, + int* width, int* height, int *stride, int* uv_stride) { + const WebPDecBuffer* const src = GetOutputBuffer(idec); + if (!src) return NULL; + if (src->colorspace < MODE_YUV) { + return NULL; + } + + if (last_y) *last_y = idec->params_.last_y; + if (u) *u = src->u.YUVA.u; + if (v) *v = src->u.YUVA.v; + if (width) *width = src->width; + if (height) *height = src->height; + if (stride) *stride = src->u.YUVA.y_stride; + if (uv_stride) *uv_stride = src->u.YUVA.u_stride; + + return src->u.YUVA.y; } #if defined(__cplusplus) || defined(c_plusplus) diff --git a/src/dec/io.c b/src/dec/io.c new file mode 100644 index 00000000..80233ae6 --- /dev/null +++ b/src/dec/io.c @@ -0,0 +1,845 @@ +// Copyright 2011 Google Inc. 
+//
+// This code is licensed under the same terms as WebM:
+//  Software License Agreement:  http://www.webmproject.org/license/software/
+//  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
+// -----------------------------------------------------------------------------
+//
+// functions for sample output.
+//
+// Author: Skal (pascal.massimino@gmail.com)
+
+#include <assert.h>
+#include <stdlib.h>
+#include "vp8i.h"
+#include "webpi.h"
+#include "yuv.h"
+
+#if defined(__cplusplus) || defined(c_plusplus)
+extern "C" {
+#endif
+
+#define FANCY_UPSAMPLING   // undefined to remove fancy upsampling support
+
+// mask to apply to WEBP_CSP_MODE, to know if there's alpha channel or not.
+#define MODE_ALPHA_MASK 1
+
+//------------------------------------------------------------------------------
+// Fancy upsampler
+
+#ifdef FANCY_UPSAMPLING
+
+// Given samples laid out in a square as:
+//  [a b]
+//  [c d]
+// we interpolate u/v as:
+//  ([9*a + 3*b + 3*c +   d    3*a + 9*b +   c + 3*d] + [8 8]) / 16
+//  ([3*a +   b + 9*c + 3*d      a + 3*b + 3*c + 9*d]   [8 8]) / 16
+
+// We process u and v together stashed into 32bit (16bit each).
+#define LOAD_UV(u,v) ((u) | ((v) << 16)) + +#define UPSAMPLE_FUNC(FUNC_NAME, FUNC, XSTEP) \ +static void FUNC_NAME(const uint8_t* top_y, const uint8_t* bottom_y, \ + const uint8_t* top_u, const uint8_t* top_v, \ + const uint8_t* cur_u, const uint8_t* cur_v, \ + uint8_t* top_dst, uint8_t* bottom_dst, int len) { \ + int x; \ + const int last_pixel_pair = (len - 1) >> 1; \ + uint32_t tl_uv = LOAD_UV(top_u[0], top_v[0]); /* top-left sample */ \ + uint32_t l_uv = LOAD_UV(cur_u[0], cur_v[0]); /* left-sample */ \ + if (top_y) { \ + const uint32_t uv0 = (3 * tl_uv + l_uv + 0x00020002u) >> 2; \ + FUNC(top_y[0], uv0 & 0xff, (uv0 >> 16), top_dst); \ + } \ + if (bottom_y) { \ + const uint32_t uv0 = (3 * l_uv + tl_uv + 0x00020002u) >> 2; \ + FUNC(bottom_y[0], uv0 & 0xff, (uv0 >> 16), bottom_dst); \ + } \ + for (x = 1; x <= last_pixel_pair; ++x) { \ + const uint32_t t_uv = LOAD_UV(top_u[x], top_v[x]); /* top sample */ \ + const uint32_t uv = LOAD_UV(cur_u[x], cur_v[x]); /* sample */ \ + /* precompute invariant values associated with first and second diagonals*/\ + const uint32_t avg = tl_uv + t_uv + l_uv + uv + 0x00080008u; \ + const uint32_t diag_12 = (avg + 2 * (t_uv + l_uv)) >> 3; \ + const uint32_t diag_03 = (avg + 2 * (tl_uv + uv)) >> 3; \ + if (top_y) { \ + const uint32_t uv0 = (diag_12 + tl_uv) >> 1; \ + const uint32_t uv1 = (diag_03 + t_uv) >> 1; \ + FUNC(top_y[2 * x - 1], uv0 & 0xff, (uv0 >> 16), \ + top_dst + (2 * x - 1) * XSTEP); \ + FUNC(top_y[2 * x - 0], uv1 & 0xff, (uv1 >> 16), \ + top_dst + (2 * x - 0) * XSTEP); \ + } \ + if (bottom_y) { \ + const uint32_t uv0 = (diag_03 + l_uv) >> 1; \ + const uint32_t uv1 = (diag_12 + uv) >> 1; \ + FUNC(bottom_y[2 * x - 1], uv0 & 0xff, (uv0 >> 16), \ + bottom_dst + (2 * x - 1) * XSTEP); \ + FUNC(bottom_y[2 * x + 0], uv1 & 0xff, (uv1 >> 16), \ + bottom_dst + (2 * x + 0) * XSTEP); \ + } \ + tl_uv = t_uv; \ + l_uv = uv; \ + } \ + if (!(len & 1)) { \ + if (top_y) { \ + const uint32_t uv0 = (3 * tl_uv + l_uv + 0x00020002u) >> 2; \ + 
FUNC(top_y[len - 1], uv0 & 0xff, (uv0 >> 16), \ + top_dst + (len - 1) * XSTEP); \ + } \ + if (bottom_y) { \ + const uint32_t uv0 = (3 * l_uv + tl_uv + 0x00020002u) >> 2; \ + FUNC(bottom_y[len - 1], uv0 & 0xff, (uv0 >> 16), \ + bottom_dst + (len - 1) * XSTEP); \ + } \ + } \ +} + +// All variants implemented. +UPSAMPLE_FUNC(UpsampleRgbLinePair, VP8YuvToRgb, 3) +UPSAMPLE_FUNC(UpsampleBgrLinePair, VP8YuvToBgr, 3) +UPSAMPLE_FUNC(UpsampleRgbaLinePair, VP8YuvToRgba, 4) +UPSAMPLE_FUNC(UpsampleBgraLinePair, VP8YuvToBgra, 4) +// These two don't erase the alpha value +UPSAMPLE_FUNC(UpsampleRgbKeepAlphaLinePair, VP8YuvToRgb, 4) +UPSAMPLE_FUNC(UpsampleBgrKeepAlphaLinePair, VP8YuvToBgr, 4) + +typedef void (*UpsampleLinePairFunc)( + const uint8_t* top_y, const uint8_t* bottom_y, + const uint8_t* top_u, const uint8_t* top_v, + const uint8_t* cur_u, const uint8_t* cur_v, + uint8_t* top_dst, uint8_t* bottom_dst, int len); + +static const UpsampleLinePairFunc + kUpsamplers[MODE_BGRA + 1] = { + UpsampleRgbLinePair, // MODE_RGB + UpsampleRgbaLinePair, // MODE_RGBA + UpsampleBgrLinePair, // MODE_BGR + UpsampleBgraLinePair // MODE_BGRA + }, + kUpsamplersKeepAlpha[MODE_BGRA + 1] = { + UpsampleRgbLinePair, // MODE_RGB + UpsampleRgbKeepAlphaLinePair, // MODE_RGBA + UpsampleBgrLinePair, // MODE_BGR + UpsampleBgrKeepAlphaLinePair // MODE_BGRA + }; + +#undef LOAD_UV +#undef UPSAMPLE_FUNC + +#endif // FANCY_UPSAMPLING + +//------------------------------------------------------------------------------ +// simple point-sampling + +#define SAMPLE_FUNC(FUNC_NAME, FUNC, XSTEP) \ +static void FUNC_NAME(const uint8_t* top_y, const uint8_t* bottom_y, \ + const uint8_t* u, const uint8_t* v, \ + uint8_t* top_dst, uint8_t* bottom_dst, int len) { \ + int i; \ + for (i = 0; i < len - 1; i += 2) { \ + FUNC(top_y[0], u[0], v[0], top_dst); \ + FUNC(top_y[1], u[0], v[0], top_dst + XSTEP); \ + FUNC(bottom_y[0], u[0], v[0], bottom_dst); \ + FUNC(bottom_y[1], u[0], v[0], bottom_dst + XSTEP); \ + top_y += 2; \ + 
bottom_y += 2; \ + u++; \ + v++; \ + top_dst += 2 * XSTEP; \ + bottom_dst += 2 * XSTEP; \ + } \ + if (i == len - 1) { /* last one */ \ + FUNC(top_y[0], u[0], v[0], top_dst); \ + FUNC(bottom_y[0], u[0], v[0], bottom_dst); \ + } \ +} + +// All variants implemented. +SAMPLE_FUNC(SampleRgbLinePair, VP8YuvToRgb, 3) +SAMPLE_FUNC(SampleBgrLinePair, VP8YuvToBgr, 3) +SAMPLE_FUNC(SampleRgbaLinePair, VP8YuvToRgba, 4) +SAMPLE_FUNC(SampleBgraLinePair, VP8YuvToBgra, 4) + +#undef SAMPLE_FUNC + +// Main methods. +typedef void (*SampleLinePairFunc)( + const uint8_t* top_y, const uint8_t* bottom_y, + const uint8_t* u, const uint8_t* v, + uint8_t* top_dst, uint8_t* bottom_dst, int len); + +static const SampleLinePairFunc kSamplers[MODE_BGRA + 1] = { + SampleRgbLinePair, // MODE_RGB + SampleRgbaLinePair, // MODE_RGBA + SampleBgrLinePair, // MODE_BGR + SampleBgraLinePair // MODE_BGRA +}; + +//------------------------------------------------------------------------------ +// YUV444 converter + +#define YUV444_FUNC(FUNC_NAME, FUNC, XSTEP) \ +static void FUNC_NAME(const uint8_t* y, const uint8_t* u, const uint8_t* v, \ + uint8_t* dst, int len) { \ + int i; \ + for (i = 0; i < len; ++i) FUNC(y[i], u[i], v[i], &dst[i * XSTEP]); \ +} + +YUV444_FUNC(Yuv444ToRgb, VP8YuvToRgb, 3) +YUV444_FUNC(Yuv444ToBgr, VP8YuvToBgr, 3) +YUV444_FUNC(Yuv444ToRgba, VP8YuvToRgba, 4) +YUV444_FUNC(Yuv444ToBgra, VP8YuvToBgra, 4) + +#undef YUV444_FUNC + +typedef void (*YUV444Func)(const uint8_t* y, const uint8_t* u, const uint8_t* v, + uint8_t* dst, int len); + +static const YUV444Func kYUV444Converters[MODE_BGRA + 1] = { + Yuv444ToRgb, // MODE_RGB + Yuv444ToRgba, // MODE_RGBA + Yuv444ToBgr, // MODE_BGR + Yuv444ToBgra // MODE_BGRA +}; + +//------------------------------------------------------------------------------ +// Main YUV<->RGB conversion functions + +static int EmitYUV(const VP8Io* const io, WebPDecParams* const p) { + WebPDecBuffer* output = p->output; + const WebPYUVABuffer* const buf = &output->u.YUVA; + 
uint8_t* const y_dst = buf->y + io->mb_y * buf->y_stride; + uint8_t* const u_dst = buf->u + (io->mb_y >> 1) * buf->u_stride; + uint8_t* const v_dst = buf->v + (io->mb_y >> 1) * buf->v_stride; + const int mb_w = io->mb_w; + const int mb_h = io->mb_h; + const int uv_w = (mb_w + 1) / 2; + int j; + for (j = 0; j < mb_h; ++j) { + memcpy(y_dst + j * buf->y_stride, io->y + j * io->y_stride, mb_w); + } + for (j = 0; j < (mb_h + 1) / 2; ++j) { + memcpy(u_dst + j * buf->u_stride, io->u + j * io->uv_stride, uv_w); + memcpy(v_dst + j * buf->v_stride, io->v + j * io->uv_stride, uv_w); + } + return io->mb_h; +} + +// Point-sampling U/V sampler. +static int EmitSampledRGB(const VP8Io* const io, WebPDecParams* const p) { + WebPDecBuffer* output = p->output; + const WebPRGBABuffer* const buf = &output->u.RGBA; + uint8_t* dst = buf->rgba + io->mb_y * buf->stride; + const uint8_t* y_src = io->y; + const uint8_t* u_src = io->u; + const uint8_t* v_src = io->v; + const SampleLinePairFunc sample = kSamplers[output->colorspace]; + const int mb_w = io->mb_w; + const int last = io->mb_h - 1; + int j; + for (j = 0; j < last; j += 2) { + sample(y_src, y_src + io->y_stride, u_src, v_src, + dst, dst + buf->stride, mb_w); + y_src += 2 * io->y_stride; + u_src += io->uv_stride; + v_src += io->uv_stride; + dst += 2 * buf->stride; + } + if (j == last) { // Just do the last line twice + sample(y_src, y_src, u_src, v_src, dst, dst, mb_w); + } + return io->mb_h; +} + +//------------------------------------------------------------------------------ +// YUV444 -> RGB conversion + +#if 0 // TODO(skal): this is for future rescaling. 
+static int EmitRGB(const VP8Io* const io, WebPDecParams* const p) { + WebPDecBuffer* output = p->output; + const WebPRGBABuffer* const buf = &output->u.RGBA; + uint8_t* dst = buf->rgba + io->mb_y * buf->stride; + const uint8_t* y_src = io->y; + const uint8_t* u_src = io->u; + const uint8_t* v_src = io->v; + const YUV444Func convert = kYUV444Converters[output->colorspace]; + const int mb_w = io->mb_w; + const int last = io->mb_h; + int j; + for (j = 0; j < last; ++j) { + convert(y_src, u_src, v_src, dst, mb_w); + y_src += io->y_stride; + u_src += io->uv_stride; + v_src += io->uv_stride; + dst += buf->stride; + } + return io->mb_h; +} +#endif + +//------------------------------------------------------------------------------ +// Fancy upsampling + +#ifdef FANCY_UPSAMPLING +static int EmitFancyRGB(const VP8Io* const io, WebPDecParams* const p) { + int num_lines_out = io->mb_h; // a priori guess + const WebPRGBABuffer* const buf = &p->output->u.RGBA; + uint8_t* dst = buf->rgba + io->mb_y * buf->stride; + const UpsampleLinePairFunc upsample = + io->a ? kUpsamplersKeepAlpha[p->output->colorspace] + : kUpsamplers[p->output->colorspace]; + const uint8_t* cur_y = io->y; + const uint8_t* cur_u = io->u; + const uint8_t* cur_v = io->v; + const uint8_t* top_u = p->tmp_u; + const uint8_t* top_v = p->tmp_v; + int y = io->mb_y; + int y_end = io->mb_y + io->mb_h; + const int mb_w = io->mb_w; + const int uv_w = (mb_w + 1) / 2; + + if (y == 0) { + // First line is special cased. We mirror the u/v samples at boundary. + upsample(NULL, cur_y, cur_u, cur_v, cur_u, cur_v, NULL, dst, mb_w); + } else { + // We can finish the left-over line from previous call. + // Warning! Don't overwrite the alpha values (if any), as they + // are not lagging one line behind but are already written. + upsample(p->tmp_y, cur_y, top_u, top_v, cur_u, cur_v, + dst - buf->stride, dst, mb_w); + num_lines_out++; + } + // Loop over each output pairs of row. 
+ for (; y + 2 < y_end; y += 2) { + top_u = cur_u; + top_v = cur_v; + cur_u += io->uv_stride; + cur_v += io->uv_stride; + dst += 2 * buf->stride; + cur_y += 2 * io->y_stride; + upsample(cur_y - io->y_stride, cur_y, + top_u, top_v, cur_u, cur_v, + dst - buf->stride, dst, mb_w); + } + // move to last row + cur_y += io->y_stride; + if (io->crop_top + y_end < io->crop_bottom) { + // Save the unfinished samples for next call (as we're not done yet). + memcpy(p->tmp_y, cur_y, mb_w * sizeof(*p->tmp_y)); + memcpy(p->tmp_u, cur_u, uv_w * sizeof(*p->tmp_u)); + memcpy(p->tmp_v, cur_v, uv_w * sizeof(*p->tmp_v)); + // The fancy upsampler leaves a row unfinished behind + // (except for the very last row) + num_lines_out--; + } else { + // Process the very last row of even-sized picture + if (!(y_end & 1)) { + upsample(cur_y, NULL, cur_u, cur_v, cur_u, cur_v, + dst + buf->stride, NULL, mb_w); + } + } + return num_lines_out; +} + +#endif /* FANCY_UPSAMPLING */ + +//------------------------------------------------------------------------------ + +#ifdef WEBP_EXPERIMENTAL_FEATURES +static int EmitAlphaYUV(const VP8Io* const io, WebPDecParams* const p) { + const int mb_w = io->mb_w; + const int mb_h = io->mb_h; + int j; + const WebPYUVABuffer* const buf = &p->output->u.YUVA; + uint8_t* dst = buf->a + io->mb_y * buf->a_stride; + const uint8_t* alpha = io->a; + if (alpha) { + for (j = 0; j < mb_h; ++j) { + memcpy(dst, alpha, mb_w * sizeof(*dst)); + alpha += io->width; + dst += buf->a_stride; + } + } + return 0; +} + +static int EmitAlphaRGB(const VP8Io* const io, WebPDecParams* const p) { + const int mb_w = io->mb_w; + const int mb_h = io->mb_h; + int i, j; + const WebPRGBABuffer* const buf = &p->output->u.RGBA; + uint8_t* dst = buf->rgba + io->mb_y * buf->stride; + const uint8_t* alpha = io->a; + if (alpha) { + for (j = 0; j < mb_h; ++j) { + for (i = 0; i < mb_w; ++i) { + dst[4 * i + 3] = alpha[i]; + } + alpha += io->width; + dst += buf->stride; + } + } + return 0; +} + +#endif /* 
WEBP_EXPERIMENTAL_FEATURES */ + +//------------------------------------------------------------------------------ +// Simple picture rescaler + +// TODO(skal): start a common library for encoder and decoder, and factorize +// this code in. + +#define RFIX 30 +#define MULT(x,y) (((int64_t)(x) * (y) + (1 << (RFIX - 1))) >> RFIX) + +static void InitRescaler(WebPRescaler* const wrk, + int src_width, int src_height, + uint8_t* dst, + int dst_width, int dst_height, int dst_stride, + int x_add, int x_sub, int y_add, int y_sub, + int32_t* work) { + wrk->x_expand = (src_width < dst_width); + wrk->src_width = src_width; + wrk->src_height = src_height; + wrk->dst_width = dst_width; + wrk->dst_height = dst_height; + wrk->dst = dst; + wrk->dst_stride = dst_stride; + // for 'x_expand', we use bilinear interpolation + wrk->x_add = wrk->x_expand ? (x_sub - 1) : x_add - x_sub; + wrk->x_sub = wrk->x_expand ? (x_add - 1) : x_sub; + wrk->y_accum = y_add; + wrk->y_add = y_add; + wrk->y_sub = y_sub; + wrk->fx_scale = (1 << RFIX) / x_sub; + wrk->fy_scale = (1 << RFIX) / y_sub; + wrk->fxy_scale = wrk->x_expand ? + ((int64_t)dst_height << RFIX) / (x_sub * src_height) : + ((int64_t)dst_height << RFIX) / (x_add * src_height); + wrk->irow = work; + wrk->frow = work + dst_width; +} + +static inline void ImportRow(const uint8_t* const src, + WebPRescaler* const wrk) { + int x_in = 0; + int x_out; + int accum = 0; + if (!wrk->x_expand) { + int sum = 0; + for (x_out = 0; x_out < wrk->dst_width; ++x_out) { + accum += wrk->x_add; + for (; accum > 0; accum -= wrk->x_sub) { + sum += src[x_in++]; + } + { // Emit next horizontal pixel. 
+ const int32_t base = src[x_in++]; + const int32_t frac = base * (-accum); + wrk->frow[x_out] = (sum + base) * wrk->x_sub - frac; + // fresh fractional start for next pixel + sum = MULT(frac, wrk->fx_scale); + } + } + } else { // simple bilinear interpolation + int left = src[0], right = src[0]; + for (x_out = 0; x_out < wrk->dst_width; ++x_out) { + if (accum < 0) { + left = right; + right = src[++x_in]; + accum += wrk->x_add; + } + wrk->frow[x_out] = right * wrk->x_add + (left - right) * accum; + accum -= wrk->x_sub; + } + } + // Accumulate the new row's contribution + for (x_out = 0; x_out < wrk->dst_width; ++x_out) { + wrk->irow[x_out] += wrk->frow[x_out]; + } +} + +static void ExportRow(WebPRescaler* const wrk) { + int x_out; + const int yscale = wrk->fy_scale * (-wrk->y_accum); + assert(wrk->y_accum <= 0); + for (x_out = 0; x_out < wrk->dst_width; ++x_out) { + const int frac = MULT(wrk->frow[x_out], yscale); + const int v = MULT(wrk->irow[x_out] - frac, wrk->fxy_scale); + wrk->dst[x_out] = (!(v & ~0xff)) ? v : (v < 0) ? 0 : 255; + wrk->irow[x_out] = frac; // new fractional start + } + wrk->y_accum += wrk->y_add; + wrk->dst += wrk->dst_stride; +} + +#undef MULT +#undef RFIX + +//------------------------------------------------------------------------------ +// YUV rescaling (no final RGB conversion needed) + +static int Rescale(const uint8_t* src, int src_stride, + int new_lines, WebPRescaler* const wrk) { + int num_lines_out = 0; + while (new_lines-- > 0) { // import new contribution of one source row. 
+ ImportRow(src, wrk); + src += src_stride; + wrk->y_accum -= wrk->y_sub; + while (wrk->y_accum <= 0) { // emit output row(s) + ExportRow(wrk); + num_lines_out++; + } + } + return num_lines_out; +} + +static int EmitRescaledYUV(const VP8Io* const io, WebPDecParams* const p) { + const int mb_h = io->mb_h; + const int uv_mb_h = (mb_h + 1) >> 1; + const int num_lines_out = Rescale(io->y, io->y_stride, mb_h, &p->scaler_y); + Rescale(io->u, io->uv_stride, uv_mb_h, &p->scaler_u); + Rescale(io->v, io->uv_stride, uv_mb_h, &p->scaler_v); + return num_lines_out; +} + +static int EmitRescaledAlphaYUV(const VP8Io* const io, WebPDecParams* const p) { + if (io->a) { + Rescale(io->a, io->width, io->mb_h, &p->scaler_a); + } + return 0; +} + +static int InitYUVRescaler(const VP8Io* const io, WebPDecParams* const p) { + const int has_alpha = (p->output->colorspace & MODE_ALPHA_MASK); + const WebPYUVABuffer* const buf = &p->output->u.YUVA; + const int out_width = io->scaled_width; + const int out_height = io->scaled_height; + const int uv_out_width = (out_width + 1) >> 1; + const int uv_out_height = (out_height + 1) >> 1; + const int uv_in_width = (io->mb_w + 1) >> 1; + const int uv_in_height = (io->mb_h + 1) >> 1; + const size_t work_size = 2 * out_width; // scratch memory for luma rescaler + const size_t uv_work_size = 2 * uv_out_width; // and for each u/v ones + size_t tmp_size; + int32_t* work; + + tmp_size = work_size + 2 * uv_work_size; + if (has_alpha) { + tmp_size += work_size; + } + p->memory = calloc(1, tmp_size * sizeof(*work)); + if (p->memory == NULL) { + return 0; // memory error + } + work = (int32_t*)p->memory; + InitRescaler(&p->scaler_y, io->mb_w, io->mb_h, + buf->y, out_width, out_height, buf->y_stride, + io->mb_w, out_width, io->mb_h, out_height, + work); + InitRescaler(&p->scaler_u, uv_in_width, uv_in_height, + buf->u, uv_out_width, uv_out_height, buf->u_stride, + uv_in_width, uv_out_width, + uv_in_height, uv_out_height, + work + work_size); + 
InitRescaler(&p->scaler_v, uv_in_width, uv_in_height, + buf->v, uv_out_width, uv_out_height, buf->v_stride, + uv_in_width, uv_out_width, + uv_in_height, uv_out_height, + work + work_size + uv_work_size); + p->emit = EmitRescaledYUV; + if (has_alpha) { + InitRescaler(&p->scaler_a, io->mb_w, io->mb_h, + buf->a, out_width, out_height, buf->a_stride, + io->mb_w, out_width, io->mb_h, out_height, + work + work_size + 2 * uv_work_size); + p->emit_alpha = EmitRescaledAlphaYUV; + } + return 1; +} + +//------------------------------------------------------------------------------ +// RGBA rescaling + +// import new contributions until one row is ready to be output, or all input +// is consumed. +static int Import(const uint8_t* src, int src_stride, + int new_lines, WebPRescaler* const wrk) { + int num_lines_in = 0; + while (num_lines_in < new_lines && wrk->y_accum > 0) { + ImportRow(src, wrk); + src += src_stride; + ++num_lines_in; + wrk->y_accum -= wrk->y_sub; + } + return num_lines_in; +} + +static int ExportRGB(WebPDecParams* const p, int y_pos) { + const YUV444Func convert = kYUV444Converters[p->output->colorspace]; + const WebPRGBABuffer* const buf = &p->output->u.RGBA; + uint8_t* dst = buf->rgba + (p->last_y + y_pos) * buf->stride; + int num_lines_out = 0; + // For RGB rescaling, because of the YUV420, current scan position + // U/V can be +1/-1 line from the Y one. Hence the double test. 
+ while (p->scaler_y.y_accum <= 0 && p->scaler_u.y_accum <= 0) { + assert(p->last_y + y_pos + num_lines_out < p->output->height); + assert(p->scaler_u.y_accum == p->scaler_v.y_accum); + ExportRow(&p->scaler_y); + ExportRow(&p->scaler_u); + ExportRow(&p->scaler_v); + convert(p->scaler_y.dst, p->scaler_u.dst, p->scaler_v.dst, + dst, p->scaler_y.dst_width); + dst += buf->stride; + num_lines_out++; + } + return num_lines_out; +} + +static int EmitRescaledRGB(const VP8Io* const io, WebPDecParams* const p) { + const int mb_h = io->mb_h; + const int uv_mb_h = (mb_h + 1) >> 1; + int j = 0, uv_j = 0; + int num_lines_out = 0; + while (j < mb_h) { + const int y_lines_in = Import(io->y + j * io->y_stride, io->y_stride, + mb_h - j, &p->scaler_y); + const int u_lines_in = Import(io->u + uv_j * io->uv_stride, io->uv_stride, + uv_mb_h - uv_j, &p->scaler_u); + const int v_lines_in = Import(io->v + uv_j * io->uv_stride, io->uv_stride, + uv_mb_h - uv_j, &p->scaler_v); + (void)v_lines_in; // remove a gcc warning + assert(u_lines_in == v_lines_in); + j += y_lines_in; + uv_j += u_lines_in; + num_lines_out += ExportRGB(p, num_lines_out); + } + return num_lines_out; +} + +static int ExportAlpha(WebPDecParams* const p, int y_pos) { + const WebPRGBABuffer* const buf = &p->output->u.RGBA; + uint8_t* dst = buf->rgba + (p->last_y + y_pos) * buf->stride; + int num_lines_out = 0; + while (p->scaler_a.y_accum <= 0) { + int i; + assert(p->last_y + y_pos + num_lines_out < p->output->height); + ExportRow(&p->scaler_a); + for (i = 0; i < p->scaler_a.dst_width; ++i) { + dst[4 * i + 3] = p->scaler_a.dst[i]; + } + dst += buf->stride; + num_lines_out++; + } + return num_lines_out; +} + +static int EmitRescaledAlphaRGB(const VP8Io* const io, WebPDecParams* const p) { + if (io->a) { + int j = 0, pos = 0; + while (j < io->mb_h) { + j += Import(io->a + j * io->width, io->width, io->mb_h - j, &p->scaler_a); + pos += ExportAlpha(p, pos); + } + } + return 0; +} + +static int InitRGBRescaler(const VP8Io* const 
io, WebPDecParams* const p) { + const int has_alpha = (p->output->colorspace & MODE_ALPHA_MASK); + const int out_width = io->scaled_width; + const int out_height = io->scaled_height; + const int uv_in_width = (io->mb_w + 1) >> 1; + const int uv_in_height = (io->mb_h + 1) >> 1; + const size_t work_size = 2 * out_width; // scratch memory for one rescaler + int32_t* work; // rescalers work area + uint8_t* tmp; // tmp storage for scaled YUV444 samples before RGB conversion + size_t tmp_size1, tmp_size2; + + tmp_size1 = 3 * work_size; + tmp_size2 = 3 * out_width; + if (has_alpha) { + tmp_size1 += work_size; + tmp_size2 += out_width; + } + p->memory = + calloc(1, tmp_size1 * sizeof(*work) + tmp_size2 * sizeof(*tmp)); + if (p->memory == NULL) { + return 0; // memory error + } + work = (int32_t*)p->memory; + tmp = (uint8_t*)(work + tmp_size1); + InitRescaler(&p->scaler_y, io->mb_w, io->mb_h, + tmp + 0 * out_width, out_width, out_height, 0, + io->mb_w, out_width, io->mb_h, out_height, + work + 0 * work_size); + InitRescaler(&p->scaler_u, uv_in_width, uv_in_height, + tmp + 1 * out_width, out_width, out_height, 0, + io->mb_w, 2 * out_width, io->mb_h, 2 * out_height, + work + 1 * work_size); + InitRescaler(&p->scaler_v, uv_in_width, uv_in_height, + tmp + 2 * out_width, out_width, out_height, 0, + io->mb_w, 2 * out_width, io->mb_h, 2 * out_height, + work + 2 * work_size); + p->emit = EmitRescaledRGB; + + if (has_alpha) { + InitRescaler(&p->scaler_a, io->mb_w, io->mb_h, + tmp + 3 * out_width, out_width, out_height, 0, + io->mb_w, out_width, io->mb_h, out_height, + work + 3 * work_size); + p->emit_alpha = EmitRescaledAlphaRGB; + } + return 1; +} + +//------------------------------------------------------------------------------ +// Default custom functions + +// Setup crop_xxx fields, mb_w and mb_h +static int InitFromOptions(const WebPDecoderOptions* const options, + VP8Io* const io) { + const int W = io->width; + const int H = io->height; + int x = 0, y = 0, w = W, h = H; + + 
// Cropping + io->use_cropping = (options != NULL) && (options->use_cropping > 0); + if (io->use_cropping) { + w = options->crop_width; + h = options->crop_height; + // TODO(skal): take colorspace into account. Don't assume YUV420. + x = options->crop_left & ~1; + y = options->crop_top & ~1; + if (x < 0 || y < 0 || w <= 0 || h <= 0 || x + w > W || y + h > H) { + return 0; // out of frame boundary error + } + } + io->crop_left = x; + io->crop_top = y; + io->crop_right = x + w; + io->crop_bottom = y + h; + io->mb_w = w; + io->mb_h = h; + + // Scaling + io->use_scaling = (options != NULL) && (options->use_scaling > 0); + if (io->use_scaling) { + if (options->scaled_width <= 0 || options->scaled_height <= 0) { + return 0; + } + io->scaled_width = options->scaled_width; + io->scaled_height = options->scaled_height; + } + + // Filter + io->bypass_filtering = options && options->bypass_filtering; + + // Fancy upsampler +#ifdef FANCY_UPSAMPLING + io->fancy_upsampling = (options == NULL) || (!options->no_fancy_upsampling); +#endif + + if (io->use_scaling) { + // disable filter (only for large downscaling ratio). + io->bypass_filtering = (io->scaled_width < W * 3 / 4) && + (io->scaled_height < H * 3 / 4); + io->fancy_upsampling = 0; + } + return 1; +} + +static int CustomSetup(VP8Io* io) { + WebPDecParams* const p = (WebPDecParams*)io->opaque; + const int is_rgb = (p->output->colorspace < MODE_YUV); + + p->memory = NULL; + p->emit = NULL; + p->emit_alpha = NULL; + if (!InitFromOptions(p->options, io)) { + return 0; + } + + if (io->use_scaling) { + const int ok = is_rgb ? InitRGBRescaler(io, p) : InitYUVRescaler(io, p); + if (!ok) { + return 0; // memory error + } + } else { + if (is_rgb) { + p->emit = EmitSampledRGB; // default +#ifdef FANCY_UPSAMPLING + if (io->fancy_upsampling) { + const int uv_width = (io->mb_w + 1) >> 1; + p->memory = malloc(io->mb_w + 2 * uv_width); + if (p->memory == NULL) { + return 0; // memory error. 
+ } + p->tmp_y = (uint8_t*)p->memory; + p->tmp_u = p->tmp_y + io->mb_w; + p->tmp_v = p->tmp_u + uv_width; + p->emit = EmitFancyRGB; + } +#endif + } else { + p->emit = EmitYUV; + } +#ifdef WEBP_EXPERIMENTAL_FEATURES + if (p->output->colorspace & MODE_ALPHA_MASK) { + // We need transparency output + p->emit_alpha = is_rgb ? EmitAlphaRGB : EmitAlphaYUV; + } +#endif + } + + if (is_rgb) { + VP8YUVInit(); + } + return 1; +} + +//------------------------------------------------------------------------------ + +static int CustomPut(const VP8Io* io) { + WebPDecParams* p = (WebPDecParams*)io->opaque; + const int mb_w = io->mb_w; + const int mb_h = io->mb_h; + int num_lines_out; + assert(!(io->mb_y & 1)); + + if (mb_w <= 0 || mb_h <= 0) { + return 0; + } + num_lines_out = p->emit(io, p); + if (p->emit_alpha) { + p->emit_alpha(io, p); + } + p->last_y += num_lines_out; + return 1; +} + +//------------------------------------------------------------------------------ + +static void CustomTeardown(const VP8Io* io) { + WebPDecParams* const p = (WebPDecParams*)io->opaque; + free(p->memory); + p->memory = NULL; +} + +//------------------------------------------------------------------------------ +// Main entry point + +void WebPInitCustomIo(WebPDecParams* const params, VP8Io* const io) { + io->put = CustomPut; + io->setup = CustomSetup; + io->teardown = CustomTeardown; + io->opaque = params; +} + +//------------------------------------------------------------------------------ + +#if defined(__cplusplus) || defined(c_plusplus) +} // extern "C" +#endif diff --git a/src/dec/vp8.c b/src/dec/vp8.c index 144bd501..1f1ce29d 100644 --- a/src/dec/vp8.c +++ b/src/dec/vp8.c @@ -76,8 +76,12 @@ int VP8SetError(VP8Decoder* const dec, //----------------------------------------------------------------------------- -int VP8GetInfo(const uint8_t* data, uint32_t chunk_size, - int *width, int *height) { +int VP8GetInfo(const uint8_t* data, + uint32_t data_size, uint32_t chunk_size, + int* width, int* 
height, int* has_alpha) { + if (data_size < 10) { + return 0; // not enough data + } // check signature if (data[3] != 0x9d || data[4] != 0x01 || data[5] != 0x2a) { return 0; // Wrong signature. @@ -87,6 +91,14 @@ int VP8GetInfo(const uint8_t* data, uint32_t chunk_size, const int w = ((data[7] << 8) | data[6]) & 0x3fff; const int h = ((data[9] << 8) | data[8]) & 0x3fff; + if (has_alpha) { +#ifdef WEBP_EXPERIMENTAL_FEATURES + if (data_size < 11) return 0; + *has_alpha = !!(data[10] & 0x80); // the colorspace_ bit +#else + *has_alpha = 0; +#endif + } if (!key_frame) { // Not a keyframe. return 0; } @@ -254,7 +266,7 @@ int VP8GetHeaders(VP8Decoder* const dec, VP8Io* const io) { "null VP8Io passed to VP8GetHeaders()"); } - buf = (uint8_t *)io->data; + buf = (uint8_t*)io->data; buf_size = io->data_size; if (buf == NULL || buf_size <= 4) { return VP8SetError(dec, VP8_STATUS_NOT_ENOUGH_DATA, @@ -329,8 +341,17 @@ int VP8GetHeaders(VP8Decoder* const dec, VP8Io* const io) { dec->mb_w_ = (pic_hdr->width_ + 15) >> 4; dec->mb_h_ = (pic_hdr->height_ + 15) >> 4; + // Setup default output area (can be later modified during io->setup()) io->width = pic_hdr->width_; io->height = pic_hdr->height_; + io->use_scaling = 0; + io->use_cropping = 0; + io->crop_top = 0; + io->crop_left = 0; + io->crop_right = io->width; + io->crop_bottom = io->height; + io->mb_w = io->width; // sanity check + io->mb_h = io->height; // ditto VP8ResetProba(&dec->proba_); ResetSegmentHeader(&dec->segment_hdr_); @@ -458,7 +479,7 @@ static const uint8_t kCat4[] = { 176, 155, 140, 135, 0 }; static const uint8_t kCat5[] = { 180, 157, 141, 134, 130, 0 }; static const uint8_t kCat6[] = { 254, 254, 243, 230, 196, 177, 153, 140, 133, 130, 129, 0 }; -static const uint8_t * const kCat3456[] = { kCat3, kCat4, kCat5, kCat6 }; +static const uint8_t* const kCat3456[] = { kCat3, kCat4, kCat5, kCat6 }; static const uint8_t kZigzag[16] = { 0, 1, 4, 8, 5, 2, 3, 6, 9, 12, 13, 10, 7, 11, 14, 15 }; @@ -662,11 +683,10 @@ int 
VP8DecodeMB(VP8Decoder* const dec, VP8BitReader* const token_br) { } static int ParseFrame(VP8Decoder* const dec, VP8Io* io) { - for (dec->mb_y_ = 0; dec->mb_y_ < dec->mb_h_; ++dec->mb_y_) { + for (dec->mb_y_ = 0; dec->mb_y_ < dec->br_mb_y_; ++dec->mb_y_) { VP8MB* const left = dec->mb_info_ - 1; VP8BitReader* const token_br = &dec->parts_[dec->mb_y_ & (dec->num_parts_ - 1)]; - left->nz_ = 0; left->dc_nz_ = 0; memset(dec->intra_l_, B_DC_PRED, sizeof(dec->intra_l_)); @@ -681,9 +701,11 @@ static int ParseFrame(VP8Decoder* const dec, VP8Io* io) { // Store data and save block's filtering params VP8StoreBlock(dec); } + if (dec->filter_type_ > 0) { + VP8FilterRow(dec); + } if (!VP8FinishRow(dec, io)) { - return VP8SetError(dec, VP8_STATUS_USER_ABORT, - "Output aborted."); + return VP8SetError(dec, VP8_STATUS_USER_ABORT, "Output aborted."); } } @@ -722,22 +744,18 @@ int VP8Decode(VP8Decoder* const dec, VP8Io* const io) { } assert(dec->ready_); - // will allocate memory and prepare everything. + // Will allocate memory and prepare everything. if (!VP8InitFrame(dec, io)) { VP8Clear(dec); - return VP8SetError(dec, VP8_STATUS_OUT_OF_MEMORY, - "Allocation failed"); + return 0; } - if (io->setup && !io->setup(io)) { + // Finish setting up the decoding parameter + if (VP8FinishFrameSetup(dec, io) != VP8_STATUS_OK) { VP8Clear(dec); - return VP8SetError(dec, VP8_STATUS_USER_ABORT, - "Frame setup failed"); + return 0; } - // Disable filtering per user request (_after_ setup() is called) - if (io->bypass_filtering) dec->filter_type_ = 0; - // Main decoding loop { const int ret = ParseFrame(dec, io); diff --git a/src/dec/vp8i.h b/src/dec/vp8i.h index 10ac4912..587b1cb4 100644 --- a/src/dec/vp8i.h +++ b/src/dec/vp8i.h @@ -184,6 +184,10 @@ struct VP8Decoder { // dimension, in macroblock units. int mb_w_, mb_h_; + // Macroblock to process/filter, depending on cropping and filter_type. 
+ int tl_mb_x_, tl_mb_y_; // top-left MB that must be in-loop filtered + int br_mb_x_, br_mb_y_; // last bottom-right MB that must be decoded + // number of partitions. int num_parts_; // per-partition boolean decoders. @@ -212,8 +216,8 @@ struct VP8Decoder { // Boundary data cache and persistent buffers. uint8_t* intra_t_; // top intra modes values: 4 * mb_w_ uint8_t intra_l_[4]; // left intra modes values - uint8_t *y_t_; // top luma samples: 16 * mb_w_ - uint8_t *u_t_, *v_t_; // top u/v samples: 8 * mb_w_ each + uint8_t* y_t_; // top luma samples: 16 * mb_w_ + uint8_t* u_t_, *v_t_; // top u/v samples: 8 * mb_w_ each VP8MB* mb_info_; // contextual macroblock infos (mb_w_ + 1) uint8_t* yuv_b_; // main block for Y/U/V (size = YUV_SIZE) @@ -264,10 +268,12 @@ struct VP8Decoder { int VP8SetError(VP8Decoder* const dec, VP8StatusCode error, const char * const msg); // Validates the VP8 data-header and retrieve basic header information viz width -// and height. Returns 0 in case of formatting error. *width/*height can be -// passed NULL. -int VP8GetInfo(const uint8_t* data, uint32_t data_size, - int *width, int *height); +// and height. Returns 0 in case of formatting error. *width/*height/*has_alpha +// can be passed NULL. +int VP8GetInfo(const uint8_t* data, + uint32_t data_size, // data available so far + uint32_t chunk_size, // total data size expect in the chunk + int *width, int *height, int *has_alpha); // in tree.c void VP8ResetProba(VP8Proba* const proba); @@ -281,10 +287,14 @@ void VP8ParseQuant(VP8Decoder* const dec); int VP8InitFrame(VP8Decoder* const dec, VP8Io* io); // Predict a block and add residual void VP8ReconstructBlock(VP8Decoder* const dec); +// Call io->setup() and finish setting up scan parameters. 
+VP8StatusCode VP8FinishFrameSetup(VP8Decoder* const dec, VP8Io* const io); +// Filter the decoded macroblock row (if needed) +void VP8FilterRow(const VP8Decoder* const dec); // Store a block, along with filtering params void VP8StoreBlock(VP8Decoder* const dec); // Finalize and transmit a complete row. Return false in case of user-abort. -int VP8FinishRow(VP8Decoder* const dec, VP8Io* io); +int VP8FinishRow(VP8Decoder* const dec, VP8Io* const io); // Decode one macroblock. Returns false if there is not enough data. int VP8DecodeMB(VP8Decoder* const dec, VP8BitReader* const token_br); @@ -307,7 +317,7 @@ extern void (*VP8TransformWHT)(const int16_t* in, int16_t* out); // *dst is the destination block, with stride BPS. Boundary samples are // assumed accessible when needed. -typedef void (*VP8PredFunc)(uint8_t *dst); +typedef void (*VP8PredFunc)(uint8_t* dst); extern VP8PredFunc VP8PredLuma16[NUM_B_DC_MODES]; extern VP8PredFunc VP8PredChroma8[NUM_B_DC_MODES]; extern VP8PredFunc VP8PredLuma4[NUM_BMODES]; diff --git a/src/dec/webp.c b/src/dec/webp.c index 57220a86..e642f31c 100644 --- a/src/dec/webp.c +++ b/src/dec/webp.c @@ -12,14 +12,11 @@ #include #include "vp8i.h" #include "webpi.h" -#include "yuv.h" #if defined(__cplusplus) || defined(c_plusplus) extern "C" { #endif -#define FANCY_UPSCALING // undefined to remove fancy upscaling support - //----------------------------------------------------------------------------- // RIFF layout is: // 0ffset tag @@ -39,7 +36,7 @@ static inline uint32_t get_le32(const uint8_t* const data) { // If a RIFF container is detected, validate it and skip over it. 
uint32_t WebPCheckRIFFHeader(const uint8_t** data_ptr, - uint32_t *data_size_ptr) { + uint32_t* data_size_ptr) { uint32_t chunk_size = 0xffffffffu; if (*data_size_ptr >= 10 + 20 && !memcmp(*data_ptr, "RIFF", 4)) { if (memcmp(*data_ptr + 8, "WEBP", 4)) { @@ -67,473 +64,96 @@ uint32_t WebPCheckRIFFHeader(const uint8_t** data_ptr, } //----------------------------------------------------------------------------- -// Fancy upscaling - -#ifdef FANCY_UPSCALING - -// Given samples laid out in a square as: -// [a b] -// [c d] -// we interpolate u/v as: -// ([9*a + 3*b + 3*c + d 3*a + 9*b + 3*c + d] + [8 8]) / 16 -// ([3*a + b + 9*c + 3*d a + 3*b + 3*c + 9*d] [8 8]) / 16 - -// We process u and v together stashed into 32bit (16bit each). -#define LOAD_UV(u,v) ((u) | ((v) << 16)) - -#define UPSCALE_FUNC(FUNC_NAME, FUNC, XSTEP) \ -static inline void FUNC_NAME(const uint8_t* top_y, const uint8_t* bottom_y, \ - const uint8_t* top_u, const uint8_t* top_v, \ - const uint8_t* cur_u, const uint8_t* cur_v, \ - uint8_t* top_dst, uint8_t* bottom_dst, int len) { \ - int x; \ - const int last_pixel_pair = (len - 1) >> 1; \ - uint32_t tl_uv = LOAD_UV(top_u[0], top_v[0]); /* top-left sample */ \ - uint32_t l_uv = LOAD_UV(cur_u[0], cur_v[0]); /* left-sample */ \ - if (top_y) { \ - const uint32_t uv0 = (3 * tl_uv + l_uv + 0x00020002u) >> 2; \ - FUNC(top_y[0], uv0 & 0xff, (uv0 >> 16), top_dst); \ - } \ - if (bottom_y) { \ - const uint32_t uv0 = (3 * l_uv + tl_uv + 0x00020002u) >> 2; \ - FUNC(bottom_y[0], uv0 & 0xff, (uv0 >> 16), bottom_dst); \ - } \ - for (x = 1; x <= last_pixel_pair; ++x) { \ - const uint32_t t_uv = LOAD_UV(top_u[x], top_v[x]); /* top sample */ \ - const uint32_t uv = LOAD_UV(cur_u[x], cur_v[x]); /* sample */ \ - /* precompute invariant values associated with first and second diagonals*/\ - const uint32_t avg = tl_uv + t_uv + l_uv + uv + 0x00080008u; \ - const uint32_t diag_12 = (avg + 2 * (t_uv + l_uv)) >> 3; \ - const uint32_t diag_03 = (avg + 2 * (tl_uv + uv)) >> 3; \ - if 
(top_y) { \ - const uint32_t uv0 = (diag_12 + tl_uv) >> 1; \ - const uint32_t uv1 = (diag_03 + t_uv) >> 1; \ - FUNC(top_y[2 * x - 1], uv0 & 0xff, (uv0 >> 16), \ - top_dst + (2 * x - 1) * XSTEP); \ - FUNC(top_y[2 * x - 0], uv1 & 0xff, (uv1 >> 16), \ - top_dst + (2 * x - 0) * XSTEP); \ - } \ - if (bottom_y) { \ - const uint32_t uv0 = (diag_03 + l_uv) >> 1; \ - const uint32_t uv1 = (diag_12 + uv) >> 1; \ - FUNC(bottom_y[2 * x - 1], uv0 & 0xff, (uv0 >> 16), \ - bottom_dst + (2 * x - 1) * XSTEP); \ - FUNC(bottom_y[2 * x + 0], uv1 & 0xff, (uv1 >> 16), \ - bottom_dst + (2 * x + 0) * XSTEP); \ - } \ - tl_uv = t_uv; \ - l_uv = uv; \ - } \ - if (!(len & 1)) { \ - if (top_y) { \ - const uint32_t uv0 = (3 * tl_uv + l_uv + 0x00020002u) >> 2; \ - FUNC(top_y[len - 1], uv0 & 0xff, (uv0 >> 16), \ - top_dst + (len - 1) * XSTEP); \ - } \ - if (bottom_y) { \ - const uint32_t uv0 = (3 * l_uv + tl_uv + 0x00020002u) >> 2; \ - FUNC(bottom_y[len - 1], uv0 & 0xff, (uv0 >> 16), \ - bottom_dst + (len - 1) * XSTEP); \ - } \ - } \ -} - -// All variants implemented. -UPSCALE_FUNC(UpscaleRgbLinePair, VP8YuvToRgb, 3) -UPSCALE_FUNC(UpscaleBgrLinePair, VP8YuvToBgr, 3) -UPSCALE_FUNC(UpscaleRgbaLinePair, VP8YuvToRgb, 4) -UPSCALE_FUNC(UpscaleBgraLinePair, VP8YuvToBgr, 4) - -// Main driver function. 
-static inline -void UpscaleLinePair(const uint8_t* top_y, const uint8_t* bottom_y, - const uint8_t* top_u, const uint8_t* top_v, - const uint8_t* cur_u, const uint8_t* cur_v, - uint8_t* top_dst, uint8_t* bottom_dst, int len, - WEBP_CSP_MODE mode) { - if (mode == MODE_RGB) { - UpscaleRgbLinePair(top_y, bottom_y, top_u, top_v, cur_u, cur_v, - top_dst, bottom_dst, len); - } else if (mode == MODE_BGR) { - UpscaleBgrLinePair(top_y, bottom_y, top_u, top_v, cur_u, cur_v, - top_dst, bottom_dst, len); - } else if (mode == MODE_RGBA) { - UpscaleRgbaLinePair(top_y, bottom_y, top_u, top_v, cur_u, cur_v, - top_dst, bottom_dst, len); - } else { - assert(mode == MODE_BGRA); - UpscaleBgraLinePair(top_y, bottom_y, top_u, top_v, cur_u, cur_v, - top_dst, bottom_dst, len); - } -} - -#undef LOAD_UV -#undef UPSCALE_FUNC - -#endif // FANCY_UPSCALING - -//----------------------------------------------------------------------------- -// Main conversion driver. - -static int CustomPut(const VP8Io* io) { - WebPDecParams *p = (WebPDecParams*)io->opaque; - const int w = io->width; - const int mb_h = io->mb_h; - const int uv_w = (w + 1) / 2; - assert(!(io->mb_y & 1)); - - if (w <= 0 || mb_h <= 0) { - return 0; - } - - p->last_y = io->mb_y + io->mb_h; // a priori guess - if (p->mode == MODE_YUV) { - uint8_t* const y_dst = p->output + io->mb_y * p->stride; - uint8_t* const u_dst = p->u + (io->mb_y >> 1) * p->u_stride; - uint8_t* const v_dst = p->v + (io->mb_y >> 1) * p->v_stride; - int j; - for (j = 0; j < mb_h; ++j) { - memcpy(y_dst + j * p->stride, io->y + j * io->y_stride, w); - } - for (j = 0; j < (mb_h + 1) / 2; ++j) { - memcpy(u_dst + j * p->u_stride, io->u + j * io->uv_stride, uv_w); - memcpy(v_dst + j * p->v_stride, io->v + j * io->uv_stride, uv_w); - } - } else { - uint8_t* dst = p->output + io->mb_y * p->stride; - if (io->fancy_upscaling) { -#ifdef FANCY_UPSCALING - const uint8_t* cur_y = io->y; - const uint8_t* cur_u = io->u; - const uint8_t* cur_v = io->v; - const uint8_t* top_u = 
p->top_u; - const uint8_t* top_v = p->top_v; - int y = io->mb_y; - int y_end = io->mb_y + io->mb_h; - if (y == 0) { - // First line is special cased. We mirror the u/v samples at boundary. - UpscaleLinePair(NULL, cur_y, cur_u, cur_v, cur_u, cur_v, - NULL, dst, w, p->mode); - } else { - // We can finish the left-over line from previous call - UpscaleLinePair(p->top_y, cur_y, top_u, top_v, cur_u, cur_v, - dst - p->stride, dst, w, p->mode); - } - // Loop over each output pairs of row. - for (; y + 2 < y_end; y += 2) { - top_u = cur_u; - top_v = cur_v; - cur_u += io->uv_stride; - cur_v += io->uv_stride; - dst += 2 * p->stride; - cur_y += 2 * io->y_stride; - UpscaleLinePair(cur_y - io->y_stride, cur_y, - top_u, top_v, cur_u, cur_v, - dst - p->stride, dst, w, p->mode); - } - // move to last row - cur_y += io->y_stride; - if (y_end != io->height) { - // Save the unfinished samples for next call (as we're not done yet). - memcpy(p->top_y, cur_y, w * sizeof(*p->top_y)); - memcpy(p->top_u, cur_u, uv_w * sizeof(*p->top_u)); - memcpy(p->top_v, cur_v, uv_w * sizeof(*p->top_v)); - // The fancy upscaler leaves a row unfinished behind - // (except for the very last row) - p->last_y -= 1; - } else { - // Process the very last row of even-sized picture - if (!(y_end & 1)) { - UpscaleLinePair(cur_y, NULL, cur_u, cur_v, cur_u, cur_v, - dst + p->stride, NULL, w, p->mode); - } - } -#else - assert(0); // shouldn't happen. -#endif - } else { - // Point-sampling U/V upscaler. 
- int j; - for (j = 0; j < mb_h; ++j) { - const uint8_t* y_src = io->y + j * io->y_stride; - int i; - for (i = 0; i < w; ++i) { - const int y = y_src[i]; - const int u = io->u[(j / 2) * io->uv_stride + (i / 2)]; - const int v = io->v[(j / 2) * io->uv_stride + (i / 2)]; - if (p->mode == MODE_RGB) { - VP8YuvToRgb(y, u, v, dst + i * 3); - } else if (p->mode == MODE_BGR) { - VP8YuvToBgr(y, u, v, dst + i * 3); - } else if (p->mode == MODE_RGBA) { - VP8YuvToRgb(y, u, v, dst + i * 4); - } else { - VP8YuvToBgr(y, u, v, dst + i * 4); - } - } - dst += p->stride; - } - } - } - - // Alpha handling - if (p->mode == MODE_RGBA || p->mode == MODE_BGRA) { - int i, j; - uint8_t* dst = p->output + io->mb_y * p->stride + 3; - const uint8_t* alpha = io->a; - const int has_alpha = (alpha != NULL); -#ifdef WEBP_EXPERIMENTAL_FEATURES - if (has_alpha) { - for (j = 0; j < mb_h; ++j) { - for (i = 0; i < w; ++i) { - dst[4 * i] = alpha[i]; - } - alpha += io->width; - dst += p->stride; - } - } -#endif - if (!has_alpha) { // fill-in with 0xFFs - for (j = 0; j < mb_h; ++j) { - for (i = 0; i < w; ++i) { - dst[4 * i] = 0xff; - } - dst += p->stride; - } - } - } - return 1; -} - -//----------------------------------------------------------------------------- - -static int CustomSetup(VP8Io* io) { -#ifdef FANCY_UPSCALING - WebPDecParams *p = (WebPDecParams*)io->opaque; - p->top_y = p->top_u = p->top_v = NULL; - if (p->mode != MODE_YUV) { - const int uv_width = (io->width + 1) >> 1; - p->top_y = (uint8_t*)malloc(io->width + 2 * uv_width); - if (p->top_y == NULL) { - return 0; // memory error. 
- } - p->top_u = p->top_y + io->width; - p->top_v = p->top_u + uv_width; - io->fancy_upscaling = 1; // activate fancy upscaling - } -#endif - return 1; -} - -static void CustomTeardown(const VP8Io* io) { -#ifdef FANCY_UPSCALING - WebPDecParams *p = (WebPDecParams*)io->opaque; - if (p->top_y) { - free(p->top_y); - p->top_y = p->top_u = p->top_v = NULL; - } -#endif -} - -void WebPInitCustomIo(VP8Io* const io) { - io->put = CustomPut; - io->setup = CustomSetup; - io->teardown = CustomTeardown; -} - -//----------------------------------------------------------------------------- -// Init/Check/Free decoding parameters and buffer - -int WebPInitDecParams(const uint8_t* data, uint32_t data_size, int* width, - int* height, WebPDecParams* const params) { - int w, h; - - if (!WebPGetInfo(data, data_size, &w, &h)) { - return 0; - } - if (width) *width = w; - if (height) *height = h; - - if (!params->external_buffer) { - int stride; - int uv_stride = 0; - int size; - int uv_size = 0; - uint8_t* output; - WEBP_CSP_MODE mode = params->mode; - - // initialize output buffer, now that dimensions are known. - stride = (mode == MODE_RGB || mode == MODE_BGR) ? 3 * w - : (mode == MODE_RGBA || mode == MODE_BGRA) ? 
4 * w - : w; - size = stride * h; - - if (mode == MODE_YUV) { - uv_stride = (w + 1) / 2; - uv_size = uv_stride * ((h + 1) / 2); - } - - output = (uint8_t*)malloc(size + 2 * uv_size); - if (!output) { - return 0; - } - - params->output = output; - params->stride = stride; - params->output_size = size; - if (mode == MODE_YUV) { - params->u = output + size; - params->u_stride = uv_stride; - params->output_u_size = uv_size; - params->v = output + size + uv_size; - params->v_stride = uv_stride; - params->output_v_size = uv_size; - } - } - return 1; -} - -int WebPCheckDecParams(const VP8Io* io, const WebPDecParams* params) { - int ok = 1; - WEBP_CSP_MODE mode = params->mode; - ok &= (params->stride * io->height <= params->output_size); - if (mode == MODE_RGB || mode == MODE_BGR) { - ok &= (params->stride >= io->width * 3); - } else if (mode == MODE_RGBA || mode == MODE_BGRA) { - ok &= (params->stride >= io->width * 4); - } else { - // some extra checks for U/V - const int u_size = params->u_stride * ((io->height + 1) / 2); - const int v_size = params->v_stride * ((io->height + 1) / 2); - ok &= (params->stride >= io->width); - ok &= (params->u_stride >= (io->width + 1) / 2) && - (params->v_stride >= (io->width + 1) / 2); - ok &= (u_size <= params->output_u_size && - v_size <= params->output_v_size); - } - return ok; -} +// WebPDecParams void WebPResetDecParams(WebPDecParams* const params) { - assert(params); - memset(params, 0, sizeof(*params)); -} - -void WebPClearDecParams(WebPDecParams* params) { - assert(params); - if (!params->external_buffer) { - free(params->output); + if (params) { + memset(params, 0, sizeof(*params)); } - WebPResetDecParams(params); } //----------------------------------------------------------------------------- -// "Into" variants +// "Into" decoding variants -static uint8_t* DecodeInto(const uint8_t* data, uint32_t data_size, - WebPDecParams* params) { +// Main flow +static VP8StatusCode DecodeInto(const uint8_t* data, uint32_t data_size, + 
WebPDecParams* const params) { VP8Decoder* dec = VP8New(); + VP8StatusCode status = VP8_STATUS_OK; VP8Io io; - int ok = 1; + assert(params); if (dec == NULL) { - return NULL; + return VP8_STATUS_INVALID_PARAM; } VP8InitIo(&io); io.data = data; io.data_size = data_size; + WebPInitCustomIo(params, &io); // Plug the I/O functions. - io.opaque = params; - WebPInitCustomIo(&io); - + // Decode bitstream header, update io->width/io->height. if (!VP8GetHeaders(dec, &io)) { - VP8Delete(dec); - return NULL; + status = VP8_STATUS_BITSTREAM_ERROR; + } else { + // Allocate/check output buffers. + status = WebPAllocateDecBuffer(io.width, io.height, params->options, + params->output); + if (status == VP8_STATUS_OK) { + // Decode + if (!VP8Decode(dec, &io)) { + status = dec->status_; + } + } } - - // check output buffers - ok = WebPCheckDecParams(&io, params); - if (!ok) { - VP8Delete(dec); - return NULL; - } - - if (params->mode != MODE_YUV) { - VP8YUVInit(); - } - - ok = VP8Decode(dec, &io); VP8Delete(dec); - return ok ? 
params->output : NULL; + if (status != VP8_STATUS_OK) { + WebPFreeDecBuffer(params->output); + } + return status; +} + +// Helpers +static uint8_t* DecodeIntoRGBABuffer(WEBP_CSP_MODE colorspace, + const uint8_t* data, uint32_t data_size, + uint8_t* rgba, int stride, int size) { + WebPDecParams params; + WebPDecBuffer buf; + if (rgba == NULL) { + return NULL; + } + WebPInitDecBuffer(&buf); + WebPResetDecParams(¶ms); + params.output = &buf; + buf.colorspace = colorspace; + buf.u.RGBA.rgba = rgba; + buf.u.RGBA.stride = stride; + buf.u.RGBA.size = size; + buf.is_external_memory = 1; + if (DecodeInto(data, data_size, ¶ms) != VP8_STATUS_OK) { + return NULL; + } + return rgba; } uint8_t* WebPDecodeRGBInto(const uint8_t* data, uint32_t data_size, - uint8_t* output, int output_size, - int output_stride) { - WebPDecParams params; - - if (output == NULL) { - return NULL; - } - - WebPResetDecParams(¶ms); - params.mode = MODE_RGB; - params.output = output; - params.stride = output_stride; - params.output_size = output_size; - return DecodeInto(data, data_size, ¶ms); + uint8_t* output, int size, int stride) { + return DecodeIntoRGBABuffer(MODE_RGB, data, data_size, output, stride, size); } uint8_t* WebPDecodeRGBAInto(const uint8_t* data, uint32_t data_size, - uint8_t* output, int output_size, - int output_stride) { - WebPDecParams params; - - if (output == NULL) { - return NULL; - } - - WebPResetDecParams(¶ms); - params.mode = MODE_RGBA; - params.output = output; - params.stride = output_stride; - params.output_size = output_size; - return DecodeInto(data, data_size, ¶ms); + uint8_t* output, int size, int stride) { + return DecodeIntoRGBABuffer(MODE_RGBA, data, data_size, output, stride, size); } uint8_t* WebPDecodeBGRInto(const uint8_t* data, uint32_t data_size, - uint8_t* output, int output_size, - int output_stride) { - WebPDecParams params; - - if (output == NULL) { - return NULL; - } - - WebPResetDecParams(¶ms); - params.mode = MODE_BGR; - params.output = output; - 
params.stride = output_stride; - params.output_size = output_size; - return DecodeInto(data, data_size, ¶ms); + uint8_t* output, int size, int stride) { + return DecodeIntoRGBABuffer(MODE_BGR, data, data_size, output, stride, size); } uint8_t* WebPDecodeBGRAInto(const uint8_t* data, uint32_t data_size, - uint8_t* output, int output_size, - int output_stride) { - WebPDecParams params; - - if (output == NULL) { - return NULL; - } - - WebPResetDecParams(¶ms); - params.mode = MODE_BGRA; - params.output = output; - params.stride = output_stride; - params.output_size = output_size; - return DecodeInto(data, data_size, ¶ms); + uint8_t* output, int size, int stride) { + return DecodeIntoRGBABuffer(MODE_BGRA, data, data_size, output, stride, size); } uint8_t* WebPDecodeYUVInto(const uint8_t* data, uint32_t data_size, @@ -541,85 +161,93 @@ uint8_t* WebPDecodeYUVInto(const uint8_t* data, uint32_t data_size, uint8_t* u, int u_size, int u_stride, uint8_t* v, int v_size, int v_stride) { WebPDecParams params; - - if (luma == NULL) { + WebPDecBuffer output; + if (luma == NULL) return NULL; + WebPInitDecBuffer(&output); + WebPResetDecParams(¶ms); + params.output = &output; + output.colorspace = MODE_YUV; + output.u.YUVA.y = luma; + output.u.YUVA.y_stride = luma_stride; + output.u.YUVA.y_size = luma_size; + output.u.YUVA.u = u; + output.u.YUVA.u_stride = u_stride; + output.u.YUVA.u_size = u_size; + output.u.YUVA.v = v; + output.u.YUVA.v_stride = v_stride; + output.u.YUVA.v_size = v_size; + output.is_external_memory = 1; + if (DecodeInto(data, data_size, ¶ms) != VP8_STATUS_OK) { return NULL; } - - WebPResetDecParams(¶ms); - params.mode = MODE_YUV; - params.output = luma; - params.stride = luma_stride; - params.output_size = luma_size; - params.u = u; - params.u_stride = u_stride; - params.output_u_size = u_size; - params.v = v; - params.v_stride = v_stride; - params.output_v_size = v_size; - return DecodeInto(data, data_size, ¶ms); + return luma; } 
//----------------------------------------------------------------------------- static uint8_t* Decode(WEBP_CSP_MODE mode, const uint8_t* data, uint32_t data_size, int* width, int* height, - WebPDecParams* params_out) { - uint8_t* output; + WebPDecBuffer* keep_info) { WebPDecParams params; + WebPDecBuffer output; + WebPInitDecBuffer(&output); WebPResetDecParams(¶ms); - params.mode = mode; - if (!WebPInitDecParams(data, data_size, width, height, ¶ms)) { + params.output = &output; + output.colorspace = mode; + + // Retrieve (and report back) the required dimensions from bitstream. + if (!WebPGetInfo(data, data_size, &output.width, &output.height)) { return NULL; } + if (width) *width = output.width; + if (height) *height = output.height; - params.output_size = params.stride * (*height); - params.output_u_size = params.output_v_size = - params.u_stride * ((*height + 1) / 2); - output = DecodeInto(data, data_size, ¶ms); - if (!output) { - WebPClearDecParams(¶ms); + // Decode + if (DecodeInto(data, data_size, ¶ms) != VP8_STATUS_OK) { + return NULL; } - if (params_out) { - *params_out = params; + if (keep_info) { // keep track of the side-info + WebPCopyDecBuffer(&output, keep_info); } - return output; + // return decoded samples (don't clear 'output'!) + return (mode >= MODE_YUV) ? 
output.u.YUVA.y : output.u.RGBA.rgba; } uint8_t* WebPDecodeRGB(const uint8_t* data, uint32_t data_size, - int *width, int *height) { + int* width, int* height) { return Decode(MODE_RGB, data, data_size, width, height, NULL); } uint8_t* WebPDecodeRGBA(const uint8_t* data, uint32_t data_size, - int *width, int *height) { + int* width, int* height) { return Decode(MODE_RGBA, data, data_size, width, height, NULL); } uint8_t* WebPDecodeBGR(const uint8_t* data, uint32_t data_size, - int *width, int *height) { + int* width, int* height) { return Decode(MODE_BGR, data, data_size, width, height, NULL); } uint8_t* WebPDecodeBGRA(const uint8_t* data, uint32_t data_size, - int *width, int *height) { + int* width, int* height) { return Decode(MODE_BGRA, data, data_size, width, height, NULL); } uint8_t* WebPDecodeYUV(const uint8_t* data, uint32_t data_size, - int *width, int *height, uint8_t** u, uint8_t** v, - int *stride, int* uv_stride) { - WebPDecParams params; + int* width, int* height, uint8_t** u, uint8_t** v, + int* stride, int* uv_stride) { + WebPDecBuffer output; // only to preserve the side-infos uint8_t* const out = Decode(MODE_YUV, data, data_size, - width, height, ¶ms); + width, height, &output); if (out) { - *u = params.u; - *v = params.v; - *stride = params.stride; - *uv_stride = params.u_stride; - assert(params.u_stride == params.v_stride); + const WebPYUVABuffer* const buf = &output.u.YUVA; + *u = buf->u; + *v = buf->v; + *stride = buf->y_stride; + *uv_stride = buf->u_stride; + assert(buf->u_stride == buf->v_stride); } return out; } @@ -628,16 +256,91 @@ uint8_t* WebPDecodeYUV(const uint8_t* data, uint32_t data_size, // WebPGetInfo() int WebPGetInfo(const uint8_t* data, uint32_t data_size, - int *width, int *height) { + int* width, int* height) { const uint32_t chunk_size = WebPCheckRIFFHeader(&data, &data_size); if (!chunk_size) { return 0; // unsupported RIFF header } // Validate raw video data - if (data_size < 10) { - return 0; // not enough data + return 
VP8GetInfo(data, data_size, chunk_size, width, height, NULL); +} + +static void DefaultFeatures(WebPBitstreamFeatures* const features) { + assert(features); + memset(features, 0, sizeof(*features)); + features->bitstream_version = 0; +} + +static VP8StatusCode GetFeatures(const uint8_t** data, uint32_t* data_size, + WebPBitstreamFeatures* const features) { + uint32_t chunk_size; + if (features == NULL) { + return VP8_STATUS_INVALID_PARAM; } - return VP8GetInfo(data, chunk_size, width, height); + DefaultFeatures(features); + if (data == NULL || *data == NULL || data_size == 0) { + return VP8_STATUS_INVALID_PARAM; + } + chunk_size = WebPCheckRIFFHeader(data, data_size); + if (chunk_size == 0) { + return VP8_STATUS_BITSTREAM_ERROR; // unsupported RIFF header + } + if (!VP8GetInfo(*data, *data_size, chunk_size, + &features->width, &features->height, &features->has_alpha)) { + return VP8_STATUS_BITSTREAM_ERROR; + } + return VP8_STATUS_OK; +} + +//----------------------------------------------------------------------------- +// Advance decoding API + +int WebPInitDecoderConfigInternal(WebPDecoderConfig* const config, + int version) { + if (version != WEBP_DECODER_ABI_VERSION) { + return 0; // version mismatch + } + if (config == NULL) { + return 0; + } + memset(config, 0, sizeof(*config)); + DefaultFeatures(&config->input); + WebPInitDecBuffer(&config->output); + return 1; +} + +VP8StatusCode WebPGetFeaturesInternal(const uint8_t* data, uint32_t data_size, + WebPBitstreamFeatures* const features, + int version) { + if (version != WEBP_DECODER_ABI_VERSION) { + return VP8_STATUS_INVALID_PARAM; // version mismatch + } + if (features == NULL) { + return VP8_STATUS_INVALID_PARAM; + } + return GetFeatures(&data, &data_size, features); +} + +VP8StatusCode WebPDecode(const uint8_t* data, uint32_t data_size, + WebPDecoderConfig* const config) { + WebPDecParams params; + VP8StatusCode status; + + if (!config) { + return VP8_STATUS_INVALID_PARAM; + } + + status = GetFeatures(&data, 
&data_size, &config->input); + if (status != VP8_STATUS_OK) { + return status; + } + + WebPResetDecParams(¶ms); + params.output = &config->output; + params.options = &config->options; + status = DecodeInto(data, data_size, ¶ms); + + return status; } #if defined(__cplusplus) || defined(c_plusplus) diff --git a/src/dec/webpi.h b/src/dec/webpi.h index ce2d653b..662441da 100644 --- a/src/dec/webpi.h +++ b/src/dec/webpi.h @@ -18,46 +18,81 @@ extern "C" { #include "../webp/decode_vp8.h" -// Decoding output parameters. +//------------------------------------------------------------------------------ +// WebPDecParams: Decoding output parameters. Transcient internal object. + +typedef struct WebPDecParams WebPDecParams; +typedef int (*OutputFunc)(const VP8Io* const io, WebPDecParams* const p); + +// Structure use for on-the-fly rescaling typedef struct { - uint8_t* output; // rgb(a) or luma - uint8_t *u, *v; // chroma u/v - uint8_t *top_y, *top_u, *top_v; // cache for the fancy upscaler - int stride; // rgb(a) stride or luma stride - int u_stride; // chroma-u stride - int v_stride; // chroma-v stride - WEBP_CSP_MODE mode; // rgb(a) or yuv - int last_y; // coordinate of the line that was last output - int output_size; // size of 'output' buffer - int output_u_size; // size of 'u' buffer - int output_v_size; // size of 'v' buffer - int external_buffer; // If true, the output buffers are externally owned -} WebPDecParams; + int x_expand; // true if we're expanding in the x direction + int fy_scale, fx_scale; // fixed-point scaling factor + int64_t fxy_scale; // '' + // we need hpel-precise add/sub increments, for the downsampled U/V planes. 
+ int y_accum; // vertical accumulator + int y_add, y_sub; // vertical increments (add ~= src, sub ~= dst) + int x_add, x_sub; // horizontal increments (add ~= src, sub ~= dst) + int src_width, src_height; // source dimensions + int dst_width, dst_height; // destination dimensions + uint8_t* dst; + int dst_stride; + int32_t* irow, *frow; // work buffer +} WebPRescaler; + +struct WebPDecParams { + WebPDecBuffer* output; // output buffer. + uint8_t* tmp_y, *tmp_u, *tmp_v; // cache for the fancy upsampler + // or used for tmp rescaling + + int last_y; // coordinate of the line that was last output + const WebPDecoderOptions* options; // if not NULL, use alt decoding features + // rescalers + WebPRescaler scaler_y, scaler_u, scaler_v, scaler_a; + void* memory; // overall scratch memory for the output work. + OutputFunc emit; // output RGB or YUV samples + OutputFunc emit_alpha; // output alpha channel +}; // Should be called first, before any use of the WebPDecParams object. void WebPResetDecParams(WebPDecParams* const params); +//------------------------------------------------------------------------------ +// Misc utils + // If a RIFF container is detected, validate it and skip over it. Returns // VP8 bit-stream size if RIFF header is valid else returns 0 uint32_t WebPCheckRIFFHeader(const uint8_t** data_ptr, - uint32_t *data_size_ptr); + uint32_t* data_size_ptr); -// Initializes VP8Io with custom setup, io and teardown functions -void WebPInitCustomIo(VP8Io* const io); +// Initializes VP8Io with custom setup, io and teardown functions. The default +// hooks will use the supplied 'params' as io->opaque handle. +void WebPInitCustomIo(WebPDecParams* const params, VP8Io* const io); -// Initializes params_out by allocating output buffer and setting the -// stride information. It also outputs width and height information of -// the WebP image. Returns 1 if succeeds. 
-int WebPInitDecParams(const uint8_t* data, uint32_t data_size, int* width, - int* height, WebPDecParams* const params_out); +//------------------------------------------------------------------------------ +// Internal functions regarding WebPDecBuffer memory (in buffer.c). +// Don't really need to be externally visible for now. -// Verifies various size configurations (e.g stride >= width, specified -// output size <= stride * height etc.). Returns 0 if checks fail. -int WebPCheckDecParams(const VP8Io* io, const WebPDecParams* params); +// Prepare 'buffer' with the requested initial dimensions width/height. +// If no external storage is supplied, initializes buffer by allocating output +// memory and setting up the stride information. Validate the parameters. Return +// an error code in case of problem (no memory, or invalid stride / size / +// dimension / etc.). If *options is not NULL, also verify that the options' +// parameters are valid and apply them to the width/height dimensions of the +// output buffer. This takes cropping / scaling / rotation into account. +VP8StatusCode WebPAllocateDecBuffer(int width, int height, + const WebPDecoderOptions* const options, + WebPDecBuffer* const buffer); -// Deallocate memory allocated by WebPInitDecParams() and reset the -// WebPDecParams object. -void WebPClearDecParams(WebPDecParams* params); +// Copy 'src' into 'dst' buffer, making sure 'dst' is not marked as owner of the +// memory (still held by 'src'). +void WebPCopyDecBuffer(const WebPDecBuffer* const src, + WebPDecBuffer* const dst); + +// Copy and transfer ownership from src to dst (beware of parameter order!) 
+void WebPGrabDecBuffer(WebPDecBuffer* const src, WebPDecBuffer* const dst); + +//------------------------------------------------------------------------------ #if defined(__cplusplus) || defined(c_plusplus) } // extern "C" diff --git a/src/dec/yuv.h b/src/dec/yuv.h index 1398f2e0..0604df79 100644 --- a/src/dec/yuv.h +++ b/src/dec/yuv.h @@ -26,7 +26,7 @@ extern int16_t VP8kVToR[256], VP8kUToB[256]; extern int32_t VP8kVToG[256], VP8kUToG[256]; extern uint8_t VP8kClip[YUV_RANGE_MAX - YUV_RANGE_MIN]; -inline static void VP8YuvToRgb(uint8_t y, uint8_t u, uint8_t v, +static inline void VP8YuvToRgb(uint8_t y, uint8_t u, uint8_t v, uint8_t* const rgb) { const int r_off = VP8kVToR[v]; const int g_off = (VP8kVToG[v] + VP8kUToG[u]) >> YUV_FIX; @@ -36,7 +36,7 @@ inline static void VP8YuvToRgb(uint8_t y, uint8_t u, uint8_t v, rgb[2] = VP8kClip[y + b_off - YUV_RANGE_MIN]; } -inline static void VP8YuvToBgr(uint8_t y, uint8_t u, uint8_t v, +static inline void VP8YuvToBgr(uint8_t y, uint8_t u, uint8_t v, uint8_t* const bgr) { const int r_off = VP8kVToR[v]; const int g_off = (VP8kVToG[v] + VP8kUToG[u]) >> YUV_FIX; @@ -46,11 +46,18 @@ inline static void VP8YuvToBgr(uint8_t y, uint8_t u, uint8_t v, bgr[2] = VP8kClip[y + r_off - YUV_RANGE_MIN]; } -inline static void VP8YuvToBgra(int y, int u, int v, uint8_t* const bgra) { +static inline void VP8YuvToBgra(uint8_t y, uint8_t u, uint8_t v, + uint8_t* const bgra) { VP8YuvToBgr(y, u, v, bgra); bgra[3] = 0xff; } +static inline void VP8YuvToRgba(uint8_t y, uint8_t u, uint8_t v, + uint8_t* const rgba) { + VP8YuvToRgb(y, u, v, rgba); + rgba[3] = 0xff; +} + // Must be called before everything, to initialize the tables. 
void VP8YUVInit(void); diff --git a/src/webp/decode.h b/src/webp/decode.h index a53eb599..af132190 100644 --- a/src/webp/decode.h +++ b/src/webp/decode.h @@ -18,6 +18,8 @@ extern "C" { #endif +#define WEBP_DECODER_ABI_VERSION 0x0002 + // Return the decoder's version number, packed in hexadecimal using 8bits for // each of major/minor/revision. E.g: v2.5.7 is 0x020507. int WebPGetDecoderVersion(void); @@ -27,25 +29,25 @@ int WebPGetDecoderVersion(void); // case of formatting error. // Pointers *width/*height can be passed NULL if deemed irrelevant. int WebPGetInfo(const uint8_t* data, uint32_t data_size, - int *width, int *height); + int* width, int* height); // Decodes WEBP images pointed to by *data and returns RGB samples, along // with the dimensions in *width and *height. // The returned pointer should be deleted calling free(). // Returns NULL in case of error. uint8_t* WebPDecodeRGB(const uint8_t* data, uint32_t data_size, - int *width, int *height); + int* width, int* height); // Same as WebPDecodeRGB, but returning RGBA data. uint8_t* WebPDecodeRGBA(const uint8_t* data, uint32_t data_size, - int *width, int *height); + int* width, int* height); // This variant decode to BGR instead of RGB. uint8_t* WebPDecodeBGR(const uint8_t* data, uint32_t data_size, - int *width, int *height); + int* width, int* height); // This variant decodes to BGRA instead of RGBA. uint8_t* WebPDecodeBGRA(const uint8_t* data, uint32_t data_size, - int *width, int *height); + int* width, int* height); // Decode WEBP images stored in *data in Y'UV format(*). The pointer returned is // the Y samples buffer. Upon return, *u and *v will point to the U and V @@ -57,8 +59,8 @@ uint8_t* WebPDecodeBGRA(const uint8_t* data, uint32_t data_size, // Return NULL in case of error. // (*) Also named Y'CbCr. 
See: http://en.wikipedia.org/wiki/YCbCr uint8_t* WebPDecodeYUV(const uint8_t* data, uint32_t data_size, - int *width, int *height, uint8_t** u, uint8_t** v, - int *stride, int* uv_stride); + int* width, int* height, uint8_t** u, uint8_t** v, + int* stride, int* uv_stride); // These three functions are variants of the above ones, that decode the image // directly into a pre-allocated buffer 'output_buffer'. The maximum storage @@ -95,13 +97,59 @@ uint8_t* WebPDecodeYUVInto(const uint8_t* data, uint32_t data_size, uint8_t* v, int v_size, int v_stride); //----------------------------------------------------------------------------- +// Output colorspaces and buffer -// Output colorspaces +// Colorspaces typedef enum { MODE_RGB = 0, MODE_RGBA = 1, MODE_BGR = 2, MODE_BGRA = 3, - MODE_YUV = 4 } WEBP_CSP_MODE; + MODE_YUV = 4, MODE_YUVA = 5 // yuv 4:2:0 + } WEBP_CSP_MODE; +// Generic structure for describing the sample buffer. +typedef struct { // view as RGBA + uint8_t* rgba; // pointer to RGBA samples + int stride; // stride in bytes from one scanline to the next. + int size; // total size of the *rgba buffer. +} WebPRGBABuffer; + +typedef struct { // view as YUVA + uint8_t* y, *u, *v, *a; // pointer to luma, chroma U/V, alpha samples + int y_stride; // luma stride + int u_stride, v_stride; // chroma strides + int a_stride; // alpha stride + int y_size; // luma plane size + int u_size, v_size; // chroma planes size + int a_size; // alpha-plane size +} WebPYUVABuffer; + +// Output buffer +typedef struct { + WEBP_CSP_MODE colorspace; // Colorspace. + int width, height; // Dimensions. + int is_external_memory; // If true, the *memory pointer is not owned. + union { + WebPRGBABuffer RGBA; + WebPYUVABuffer YUVA; + } u; // nameless union of buffer parameters. 
+ uint8_t* memory; // main pointer (when is_external_memory is false) +} WebPDecBuffer; + +// Internal, version-checked, entry point +int WebPInitDecBufferInternal(WebPDecBuffer* const, int); + +// Initialize the structure as empty. Must be called before any other use. +// Returns false in case of version mismatch +static inline int WebPInitDecBuffer(WebPDecBuffer* const buffer) { + return WebPInitDecBufferInternal(buffer, WEBP_DECODER_ABI_VERSION); +} + +// Free any memory associated with the buffer. Must always be called last. +// Note: doesn't free the 'buffer' structure itself. +void WebPFreeDecBuffer(WebPDecBuffer* const buffer); + +//----------------------------------------------------------------------------- // Enumeration of the status codes + typedef enum { VP8_STATUS_OK = 0, VP8_STATUS_OUT_OF_MEMORY, @@ -116,8 +164,8 @@ typedef enum { //----------------------------------------------------------------------------- // Incremental decoding // -// This API allows streamlined decoding of partial data. -// Picture can be incrementally decoded as data become available thanks to the +// This API allows streamlined decoding of partial data. +// Picture can be incrementally decoded as data become available thanks to the // WebPIDecoder object. This object can be left in a SUSPENDED state if the // picture is only partially decoded, pending additional input. // Code example: @@ -138,7 +186,16 @@ typedef enum { typedef struct WebPIDecoder WebPIDecoder; +// Creates a new incremental decoder with the supplied buffer parameter. +// This output_buffer can be passed NULL, in which case a default output buffer +// is used (with MODE_RGB). Otherwise, an internal reference to 'output_buffer' +// is kept, which means that the lifespan of 'output_buffer' must be larger than +// that of the returned WebPIDecoder object. +// Returns NULL if the allocation failed. +WebPIDecoder* WebPINewDecoder(WebPDecBuffer* const output_buffer); + // Creates a WebPIDecoder object. 
Returns NULL in case of failure. +// TODO(skal): DEPRECATED. Prefer using WebPINewDecoder(). WebPIDecoder* WebPINew(WEBP_CSP_MODE mode); // This function allocates and initializes an incremental-decoder object, which @@ -183,7 +240,7 @@ VP8StatusCode WebPIUpdate(WebPIDecoder* const idec, const uint8_t* data, // specified in WebPINew()/WebPINewRGB(). *last_y is the index of last decoded // row in raster scan order. Some pointers (*last_y, *width etc.) can be NULL if // corresponding information is not needed. -uint8_t* WebPIDecGetRGB(const WebPIDecoder* const idec, int *last_y, +uint8_t* WebPIDecGetRGB(const WebPIDecoder* const idec, int* last_y, int* width, int* height, int* stride); // Same as above function to get YUV image. Returns pointer to the luma plane @@ -192,6 +249,122 @@ uint8_t* WebPIDecGetYUV(const WebPIDecoder* const idec, int* last_y, uint8_t** u, uint8_t** v, int* width, int* height, int* stride, int* uv_stride); +// Generic call to retrieve output buffer information. +// Returns NULL in case of error, otherwise returns the pointer to the internal +// representation. This structure is read-only and shouldn't be modified. +// TODO(skal): instead of 'last_y' only, we should pass *left/top/right/bottom, +// to get the visible area. Esp. useful for rotation. +const WebPDecBuffer* WebPIDecGetSamples(const WebPIDecoder* const idec, + int* last_y); + +//----------------------------------------------------------------------------- +// Advanced decoding parametrization +// +// Code sample for using the advanced decoding API +/* + // A) Init a configuration object + WebPDecoderConfig config; + CHECK(WebPInitDecoderConfig(&config)); + + // B) optional: retrieve the bitstream's features. + CHECK(WebPGetFeatures(data, data_size, &config.input) == VP8_STATUS_OK); + + // C) Adjust 'config', if needed + config.no_fancy = 1; + config.output.colorspace = MODE_BGRA; + // etc. 
+ + // Note that you can also make config.output point to an externally + // supplied memory buffer, provided it's big enough to store the decoded + // picture. Otherwise, config.output will just be used to allocate memory + // and store the decoded picture. + + // D) Decode! + CHECK(WebPDecode(data, data_size, &config) == VP8_STATUS_OK); + + // E) Decoded image is now in config.output (and config.output.u.RGBA) + + // F) Reclaim memory allocated in config's object. It's safe to call + // this function even if the memory is external and wasn't allocated + // by WebPDecode(). + WebPFreeDecBuffer(&config.output); +*/ + +// Features gathered from the bitstream +typedef struct { + int width; // the original width, as read from the bitstream + int height; // the original height, as read from the bitstream + int has_alpha; // true if bitstream contains an alpha channel + int no_incremental_decoding; // if true, using incremental decoding is not + // recommended. + int rotate; // TODO(later) + int uv_sampling; // should be 0 for now. TODO(later) + int bitstream_version; // should be 0 for now. TODO(later) +} WebPBitstreamFeatures; + +// Internal, version-checked, entry point +extern VP8StatusCode WebPGetFeaturesInternal(const uint8_t*, uint32_t, + WebPBitstreamFeatures* const, int); + +// Retrieve features from the bitstream. The *features structure is filled +// with informations gathered from the bitstream. +// Returns false in case of error or version mismatch. +// In case of error, features->bitstream_status will reflect the error code. 
+static inline + VP8StatusCode WebPGetFeatures(const uint8_t* data, uint32_t data_size, + WebPBitstreamFeatures* const features) { + return WebPGetFeaturesInternal(data, data_size, features, + WEBP_DECODER_ABI_VERSION); +} + +// Decoding options +typedef struct { + int bypass_filtering; // if true, skip the in-loop filtering + int no_fancy_upsampling; // if true, use faster pointwise upsampler + int use_cropping; // if true, cropping is applied _first_ + int crop_left, crop_top; // top-left position for cropping. + // Will be snapped to even values. + int crop_width, crop_height; // dimension of the cropping area + int use_scaling; // if true, scaling is applied _afterward_ + int scaled_width, scaled_height; // final resolution + int force_rotation; // forced rotation (to be applied _last_) + int no_enhancement; // if true, discard enhancement layer +} WebPDecoderOptions; + +// Main object storing the configuration for advanced decoding. +typedef struct { + WebPBitstreamFeatures input; // Immutable bitstream features (optional) + WebPDecBuffer output; // Output buffer (can point to external mem) + WebPDecoderOptions options; // Decoding options +} WebPDecoderConfig; + +// Internal, version-checked, entry point +extern int WebPInitDecoderConfigInternal(WebPDecoderConfig* const, int); + +// Initialize the configuration as empty. This function must always be +// called first, unless WebPGetFeatures() is to be called. +// Returns false in case of mismatched version. +static inline int WebPInitDecoderConfig(WebPDecoderConfig* const config) { + return WebPInitDecoderConfigInternal(config, WEBP_DECODER_ABI_VERSION); +} + +// Instantiate a new incremental decoder object with requested configuration. +// The bitstream can be passed using *data and data_size parameter, +// in which case the features will be parsed and stored into config->input. +// Otherwise, 'data' can be NULL and now parsing will occur. 
+// Note that 'config' can be NULL too, in which case a default configuration is +// used. +// The return WebPIDecoder object must always be deleted calling WebPIDelete(). +// Returns NULL in case of error (and config->status will then reflect +// the error condition). +WebPIDecoder* WebPIDecode(const uint8_t* data, uint32_t data_size, + WebPDecoderConfig* const config); + +// Non-incremental version. This version decodes the full data at once, taking +// 'config' into account. Return decoding status (VP8_STATUS_OK if decoding +// was successful). +VP8StatusCode WebPDecode(const uint8_t* data, uint32_t data_size, + WebPDecoderConfig* const config); #if defined(__cplusplus) || defined(c_plusplus) } // extern "C" diff --git a/src/webp/decode_vp8.h b/src/webp/decode_vp8.h index 3f24ea24..68a9f975 100644 --- a/src/webp/decode_vp8.h +++ b/src/webp/decode_vp8.h @@ -18,8 +18,6 @@ extern "C" { #endif -#define WEBP_DECODER_ABI_VERSION 0x0002 - //----------------------------------------------------------------------------- // Lower-level API // @@ -42,12 +40,16 @@ extern "C" { typedef struct VP8Io VP8Io; struct VP8Io { // set by VP8GetHeaders() - int width, height; // picture dimensions, in pixels + int width, height; // picture dimensions, in pixels (invariable). + // These are the original, uncropped dimensions. + // The actual area passed to put() is stored + // in mb_w / mb_h fields. // set before calling put() int mb_y; // position of the current rows (in pixels) + int mb_w; // number of columns in the sample int mb_h; // number of rows in the sample - const uint8_t *y, *u, *v; // rows to copy (in yuv420 format) + const uint8_t* y, *u, *v; // rows to copy (in yuv420 format) int y_stride; // row stride for luma int uv_stride; // row stride for chroma @@ -56,7 +58,8 @@ struct VP8Io { // called when fresh samples are available. Currently, samples are in // YUV420 format, and can be up to width x 24 in size (depending on the // in-loop filtering level, e.g.). 
Should return false in case of error - // or abort request. + // or abort request. The actual size of the area to update is mb_w x mb_h + // in size, taking cropping into account. int (*put)(const VP8Io* io); // called just before starting to decode the blocks. @@ -69,7 +72,7 @@ struct VP8Io { // this is a recommendation for the user-side yuv->rgb converter. This flag // is set when calling setup() hook and can be overwritten by it. It then // can be taken into consideration during the put() method. - int fancy_upscaling; + int fancy_upsampling; // Input buffer. uint32_t data_size; @@ -81,6 +84,14 @@ struct VP8Io { // with the VP8 specifications. int bypass_filtering; + // Cropping parameters. + int use_cropping; + int crop_left, crop_right, crop_top, crop_bottom; + + // Scaling parameters. + int use_scaling; + int scaled_width, scaled_height; + // pointer to the alpha data (if present) corresponding to the rows const uint8_t* a; };