diff --git a/Android.mk b/Android.mk
index c087446e..553aad61 100644
--- a/Android.mk
+++ b/Android.mk
@@ -12,6 +12,8 @@ LOCAL_SRC_FILES := \
 	src/dec/tree.c \
 	src/dec/vp8.c \
 	src/dec/webp.c \
+	src/dec/io.c \
+	src/dec/buffer.c \
 	src/dec/yuv.c \
 	src/enc/alpha.c \
 	src/enc/analysis.c \
diff --git a/Makefile.vc b/Makefile.vc
index d35238bb..ad5f7d0c 100644
--- a/Makefile.vc
+++ b/Makefile.vc
@@ -122,6 +122,8 @@ X_OBJS= \
 	$(DIROBJ)\dec\tree.obj \
 	$(DIROBJ)\dec\vp8.obj \
 	$(DIROBJ)\dec\webp.obj \
+	$(DIROBJ)\dec\io.obj \
+	$(DIROBJ)\dec\buffer.obj \
 	$(DIROBJ)\dec\yuv.obj \
 	$(DIROBJ)\dec\idec.obj \
 	$(DIROBJ)\dec\alpha.obj \
diff --git a/README b/README
index 1ecefb5e..37cab537 100644
--- a/README
+++ b/README
@@ -144,6 +144,7 @@ options:
   -pass <int> ............ analysis pass number (1..10)
   -partitions <int> ...... number of partitions to use (0..3)
   -crop <x> <y> <w> <h> .. crop picture with the given rectangle
+  -resize <w> <h> ........ resize picture (after any cropping)
   -map <int> ............. print map of extra info.
   -d <file.pgm> .......... dump the compressed output (PGM file).
 
@@ -201,6 +202,25 @@ file test.webp decodes to exactly the same as test_ref.ppm by using:
  ./dwebp test.webp -ppm -o test.ppm
  diff test.ppm test_ref.ppm
 
+The full list of options is available using -h:
+
+> dwebp -h
+Usage: dwebp in_file [options] [-o out_file]
+
+Decodes the WebP image file to PNG format [Default]
+Use following options to convert into alternate image formats:
+  -ppm ......... save the raw RGB samples as color PPM
+  -pgm ......... save the raw YUV samples as a grayscale PGM
+                 file with IMC4 layout.
+ Other options are:
+  -version  .... print version number and exit.
+  -nofancy ..... don't use the fancy YUV420 upscaler.
+  -nofilter .... disable in-loop filtering.
+  -crop <x> <y> <w> <h> ... crop output with the given rectangle
+  -scale <w> <h> .......... scale the output (*after* any cropping)
+  -h     ....... this help message.
+  -v     ....... verbose (e.g. print encoding/decoding times)
+  -noasm ....... disable all assembly optimizations.
 
 Encoding API:
 ===========
diff --git a/examples/dwebp.c b/examples/dwebp.c
index 8b955466..406b0e4c 100644
--- a/examples/dwebp.c
+++ b/examples/dwebp.c
@@ -5,8 +5,7 @@
 //  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
 // -----------------------------------------------------------------------------
 //
-//  simple command-line example calling libwebpdecode to
-//  decode a WebP image into a PPM image.
+//  Command-line tool for decoding a WebP image
 //
 //  Compile with:     gcc -o dwebp dwebp.c -lwebpdecode
 //
@@ -45,11 +44,18 @@
 extern "C" {
 #endif
 
-//-----------------------------------------------------------------------------
-
+static int verbose = 0;
 extern void* VP8DecGetCPUInfo;   // opaque forward declaration.
 
-static int verbose = 0;
+//-----------------------------------------------------------------------------
+
+// Output file formats selectable on the command line.
+typedef enum {
+  PNG = 0,          // default format
+  PPM,              // selected with -ppm
+  PGM,              // selected with -pgm
+  ALPHA_PLANE_ONLY  // this is for experimenting only
+} OutputFileFormat;
 
 #ifdef HAVE_WINCODEC_H
 
@@ -69,7 +75,8 @@ static int verbose = 0;
 #define MAKE_REFGUID(x) &(x)
 #endif
 
-static HRESULT CreateOutputStream(const char* out_file_name, IStream** ppStream) {
+static HRESULT CreateOutputStream(const char* out_file_name,
+                                  IStream** ppStream) {
   HRESULT hr = S_OK;
   IFS(SHCreateStreamOnFileA(out_file_name, STGM_WRITE | STGM_CREATE, ppStream));
   if (FAILED(hr))
@@ -117,8 +124,13 @@ static HRESULT WriteUsingWIC(const char* out_file_name, REFGUID container_guid,
   return hr;
 }
 
-static int WritePNG(const char* out_file_name, unsigned char* rgb, int stride,
-                    uint32_t width, uint32_t height, int has_alpha) {
+static int WritePNG(const char* out_file_name,
+                    const WebPDecBuffer* const buffer) {
+  const uint32_t width = buffer->width;
+  const uint32_t height = buffer->height;
+  unsigned char* const rgb = buffer->u.RGBA.rgba;
+  const int stride = buffer->u.RGBA.stride;
+  const int has_alpha = (buffer->colorspace == MODE_RGBA);
   assert(!has_alpha);   // TODO(mikolaj)
   return SUCCEEDED(WriteUsingWIC(out_file_name,
              MAKE_REFGUID(GUID_ContainerFormatPng), rgb, stride, width,
@@ -131,8 +143,12 @@ static void PNGAPI error_function(png_structp png, png_const_charp dummy) {
   longjmp(png_jmpbuf(png), 1);
 }
 
-static int WritePNG(FILE* out_file, unsigned char* rgb, int stride,
-                    png_uint_32 width, png_uint_32 height, int has_alpha) {
+static int WritePNG(FILE* out_file, const WebPDecBuffer* const buffer) {
+  const uint32_t width = buffer->width;
+  const uint32_t height = buffer->height;
+  unsigned char* const rgb = buffer->u.RGBA.rgba;
+  const int stride = buffer->u.RGBA.stride;
+  const int has_alpha = (buffer->colorspace == MODE_RGBA);
   png_structp png;
   png_infop info;
   png_uint_32 y;
@@ -169,8 +185,7 @@ static int WritePNG(FILE* out_file, unsigned char* rgb, int stride,
 
 typedef uint32_t png_uint_32;
 
-static int WritePNG(FILE* out_file, unsigned char* rgb, int stride,
-                    png_uint_32 width, png_uint_32 height, int has_alpha) {
+static int WritePNG(FILE* out_file, const WebPDecBuffer* const buffer) {
   printf("PNG support not compiled. Please install the libpng development "
          "package before building.\n");
   printf("You can run with -ppm flag to decode in PPM format.\n");
@@ -178,84 +193,157 @@ static int WritePNG(FILE* out_file, unsigned char* rgb, int stride,
 }
 #endif
 
-static int WritePPM(FILE* fout, const unsigned char* rgb,
-                    uint32_t width, uint32_t height) {
-  fprintf(fout, "P6\n%d %d\n255\n", width, height);
-  return (fwrite(rgb, width * height, 3, fout) == 3);
-}
-
-static int WriteAlphaPlane(FILE* fout, const unsigned char* rgba,
-                           uint32_t width, uint32_t height) {
+static int WritePPM(FILE* fout, const WebPDecBuffer* const buffer) {
+  const uint32_t width = buffer->width;
+  const uint32_t height = buffer->height;
+  const unsigned char* const rgb = buffer->u.RGBA.rgba;   // first output row
+  const int stride = buffer->u.RGBA.stride;   // offset between rows of 'rgb'
   uint32_t y;
-  fprintf(fout, "P5\n%d %d\n255\n", width, height);
+  fprintf(fout, "P6\n%d %d\n255\n", width, height);   // PPM header
   for (y = 0; y < height; ++y) {
-    const unsigned char* line = rgba + y * (width * 4);
-    uint32_t x;
-    for (x = 0; x < width; ++x) {
-      if (fputc(line[4 * x + 3], fout) == EOF) {
-        return 0;
-      }
+    if (fwrite(rgb + y * stride, width, 3, fout) != 3) {   // 3 * width bytes
+      return 0;   // the row could not be written entirely
     }
   }
   return 1;
 }
 
-static int WritePGM(FILE* fout,
-                    unsigned char* y_plane, unsigned char *u, unsigned char* v,
-                    int y_stride, int uv_stride,
-                    uint32_t width, uint32_t height) {
+// Dumps the alpha plane of 'buffer' to 'fout' as a binary PGM ("P5") image.
+// Returns 1 on success, 0 as soon as a row cannot be written.
+static int WriteAlphaPlane(FILE* fout, const WebPDecBuffer* const buffer) {
+  const WebPYUVABuffer* const planes = &buffer->u.YUVA;
+  const unsigned char* const a = planes->a;
+  const int alpha_stride = planes->a_stride;
+  const uint32_t num_cols = buffer->width, num_rows = buffer->height;
+  uint32_t j;
+  assert(a != NULL);
+  fprintf(fout, "P5\n%d %d\n255\n", num_cols, num_rows);
+  for (j = 0; j < num_rows; ++j) {
+    if (fwrite(a + j * alpha_stride, num_cols, 1, fout) != 1) return 0;
+  }
+  return 1;
+}
+
+static int WritePGM(FILE* fout, const WebPDecBuffer* const buffer) {
+  const int width = buffer->width;
+  const int height = buffer->height;
+  const WebPYUVABuffer* const yuv = &buffer->u.YUVA;
   // Save a grayscale PGM file using the IMC4 layout
   // (http://www.fourcc.org/yuv.php#IMC4). This is a very
   // convenient format for viewing the samples, esp. for
   // odd dimensions.
   int ok = 1;
-  unsigned int y;
-  const unsigned int uv_width = (width + 1) / 2;
-  const unsigned int uv_height = (height + 1) / 2;
-  const unsigned int out_stride = (width + 1) & ~1;
-  fprintf(fout, "P5\n%d %d\n255\n", out_stride, height + uv_height);
+  int y;
+  const int uv_width = (width + 1) / 2;      // chroma width, rounded up
+  const int uv_height = (height + 1) / 2;    // chroma height, rounded up
+  const int out_stride = (width + 1) & ~1;   // width rounded up to even
+  const int a_height = yuv->a ? height : 0;  // no alpha rows without alpha
+  fprintf(fout, "P5\n%d %d\n255\n", out_stride, height + uv_height + a_height);
   for (y = 0; ok && y < height; ++y) {
-    ok &= (fwrite(y_plane + y * y_stride, width, 1, fout) == 1);
+    ok &= (fwrite(yuv->y + y * yuv->y_stride, width, 1, fout) == 1);
     if (width & 1) fputc(0, fout);    // padding byte
   }
   for (y = 0; ok && y < uv_height; ++y) {
-    ok &= (fwrite(u + y * uv_stride, uv_width, 1, fout) == 1);
-    ok &= (fwrite(v + y * uv_stride, uv_width, 1, fout) == 1);
+    ok &= (fwrite(yuv->u + y * yuv->u_stride, uv_width, 1, fout) == 1);
+    ok &= (fwrite(yuv->v + y * yuv->v_stride, uv_width, 1, fout) == 1);
+  }
+  for (y = 0; ok && y < a_height; ++y) {   // alpha rows come last, if any
+    ok &= (fwrite(yuv->a + y * yuv->a_stride, width, 1, fout) == 1);
+    if (width & 1) fputc(0, fout);    // padding byte
   }
   return ok;
 }
 
-typedef enum {
-  PNG = 0,
-  PPM,
-  PGM,
-  ALPHA_PLANE_ONLY  // this is for experimenting only
-} OutputFileFormat;
+// Writes the decoded 'buffer' to 'out_file' in the requested 'format' and
+// prints the outcome (plus the write time if 'verbose'); errors go to stderr.
+static void SaveOutput(const WebPDecBuffer* const buffer,
+                       OutputFileFormat format, const char* const out_file) {
+  FILE* fout = NULL;
+  int needs_open_file = 1;
+  int ok = 1;
+  Stopwatch stop_watch;
+  if (verbose) StopwatchReadAndReset(&stop_watch);
+
+  // Only the WIC flavor of WritePNG() takes a file name instead of a FILE*,
+  // so use the same macro here as the one selecting the WritePNG() call below.
+#ifdef HAVE_WINCODEC_H
+  needs_open_file = (format != PNG);
+#endif
+  if (needs_open_file) {
+    fout = fopen(out_file, "wb");
+    if (!fout) {
+      fprintf(stderr, "Error opening output file %s\n", out_file);
+      return;
+    }
+  }
+
+  if (format == PNG) {
+#ifdef HAVE_WINCODEC_H
+    ok &= WritePNG(out_file, buffer);
+#else
+    ok &= WritePNG(fout, buffer);
+#endif
+  } else if (format == PPM) {
+    ok &= WritePPM(fout, buffer);
+  } else if (format == PGM) {
+    ok &= WritePGM(fout, buffer);
+  } else if (format == ALPHA_PLANE_ONLY) {
+    ok &= WriteAlphaPlane(fout, buffer);
+  }
+  if (fout) ok &= (fclose(fout) == 0);   // write errors may only show up here
+  if (ok) {
+    printf("Saved file %s\n", out_file);
+    if (verbose) {
+      const double time = StopwatchReadAndReset(&stop_watch);
+      printf("Time to write output: %.3fs\n", time);
+    }
+  } else {
+    fprintf(stderr, "Error writing file %s !!\n", out_file);
+  }
+}
 
 static void Help(void) {
-  printf("Usage: dwebp "
-         "[in_file] [-h] [-v] [-ppm] [-pgm] [-version] [-o out_file]\n\n"
+  printf("Usage: dwebp in_file [options] [-o out_file]\n\n"   // also in README
          "Decodes the WebP image file to PNG format [Default]\n"
          "Use following options to convert into alternate image formats:\n"
-         " -ppm:  save the raw RGB samples as color PPM\n"
-         " -pgm:  save the raw YUV samples as a grayscale PGM\n"
-         "        file with IMC4 layout.\n"
-         " -version: print version number and exit.\n"
-         "Use -v for verbose (e.g. print encoding/decoding times)\n"
-         "Use -noasm to disable all assembly optimizations.\n"
+         "  -ppm ......... save the raw RGB samples as color PPM\n"
+         "  -pgm ......... save the raw YUV samples as a grayscale PGM\n"
+         "                 file with IMC4 layout.\n"
+         " Other options are:\n"
+         "  -version  .... print version number and exit.\n"
+         "  -nofancy ..... don't use the fancy YUV420 upscaler.\n"
+         "  -nofilter .... disable in-loop filtering.\n"
+         "  -crop <x> <y> <w> <h> ... crop output with the given rectangle\n"
+         "  -scale <w> <h> .......... scale the output (*after* any cropping)\n"
+#ifdef WEBP_EXPERIMENTAL_FEATURES   // main() parses -alpha regardless
+         "  -alpha ....... only save the alpha plane.\n"
+#endif
+         "  -h     ....... this help message.\n"
+         "  -v     ....... verbose (e.g. print encoding/decoding times)\n"
+         "  -noasm ....... disable all assembly optimizations.\n"
         );
 }
 
+static const char* const kStatusMessages[] = {   // indexed by VP8StatusCode
+  "OK", "OUT_OF_MEMORY", "INVALID_PARAM", "BITSTREAM_ERROR",
+  "UNSUPPORTED_FEATURE", "SUSPENDED", "USER_ABORT", "NOT_ENOUGH_DATA"
+};
+
 int main(int argc, const char *argv[]) {
   const char *in_file = NULL;
   const char *out_file = NULL;
 
-  int width, height, stride, uv_stride;
-  int has_alpha = 0;
-  uint8_t* out = NULL, *u = NULL, *v = NULL;
+  WebPDecoderConfig config;
+  WebPDecBuffer* const output_buffer = &config.output;
+  WebPBitstreamFeatures* const bitstream = &config.input;
   OutputFileFormat format = PNG;
-  Stopwatch stop_watch;
   int c;
+
+  if (!WebPInitDecoderConfig(&config)) {
+    fprintf(stderr, "Library version mismatch!\n");
+    return -1;
+  }
+
   for (c = 1; c < argc; ++c) {
     if (!strcmp(argv[c], "-h") || !strcmp(argv[c], "-help")) {
       Help();
@@ -264,6 +352,10 @@ int main(int argc, const char *argv[]) {
       out_file = argv[++c];
     } else if (!strcmp(argv[c], "-alpha")) {
       format = ALPHA_PLANE_ONLY;
+    } else if (!strcmp(argv[c], "-nofancy")) {
+      config.options.no_fancy_upsampling = 1;
+    } else if (!strcmp(argv[c], "-nofilter")) {
+      config.options.bypass_filtering = 1;
     } else if (!strcmp(argv[c], "-ppm")) {
       format = PPM;
     } else if (!strcmp(argv[c], "-version")) {
@@ -273,6 +365,16 @@ int main(int argc, const char *argv[]) {
       return 0;
     } else if (!strcmp(argv[c], "-pgm")) {
       format = PGM;
+    } else if (!strcmp(argv[c], "-crop") && c < argc - 4) {
+      config.options.use_cropping = 1;
+      config.options.crop_left   = strtol(argv[++c], NULL, 0);
+      config.options.crop_top    = strtol(argv[++c], NULL, 0);
+      config.options.crop_width  = strtol(argv[++c], NULL, 0);
+      config.options.crop_height = strtol(argv[++c], NULL, 0);
+    } else if (!strcmp(argv[c], "-scale") && c < argc - 2) {
+      config.options.use_scaling = 1;
+      config.options.scaled_width  = strtol(argv[++c], NULL, 0);
+      config.options.scaled_height = strtol(argv[++c], NULL, 0);
     } else if (!strcmp(argv[c], "-v")) {
       verbose = 1;
     } else if (!strcmp(argv[c], "-noasm")) {
@@ -293,10 +395,13 @@ int main(int argc, const char *argv[]) {
   }
 
   {
+    Stopwatch stop_watch;
+    VP8StatusCode status = VP8_STATUS_OK;
+    int ok;
     uint32_t data_size = 0;
     void* data = NULL;
-    int ok;
     FILE* const in = fopen(in_file, "rb");
+
     if (!in) {
       fprintf(stderr, "cannot open input file '%s'\n", in_file);
       return 1;
@@ -308,101 +413,70 @@ int main(int argc, const char *argv[]) {
     ok = (fread(data, data_size, 1, in) == 1);
     fclose(in);
     if (!ok) {
+      fprintf(stderr, "Could not read %d bytes of data from file %s\n",
+              data_size, in_file);
       free(data);
       return -1;
     }
 
     if (verbose)
       StopwatchReadAndReset(&stop_watch);
+
+    status = WebPGetFeatures((const uint8_t*)data, data_size, bitstream);
+    if (status != VP8_STATUS_OK) {
+      goto end;
+    }
+
     switch (format) {
       case PNG:
 #ifdef _WIN32
-        out = WebPDecodeBGR((const uint8_t*)data, data_size, &width, &height);
-        stride = 3 * width;
-        has_alpha = 0;
+        // TODO(mikolaj): no alpha for now
+        output_buffer->colorspace = MODE_BGR;
 #else
-        out = WebPDecodeRGBA((const uint8_t*)data, data_size, &width, &height);
-        stride = 4 * width;
-        has_alpha = 1;
+        output_buffer->colorspace = bitstream->has_alpha ? MODE_RGBA : MODE_RGB;
 #endif
         break;
       case PPM:
-        out = WebPDecodeRGB((const uint8_t*)data, data_size, &width, &height);
+        output_buffer->colorspace = MODE_RGB;  // drops alpha for PPM
         break;
       case PGM:
-        out = WebPDecodeYUV((const uint8_t*)data, data_size, &width, &height,
-                            &u, &v, &stride, &uv_stride);
+        output_buffer->colorspace = bitstream->has_alpha ? MODE_YUVA : MODE_YUV;
         break;
       case ALPHA_PLANE_ONLY:
-        out = WebPDecodeRGBA((const uint8_t*)data, data_size, &width, &height);
+        output_buffer->colorspace = MODE_YUVA;
         break;
       default:
         free(data);
         return -1;
     }
+    status = WebPDecode((const uint8_t*)data, data_size, &config);
 
     if (verbose) {
       const double time = StopwatchReadAndReset(&stop_watch);
       printf("Time to decode picture: %.3fs\n", time);
     }
-
+ end:
     free(data);
-  }
-
-  if (!out) {
-    fprintf(stderr, "Decoding of %s failed.\n", in_file);
-    return -1;
+    ok = (status == VP8_STATUS_OK);
+    if (!ok) {
+      fprintf(stderr, "Decoding of %s failed.\n", in_file);
+      fprintf(stderr, "Status: %d (%s)\n", status, kStatusMessages[status]);
+      return -1;
+    }
   }
 
   if (out_file) {
-    FILE* fout = NULL;
-    int needs_open_file = 0;
-
-    printf("Decoded %s. Dimensions: %d x %d. Now saving...\n", in_file, width, height);
-    StopwatchReadAndReset(&stop_watch);
-#ifdef _WIN32
-    if (format != PNG) {
-      needs_open_file = 1;
-    }
-#else
-    needs_open_file = 1;
-#endif
-    if (needs_open_file) fout = fopen(out_file, "wb");
-    if (!needs_open_file || fout) {
-      int ok = 1;
-      if (format == PNG) {
-#ifdef HAVE_WINCODEC_H
-        ok &= WritePNG(out_file, out, stride, width, height, has_alpha);
-#else
-        ok &= WritePNG(fout, out, stride, width, height, has_alpha);
-#endif
-      } else if (format == PPM) {
-        ok &= WritePPM(fout, out, width, height);
-      } else if (format == PGM) {
-        ok &= WritePGM(fout, out, u, v, stride, uv_stride, width, height);
-      } else if (format == ALPHA_PLANE_ONLY) {
-        ok &= WriteAlphaPlane(fout, out, width, height);
-      }
-      if (fout)
-        fclose(fout);
-      if (ok) {
-        printf("Saved file %s\n", out_file);
-        if (verbose) {
-          const double time = StopwatchReadAndReset(&stop_watch);
-          printf("Time to write output: %.3fs\n", time);
-        }
-      } else {
-        fprintf(stderr, "Error writing file %s !!\n", out_file);
-      }
-    } else {
-      fprintf(stderr, "Error opening output file %s\n", out_file);
-    }
+    printf("Decoded %s. Dimensions: %d x %d%s. Now saving...\n", in_file,
+           output_buffer->width, output_buffer->height,
+           bitstream->has_alpha ? " (with alpha)" : "");
+    SaveOutput(output_buffer, format, out_file);
   } else {
-    printf("File %s can be decoded (dimensions: %d x %d).\n",
-           in_file, width, height);
+    printf("File %s can be decoded (dimensions: %d x %d)%s.\n",
+           in_file, output_buffer->width, output_buffer->height,
+           bitstream->has_alpha ? " (with alpha)" : "");
     printf("Nothing written; use -o flag to save the result as e.g. PNG.\n");
   }
-  free(out);
+  WebPFreeDecBuffer(output_buffer);
 
   return 0;
 }
diff --git a/makefile.unix b/makefile.unix
index bf57f705..9c0dc757 100644
--- a/makefile.unix
+++ b/makefile.unix
@@ -56,7 +56,8 @@ OBJS = src/enc/webpenc.o src/enc/bit_writer.o src/enc/syntax.o \
        src/enc/layer.o \
        src/dec/bits.o src/dec/dsp.o src/dec/dsp_sse2.o src/dec/frame.o \
        src/dec/webp.o src/dec/quant.o src/dec/tree.o src/dec/vp8.o \
-       src/dec/yuv.o  src/dec/idec.o src/dec/alpha.o src/dec/layer.o
+       src/dec/yuv.o src/dec/idec.o src/dec/alpha.o src/dec/layer.o \
+       src/dec/io.o src/dec/buffer.o
 HDRS = src/webp/encode.h src/enc/vp8enci.h src/enc/bit_writer.h \
        src/enc/cost.h src/dec/bits.h  src/dec/vp8i.h src/dec/yuv.h
 OUTPUT = examples/cwebp examples/dwebp src/libwebp.a
diff --git a/man/cwebp.1 b/man/cwebp.1
index 80bd90a8..b7f02b65 100644
--- a/man/cwebp.1
+++ b/man/cwebp.1
@@ -1,5 +1,5 @@
 .\"                                      Hey, EMACS: -*- nroff -*-
-.TH CWEBP 1 "March  28, 2011"
+.TH CWEBP 1 "June  20, 2011"
 .SH NAME
 cwebp \- compress an image file to a WebP file
 .SH SYNOPSIS
@@ -102,8 +102,8 @@ options \fB\-size\fP or \fB\-psnr\fP. Maximum value is 10.
 .TP
 .B \-crop x_position y_position width height
 Crop the source to a rectangle with top-left corner at coordinates
-(x_position, y_position) and size width x height. This cropping area must
-be fully contained within the source rectangle.
+(\fBx_position\fP, \fBy_position\fP) and size \fBwidth\fP x \fBheight\fP.
+This cropping area must be fully contained within the source rectangle.
 .TP
 .B \-s width height
 Specify that the input file actually consists of raw Y'CbCr samples following
diff --git a/man/dwebp.1 b/man/dwebp.1
index af6003d5..b0a923fb 100644
--- a/man/dwebp.1
+++ b/man/dwebp.1
@@ -1,5 +1,5 @@
 .\"                                      Hey, EMACS: -*- nroff -*-
-.TH DWEBP 1 "March  28, 2011"
+.TH DWEBP 1 "June  20, 2011"
 .SH NAME
 dwebp \- decompress a WebP file to an image file
 .SH SYNOPSIS
@@ -32,6 +32,29 @@ Change the output format to PGM. The output consist of luma/chroma
 samples instead of RGB, using the ICM4 layout. This option is mainly
 for verification and debugging purpose.
 .TP
+.B \-nofancy
+Don't use the fancy upscaler for YUV420. This may lead to jagged
+edges (especially the red ones), but should be faster.
+.TP
+.B \-nofilter
+Don't use the in-loop filtering process even if it is required by
+the bitstream. This may produce visible blocks on the non-compliant output,
+but will make the decoding faster.
+.TP
+.B \-crop x_position y_position width height
+Crop the decoded picture to a rectangle with top-left corner at coordinates
+(\fBx_position\fP, \fBy_position\fP) and size \fBwidth\fP x \fBheight\fP.
+This cropping area must be fully contained within the source rectangle.
+The top-left corner will be snapped to even coordinates if needed.
+This option is meant to reduce the memory needed for cropping large images.
+Note: the cropping is applied \fIbefore\fP any scaling.
+.TP
+.B \-scale width height
+Rescale the decoded picture to dimension \fBwidth\fP x \fBheight\fP. This option is
+mostly intended to reduce the memory needed to decode large images,
+when only a small version is needed (thumbnail, preview, etc.).
+Note: scaling is applied \fIafter\fP cropping.
+.TP
 .B \-v
 Print extra information (decoding time in particular).
 .TP
diff --git a/src/dec/Makefile.am b/src/dec/Makefile.am
index 6fb27588..e859090c 100644
--- a/src/dec/Makefile.am
+++ b/src/dec/Makefile.am
@@ -2,7 +2,7 @@ AM_CPPFLAGS = -I$(top_srcdir)/src
 
 libwebpdecode_la_SOURCES = bits.h vp8i.h yuv.h bits.c dsp.c dsp_sse2.c frame.c \
                           quant.c tree.c vp8.c webp.c yuv.c idec.c alpha.c \
-                          layer.c
+                          layer.c io.c buffer.c
 libwebpdecode_la_LDFLAGS = -version-info 0:0:0
 libwebpdecode_la_CPPFLAGS = $(USE_EXPERIMENTAL_CODE)
 libwebpdecodeinclude_HEADERS = ../webp/decode.h ../webp/decode_vp8.h ../webp/types.h
diff --git a/src/dec/buffer.c b/src/dec/buffer.c
new file mode 100644
index 00000000..4ea82baa
--- /dev/null
+++ b/src/dec/buffer.c
@@ -0,0 +1,201 @@
+// Copyright 2011 Google Inc.
+//
+// This code is licensed under the same terms as WebM:
+//  Software License Agreement:  http://www.webmproject.org/license/software/
+//  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
+// -----------------------------------------------------------------------------
+//
+// Everything about WebPDecBuffer
+//
+// Author: Skal (pascal.massimino@gmail.com)
+
+#include <stdlib.h>
+#include "vp8i.h"
+#include "webpi.h"
+
+#if defined(__cplusplus) || defined(c_plusplus)
+extern "C" {
+#endif
+
+//-----------------------------------------------------------------------------
+// WebPDecBuffer
+
+// Checks that the strides and plane sizes stored in 'buffer' are large enough
+// for its width x height and colorspace. Products are computed in 64 bits so
+// that huge dimensions cannot overflow 'int' and slip through the checks.
+// Returns VP8_STATUS_OK, or VP8_STATUS_INVALID_PARAM if any check fails.
+static VP8StatusCode CheckDecBuffer(const WebPDecBuffer* const buffer) {
+  int ok = 1;
+  const WEBP_CSP_MODE mode = buffer->colorspace;
+  const int64_t width = buffer->width;
+  const int64_t height = buffer->height;
+  if (mode >= MODE_YUV) {   // YUV checks
+    const WebPYUVABuffer* const buf = &buffer->u.YUVA;
+    const int64_t uv_width = (width + 1) / 2, uv_height = (height + 1) / 2;
+    // Each plane must hold 'stride' bytes for every one of its rows.
+    ok &= (buf->y_stride * height <= buf->y_size);
+    ok &= (buf->u_stride * uv_height <= buf->u_size);
+    ok &= (buf->v_stride * uv_height <= buf->v_size);
+    ok &= (buf->a_stride * height <= buf->a_size);
+    ok &= (buf->y_stride >= width);
+    ok &= (buf->u_stride >= uv_width);
+    ok &= (buf->v_stride >= uv_width);
+    if (buf->a) ok &= (buf->a_stride >= width);   // alpha plane is optional
+  } else {    // RGB checks
+    const WebPRGBABuffer* const buf = &buffer->u.RGBA;
+    ok &= (buf->stride * height <= buf->size);
+    if (mode == MODE_RGB || mode == MODE_BGR) {
+      ok &= (buf->stride >= width * 3);
+    } else if (mode == MODE_RGBA || mode == MODE_BGRA) {
+      ok &= (buf->stride >= width * 4);
+    }
+  }
+  return ok ? VP8_STATUS_OK : VP8_STATUS_INVALID_PARAM;
+}
+
+// Allocates the sample memory for 'buffer' unless it is external or already
+// allocated, and sets up plane pointers, strides and sizes for its colorspace.
+// Returns VP8_STATUS_INVALID_PARAM for non-positive dimensions or an
+// unreasonable total size, VP8_STATUS_OUT_OF_MEMORY if malloc() fails, and
+// otherwise the verdict of CheckDecBuffer() on the resulting buffer.
+static VP8StatusCode AllocateBuffer(WebPDecBuffer* const buffer) {
+  const int w = buffer->width;
+  const int h = buffer->height;
+
+  if (w <= 0 || h <= 0) return VP8_STATUS_INVALID_PARAM;
+
+  if (!buffer->is_external_memory && buffer->memory == NULL) {
+    uint8_t* output;
+    WEBP_CSP_MODE mode = buffer->colorspace;
+    int stride;
+    int uv_stride = 0, a_stride = 0;
+    uint64_t size, uv_size = 0, a_size = 0, total_size;   // 64-bit: no wrap
+    // We need memory and it hasn't been allocated yet.
+    // => initialize output buffer, now that dimensions are known.
+    stride = (mode == MODE_RGB || mode == MODE_BGR) ? 3 * w
+        : (mode == MODE_RGBA || mode == MODE_BGRA) ? 4 * w
+        : w;
+    size = (uint64_t)stride * h;
+
+    if (mode >= MODE_YUV) {
+      uv_stride = (w + 1) / 2;
+      uv_size = (uint64_t)uv_stride * ((h + 1) / 2);
+      if (mode == MODE_YUVA) {
+        a_stride = w;
+        a_size = (uint64_t)a_stride * h;
+      }
+    }
+    total_size = size + 2 * uv_size + a_size;
+
+    // Security/sanity checks
+    if (((size_t)total_size != total_size) || (total_size >= (1ULL << 40))) {
+      return VP8_STATUS_INVALID_PARAM;
+    }
+
+    buffer->memory = output = (uint8_t*)malloc((size_t)total_size);
+    if (output == NULL) return VP8_STATUS_OUT_OF_MEMORY;
+
+    if (mode >= MODE_YUV) {   // YUVA initialization
+      WebPYUVABuffer* const buf = &buffer->u.YUVA;
+      buf->y = output;
+      buf->y_stride = stride;
+      buf->y_size = size;
+      buf->u = output + size;
+      buf->u_stride = uv_stride;
+      buf->u_size = uv_size;
+      buf->v = output + size + uv_size;
+      buf->v_stride = uv_stride;
+      buf->v_size = uv_size;
+      if (mode == MODE_YUVA) {
+        buf->a = output + size + 2 * uv_size;
+      }
+      buf->a_size = a_size;
+      buf->a_stride = a_stride;
+    } else {  // RGBA initialization
+      WebPRGBABuffer* const buf = &buffer->u.RGBA;
+      buf->rgba = output;
+      buf->stride = stride;
+      buf->size = size;
+    }
+  }
+  return CheckDecBuffer(buffer);
+}
+
+// Derives the output size from the source size 'w' x 'h' and the optional
+// cropping then scaling in 'options', stores it in 'out' and allocates it.
+// Returns VP8_STATUS_INVALID_PARAM on NULL 'out' or invalid dimensions.
+VP8StatusCode WebPAllocateDecBuffer(int w, int h,
+                                    const WebPDecoderOptions* const options,
+                                    WebPDecBuffer* const out) {
+  if (out == NULL || w <= 0 || h <= 0) return VP8_STATUS_INVALID_PARAM;
+  if (options != NULL) {    // First, apply options if there is any.
+    if (options->use_cropping) {
+      const int cw = options->crop_width;
+      const int ch = options->crop_height;
+      const int x = options->crop_left & ~1;   // snapped to even coordinates
+      const int y = options->crop_top & ~1;
+      // Written as 'cw > w - x' because 'x + cw' could overflow 'int'.
+      if (x < 0 || y < 0 || cw <= 0 || ch <= 0 || cw > w - x || ch > h - y) {
+        return VP8_STATUS_INVALID_PARAM;   // out of frame boundary.
+      }
+      w = cw;
+      h = ch;
+    }
+    if (options->use_scaling) {
+      if (options->scaled_width <= 0 || options->scaled_height <= 0) {
+        return VP8_STATUS_INVALID_PARAM;
+      }
+      w = options->scaled_width;
+      h = options->scaled_height;
+    }
+  }
+  out->width = w;
+  out->height = h;
+  return AllocateBuffer(out);   // Then, allocate buffer for real
+}
+
+//-----------------------------------------------------------------------------
+// constructors / destructors
+
+// Zero-fills '*buffer'. Returns 0 on ABI version mismatch or NULL 'buffer'.
+int WebPInitDecBufferInternal(WebPDecBuffer* const buffer, int version) {
+  const int usable = (version == WEBP_DECODER_ABI_VERSION) && (buffer != NULL);
+  if (usable) memset(buffer, 0, sizeof(*buffer));
+  return usable;
+}
+
+// Frees the memory owned by 'buffer', if any, and clears its 'memory' pointer.
+// External memory is never freed. A NULL 'buffer' is ignored.
+void WebPFreeDecBuffer(WebPDecBuffer* const buffer) {
+  if (buffer == NULL) return;
+  if (!buffer->is_external_memory) free(buffer->memory);
+  buffer->memory = NULL;
+}
+
+// Makes '*dst' a non-owning copy of '*src' (no-op if either is NULL): if 'src'
+// holds allocated memory, 'dst' is flagged external and its 'memory' cleared.
+void WebPCopyDecBuffer(const WebPDecBuffer* const src,
+                       WebPDecBuffer* const dst) {
+  if (src == NULL || dst == NULL) return;
+  *dst = *src;
+  if (src->memory == NULL) return;   // nothing allocated, nothing to disown
+  dst->is_external_memory = 1;       // dst buffer doesn't own the memory.
+  dst->memory = NULL;
+}
+
+// Copies '*src' into '*dst' and hands the memory ownership over to 'dst':
+// 'src' is then marked external with a NULL 'memory' pointer. Beware of the
+// parameter order (source first)! Does nothing if either pointer is NULL.
+void WebPGrabDecBuffer(WebPDecBuffer* const src, WebPDecBuffer* const dst) {
+  if (src == NULL || dst == NULL) return;
+  *dst = *src;
+  if (src->memory == NULL) return;
+  src->is_external_memory = 1;   // src relinquishes ownership
+  src->memory = NULL;
+}
+
+//-----------------------------------------------------------------------------
+
+#if defined(__cplusplus) || defined(c_plusplus)
+}    // extern "C"
+#endif
diff --git a/src/dec/frame.c b/src/dec/frame.c
index 46d735f8..29a0f757 100644
--- a/src/dec/frame.c
+++ b/src/dec/frame.c
@@ -18,7 +18,7 @@ extern "C" {
 
 #define ALIGN_MASK (32 - 1)
 
-//-----------------------------------------------------------------------------
+//------------------------------------------------------------------------------
 // Memory setup
 
 // kFilterExtraRows[] = How many extra lines are needed on the MB boundary
@@ -101,15 +101,13 @@ int VP8InitFrame(VP8Decoder* const dec, VP8Io* io) {
   memset(dec->intra_t_, B_DC_PRED, intra_pred_mode_size);
 
   // prepare 'io'
-  io->width = dec->pic_hdr_.width_;
-  io->height = dec->pic_hdr_.height_;
   io->mb_y = 0;
   io->y = dec->cache_y_;
   io->u = dec->cache_u_;
   io->v = dec->cache_v_;
   io->y_stride = dec->cache_y_stride_;
   io->uv_stride = dec->cache_uv_stride_;
-  io->fancy_upscaling = 0;    // default
+  io->fancy_upsampling = 0;    // default
   io->a = NULL;
 
   // Init critical function pointers and look-up tables.
@@ -119,7 +117,7 @@ int VP8InitFrame(VP8Decoder* const dec, VP8Io* io) {
   return 1;
 }
 
-//-----------------------------------------------------------------------------
+//------------------------------------------------------------------------------
 // Filtering
 
 static inline int hev_thresh_from_level(int level, int keyframe) {
@@ -130,7 +128,7 @@ static inline int hev_thresh_from_level(int level, int keyframe) {
   }
 }
 
-static void DoFilter(VP8Decoder* const dec, int mb_x, int mb_y) {
+static void DoFilter(const VP8Decoder* const dec, int mb_x, int mb_y) {
   VP8MB* const mb = dec->mb_info_ + mb_x;
   uint8_t* const y_dst = dec->cache_y_ + mb_x * 16;
   const int y_bps = dec->cache_y_stride_;
@@ -178,6 +176,19 @@ static void DoFilter(VP8Decoder* const dec, int mb_x, int mb_y) {
   }
 }
 
+// Applies DoFilter() to every macroblock of the current row (dec->mb_y_) lying
+// in [tl_mb_x_, br_mb_x_); rows outside [tl_mb_y_, br_mb_y_] are left alone.
+void VP8FilterRow(const VP8Decoder* const dec) {
+  const int row = dec->mb_y_;
+  const int in_range = (row >= dec->tl_mb_y_) && (row <= dec->br_mb_y_);
+  int col;
+  assert(dec->filter_type_ > 0);
+  if (!in_range) return;
+  for (col = dec->tl_mb_x_; col < dec->br_mb_x_; ++col) DoFilter(dec, col, row);
+}
+
+//------------------------------------------------------------------------------
+
 void VP8StoreBlock(VP8Decoder* const dec) {
   if (dec->filter_type_ > 0) {
     VP8MB* const info = dec->mb_info_ + dec->mb_x_;
@@ -225,24 +236,31 @@ void VP8StoreBlock(VP8Decoder* const dec) {
   }
 }
 
+//------------------------------------------------------------------------------
+// This function is called after a row of macroblocks is finished decoding.
+// It also takes into account the following restrictions:
+//  * In case of in-loop filtering, we must hold off sending some of the bottom
+//    pixels as they are yet unfiltered. They will be when the next macroblock
+//    row is decoded. Meanwhile, we must preserve them by rotating them in the
+//    cache area. This doesn't hold for the very bottom row of the uncropped
+//    picture of course.
+//  * we must clip the remaining pixels against the cropping area. The VP8Io
+//    fields y/u/v/a, mb_y, mb_w and mb_h are set accordingly before put().
+
+#define MACROBLOCK_VPOS(mb_y)  ((mb_y) * 16)    // vertical position of a MB
+
 int VP8FinishRow(VP8Decoder* const dec, VP8Io* io) {
   const int extra_y_rows = kFilterExtraRows[dec->filter_type_];
   const int ysize = extra_y_rows * dec->cache_y_stride_;
   const int uvsize = (extra_y_rows / 2) * dec->cache_uv_stride_;
-  const int first_row = (dec->mb_y_ == 0);
-  const int last_row = (dec->mb_y_ >= dec->mb_h_ - 1);
   uint8_t* const ydst = dec->cache_y_ - ysize;
   uint8_t* const udst = dec->cache_u_ - uvsize;
   uint8_t* const vdst = dec->cache_v_ - uvsize;
-  if (dec->filter_type_ > 0) {
-    int mb_x;
-    for (mb_x = 0; mb_x < dec->mb_w_; ++mb_x) {
-      DoFilter(dec, mb_x, dec->mb_y_);
-    }
-  }
+  const int first_row = (dec->mb_y_ == 0);
+  const int last_row = (dec->mb_y_ >= dec->br_mb_y_ - 1);
+  int y_start = MACROBLOCK_VPOS(dec->mb_y_);
+  int y_end = MACROBLOCK_VPOS(dec->mb_y_ + 1);
   if (io->put) {
-    int y_start = dec->mb_y_ * 16;
-    int y_end = y_start + 16;
     if (!first_row) {
       y_start -= extra_y_rows;
       io->y = ydst;
@@ -253,14 +271,13 @@ int VP8FinishRow(VP8Decoder* const dec, VP8Io* io) {
       io->u = dec->cache_u_;
       io->v = dec->cache_v_;
     }
+
     if (!last_row) {
       y_end -= extra_y_rows;
     }
-    if (y_end > io->height) {
-      y_end = io->height;
+    if (y_end > io->crop_bottom) {
+      y_end = io->crop_bottom;    // make sure we don't overflow on last row.
     }
-    io->mb_y = y_start;
-    io->mb_h = y_end - y_start;
     io->a = NULL;
 #ifdef WEBP_EXPERIMENTAL_FEATURES
     if (dec->alpha_data_) {
@@ -271,11 +288,33 @@ int VP8FinishRow(VP8Decoder* const dec, VP8Io* io) {
       }
     }
 #endif
-    if (!io->put(io)) {
-      return 0;
+    if (y_start < io->crop_top) {
+      const int delta_y = io->crop_top - y_start;
+      y_start = io->crop_top;
+      assert(!(delta_y & 1));
+      io->y += dec->cache_y_stride_ * delta_y;
+      io->u += dec->cache_uv_stride_ * (delta_y >> 1);
+      io->v += dec->cache_uv_stride_ * (delta_y >> 1);
+      if (io->a) {
+        io->a += io->width * delta_y;
+      }
+    }
+    if (y_start < y_end) {
+      io->y += io->crop_left;
+      io->u += io->crop_left >> 1;
+      io->v += io->crop_left >> 1;
+      if (io->a) {
+        io->a += io->crop_left;
+      }
+      io->mb_y = y_start - io->crop_top;
+      io->mb_w = io->crop_right - io->crop_left;
+      io->mb_h = y_end - y_start;
+      if (!io->put(io)) {
+        return 0;
+      }
     }
   }
-    // rotate top samples
+  // rotate top samples
   if (!last_row) {
     memcpy(ydst, ydst + 16 * dec->cache_y_stride_, ysize);
     memcpy(udst, udst + 8 * dec->cache_uv_stride_, uvsize);
@@ -284,7 +323,60 @@ int VP8FinishRow(VP8Decoder* const dec, VP8Io* io) {
   return 1;
 }
 
-//-----------------------------------------------------------------------------
+#undef MACROBLOCK_VPOS
+
+//------------------------------------------------------------------------------
+// Finish setting up the decoding parameter once user's setup() is called.
+
+VP8StatusCode VP8FinishFrameSetup(VP8Decoder* const dec, VP8Io* const io) {
+  // Call setup() first. This may trigger additional decoding features on 'io'.
+  if (io->setup && !io->setup(io)) {
+    VP8SetError(dec, VP8_STATUS_USER_ABORT, "Frame setup failed");
+    return dec->status_;   // presumably the status recorded by VP8SetError()
+  }
+
+  // Disable filtering per user request
+  if (io->bypass_filtering) {
+    dec->filter_type_ = 0;
+  }
+  // TODO(skal): filter type / strength / sharpness forcing
+
+  // Define the area where we can skip in-loop filtering, in case of cropping.
+  //
+  // 'Simple' filter reads two luma samples outside of the macroblock
+  // and filters one. It doesn't filter the chroma samples. Hence, we can
+  // avoid doing the in-loop filtering before crop_top/crop_left position.
+  // For the 'Complex' filter, 3 samples are read and up to 3 are filtered.
+  // Means: there's a dependency chain that goes all the way up to the
+  // top-left corner of the picture (MB #0). We must filter all the previous
+  // macroblocks.
+  // TODO(skal): add an 'approximate_decoding' option, that won't produce
+  // a 1:1 bit-exactness for complex filtering?
+  {
+    const int extra_pixels = kFilterExtraRows[dec->filter_type_];
+    if (dec->filter_type_ == 2) {
+      // For complex filter, we need to preserve the dependency chain.
+      dec->tl_mb_x_ = 0;
+      dec->tl_mb_y_ = 0;
+    } else {
+      // Simple filter (or no filtering): start at the cropped region.
+      dec->tl_mb_y_ = io->crop_top >> 4;    // pixel row -> macroblock row
+      dec->tl_mb_x_ = io->crop_left >> 4;   // pixel column -> macroblock column
+    }
+    // We need some 'extra' pixels on the right/bottom (rounded up to a MB).
+    dec->br_mb_y_ = (io->crop_bottom + 15 + extra_pixels) >> 4;
+    dec->br_mb_x_ = (io->crop_right + 15 + extra_pixels) >> 4;
+    if (dec->br_mb_x_ > dec->mb_w_) {   // clamp to the picture width (in MBs)
+      dec->br_mb_x_ = dec->mb_w_;
+    }
+    if (dec->br_mb_y_ > dec->mb_h_) {   // clamp to the picture height (in MBs)
+      dec->br_mb_y_ = dec->mb_h_;
+    }
+  }
+  return VP8_STATUS_OK;
+}
+
+//------------------------------------------------------------------------------
 // Main reconstruction function.
 
 static const int kScan[16] = {
@@ -431,7 +523,7 @@ void VP8ReconstructBlock(VP8Decoder* const dec) {
   }
 }
 
-//-----------------------------------------------------------------------------
+//------------------------------------------------------------------------------
 
 #if defined(__cplusplus) || defined(c_plusplus)
 }    // extern "C"
diff --git a/src/dec/idec.c b/src/dec/idec.c
index b33007bc..a2022219 100644
--- a/src/dec/idec.c
+++ b/src/dec/idec.c
@@ -15,7 +15,6 @@
 
 #include "webpi.h"
 #include "vp8i.h"
-#include "yuv.h"
 
 #if defined(__cplusplus) || defined(c_plusplus)
 extern "C" {
@@ -56,12 +55,12 @@ typedef struct {
 
 struct WebPIDecoder {
   DecState state_;         // current decoding state
-  int w_, h_;              // width and height
   WebPDecParams params_;   // Params to store output info
   VP8Decoder* dec_;
   VP8Io io_;
 
-  MemBuffer mem_;          // memory buffer
+  MemBuffer mem_;          // input memory buffer.
+  WebPDecBuffer output_;   // output buffer (when no external one is supplied)
 };
 
 // MB context to restore in case VP8DecodeMB() fails
@@ -236,24 +235,23 @@ static VP8StatusCode IDecError(WebPIDecoder* idec, VP8StatusCode error) {
 
 // Header
 static VP8StatusCode DecodeHeader(WebPIDecoder* const idec) {
-  int width, height;
-  uint32_t curr_size, riff_header_size, bits;
-  WebPDecParams* params = &idec->params_;
+  uint32_t riff_header_size, bits;
   const uint8_t* data = idec->mem_.buf_ + idec->mem_.start_;
+  uint32_t curr_size = MemDataSize(&idec->mem_);
+  uint32_t chunk_size;
 
-  if (MemDataSize(&idec->mem_) < WEBP_HEADER_SIZE) {
+  if (curr_size < WEBP_HEADER_SIZE) {
     return VP8_STATUS_SUSPENDED;
   }
 
-  if (!WebPInitDecParams(data, idec->mem_.end_, &width, &height, params)) {
+  // Validate and Skip over RIFF header
+  chunk_size = WebPCheckRIFFHeader(&data, &curr_size);
+  if (chunk_size == 0 ||
+      curr_size < VP8_HEADER_SIZE ||
+      !VP8GetInfo(data, curr_size, chunk_size, NULL, NULL, NULL)) {
     return IDecError(idec, VP8_STATUS_BITSTREAM_ERROR);
   }
 
-  // Validate and Skip over RIFF header
-  curr_size = MemDataSize(&idec->mem_);
-  if (!WebPCheckRIFFHeader(&data, &curr_size)) {
-    return IDecError(idec, VP8_STATUS_BITSTREAM_ERROR);
-  }
   riff_header_size = idec->mem_.end_ - curr_size;
   bits = data[0] | (data[1] << 8) | (data[2] << 16);
 
@@ -261,8 +259,6 @@ static VP8StatusCode DecodeHeader(WebPIDecoder* const idec) {
   idec->mem_.start_ += riff_header_size;
   assert(idec->mem_.start_ <= idec->mem_.end_);
 
-  idec->w_ = width;
-  idec->h_ = height;
   idec->io_.data_size -= riff_header_size;
   idec->io_.data = data;
   idec->state_ = STATE_PARTS0;
@@ -298,14 +294,13 @@ static VP8StatusCode DecodePartition0(WebPIDecoder* const idec) {
   VP8Decoder* const dec = idec->dec_;
   VP8Io* const io = &idec->io_;
   const WebPDecParams* const params = &idec->params_;
-  const WEBP_CSP_MODE mode = params->mode;
+  WebPDecBuffer* const output = params->output;
 
   // Wait till we have enough data for the whole partition #0
   if (MemDataSize(&idec->mem_) < idec->mem_.part0_size_) {
     return VP8_STATUS_SUSPENDED;
   }
 
-  io->opaque = &idec->params_;
   if (!VP8GetHeaders(dec, io)) {
     const VP8StatusCode status = dec->status_;
     if (status == VP8_STATUS_SUSPENDED ||
@@ -316,29 +311,26 @@ static VP8StatusCode DecodePartition0(WebPIDecoder* const idec) {
     return IDecError(idec, status);
   }
 
-  if (!WebPCheckDecParams(io, params)) {
-    return IDecError(idec, VP8_STATUS_INVALID_PARAM);
+  // Allocate/Verify output buffer now
+  dec->status_ = WebPAllocateDecBuffer(io->width, io->height, params->options,
+                                       output);
+  if (dec->status_ != VP8_STATUS_OK) {
+    return IDecError(idec, dec->status_);
   }
 
-  if (mode != MODE_YUV) {
-    VP8YUVInit();
-  }
-
-  // allocate memory and prepare everything.
+  // Allocate memory and prepare everything.
   if (!VP8InitFrame(dec, io)) {
-    return IDecError(idec, VP8_STATUS_OUT_OF_MEMORY);
-  }
-  if (io->setup && !io->setup(io)) {
-    return IDecError(idec, VP8_STATUS_USER_ABORT);
+    return IDecError(idec, dec->status_);
   }
 
-  // disable filtering per user request (_after_ setup() is called)
-  if (io->bypass_filtering) dec->filter_type_ = 0;
+  // Finish setting up the decoding parameter
+  if (VP8FinishFrameSetup(dec, io) != VP8_STATUS_OK) {
+    return IDecError(idec, dec->status_);
+  }
 
   if (!CopyParts0Data(idec)) {
     return IDecError(idec, VP8_STATUS_OUT_OF_MEMORY);
   }
-
   idec->state_ = STATE_DATA;
   return VP8_STATUS_OK;
 }
@@ -383,6 +375,9 @@ static VP8StatusCode DecodeRemaining(WebPIDecoder* const idec) {
         assert(idec->mem_.start_ <= idec->mem_.end_);
       }
     }
+    if (dec->filter_type_ > 0) {
+      VP8FilterRow(dec);
+    }
     if (!VP8FinishRow(dec, io)) {
       return IDecError(idec, VP8_STATUS_USER_ABORT);
     }
@@ -410,7 +405,7 @@ static VP8StatusCode IDecode(WebPIDecoder* idec) {
     status = DecodePartition0(idec);
   }
   if (idec->state_ == STATE_DATA) {
-    return DecodeRemaining(idec);
+    status = DecodeRemaining(idec);
   }
   return status;
 }
@@ -418,9 +413,11 @@ static VP8StatusCode IDecode(WebPIDecoder* idec) {
 //------------------------------------------------------------------------------
 // Public functions
 
-WebPIDecoder* WebPINew(WEBP_CSP_MODE mode) {
+WebPIDecoder* WebPINewDecoder(WebPDecBuffer* const output_buffer) {
   WebPIDecoder* idec = (WebPIDecoder*)calloc(1, sizeof(WebPIDecoder));
-  if (!idec) return NULL;
+  if (idec == NULL) {
+    return NULL;
+  }
 
   idec->dec_ = VP8New();
   if (idec->dec_ == NULL) {
@@ -430,53 +427,87 @@ WebPIDecoder* WebPINew(WEBP_CSP_MODE mode) {
 
   idec->state_ = STATE_HEADER;
 
-  WebPResetDecParams(&idec->params_);
-  idec->params_.mode = mode;
-
   InitMemBuffer(&idec->mem_);
+  WebPInitDecBuffer(&idec->output_);
   VP8InitIo(&idec->io_);
-  WebPInitCustomIo(&idec->io_);
+
+  WebPResetDecParams(&idec->params_);
+  idec->params_.output = output_buffer ? output_buffer : &idec->output_;
+  WebPInitCustomIo(&idec->params_, &idec->io_);  // Plug the I/O functions.
+
+  return idec;
+}
+
+WebPIDecoder* WebPIDecode(const uint8_t* data, uint32_t data_size,
+                          WebPDecoderConfig* const config) {
+  WebPIDecoder* idec;
+
+  // Parse the bitstream's features, if requested (needs both data and config):
+  if (data != NULL && data_size > 0 && config != NULL) {
+    if (WebPGetFeatures(data, data_size, &config->input) != VP8_STATUS_OK) {
+      return NULL;   // WebPGetFeatures() reported an error
+    }
+  }
+  // Create the incremental decoder, outputting to config->output if supplied
+  idec = WebPINewDecoder(config ? &config->output : NULL);
+  if (!idec) {
+    return NULL;   // decoder creation failed
+  }
+  // Finish initialization: the decoder keeps a pointer into 'config'
+  if (config != NULL) {
+    idec->params_.options = &config->options;
+  }
+  return idec;
+}
 
 void WebPIDelete(WebPIDecoder* const idec) {
   if (!idec) return;
   VP8Delete(idec->dec_);
-  WebPClearDecParams(&idec->params_);
   ClearMemBuffer(&idec->mem_);
+  WebPFreeDecBuffer(&idec->output_);
   free(idec);
 }
 
 //------------------------------------------------------------------------------
+// Wrapper toward WebPINewDecoder
+
+WebPIDecoder* WebPINew(WEBP_CSP_MODE mode) {
+  WebPIDecoder* const idec = WebPINewDecoder(NULL);
+  if (!idec) return NULL;
+  idec->output_.colorspace = mode;
+  return idec;
+}
 
 WebPIDecoder* WebPINewRGB(WEBP_CSP_MODE mode, uint8_t* output_buffer,
                           int output_buffer_size, int output_stride) {
   WebPIDecoder* idec;
-  if (mode == MODE_YUV) return NULL;
-  idec = WebPINew(mode);
-  if (idec == NULL) return NULL;
-  idec->params_.output = output_buffer;
-  idec->params_.stride = output_stride;
-  idec->params_.output_size = output_buffer_size;
-  idec->params_.external_buffer = 1;
+  if (mode >= MODE_YUV) return NULL;
+  idec = WebPINewDecoder(NULL);
+  if (!idec) return NULL;
+  idec->output_.colorspace = mode;
+  idec->output_.is_external_memory = 1;
+  idec->output_.u.RGBA.rgba = output_buffer;
+  idec->output_.u.RGBA.stride = output_stride;
+  idec->output_.u.RGBA.size = output_buffer_size;
   return idec;
 }
 
 WebPIDecoder* WebPINewYUV(uint8_t* luma, int luma_size, int luma_stride,
                           uint8_t* u, int u_size, int u_stride,
                           uint8_t* v, int v_size, int v_stride) {
-  WebPIDecoder* idec = WebPINew(MODE_YUV);
-  if (idec == NULL) return NULL;
-  idec->params_.output = luma;
-  idec->params_.stride = luma_stride;
-  idec->params_.output_size = luma_size;
-  idec->params_.u = u;
-  idec->params_.u_stride = u_stride;
-  idec->params_.output_u_size = u_size;
-  idec->params_.v = v;
-  idec->params_.v_stride = v_stride;
-  idec->params_.output_v_size = v_size;
-  idec->params_.external_buffer = 1;
+  WebPIDecoder* const idec = WebPINewDecoder(NULL);
+  if (!idec) return NULL;
+  idec->output_.colorspace = MODE_YUV;
+  idec->output_.is_external_memory = 1;
+  idec->output_.u.YUVA.y = luma;
+  idec->output_.u.YUVA.y_stride = luma_stride;
+  idec->output_.u.YUVA.y_size = luma_size;
+  idec->output_.u.YUVA.u = u;
+  idec->output_.u.YUVA.u_stride = u_stride;
+  idec->output_.u.YUVA.u_size = u_size;
+  idec->output_.u.YUVA.v = v;
+  idec->output_.u.YUVA.v_stride = v_stride;
+  idec->output_.u.YUVA.v_size = v_size;
   return idec;
 }
 
@@ -540,38 +571,54 @@ VP8StatusCode WebPIUpdate(WebPIDecoder* const idec, const uint8_t* data,
 
 //------------------------------------------------------------------------------
 
-uint8_t* WebPIDecGetRGB(const WebPIDecoder* const idec, int *last_y, int* width,
-                        int* height, int* stride) {
-  if (!idec || !idec->dec_ || idec->params_.mode == MODE_YUV ||
-      idec->state_ <= STATE_PARTS0) {
+static const WebPDecBuffer* GetOutputBuffer(const WebPIDecoder* const idec) {
+  if (!idec || !idec->dec_ || idec->state_ <= STATE_PARTS0) {
     return NULL;
   }
-
-  if (last_y) *last_y = idec->params_.last_y;
-  if (width) *width = idec->w_;
-  if (height) *height = idec->h_;
-  if (stride) *stride = idec->params_.stride;
-
   return idec->params_.output;
 }
 
-uint8_t* WebPIDecGetYUV(const WebPIDecoder* const idec, int *last_y,
-                        uint8_t** u, uint8_t** v, int* width, int* height,
-                        int *stride, int* uv_stride) {
-  if (!idec || !idec->dec_ || idec->params_.mode != MODE_YUV ||
-      idec->state_ <= STATE_PARTS0) {
+const WebPDecBuffer* WebPIDecGetSamples(const WebPIDecoder* const idec,
+                                        int* last_y) {
+  const WebPDecBuffer* const src = GetOutputBuffer(idec);  // NULL if no output
+  if (idec && last_y) *last_y = idec->params_.last_y;  // 'idec' may be NULL
+  return src;
+}
+
+uint8_t* WebPIDecGetRGB(const WebPIDecoder* const idec, int* last_y,
+                        int* width, int* height, int* stride) {
+  const WebPDecBuffer* const src = GetOutputBuffer(idec);
+  if (!src) return NULL;
+  if (src->colorspace >= MODE_YUV) {
     return NULL;
   }
 
   if (last_y) *last_y = idec->params_.last_y;
-  if (u) *u = idec->params_.u;
-  if (v) *v = idec->params_.v;
-  if (width) *width = idec->w_;
-  if (height) *height = idec->h_;
-  if (stride) *stride = idec->params_.stride;
-  if (uv_stride) *uv_stride = idec->params_.u_stride;
+  if (width) *width = src->width;
+  if (height) *height = src->height;
+  if (stride) *stride = src->u.RGBA.stride;
 
-  return idec->params_.output;
+  return src->u.RGBA.rgba;
+}
+
+uint8_t* WebPIDecGetYUV(const WebPIDecoder* const idec, int* last_y,
+                        uint8_t** u, uint8_t** v,
+                        int* width, int* height, int *stride, int* uv_stride) {
+  const WebPDecBuffer* const src = GetOutputBuffer(idec);
+  if (!src) return NULL;
+  if (src->colorspace < MODE_YUV) {
+    return NULL;
+  }
+
+  if (last_y) *last_y = idec->params_.last_y;
+  if (u) *u = src->u.YUVA.u;
+  if (v) *v = src->u.YUVA.v;
+  if (width) *width = src->width;
+  if (height) *height = src->height;
+  if (stride) *stride = src->u.YUVA.y_stride;
+  if (uv_stride) *uv_stride = src->u.YUVA.u_stride;
+
+  return src->u.YUVA.y;
 }
 
 #if defined(__cplusplus) || defined(c_plusplus)
diff --git a/src/dec/io.c b/src/dec/io.c
new file mode 100644
index 00000000..80233ae6
--- /dev/null
+++ b/src/dec/io.c
@@ -0,0 +1,845 @@
+// Copyright 2011 Google Inc.
+//
+// This code is licensed under the same terms as WebM:
+//  Software License Agreement:  http://www.webmproject.org/license/software/
+//  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
+// -----------------------------------------------------------------------------
+//
+// functions for sample output.
+//
+// Author: Skal (pascal.massimino@gmail.com)
+
+#include <assert.h>
+#include <stdlib.h>
+#include "vp8i.h"
+#include "webpi.h"
+#include "yuv.h"
+
+#if defined(__cplusplus) || defined(c_plusplus)
+extern "C" {
+#endif
+
+#define FANCY_UPSAMPLING   // undefined to remove fancy upsampling support
+
+// mask to apply to WEBP_CSP_MODE, to know if there's alpha channel or not.
+#define MODE_ALPHA_MASK 1
+
+//------------------------------------------------------------------------------
+// Fancy upsampler
+
+#ifdef FANCY_UPSAMPLING
+
+// Given samples laid out in a square as:
+//  [a b]
+//  [c d]
+// we interpolate u/v as:
+//  ([9*a + 3*b + 3*c +   d    3*a + 9*b + 3*c +   d] + [8 8]) / 16
+//  ([3*a +   b + 9*c + 3*d      a + 3*b + 3*c + 9*d] + [8 8]) / 16
+
+// We process u and v together stashed into 32bit (16bit each).
+#define LOAD_UV(u,v) ((u) | ((v) << 16))
+
+#define UPSAMPLE_FUNC(FUNC_NAME, FUNC, XSTEP)                                  \
+static void FUNC_NAME(const uint8_t* top_y, const uint8_t* bottom_y,           \
+                      const uint8_t* top_u, const uint8_t* top_v,              \
+                      const uint8_t* cur_u, const uint8_t* cur_v,              \
+                      uint8_t* top_dst, uint8_t* bottom_dst, int len) {        \
+  int x;                                                                       \
+  const int last_pixel_pair = (len - 1) >> 1;                                  \
+  uint32_t tl_uv = LOAD_UV(top_u[0], top_v[0]);   /* top-left sample */        \
+  uint32_t l_uv  = LOAD_UV(cur_u[0], cur_v[0]);   /* left-sample */            \
+  if (top_y) {                                                                 \
+    const uint32_t uv0 = (3 * tl_uv + l_uv + 0x00020002u) >> 2;                \
+    FUNC(top_y[0], uv0 & 0xff, (uv0 >> 16), top_dst);                          \
+  }                                                                            \
+  if (bottom_y) {                                                              \
+    const uint32_t uv0 = (3 * l_uv + tl_uv + 0x00020002u) >> 2;                \
+    FUNC(bottom_y[0], uv0 & 0xff, (uv0 >> 16), bottom_dst);                    \
+  }                                                                            \
+  for (x = 1; x <= last_pixel_pair; ++x) {                                     \
+    const uint32_t t_uv = LOAD_UV(top_u[x], top_v[x]);  /* top sample */       \
+    const uint32_t uv   = LOAD_UV(cur_u[x], cur_v[x]);  /* sample */           \
+    /* precompute invariant values associated with first and second diagonals*/\
+    const uint32_t avg = tl_uv + t_uv + l_uv + uv + 0x00080008u;               \
+    const uint32_t diag_12 = (avg + 2 * (t_uv + l_uv)) >> 3;                   \
+    const uint32_t diag_03 = (avg + 2 * (tl_uv + uv)) >> 3;                    \
+    if (top_y) {                                                               \
+      const uint32_t uv0 = (diag_12 + tl_uv) >> 1;                             \
+      const uint32_t uv1 = (diag_03 + t_uv) >> 1;                              \
+      FUNC(top_y[2 * x - 1], uv0 & 0xff, (uv0 >> 16),                          \
+           top_dst + (2 * x - 1) * XSTEP);                                     \
+      FUNC(top_y[2 * x - 0], uv1 & 0xff, (uv1 >> 16),                          \
+           top_dst + (2 * x - 0) * XSTEP);                                     \
+    }                                                                          \
+    if (bottom_y) {                                                            \
+      const uint32_t uv0 = (diag_03 + l_uv) >> 1;                              \
+      const uint32_t uv1 = (diag_12 + uv) >> 1;                                \
+      FUNC(bottom_y[2 * x - 1], uv0 & 0xff, (uv0 >> 16),                       \
+           bottom_dst + (2 * x - 1) * XSTEP);                                  \
+      FUNC(bottom_y[2 * x + 0], uv1 & 0xff, (uv1 >> 16),                       \
+           bottom_dst + (2 * x + 0) * XSTEP);                                  \
+    }                                                                          \
+    tl_uv = t_uv;                                                              \
+    l_uv = uv;                                                                 \
+  }                                                                            \
+  if (!(len & 1)) {                                                            \
+    if (top_y) {                                                               \
+      const uint32_t uv0 = (3 * tl_uv + l_uv + 0x00020002u) >> 2;              \
+      FUNC(top_y[len - 1], uv0 & 0xff, (uv0 >> 16),                            \
+           top_dst + (len - 1) * XSTEP);                                       \
+    }                                                                          \
+    if (bottom_y) {                                                            \
+      const uint32_t uv0 = (3 * l_uv + tl_uv + 0x00020002u) >> 2;              \
+      FUNC(bottom_y[len - 1], uv0 & 0xff, (uv0 >> 16),                         \
+           bottom_dst + (len - 1) * XSTEP);                                    \
+    }                                                                          \
+  }                                                                            \
+}
+
+// All variants implemented.
+UPSAMPLE_FUNC(UpsampleRgbLinePair,  VP8YuvToRgb,  3)
+UPSAMPLE_FUNC(UpsampleBgrLinePair,  VP8YuvToBgr,  3)
+UPSAMPLE_FUNC(UpsampleRgbaLinePair, VP8YuvToRgba, 4)
+UPSAMPLE_FUNC(UpsampleBgraLinePair, VP8YuvToBgra, 4)
+// These two don't erase the alpha value
+UPSAMPLE_FUNC(UpsampleRgbKeepAlphaLinePair, VP8YuvToRgb, 4)
+UPSAMPLE_FUNC(UpsampleBgrKeepAlphaLinePair, VP8YuvToBgr, 4)
+
+typedef void (*UpsampleLinePairFunc)(
+  const uint8_t* top_y, const uint8_t* bottom_y,
+  const uint8_t* top_u, const uint8_t* top_v,
+  const uint8_t* cur_u, const uint8_t* cur_v,
+  uint8_t* top_dst, uint8_t* bottom_dst, int len);
+
+static const UpsampleLinePairFunc
+  kUpsamplers[MODE_BGRA + 1] = {
+    UpsampleRgbLinePair,    // MODE_RGB
+    UpsampleRgbaLinePair,   // MODE_RGBA
+    UpsampleBgrLinePair,    // MODE_BGR
+    UpsampleBgraLinePair    // MODE_BGRA
+  },
+  kUpsamplersKeepAlpha[MODE_BGRA + 1] = {
+    UpsampleRgbLinePair,            // MODE_RGB
+    UpsampleRgbKeepAlphaLinePair,   // MODE_RGBA
+    UpsampleBgrLinePair,            // MODE_BGR
+    UpsampleBgrKeepAlphaLinePair    // MODE_BGRA
+  };
+
+#undef LOAD_UV
+#undef UPSAMPLE_FUNC
+
+#endif  // FANCY_UPSAMPLING
+
+//------------------------------------------------------------------------------
+// simple point-sampling
+
+#define SAMPLE_FUNC(FUNC_NAME, FUNC, XSTEP)                                    \
+static void FUNC_NAME(const uint8_t* top_y, const uint8_t* bottom_y,           \
+                      const uint8_t* u, const uint8_t* v,                      \
+                      uint8_t* top_dst, uint8_t* bottom_dst, int len) {        \
+  int i;                                                                       \
+  for (i = 0; i < len - 1; i += 2) {                                           \
+    FUNC(top_y[0], u[0], v[0], top_dst);                                       \
+    FUNC(top_y[1], u[0], v[0], top_dst + XSTEP);                               \
+    FUNC(bottom_y[0], u[0], v[0], bottom_dst);                                 \
+    FUNC(bottom_y[1], u[0], v[0], bottom_dst + XSTEP);                         \
+    top_y += 2;                                                                \
+    bottom_y += 2;                                                             \
+    u++;                                                                       \
+    v++;                                                                       \
+    top_dst += 2 * XSTEP;                                                      \
+    bottom_dst += 2 * XSTEP;                                                   \
+  }                                                                            \
+  if (i == len - 1) {    /* last one */                                        \
+    FUNC(top_y[0], u[0], v[0], top_dst);                                       \
+    FUNC(bottom_y[0], u[0], v[0], bottom_dst);                                 \
+  }                                                                            \
+}
+
+// All variants implemented.
+SAMPLE_FUNC(SampleRgbLinePair,  VP8YuvToRgb,  3)
+SAMPLE_FUNC(SampleBgrLinePair,  VP8YuvToBgr,  3)
+SAMPLE_FUNC(SampleRgbaLinePair, VP8YuvToRgba, 4)
+SAMPLE_FUNC(SampleBgraLinePair, VP8YuvToBgra, 4)
+
+#undef SAMPLE_FUNC
+
+// Main methods.
+typedef void (*SampleLinePairFunc)(
+  const uint8_t* top_y, const uint8_t* bottom_y,
+  const uint8_t* u, const uint8_t* v,
+  uint8_t* top_dst, uint8_t* bottom_dst, int len);
+
+static const SampleLinePairFunc kSamplers[MODE_BGRA + 1] = {
+  SampleRgbLinePair,    // MODE_RGB
+  SampleRgbaLinePair,   // MODE_RGBA
+  SampleBgrLinePair,    // MODE_BGR
+  SampleBgraLinePair    // MODE_BGRA
+};
+
+//------------------------------------------------------------------------------
+// YUV444 converter
+
+#define YUV444_FUNC(FUNC_NAME, FUNC, XSTEP)                                    \
+static void FUNC_NAME(const uint8_t* y, const uint8_t* u, const uint8_t* v,    \
+                      uint8_t* dst, int len) {                                 \
+  int i;                                                                       \
+  for (i = 0; i < len; ++i) FUNC(y[i], u[i], v[i], &dst[i * XSTEP]);           \
+}
+
+YUV444_FUNC(Yuv444ToRgb,  VP8YuvToRgb,  3)
+YUV444_FUNC(Yuv444ToBgr,  VP8YuvToBgr,  3)
+YUV444_FUNC(Yuv444ToRgba, VP8YuvToRgba, 4)
+YUV444_FUNC(Yuv444ToBgra, VP8YuvToBgra, 4)
+
+#undef YUV444_FUNC
+
+typedef void (*YUV444Func)(const uint8_t* y, const uint8_t* u, const uint8_t* v,
+                           uint8_t* dst, int len);
+
+static const YUV444Func kYUV444Converters[MODE_BGRA + 1] = {
+  Yuv444ToRgb,    // MODE_RGB
+  Yuv444ToRgba,   // MODE_RGBA
+  Yuv444ToBgr,    // MODE_BGR
+  Yuv444ToBgra    // MODE_BGRA
+};
+
+//------------------------------------------------------------------------------
+// Main YUV<->RGB conversion functions
+
+static int EmitYUV(const VP8Io* const io, WebPDecParams* const p) {
+  WebPDecBuffer* output = p->output;
+  const WebPYUVABuffer* const buf = &output->u.YUVA;
+  uint8_t* const y_dst = buf->y + io->mb_y * buf->y_stride;
+  uint8_t* const u_dst = buf->u + (io->mb_y >> 1) * buf->u_stride;  // half row
+  uint8_t* const v_dst = buf->v + (io->mb_y >> 1) * buf->v_stride;  // half row
+  const int mb_w = io->mb_w;
+  const int mb_h = io->mb_h;
+  const int uv_w = (mb_w + 1) / 2;   // chroma width, rounded up
+  int j;
+  for (j = 0; j < mb_h; ++j) {   // copy the luma rows
+    memcpy(y_dst + j * buf->y_stride, io->y + j * io->y_stride, mb_w);
+  }
+  for (j = 0; j < (mb_h + 1) / 2; ++j) {   // copy the chroma rows (rounded up)
+    memcpy(u_dst + j * buf->u_stride, io->u + j * io->uv_stride, uv_w);
+    memcpy(v_dst + j * buf->v_stride, io->v + j * io->uv_stride, uv_w);
+  }
+  return io->mb_h;   // reports all 'mb_h' rows
+}
+
+// Point-sampling U/V sampler.
+static int EmitSampledRGB(const VP8Io* const io, WebPDecParams* const p) {
+  WebPDecBuffer* output = p->output;
+  const WebPRGBABuffer* const buf = &output->u.RGBA;
+  uint8_t* dst = buf->rgba + io->mb_y * buf->stride;   // first output row
+  const uint8_t* y_src = io->y;
+  const uint8_t* u_src = io->u;
+  const uint8_t* v_src = io->v;
+  const SampleLinePairFunc sample = kSamplers[output->colorspace];
+  const int mb_w = io->mb_w;
+  const int last = io->mb_h - 1;
+  int j;
+  for (j = 0; j < last; j += 2) {   // two luma rows share one chroma row
+    sample(y_src, y_src + io->y_stride, u_src, v_src,
+           dst, dst + buf->stride, mb_w);
+    y_src += 2 * io->y_stride;
+    u_src += io->uv_stride;
+    v_src += io->uv_stride;
+    dst += 2 * buf->stride;
+  }
+  if (j == last) {  // Odd row count: emit the last line as both top and bottom
+    sample(y_src, y_src, u_src, v_src, dst, dst, mb_w);
+  }
+  return io->mb_h;   // reports all 'mb_h' rows
+}
+
+//------------------------------------------------------------------------------
+// YUV444 -> RGB conversion
+
+#if 0   // TODO(skal): this is for future rescaling.
+static int EmitRGB(const VP8Io* const io, WebPDecParams* const p) {
+  WebPDecBuffer* output = p->output;
+  const WebPRGBABuffer* const buf = &output->u.RGBA;
+  uint8_t* dst = buf->rgba + io->mb_y * buf->stride;
+  const uint8_t* y_src = io->y;
+  const uint8_t* u_src = io->u;
+  const uint8_t* v_src = io->v;
+  const YUV444Func convert = kYUV444Converters[output->colorspace];
+  const int mb_w = io->mb_w;
+  const int last = io->mb_h;
+  int j;
+  for (j = 0; j < last; ++j) {
+    convert(y_src, u_src, v_src, dst, mb_w);
+    y_src += io->y_stride;
+    u_src += io->uv_stride;
+    v_src += io->uv_stride;
+    dst += buf->stride;
+  }
+  return io->mb_h;
+}
+#endif
+
+//------------------------------------------------------------------------------
+// Fancy upsampling
+
+#ifdef FANCY_UPSAMPLING
+static int EmitFancyRGB(const VP8Io* const io, WebPDecParams* const p) {
+  int num_lines_out = io->mb_h;   // a priori guess, adjusted below
+  const WebPRGBABuffer* const buf = &p->output->u.RGBA;
+  uint8_t* dst = buf->rgba + io->mb_y * buf->stride;
+  const UpsampleLinePairFunc upsample =
+      io->a ? kUpsamplersKeepAlpha[p->output->colorspace]
+            : kUpsamplers[p->output->colorspace];
+  const uint8_t* cur_y = io->y;
+  const uint8_t* cur_u = io->u;
+  const uint8_t* cur_v = io->v;
+  const uint8_t* top_u = p->tmp_u;   // u/v row saved by the previous call
+  const uint8_t* top_v = p->tmp_v;
+  int y = io->mb_y;
+  int y_end = io->mb_y + io->mb_h;
+  const int mb_w = io->mb_w;
+  const int uv_w = (mb_w + 1) / 2;   // chroma row width, rounded up
+  // Output lags the input by one row, which is kept in p->tmp_y/tmp_u/tmp_v.
+  if (y == 0) {
+    // First line is special cased. We mirror the u/v samples at boundary.
+    upsample(NULL, cur_y, cur_u, cur_v, cur_u, cur_v, NULL, dst, mb_w);
+  } else {
+    // We can finish the left-over line from previous call.
+    // Warning! Don't overwrite the alpha values (if any), as they
+    // are not lagging one line behind but are already written.
+    upsample(p->tmp_y, cur_y, top_u, top_v, cur_u, cur_v,
+             dst - buf->stride, dst, mb_w);
+    num_lines_out++;   // the left-over row is now complete
+  }
+  // Loop over each pair of output rows.
+  for (; y + 2 < y_end; y += 2) {
+    top_u = cur_u;
+    top_v = cur_v;
+    cur_u += io->uv_stride;
+    cur_v += io->uv_stride;
+    dst += 2 * buf->stride;
+    cur_y += 2 * io->y_stride;
+    upsample(cur_y - io->y_stride, cur_y,
+             top_u, top_v, cur_u, cur_v,
+             dst - buf->stride, dst, mb_w);
+  }
+  // Move to the last row.
+  cur_y += io->y_stride;
+  if (io->crop_top + y_end < io->crop_bottom) {
+    // Save the unfinished samples for next call (as we're not done yet).
+    memcpy(p->tmp_y, cur_y, mb_w * sizeof(*p->tmp_y));
+    memcpy(p->tmp_u, cur_u, uv_w * sizeof(*p->tmp_u));
+    memcpy(p->tmp_v, cur_v, uv_w * sizeof(*p->tmp_v));
+    // The fancy upsampler leaves a row unfinished behind
+    // (except for the very last row)
+    num_lines_out--;
+  } else {
+    // Process the very last row of even-sized picture
+    if (!(y_end & 1)) {
+      upsample(cur_y, NULL, cur_u, cur_v, cur_u, cur_v,
+              dst + buf->stride, NULL, mb_w);
+    }
+  }
+  return num_lines_out;   // number of rows completed by this call
+}
+
+#endif    /* FANCY_UPSAMPLING */
+
+//------------------------------------------------------------------------------
+
+#ifdef WEBP_EXPERIMENTAL_FEATURES
+static int EmitAlphaYUV(const VP8Io* const io, WebPDecParams* const p) {
+  const int mb_w = io->mb_w;
+  const int mb_h = io->mb_h;
+  int j;
+  const WebPYUVABuffer* const buf = &p->output->u.YUVA;
+  uint8_t* dst = buf->a + io->mb_y * buf->a_stride;
+  const uint8_t* alpha = io->a;   // may be NULL: nothing is copied then
+  if (alpha) {
+    for (j = 0; j < mb_h; ++j) {   // copy the alpha plane row by row
+      memcpy(dst, alpha, mb_w * sizeof(*dst));
+      alpha += io->width;   // source rows are 'io->width' bytes apart
+      dst += buf->a_stride;
+    }
+  }
+  return 0;   // CustomPut() ignores this value
+}
+
+static int EmitAlphaRGB(const VP8Io* const io, WebPDecParams* const p) {
+  const int mb_w = io->mb_w;
+  const int mb_h = io->mb_h;
+  int i, j;
+  const WebPRGBABuffer* const buf = &p->output->u.RGBA;
+  uint8_t* dst = buf->rgba + io->mb_y * buf->stride;
+  const uint8_t* alpha = io->a;   // may be NULL: nothing is written then
+  if (alpha) {
+    for (j = 0; j < mb_h; ++j) {
+      for (i = 0; i < mb_w; ++i) {
+        dst[4 * i + 3] = alpha[i];   // 4th byte of each 4-byte pixel
+      }
+      alpha += io->width;   // source rows are 'io->width' bytes apart
+      dst += buf->stride;
+    }
+  }
+  return 0;   // CustomPut() ignores this value
+}
+
+#endif    /* WEBP_EXPERIMENTAL_FEATURES */
+
+//------------------------------------------------------------------------------
+// Simple picture rescaler
+
+// TODO(skal): start a common library for encoder and decoder, and factorize
+// this code in.
+
+#define RFIX 30   // fixed-point precision in bits; MULT() rounds to nearest
+#define MULT(x,y) (((int64_t)(x) * (y) + (1 << (RFIX - 1))) >> RFIX)
+
+static void InitRescaler(WebPRescaler* const wrk,
+                         int src_width, int src_height,
+                         uint8_t* dst,
+                         int dst_width, int dst_height, int dst_stride,
+                         int x_add, int x_sub, int y_add, int y_sub,
+                         int32_t* work) {   // room for 2 * dst_width values
+  wrk->x_expand = (src_width < dst_width);   // horizontal upscaling?
+  wrk->src_width = src_width;
+  wrk->src_height = src_height;
+  wrk->dst_width = dst_width;
+  wrk->dst_height = dst_height;
+  wrk->dst = dst;
+  wrk->dst_stride = dst_stride;
+  // for 'x_expand', we use bilinear interpolation
+  wrk->x_add = wrk->x_expand ? (x_sub - 1) : x_add - x_sub;
+  wrk->x_sub = wrk->x_expand ? (x_add - 1) : x_sub;
+  wrk->y_accum = y_add;   // lowered by y_sub for each imported row
+  wrk->y_add = y_add;
+  wrk->y_sub = y_sub;
+  wrk->fx_scale = (1 << RFIX) / x_sub;   // 1 / x_sub, in fixed-point
+  wrk->fy_scale = (1 << RFIX) / y_sub;   // 1 / y_sub, in fixed-point
+  wrk->fxy_scale = wrk->x_expand ?
+      ((int64_t)dst_height << RFIX) / (x_sub * src_height) :
+      ((int64_t)dst_height << RFIX) / (x_add * src_height);
+  wrk->irow = work;               // accumulated rows: dst_width entries
+  wrk->frow = work + dst_width;   // last imported row: dst_width entries
+}
+
+static inline void ImportRow(const uint8_t* const src,
+                             WebPRescaler* const wrk) {
+  int x_in = 0;    // read position in 'src'
+  int x_out;       // write position in wrk->frow
+  int accum = 0;
+  if (!wrk->x_expand) {   // src_width >= dst_width: sum up source samples
+    int sum = 0;
+    for (x_out = 0; x_out < wrk->dst_width; ++x_out) {
+      accum += wrk->x_add;
+      for (; accum > 0; accum -= wrk->x_sub) {
+        sum += src[x_in++];
+      }
+      {        // Emit next horizontal pixel.
+        const int32_t base = src[x_in++];
+        const int32_t frac = base * (-accum);
+        wrk->frow[x_out] = (sum + base) * wrk->x_sub - frac;
+        // fresh fractional start for next pixel
+        sum = MULT(frac, wrk->fx_scale);
+      }
+    }
+  } else {        // simple bilinear interpolation
+    int left = src[0], right = src[0];
+    for (x_out = 0; x_out < wrk->dst_width; ++x_out) {
+      if (accum < 0) {   // step to the next pair of source samples
+        left = right;
+        right = src[++x_in];
+        accum += wrk->x_add;
+      }
+      wrk->frow[x_out] = right * wrk->x_add + (left - right) * accum;
+      accum -= wrk->x_sub;
+    }
+  }
+  // Accumulate the new row's contribution
+  for (x_out = 0; x_out < wrk->dst_width; ++x_out) {
+    wrk->irow[x_out] += wrk->frow[x_out];
+  }
+}
+
+static void ExportRow(WebPRescaler* const wrk) {
+  int x_out;
+  const int yscale = wrk->fy_scale * (-wrk->y_accum);
+  assert(wrk->y_accum <= 0);   // callers only export once a row is ready
+  for (x_out = 0; x_out < wrk->dst_width; ++x_out) {
+    const int frac = MULT(wrk->frow[x_out], yscale);  // carried to next row
+    const int v = MULT(wrk->irow[x_out] - frac, wrk->fxy_scale);
+    wrk->dst[x_out] = (!(v & ~0xff)) ? v : (v < 0) ? 0 : 255;  // clamp
+    wrk->irow[x_out] = frac;   // new fractional start
+  }
+  wrk->y_accum += wrk->y_add;
+  wrk->dst += wrk->dst_stride;   // next output row
+}
+
+#undef MULT
+#undef RFIX
+
+//------------------------------------------------------------------------------
+// YUV rescaling (no final RGB conversion needed)
+
+static int Rescale(const uint8_t* src, int src_stride,
+                   int new_lines, WebPRescaler* const wrk) {
+  int num_lines_out = 0;
+  while (new_lines-- > 0) {    // import new contribution of one source row.
+    ImportRow(src, wrk);
+    src += src_stride;
+    wrk->y_accum -= wrk->y_sub;
+    while (wrk->y_accum <= 0) {      // emit output row(s)
+      ExportRow(wrk);   // raises y_accum by y_add
+      num_lines_out++;
+    }
+  }
+  return num_lines_out;   // number of rows exported
+}
+
+static int EmitRescaledYUV(const VP8Io* const io, WebPDecParams* const p) {
+  const int mb_h = io->mb_h;
+  const int uv_mb_h = (mb_h + 1) >> 1;   // number of u/v rows, rounded up
+  const int num_lines_out = Rescale(io->y, io->y_stride, mb_h, &p->scaler_y);
+  Rescale(io->u, io->uv_stride, uv_mb_h, &p->scaler_u);
+  Rescale(io->v, io->uv_stride, uv_mb_h, &p->scaler_v);
+  return num_lines_out;   // only the luma rows are counted
+}
+
+static int EmitRescaledAlphaYUV(const VP8Io* const io, WebPDecParams* const p) {
+  if (io->a) {   // nothing to do without alpha samples
+    Rescale(io->a, io->width, io->mb_h, &p->scaler_a);  // stride: io->width
+  }
+  return 0;   // CustomPut() ignores this value
+}
+
+static int InitYUVRescaler(const VP8Io* const io, WebPDecParams* const p) {
+  const int has_alpha = (p->output->colorspace & MODE_ALPHA_MASK);
+  const WebPYUVABuffer* const buf = &p->output->u.YUVA;
+  const int out_width  = io->scaled_width;
+  const int out_height = io->scaled_height;
+  const int uv_out_width  = (out_width + 1) >> 1;
+  const int uv_out_height = (out_height + 1) >> 1;
+  const int uv_in_width  = (io->mb_w + 1) >> 1;
+  const int uv_in_height = (io->mb_h + 1) >> 1;
+  const size_t work_size = (size_t)2 * out_width;   // luma scratch memory
+  const size_t uv_work_size = (size_t)2 * uv_out_width;  // for each of u and v
+  size_t tmp_size;   // total scratch size, in int32_t units
+  int32_t* work;
+  // All rescalers share one zeroed block; sizes are computed in size_t.
+  tmp_size = work_size + 2 * uv_work_size;
+  if (has_alpha) {
+    tmp_size += work_size;
+  }
+  p->memory = calloc(tmp_size, sizeof(*work));   // calloc checks the product
+  if (p->memory == NULL) {
+    return 0;   // memory error
+  }
+  work = (int32_t*)p->memory;
+  InitRescaler(&p->scaler_y, io->mb_w, io->mb_h,
+               buf->y, out_width, out_height, buf->y_stride,
+               io->mb_w, out_width, io->mb_h, out_height,
+               work);
+  InitRescaler(&p->scaler_u, uv_in_width, uv_in_height,
+               buf->u, uv_out_width, uv_out_height, buf->u_stride,
+               uv_in_width, uv_out_width,
+               uv_in_height, uv_out_height,
+               work + work_size);
+  InitRescaler(&p->scaler_v, uv_in_width, uv_in_height,
+               buf->v, uv_out_width, uv_out_height, buf->v_stride,
+               uv_in_width, uv_out_width,
+               uv_in_height, uv_out_height,
+               work + work_size + uv_work_size);
+  p->emit = EmitRescaledYUV;
+  if (has_alpha) {
+    InitRescaler(&p->scaler_a, io->mb_w, io->mb_h,
+                 buf->a, out_width, out_height, buf->a_stride,
+                 io->mb_w, out_width, io->mb_h, out_height,
+                 work + work_size + 2 * uv_work_size);
+    p->emit_alpha = EmitRescaledAlphaYUV;
+  }
+  return 1;   // success; p->memory is released by CustomTeardown()
+}
+
+//------------------------------------------------------------------------------
+// RGBA rescaling
+
+// Import source rows until an output row is ready (y_accum <= 0) or all
+// 'new_lines' rows are consumed. Returns the number of rows imported.
+static int Import(const uint8_t* src, int src_stride,
+                  int new_lines, WebPRescaler* const wrk) {
+  int num_lines_in = 0;
+  while (num_lines_in < new_lines && wrk->y_accum > 0) {
+    ImportRow(src, wrk);
+    src += src_stride;
+    ++num_lines_in;
+    wrk->y_accum -= wrk->y_sub;   // ExportRow() adds y_add back later
+  }
+  return num_lines_in;
+}
+
+static int ExportRGB(WebPDecParams* const p, int y_pos) {
+  const YUV444Func convert = kYUV444Converters[p->output->colorspace];
+  const WebPRGBABuffer* const buf = &p->output->u.RGBA;
+  uint8_t* dst = buf->rgba + (p->last_y + y_pos) * buf->stride;
+  int num_lines_out = 0;
+  // For RGB rescaling, because of the YUV420, current scan position
+  // U/V can be +1/-1 line from the Y one.  Hence the double test.
+  while (p->scaler_y.y_accum <= 0 && p->scaler_u.y_accum <= 0) {
+    assert(p->last_y + y_pos + num_lines_out < p->output->height);
+    assert(p->scaler_u.y_accum == p->scaler_v.y_accum);
+    ExportRow(&p->scaler_y);
+    ExportRow(&p->scaler_u);
+    ExportRow(&p->scaler_v);   // dst_stride is 0 here: .dst is unchanged
+    convert(p->scaler_y.dst, p->scaler_u.dst, p->scaler_v.dst,
+            dst, p->scaler_y.dst_width);
+    dst += buf->stride;
+    num_lines_out++;
+  }
+  return num_lines_out;   // number of RGB rows written
+}
+
+static int EmitRescaledRGB(const VP8Io* const io, WebPDecParams* const p) {
+  const int mb_h = io->mb_h;
+  const int uv_mb_h = (mb_h + 1) >> 1;   // number of u/v rows, rounded up
+  int j = 0, uv_j = 0;                   // luma and u/v rows consumed so far
+  int num_lines_out = 0;
+  while (j < mb_h) {   // alternate between importing and exporting rows
+    const int y_lines_in = Import(io->y + j * io->y_stride, io->y_stride,
+                                  mb_h - j, &p->scaler_y);
+    const int u_lines_in = Import(io->u + uv_j * io->uv_stride, io->uv_stride,
+                                  uv_mb_h - uv_j, &p->scaler_u);
+    const int v_lines_in = Import(io->v + uv_j * io->uv_stride, io->uv_stride,
+                                  uv_mb_h - uv_j, &p->scaler_v);
+    (void)v_lines_in;   // remove a gcc warning
+    assert(u_lines_in == v_lines_in);
+    j += y_lines_in;
+    uv_j += u_lines_in;
+    num_lines_out += ExportRGB(p, num_lines_out);   // emit what is ready
+  }
+  return num_lines_out;
+}
+
+static int ExportAlpha(WebPDecParams* const p, int y_pos) {
+  const WebPRGBABuffer* const buf = &p->output->u.RGBA;
+  uint8_t* dst = buf->rgba + (p->last_y + y_pos) * buf->stride;
+  int num_lines_out = 0;
+  while (p->scaler_a.y_accum <= 0) {   // an alpha row is ready
+    int i;
+    assert(p->last_y + y_pos + num_lines_out < p->output->height);
+    ExportRow(&p->scaler_a);
+    for (i = 0; i < p->scaler_a.dst_width; ++i) {
+      dst[4 * i + 3] = p->scaler_a.dst[i];   // 4th byte of each 4-byte pixel
+    }
+    dst += buf->stride;
+    num_lines_out++;
+  }
+  return num_lines_out;   // number of alpha rows written
+}
+
+static int EmitRescaledAlphaRGB(const VP8Io* const io, WebPDecParams* const p) {
+  if (io->a) {   // nothing to do without alpha samples
+    int j = 0, pos = 0;   // input rows consumed / output rows written
+    while (j < io->mb_h) {
+      j += Import(io->a + j * io->width, io->width, io->mb_h - j, &p->scaler_a);
+      pos += ExportAlpha(p, pos);
+    }
+  }
+  return 0;   // CustomPut() ignores this value
+}
+
+static int InitRGBRescaler(const VP8Io* const io, WebPDecParams* const p) {
+  const int has_alpha = (p->output->colorspace & MODE_ALPHA_MASK);
+  const int out_width  = io->scaled_width;
+  const int out_height = io->scaled_height;
+  const int uv_in_width  = (io->mb_w + 1) >> 1;
+  const int uv_in_height = (io->mb_h + 1) >> 1;
+  const size_t work_size = 2 * out_width;   // one rescaler (int multiply)
+  int32_t* work;  // rescalers work area
+  uint8_t* tmp;   // tmp storage for scaled YUV444 samples before RGB conversion
+  size_t tmp_size1, tmp_size2;
+
+  tmp_size1 = 3 * work_size;   // y, u, v work areas (int32_t units)
+  tmp_size2 = 3 * out_width;   // y, u, v output rows (bytes)
+  if (has_alpha) {
+    tmp_size1 += work_size;
+    tmp_size2 += out_width;
+  }
+  p->memory =
+      calloc(1, tmp_size1 * sizeof(*work) + tmp_size2 * sizeof(*tmp));
+  if (p->memory == NULL) {
+    return 0;   // memory error
+  }
+  work = (int32_t*)p->memory;
+  tmp = (uint8_t*)(work + tmp_size1);   // byte rows follow the work areas
+  InitRescaler(&p->scaler_y, io->mb_w, io->mb_h,
+               tmp + 0 * out_width, out_width, out_height, 0,
+               io->mb_w, out_width, io->mb_h, out_height,
+               work + 0 * work_size);
+  InitRescaler(&p->scaler_u, uv_in_width, uv_in_height,
+               tmp + 1 * out_width, out_width, out_height, 0,  // full-size u
+               io->mb_w, 2 * out_width, io->mb_h, 2 * out_height,
+               work + 1 * work_size);
+  InitRescaler(&p->scaler_v, uv_in_width, uv_in_height,
+               tmp + 2 * out_width, out_width, out_height, 0,  // full-size v
+               io->mb_w, 2 * out_width, io->mb_h, 2 * out_height,
+               work + 2 * work_size);
+  p->emit = EmitRescaledRGB;
+
+  if (has_alpha) {
+    InitRescaler(&p->scaler_a, io->mb_w, io->mb_h,
+                 tmp + 3 * out_width, out_width, out_height, 0,
+                 io->mb_w, out_width, io->mb_h, out_height,
+                 work + 3 * work_size);
+    p->emit_alpha = EmitRescaledAlphaRGB;
+  }
+  return 1;   // success; p->memory is released by CustomTeardown()
+}
+
+//------------------------------------------------------------------------------
+// Default custom functions
+
+// Setup io's crop/scale/filter fields from 'options'. Returns 0 if invalid.
+static int InitFromOptions(const WebPDecoderOptions* const options,
+                           VP8Io* const io) {
+  const int W = io->width;
+  const int H = io->height;
+  int x = 0, y = 0, w = W, h = H;   // default: the whole picture
+
+  // Cropping
+  io->use_cropping = (options != NULL) && (options->use_cropping > 0);
+  if (io->use_cropping) {
+    w = options->crop_width;
+    h = options->crop_height;
+    // TODO(skal): take colorspace into account. Don't assume YUV420.
+    x = options->crop_left & ~1;   // snap the origin to even coordinates
+    y = options->crop_top & ~1;
+    if (x < 0 || y < 0 || w <= 0 || h <= 0 || w > W - x || h > H - y) {
+      return 0;  // out of frame boundary error (test can't overflow int)
+    }
+  }
+  io->crop_left   = x;
+  io->crop_top    = y;
+  io->crop_right  = x + w;
+  io->crop_bottom = y + h;
+  io->mb_w = w;
+  io->mb_h = h;
+
+  // Scaling
+  io->use_scaling = (options != NULL) && (options->use_scaling > 0);
+  if (io->use_scaling) {
+    if (options->scaled_width <= 0 || options->scaled_height <= 0) {
+      return 0;   // invalid output dimensions
+    }
+    io->scaled_width = options->scaled_width;
+    io->scaled_height = options->scaled_height;
+  }
+
+  // Filter
+  io->bypass_filtering = options && options->bypass_filtering;
+
+  // Fancy upsampler
+#ifdef FANCY_UPSAMPLING
+  io->fancy_upsampling = (options == NULL) || (!options->no_fancy_upsampling);
+#endif
+
+  if (io->use_scaling) {
+    // Also skip the filter for large downscaling ratios (user bypass is kept).
+    io->bypass_filtering |= (io->scaled_width < W * 3 / 4) &&
+                            (io->scaled_height < H * 3 / 4);
+    io->fancy_upsampling = 0;   // the rescaling paths don't use it
+  }
+  return 1;
+}
+
+static int CustomSetup(VP8Io* io) {
+  WebPDecParams* const p = (WebPDecParams*)io->opaque;
+  const int is_rgb = (p->output->colorspace < MODE_YUV);
+
+  p->memory = NULL;
+  p->emit = NULL;
+  p->emit_alpha = NULL;
+  if (!InitFromOptions(p->options, io)) {
+    return 0;   // invalid cropping or scaling options
+  }
+
+  if (io->use_scaling) {   // the rescaler init picks emit / emit_alpha
+    const int ok = is_rgb ? InitRGBRescaler(io, p) : InitYUVRescaler(io, p);
+    if (!ok) {
+      return 0;    // memory error
+    }
+  } else {
+    if (is_rgb) {
+      p->emit = EmitSampledRGB;   // default
+#ifdef FANCY_UPSAMPLING
+      if (io->fancy_upsampling) {
+        const int uv_width = (io->mb_w + 1) >> 1;
+        p->memory = malloc(io->mb_w + 2 * uv_width);   // tmp_y + tmp_u + tmp_v
+        if (p->memory == NULL) {
+          return 0;   // memory error.
+        }
+        p->tmp_y = (uint8_t*)p->memory;
+        p->tmp_u = p->tmp_y + io->mb_w;
+        p->tmp_v = p->tmp_u + uv_width;
+        p->emit = EmitFancyRGB;
+      }
+#endif
+    } else {
+      p->emit = EmitYUV;
+    }
+#ifdef WEBP_EXPERIMENTAL_FEATURES
+    if (p->output->colorspace & MODE_ALPHA_MASK) {
+      // We need transparency output
+      p->emit_alpha = is_rgb ? EmitAlphaRGB : EmitAlphaYUV;
+    }
+#endif
+  }
+
+  if (is_rgb) {
+    VP8YUVInit();   // presumably prepares YUV->RGB tables -- TODO confirm
+  }
+  return 1;
+}
+
+//------------------------------------------------------------------------------
+
+static int CustomPut(const VP8Io* io) {
+  WebPDecParams* p = (WebPDecParams*)io->opaque;
+  const int mb_w = io->mb_w;
+  const int mb_h = io->mb_h;
+  int num_lines_out;
+  assert(!(io->mb_y & 1));   // a batch is expected to start on an even row
+
+  if (mb_w <= 0 || mb_h <= 0) {
+    return 0;   // empty batch: reported as a failure
+  }
+  num_lines_out = p->emit(io, p);
+  if (p->emit_alpha) {
+    p->emit_alpha(io, p);   // its return value is not used
+  }
+  p->last_y += num_lines_out;   // running count of emitted rows
+  return 1;
+}
+
+//------------------------------------------------------------------------------
+
+static void CustomTeardown(const VP8Io* io) {
+  WebPDecParams* const p = (WebPDecParams*)io->opaque;
+  free(p->memory);    // scratch memory allocated during CustomSetup(), if any
+  p->memory = NULL;   // guards against a second release
+}
+
+//------------------------------------------------------------------------------
+// Main entry point
+
+void WebPInitCustomIo(WebPDecParams* const params, VP8Io* const io) {
+  io->put      = CustomPut;
+  io->setup    = CustomSetup;
+  io->teardown = CustomTeardown;
+  io->opaque   = params;   // read back by the three callbacks above
+}
+
+//------------------------------------------------------------------------------
+
+#if defined(__cplusplus) || defined(c_plusplus)
+}    // extern "C"
+#endif
diff --git a/src/dec/vp8.c b/src/dec/vp8.c
index 144bd501..1f1ce29d 100644
--- a/src/dec/vp8.c
+++ b/src/dec/vp8.c
@@ -76,8 +76,12 @@ int VP8SetError(VP8Decoder* const dec,
 
 //-----------------------------------------------------------------------------
 
-int VP8GetInfo(const uint8_t* data, uint32_t chunk_size,
-               int *width, int *height) {
+int VP8GetInfo(const uint8_t* data,
+               uint32_t data_size, uint32_t chunk_size,
+               int* width, int* height, int* has_alpha) {
+  if (data_size < 10) {
+    return 0;         // not enough data
+  }
   // check signature
   if (data[3] != 0x9d || data[4] != 0x01 || data[5] != 0x2a) {
     return 0;         // Wrong signature.
@@ -87,6 +91,14 @@ int VP8GetInfo(const uint8_t* data, uint32_t chunk_size,
     const int w = ((data[7] << 8) | data[6]) & 0x3fff;
     const int h = ((data[9] << 8) | data[8]) & 0x3fff;
 
+    if (has_alpha) {
+#ifdef WEBP_EXPERIMENTAL_FEATURES
+      if (data_size < 11) return 0;
+      *has_alpha = !!(data[10] & 0x80);    // the colorspace_ bit
+#else
+      *has_alpha = 0;
+#endif
+    }
     if (!key_frame) {   // Not a keyframe.
       return 0;
     }
@@ -254,7 +266,7 @@ int VP8GetHeaders(VP8Decoder* const dec, VP8Io* const io) {
                        "null VP8Io passed to VP8GetHeaders()");
   }
 
-  buf = (uint8_t *)io->data;
+  buf = (uint8_t*)io->data;
   buf_size = io->data_size;
   if (buf == NULL || buf_size <= 4) {
     return VP8SetError(dec, VP8_STATUS_NOT_ENOUGH_DATA,
@@ -329,8 +341,17 @@ int VP8GetHeaders(VP8Decoder* const dec, VP8Io* const io) {
 
     dec->mb_w_ = (pic_hdr->width_ + 15) >> 4;
     dec->mb_h_ = (pic_hdr->height_ + 15) >> 4;
+    // Setup default output area (can be later modified during io->setup())
     io->width = pic_hdr->width_;
     io->height = pic_hdr->height_;
+    io->use_scaling  = 0;
+    io->use_cropping = 0;
+    io->crop_top  = 0;
+    io->crop_left = 0;
+    io->crop_right  = io->width;
+    io->crop_bottom = io->height;
+    io->mb_w = io->width;   // sanity check
+    io->mb_h = io->height;  // ditto
 
     VP8ResetProba(&dec->proba_);
     ResetSegmentHeader(&dec->segment_hdr_);
@@ -458,7 +479,7 @@ static const uint8_t kCat4[] = { 176, 155, 140, 135, 0 };
 static const uint8_t kCat5[] = { 180, 157, 141, 134, 130, 0 };
 static const uint8_t kCat6[] =
   { 254, 254, 243, 230, 196, 177, 153, 140, 133, 130, 129, 0 };
-static const uint8_t * const kCat3456[] = { kCat3, kCat4, kCat5, kCat6 };
+static const uint8_t* const kCat3456[] = { kCat3, kCat4, kCat5, kCat6 };
 static const uint8_t kZigzag[16] = {
   0, 1, 4, 8,  5, 2, 3, 6,  9, 12, 13, 10,  7, 11, 14, 15
 };
@@ -662,11 +683,10 @@ int VP8DecodeMB(VP8Decoder* const dec, VP8BitReader* const token_br) {
 }
 
 static int ParseFrame(VP8Decoder* const dec, VP8Io* io) {
-  for (dec->mb_y_ = 0; dec->mb_y_ < dec->mb_h_; ++dec->mb_y_) {
+  for (dec->mb_y_ = 0; dec->mb_y_ < dec->br_mb_y_; ++dec->mb_y_) {
     VP8MB* const left = dec->mb_info_ - 1;
     VP8BitReader* const token_br =
         &dec->parts_[dec->mb_y_ & (dec->num_parts_ - 1)];
-
     left->nz_ = 0;
     left->dc_nz_ = 0;
     memset(dec->intra_l_, B_DC_PRED, sizeof(dec->intra_l_));
@@ -681,9 +701,11 @@ static int ParseFrame(VP8Decoder* const dec, VP8Io* io) {
       // Store data and save block's filtering params
       VP8StoreBlock(dec);
     }
+    if (dec->filter_type_ > 0) {
+      VP8FilterRow(dec);
+    }
     if (!VP8FinishRow(dec, io)) {
-      return VP8SetError(dec, VP8_STATUS_USER_ABORT,
-                         "Output aborted.");
+      return VP8SetError(dec, VP8_STATUS_USER_ABORT, "Output aborted.");
     }
   }
 
@@ -722,22 +744,18 @@ int VP8Decode(VP8Decoder* const dec, VP8Io* const io) {
   }
   assert(dec->ready_);
 
-  // will allocate memory and prepare everything.
+  // Will allocate memory and prepare everything.
   if (!VP8InitFrame(dec, io)) {
     VP8Clear(dec);
-    return VP8SetError(dec, VP8_STATUS_OUT_OF_MEMORY,
-                       "Allocation failed");
+    return 0;
   }
 
-  if (io->setup && !io->setup(io)) {
+  // Finish setting up the decoding parameter
+  if (VP8FinishFrameSetup(dec, io) != VP8_STATUS_OK) {
     VP8Clear(dec);
-    return VP8SetError(dec, VP8_STATUS_USER_ABORT,
-                       "Frame setup failed");
+    return 0;
   }
 
-  // Disable filtering per user request (_after_ setup() is called)
-  if (io->bypass_filtering) dec->filter_type_ = 0;
-
   // Main decoding loop
   {
     const int ret = ParseFrame(dec, io);
diff --git a/src/dec/vp8i.h b/src/dec/vp8i.h
index 10ac4912..587b1cb4 100644
--- a/src/dec/vp8i.h
+++ b/src/dec/vp8i.h
@@ -184,6 +184,10 @@ struct VP8Decoder {
   // dimension, in macroblock units.
   int mb_w_, mb_h_;
 
+  // Macroblock to process/filter, depending on cropping and filter_type.
+  int tl_mb_x_, tl_mb_y_;  // top-left MB that must be in-loop filtered
+  int br_mb_x_, br_mb_y_;  // last bottom-right MB that must be decoded
+
   // number of partitions.
   int num_parts_;
   // per-partition boolean decoders.
@@ -212,8 +216,8 @@ struct VP8Decoder {
   // Boundary data cache and persistent buffers.
   uint8_t* intra_t_;     // top intra modes values: 4 * mb_w_
   uint8_t  intra_l_[4];  // left intra modes values
-  uint8_t *y_t_;         // top luma samples: 16 * mb_w_
-  uint8_t *u_t_, *v_t_;  // top u/v samples: 8 * mb_w_ each
+  uint8_t* y_t_;         // top luma samples: 16 * mb_w_
+  uint8_t* u_t_, *v_t_;  // top u/v samples: 8 * mb_w_ each
 
   VP8MB* mb_info_;       // contextual macroblock infos (mb_w_ + 1)
   uint8_t* yuv_b_;       // main block for Y/U/V (size = YUV_SIZE)
@@ -264,10 +268,12 @@ struct VP8Decoder {
 int VP8SetError(VP8Decoder* const dec,
                 VP8StatusCode error, const char * const msg);
 // Validates the VP8 data-header and retrieve basic header information viz width
-// and height. Returns 0 in case of formatting error. *width/*height can be
-// passed NULL.
-int VP8GetInfo(const uint8_t* data, uint32_t data_size,
-               int *width, int *height);
+// and height. Returns 0 in case of formatting error. *width/*height/*has_alpha
+// can be passed NULL.
+int VP8GetInfo(const uint8_t* data,
+               uint32_t data_size,    // data available so far
+               uint32_t chunk_size,   // total data size expected in the chunk
+               int *width, int *height, int *has_alpha);
 
 // in tree.c
 void VP8ResetProba(VP8Proba* const proba);
@@ -281,10 +287,14 @@ void VP8ParseQuant(VP8Decoder* const dec);
 int VP8InitFrame(VP8Decoder* const dec, VP8Io* io);
 // Predict a block and add residual
 void VP8ReconstructBlock(VP8Decoder* const dec);
+// Call io->setup() and finish setting up scan parameters.
+VP8StatusCode VP8FinishFrameSetup(VP8Decoder* const dec, VP8Io* const io);
+// Filter the decoded macroblock row (if needed)
+void VP8FilterRow(const VP8Decoder* const dec);
 // Store a block, along with filtering params
 void VP8StoreBlock(VP8Decoder* const dec);
 // Finalize and transmit a complete row. Return false in case of user-abort.
-int VP8FinishRow(VP8Decoder* const dec, VP8Io* io);
+int VP8FinishRow(VP8Decoder* const dec, VP8Io* const io);
 // Decode one macroblock. Returns false if there is not enough data.
 int VP8DecodeMB(VP8Decoder* const dec, VP8BitReader* const token_br);
 
@@ -307,7 +317,7 @@ extern void (*VP8TransformWHT)(const int16_t* in, int16_t* out);
 
 // *dst is the destination block, with stride BPS. Boundary samples are
 // assumed accessible when needed.
-typedef void (*VP8PredFunc)(uint8_t *dst);
+typedef void (*VP8PredFunc)(uint8_t* dst);
 extern VP8PredFunc VP8PredLuma16[NUM_B_DC_MODES];
 extern VP8PredFunc VP8PredChroma8[NUM_B_DC_MODES];
 extern VP8PredFunc VP8PredLuma4[NUM_BMODES];
diff --git a/src/dec/webp.c b/src/dec/webp.c
index 57220a86..e642f31c 100644
--- a/src/dec/webp.c
+++ b/src/dec/webp.c
@@ -12,14 +12,11 @@
 #include <stdlib.h>
 #include "vp8i.h"
 #include "webpi.h"
-#include "yuv.h"
 
 #if defined(__cplusplus) || defined(c_plusplus)
 extern "C" {
 #endif
 
-#define FANCY_UPSCALING   // undefined to remove fancy upscaling support
-
 //-----------------------------------------------------------------------------
 // RIFF layout is:
 //   0ffset  tag
@@ -39,7 +36,7 @@ static inline uint32_t get_le32(const uint8_t* const data) {
 
 // If a RIFF container is detected, validate it and skip over it.
 uint32_t WebPCheckRIFFHeader(const uint8_t** data_ptr,
-                             uint32_t *data_size_ptr) {
+                             uint32_t* data_size_ptr) {
   uint32_t chunk_size = 0xffffffffu;
   if (*data_size_ptr >= 10 + 20 && !memcmp(*data_ptr, "RIFF", 4)) {
     if (memcmp(*data_ptr + 8, "WEBP", 4)) {
@@ -67,473 +64,96 @@ uint32_t WebPCheckRIFFHeader(const uint8_t** data_ptr,
 }
 
 //-----------------------------------------------------------------------------
-// Fancy upscaling
-
-#ifdef FANCY_UPSCALING
-
-// Given samples laid out in a square as:
-//  [a b]
-//  [c d]
-// we interpolate u/v as:
-//  ([9*a + 3*b + 3*c +   d    3*a + 9*b + 3*c +   d] + [8 8]) / 16
-//  ([3*a +   b + 9*c + 3*d      a + 3*b + 3*c + 9*d]   [8 8]) / 16
-
-// We process u and v together stashed into 32bit (16bit each).
-#define LOAD_UV(u,v) ((u) | ((v) << 16))
-
-#define UPSCALE_FUNC(FUNC_NAME, FUNC, XSTEP)                                   \
-static inline void FUNC_NAME(const uint8_t* top_y, const uint8_t* bottom_y,    \
-                             const uint8_t* top_u, const uint8_t* top_v,       \
-                             const uint8_t* cur_u, const uint8_t* cur_v,       \
-                             uint8_t* top_dst, uint8_t* bottom_dst, int len) { \
-  int x;                                                                       \
-  const int last_pixel_pair = (len - 1) >> 1;                                  \
-  uint32_t tl_uv = LOAD_UV(top_u[0], top_v[0]);   /* top-left sample */        \
-  uint32_t l_uv  = LOAD_UV(cur_u[0], cur_v[0]);   /* left-sample */            \
-  if (top_y) {                                                                 \
-    const uint32_t uv0 = (3 * tl_uv + l_uv + 0x00020002u) >> 2;                \
-    FUNC(top_y[0], uv0 & 0xff, (uv0 >> 16), top_dst);                          \
-  }                                                                            \
-  if (bottom_y) {                                                              \
-    const uint32_t uv0 = (3 * l_uv + tl_uv + 0x00020002u) >> 2;                \
-    FUNC(bottom_y[0], uv0 & 0xff, (uv0 >> 16), bottom_dst);                    \
-  }                                                                            \
-  for (x = 1; x <= last_pixel_pair; ++x) {                                     \
-    const uint32_t t_uv = LOAD_UV(top_u[x], top_v[x]);  /* top sample */       \
-    const uint32_t uv   = LOAD_UV(cur_u[x], cur_v[x]);  /* sample */           \
-    /* precompute invariant values associated with first and second diagonals*/\
-    const uint32_t avg = tl_uv + t_uv + l_uv + uv + 0x00080008u;               \
-    const uint32_t diag_12 = (avg + 2 * (t_uv + l_uv)) >> 3;                   \
-    const uint32_t diag_03 = (avg + 2 * (tl_uv + uv)) >> 3;                    \
-    if (top_y) {                                                               \
-      const uint32_t uv0 = (diag_12 + tl_uv) >> 1;                             \
-      const uint32_t uv1 = (diag_03 + t_uv) >> 1;                              \
-      FUNC(top_y[2 * x - 1], uv0 & 0xff, (uv0 >> 16),                          \
-           top_dst + (2 * x - 1) * XSTEP);                                     \
-      FUNC(top_y[2 * x - 0], uv1 & 0xff, (uv1 >> 16),                          \
-           top_dst + (2 * x - 0) * XSTEP);                                     \
-    }                                                                          \
-    if (bottom_y) {                                                            \
-      const uint32_t uv0 = (diag_03 + l_uv) >> 1;                              \
-      const uint32_t uv1 = (diag_12 + uv) >> 1;                                \
-      FUNC(bottom_y[2 * x - 1], uv0 & 0xff, (uv0 >> 16),                       \
-           bottom_dst + (2 * x - 1) * XSTEP);                                  \
-      FUNC(bottom_y[2 * x + 0], uv1 & 0xff, (uv1 >> 16),                       \
-           bottom_dst + (2 * x + 0) * XSTEP);                                  \
-    }                                                                          \
-    tl_uv = t_uv;                                                              \
-    l_uv = uv;                                                                 \
-  }                                                                            \
-  if (!(len & 1)) {                                                            \
-    if (top_y) {                                                               \
-      const uint32_t uv0 = (3 * tl_uv + l_uv + 0x00020002u) >> 2;              \
-      FUNC(top_y[len - 1], uv0 & 0xff, (uv0 >> 16),                            \
-           top_dst + (len - 1) * XSTEP);                                       \
-    }                                                                          \
-    if (bottom_y) {                                                            \
-      const uint32_t uv0 = (3 * l_uv + tl_uv + 0x00020002u) >> 2;              \
-      FUNC(bottom_y[len - 1], uv0 & 0xff, (uv0 >> 16),                         \
-           bottom_dst + (len - 1) * XSTEP);                                    \
-    }                                                                          \
-  }                                                                            \
-}
-
-// All variants implemented.
-UPSCALE_FUNC(UpscaleRgbLinePair,  VP8YuvToRgb, 3)
-UPSCALE_FUNC(UpscaleBgrLinePair,  VP8YuvToBgr, 3)
-UPSCALE_FUNC(UpscaleRgbaLinePair, VP8YuvToRgb, 4)
-UPSCALE_FUNC(UpscaleBgraLinePair, VP8YuvToBgr, 4)
-
-// Main driver function.
-static inline
-void UpscaleLinePair(const uint8_t* top_y, const uint8_t* bottom_y,
-                     const uint8_t* top_u, const uint8_t* top_v,
-                     const uint8_t* cur_u, const uint8_t* cur_v,
-                     uint8_t* top_dst, uint8_t* bottom_dst, int len,
-                     WEBP_CSP_MODE mode) {
-  if (mode == MODE_RGB) {
-    UpscaleRgbLinePair(top_y, bottom_y, top_u, top_v, cur_u, cur_v,
-                       top_dst, bottom_dst, len);
-  } else if (mode == MODE_BGR) {
-    UpscaleBgrLinePair(top_y, bottom_y, top_u, top_v, cur_u, cur_v,
-                       top_dst, bottom_dst, len);
-  } else if (mode == MODE_RGBA) {
-    UpscaleRgbaLinePair(top_y, bottom_y, top_u, top_v, cur_u, cur_v,
-                        top_dst, bottom_dst, len);
-  } else {
-    assert(mode == MODE_BGRA);
-    UpscaleBgraLinePair(top_y, bottom_y, top_u, top_v, cur_u, cur_v,
-                        top_dst, bottom_dst, len);
-  }
-}
-
-#undef LOAD_UV
-#undef UPSCALE_FUNC
-
-#endif  // FANCY_UPSCALING
-
-//-----------------------------------------------------------------------------
-// Main conversion driver.
-
-static int CustomPut(const VP8Io* io) {
-  WebPDecParams *p = (WebPDecParams*)io->opaque;
-  const int w = io->width;
-  const int mb_h = io->mb_h;
-  const int uv_w = (w + 1) / 2;
-  assert(!(io->mb_y & 1));
-
-  if (w <= 0 || mb_h <= 0) {
-    return 0;
-  }
-
-  p->last_y = io->mb_y + io->mb_h;  // a priori guess
-  if (p->mode == MODE_YUV) {
-    uint8_t* const y_dst = p->output + io->mb_y * p->stride;
-    uint8_t* const u_dst = p->u + (io->mb_y >> 1) * p->u_stride;
-    uint8_t* const v_dst = p->v + (io->mb_y >> 1) * p->v_stride;
-    int j;
-    for (j = 0; j < mb_h; ++j) {
-      memcpy(y_dst + j * p->stride, io->y + j * io->y_stride, w);
-    }
-    for (j = 0; j < (mb_h + 1) / 2; ++j) {
-      memcpy(u_dst + j * p->u_stride, io->u + j * io->uv_stride, uv_w);
-      memcpy(v_dst + j * p->v_stride, io->v + j * io->uv_stride, uv_w);
-    }
-  } else {
-    uint8_t* dst = p->output + io->mb_y * p->stride;
-    if (io->fancy_upscaling) {
-#ifdef FANCY_UPSCALING
-      const uint8_t* cur_y = io->y;
-      const uint8_t* cur_u = io->u;
-      const uint8_t* cur_v = io->v;
-      const uint8_t* top_u = p->top_u;
-      const uint8_t* top_v = p->top_v;
-      int y = io->mb_y;
-      int y_end = io->mb_y + io->mb_h;
-      if (y == 0) {
-        // First line is special cased. We mirror the u/v samples at boundary.
-        UpscaleLinePair(NULL, cur_y, cur_u, cur_v, cur_u, cur_v,
-                        NULL, dst, w, p->mode);
-      } else {
-        // We can finish the left-over line from previous call
-        UpscaleLinePair(p->top_y, cur_y, top_u, top_v, cur_u, cur_v,
-                        dst - p->stride, dst, w, p->mode);
-      }
-      // Loop over each output pairs of row.
-      for (; y + 2 < y_end; y += 2) {
-        top_u = cur_u;
-        top_v = cur_v;
-        cur_u += io->uv_stride;
-        cur_v += io->uv_stride;
-        dst += 2 * p->stride;
-        cur_y += 2 * io->y_stride;
-        UpscaleLinePair(cur_y - io->y_stride, cur_y,
-                        top_u, top_v, cur_u, cur_v,
-                        dst - p->stride, dst, w, p->mode);
-      }
-      // move to last row
-      cur_y += io->y_stride;
-      if (y_end != io->height) {
-        // Save the unfinished samples for next call (as we're not done yet).
-        memcpy(p->top_y, cur_y, w * sizeof(*p->top_y));
-        memcpy(p->top_u, cur_u, uv_w * sizeof(*p->top_u));
-        memcpy(p->top_v, cur_v, uv_w * sizeof(*p->top_v));
-        // The fancy upscaler leaves a row unfinished behind
-        // (except for the very last row)
-        p->last_y -= 1;
-      } else {
-        // Process the very last row of even-sized picture
-        if (!(y_end & 1)) {
-          UpscaleLinePair(cur_y, NULL, cur_u, cur_v, cur_u, cur_v,
-                          dst + p->stride, NULL, w, p->mode);
-        }
-      }
-#else
-      assert(0);  // shouldn't happen.
-#endif
-    } else {
-      // Point-sampling U/V upscaler.
-      int j;
-      for (j = 0; j < mb_h; ++j) {
-        const uint8_t* y_src = io->y + j * io->y_stride;
-        int i;
-        for (i = 0; i < w; ++i) {
-          const int y = y_src[i];
-          const int u = io->u[(j / 2) * io->uv_stride + (i / 2)];
-          const int v = io->v[(j / 2) * io->uv_stride + (i / 2)];
-          if (p->mode == MODE_RGB) {
-            VP8YuvToRgb(y, u, v, dst + i * 3);
-          } else if (p->mode == MODE_BGR) {
-            VP8YuvToBgr(y, u, v, dst + i * 3);
-          } else if (p->mode == MODE_RGBA) {
-            VP8YuvToRgb(y, u, v, dst + i * 4);
-          } else {
-            VP8YuvToBgr(y, u, v, dst + i * 4);
-          }
-        }
-        dst += p->stride;
-      }
-    }
-  }
-
-  // Alpha handling
-  if (p->mode == MODE_RGBA || p->mode == MODE_BGRA) {
-    int i, j;
-    uint8_t* dst = p->output + io->mb_y * p->stride + 3;
-    const uint8_t* alpha = io->a;
-    const int has_alpha = (alpha != NULL);
-#ifdef WEBP_EXPERIMENTAL_FEATURES
-    if (has_alpha) {
-      for (j = 0; j < mb_h; ++j) {
-        for (i = 0; i < w; ++i) {
-          dst[4 * i] = alpha[i];
-        }
-        alpha += io->width;
-        dst += p->stride;
-      }
-    }
-#endif
-    if (!has_alpha) {    // fill-in with 0xFFs
-      for (j = 0; j < mb_h; ++j) {
-        for (i = 0; i < w; ++i) {
-          dst[4 * i] = 0xff;
-        }
-        dst += p->stride;
-      }
-    }
-  }
-  return 1;
-}
-
-//-----------------------------------------------------------------------------
-
-static int CustomSetup(VP8Io* io) {
-#ifdef FANCY_UPSCALING
-  WebPDecParams *p = (WebPDecParams*)io->opaque;
-  p->top_y = p->top_u = p->top_v = NULL;
-  if (p->mode != MODE_YUV) {
-    const int uv_width = (io->width + 1) >> 1;
-    p->top_y = (uint8_t*)malloc(io->width + 2 * uv_width);
-    if (p->top_y == NULL) {
-      return 0;   // memory error.
-    }
-    p->top_u = p->top_y + io->width;
-    p->top_v = p->top_u + uv_width;
-    io->fancy_upscaling = 1;  // activate fancy upscaling
-  }
-#endif
-  return 1;
-}
-
-static void CustomTeardown(const VP8Io* io) {
-#ifdef FANCY_UPSCALING
-  WebPDecParams *p = (WebPDecParams*)io->opaque;
-  if (p->top_y) {
-    free(p->top_y);
-    p->top_y = p->top_u = p->top_v = NULL;
-  }
-#endif
-}
-
-void WebPInitCustomIo(VP8Io* const io) {
-  io->put = CustomPut;
-  io->setup = CustomSetup;
-  io->teardown = CustomTeardown;
-}
-
-//-----------------------------------------------------------------------------
-// Init/Check/Free decoding parameters and buffer
-
-int WebPInitDecParams(const uint8_t* data, uint32_t data_size, int* width,
-                      int* height, WebPDecParams* const params) {
-  int w, h;
-
-  if (!WebPGetInfo(data, data_size, &w, &h)) {
-    return 0;
-  }
-  if (width) *width = w;
-  if (height) *height = h;
-
-  if (!params->external_buffer) {
-    int stride;
-    int uv_stride = 0;
-    int size;
-    int uv_size = 0;
-    uint8_t* output;
-    WEBP_CSP_MODE mode = params->mode;
-
-    // initialize output buffer, now that dimensions are known.
-    stride = (mode == MODE_RGB || mode == MODE_BGR) ? 3 * w
-        : (mode == MODE_RGBA || mode == MODE_BGRA) ? 4 * w
-        : w;
-    size = stride * h;
-
-    if (mode == MODE_YUV) {
-      uv_stride = (w + 1) / 2;
-      uv_size = uv_stride * ((h + 1) / 2);
-    }
-
-    output = (uint8_t*)malloc(size + 2 * uv_size);
-    if (!output) {
-      return 0;
-    }
-
-    params->output = output;
-    params->stride = stride;
-    params->output_size = size;
-    if (mode == MODE_YUV) {
-      params->u = output + size;
-      params->u_stride = uv_stride;
-      params->output_u_size = uv_size;
-      params->v = output + size + uv_size;
-      params->v_stride = uv_stride;
-      params->output_v_size = uv_size;
-    }
-  }
-  return 1;
-}
-
-int WebPCheckDecParams(const VP8Io* io, const WebPDecParams* params) {
-  int ok = 1;
-  WEBP_CSP_MODE mode = params->mode;
-  ok &= (params->stride * io->height <= params->output_size);
-  if (mode == MODE_RGB || mode == MODE_BGR) {
-    ok &= (params->stride >= io->width * 3);
-  } else if (mode == MODE_RGBA || mode == MODE_BGRA) {
-    ok &= (params->stride >= io->width * 4);
-  } else {
-    // some extra checks for U/V
-    const int u_size = params->u_stride * ((io->height + 1) / 2);
-    const int v_size = params->v_stride * ((io->height + 1) / 2);
-    ok &= (params->stride >= io->width);
-    ok &= (params->u_stride >= (io->width + 1) / 2) &&
-          (params->v_stride >= (io->width + 1) / 2);
-    ok &= (u_size <= params->output_u_size &&
-           v_size <= params->output_v_size);
-  }
-  return ok;
-}
+// WebPDecParams
 
 void WebPResetDecParams(WebPDecParams* const params) {
-  assert(params);
-  memset(params, 0, sizeof(*params));
-}
-
-void WebPClearDecParams(WebPDecParams* params) {
-  assert(params);
-  if (!params->external_buffer) {
-    free(params->output);
+  if (params) {
+    memset(params, 0, sizeof(*params));
   }
-  WebPResetDecParams(params);
 }
 
 //-----------------------------------------------------------------------------
-// "Into" variants
+// "Into" decoding variants
 
-static uint8_t* DecodeInto(const uint8_t* data, uint32_t data_size,
-                           WebPDecParams* params) {
+// Main flow: parse headers, allocate/check 'params->output', then decode.
+static VP8StatusCode DecodeInto(const uint8_t* data, uint32_t data_size,
+                                WebPDecParams* const params) {
   VP8Decoder* dec = VP8New();
+  VP8StatusCode status = VP8_STATUS_OK;
   VP8Io io;
-  int ok = 1;
 
+  assert(params);
   if (dec == NULL) {
-    return NULL;
+    return VP8_STATUS_OUT_OF_MEMORY;   // no decoder object, not a bad param
   }
 
   VP8InitIo(&io);
   io.data = data;
   io.data_size = data_size;
+  WebPInitCustomIo(params, &io);  // Plug the I/O functions.
 
-  io.opaque = params;
-  WebPInitCustomIo(&io);
-
+  // Decode bitstream header, update io->width/io->height.
   if (!VP8GetHeaders(dec, &io)) {
-    VP8Delete(dec);
-    return NULL;
+    status = VP8_STATUS_BITSTREAM_ERROR;
+  } else {
+    // Allocate/check output buffers.
+    status = WebPAllocateDecBuffer(io.width, io.height, params->options,
+                                   params->output);
+    if (status == VP8_STATUS_OK) {
+      // Decode; on failure report the decoder's own status code.
+      if (!VP8Decode(dec, &io)) {
+        status = dec->status_;
+      }
+    }
   }
-
-  // check output buffers
-  ok = WebPCheckDecParams(&io, params);
-  if (!ok) {
-    VP8Delete(dec);
-    return NULL;
-  }
-
-  if (params->mode != MODE_YUV) {
-    VP8YUVInit();
-  }
-
-  ok = VP8Decode(dec, &io);
   VP8Delete(dec);
-  return ok ? params->output : NULL;
+  if (status != VP8_STATUS_OK) {
+    WebPFreeDecBuffer(params->output);
+  }
+  return status;
+}
+
+// Helpers
+static uint8_t* DecodeIntoRGBABuffer(WEBP_CSP_MODE colorspace,
+                                     const uint8_t* data, uint32_t data_size,
+                                     uint8_t* rgba, int stride, int size) {
+  WebPDecParams params;
+  WebPDecBuffer buf;
+  if (rgba == NULL) {
+    return NULL;
+  }
+  WebPInitDecBuffer(&buf);
+  WebPResetDecParams(&params);
+  params.output = &buf;
+  buf.colorspace    = colorspace;
+  buf.u.RGBA.rgba   = rgba;
+  buf.u.RGBA.stride = stride;
+  buf.u.RGBA.size   = size;
+  buf.is_external_memory = 1;
+  if (DecodeInto(data, data_size, &params) != VP8_STATUS_OK) {
+    return NULL;
+  }
+  return rgba;
 }
 
 uint8_t* WebPDecodeRGBInto(const uint8_t* data, uint32_t data_size,
-                           uint8_t* output, int output_size,
-                           int output_stride) {
-  WebPDecParams params;
-
-  if (output == NULL) {
-    return NULL;
-  }
-
-  WebPResetDecParams(&params);
-  params.mode = MODE_RGB;
-  params.output = output;
-  params.stride = output_stride;
-  params.output_size = output_size;
-  return DecodeInto(data, data_size, &params);
+                           uint8_t* output, int size, int stride) {
+  return DecodeIntoRGBABuffer(MODE_RGB, data, data_size, output, stride, size);
 }
 
 uint8_t* WebPDecodeRGBAInto(const uint8_t* data, uint32_t data_size,
-                            uint8_t* output, int output_size,
-                            int output_stride) {
-  WebPDecParams params;
-
-  if (output == NULL) {
-    return NULL;
-  }
-
-  WebPResetDecParams(&params);
-  params.mode = MODE_RGBA;
-  params.output = output;
-  params.stride = output_stride;
-  params.output_size = output_size;
-  return DecodeInto(data, data_size, &params);
+                            uint8_t* output, int size, int stride) {
+  return DecodeIntoRGBABuffer(MODE_RGBA, data, data_size, output, stride, size);
 }
 
 uint8_t* WebPDecodeBGRInto(const uint8_t* data, uint32_t data_size,
-                           uint8_t* output, int output_size,
-                           int output_stride) {
-  WebPDecParams params;
-
-  if (output == NULL) {
-    return NULL;
-  }
-
-  WebPResetDecParams(&params);
-  params.mode = MODE_BGR;
-  params.output = output;
-  params.stride = output_stride;
-  params.output_size = output_size;
-  return DecodeInto(data, data_size, &params);
+                           uint8_t* output, int size, int stride) {
+  return DecodeIntoRGBABuffer(MODE_BGR, data, data_size, output, stride, size);
 }
 
 uint8_t* WebPDecodeBGRAInto(const uint8_t* data, uint32_t data_size,
-                            uint8_t* output, int output_size,
-                            int output_stride) {
-  WebPDecParams params;
-
-  if (output == NULL) {
-    return NULL;
-  }
-
-  WebPResetDecParams(&params);
-  params.mode = MODE_BGRA;
-  params.output = output;
-  params.stride = output_stride;
-  params.output_size = output_size;
-  return DecodeInto(data, data_size, &params);
+                            uint8_t* output, int size, int stride) {
+  return DecodeIntoRGBABuffer(MODE_BGRA, data, data_size, output, stride, size);
 }
 
 uint8_t* WebPDecodeYUVInto(const uint8_t* data, uint32_t data_size,
@@ -541,85 +161,93 @@ uint8_t* WebPDecodeYUVInto(const uint8_t* data, uint32_t data_size,
                            uint8_t* u, int u_size, int u_stride,
                            uint8_t* v, int v_size, int v_stride) {
   WebPDecParams params;
-
-  if (luma == NULL) {
+  WebPDecBuffer output;
+  if (luma == NULL) return NULL;
+  WebPInitDecBuffer(&output);
+  WebPResetDecParams(&params);
+  params.output = &output;
+  output.colorspace      = MODE_YUV;
+  output.u.YUVA.y        = luma;
+  output.u.YUVA.y_stride = luma_stride;
+  output.u.YUVA.y_size   = luma_size;
+  output.u.YUVA.u        = u;
+  output.u.YUVA.u_stride = u_stride;
+  output.u.YUVA.u_size   = u_size;
+  output.u.YUVA.v        = v;
+  output.u.YUVA.v_stride = v_stride;
+  output.u.YUVA.v_size   = v_size;
+  output.is_external_memory = 1;
+  if (DecodeInto(data, data_size, &params) != VP8_STATUS_OK) {
     return NULL;
   }
-
-  WebPResetDecParams(&params);
-  params.mode = MODE_YUV;
-  params.output = luma;
-  params.stride = luma_stride;
-  params.output_size = luma_size;
-  params.u = u;
-  params.u_stride = u_stride;
-  params.output_u_size = u_size;
-  params.v = v;
-  params.v_stride = v_stride;
-  params.output_v_size = v_size;
-  return DecodeInto(data, data_size, &params);
+  return luma;
 }
 
 //-----------------------------------------------------------------------------
 
 static uint8_t* Decode(WEBP_CSP_MODE mode, const uint8_t* data,
                        uint32_t data_size, int* width, int* height,
-                       WebPDecParams* params_out) {
-  uint8_t* output;
+                       WebPDecBuffer* keep_info) {
   WebPDecParams params;
+  WebPDecBuffer output;
 
+  WebPInitDecBuffer(&output);
   WebPResetDecParams(&params);
-  params.mode = mode;
-  if (!WebPInitDecParams(data, data_size, width, height, &params)) {
+  params.output = &output;
+  output.colorspace = mode;
+
+  // Retrieve (and report back) the required dimensions from bitstream.
+  if (!WebPGetInfo(data, data_size, &output.width, &output.height)) {
     return NULL;
   }
+  if (width) *width = output.width;
+  if (height) *height = output.height;
 
-  params.output_size = params.stride * (*height);
-  params.output_u_size = params.output_v_size =
-    params.u_stride * ((*height + 1) / 2);
-  output = DecodeInto(data, data_size, &params);
-  if (!output) {
-    WebPClearDecParams(&params);
+  // Decode
+  if (DecodeInto(data, data_size, &params) != VP8_STATUS_OK) {
+    return NULL;
   }
-  if (params_out) {
-    *params_out = params;
+  if (keep_info) {    // keep track of the side-info
+    WebPCopyDecBuffer(&output, keep_info);
   }
-  return output;
+  // return decoded samples (don't clear 'output'!)
+  return (mode >= MODE_YUV) ? output.u.YUVA.y : output.u.RGBA.rgba;
 }
 
 uint8_t* WebPDecodeRGB(const uint8_t* data, uint32_t data_size,
-                       int *width, int *height) {
+                       int* width, int* height) {
   return Decode(MODE_RGB, data, data_size, width, height, NULL);
 }
 
 uint8_t* WebPDecodeRGBA(const uint8_t* data, uint32_t data_size,
-                        int *width, int *height) {
+                        int* width, int* height) {
   return Decode(MODE_RGBA, data, data_size, width, height, NULL);
 }
 
 uint8_t* WebPDecodeBGR(const uint8_t* data, uint32_t data_size,
-                       int *width, int *height) {
+                       int* width, int* height) {
   return Decode(MODE_BGR, data, data_size, width, height, NULL);
 }
 
 uint8_t* WebPDecodeBGRA(const uint8_t* data, uint32_t data_size,
-                        int *width, int *height) {
+                        int* width, int* height) {
   return Decode(MODE_BGRA, data, data_size, width, height, NULL);
 }
 
 uint8_t* WebPDecodeYUV(const uint8_t* data, uint32_t data_size,
-                       int *width, int *height, uint8_t** u, uint8_t** v,
-                       int *stride, int* uv_stride) {
-  WebPDecParams params;
+                       int* width, int* height, uint8_t** u, uint8_t** v,
+                       int* stride, int* uv_stride) {
+  WebPDecBuffer output;   // only to preserve the side-infos
   uint8_t* const out = Decode(MODE_YUV, data, data_size,
-                              width, height, &params);
+                              width, height, &output);
 
   if (out) {
-    *u = params.u;
-    *v = params.v;
-    *stride = params.stride;
-    *uv_stride = params.u_stride;
-    assert(params.u_stride == params.v_stride);
+    const WebPYUVABuffer* const buf = &output.u.YUVA;
+    *u = buf->u;
+    *v = buf->v;
+    *stride = buf->y_stride;
+    *uv_stride = buf->u_stride;
+    assert(buf->u_stride == buf->v_stride);
   }
   return out;
 }
@@ -628,16 +256,91 @@ uint8_t* WebPDecodeYUV(const uint8_t* data, uint32_t data_size,
 // WebPGetInfo()
 
 int WebPGetInfo(const uint8_t* data, uint32_t data_size,
-                int *width, int *height) {
+                int* width, int* height) {
   const uint32_t chunk_size = WebPCheckRIFFHeader(&data, &data_size);
   if (!chunk_size) {
     return 0;         // unsupported RIFF header
   }
   // Validate raw video data
-  if (data_size < 10) {
-    return 0;         // not enough data
+  return VP8GetInfo(data, data_size, chunk_size, width, height, NULL);
+}
+
+static void DefaultFeatures(WebPBitstreamFeatures* const features) {
+  assert(features);
+  memset(features, 0, sizeof(*features));
+  features->bitstream_version = 0;
+}
+
+static VP8StatusCode GetFeatures(const uint8_t** data, uint32_t* data_size,
+                                 WebPBitstreamFeatures* const features) {
+  uint32_t chunk_size;
+  if (features == NULL) {
+    return VP8_STATUS_INVALID_PARAM;
   }
-  return VP8GetInfo(data, chunk_size, width, height);
+  DefaultFeatures(features);   // zero the output before any later early exit
+  if (data == NULL || *data == NULL || data_size == NULL || *data_size == 0) {
+    return VP8_STATUS_INVALID_PARAM;   // missing or empty input buffer
+  }
+  chunk_size = WebPCheckRIFFHeader(data, data_size);
+  if (chunk_size == 0) {
+    return VP8_STATUS_BITSTREAM_ERROR;   // unsupported RIFF header
+  }
+  if (!VP8GetInfo(*data, *data_size, chunk_size,
+                  &features->width, &features->height, &features->has_alpha)) {
+    return VP8_STATUS_BITSTREAM_ERROR;
+  }
+  return VP8_STATUS_OK;
+}
+
+//-----------------------------------------------------------------------------
+// Advanced decoding API
+
+int WebPInitDecoderConfigInternal(WebPDecoderConfig* const config,
+                                  int version) {
+  // Returns 0 on ABI version mismatch or NULL 'config', 1 on success.
+  const int abi_matches = (version == WEBP_DECODER_ABI_VERSION);
+  if (!abi_matches || config == NULL) {
+    return 0;
+  }
+  // Zero the whole object, then reset the 'input' and 'output' members.
+  memset(config, 0, sizeof(*config));
+  DefaultFeatures(&config->input);
+  WebPInitDecBuffer(&config->output);
+  return 1;
+}
+
+VP8StatusCode WebPGetFeaturesInternal(const uint8_t* data, uint32_t data_size,
+                                      WebPBitstreamFeatures* const features,
+                                      int version) {
+  // Reject an ABI version mismatch or a missing 'features' output object.
+  const int abi_matches = (version == WEBP_DECODER_ABI_VERSION);
+  if (!abi_matches || features == NULL) {
+    return VP8_STATUS_INVALID_PARAM;
+  }
+  // Pass the addresses of our by-value copies of 'data' and 'data_size'.
+  return GetFeatures(&data, &data_size, features);
+}
+
+VP8StatusCode WebPDecode(const uint8_t* data, uint32_t data_size,
+                         WebPDecoderConfig* const config) {
+  WebPDecParams params;
+  VP8StatusCode status;
+
+  if (!config) {
+    return VP8_STATUS_INVALID_PARAM;
+  }
+
+  status = GetFeatures(&data, &data_size, &config->input);
+  if (status != VP8_STATUS_OK) {
+    return status;
+  }
+
+  WebPResetDecParams(&params);
+  params.output = &config->output;
+  params.options = &config->options;
+  status = DecodeInto(data, data_size, &params);
+
+  return status;
 }
 
 #if defined(__cplusplus) || defined(c_plusplus)
diff --git a/src/dec/webpi.h b/src/dec/webpi.h
index ce2d653b..662441da 100644
--- a/src/dec/webpi.h
+++ b/src/dec/webpi.h
@@ -18,46 +18,81 @@ extern "C" {
 
 #include "../webp/decode_vp8.h"
 
-// Decoding output parameters.
+//------------------------------------------------------------------------------
+// WebPDecParams: Decoding output parameters. Transient internal object.
+
+typedef struct WebPDecParams WebPDecParams;
+typedef int (*OutputFunc)(const VP8Io* const io, WebPDecParams* const p);
+
+// Structure used for on-the-fly rescaling
 typedef struct {
-  uint8_t* output;      // rgb(a) or luma
-  uint8_t *u, *v;       // chroma u/v
-  uint8_t *top_y, *top_u, *top_v;   // cache for the fancy upscaler
-  int stride;           // rgb(a) stride or luma stride
-  int u_stride;         // chroma-u stride
-  int v_stride;         // chroma-v stride
-  WEBP_CSP_MODE mode;   // rgb(a) or yuv
-  int last_y;           // coordinate of the line that was last output
-  int output_size;      // size of 'output' buffer
-  int output_u_size;    // size of 'u' buffer
-  int output_v_size;    // size of 'v' buffer
-  int external_buffer;  // If true, the output buffers are externally owned
-} WebPDecParams;
+  int x_expand;               // true if we're expanding in the x direction
+  int fy_scale, fx_scale;     // fixed-point scaling factor
+  int64_t fxy_scale;          // ''
+  // we need hpel-precise add/sub increments, for the downsampled U/V planes.
+  int y_accum;                // vertical accumulator
+  int y_add, y_sub;           // vertical increments (add ~= src, sub ~= dst)
+  int x_add, x_sub;           // horizontal increments (add ~= src, sub ~= dst)
+  int src_width, src_height;  // source dimensions
+  int dst_width, dst_height;  // destination dimensions
+  uint8_t* dst;
+  int dst_stride;
+  int32_t* irow, *frow;       // work buffer
+} WebPRescaler;
+
+struct WebPDecParams {
+  WebPDecBuffer* output;             // output buffer.
+  uint8_t* tmp_y, *tmp_u, *tmp_v;    // cache for the fancy upsampler
+                                     // or used for tmp rescaling
+
+  int last_y;                 // coordinate of the line that was last output
+  const WebPDecoderOptions* options;  // if not NULL, use alt decoding features
+  // rescalers
+  WebPRescaler scaler_y, scaler_u, scaler_v, scaler_a;
+  void* memory;               // overall scratch memory for the output work.
+  OutputFunc emit;            // output RGB or YUV samples
+  OutputFunc emit_alpha;      // output alpha channel
+};
 
 // Should be called first, before any use of the WebPDecParams object.
 void WebPResetDecParams(WebPDecParams* const params);
 
+//------------------------------------------------------------------------------
+// Misc utils
+
 // If a RIFF container is detected, validate it and skip over it. Returns
 // VP8 bit-stream size if RIFF header is valid else returns 0
 uint32_t WebPCheckRIFFHeader(const uint8_t** data_ptr,
-                             uint32_t *data_size_ptr);
+                             uint32_t* data_size_ptr);
 
-// Initializes VP8Io with custom setup, io and teardown functions
-void WebPInitCustomIo(VP8Io* const io);
+// Initializes VP8Io with custom setup, io and teardown functions. The default
+// hooks will use the supplied 'params' as io->opaque handle.
+void WebPInitCustomIo(WebPDecParams* const params, VP8Io* const io);
 
-// Initializes params_out by allocating output buffer and setting the
-// stride information. It also outputs width and height information of
-// the WebP image. Returns 1 if succeeds.
-int WebPInitDecParams(const uint8_t* data, uint32_t data_size, int* width,
-                      int* height, WebPDecParams* const params_out);
+//------------------------------------------------------------------------------
+// Internal functions regarding WebPDecBuffer memory (in buffer.c).
+// Don't really need to be externally visible for now.
 
-// Verifies various size configurations (e.g stride >= width, specified
-// output size <= stride * height etc.). Returns 0 if checks fail.
-int WebPCheckDecParams(const VP8Io* io, const WebPDecParams* params);
+// Prepare 'buffer' with the requested initial dimensions width/height.
+// If no external storage is supplied, initializes buffer by allocating output
+// memory and setting up the stride information. Validate the parameters. Return
+// an error code in case of problem (no memory, or invalid stride / size /
+// dimension / etc.). If *options is not NULL, also verify that the options'
+// parameters are valid and apply them to the width/height dimensions of the
+// output buffer. This takes cropping / scaling / rotation into account.
+VP8StatusCode WebPAllocateDecBuffer(int width, int height,
+                                    const WebPDecoderOptions* const options,
+                                    WebPDecBuffer* const buffer);
 
-// Deallocate memory allocated by WebPInitDecParams() and reset the
-// WebPDecParams object.
-void WebPClearDecParams(WebPDecParams* params);
+// Copy 'src' into 'dst' buffer, making sure 'dst' is not marked as owner of the
+// memory (still held by 'src').
+void WebPCopyDecBuffer(const WebPDecBuffer* const src,
+                       WebPDecBuffer* const dst);
+
+// Copy and transfer ownership from src to dst (beware of parameter order!)
+void WebPGrabDecBuffer(WebPDecBuffer* const src, WebPDecBuffer* const dst);
+
+//------------------------------------------------------------------------------
 
 #if defined(__cplusplus) || defined(c_plusplus)
 }    // extern "C"
diff --git a/src/dec/yuv.h b/src/dec/yuv.h
index 1398f2e0..0604df79 100644
--- a/src/dec/yuv.h
+++ b/src/dec/yuv.h
@@ -26,7 +26,7 @@ extern int16_t VP8kVToR[256], VP8kUToB[256];
 extern int32_t VP8kVToG[256], VP8kUToG[256];
 extern uint8_t VP8kClip[YUV_RANGE_MAX - YUV_RANGE_MIN];
 
-inline static void VP8YuvToRgb(uint8_t y, uint8_t u, uint8_t v,
+static inline void VP8YuvToRgb(uint8_t y, uint8_t u, uint8_t v,
                                uint8_t* const rgb) {
   const int r_off = VP8kVToR[v];
   const int g_off = (VP8kVToG[v] + VP8kUToG[u]) >> YUV_FIX;
@@ -36,7 +36,7 @@ inline static void VP8YuvToRgb(uint8_t y, uint8_t u, uint8_t v,
   rgb[2] = VP8kClip[y + b_off - YUV_RANGE_MIN];
 }
 
-inline static void VP8YuvToBgr(uint8_t y, uint8_t u, uint8_t v,
+static inline void VP8YuvToBgr(uint8_t y, uint8_t u, uint8_t v,
                                uint8_t* const bgr) {
   const int r_off = VP8kVToR[v];
   const int g_off = (VP8kVToG[v] + VP8kUToG[u]) >> YUV_FIX;
@@ -46,11 +46,18 @@ inline static void VP8YuvToBgr(uint8_t y, uint8_t u, uint8_t v,
   bgr[2] = VP8kClip[y + r_off - YUV_RANGE_MIN];
 }
 
-inline static void VP8YuvToBgra(int y, int u, int v, uint8_t* const bgra) {
+static inline void VP8YuvToBgra(uint8_t y, uint8_t u, uint8_t v,
+                                uint8_t* const bgra) {
   VP8YuvToBgr(y, u, v, bgra);
   bgra[3] = 0xff;
 }
 
+static inline void VP8YuvToRgba(uint8_t y, uint8_t u, uint8_t v,
+                                uint8_t* const rgba) {
+  VP8YuvToRgb(y, u, v, rgba);
+  rgba[3] = 0xff;
+}
+
 // Must be called before everything, to initialize the tables.
 void VP8YUVInit(void);
 
diff --git a/src/webp/decode.h b/src/webp/decode.h
index a53eb599..af132190 100644
--- a/src/webp/decode.h
+++ b/src/webp/decode.h
@@ -18,6 +18,8 @@
 extern "C" {
 #endif
 
+#define WEBP_DECODER_ABI_VERSION 0x0002
+
 // Return the decoder's version number, packed in hexadecimal using 8bits for
 // each of major/minor/revision. E.g: v2.5.7 is 0x020507.
 int WebPGetDecoderVersion(void);
@@ -27,25 +29,25 @@ int WebPGetDecoderVersion(void);
 // case of formatting error.
 // Pointers *width/*height can be passed NULL if deemed irrelevant.
 int WebPGetInfo(const uint8_t* data, uint32_t data_size,
-                int *width, int *height);
+                int* width, int* height);
 
 // Decodes WEBP images pointed to by *data and returns RGB samples, along
 // with the dimensions in *width and *height.
 // The returned pointer should be deleted calling free().
 // Returns NULL in case of error.
 uint8_t* WebPDecodeRGB(const uint8_t* data, uint32_t data_size,
-                       int *width, int *height);
+                       int* width, int* height);
 
 // Same as WebPDecodeRGB, but returning RGBA data.
 uint8_t* WebPDecodeRGBA(const uint8_t* data, uint32_t data_size,
-                        int *width, int *height);
+                        int* width, int* height);
 
 // This variant decode to BGR instead of RGB.
 uint8_t* WebPDecodeBGR(const uint8_t* data, uint32_t data_size,
-                       int *width, int *height);
+                       int* width, int* height);
 // This variant decodes to BGRA instead of RGBA.
 uint8_t* WebPDecodeBGRA(const uint8_t* data, uint32_t data_size,
-                        int *width, int *height);
+                        int* width, int* height);
 
 // Decode WEBP images stored in *data in Y'UV format(*). The pointer returned is
 // the Y samples buffer. Upon return, *u and *v will point to the U and V
@@ -57,8 +59,8 @@ uint8_t* WebPDecodeBGRA(const uint8_t* data, uint32_t data_size,
 // Return NULL in case of error.
 // (*) Also named Y'CbCr. See: http://en.wikipedia.org/wiki/YCbCr
 uint8_t* WebPDecodeYUV(const uint8_t* data, uint32_t data_size,
-                       int *width, int *height, uint8_t** u, uint8_t** v,
-                       int *stride, int* uv_stride);
+                       int* width, int* height, uint8_t** u, uint8_t** v,
+                       int* stride, int* uv_stride);
 
 // These three functions are variants of the above ones, that decode the image
 // directly into a pre-allocated buffer 'output_buffer'. The maximum storage
@@ -95,13 +97,59 @@ uint8_t* WebPDecodeYUVInto(const uint8_t* data, uint32_t data_size,
                            uint8_t* v, int v_size, int v_stride);
 
 //-----------------------------------------------------------------------------
+// Output colorspaces and buffer
 
-// Output colorspaces
+// Colorspaces
 typedef enum { MODE_RGB = 0, MODE_RGBA = 1,
                MODE_BGR = 2, MODE_BGRA = 3,
-               MODE_YUV = 4 } WEBP_CSP_MODE;
+               MODE_YUV = 4, MODE_YUVA = 5  // yuv 4:2:0
+             } WEBP_CSP_MODE;
 
+// Generic structure for describing the sample buffer.
+typedef struct {    // view as RGBA
+  uint8_t* rgba;    // pointer to RGBA samples
+  int stride;       // stride in bytes from one scanline to the next.
+  int size;         // total size of the *rgba buffer.
+} WebPRGBABuffer;
+
+typedef struct {              // view as YUVA
+  uint8_t* y, *u, *v, *a;     // pointer to luma, chroma U/V, alpha samples
+  int y_stride;               // luma stride
+  int u_stride, v_stride;     // chroma strides
+  int a_stride;               // alpha stride
+  int y_size;                 // luma plane size
+  int u_size, v_size;         // chroma planes size
+  int a_size;                 // alpha-plane size
+} WebPYUVABuffer;
+
+// Output buffer
+typedef struct {
+  WEBP_CSP_MODE colorspace;  // Colorspace.
+  int width, height;         // Dimensions.
+  int is_external_memory;    // If true, the *memory pointer is not owned.
+  union {
+    WebPRGBABuffer RGBA;
+    WebPYUVABuffer YUVA;
+  } u;                       // nameless union of buffer parameters.
+  uint8_t* memory;           // main pointer (when is_external_memory is false)
+} WebPDecBuffer;
+
+// Internal, version-checked, entry point
+int WebPInitDecBufferInternal(WebPDecBuffer* const, int);
+
+// Initialize the structure as empty. Must be called before any other use.
+// Returns false in case of version mismatch
+static inline int WebPInitDecBuffer(WebPDecBuffer* const buffer) {
+  return WebPInitDecBufferInternal(buffer, WEBP_DECODER_ABI_VERSION);
+}
+
+// Free any memory associated with the buffer. Must always be called last.
+// Note: doesn't free the 'buffer' structure itself.
+void WebPFreeDecBuffer(WebPDecBuffer* const buffer);
+
+//-----------------------------------------------------------------------------
 // Enumeration of the status codes
+
 typedef enum {
   VP8_STATUS_OK = 0,
   VP8_STATUS_OUT_OF_MEMORY,
@@ -116,8 +164,8 @@ typedef enum {
 //-----------------------------------------------------------------------------
 // Incremental decoding
 //
-//  This API allows streamlined decoding of partial data.
-//  Picture can be incrementally decoded as data become available thanks to the
+// This API allows streamlined decoding of partial data.
+// Picture can be incrementally decoded as data become available thanks to the
 // WebPIDecoder object. This object can be left in a SUSPENDED state if the
 // picture is only partially decoded, pending additional input.
 // Code example:
@@ -138,7 +186,16 @@ typedef enum {
 
 typedef struct WebPIDecoder WebPIDecoder;
 
+// Creates a new incremental decoder with the supplied buffer parameter.
+// 'output_buffer' can be passed as NULL, in which case a default output buffer
+// is used (with MODE_RGB). Otherwise, an internal reference to 'output_buffer'
+// is kept, which means that the lifespan of 'output_buffer' must be longer than
+// that of the returned WebPIDecoder object.
+// Returns NULL if the allocation failed.
+WebPIDecoder* WebPINewDecoder(WebPDecBuffer* const output_buffer);
+
 // Creates a WebPIDecoder object. Returns NULL in case of failure.
+// TODO(skal): DEPRECATED. Prefer using WebPINewDecoder().
 WebPIDecoder* WebPINew(WEBP_CSP_MODE mode);
 
 // This function allocates and initializes an incremental-decoder object, which
@@ -183,7 +240,7 @@ VP8StatusCode WebPIUpdate(WebPIDecoder* const idec, const uint8_t* data,
 // specified in WebPINew()/WebPINewRGB(). *last_y is the index of last decoded
 // row in raster scan order. Some pointers (*last_y, *width etc.) can be NULL if
 // corresponding information is not needed.
-uint8_t* WebPIDecGetRGB(const WebPIDecoder* const idec, int *last_y,
+uint8_t* WebPIDecGetRGB(const WebPIDecoder* const idec, int* last_y,
                         int* width, int* height, int* stride);
 
 // Same as above function to get YUV image. Returns pointer to the luma plane
@@ -192,6 +249,122 @@ uint8_t* WebPIDecGetYUV(const WebPIDecoder* const idec, int* last_y,
                         uint8_t** u, uint8_t** v,
                         int* width, int* height, int* stride, int* uv_stride);
 
+// Generic call to retrieve output buffer information.
+// Returns NULL in case of error, otherwise returns the pointer to the internal
+// representation. This structure is read-only and shouldn't be modified.
+// TODO(skal): instead of 'last_y' only, we should pass *left/top/right/bottom,
+// to get the visible area. Esp. useful for rotation.
+const WebPDecBuffer* WebPIDecGetSamples(const WebPIDecoder* const idec,
+                                        int* last_y);
+
+//-----------------------------------------------------------------------------
+// Advanced decoding parametrization
+//
+//  Code sample for using the advanced decoding API
+/*
+     // A) Init a configuration object
+     WebPDecoderConfig config;
+     CHECK(WebPInitDecoderConfig(&config));
+
+     // B) optional: retrieve the bitstream's features.
+     CHECK(WebPGetFeatures(data, data_size, &config.input) == VP8_STATUS_OK);
+
+     // C) Adjust 'config', if needed
+     config.options.no_fancy_upsampling = 1;
+     config.output.colorspace = MODE_BGRA;
+     // etc.
+
+     // Note that you can also make config.output point to an externally
+     // supplied memory buffer, provided it's big enough to store the decoded
+     // picture. Otherwise, config.output will just be used to allocate memory
+     // and store the decoded picture.
+
+     // D) Decode!
+     CHECK(WebPDecode(data, data_size, &config) == VP8_STATUS_OK);
+
+     // E) Decoded image is now in config.output (and config.output.u.RGBA)
+
+     // F) Reclaim memory allocated in config's object. It's safe to call
+     // this function even if the memory is external and wasn't allocated
+     // by WebPDecode().
+     WebPFreeDecBuffer(&config.output);
+*/
+
+// Features gathered from the bitstream
+typedef struct {
+  int width;        // the original width, as read from the bitstream
+  int height;       // the original height, as read from the bitstream
+  int has_alpha;    // true if bitstream contains an alpha channel
+  int no_incremental_decoding;  // if true, using incremental decoding is not
+                                // recommended.
+  int rotate;                   // TODO(later)
+  int uv_sampling;              // should be 0 for now. TODO(later)
+  int bitstream_version;        // should be 0 for now. TODO(later)
+} WebPBitstreamFeatures;
+
+// Internal, version-checked, entry point
+extern VP8StatusCode WebPGetFeaturesInternal(const uint8_t*, uint32_t,
+                                             WebPBitstreamFeatures* const, int);
+
+// Retrieve features from the bitstream. The *features structure is filled
+// with information gathered from the bitstream.
+// Returns VP8_STATUS_OK when the features were successfully retrieved, and
+// an error status code otherwise (e.g. in case of version mismatch).
+static inline
+  VP8StatusCode WebPGetFeatures(const uint8_t* data, uint32_t data_size,
+                                WebPBitstreamFeatures* const features) {
+  return WebPGetFeaturesInternal(data, data_size, features,
+                                 WEBP_DECODER_ABI_VERSION);
+}
+
+// Decoding options
+typedef struct {
+  int bypass_filtering;               // if true, skip the in-loop filtering
+  int no_fancy_upsampling;            // if true, use faster pointwise upsampler
+  int use_cropping;                   // if true, cropping is applied _first_
+  int crop_left, crop_top;            // top-left position for cropping.
+                                      // Will be snapped to even values.
+  int crop_width, crop_height;        // dimension of the cropping area
+  int use_scaling;                    // if true, scaling is applied _afterward_
+  int scaled_width, scaled_height;    // final resolution
+  int force_rotation;                 // forced rotation (to be applied _last_)
+  int no_enhancement;                 // if true, discard enhancement layer
+} WebPDecoderOptions;
+
+// Main object storing the configuration for advanced decoding.
+typedef struct {
+  WebPBitstreamFeatures input;  // Immutable bitstream features (optional)
+  WebPDecBuffer output;         // Output buffer (can point to external mem)
+  WebPDecoderOptions options;   // Decoding options
+} WebPDecoderConfig;
+
+// Internal, version-checked, entry point
+extern int WebPInitDecoderConfigInternal(WebPDecoderConfig* const, int);
+
+// Initialize the configuration as empty. This function must always be
+// called first, unless WebPGetFeatures() is to be called.
+// Returns false in case of mismatched version.
+static inline int WebPInitDecoderConfig(WebPDecoderConfig* const config) {
+  return WebPInitDecoderConfigInternal(config, WEBP_DECODER_ABI_VERSION);
+}
+
+// Instantiate a new incremental decoder object with requested configuration.
+// The bitstream can be passed using *data and data_size parameter,
+// in which case the features will be parsed and stored into config->input.
+// Otherwise, 'data' can be NULL and no parsing will occur.
+// Note that 'config' can be NULL too, in which case a default configuration is
+// used.
+// The returned WebPIDecoder object must always be deleted with WebPIDelete().
+// Returns NULL in case of error. (Note: WebPDecoderConfig declares no
+// 'status' field, so no error code can be read back from 'config'.)
+WebPIDecoder* WebPIDecode(const uint8_t* data, uint32_t data_size,
+                          WebPDecoderConfig* const config);
+
+// Non-incremental version. This version decodes the full data at once, taking
+// 'config' into account. Return decoding status (VP8_STATUS_OK if decoding
+// was successful).
+VP8StatusCode WebPDecode(const uint8_t* data, uint32_t data_size,
+                         WebPDecoderConfig* const config);
 
 #if defined(__cplusplus) || defined(c_plusplus)
 }    // extern "C"
diff --git a/src/webp/decode_vp8.h b/src/webp/decode_vp8.h
index 3f24ea24..68a9f975 100644
--- a/src/webp/decode_vp8.h
+++ b/src/webp/decode_vp8.h
@@ -18,8 +18,6 @@
 extern "C" {
 #endif
 
-#define WEBP_DECODER_ABI_VERSION 0x0002
-
 //-----------------------------------------------------------------------------
 // Lower-level API
 //
@@ -42,12 +40,16 @@ extern "C" {
 typedef struct VP8Io VP8Io;
 struct VP8Io {
   // set by VP8GetHeaders()
-  int width, height;         // picture dimensions, in pixels
+  int width, height;         // picture dimensions, in pixels (invariable).
+                             // These are the original, uncropped dimensions.
+                             // The actual area passed to put() is stored
+                             // in mb_w / mb_h fields.
 
   // set before calling put()
   int mb_y;                  // position of the current rows (in pixels)
+  int mb_w;                  // number of columns in the sample
   int mb_h;                  // number of rows in the sample
-  const uint8_t *y, *u, *v;  // rows to copy (in yuv420 format)
+  const uint8_t* y, *u, *v;  // rows to copy (in yuv420 format)
   int y_stride;              // row stride for luma
   int uv_stride;             // row stride for chroma
 
@@ -56,7 +58,8 @@ struct VP8Io {
   // called when fresh samples are available. Currently, samples are in
   // YUV420 format, and can be up to width x 24 in size (depending on the
   // in-loop filtering level, e.g.). Should return false in case of error
-  // or abort request.
+  // or abort request. The area to update is mb_w x mb_h in size, taking
+  // cropping into account.
   int (*put)(const VP8Io* io);
 
   // called just before starting to decode the blocks.
@@ -69,7 +72,7 @@ struct VP8Io {
   // this is a recommendation for the user-side yuv->rgb converter. This flag
   // is set when calling setup() hook and can be overwritten by it. It then
   // can be taken into consideration during the put() method.
-  int fancy_upscaling;
+  int fancy_upsampling;
 
   // Input buffer.
   uint32_t data_size;
@@ -81,6 +84,14 @@ struct VP8Io {
   // with the VP8 specifications.
   int bypass_filtering;
 
+  // Cropping parameters.
+  int use_cropping;
+  int crop_left, crop_right, crop_top, crop_bottom;
+
+  // Scaling parameters.
+  int use_scaling;
+  int scaled_width, scaled_height;
+
   // pointer to the alpha data (if present) corresponding to the rows
   const uint8_t* a;
 };