diff --git a/AUTHORS b/AUTHORS
index 0c3677ef..c6ea612b 100644
--- a/AUTHORS
+++ b/AUTHORS
@@ -2,6 +2,7 @@ Contributors:
 - James Zern (jzern at google dot com)
 - Jan Engelhardt (jengelh at medozas dot de)
 - Johann (johannkoenig at google dot com)
+- Jyrki Alakuijala (jyrki at google dot com)
 - Lou Quillio (louquillio at google dot com)
 - Martin Olsson (mnemo at minimum dot se)
 - Mikołaj Zalewski (mikolajz at google dot com)
diff --git a/Android.mk b/Android.mk
index 6931f9d5..06418196 100644
--- a/Android.mk
+++ b/Android.mk
@@ -47,9 +47,10 @@ LOCAL_SRC_FILES := \
     src/utils/quant_levels.c \
     src/utils/rescaler.c \
     src/utils/thread.c \
+    src/utils/utils.c \
 
 LOCAL_CFLAGS := -Wall -DANDROID -DHAVE_MALLOC_H -DHAVE_PTHREAD \
-                -DNOT_HAVE_LOG2 -DWEBP_USE_THREAD \
+                -DWEBP_USE_THREAD \
                 -finline-functions -frename-registers -ffast-math \
                 -s -fomit-frame-pointer -Isrc/webp
 
diff --git a/ChangeLog b/ChangeLog
index 1b397780..0dcf446b 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,57 @@
+c655380 dec/io.c: cosmetics
+fe1958f RGBA4444: harmonize lossless/lossy alpha values
+681cb30 fix RGBA4444 output w/fancy upsampling
+f06c1d8 Merge "Alignment fix" into 0.2.0
+f56e98f Alignment fix
+6fe843b avoid rgb-premultiply if there's only trivial alpha values
+528a11a fix the ARGB4444 premultiply arithmetic
+a0a4885 Lossless decoder fix for a special transform order
+62dd9bb Update encoding heuristic w.r.t palette colors.
+6f4272b remove unused ApplyInverseTransform()
+93bf0fa Update ChangeLog (v0.2.0-rc1)
+5934fc5 update AUTHORS
+014a711 update NEWS
+43b0d61 add support for ARGB -> YUVA conversion for lossless decoder
+33705ca bump version to 0.2.0
+c40d7ef fix alpha-plane check + add extra checks
+a06f802 MODE_YUVA: set alpha to opaque if the image has none
+52a87dd Merge "silence one more warning" into 0.2.0
+3b02309 silence one more warning
+f94b04f move some RGB->YUV functions to yuv.h
+4b71ba0 README: sync [cd]webp help output
+c9ae57f man/dwebp.1: add links to output file format details
+292ec5c quiet a few 'uninitialized' warnings
+4af3f6c fix indentation
+9b261bf remove the last NOT_HAVE_LOG2 instances
+323dc4d remove use of log2(). Use VP8LFastLog2() instead.
+8c515d5 Merge "harness some malloc/calloc to use WebPSafeMalloc and WebPSafeCalloc" into 0.2.0
+d4b4bb0 Merge changes I46090628,I1a41b2ce into 0.2.0
+bff34ac harness some malloc/calloc to use WebPSafeMalloc and WebPSafeCalloc
+a3c063c Merge "extra size check for security" into 0.2.0
+5e79630 Merge "WebPEncode: clear stats at the start of encode" into 0.2.0
+f1edf62 Merge "rationalize use of color-cache" into 0.2.0
+c193331 extra size check for security
+906be65 rationalize use of color-cache
+dd1c387 Add image-hint for low-color images.
+4eb7aa6 Merge "WebPCheckMalloc() and WebPCheckCalloc():" into 0.2.0
+80cc730 WebPCheckMalloc() and WebPCheckCalloc():
+183cba8 check VP8LBitWriterInit return
+cbfa9ee lossless: fix crash on user abort
+256afef cwebp: exit immediately on version mismatch
+475d87d WebPEncode: clear stats at the start of encode
+a7cc729 fix type and conversion warnings
+7d853d7 add stats for lossless
+d39177b make QuantizeLevels() store the sum of squared error
+5955cf5 replace x*155/100 by x*101581>>16
+7d732f9 make QuantizeLevels() store the sum of squared error
+e45a446 replace x*155/100 by x*101581>>16
+159b75d cwebp output size consistency:
+cbee59e Merge commit 'v0.1.99'
+1889e9b dwebp: report -alpha option
+3bc3f7c Merge "dwebp: add PAM output support" into 0.2.0
+d919ed0 dwebp: add PAM output support
+85e215d README/manpages/configure: update website link
+c3a207b Update ChangeLog (v0.1.99)
 d1fd782 Merge "add extra precision about default values and behaviour" into 0.2.0
 efc826e add extra precision about default values and behaviour
 9f29635 header/doc clean up
@@ -14,6 +68,7 @@ c37c23e README: cosmetics
 ce90847 Merge "add some padding bytes areas for later use" into 0.2.0
 2390dab Merge "fixing the findings by Frederic Kayser to the bitstream spec" into 0.2.0
 0275159 add a very crude progress report for lossless
+a4b9b1c Remove some unused enum values.
 dd10817 rename 'use_argb_input' to 'use_argb'
 90516ae add some padding bytes areas for later use
 d03b250 fixing the findings by Frederic Kayser to the bitstream spec
@@ -46,6 +101,7 @@ c3b014d Android.mk: add missing lossless files
 8c1cc6b makefile.unix dist: explicitly name installed includes
 7f4647e Merge "clarify the colorspace naming and byte ordering of decoded samples" into 0.2.0
 cbf6972 clarify the colorspace naming and byte ordering of decoded samples
+857650c Mux: Add WebPDataInit() and remove WebPImageInfo
 ff771e7 don't install webp/decode_vp8.h
 596dff7 VP8LFillBitWindow: use 64-bit path for msvc x64 builds
 3ca7ce9 Merge "doc: remove non-finalized chunk references" into 0.2.0
diff --git a/Makefile.vc b/Makefile.vc
index a0a495c4..c7447d83 100644
--- a/Makefile.vc
+++ b/Makefile.vc
@@ -27,7 +27,7 @@ CCNODBG    = cl.exe /nologo /O2 /DNDEBUG
 CCDEBUG    = cl.exe /nologo /Od /Gm /Zi /D_DEBUG /RTC1
 CFLAGS     = /Isrc /nologo /W3 /EHsc /FD /c /GS
 CFLAGS     = $(CFLAGS) /DWIN32 /D_CRT_SECURE_NO_WARNINGS /DWIN32_LEAN_AND_MEAN
-CFLAGS     = $(CFLAGS) /DHAVE_WINCODEC_H /DWEBP_USE_THREAD /DNOT_HAVE_LOG2
+CFLAGS     = $(CFLAGS) /DHAVE_WINCODEC_H /DWEBP_USE_THREAD
 LDFLAGS    = /LARGEADDRESSAWARE /MANIFEST /NXCOMPAT /DYNAMICBASE
 LDFLAGS    = $(LDFLAGS) $(PLATFORM_LDFLAGS)
 LNKDLL     = link.exe /DLL
@@ -199,6 +199,7 @@ UTILS_OBJS = \
     $(DIROBJ)\utils\quant_levels.obj \
     $(DIROBJ)\utils\rescaler.obj \
     $(DIROBJ)\utils\thread.obj \
+    $(DIROBJ)\utils\utils.obj \
 
 LIBWEBP_OBJS = $(DEC_OBJS) $(DSP_OBJS) $(ENC_OBJS) $(UTILS_OBJS) $(LIBWEBP_OBJS)
 LIBWEBPMUX_OBJS = $(MUX_OBJS) $(LIBWEBPMUX_OBJS)
diff --git a/NEWS b/NEWS
index b155a6a7..1431c705 100644
--- a/NEWS
+++ b/NEWS
@@ -1,3 +1,10 @@
+- 8/3/12: version 0.2.0
+  * Add support for ARGB -> YUVA conversion for lossless decoder
+    New functions: WebPINewYUVA, WebPIDecGetYUVA
+  * Add stats for lossless and alpha encoding
+  * Security-related hardening: allocation and size checks
+  * Add PAM output support to dwebp
+
 - 7/19/12: version 0.1.99
   * This is a pre-release of 0.2.0, not an rc to allow for further
     incompatible changes based on user feedback.
diff --git a/README b/README
index a30d0090..a90fda0f 100644
--- a/README
+++ b/README
@@ -4,7 +4,7 @@
           \__\__/\____/\_____/__/ ____  ___
                 / _/ /    \    \ /  _ \/ _/
                /  \_/   / /   \ \   __/  \__
-               \____/____/\_____/_____/____/v0.1.99
+               \____/____/\_____/_____/____/v0.2.0
 
 Description:
 ============
@@ -13,7 +13,7 @@ WebP codec: library to encode and decode images in WebP format. This package
 contains the library that can be used in other programs to add WebP support,
 as well as the command line tools 'cwebp' and 'dwebp'.
 
-See http://code.google.com/speed/webp
+See http://developers.google.com/speed/webp
 
 Latest sources are available from http://www.webmproject.org/code/
 
@@ -168,7 +168,7 @@ options:
   -noalpha ............... discard any transparency information.
   -lossless .............. Encode image losslessly.
   -hint <string> ......... Specify image characteristics hint.
-                           One of: photo or picture
+                           One of: photo, picture or graph
 
   -short ................. condense printed message
   -quiet ................. don't print anything.
@@ -231,7 +231,8 @@ Usage: dwebp in_file [options] [-o out_file]
 
 Decodes the WebP image file to PNG format [Default]
 Use following options to convert into alternate image formats:
-  -ppm ......... save the raw RGB samples as color PPM
+  -pam ......... save the raw RGBA samples as a color PAM
+  -ppm ......... save the raw RGB samples as a color PPM
   -pgm ......... save the raw YUV samples as a grayscale PGM
                  file with IMC4 layout.
  Other options are:
@@ -241,6 +242,7 @@ Use following options to convert into alternate image formats:
   -mt .......... use multi-threading
   -crop <x> <y> <w> <h> ... crop output with the given rectangle
   -scale <w> <h> .......... scale the output (*after* any cropping)
+  -alpha ....... only save the alpha plane.
   -h     ....... this help message.
   -v     ....... verbose (e.g. print encoding/decoding times)
   -noasm ....... disable all assembly optimizations.
@@ -403,12 +405,12 @@ The 'idec' object must always be released (even upon an error condition) by
 calling: WebPDelete(idec).
 
 To retrieve partially decoded picture samples, one must use the corresponding
-method: WebPIDecGetRGB or WebPIDecGetYUV.
+method: WebPIDecGetRGB or WebPIDecGetYUVA.
 It will return the last displayable pixel row.
 
 Lastly, note that decoding can also be performed into a pre-allocated pixel
 buffer. This buffer must be passed when creating a WebPIDecoder, calling
-WebPINewRGB() or WebPINewYUV().
+WebPINewRGB() or WebPINewYUVA().
 
 Please have a look at the src/webp/decode.h header for further details.
 
diff --git a/configure.ac b/configure.ac
index 9fc948bf..d81c4c98 100644
--- a/configure.ac
+++ b/configure.ac
@@ -1,6 +1,6 @@
-AC_INIT([libwebp], [0.1.99],
+AC_INIT([libwebp], [0.2.0],
         [http://code.google.com/p/webp/issues],,
-        [http://code.google.com/speed/webp])
+        [http://developers.google.com/speed/webp])
 AC_CANONICAL_TARGET
 AM_INIT_AUTOMAKE([-Wall foreign subdir-objects])
 AC_PROG_LIBTOOL
@@ -29,11 +29,6 @@ AC_DEFUN([WITHLIB_OPTION],
                               [use $2 libraries from DIR]),
                [$2_LIBS="-L$withval"])])
 
-dnl === Check for native log2
-AC_SEARCH_LIBS([log2], [m],,
-               [AC_DEFINE([NOT_HAVE_LOG2], [1],
-                          [Undefine this if you have log2().])])
-
 dnl === Check for pthread support
 AC_ARG_ENABLE([threading],
               AS_HELP_STRING([--disable-threading],
diff --git a/examples/cwebp.c b/examples/cwebp.c
index 86f5b7ae..8441e94c 100644
--- a/examples/cwebp.c
+++ b/examples/cwebp.c
@@ -132,7 +132,7 @@ static HRESULT ReadPictureWithWIC(const char* filename,
   IWICBitmapDecoder* pDecoder = NULL;
   IStream* pStream = NULL;
   UINT frameCount = 0;
-  UINT width, height = 0;
+  UINT width = 0, height = 0;
   BYTE* rgb = NULL;
   WICPixelFormatGUID srcPixelFormat = { 0 };
   GUID srcContainerFormat = { 0 };
@@ -614,6 +614,25 @@ static void PrintValues(const int values[4]) {
   fprintf(stderr, "|\n");
 }
 
+static void PrintFullLosslessInfo(const WebPAuxStats* const stats,
+                                  const char* const description) {
+  fprintf(stderr, "Lossless-%s compressed size: %d bytes\n",
+          description, stats->lossless_size);
+  if (stats->lossless_features) {
+    fprintf(stderr, "  * Lossless features used:");
+    if (stats->lossless_features & 1) fprintf(stderr, " PREDICTION");
+    if (stats->lossless_features & 2) fprintf(stderr, " CROSS-COLOR-TRANSFORM");
+    if (stats->lossless_features & 4) fprintf(stderr, " SUBTRACT-GREEN");
+    if (stats->lossless_features & 8) fprintf(stderr, " PALETTE");
+    fprintf(stderr, "\n");
+  }
+  fprintf(stderr, "  * Precision Bits: histogram=%d transform=%d cache=%d\n",
+          stats->histogram_bits, stats->transform_bits, stats->cache_bits);
+  if (stats->palette_size > 0) {
+    fprintf(stderr, "  * Palette size:   %d\n", stats->palette_size);
+  }
+}
+
 static void PrintExtraInfoLossless(const WebPPicture* const pic,
                                    int short_output,
                                    const char* const file_name) {
@@ -624,6 +643,7 @@ static void PrintExtraInfoLossless(const WebPPicture* const pic,
     fprintf(stderr, "File:      %s\n", file_name);
     fprintf(stderr, "Dimension: %d x %d\n", pic->width, pic->height);
     fprintf(stderr, "Output:    %d bytes\n", stats->coded_size);
+    PrintFullLosslessInfo(stats, "ARGB");
   }
 }
 
@@ -658,9 +678,9 @@ static void PrintExtraInfoLossy(const WebPPicture* const pic, int short_output,
               100.f * stats->header_bytes[0] / stats->coded_size,
               stats->header_bytes[1],
               100.f * stats->header_bytes[1] / stats->coded_size);
-      if (stats->alpha_data_size) {
-        fprintf(stderr, "             transparency:   %6d\n",
-                stats->alpha_data_size);
+      if (stats->alpha_data_size > 0) {
+        fprintf(stderr, "             transparency:   %6d (%.1f dB)\n",
+                stats->alpha_data_size, stats->PSNR[4]);
       }
       if (stats->layer_data_size) {
         fprintf(stderr, "             enhancement:    %6d\n",
@@ -686,8 +706,11 @@ static void PrintExtraInfoLossy(const WebPPicture* const pic, int short_output,
       fprintf(stderr, " segments total:  ");
       PrintByteCount(totals, stats->coded_size, NULL);
     }
+    if (stats->lossless_size > 0) {
+      PrintFullLosslessInfo(stats, "alpha");
+    }
   }
-  if (pic->extra_info) {
+  if (pic->extra_info != NULL) {
     const int mb_w = (pic->width + 15) / 16;
     const int mb_h = (pic->height + 15) / 16;
     const int type = pic->extra_info_type;
@@ -756,7 +779,7 @@ static int DumpPicture(const WebPPicture* const picture, const char* PGM_name) {
 
 static int ProgressReport(int percent, const WebPPicture* const picture) {
   printf("[%s]: %3d %%      \r",
-         (char*)picture->stats->user_data, percent);
+         (char*)picture->user_data, percent);
   fflush(stdout);
   return 1;  // all ok
 }
@@ -821,7 +844,7 @@ static void HelpLong(void) {
   printf("  -noalpha ............... discard any transparency information.\n");
   printf("  -lossless .............. Encode image losslessly.\n");
   printf("  -hint <string> ......... Specify image characteristics hint.\n");
-  printf("                           One of: photo or picture\n");
+  printf("                           One of: photo, picture or graph\n");
 
   printf("\n");
   printf("  -short ................. condense printed message\n");
@@ -886,7 +909,7 @@ int main(int argc, const char *argv[]) {
       !WebPPictureInit(&original_picture) ||
       !WebPConfigInit(&config)) {
     fprintf(stderr, "Error! Version mismatch!\n");
-    goto Error;
+    return -1;
   }
 
   if (argc == 1) {
@@ -950,6 +973,8 @@ int main(int argc, const char *argv[]) {
         config.image_hint = WEBP_HINT_PHOTO;
       } else if (!strcmp(argv[c], "picture")) {
         config.image_hint = WEBP_HINT_PICTURE;
+      } else if (!strcmp(argv[c], "graph")) {
+        config.image_hint = WEBP_HINT_GRAPH;
       } else {
         fprintf(stderr, "Error! Unrecognized image hint: %s\n", argv[c]);
         goto Error;
@@ -1100,8 +1125,10 @@ int main(int argc, const char *argv[]) {
       fprintf(stderr, "be performed, but its results discarded.\n\n");
     }
   }
-  picture.stats = &stats;
-  stats.user_data = (void*)in_file;
+  if (!quiet) {
+    picture.stats = &stats;
+    picture.user_data = (void*)in_file;
+  }
 
   // Compress
   if (verbose) {
diff --git a/examples/dwebp.c b/examples/dwebp.c
index 619331d7..49ff2fa8 100644
--- a/examples/dwebp.c
+++ b/examples/dwebp.c
@@ -59,6 +59,7 @@ extern void* VP8GetCPUInfo;   // opaque forward declaration.
 // Output types
 typedef enum {
   PNG = 0,
+  PAM,
   PPM,
   PGM,
   ALPHA_PLANE_ONLY  // this is for experimenting only
@@ -201,15 +202,22 @@ static int WritePNG(FILE* out_file, const WebPDecBuffer* const buffer) {
 }
 #endif
 
-static int WritePPM(FILE* fout, const WebPDecBuffer* const buffer) {
+static int WritePPM(FILE* fout, const WebPDecBuffer* const buffer, int alpha) {
   const uint32_t width = buffer->width;
   const uint32_t height = buffer->height;
   const unsigned char* const rgb = buffer->u.RGBA.rgba;
   const int stride = buffer->u.RGBA.stride;
+  const size_t bytes_per_px = alpha ? 4 : 3;
   uint32_t y;
-  fprintf(fout, "P6\n%d %d\n255\n", width, height);
+
+  if (alpha) {
+    fprintf(fout, "P7\nWIDTH %d\nHEIGHT %d\nDEPTH 4\nMAXVAL 255\n"
+                  "TUPLTYPE RGB_ALPHA\nENDHDR\n", width, height);
+  } else {
+    fprintf(fout, "P6\n%d %d\n255\n", width, height);
+  }
   for (y = 0; y < height; ++y) {
-    if (fwrite(rgb + y * stride, width, 3, fout) != 3) {
+    if (fwrite(rgb + y * stride, width, bytes_per_px, fout) != bytes_per_px) {
       return 0;
     }
   }
@@ -289,8 +297,10 @@ static void SaveOutput(const WebPDecBuffer* const buffer,
 #else
     ok &= WritePNG(fout, buffer);
 #endif
+  } else if (format == PAM) {
+    ok &= WritePPM(fout, buffer, 1);
   } else if (format == PPM) {
-    ok &= WritePPM(fout, buffer);
+    ok &= WritePPM(fout, buffer, 0);
   } else if (format == PGM) {
     ok &= WritePGM(fout, buffer);
   } else if (format == ALPHA_PLANE_ONLY) {
@@ -314,7 +324,8 @@ static void Help(void) {
   printf("Usage: dwebp in_file [options] [-o out_file]\n\n"
          "Decodes the WebP image file to PNG format [Default]\n"
          "Use following options to convert into alternate image formats:\n"
-         "  -ppm ......... save the raw RGB samples as color PPM\n"
+         "  -pam ......... save the raw RGBA samples as a color PAM\n"
+         "  -ppm ......... save the raw RGB samples as a color PPM\n"
          "  -pgm ......... save the raw YUV samples as a grayscale PGM\n"
          "                 file with IMC4 layout.\n"
          " Other options are:\n"
@@ -324,9 +335,7 @@ static void Help(void) {
          "  -mt .......... use multi-threading\n"
          "  -crop <x> <y> <w> <h> ... crop output with the given rectangle\n"
          "  -scale <w> <h> .......... scale the output (*after* any cropping)\n"
-#ifdef WEBP_EXPERIMENTAL_FEATURES
          "  -alpha ....... only save the alpha plane.\n"
-#endif
          "  -h     ....... this help message.\n"
          "  -v     ....... verbose (e.g. print encoding/decoding times)\n"
 #ifndef WEBP_DLL
@@ -367,6 +376,8 @@ int main(int argc, const char *argv[]) {
       config.options.no_fancy_upsampling = 1;
     } else if (!strcmp(argv[c], "-nofilter")) {
       config.options.bypass_filtering = 1;
+    } else if (!strcmp(argv[c], "-pam")) {
+      format = PAM;
     } else if (!strcmp(argv[c], "-ppm")) {
       format = PPM;
     } else if (!strcmp(argv[c], "-version")) {
@@ -434,6 +445,9 @@ int main(int argc, const char *argv[]) {
         output_buffer->colorspace = bitstream->has_alpha ? MODE_RGBA : MODE_RGB;
 #endif
         break;
+      case PAM:
+        output_buffer->colorspace = MODE_RGBA;
+        break;
       case PPM:
         output_buffer->colorspace = MODE_RGB;  // drops alpha for PPM
         break;
diff --git a/makefile.unix b/makefile.unix
index 473b90af..85b21073 100644
--- a/makefile.unix
+++ b/makefile.unix
@@ -130,6 +130,7 @@ UTILS_OBJS = \
     src/utils/quant_levels.o \
     src/utils/rescaler.o \
     src/utils/thread.o \
+    src/utils/utils.o \
 
 LIBWEBP_OBJS = $(DEC_OBJS) $(DSP_OBJS) $(ENC_OBJS) $(UTILS_OBJS)
 LIBWEBPMUX_OBJS = $(MUX_OBJS)
diff --git a/man/cwebp.1 b/man/cwebp.1
index b4e4c1a4..fab8517e 100644
--- a/man/cwebp.1
+++ b/man/cwebp.1
@@ -164,7 +164,7 @@ Encode the image without any loss.
 .TP
 .B \-hint string
 Specify the hint about input image type. Possible values are:
-\fBphoto\fP, and \fBpicture\fP.
+\fBphoto\fP, \fBpicture\fP or \fBgraph\fP.
 .TP
 .B \-noasm
 Disable all assembly optimizations.
@@ -212,4 +212,5 @@ for the Debian project (and may be used by others).
 .SH SEE ALSO
 .BR dwebp (1).
 .br
-Please refer to http://code.google.com/speed/webp/ for additional information.
+Please refer to http://developers.google.com/speed/webp/ for additional
+information.
diff --git a/man/dwebp.1 b/man/dwebp.1
index 637297f6..5a559a81 100644
--- a/man/dwebp.1
+++ b/man/dwebp.1
@@ -1,5 +1,5 @@
 .\"                                      Hey, EMACS: -*- nroff -*-
-.TH DWEBP 1 "January 24, 2012"
+.TH DWEBP 1 "August 2, 2012"
 .SH NAME
 dwebp \- decompress a WebP file to an image file
 .SH SYNOPSIS
@@ -11,7 +11,7 @@ This manual page documents the
 .B dwebp
 command.
 .PP
-\fBdwebp\fP decompresses WebP files into PNG, PPM or PGM images.
+\fBdwebp\fP decompresses WebP files into PNG, PAM, PPM or PGM images.
 .SH OPTIONS
 The basic options are:
 .TP
@@ -24,8 +24,11 @@ Print the version number (as major.minor.revision) and exit.
 .B \-o string
 Specify the name of the output file (as PNG format by default).
 .TP
+.B \-pam
+Change the output format to PAM (retains alpha).
+.TP
 .B \-ppm
-Change the output format to PPM.
+Change the output format to PPM (discards alpha).
 .TP
 .B \-pgm
 Change the output format to PGM. The output consist of luma/chroma
@@ -87,4 +90,13 @@ for the Debian project (and may be used by others).
 .SH SEE ALSO
 .BR cwebp (1).
 .br
-Please refer to http://code.google.com/speed/webp/ for additional information.
+Please refer to http://developers.google.com/speed/webp/ for additional
+information.
+.SS Output file format details
+PAM: http://netpbm.sourceforge.net/doc/pam.html
+.br
+PGM: http://netpbm.sourceforge.net/doc/pgm.html
+.br
+PPM: http://netpbm.sourceforge.net/doc/ppm.html
+.br
+PNG: http://www.libpng.org/pub/png/png-sitemap.html#info
diff --git a/man/webpmux.1 b/man/webpmux.1
index 2e44f068..310fbb6d 100644
--- a/man/webpmux.1
+++ b/man/webpmux.1
@@ -131,4 +131,5 @@ for the Debian project (and may be used by others).
 .BR dwebp (1),
 .BR cwebp (1).
 .br
-Please refer to http://code.google.com/speed/webp/ for additional information.
+Please refer to http://developers.google.com/speed/webp/ for additional
+information.
diff --git a/src/Makefile.am b/src/Makefile.am
index ab55b676..69a09ef1 100644
--- a/src/Makefile.am
+++ b/src/Makefile.am
@@ -20,7 +20,7 @@ libwebp_la_LIBADD += dsp/libwebpdsp.la
 libwebp_la_LIBADD += enc/libwebpencode.la
 libwebp_la_LIBADD += utils/libwebputils.la
 
-libwebp_la_LDFLAGS = -version-info 3:0:0
+libwebp_la_LDFLAGS = -version-info 4:0:0
 libwebpincludedir = $(includedir)/webp
 
 pkgconfig_DATA = libwebp.pc
diff --git a/src/dec/buffer.c b/src/dec/buffer.c
index e8421785..c159f6f2 100644
--- a/src/dec/buffer.c
+++ b/src/dec/buffer.c
@@ -13,6 +13,7 @@
 
 #include "./vp8i.h"
 #include "./webpi.h"
+#include "../utils/utils.h"
 
 #if defined(__cplusplus) || defined(c_plusplus)
 extern "C" {
@@ -50,18 +51,23 @@ static VP8StatusCode CheckDecBuffer(const WebPDecBuffer* const buffer) {
     ok &= (y_size <= buf->y_size);
     ok &= (u_size <= buf->u_size);
     ok &= (v_size <= buf->v_size);
-    ok &= (a_size <= buf->a_size);
     ok &= (buf->y_stride >= width);
     ok &= (buf->u_stride >= (width + 1) / 2);
     ok &= (buf->v_stride >= (width + 1) / 2);
-    if (buf->a) {
+    ok &= (buf->y != NULL);
+    ok &= (buf->u != NULL);
+    ok &= (buf->v != NULL);
+    if (mode == MODE_YUVA) {
       ok &= (buf->a_stride >= width);
+      ok &= (a_size <= buf->a_size);
+      ok &= (buf->a != NULL);
     }
   } else {    // RGB checks
     const WebPRGBABuffer* const buf = &buffer->u.RGBA;
     const uint64_t size = (uint64_t)buf->stride * height;
     ok &= (size <= buf->size);
     ok &= (buf->stride >= width * kModeBpp[mode]);
+    ok &= (buf->rgba != NULL);
   }
   return ok ? VP8_STATUS_OK : VP8_STATUS_INVALID_PARAM;
 }
@@ -95,14 +101,11 @@ static VP8StatusCode AllocateBuffer(WebPDecBuffer* const buffer) {
     total_size = size + 2 * uv_size + a_size;
 
     // Security/sanity checks
-    if (((size_t)total_size != total_size) || (total_size >= (1ULL << 40))) {
-      return VP8_STATUS_INVALID_PARAM;
-    }
-
-    buffer->private_memory = output = (uint8_t*)malloc((size_t)total_size);
+    output = (uint8_t*)WebPSafeMalloc(total_size, sizeof(*output));
     if (output == NULL) {
       return VP8_STATUS_OUT_OF_MEMORY;
     }
+    buffer->private_memory = output;
 
     if (!WebPIsRGBMode(mode)) {   // YUVA initialization
       WebPYUVABuffer* const buf = &buffer->u.YUVA;
diff --git a/src/dec/frame.c b/src/dec/frame.c
index 1a444d13..9c91a48e 100644
--- a/src/dec/frame.c
+++ b/src/dec/frame.c
@@ -11,6 +11,7 @@
 
 #include <stdlib.h>
 #include "./vp8i.h"
+#include "../utils/utils.h"
 
 #if defined(__cplusplus) || defined(c_plusplus)
 extern "C" {
@@ -435,11 +436,12 @@ static int AllocateMemory(VP8Decoder* const dec) {
   if (needed > dec->mem_size_) {
     free(dec->mem_);
     dec->mem_size_ = 0;
-    dec->mem_ = (uint8_t*)malloc((size_t)needed);
+    dec->mem_ = WebPSafeMalloc(needed, sizeof(uint8_t));
     if (dec->mem_ == NULL) {
       return VP8SetError(dec, VP8_STATUS_OUT_OF_MEMORY,
                          "no memory during frame initialization.");
     }
+    // down-cast is ok, thanks to WebPSafeMalloc() above.
     dec->mem_size_ = (size_t)needed;
   }
 
diff --git a/src/dec/idec.c b/src/dec/idec.c
index c7ab6f6e..7df790ce 100644
--- a/src/dec/idec.c
+++ b/src/dec/idec.c
@@ -15,6 +15,7 @@
 
 #include "./webpi.h"
 #include "./vp8i.h"
+#include "../utils/utils.h"
 
 #if defined(__cplusplus) || defined(c_plusplus)
 extern "C" {
@@ -143,14 +144,15 @@ static int AppendToMemBuffer(WebPIDecoder* const idec,
 
   if (mem->end_ + data_size > mem->buf_size_) {  // Need some free memory
     const size_t current_size = MemDataSize(mem);
-    const size_t new_size = current_size + data_size;
-    const size_t extra_size = (new_size + CHUNK_SIZE - 1) & ~(CHUNK_SIZE - 1);
-    uint8_t* const new_buf = (uint8_t*)malloc(extra_size);
+    const uint64_t new_size = (uint64_t)current_size + data_size;
+    const uint64_t extra_size = (new_size + CHUNK_SIZE - 1) & ~(CHUNK_SIZE - 1);
+    uint8_t* const new_buf =
+        (uint8_t*)WebPSafeMalloc(extra_size, sizeof(*new_buf));
     if (new_buf == NULL) return 0;
     memcpy(new_buf, old_base, current_size);
     free(mem->buf_);
     mem->buf_ = new_buf;
-    mem->buf_size_ = extra_size;
+    mem->buf_size_ = (size_t)extra_size;
     mem->start_ = 0;
     mem->end_ = current_size;
   }
@@ -534,7 +536,7 @@ static VP8StatusCode IDecode(WebPIDecoder* idec) {
 // Public functions
 
 WebPIDecoder* WebPINewDecoder(WebPDecBuffer* output_buffer) {
-  WebPIDecoder* idec = (WebPIDecoder*)calloc(1, sizeof(WebPIDecoder));
+  WebPIDecoder* idec = (WebPIDecoder*)calloc(1, sizeof(*idec));
   if (idec == NULL) {
     return NULL;
   }
@@ -565,7 +567,7 @@ WebPIDecoder* WebPIDecode(const uint8_t* data, size_t data_size,
   }
   // Create an instance of the incremental decoder
   idec = WebPINewDecoder(config ? &config->output : NULL);
-  if (!idec) {
+  if (idec == NULL) {
     return NULL;
   }
   // Finish initialization
@@ -597,7 +599,7 @@ WebPIDecoder* WebPINewRGB(WEBP_CSP_MODE mode, uint8_t* output_buffer,
   WebPIDecoder* idec;
   if (mode >= MODE_YUV) return NULL;
   idec = WebPINewDecoder(NULL);
-  if (!idec) return NULL;
+  if (idec == NULL) return NULL;
   idec->output_.colorspace = mode;
   idec->output_.is_external_memory = 1;
   idec->output_.u.RGBA.rgba = output_buffer;
@@ -606,12 +608,13 @@ WebPIDecoder* WebPINewRGB(WEBP_CSP_MODE mode, uint8_t* output_buffer,
   return idec;
 }
 
-WebPIDecoder* WebPINewYUV(uint8_t* luma, size_t luma_size, int luma_stride,
-                          uint8_t* u, size_t u_size, int u_stride,
-                          uint8_t* v, size_t v_size, int v_stride) {
+WebPIDecoder* WebPINewYUVA(uint8_t* luma, size_t luma_size, int luma_stride,
+                           uint8_t* u, size_t u_size, int u_stride,
+                           uint8_t* v, size_t v_size, int v_stride,
+                           uint8_t* a, size_t a_size, int a_stride) {
   WebPIDecoder* const idec = WebPINewDecoder(NULL);
-  if (!idec) return NULL;
-  idec->output_.colorspace = MODE_YUV;
+  if (idec == NULL) return NULL;
+  idec->output_.colorspace = (a == NULL) ? MODE_YUV : MODE_YUVA;
   idec->output_.is_external_memory = 1;
   idec->output_.u.YUVA.y = luma;
   idec->output_.u.YUVA.y_stride = luma_stride;
@@ -622,9 +625,21 @@ WebPIDecoder* WebPINewYUV(uint8_t* luma, size_t luma_size, int luma_stride,
   idec->output_.u.YUVA.v = v;
   idec->output_.u.YUVA.v_stride = v_stride;
   idec->output_.u.YUVA.v_size = v_size;
+  idec->output_.u.YUVA.a = a;
+  idec->output_.u.YUVA.a_stride = a_stride;
+  idec->output_.u.YUVA.a_size = a_size;
   return idec;
 }
 
+WebPIDecoder* WebPINewYUV(uint8_t* luma, size_t luma_size, int luma_stride,
+                          uint8_t* u, size_t u_size, int u_stride,
+                          uint8_t* v, size_t v_size, int v_stride) {
+  return WebPINewYUVA(luma, luma_size, luma_stride,
+                      u, u_size, u_stride,
+                      v, v_size, v_stride,
+                      NULL, 0, 0);
+}
+
 //------------------------------------------------------------------------------
 
 static VP8StatusCode IDecCheckStatus(const WebPIDecoder* const idec) {
@@ -696,15 +711,15 @@ const WebPDecBuffer* WebPIDecodedArea(const WebPIDecoder* idec,
                                       int* left, int* top,
                                       int* width, int* height) {
   const WebPDecBuffer* const src = GetOutputBuffer(idec);
-  if (left) *left = 0;
-  if (top) *top = 0;
+  if (left != NULL) *left = 0;
+  if (top != NULL) *top = 0;
   // TODO(skal): later include handling of rotations.
   if (src) {
-    if (width) *width = src->width;
-    if (height) *height = idec->params_.last_y;
+    if (width != NULL) *width = src->width;
+    if (height != NULL) *height = idec->params_.last_y;
   } else {
-    if (width) *width = 0;
-    if (height) *height = 0;
+    if (width != NULL) *width = 0;
+    if (height != NULL) *height = 0;
   }
   return src;
 }
@@ -712,35 +727,38 @@ const WebPDecBuffer* WebPIDecodedArea(const WebPIDecoder* idec,
 uint8_t* WebPIDecGetRGB(const WebPIDecoder* idec, int* last_y,
                         int* width, int* height, int* stride) {
   const WebPDecBuffer* const src = GetOutputBuffer(idec);
-  if (!src) return NULL;
+  if (src == NULL) return NULL;
   if (src->colorspace >= MODE_YUV) {
     return NULL;
   }
 
-  if (last_y) *last_y = idec->params_.last_y;
-  if (width) *width = src->width;
-  if (height) *height = src->height;
-  if (stride) *stride = src->u.RGBA.stride;
+  if (last_y != NULL) *last_y = idec->params_.last_y;
+  if (width != NULL) *width = src->width;
+  if (height != NULL) *height = src->height;
+  if (stride != NULL) *stride = src->u.RGBA.stride;
 
   return src->u.RGBA.rgba;
 }
 
-uint8_t* WebPIDecGetYUV(const WebPIDecoder* idec, int* last_y,
-                        uint8_t** u, uint8_t** v,
-                        int* width, int* height, int *stride, int* uv_stride) {
+uint8_t* WebPIDecGetYUVA(const WebPIDecoder* idec, int* last_y,
+                         uint8_t** u, uint8_t** v, uint8_t** a,
+                         int* width, int* height,
+                         int* stride, int* uv_stride, int* a_stride) {
   const WebPDecBuffer* const src = GetOutputBuffer(idec);
-  if (!src) return NULL;
+  if (src == NULL) return NULL;
   if (src->colorspace < MODE_YUV) {
     return NULL;
   }
 
-  if (last_y) *last_y = idec->params_.last_y;
-  if (u) *u = src->u.YUVA.u;
-  if (v) *v = src->u.YUVA.v;
-  if (width) *width = src->width;
-  if (height) *height = src->height;
-  if (stride) *stride = src->u.YUVA.y_stride;
-  if (uv_stride) *uv_stride = src->u.YUVA.u_stride;
+  if (last_y != NULL) *last_y = idec->params_.last_y;
+  if (u != NULL) *u = src->u.YUVA.u;
+  if (v != NULL) *v = src->u.YUVA.v;
+  if (a != NULL) *a = src->u.YUVA.a;
+  if (width != NULL) *width = src->width;
+  if (height != NULL) *height = src->height;
+  if (stride != NULL) *stride = src->u.YUVA.y_stride;
+  if (uv_stride != NULL) *uv_stride = src->u.YUVA.u_stride;
+  if (a_stride != NULL) *a_stride = src->u.YUVA.a_stride;
 
   return src->u.YUVA.y;
 }
diff --git a/src/dec/io.c b/src/dec/io.c
index 8a9ee4e7..594804c2 100644
--- a/src/dec/io.c
+++ b/src/dec/io.c
@@ -111,7 +111,7 @@ static int EmitFancyRGB(const VP8Io* const io, WebPDecParams* const p) {
   const uint8_t* top_u = p->tmp_u;
   const uint8_t* top_v = p->tmp_v;
   int y = io->mb_y;
-  int y_end = io->mb_y + io->mb_h;
+  const int y_end = io->mb_y + io->mb_h;
   const int mb_w = io->mb_w;
   const int uv_w = (mb_w + 1) / 2;
 
@@ -150,7 +150,7 @@ static int EmitFancyRGB(const VP8Io* const io, WebPDecParams* const p) {
     // Process the very last row of even-sized picture
     if (!(y_end & 1)) {
       upsample(cur_y, NULL, cur_u, cur_v, cur_u, cur_v,
-              dst + buf->stride, NULL, mb_w);
+               dst + buf->stride, NULL, mb_w);
     }
   }
   return num_lines_out;
@@ -162,64 +162,82 @@ static int EmitFancyRGB(const VP8Io* const io, WebPDecParams* const p) {
 
 static int EmitAlphaYUV(const VP8Io* const io, WebPDecParams* const p) {
   const uint8_t* alpha = io->a;
+  const WebPYUVABuffer* const buf = &p->output->u.YUVA;
+  const int mb_w = io->mb_w;
+  const int mb_h = io->mb_h;
+  uint8_t* dst = buf->a + io->mb_y * buf->a_stride;
+  int j;
+
   if (alpha != NULL) {
-    int j;
-    const int mb_w = io->mb_w;
-    const int mb_h = io->mb_h;
-    const WebPYUVABuffer* const buf = &p->output->u.YUVA;
-    uint8_t* dst = buf->a + io->mb_y * buf->a_stride;
     for (j = 0; j < mb_h; ++j) {
       memcpy(dst, alpha, mb_w * sizeof(*dst));
       alpha += io->width;
       dst += buf->a_stride;
     }
+  } else if (buf->a != NULL) {
+    // the user requested alpha, but there is none, set it to opaque.
+    for (j = 0; j < mb_h; ++j) {
+      memset(dst, 0xff, mb_w * sizeof(*dst));
+      dst += buf->a_stride;
+    }
   }
   return 0;
 }
 
+static int GetAlphaSourceRow(const VP8Io* const io,
+                             const uint8_t** alpha, int* const num_rows) {
+  int start_y = io->mb_y;
+  *num_rows = io->mb_h;
+
+  // Compensate for the 1-line delay of the fancy upscaler.
+  // This is similar to EmitFancyRGB().
+  if (io->fancy_upsampling) {
+    if (start_y == 0) {
+      // We don't process the last row yet. It'll be done during the next call.
+      --*num_rows;
+    } else {
+      --start_y;
+      // Fortunately, *alpha data is persistent, so we can go back
+      // one row and finish alpha blending, now that the fancy upscaler
+      // completed the YUV->RGB interpolation.
+      *alpha -= io->width;
+    }
+    if (io->crop_top + io->mb_y + io->mb_h == io->crop_bottom) {
+      // If it's the very last call, we process all the remaining rows!
+      *num_rows = io->crop_bottom - io->crop_top - start_y;
+    }
+  }
+  return start_y;
+}
+
 static int EmitAlphaRGB(const VP8Io* const io, WebPDecParams* const p) {
   const uint8_t* alpha = io->a;
   if (alpha != NULL) {
     const int mb_w = io->mb_w;
-    const int mb_h = io->mb_h;
-    int i, j;
     const WEBP_CSP_MODE colorspace = p->output->colorspace;
     const int alpha_first =
         (colorspace == MODE_ARGB || colorspace == MODE_Argb);
     const WebPRGBABuffer* const buf = &p->output->u.RGBA;
-    int start_y = io->mb_y;
-    int num_rows = mb_h;
+    int num_rows;
+    const int start_y = GetAlphaSourceRow(io, &alpha, &num_rows);
+    uint8_t* const base_rgba = buf->rgba + start_y * buf->stride;
+    uint8_t* dst = base_rgba + (alpha_first ? 0 : 3);
+    uint32_t alpha_mask = 0xff;
+    int i, j;
 
-    // We compensate for the 1-line delay of fancy upscaler.
-    // This is similar to EmitFancyRGB().
-    if (io->fancy_upsampling) {
-      if (start_y == 0) {
-        // We don't process the last row yet. It'll be done during next call.
-        --num_rows;
-      } else {
-        --start_y;
-        // Fortunately, *alpha data is persistent, so we can go back
-        // one row and finish alpha blending, now that the fancy upscaler
-        // completed the YUV->RGB interpolation.
-        alpha -= io->width;
-      }
-      if (io->crop_top + io->mb_y + mb_h == io->crop_bottom) {
-        // If it's the very last call, we process all the remaing rows!
-        num_rows = io->crop_bottom - io->crop_top - start_y;
+    for (j = 0; j < num_rows; ++j) {
+      for (i = 0; i < mb_w; ++i) {
+        const uint32_t alpha_value = alpha[i];
+        dst[4 * i] = alpha_value;
+        alpha_mask &= alpha_value;
       }
+      alpha += io->width;
+      dst += buf->stride;
     }
-    {
-      uint8_t* const base_rgba = buf->rgba + start_y * buf->stride;
-      uint8_t* dst = base_rgba + (alpha_first ? 0 : 3);
-      for (j = 0; j < num_rows; ++j) {
-        for (i = 0; i < mb_w; ++i) dst[4 * i] = alpha[i];
-        alpha += io->width;
-        dst += buf->stride;
-      }
-      if (WebPIsPremultipliedMode(colorspace)) {
-        WebPApplyAlphaMultiply(base_rgba, alpha_first,
-                               mb_w, num_rows, buf->stride);
-      }
+    // alpha_mask is < 0xff if there's non-trivial alpha to premultiply with.
+    if (alpha_mask != 0xff && WebPIsPremultipliedMode(colorspace)) {
+      WebPApplyAlphaMultiply(base_rgba, alpha_first,
+                             mb_w, num_rows, buf->stride);
     }
   }
   return 0;
@@ -229,22 +247,27 @@ static int EmitAlphaRGBA4444(const VP8Io* const io, WebPDecParams* const p) {
   const uint8_t* alpha = io->a;
   if (alpha != NULL) {
     const int mb_w = io->mb_w;
-    const int mb_h = io->mb_h;
-    int i, j;
+    const WEBP_CSP_MODE colorspace = p->output->colorspace;
     const WebPRGBABuffer* const buf = &p->output->u.RGBA;
-    uint8_t* const base_rgba = buf->rgba + io->mb_y * buf->stride;
+    int num_rows;
+    const int start_y = GetAlphaSourceRow(io, &alpha, &num_rows);
+    uint8_t* const base_rgba = buf->rgba + start_y * buf->stride;
     uint8_t* alpha_dst = base_rgba + 1;
-    for (j = 0; j < mb_h; ++j) {
+    uint32_t alpha_mask = 0x0f;
+    int i, j;
+
+    for (j = 0; j < num_rows; ++j) {
       for (i = 0; i < mb_w; ++i) {
         // Fill in the alpha value (converted to 4 bits).
-        const uint32_t alpha_val = VP8Clip4Bits(alpha[i]);
-        alpha_dst[2 * i] = (alpha_dst[2 * i] & 0xf0) | alpha_val;
+        const uint32_t alpha_value = alpha[i] >> 4;
+        alpha_dst[2 * i] = (alpha_dst[2 * i] & 0xf0) | alpha_value;
+        alpha_mask &= alpha_value;
       }
       alpha += io->width;
       alpha_dst += buf->stride;
     }
-    if (p->output->colorspace == MODE_rgbA_4444) {
-      WebPApplyAlphaMultiply4444(base_rgba, mb_w, mb_h, buf->stride);
+    if (alpha_mask != 0x0f && WebPIsPremultipliedMode(colorspace)) {
+      WebPApplyAlphaMultiply4444(base_rgba, mb_w, num_rows, buf->stride);
     }
   }
   return 0;
@@ -389,17 +412,22 @@ static int ExportAlpha(WebPDecParams* const p, int y_pos) {
   uint8_t* dst = base_rgba + (alpha_first ? 0 : 3);
   int num_lines_out = 0;
   const int is_premult_alpha = WebPIsPremultipliedMode(colorspace);
+  uint32_t alpha_mask = 0xff;
   const int width = p->scaler_a.dst_width;
 
   while (WebPRescalerHasPendingOutput(&p->scaler_a)) {
     int i;
     assert(p->last_y + y_pos + num_lines_out < p->output->height);
     WebPRescalerExportRow(&p->scaler_a);
-    for (i = 0; i < width; ++i) dst[4 * i] = p->scaler_a.dst[i];
+    for (i = 0; i < width; ++i) {
+      const uint32_t alpha_value = p->scaler_a.dst[i];
+      dst[4 * i] = alpha_value;
+      alpha_mask &= alpha_value;
+    }
     dst += buf->stride;
     ++num_lines_out;
   }
-  if (is_premult_alpha) {
+  if (is_premult_alpha && alpha_mask != 0xff) {
     WebPApplyAlphaMultiply(base_rgba, alpha_first,
                            width, num_lines_out, buf->stride);
   }
@@ -414,6 +442,7 @@ static int ExportAlphaRGBA4444(WebPDecParams* const p, int y_pos) {
   const WEBP_CSP_MODE colorspace = p->output->colorspace;
   const int width = p->scaler_a.dst_width;
   const int is_premult_alpha = WebPIsPremultipliedMode(colorspace);
+  uint32_t alpha_mask = 0x0f;
 
   while (WebPRescalerHasPendingOutput(&p->scaler_a)) {
     int i;
@@ -421,13 +450,14 @@ static int ExportAlphaRGBA4444(WebPDecParams* const p, int y_pos) {
     WebPRescalerExportRow(&p->scaler_a);
     for (i = 0; i < width; ++i) {
       // Fill in the alpha value (converted to 4 bits).
-      const uint32_t alpha_val = VP8Clip4Bits(p->scaler_a.dst[i]);
-      alpha_dst[2 * i] = (alpha_dst[2 * i] & 0xf0) | alpha_val;
+      const uint32_t alpha_value = p->scaler_a.dst[i] >> 4;
+      alpha_dst[2 * i] = (alpha_dst[2 * i] & 0xf0) | alpha_value;
+      alpha_mask &= alpha_value;
     }
     alpha_dst += buf->stride;
     ++num_lines_out;
   }
-  if (is_premult_alpha) {
+  if (is_premult_alpha && alpha_mask != 0x0f) {
     WebPApplyAlphaMultiply4444(base_rgba, width, num_lines_out, buf->stride);
   }
   return num_lines_out;
@@ -464,8 +494,7 @@ static int InitRGBRescaler(const VP8Io* const io, WebPDecParams* const p) {
     tmp_size1 += work_size;
     tmp_size2 += out_width;
   }
-  p->memory =
-      calloc(1, tmp_size1 * sizeof(*work) + tmp_size2 * sizeof(*tmp));
+  p->memory = calloc(1, tmp_size1 * sizeof(*work) + tmp_size2 * sizeof(*tmp));
   if (p->memory == NULL) {
     return 0;   // memory error
   }
@@ -562,7 +591,7 @@ static int CustomSetup(VP8Io* io) {
 //------------------------------------------------------------------------------
 
 static int CustomPut(const VP8Io* io) {
-  WebPDecParams* p = (WebPDecParams*)io->opaque;
+  WebPDecParams* const p = (WebPDecParams*)io->opaque;
   const int mb_w = io->mb_w;
   const int mb_h = io->mb_h;
   int num_lines_out;
diff --git a/src/dec/vp8.c b/src/dec/vp8.c
index 5db7d546..b0ccfa2a 100644
--- a/src/dec/vp8.c
+++ b/src/dec/vp8.c
@@ -45,7 +45,7 @@ int VP8InitIoInternal(VP8Io* const io, int version) {
 }
 
 VP8Decoder* VP8New(void) {
-  VP8Decoder* const dec = (VP8Decoder*)calloc(1, sizeof(VP8Decoder));
+  VP8Decoder* const dec = (VP8Decoder*)calloc(1, sizeof(*dec));
   if (dec != NULL) {
     SetOk(dec);
     WebPWorkerInit(&dec->worker_);
diff --git a/src/dec/vp8i.h b/src/dec/vp8i.h
index 9406b754..4382edfd 100644
--- a/src/dec/vp8i.h
+++ b/src/dec/vp8i.h
@@ -27,8 +27,8 @@ extern "C" {
 
 // version numbers
 #define DEC_MAJ_VERSION 0
-#define DEC_MIN_VERSION 1
-#define DEC_REV_VERSION 99
+#define DEC_MIN_VERSION 2
+#define DEC_REV_VERSION 0
 
 #define ONLY_KEYFRAME_CODE      // to remove any code related to P-Frames
 
diff --git a/src/dec/vp8l.c b/src/dec/vp8l.c
index e5d5eec6..897e4395 100644
--- a/src/dec/vp8l.c
+++ b/src/dec/vp8l.c
@@ -14,7 +14,9 @@
 #include <stdlib.h>
 #include "./vp8li.h"
 #include "../dsp/lossless.h"
+#include "../dsp/yuv.h"
 #include "../utils/huffman.h"
+#include "../utils/utils.h"
 
 #if defined(__cplusplus) || defined(c_plusplus)
 extern "C" {
@@ -264,7 +266,8 @@ static int ReadHuffmanCode(int alphabet_size, VP8LDecoder* const dec,
       return 0;
     }
 
-    code_lengths = (int*)calloc(alphabet_size, sizeof(*code_lengths));
+    code_lengths =
+        (int*)WebPSafeCalloc((uint64_t)alphabet_size, sizeof(*code_lengths));
     if (code_lengths == NULL) {
       dec->status_ = VP8_STATUS_OUT_OF_MEMORY;
       return 0;
@@ -335,7 +338,9 @@ static int ReadHuffmanCodes(VP8LDecoder* const dec, int xsize, int ysize,
   if (br->error_) goto Error;
 
   assert(num_htree_groups <= 0x10000);
-  htree_groups = (HTreeGroup*)calloc(num_htree_groups, sizeof(*htree_groups));
+  htree_groups =
+      (HTreeGroup*)WebPSafeCalloc((uint64_t)num_htree_groups,
+                                  sizeof(*htree_groups));
   if (htree_groups == NULL) {
     dec->status_ = VP8_STATUS_OUT_OF_MEMORY;
     goto Error;
@@ -380,10 +385,7 @@ static int AllocateAndInitRescaler(VP8LDecoder* const dec, VP8Io* const io) {
   const uint64_t memory_size = sizeof(*dec->rescaler) +
                                work_size * sizeof(*work) +
                                scaled_data_size * sizeof(*scaled_data);
-  uint8_t* memory;
-
-  if (memory_size != (size_t)memory_size) return 0;  // overflow check
-  memory = (uint8_t*)calloc(1, (size_t)memory_size);
+  uint8_t* memory = (uint8_t*)WebPSafeCalloc(memory_size, sizeof(*memory));
   if (memory == NULL) {
     dec->status_ = VP8_STATUS_OUT_OF_MEMORY;
     return 0;
@@ -403,10 +405,12 @@ static int AllocateAndInitRescaler(VP8LDecoder* const dec, VP8Io* const io) {
   return 1;
 }
 
+//------------------------------------------------------------------------------
+// Export to ARGB
+
 // We have special "export" function since we need to convert from BGRA
-static int Export(VP8LDecoder* const dec, WEBP_CSP_MODE colorspace,
+static int Export(WebPRescaler* const rescaler, WEBP_CSP_MODE colorspace,
                   int rgba_stride, uint8_t* const rgba) {
-  WebPRescaler* const rescaler = dec->rescaler;
   const uint32_t* const src = (const uint32_t*)rescaler->dst;
   const int dst_width = rescaler->dst_width;
   int num_lines_out = 0;
@@ -420,18 +424,19 @@ static int Export(VP8LDecoder* const dec, WEBP_CSP_MODE colorspace,
 }
 
 // Emit scaled rows.
-static int EmitRescaledRows(VP8LDecoder* const dec, WEBP_CSP_MODE colorspace,
+static int EmitRescaledRows(const VP8LDecoder* const dec,
                             const uint32_t* const data, int in_stride, int mb_h,
                             uint8_t* const out, int out_stride) {
+  const WEBP_CSP_MODE colorspace = dec->output_->colorspace;
   const uint8_t* const in = (const uint8_t*)data;
   int num_lines_in = 0;
   int num_lines_out = 0;
   while (num_lines_in < mb_h) {
-    const uint8_t* row_in = in + num_lines_in * in_stride;
+    const uint8_t* const row_in = in + num_lines_in * in_stride;
     uint8_t* const row_out = out + num_lines_out * out_stride;
     num_lines_in += WebPRescalerImport(dec->rescaler, mb_h - num_lines_in,
                                        row_in, in_stride);
-    num_lines_out += Export(dec, colorspace, out_stride, row_out);
+    num_lines_out += Export(dec->rescaler, colorspace, out_stride, row_out);
   }
   return num_lines_out;
 }
@@ -452,6 +457,113 @@ static int EmitRows(WEBP_CSP_MODE colorspace,
   return mb_h;  // Num rows out == num rows in.
 }
 
+//------------------------------------------------------------------------------
+// Export to YUVA
+
+static void ConvertToYUVA(const uint32_t* const src, int width, int y_pos,
+                          const WebPDecBuffer* const output) {
+  const WebPYUVABuffer* const buf = &output->u.YUVA;
+  // first, the luma plane
+  {
+    int i;
+    uint8_t* const y = buf->y + y_pos * buf->y_stride;
+    for (i = 0; i < width; ++i) {
+      const uint32_t p = src[i];
+      y[i] = VP8RGBToY((p >> 16) & 0xff, (p >> 8) & 0xff, (p >> 0) & 0xff);
+    }
+  }
+
+  // then U/V planes
+  {
+    uint8_t* const u = buf->u + (y_pos >> 1) * buf->u_stride;
+    uint8_t* const v = buf->v + (y_pos >> 1) * buf->v_stride;
+    const int uv_width = width >> 1;
+    int i;
+    for (i = 0; i < uv_width; ++i) {
+      const uint32_t v0 = src[2 * i + 0];
+      const uint32_t v1 = src[2 * i + 1];
+      // VP8RGBToU/V expects four accumulated pixels. Hence we need to
+      // scale r/g/b value by a factor 2. We just shift v0/v1 one bit less.
+      const int r = ((v0 >> 15) & 0x1fe) + ((v1 >> 15) & 0x1fe);
+      const int g = ((v0 >>  7) & 0x1fe) + ((v1 >>  7) & 0x1fe);
+      const int b = ((v0 <<  1) & 0x1fe) + ((v1 <<  1) & 0x1fe);
+      if (!(y_pos & 1)) {  // even lines: store values
+        u[i] = VP8RGBToU(r, g, b);
+        v[i] = VP8RGBToV(r, g, b);
+      } else {             // odd lines: average with previous values
+        const int tmp_u = VP8RGBToU(r, g, b);
+        const int tmp_v = VP8RGBToV(r, g, b);
+        // Approximated average-of-four. But it's an acceptable diff.
+        u[i] = (u[i] + tmp_u + 1) >> 1;
+        v[i] = (v[i] + tmp_v + 1) >> 1;
+      }
+    }
+    if (width & 1) {       // last pixel
+      const uint32_t v0 = src[2 * i + 0];
+      const int r = (v0 >> 14) & 0x3fc;
+      const int g = (v0 >>  6) & 0x3fc;
+      const int b = (v0 <<  2) & 0x3fc;
+      if (!(y_pos & 1)) {  // even lines
+        u[i] = VP8RGBToU(r, g, b);
+        v[i] = VP8RGBToV(r, g, b);
+      } else {             // odd lines (note: we could just skip this)
+        const int tmp_u = VP8RGBToU(r, g, b);
+        const int tmp_v = VP8RGBToV(r, g, b);
+        u[i] = (u[i] + tmp_u + 1) >> 1;
+        v[i] = (v[i] + tmp_v + 1) >> 1;
+      }
+    }
+  }
+  // Lastly, store alpha if needed.
+  if (buf->a != NULL) {
+    int i;
+    uint8_t* const a = buf->a + y_pos * buf->a_stride;
+    for (i = 0; i < width; ++i) a[i] = (src[i] >> 24);
+  }
+}
+
+static int ExportYUVA(const VP8LDecoder* const dec, int y_pos) {
+  WebPRescaler* const rescaler = dec->rescaler;
+  const uint32_t* const src = (const uint32_t*)rescaler->dst;
+  const int dst_width = rescaler->dst_width;
+  int num_lines_out = 0;
+  while (WebPRescalerHasPendingOutput(rescaler)) {
+    WebPRescalerExportRow(rescaler);
+    ConvertToYUVA(src, dst_width, y_pos, dec->output_);
+    ++y_pos;
+    ++num_lines_out;
+  }
+  return num_lines_out;
+}
+
+static int EmitRescaledRowsYUVA(const VP8LDecoder* const dec,
+                                const uint32_t* const data,
+                                int in_stride, int mb_h) {
+  const uint8_t* const in = (const uint8_t*)data;
+  int num_lines_in = 0;
+  int y_pos = dec->last_out_row_;
+  while (num_lines_in < mb_h) {
+    const uint8_t* const row_in = in + num_lines_in * in_stride;
+    num_lines_in += WebPRescalerImport(dec->rescaler, mb_h - num_lines_in,
+                                       row_in, in_stride);
+    y_pos += ExportYUVA(dec, y_pos);
+  }
+  return y_pos;
+}
+
+static int EmitRowsYUVA(const VP8LDecoder* const dec,
+                        const uint32_t* const data, int in_stride,
+                        int mb_w, int num_rows) {
+  int y_pos = dec->last_out_row_;
+  const uint8_t* row_in = (const uint8_t*)data;
+  while (num_rows-- > 0) {
+    ConvertToYUVA((const uint32_t*)row_in, mb_w, y_pos, dec->output_);
+    row_in += in_stride;
+    ++y_pos;
+  }
+  return y_pos;
+}
+
 //------------------------------------------------------------------------------
 // Cropping.
 
@@ -503,20 +615,22 @@ static WEBP_INLINE HTreeGroup* GetHtreeGroupForPos(VP8LMetadata* const hdr,
 
 typedef void (*ProcessRowsFunc)(VP8LDecoder* const dec, int row);
 
-static void ApplyTransforms(VP8LDecoder* const dec, int num_rows,
-                            const uint32_t* const rows) {
+static void ApplyInverseTransforms(VP8LDecoder* const dec, int num_rows,
+                                   const uint32_t* const rows) {
   int n = dec->next_transform_;
   const int cache_pixs = dec->width_ * num_rows;
-  uint32_t* rows_data = dec->argb_cache_;
   const int start_row = dec->last_row_;
   const int end_row = start_row + num_rows;
+  const uint32_t* rows_in = rows;
+  uint32_t* const rows_out = dec->argb_cache_;
 
   // Inverse transforms.
   // TODO: most transforms only need to operate on the cropped region only.
-  memcpy(rows_data, rows, cache_pixs * sizeof(*rows_data));
+  memcpy(rows_out, rows_in, cache_pixs * sizeof(*rows_out));
   while (n-- > 0) {
     VP8LTransform* const transform = &dec->transforms_[n];
-    VP8LInverseTransform(transform, start_row, end_row, rows, rows_data);
+    VP8LInverseTransform(transform, start_row, end_row, rows_in, rows_out);
+    rows_in = rows_out;
   }
 }
 
@@ -527,7 +641,7 @@ static void ProcessRows(VP8LDecoder* const dec, int row) {
   const int num_rows = row - dec->last_row_;
 
   if (num_rows <= 0) return;  // Nothing to be done.
-  ApplyTransforms(dec, num_rows, rows);
+  ApplyInverseTransforms(dec, num_rows, rows);
 
   // Emit output.
   {
@@ -536,19 +650,23 @@ static void ProcessRows(VP8LDecoder* const dec, int row) {
     if (!SetCropWindow(io, dec->last_row_, row, &rows_data, io->width)) {
       // Nothing to output (this time).
     } else {
-      WebPDecParams* const params = (WebPDecParams*)io->opaque;
-      const WebPDecBuffer* const output = params->output;
-      const WebPRGBABuffer* const buf = &output->u.RGBA;
-      uint8_t* const rgba = buf->rgba + dec->last_out_row_ * buf->stride;
+      const WebPDecBuffer* const output = dec->output_;
       const int in_stride = io->width * sizeof(*rows_data);
-      const WEBP_CSP_MODE colorspace = output->colorspace;
-      const int num_rows_out = io->use_scaling ?
-          EmitRescaledRows(dec, colorspace, rows_data, in_stride, io->mb_h,
-                           rgba, buf->stride) :
-          EmitRows(colorspace, rows_data, in_stride, io->mb_w, io->mb_h,
-                   rgba, buf->stride);
-      // Update 'last_out_row_'.
-      dec->last_out_row_ += num_rows_out;
+      if (output->colorspace < MODE_YUV) {  // convert to RGBA
+        const WebPRGBABuffer* const buf = &output->u.RGBA;
+        uint8_t* const rgba = buf->rgba + dec->last_out_row_ * buf->stride;
+        const int num_rows_out = io->use_scaling ?
+            EmitRescaledRows(dec, rows_data, in_stride, io->mb_h,
+                             rgba, buf->stride) :
+            EmitRows(output->colorspace, rows_data, in_stride,
+                     io->mb_w, io->mb_h, rgba, buf->stride);
+        // Update 'last_out_row_'.
+        dec->last_out_row_ += num_rows_out;
+      } else {                              // convert to YUVA
+        dec->last_out_row_ = io->use_scaling ?
+            EmitRescaledRowsYUVA(dec, rows_data, in_stride, io->mb_h) :
+            EmitRowsYUVA(dec, rows_data, in_stride, io->mb_w, io->mb_h);
+      }
       assert(dec->last_out_row_ <= output->height);
     }
   }
@@ -681,26 +799,14 @@ static void ClearTransform(VP8LTransform* const transform) {
   transform->data_ = NULL;
 }
 
-static void ApplyInverseTransforms(VP8LDecoder* const dec, int start_idx,
-                                   uint32_t* const decoded_data) {
-  int n = dec->next_transform_;
-  assert(start_idx >= 0);
-  while (n-- > start_idx) {
-    VP8LTransform* const transform = &dec->transforms_[n];
-    VP8LInverseTransform(transform, 0, transform->ysize_,
-                         decoded_data, decoded_data);
-    ClearTransform(transform);
-  }
-  dec->next_transform_ = start_idx;
-}
-
 // For security reason, we need to remap the color map to span
 // the total possible bundled values, and not just the num_colors.
 static int ExpandColorMap(int num_colors, VP8LTransform* const transform) {
   int i;
   const int final_num_colors = 1 << (8 >> transform->bits_);
   uint32_t* const new_color_map =
-      (uint32_t*)malloc(final_num_colors * sizeof(*new_color_map));
+      (uint32_t*)WebPSafeMalloc((uint64_t)final_num_colors,
+                                sizeof(*new_color_map));
   if (new_color_map == NULL) {
     return 0;
   } else {
@@ -816,6 +922,8 @@ void VP8LClear(VP8LDecoder* const dec) {
 
   free(dec->rescaler_memory);
   dec->rescaler_memory = NULL;
+
+  dec->output_ = NULL;   // leave no trace behind
 }
 
 void VP8LDelete(VP8LDecoder* const dec) {
@@ -845,7 +953,6 @@ static int DecodeImageStream(int xsize, int ysize,
   VP8LBitReader* const br = &dec->br_;
   VP8LMetadata* const hdr = &dec->hdr_;
   uint32_t* data = NULL;
-  const int transform_start_idx = dec->next_transform_;
   int color_cache_bits = 0;
 
   // Read the transforms (may recurse).
@@ -892,15 +999,8 @@ static int DecodeImageStream(int xsize, int ysize,
   }
 
   {
-    const uint64_t total_size =
-        transform_xsize * transform_ysize * sizeof(*data);
-    if (total_size != (size_t)total_size) {
-      // This shouldn't happen, because of transform_bits limit, but...
-      dec->status_ = VP8_STATUS_BITSTREAM_ERROR;
-      ok = 0;
-      goto End;
-    }
-    data = (uint32_t*)malloc((size_t)total_size);
+    const uint64_t total_size = (uint64_t)transform_xsize * transform_ysize;
+    data = (uint32_t*)WebPSafeMalloc(total_size, sizeof(*data));
     if (data == NULL) {
       dec->status_ = VP8_STATUS_OUT_OF_MEMORY;
       ok = 0;
@@ -912,9 +1012,6 @@ static int DecodeImageStream(int xsize, int ysize,
   ok = DecodeImageData(dec, data, transform_xsize, transform_ysize, NULL);
   ok = ok && !br->error_;
 
-  // Apply transforms on the decoded data.
-  if (ok) ApplyInverseTransforms(dec, transform_start_idx, data);
-
  End:
 
   if (!ok) {
@@ -951,14 +1048,11 @@ static int AllocateARGBBuffers(VP8LDecoder* const dec, int final_width) {
   const uint64_t cache_pixels = (uint64_t)final_width * NUM_ARGB_CACHE_ROWS;
   const uint64_t total_num_pixels =
       num_pixels + cache_top_pixels + cache_pixels;
-  const uint64_t total_size = total_num_pixels * sizeof(*dec->argb_);
 
   assert(dec->width_ <= final_width);
-  // Check for overflow
-  if ((size_t)total_size != total_size) return 0;
-  dec->argb_ = (uint32_t*)malloc((size_t)total_size);
+  dec->argb_ = (uint32_t*)WebPSafeMalloc(total_num_pixels, sizeof(*dec->argb_));
   if (dec->argb_ == NULL) {
-    dec->argb_cache_ = NULL;
+    dec->argb_cache_ = NULL;    // for sanity check
     dec->status_ = VP8_STATUS_OUT_OF_MEMORY;
     return 0;
   }
@@ -974,7 +1068,7 @@ static void ExtractAlphaRows(VP8LDecoder* const dec, int row) {
   const uint32_t* const in = dec->argb_ + dec->width_ * dec->last_row_;
 
   if (num_rows <= 0) return;  // Nothing to be done.
-  ApplyTransforms(dec, num_rows, in);
+  ApplyInverseTransforms(dec, num_rows, in);
 
   // Extract alpha (which is stored in the green plane).
   {
@@ -1060,20 +1154,16 @@ int VP8LDecodeHeader(VP8LDecoder* const dec, VP8Io* const io) {
 int VP8LDecodeImage(VP8LDecoder* const dec) {
   VP8Io* io = NULL;
   WebPDecParams* params = NULL;
-  WebPDecBuffer* output = NULL;
 
   // Sanity checks.
   if (dec == NULL) return 0;
 
   io = dec->io_;
+  assert(io != NULL);
   params = (WebPDecParams*)io->opaque;
   assert(params != NULL);
-  output = params->output;
-  // YUV modes are invalid.
-  if (output->colorspace >= MODE_YUV) {
-    dec->status_ = VP8_STATUS_INVALID_PARAM;
-    goto Err;
-  }
+  dec->output_ = params->output;
+  assert(dec->output_ != NULL);
 
   // Initialization.
   if (!WebPIoInitFromOptions(params->options, io, MODE_BGRA)) {
diff --git a/src/dec/vp8li.h b/src/dec/vp8li.h
index 542dbb71..ee29eb5f 100644
--- a/src/dec/vp8li.h
+++ b/src/dec/vp8li.h
@@ -61,6 +61,8 @@ typedef struct {
   VP8LDecodeState  state_;
   VP8Io           *io_;
 
+  const WebPDecBuffer *output_;    // shortcut to io->opaque->output
+
   uint32_t        *argb_;          // Internal data: always in BGRA color mode.
   uint32_t        *argb_cache_;    // Scratch buffer for temporary BGRA storage.
 
diff --git a/src/dsp/lossless.c b/src/dsp/lossless.c
index cb8ad0bc..62a6b7b1 100644
--- a/src/dsp/lossless.c
+++ b/src/dsp/lossless.c
@@ -23,113 +23,156 @@ extern "C" {
 #include "../dsp/dsp.h"
 #include "../enc/histogram.h"
 
-// A lookup table for small values of log(int) to be used in entropy
-// computation.
-//
-// ", ".join(["%.16ff" % x for x in [0.0]+[log(x) for x in range(1, 256)]])
+#define MAX_DIFF_COST (1e30f)
+
+// lookup table for small values of log2(int)
+#define APPROX_LOG_MAX  4096
+#define LOG_2_RECIPROCAL 1.44269504088896338700465094007086
 #define LOG_LOOKUP_IDX_MAX 256
-static const float kLogTable[LOG_LOOKUP_IDX_MAX] = {
-  0.0000000000000000f, 0.0000000000000000f, 0.6931471805599453f,
-  1.0986122886681098f, 1.3862943611198906f, 1.6094379124341003f,
-  1.7917594692280550f, 1.9459101490553132f, 2.0794415416798357f,
-  2.1972245773362196f, 2.3025850929940459f, 2.3978952727983707f,
-  2.4849066497880004f, 2.5649493574615367f, 2.6390573296152584f,
-  2.7080502011022101f, 2.7725887222397811f, 2.8332133440562162f,
-  2.8903717578961645f, 2.9444389791664403f, 2.9957322735539909f,
-  3.0445224377234230f, 3.0910424533583161f, 3.1354942159291497f,
-  3.1780538303479458f, 3.2188758248682006f, 3.2580965380214821f,
-  3.2958368660043291f, 3.3322045101752038f, 3.3672958299864741f,
-  3.4011973816621555f, 3.4339872044851463f, 3.4657359027997265f,
-  3.4965075614664802f, 3.5263605246161616f, 3.5553480614894135f,
-  3.5835189384561099f, 3.6109179126442243f, 3.6375861597263857f,
-  3.6635616461296463f, 3.6888794541139363f, 3.7135720667043080f,
-  3.7376696182833684f, 3.7612001156935624f, 3.7841896339182610f,
-  3.8066624897703196f, 3.8286413964890951f, 3.8501476017100584f,
-  3.8712010109078911f, 3.8918202981106265f, 3.9120230054281460f,
-  3.9318256327243257f, 3.9512437185814275f, 3.9702919135521220f,
-  3.9889840465642745f, 4.0073331852324712f, 4.0253516907351496f,
-  4.0430512678345503f, 4.0604430105464191f, 4.0775374439057197f,
-  4.0943445622221004f, 4.1108738641733114f, 4.1271343850450917f,
-  4.1431347263915326f, 4.1588830833596715f, 4.1743872698956368f,
-  4.1896547420264252f, 4.2046926193909657f, 4.2195077051761070f,
-  4.2341065045972597f, 4.2484952420493594f, 4.2626798770413155f,
-  4.2766661190160553f, 4.2904594411483910f, 4.3040650932041702f,
-  4.3174881135363101f, 4.3307333402863311f, 4.3438054218536841f,
-  4.3567088266895917f, 4.3694478524670215f, 4.3820266346738812f,
-  4.3944491546724391f, 4.4067192472642533f, 4.4188406077965983f,
-  4.4308167988433134f, 4.4426512564903167f, 4.4543472962535073f,
-  4.4659081186545837f, 4.4773368144782069f, 4.4886363697321396f,
-  4.4998096703302650f, 4.5108595065168497f, 4.5217885770490405f,
-  4.5325994931532563f, 4.5432947822700038f, 4.5538768916005408f,
-  4.5643481914678361f, 4.5747109785033828f, 4.5849674786705723f,
-  4.5951198501345898f, 4.6051701859880918f, 4.6151205168412597f,
-  4.6249728132842707f, 4.6347289882296359f, 4.6443908991413725f,
-  4.6539603501575231f, 4.6634390941120669f, 4.6728288344619058f,
-  4.6821312271242199f, 4.6913478822291435f, 4.7004803657924166f,
-  4.7095302013123339f, 4.7184988712950942f, 4.7273878187123408f,
-  4.7361984483944957f, 4.7449321283632502f, 4.7535901911063645f,
-  4.7621739347977563f, 4.7706846244656651f, 4.7791234931115296f,
-  4.7874917427820458f, 4.7957905455967413f, 4.8040210447332568f,
-  4.8121843553724171f, 4.8202815656050371f, 4.8283137373023015f,
-  4.8362819069514780f, 4.8441870864585912f, 4.8520302639196169f,
-  4.8598124043616719f, 4.8675344504555822f, 4.8751973232011512f,
-  4.8828019225863706f, 4.8903491282217537f, 4.8978397999509111f,
-  4.9052747784384296f, 4.9126548857360524f, 4.9199809258281251f,
-  4.9272536851572051f, 4.9344739331306915f, 4.9416424226093039f,
-  4.9487598903781684f, 4.9558270576012609f, 4.9628446302599070f,
-  4.9698132995760007f, 4.9767337424205742f, 4.9836066217083363f,
-  4.9904325867787360f, 4.9972122737641147f, 5.0039463059454592f,
-  5.0106352940962555f, 5.0172798368149243f, 5.0238805208462765f,
-  5.0304379213924353f, 5.0369526024136295f, 5.0434251169192468f,
-  5.0498560072495371f, 5.0562458053483077f, 5.0625950330269669f,
-  5.0689042022202315f, 5.0751738152338266f, 5.0814043649844631f,
-  5.0875963352323836f, 5.0937502008067623f, 5.0998664278241987f,
-  5.1059454739005803f, 5.1119877883565437f, 5.1179938124167554f,
-  5.1239639794032588f, 5.1298987149230735f, 5.1357984370502621f,
-  5.1416635565026603f, 5.1474944768134527f, 5.1532915944977793f,
-  5.1590552992145291f, 5.1647859739235145f, 5.1704839950381514f,
-  5.1761497325738288f, 5.1817835502920850f, 5.1873858058407549f,
-  5.1929568508902104f, 5.1984970312658261f, 5.2040066870767951f,
-  5.2094861528414214f, 5.2149357576089859f, 5.2203558250783244f,
-  5.2257466737132017f, 5.2311086168545868f, 5.2364419628299492f,
-  5.2417470150596426f, 5.2470240721604862f, 5.2522734280466299f,
-  5.2574953720277815f, 5.2626901889048856f, 5.2678581590633282f,
-  5.2729995585637468f, 5.2781146592305168f, 5.2832037287379885f,
-  5.2882670306945352f, 5.2933048247244923f, 5.2983173665480363f,
-  5.3033049080590757f, 5.3082676974012051f, 5.3132059790417872f,
-  5.3181199938442161f, 5.3230099791384085f, 5.3278761687895813f,
-  5.3327187932653688f, 5.3375380797013179f, 5.3423342519648109f,
-  5.3471075307174685f, 5.3518581334760666f, 5.3565862746720123f,
-  5.3612921657094255f, 5.3659760150218512f, 5.3706380281276624f,
-  5.3752784076841653f, 5.3798973535404597f, 5.3844950627890888f,
-  5.3890717298165010f, 5.3936275463523620f, 5.3981627015177525f,
-  5.4026773818722793f, 5.4071717714601188f, 5.4116460518550396f,
-  5.4161004022044201f, 5.4205349992722862f, 5.4249500174814029f,
-  5.4293456289544411f, 5.4337220035542400f, 5.4380793089231956f,
-  5.4424177105217932f, 5.4467373716663099f, 5.4510384535657002f,
-  5.4553211153577017f, 5.4595855141441589f, 5.4638318050256105f,
-  5.4680601411351315f, 5.4722706736714750f, 5.4764635519315110f,
-  5.4806389233419912f, 5.4847969334906548f, 5.4889377261566867f,
-  5.4930614433405482f, 5.4971682252932021f, 5.5012582105447274f,
-  5.5053315359323625f, 5.5093883366279774f, 5.5134287461649825f,
-  5.5174528964647074f, 5.5214609178622460f, 5.5254529391317835f,
-  5.5294290875114234f, 5.5333894887275203f, 5.5373342670185366f,
-  5.5412635451584258f
+static const float kLog2Table[LOG_LOOKUP_IDX_MAX] = {
+  0.0000000000000000f, 0.0000000000000000f,
+  1.0000000000000000f, 1.5849625007211560f,
+  2.0000000000000000f, 2.3219280948873621f,
+  2.5849625007211560f, 2.8073549220576041f,
+  3.0000000000000000f, 3.1699250014423121f,
+  3.3219280948873621f, 3.4594316186372973f,
+  3.5849625007211560f, 3.7004397181410921f,
+  3.8073549220576041f, 3.9068905956085187f,
+  4.0000000000000000f, 4.0874628412503390f,
+  4.1699250014423121f, 4.2479275134435852f,
+  4.3219280948873626f, 4.3923174227787606f,
+  4.4594316186372973f, 4.5235619560570130f,
+  4.5849625007211560f, 4.6438561897747243f,
+  4.7004397181410917f, 4.7548875021634682f,
+  4.8073549220576037f, 4.8579809951275718f,
+  4.9068905956085187f, 4.9541963103868749f,
+  5.0000000000000000f, 5.0443941193584533f,
+  5.0874628412503390f, 5.1292830169449663f,
+  5.1699250014423121f, 5.2094533656289501f,
+  5.2479275134435852f, 5.2854022188622487f,
+  5.3219280948873626f, 5.3575520046180837f,
+  5.3923174227787606f, 5.4262647547020979f,
+  5.4594316186372973f, 5.4918530963296747f,
+  5.5235619560570130f, 5.5545888516776376f,
+  5.5849625007211560f, 5.6147098441152083f,
+  5.6438561897747243f, 5.6724253419714951f,
+  5.7004397181410917f, 5.7279204545631987f,
+  5.7548875021634682f, 5.7813597135246599f,
+  5.8073549220576037f, 5.8328900141647412f,
+  5.8579809951275718f, 5.8826430493618415f,
+  5.9068905956085187f, 5.9307373375628866f,
+  5.9541963103868749f, 5.9772799234999167f,
+  6.0000000000000000f, 6.0223678130284543f,
+  6.0443941193584533f, 6.0660891904577720f,
+  6.0874628412503390f, 6.1085244567781691f,
+  6.1292830169449663f, 6.1497471195046822f,
+  6.1699250014423121f, 6.1898245588800175f,
+  6.2094533656289501f, 6.2288186904958804f,
+  6.2479275134435852f, 6.2667865406949010f,
+  6.2854022188622487f, 6.3037807481771030f,
+  6.3219280948873626f, 6.3398500028846243f,
+  6.3575520046180837f, 6.3750394313469245f,
+  6.3923174227787606f, 6.4093909361377017f,
+  6.4262647547020979f, 6.4429434958487279f,
+  6.4594316186372973f, 6.4757334309663976f,
+  6.4918530963296747f, 6.5077946401986963f,
+  6.5235619560570130f, 6.5391588111080309f,
+  6.5545888516776376f, 6.5698556083309478f,
+  6.5849625007211560f, 6.5999128421871278f,
+  6.6147098441152083f, 6.6293566200796094f,
+  6.6438561897747243f, 6.6582114827517946f,
+  6.6724253419714951f, 6.6865005271832185f,
+  6.7004397181410917f, 6.7142455176661224f,
+  6.7279204545631987f, 6.7414669864011464f,
+  6.7548875021634682f, 6.7681843247769259f,
+  6.7813597135246599f, 6.7944158663501061f,
+  6.8073549220576037f, 6.8201789624151878f,
+  6.8328900141647412f, 6.8454900509443747f,
+  6.8579809951275718f, 6.8703647195834047f,
+  6.8826430493618415f, 6.8948177633079437f,
+  6.9068905956085187f, 6.9188632372745946f,
+  6.9307373375628866f, 6.9425145053392398f,
+  6.9541963103868749f, 6.9657842846620869f,
+  6.9772799234999167f, 6.9886846867721654f,
+  7.0000000000000000f, 7.0112272554232539f,
+  7.0223678130284543f, 7.0334230015374501f,
+  7.0443941193584533f, 7.0552824355011898f,
+  7.0660891904577720f, 7.0768155970508308f,
+  7.0874628412503390f, 7.0980320829605263f,
+  7.1085244567781691f, 7.1189410727235076f,
+  7.1292830169449663f, 7.1395513523987936f,
+  7.1497471195046822f, 7.1598713367783890f,
+  7.1699250014423121f, 7.1799090900149344f,
+  7.1898245588800175f, 7.1996723448363644f,
+  7.2094533656289501f, 7.2191685204621611f,
+  7.2288186904958804f, 7.2384047393250785f,
+  7.2479275134435852f, 7.2573878426926521f,
+  7.2667865406949010f, 7.2761244052742375f,
+  7.2854022188622487f, 7.2946207488916270f,
+  7.3037807481771030f, 7.3128829552843557f,
+  7.3219280948873626f, 7.3309168781146167f,
+  7.3398500028846243f, 7.3487281542310771f,
+  7.3575520046180837f, 7.3663222142458160f,
+  7.3750394313469245f, 7.3837042924740519f,
+  7.3923174227787606f, 7.4008794362821843f,
+  7.4093909361377017f, 7.4178525148858982f,
+  7.4262647547020979f, 7.4346282276367245f,
+  7.4429434958487279f, 7.4512111118323289f,
+  7.4594316186372973f, 7.4676055500829976f,
+  7.4757334309663976f, 7.4838157772642563f,
+  7.4918530963296747f, 7.4998458870832056f,
+  7.5077946401986963f, 7.5156998382840427f,
+  7.5235619560570130f, 7.5313814605163118f,
+  7.5391588111080309f, 7.5468944598876364f,
+  7.5545888516776376f, 7.5622424242210728f,
+  7.5698556083309478f, 7.5774288280357486f,
+  7.5849625007211560f, 7.5924570372680806f,
+  7.5999128421871278f, 7.6073303137496104f,
+  7.6147098441152083f, 7.6220518194563764f,
+  7.6293566200796094f, 7.6366246205436487f,
+  7.6438561897747243f, 7.6510516911789281f,
+  7.6582114827517946f, 7.6653359171851764f,
+  7.6724253419714951f, 7.6794800995054464f,
+  7.6865005271832185f, 7.6934869574993252f,
+  7.7004397181410917f, 7.7073591320808825f,
+  7.7142455176661224f, 7.7210991887071855f,
+  7.7279204545631987f, 7.7347096202258383f,
+  7.7414669864011464f, 7.7481928495894605f,
+  7.7548875021634682f, 7.7615512324444795f,
+  7.7681843247769259f, 7.7747870596011736f,
+  7.7813597135246599f, 7.7879025593914317f,
+  7.7944158663501061f, 7.8008998999203047f,
+  7.8073549220576037f, 7.8137811912170374f,
+  7.8201789624151878f, 7.8265484872909150f,
+  7.8328900141647412f, 7.8392037880969436f,
+  7.8454900509443747f, 7.8517490414160571f,
+  7.8579809951275718f, 7.8641861446542797f,
+  7.8703647195834047f, 7.8765169465649993f,
+  7.8826430493618415f, 7.8887432488982591f,
+  7.8948177633079437f, 7.9008668079807486f,
+  7.9068905956085187f, 7.9128893362299619f,
+  7.9188632372745946f, 7.9248125036057812f,
+  7.9307373375628866f, 7.9366379390025709f,
+  7.9425145053392398f, 7.9483672315846778f,
+  7.9541963103868749f, 7.9600019320680805f,
+  7.9657842846620869f, 7.9715435539507719f,
+  7.9772799234999167f, 7.9829935746943103f,
+  7.9886846867721654f, 7.9943534368588577f
 };
 
-#define APPROX_LOG_MAX  4096
-#define LOG_2_BASE_E    0.6931471805599453f
-
-float VP8LFastLog(int v) {
-  if (v < APPROX_LOG_MAX) {
+float VP8LFastLog2(int v) {
+  if (v < LOG_LOOKUP_IDX_MAX) {
+    return kLog2Table[v];
+  } else if (v < APPROX_LOG_MAX) {
     int log_cnt = 0;
     while (v >= LOG_LOOKUP_IDX_MAX) {
       ++log_cnt;
       v = v >> 1;
     }
-    return kLogTable[v] + (log_cnt * LOG_2_BASE_E);
+    return kLog2Table[v] + (float)log_cnt;
+  } else {
+    return (float)(LOG_2_RECIPROCAL * log((double)v));
   }
-  return (float)log(v);
 }
 
 //------------------------------------------------------------------------------
@@ -284,8 +327,8 @@ static const PredictorFunc kPredictors[16] = {
 };
 
 // TODO(vikasa): Replace 256 etc with defines.
-static double PredictionCostSpatial(const int* counts,
-                                    int weight_0, double exp_val) {
+static float PredictionCostSpatial(const int* counts,
+                                   int weight_0, double exp_val) {
   const int significant_symbols = 16;
   const double exp_decay_factor = 0.6;
   double bits = weight_0 * counts[0];
@@ -294,27 +337,26 @@ static double PredictionCostSpatial(const int* counts,
     bits += exp_val * (counts[i] + counts[256 - i]);
     exp_val *= exp_decay_factor;
   }
-  return -0.1 * bits;
+  return (float)(-0.1 * bits);
 }
 
 // Compute the Shanon's entropy: Sum(p*log2(p))
-static double ShannonEntropy(const int* const array, int n) {
+static float ShannonEntropy(const int* const array, int n) {
   int i;
-  double retval = 0;
+  float retval = 0.f;
   int sum = 0;
   for (i = 0; i < n; ++i) {
     if (array[i] != 0) {
       sum += array[i];
-      retval += array[i] * VP8LFastLog(array[i]);
+      retval -= VP8LFastSLog2(array[i]);
     }
   }
-  retval -= sum * VP8LFastLog(sum);
-  retval *= -1.4426950408889634;  // 1.0 / -FastLog(2);
+  retval += VP8LFastSLog2(sum);
   return retval;
 }
 
-static double PredictionCostSpatialHistogram(int accumulated[4][256],
-                                             int tile[4][256]) {
+static float PredictionCostSpatialHistogram(int accumulated[4][256],
+                                            int tile[4][256]) {
   int i;
   int k;
   int combo[256];
@@ -328,7 +370,7 @@ static double PredictionCostSpatialHistogram(int accumulated[4][256],
     }
     retval += ShannonEntropy(&combo[0], 256);
   }
-  return retval;
+  return (float)retval;
 }
 
 static int GetBestPredictorForTile(int width, int height,
@@ -344,14 +386,14 @@ static int GetBestPredictorForTile(int width, int height,
   const int xmax = (tile_size <= width - col_start) ?
       tile_size : width - col_start;
   int histo[4][256];
-  double best_diff = 1e99;
+  float best_diff = MAX_DIFF_COST;
   int best_mode = 0;
 
   int mode;
   for (mode = 0; mode < kNumPredModes; ++mode) {
     const uint32_t* current_row = argb_scratch;
     const PredictorFunc pred_func = kPredictors[mode];
-    double cur_diff;
+    float cur_diff;
     int y;
     memset(&histo[0][0], 0, sizeof(histo));
     for (y = 0; y < ymax; ++y) {
@@ -630,8 +672,8 @@ static WEBP_INLINE int SkipRepeatedPixels(const uint32_t* const argb,
   return 0;
 }
 
-static double PredictionCostCrossColor(const int accumulated[256],
-                                       const int counts[256]) {
+static float PredictionCostCrossColor(const int accumulated[256],
+                                      const int counts[256]) {
   // Favor low entropy, locally and globally.
   int i;
   int combo[256];
@@ -651,8 +693,8 @@ static Multipliers GetBestColorTransformForTile(
     int* accumulated_red_histo,
     int* accumulated_blue_histo,
     const uint32_t* const argb) {
-  double best_diff = 1e99;
-  double cur_diff;
+  float best_diff = MAX_DIFF_COST;
+  float cur_diff;
   const int halfstep = step / 2;
   const int max_tile_size = 1 << bits;
   const int tile_y_offset = tile_y * max_tile_size;
@@ -704,7 +746,7 @@ static Multipliers GetBestColorTransformForTile(
       best_tx = tx;
     }
   }
-  best_diff = 1e99;
+  best_diff = MAX_DIFF_COST;
   green_to_red = best_tx.green_to_red_;
   for (green_to_blue = -32; green_to_blue <= 32; green_to_blue += step) {
     for (red_to_blue = -32; red_to_blue <= 32; red_to_blue += step) {
@@ -893,7 +935,7 @@ static void ColorIndexInverseTransform(
       uint32_t packed_pixels = 0;
       int x;
       for (x = 0; x < width; ++x) {
-        // We need to load fresh 'packed_pixels' once every 'bytes_per_pixels'
+        // We need to load fresh 'packed_pixels' once every 'pixels_per_byte'
         // increments of x. Fortunately, pixels_per_byte is a power of 2, so
         // can just use a mask for that, instead of decrementing a counter.
         if ((x & count_mask) == 0) packed_pixels = ((*src++) >> 8) & 0xff;
@@ -934,7 +976,21 @@ void VP8LInverseTransform(const VP8LTransform* const transform,
       ColorSpaceInverseTransform(transform, row_start, row_end, out);
       break;
     case COLOR_INDEXING_TRANSFORM:
-      ColorIndexInverseTransform(transform, row_start, row_end, in, out);
+      if (in == out && transform->bits_ > 0) {
+        // Move packed pixels to the end of unpacked region, so that unpacking
+        // can occur seamlessly.
+        // Also, note that this is the only transform that operates on an
+        // effective width of VP8LSubSampleSize(xsize_, bits_). All other
+        // transforms work on an effective width of xsize_.
+        const int out_stride = (row_end - row_start) * transform->xsize_;
+        const int in_stride = (row_end - row_start) *
+            VP8LSubSampleSize(transform->xsize_, transform->bits_);
+        uint32_t* const src = out + out_stride - in_stride;
+        memmove(src, out, in_stride * sizeof(*src));
+        ColorIndexInverseTransform(transform, row_start, row_end, src, out);
+      } else {
+        ColorIndexInverseTransform(transform, row_start, row_end, in, out);
+      }
       break;
   }
 }
diff --git a/src/dsp/lossless.h b/src/dsp/lossless.h
index f00e90e0..992516fc 100644
--- a/src/dsp/lossless.h
+++ b/src/dsp/lossless.h
@@ -59,8 +59,10 @@ static WEBP_INLINE uint32_t VP8LSubSampleSize(uint32_t size,
   return (size + (1 << sampling_bits) - 1) >> sampling_bits;
 }
 
-// Faster logarithm for small integers, with the property of log(0) == 0.
-float VP8LFastLog(int v);
+// Fast base-2 logarithm for integers, with the property of log2(0) == 0.
+float VP8LFastLog2(int v);
+// Fast calculation of v * log2(v) for integer input.
+static WEBP_INLINE float VP8LFastSLog2(int v) { return VP8LFastLog2(v) * v; }
 
 // In-place difference of each component with mod 256.
 static WEBP_INLINE uint32_t VP8LSubPixels(uint32_t a, uint32_t b) {
diff --git a/src/dsp/upsampling.c b/src/dsp/upsampling.c
index 9ca04927..4855eb14 100644
--- a/src/dsp/upsampling.c
+++ b/src/dsp/upsampling.c
@@ -271,8 +271,7 @@ static void ApplyAlphaMultiply(uint8_t* rgba, int alpha_first,
 
 // rgbA4444
 
-#define MULTIPLIER(a)  ((a) * 0x11)
-#define PREMULTIPLY(x, m) (((x) * (m)) >> 12)
+#define MULTIPLIER(a)  ((a) * 0x1111)    // 0x1111 ~= (1 << 16) / 15
 
 static WEBP_INLINE uint8_t dither_hi(uint8_t x) {
   return (x & 0xf0) | (x >> 4);
@@ -282,24 +281,27 @@ static WEBP_INLINE uint8_t dither_lo(uint8_t x) {
   return (x & 0x0f) | (x << 4);
 }
 
+static WEBP_INLINE uint8_t multiply(uint8_t x, uint32_t m) {
+  return (x * m) >> 16;
+}
+
 static void ApplyAlphaMultiply4444(uint8_t* rgba4444,
                                    int w, int h, int stride) {
   while (h-- > 0) {
     int i;
     for (i = 0; i < w; ++i) {
-      const uint8_t a = dither_lo(rgba4444[2 * i + 1]);
+      const uint8_t a = (rgba4444[2 * i + 1] & 0x0f);
       const uint32_t mult = MULTIPLIER(a);
-      const uint8_t r = PREMULTIPLY(dither_hi(rgba4444[2 * i + 0]), mult);
-      const uint8_t g = PREMULTIPLY(dither_lo(rgba4444[2 * i + 0]), mult);
-      const uint8_t b = PREMULTIPLY(dither_hi(rgba4444[2 * i + 1]), mult);
-      rgba4444[2 * i + 0] = (r & 0xf0) | (g & 0x0f);
+      const uint8_t r = multiply(dither_hi(rgba4444[2 * i + 0]), mult);
+      const uint8_t g = multiply(dither_lo(rgba4444[2 * i + 0]), mult);
+      const uint8_t b = multiply(dither_hi(rgba4444[2 * i + 1]), mult);
+      rgba4444[2 * i + 0] = (r & 0xf0) | ((g >> 4) & 0x0f);
       rgba4444[2 * i + 1] = (b & 0xf0) | a;
     }
     rgba4444 += stride;
   }
 }
 #undef MULTIPLIER
-#undef PREMULTIPLY
 
 void (*WebPApplyAlphaMultiply)(uint8_t*, int, int, int, int)
     = ApplyAlphaMultiply;
diff --git a/src/dsp/yuv.h b/src/dsp/yuv.h
index c662af67..a569109c 100644
--- a/src/dsp/yuv.h
+++ b/src/dsp/yuv.h
@@ -5,7 +5,7 @@
 //  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
 // -----------------------------------------------------------------------------
 //
-// inline YUV->RGB conversion function
+// inline YUV<->RGB conversion functions
 //
 // Author: Skal (pascal.massimino@gmail.com)
 
@@ -14,6 +14,9 @@
 
 #include "../dec/decode_vp8.h"
 
+//------------------------------------------------------------------------------
+// YUV -> RGB conversion
+
 #if defined(__cplusplus) || defined(c_plusplus)
 extern "C" {
 #endif
@@ -87,14 +90,37 @@ static WEBP_INLINE void VP8YuvToRgba(uint8_t y, uint8_t u, uint8_t v,
   rgba[3] = 0xff;
 }
 
-static WEBP_INLINE uint32_t VP8Clip4Bits(uint8_t c) {
-  const uint32_t v = (c + 8) >> 4;
-  return (v > 15) ? 15 : v;
-}
-
 // Must be called before everything, to initialize the tables.
 void VP8YUVInit(void);
 
+//------------------------------------------------------------------------------
+// RGB -> YUV conversion
+// The exact naming is Y'CbCr, following the ITU-R BT.601 standard.
+// More information at: http://en.wikipedia.org/wiki/YCbCr
+// Y = 0.2569 * R + 0.5044 * G + 0.0979 * B + 16
+// U = -0.1483 * R - 0.2911 * G + 0.4394 * B + 128
+// V = 0.4394 * R - 0.3679 * G - 0.0715 * B + 128
+// We use 16bit fixed point operations.
+
+static WEBP_INLINE int VP8ClipUV(int v) {
+   v = (v + (257 << (YUV_FIX + 2 - 1))) >> (YUV_FIX + 2);
+   return ((v & ~0xff) == 0) ? v : (v < 0) ? 0 : 255;
+}
+
+static WEBP_INLINE int VP8RGBToY(int r, int g, int b) {
+  const int kRound = (1 << (YUV_FIX - 1)) + (16 << YUV_FIX);
+  const int luma = 16839 * r + 33059 * g + 6420 * b;
+  return (luma + kRound) >> YUV_FIX;  // no need to clip
+}
+
+static WEBP_INLINE int VP8RGBToU(int r, int g, int b) {
+  return VP8ClipUV(-9719 * r - 19081 * g + 28800 * b);
+}
+
+static WEBP_INLINE int VP8RGBToV(int r, int g, int b) {
+  return VP8ClipUV(+28800 * r - 24116 * g - 4684 * b);
+}
+
 #if defined(__cplusplus) || defined(c_plusplus)
 }    // extern "C"
 #endif
diff --git a/src/enc/alpha.c b/src/enc/alpha.c
index b31cee8c..0e519b6c 100644
--- a/src/enc/alpha.c
+++ b/src/enc/alpha.c
@@ -22,19 +22,15 @@ extern "C" {
 #endif
 
 // -----------------------------------------------------------------------------
-// int EncodeAlpha(const uint8_t* data, int width, int height, int stride,
-//                 int quality, int method, int filter, int effort_level,
-//                 uint8_t** output, size_t* output_size)
-//
-// Encodes the given alpha data 'data' of size 'stride'x'height' via specified
-// compression method 'method'. The pre-processing (Quantization) is
-// performed if 'quality' is less than 100. For such cases, the encoding is
-// lossy. Valid ranges for 'quality' is [0, 100] and 'method' is [0, 1]:
+// Encodes the given alpha data via the specified compression method 'method'.
+// The pre-processing (quantization) is performed if 'quality' is less than 100.
+// For such cases, the encoding is lossy. The valid range is [0, 100] for
+// 'quality' and [0, 1] for 'method':
 //   'method = 0' - No compression;
 //   'method = 1' - Use lossless coder on the alpha plane only
 // 'filter' values [0, 4] correspond to prediction modes none, horizontal,
 // vertical & gradient filters. The prediction mode 4 will try all the
-// prediction modes (0 to 3) and pick the best prediction mode.
+// prediction modes 0 to 3 and pick the best one.
 // 'effort_level': specifies how much effort must be spent to try and reduce
 //  the compressed output size. In range 0 (quick) to 6 (slow).
 //
@@ -50,10 +46,10 @@ extern "C" {
 
 #include "../enc/vp8li.h"
 
-static int EncodeLossless(const uint8_t* data, int width, int height,
+static int EncodeLossless(const uint8_t* const data, int width, int height,
                           int effort_level,  // in [0..6] range
-                          VP8BitWriter* const bw) {
-
+                          VP8BitWriter* const bw,
+                          WebPAuxStats* const stats) {
   int ok = 0;
   WebPConfig config;
   WebPPicture picture;
@@ -63,6 +59,7 @@ static int EncodeLossless(const uint8_t* data, int width, int height,
   picture.width = width;
   picture.height = height;
   picture.use_argb = 1;
+  picture.stats = stats;
   if (!WebPPictureAlloc(&picture)) return 0;
 
   // Transfer the alpha values to the green channel.
@@ -87,8 +84,8 @@ static int EncodeLossless(const uint8_t* data, int width, int height,
   config.quality = 10.f + 15.f * effort_level;
   if (config.quality > 100.f) config.quality = 100.f;
 
-  VP8LBitWriterInit(&tmp_bw, (width * height) >> 3);
-  ok = (VP8LEncodeStream(&config, &picture, &tmp_bw) == VP8_ENC_OK);
+  ok = VP8LBitWriterInit(&tmp_bw, (width * height) >> 3);
+  ok = ok && (VP8LEncodeStream(&config, &picture, &tmp_bw) == VP8_ENC_OK);
   WebPPictureFree(&picture);
   if (ok) {
     const uint8_t* const data = VP8LBitWriterFinish(&tmp_bw);
@@ -101,10 +98,12 @@ static int EncodeLossless(const uint8_t* data, int width, int height,
 
 // -----------------------------------------------------------------------------
 
-static int EncodeAlphaInternal(const uint8_t* data, int width, int height,
+static int EncodeAlphaInternal(const uint8_t* const data, int width, int height,
                                int method, int filter, int reduce_levels,
                                int effort_level,  // in [0..6] range
-                               uint8_t* tmp_alpha, VP8BitWriter* const bw) {
+                               uint8_t* const tmp_alpha,
+                               VP8BitWriter* const bw,
+                               WebPAuxStats* const stats) {
   int ok = 0;
   const uint8_t* alpha_src;
   WebPFilterFunc filter_func;
@@ -112,6 +111,7 @@ static int EncodeAlphaInternal(const uint8_t* data, int width, int height,
   size_t expected_size;
   const size_t data_size = width * height;
 
+  assert((uint64_t)data_size == (uint64_t)width * height);  // as per spec
   assert(filter >= 0 && filter < WEBP_FILTER_LAST);
   assert(method >= ALPHA_NO_COMPRESSION);
   assert(method <= ALPHA_LOSSLESS_COMPRESSION);
@@ -139,7 +139,7 @@ static int EncodeAlphaInternal(const uint8_t* data, int width, int height,
     ok = VP8BitWriterAppend(bw, alpha_src, width * height);
     ok = ok && !bw->error_;
   } else {
-    ok = EncodeLossless(alpha_src, width, height, effort_level, bw);
+    ok = EncodeLossless(alpha_src, width, height, effort_level, bw, stats);
     VP8BitWriterFinish(bw);
   }
   return ok;
@@ -157,19 +157,26 @@ static void CopyPlane(const uint8_t* src, int src_stride,
   }
 }
 
-static int EncodeAlpha(const uint8_t* data, int width, int height, int stride,
+static int EncodeAlpha(VP8Encoder* const enc,
                        int quality, int method, int filter,
                        int effort_level,
-                       uint8_t** output, size_t* output_size) {
+                       uint8_t** const output, size_t* const output_size) {
+  const WebPPicture* const pic = enc->pic_;
+  const int width = pic->width;
+  const int height = pic->height;
+
   uint8_t* quant_alpha = NULL;
   const size_t data_size = width * height;
+  uint64_t sse = 0;
   int ok = 1;
   const int reduce_levels = (quality < 100);
 
   // quick sanity checks
-  assert(data != NULL && output != NULL && output_size != NULL);
+  assert((uint64_t)data_size == (uint64_t)width * height);  // as per spec
+  assert(enc != NULL && pic != NULL && pic->a != NULL);
+  assert(output != NULL && output_size != NULL);
   assert(width > 0 && height > 0);
-  assert(stride >= width);
+  assert(pic->a_stride >= width);
   assert(filter >= WEBP_FILTER_NONE && filter <= WEBP_FILTER_FAST);
 
   if (quality < 0 || quality > 100) {
@@ -186,7 +193,7 @@ static int EncodeAlpha(const uint8_t* data, int width, int height, int stride,
   }
 
   // Extract alpha data (width x height) from raw_data (stride x height).
-  CopyPlane(data, stride, quant_alpha, width, width, height);
+  CopyPlane(pic->a, pic->a_stride, quant_alpha, width, width, height);
 
   if (reduce_levels) {  // No Quantization required for 'quality = 100'.
     // 16 alpha levels gives quite a low MSE w.r.t original alpha plane hence
@@ -194,24 +201,22 @@ static int EncodeAlpha(const uint8_t* data, int width, int height, int stride,
     // and Quality:]70, 100] -> Levels:]16, 256].
     const int alpha_levels = (quality <= 70) ? (2 + quality / 5)
                                              : (16 + (quality - 70) * 8);
-    ok = QuantizeLevels(quant_alpha, width, height, alpha_levels, NULL);
+    ok = QuantizeLevels(quant_alpha, width, height, alpha_levels, &sse);
   }
 
   if (ok) {
     VP8BitWriter bw;
-    size_t best_score;
     int test_filter;
     uint8_t* filtered_alpha = NULL;
 
     // We always test WEBP_FILTER_NONE first.
     ok = EncodeAlphaInternal(quant_alpha, width, height,
                              method, WEBP_FILTER_NONE, reduce_levels,
-                             effort_level, NULL, &bw);
+                             effort_level, NULL, &bw, pic->stats);
     if (!ok) {
       VP8BitWriterWipeOut(&bw);
       goto End;
     }
-    best_score = VP8BitWriterSize(&bw);
 
     if (filter == WEBP_FILTER_FAST) {  // Quick estimate of a second candidate?
       filter = EstimateBestFilter(quant_alpha, width, height, width);
@@ -228,35 +233,48 @@ static int EncodeAlpha(const uint8_t* data, int width, int height, int stride,
     }
 
     // Try the other mode(s).
-    for (test_filter = WEBP_FILTER_HORIZONTAL;
-         ok && (test_filter <= WEBP_FILTER_GRADIENT);
-         ++test_filter) {
-      VP8BitWriter tmp_bw;
-      if (filter != WEBP_FILTER_BEST && test_filter != filter) {
-        continue;
-      }
+    {
+      WebPAuxStats best_stats;
+      size_t best_score = VP8BitWriterSize(&bw);
 
-      ok = EncodeAlphaInternal(quant_alpha, width, height,
-                               method, test_filter, reduce_levels,
-                               effort_level, filtered_alpha, &tmp_bw);
-      if (ok) {
-        const size_t score = VP8BitWriterSize(&tmp_bw);
-        if (score < best_score) {
-          // swap bitwriter objects.
-          VP8BitWriter tmp = tmp_bw;
-          tmp_bw = bw;
-          bw = tmp;
-          best_score = score;
+      memset(&best_stats, 0, sizeof(best_stats));  // prevent spurious warning
+      if (pic->stats != NULL) best_stats = *pic->stats;
+      for (test_filter = WEBP_FILTER_HORIZONTAL;
+           ok && (test_filter <= WEBP_FILTER_GRADIENT);
+           ++test_filter) {
+        VP8BitWriter tmp_bw;
+        if (filter != WEBP_FILTER_BEST && test_filter != filter) {
+          continue;
         }
-      } else {
-        VP8BitWriterWipeOut(&bw);
+        ok = EncodeAlphaInternal(quant_alpha, width, height,
+                                 method, test_filter, reduce_levels,
+                                 effort_level, filtered_alpha, &tmp_bw,
+                                 pic->stats);
+        if (ok) {
+          const size_t score = VP8BitWriterSize(&tmp_bw);
+          if (score < best_score) {
+            // swap bitwriter objects.
+            VP8BitWriter tmp = tmp_bw;
+            tmp_bw = bw;
+            bw = tmp;
+            best_score = score;
+            if (pic->stats != NULL) best_stats = *pic->stats;
+          }
+        } else {
+          VP8BitWriterWipeOut(&bw);
+        }
+        VP8BitWriterWipeOut(&tmp_bw);
       }
-      VP8BitWriterWipeOut(&tmp_bw);
+      if (pic->stats != NULL) *pic->stats = best_stats;
     }
  Ok:
     if (ok) {
       *output_size = VP8BitWriterSize(&bw);
       *output = VP8BitWriterBuf(&bw);
+      if (pic->stats != NULL) {         // need stats?
+        pic->stats->coded_size += (int)(*output_size);
+        enc->sse_[3] = sse;
+      }
     }
     free(filtered_alpha);
   }
@@ -269,16 +287,15 @@ static int EncodeAlpha(const uint8_t* data, int width, int height, int stride,
 //------------------------------------------------------------------------------
 // Main calls
 
-void VP8EncInitAlpha(VP8Encoder* enc) {
+void VP8EncInitAlpha(VP8Encoder* const enc) {
   enc->has_alpha_ = WebPPictureHasTransparency(enc->pic_);
   enc->alpha_data_ = NULL;
   enc->alpha_data_size_ = 0;
 }
 
-int VP8EncFinishAlpha(VP8Encoder* enc) {
+int VP8EncFinishAlpha(VP8Encoder* const enc) {
   if (enc->has_alpha_) {
     const WebPConfig* config = enc->config_;
-    const WebPPicture* pic = enc->pic_;
     uint8_t* tmp_data = NULL;
     size_t tmp_size = 0;
     const int effort_level = config->method;  // maps to [0..6]
@@ -287,9 +304,7 @@ int VP8EncFinishAlpha(VP8Encoder* enc) {
         (config->alpha_filtering == 1) ? WEBP_FILTER_FAST :
                                          WEBP_FILTER_BEST;
 
-    assert(pic->a);
-    if (!EncodeAlpha(pic->a, pic->width, pic->height, pic->a_stride,
-                     config->alpha_quality, config->alpha_compression,
+    if (!EncodeAlpha(enc, config->alpha_quality, config->alpha_compression,
                      filter, effort_level, &tmp_data, &tmp_size)) {
       return 0;
     }
@@ -303,7 +318,7 @@ int VP8EncFinishAlpha(VP8Encoder* enc) {
   return WebPReportProgress(enc->pic_, enc->percent_ + 20, &enc->percent_);
 }
 
-void VP8EncDeleteAlpha(VP8Encoder* enc) {
+void VP8EncDeleteAlpha(VP8Encoder* const enc) {
   free(enc->alpha_data_);
   enc->alpha_data_ = NULL;
   enc->alpha_data_size_ = 0;
diff --git a/src/enc/analysis.c b/src/enc/analysis.c
index 0eec9472..22cfb492 100644
--- a/src/enc/analysis.c
+++ b/src/enc/analysis.c
@@ -15,6 +15,7 @@
 
 #include "./vp8enci.h"
 #include "./cost.h"
+#include "../utils/utils.h"
 
 #if defined(__cplusplus) || defined(c_plusplus)
 extern "C" {
@@ -35,7 +36,8 @@ static void SmoothSegmentMap(VP8Encoder* const enc) {
   const int w = enc->mb_w_;
   const int h = enc->mb_h_;
   const int majority_cnt_3_x_3_grid = 5;
-  uint8_t* const tmp = (uint8_t*)malloc(w * h * sizeof(uint8_t));
+  uint8_t* const tmp = (uint8_t*)WebPSafeMalloc((uint64_t)w * h, sizeof(*tmp));
+  assert((uint64_t)(w * h) == (uint64_t)w * h);   // no overflow, as per spec
 
   if (tmp == NULL) return;
   for (y = 1; y < h - 1; ++y) {
@@ -145,7 +147,7 @@ static void SetSegmentAlphas(VP8Encoder* const enc,
 static void AssignSegments(VP8Encoder* const enc, const int alphas[256]) {
   const int nb = enc->segment_hdr_.num_segments_;
   int centers[NUM_MB_SEGMENTS];
-  int weighted_average;
+  int weighted_average = 0;
   int map[256];
   int a, n, k;
   int min_a = 0, max_a = 255, range_a;
diff --git a/src/enc/backward_references.c b/src/enc/backward_references.c
index 40819f99..b8c8ece8 100644
--- a/src/enc/backward_references.c
+++ b/src/enc/backward_references.c
@@ -14,7 +14,9 @@
 
 #include "./backward_references.h"
 #include "./histogram.h"
+#include "../dsp/lossless.h"
 #include "../utils/color_cache.h"
+#include "../utils/utils.h"
 
 #define VALUES_IN_BYTE 256
 
@@ -93,7 +95,8 @@ int VP8LBackwardRefsAlloc(VP8LBackwardRefs* const refs, int max_size) {
   assert(refs != NULL);
   refs->size = 0;
   refs->max_size = 0;
-  refs->refs = (PixOrCopy*)malloc(max_size * sizeof(*refs->refs));
+  refs->refs = (PixOrCopy*)WebPSafeMalloc((uint64_t)max_size,
+                                          sizeof(*refs->refs));
   if (refs->refs == NULL) return 0;
   refs->max_size = max_size;
   return 1;
@@ -110,7 +113,7 @@ static WEBP_INLINE uint64_t GetPixPairHash64(const uint32_t* const argb) {
 
 static int HashChainInit(HashChain* const p, int size) {
   int i;
-  p->chain_ = (int*)malloc(size * sizeof(*p->chain_));
+  p->chain_ = (int*)WebPSafeMalloc((uint64_t)size, sizeof(*p->chain_));
   if (p->chain_ == NULL) {
     return 0;
   }
@@ -256,8 +259,10 @@ static int BackwardReferencesHashChain(int xsize, int ysize,
   VP8LColorCache hashers;
 
   if (hash_chain == NULL) return 0;
-  cc_init = VP8LColorCacheInit(&hashers, cache_bits);
-  if (!cc_init) goto Error;
+  if (use_color_cache) {
+    cc_init = VP8LColorCacheInit(&hashers, cache_bits);
+    if (!cc_init) goto Error;
+  }
 
   if (!HashChainInit(hash_chain, pix_count)) goto Error;
 
@@ -289,15 +294,16 @@ static int BackwardReferencesHashChain(int xsize, int ysize,
         HashChainFindCopy(hash_chain, quality,
                           i + 1, xsize, argb, maxlen, &offset2, &len2);
         if (len2 > len + 1) {
+          const uint32_t pixel = argb[i];
           // Alternative#2 is a better match. So push pixel at 'i' as literal.
-          if (use_color_cache && VP8LColorCacheContains(&hashers, argb[i])) {
-            const int ix = VP8LColorCacheGetIndex(&hashers, argb[i]);
+          if (use_color_cache && VP8LColorCacheContains(&hashers, pixel)) {
+            const int ix = VP8LColorCacheGetIndex(&hashers, pixel);
             refs->refs[refs->size] = PixOrCopyCreateCacheIdx(ix);
           } else {
-            refs->refs[refs->size] = PixOrCopyCreateLiteral(argb[i]);
+            refs->refs[refs->size] = PixOrCopyCreateLiteral(pixel);
           }
           ++refs->size;
-          VP8LColorCacheInsert(&hashers, argb[i]);
+          if (use_color_cache) VP8LColorCacheInsert(&hashers, pixel);
           i++;  // Backward reference to be done for next pixel.
           len = len2;
           offset = offset2;
@@ -307,24 +313,30 @@ static int BackwardReferencesHashChain(int xsize, int ysize,
         len = MAX_LENGTH - 1;
       }
       refs->refs[refs->size++] = PixOrCopyCreateCopy(offset, len);
-      for (k = 0; k < len; ++k) {
-        VP8LColorCacheInsert(&hashers, argb[i + k]);
-        if (k != 0 && i + k + 1 < pix_count) {
-          // Add to the hash_chain (but cannot add the last pixel).
+      if (use_color_cache) {
+        for (k = 0; k < len; ++k) {
+          VP8LColorCacheInsert(&hashers, argb[i + k]);
+        }
+      }
+      // Add to the hash_chain (but cannot add the last pixel).
+      {
+        const int last = (len < pix_count - 1 - i) ? len : pix_count - 1 - i;
+        for (k = 1; k < last; ++k) {
           HashChainInsert(hash_chain, &argb[i + k], i + k);
         }
       }
       i += len;
     } else {
-      if (use_color_cache && VP8LColorCacheContains(&hashers, argb[i])) {
+      const uint32_t pixel = argb[i];
+      if (use_color_cache && VP8LColorCacheContains(&hashers, pixel)) {
         // push pixel as a PixOrCopyCreateCacheIdx pixel
-        int ix = VP8LColorCacheGetIndex(&hashers, argb[i]);
+        const int ix = VP8LColorCacheGetIndex(&hashers, pixel);
         refs->refs[refs->size] = PixOrCopyCreateCacheIdx(ix);
       } else {
-        refs->refs[refs->size] = PixOrCopyCreateLiteral(argb[i]);
+        refs->refs[refs->size] = PixOrCopyCreateLiteral(pixel);
       }
       ++refs->size;
-      VP8LColorCacheInsert(&hashers, argb[i]);
+      if (use_color_cache) VP8LColorCacheInsert(&hashers, pixel);
       if (i + 1 < pix_count) {
         HashChainInsert(hash_chain, &argb[i], i);
       }
@@ -346,46 +358,65 @@ typedef struct {
   double literal_[PIX_OR_COPY_CODES_MAX];
   double blue_[VALUES_IN_BYTE];
   double distance_[NUM_DISTANCE_CODES];
-  int cache_bits_;
 } CostModel;
 
 static int BackwardReferencesTraceBackwards(
     int xsize, int ysize, int recursive_cost_model,
     const uint32_t* const argb, int cache_bits, VP8LBackwardRefs* const refs);
 
-static int CostModelBuild(CostModel* const p, int xsize, int ysize,
+// Converts symbol counts to cost estimates: output[i] is VP8LFastLog2(total)
+// minus VP8LFastLog2(count[i]); all zeros if at most one count is positive.
+static void ConvertPopulationCountTableToBitEstimates(
+    int num_symbols, const int population_counts[], double output[]) {
+  int total = 0, used_symbols = 0;
+  int s;
+  for (s = 0; s < num_symbols; ++s) {
+    const int count = population_counts[s];
+    total += count;
+    if (count > 0) ++used_symbols;
+  }
+  if (used_symbols > 1) {
+    const double log_total = VP8LFastLog2(total);
+    for (s = 0; s < num_symbols; ++s) {
+      output[s] = log_total - VP8LFastLog2(population_counts[s]);
+    }
+  } else {
+    memset(output, 0, num_symbols * sizeof(*output));
+  }
+}
+
+static int CostModelBuild(CostModel* const m, int xsize, int ysize,
                           int recursion_level, const uint32_t* const argb,
                           int cache_bits) {
   int ok = 0;
   VP8LHistogram histo;
   VP8LBackwardRefs refs;
+  const int quality = 100;
 
   if (!VP8LBackwardRefsAlloc(&refs, xsize * ysize)) goto Error;
 
-  p->cache_bits_ = cache_bits;
   if (recursion_level > 0) {
     if (!BackwardReferencesTraceBackwards(xsize, ysize, recursion_level - 1,
                                           argb, cache_bits, &refs)) {
       goto Error;
     }
   } else {
-    const int quality = 100;
     if (!BackwardReferencesHashChain(xsize, ysize, argb, cache_bits, quality,
                                      &refs)) {
       goto Error;
     }
   }
   VP8LHistogramCreate(&histo, &refs, cache_bits);
-  VP8LConvertPopulationCountTableToBitEstimates(
-      VP8LHistogramNumCodes(&histo), histo.literal_, p->literal_);
-  VP8LConvertPopulationCountTableToBitEstimates(
-      VALUES_IN_BYTE, histo.red_, p->red_);
-  VP8LConvertPopulationCountTableToBitEstimates(
-      VALUES_IN_BYTE, histo.blue_, p->blue_);
-  VP8LConvertPopulationCountTableToBitEstimates(
-      VALUES_IN_BYTE, histo.alpha_, p->alpha_);
-  VP8LConvertPopulationCountTableToBitEstimates(
-      NUM_DISTANCE_CODES, histo.distance_, p->distance_);
+  ConvertPopulationCountTableToBitEstimates(
+      VP8LHistogramNumCodes(&histo), histo.literal_, m->literal_);
+  ConvertPopulationCountTableToBitEstimates(
+      VALUES_IN_BYTE, histo.red_, m->red_);
+  ConvertPopulationCountTableToBitEstimates(
+      VALUES_IN_BYTE, histo.blue_, m->blue_);
+  ConvertPopulationCountTableToBitEstimates(
+      VALUES_IN_BYTE, histo.alpha_, m->alpha_);
+  ConvertPopulationCountTableToBitEstimates(
+      NUM_DISTANCE_CODES, histo.distance_, m->distance_);
   ok = 1;
 
  Error:
@@ -393,30 +424,30 @@ static int CostModelBuild(CostModel* const p, int xsize, int ysize,
   return ok;
 }
 
-static WEBP_INLINE double GetLiteralCost(const CostModel* const p, uint32_t v) {
-  return p->alpha_[v >> 24] +
-      p->red_[(v >> 16) & 0xff] +
-      p->literal_[(v >> 8) & 0xff] +
-      p->blue_[v & 0xff];
+static WEBP_INLINE double GetLiteralCost(const CostModel* const m, uint32_t v) {
+  return m->alpha_[v >> 24] +
+         m->red_[(v >> 16) & 0xff] +
+         m->literal_[(v >> 8) & 0xff] +
+         m->blue_[v & 0xff];
 }
 
-static WEBP_INLINE double GetCacheCost(const CostModel* const p, uint32_t idx) {
+static WEBP_INLINE double GetCacheCost(const CostModel* const m, uint32_t idx) {
   const int literal_idx = VALUES_IN_BYTE + NUM_LENGTH_CODES + idx;
-  return p->literal_[literal_idx];
+  return m->literal_[literal_idx];
 }
 
-static WEBP_INLINE double GetLengthCost(const CostModel* const p,
+static WEBP_INLINE double GetLengthCost(const CostModel* const m,
                                         uint32_t length) {
   int code, extra_bits_count, extra_bits_value;
   PrefixEncode(length, &code, &extra_bits_count, &extra_bits_value);
-  return p->literal_[VALUES_IN_BYTE + code] + extra_bits_count;
+  return m->literal_[VALUES_IN_BYTE + code] + extra_bits_count;
 }
 
-static WEBP_INLINE double GetDistanceCost(const CostModel* const p,
+static WEBP_INLINE double GetDistanceCost(const CostModel* const m,
                                           uint32_t distance) {
   int code, extra_bits_count, extra_bits_value;
   PrefixEncode(distance, &code, &extra_bits_count, &extra_bits_value);
-  return p->distance_[code] + extra_bits_count;
+  return m->distance_[code] + extra_bits_count;
 }
 
 static int BackwardReferencesHashChainDistanceOnly(
@@ -428,7 +459,8 @@ static int BackwardReferencesHashChainDistanceOnly(
   const int quality = 100;
   const int pix_count = xsize * ysize;
   const int use_color_cache = (cache_bits > 0);
-  double* const cost = (double*)malloc(pix_count * sizeof(*cost));
+  double* const cost =
+      (double*)WebPSafeMalloc((uint64_t)pix_count, sizeof(*cost));
   CostModel* cost_model = (CostModel*)malloc(sizeof(*cost_model));
   HashChain* hash_chain = (HashChain*)malloc(sizeof(*hash_chain));
   VP8LColorCache hashers;
@@ -437,8 +469,12 @@ static int BackwardReferencesHashChainDistanceOnly(
 
   if (cost == NULL || cost_model == NULL || hash_chain == NULL) goto Error;
 
-  cc_init = VP8LColorCacheInit(&hashers, cache_bits);
-  if (!cc_init || !HashChainInit(hash_chain, pix_count)) goto Error;
+  if (!HashChainInit(hash_chain, pix_count)) goto Error;
+
+  if (use_color_cache) {
+    cc_init = VP8LColorCacheInit(&hashers, cache_bits);
+    if (!cc_init) goto Error;
+  }
 
   if (!CostModelBuild(cost_model, xsize, ysize, recursive_cost_model, argb,
                       cache_bits)) {
@@ -486,14 +522,20 @@ static int BackwardReferencesHashChainDistanceOnly(
           // Long copy for short distances, let's skip the middle
           // lookups for better copies.
           // 1) insert the hashes.
-          for (k = 0; k < len; ++k) {
-            VP8LColorCacheInsert(&hashers, argb[i + k]);
-            if (i + k + 1 < pix_count) {
-              // Add to the hash_chain (but cannot add the last pixel).
+          if (use_color_cache) {
+            for (k = 0; k < len; ++k) {
+              VP8LColorCacheInsert(&hashers, argb[i + k]);
+            }
+          }
+          // 2) Add to the hash_chain (but cannot add the last pixel)
+          {
+            const int last = (len < pix_count - 1 - i) ? len
+                                                       : pix_count - 1 - i;
+            for (k = 0; k < last; ++k) {
               HashChainInsert(hash_chain, &argb[i + k], i + k);
             }
           }
-          // 2) jump.
+          // 3) jump.
           i += len - 1;  // for loop does ++i, thus -1 here.
           goto next_symbol;
         }
@@ -515,7 +557,7 @@ static int BackwardReferencesHashChainDistanceOnly(
         cost[i] = cost_val;
         dist_array[i] = 1;  // only one is inserted.
       }
-      VP8LColorCacheInsert(&hashers, argb[i]);
+      if (use_color_cache) VP8LColorCacheInsert(&hashers, argb[i]);
     }
  next_symbol: ;
   }
@@ -545,7 +587,8 @@ static int TraceBackwards(const uint32_t* const dist_array,
   }
   // Allocate.
   *chosen_path_size = count;
-  *chosen_path = (uint32_t*)malloc(count * sizeof(*chosen_path));
+  *chosen_path =
+      (uint32_t*)WebPSafeMalloc((uint64_t)count, sizeof(**chosen_path));
   if (*chosen_path == NULL) return 0;
 
   // Write in reverse order.
@@ -574,11 +617,13 @@ static int BackwardReferencesHashChainFollowChosenPath(
   HashChain* hash_chain = (HashChain*)malloc(sizeof(*hash_chain));
   VP8LColorCache hashers;
 
-  if (hash_chain == NULL ||
-      !(cc_init = VP8LColorCacheInit(&hashers, cache_bits)) ||
-      !HashChainInit(hash_chain, pix_count)) {
+  if (hash_chain == NULL || !HashChainInit(hash_chain, pix_count)) {
     goto Error;
   }
+  if (use_color_cache) {
+    cc_init = VP8LColorCacheInit(&hashers, cache_bits);
+    if (!cc_init) goto Error;
+  }
 
   refs->size = 0;
   for (ix = 0; ix < chosen_path_size; ++ix, ++size) {
@@ -590,10 +635,14 @@ static int BackwardReferencesHashChainFollowChosenPath(
                         i, xsize, argb, maxlen, &offset, &len);
       assert(len == maxlen);
       refs->refs[size] = PixOrCopyCreateCopy(offset, len);
-      for (k = 0; k < len; ++k) {
-        VP8LColorCacheInsert(&hashers, argb[i + k]);
-        if (i + k + 1 < pix_count) {
-          // Add to the hash_chain (but cannot add the last pixel).
+      if (use_color_cache) {
+        for (k = 0; k < len; ++k) {
+          VP8LColorCacheInsert(&hashers, argb[i + k]);
+        }
+      }
+      {
+        const int last = (len < pix_count - 1 - i) ? len : pix_count - 1 - i;
+        for (k = 0; k < last; ++k) {
           HashChainInsert(hash_chain, &argb[i + k], i + k);
         }
       }
@@ -606,7 +655,7 @@ static int BackwardReferencesHashChainFollowChosenPath(
       } else {
         refs->refs[size] = PixOrCopyCreateLiteral(argb[i]);
       }
-      VP8LColorCacheInsert(&hashers, argb[i]);
+      if (use_color_cache) VP8LColorCacheInsert(&hashers, argb[i]);
       if (i + 1 < pix_count) {
         HashChainInsert(hash_chain, &argb[i], i);
       }
@@ -633,7 +682,7 @@ static int BackwardReferencesTraceBackwards(int xsize, int ysize,
   uint32_t* chosen_path = NULL;
   int chosen_path_size = 0;
   uint32_t* dist_array =
-      (uint32_t*)malloc(dist_array_size * sizeof(*dist_array));
+      (uint32_t*)WebPSafeMalloc((uint64_t)dist_array_size, sizeof(*dist_array));
 
   if (dist_array == NULL) goto Error;
 
@@ -755,13 +804,18 @@ static int ComputeCacheHistogram(const uint32_t* const argb,
   int i;
   uint32_t k;
   VP8LColorCache hashers;
+  const int use_color_cache = (cache_bits > 0);
+  int cc_init = 0;
 
-  if (!VP8LColorCacheInit(&hashers, cache_bits)) return 0;
+  if (use_color_cache) {
+    cc_init = VP8LColorCacheInit(&hashers, cache_bits);
+    if (!cc_init) return 0;
+  }
 
   for (i = 0; i < refs->size; ++i) {
     const PixOrCopy* const v = &refs->refs[i];
     if (PixOrCopyIsLiteral(v)) {
-      if (cache_bits != 0 &&
+      if (use_color_cache &&
           VP8LColorCacheContains(&hashers, argb[pixel_index])) {
         // push pixel as a cache index
         const int ix = VP8LColorCacheGetIndex(&hashers, argb[pixel_index]);
@@ -773,15 +827,17 @@ static int ComputeCacheHistogram(const uint32_t* const argb,
     } else {
       VP8LHistogramAddSinglePixOrCopy(histo, v);
     }
-    for (k = 0; k < PixOrCopyLength(v); ++k) {
-      VP8LColorCacheInsert(&hashers, argb[pixel_index]);
-      ++pixel_index;
+    if (use_color_cache) {
+      for (k = 0; k < PixOrCopyLength(v); ++k) {
+        VP8LColorCacheInsert(&hashers, argb[pixel_index + k]);
+      }
     }
+    pixel_index += PixOrCopyLength(v);
   }
   assert(pixel_index == xsize * ysize);
   (void)xsize;  // xsize is not used in non-debug compilations otherwise.
   (void)ysize;  // ysize is not used in non-debug compilations otherwise.
-  VP8LColorCacheClear(&hashers);
+  if (cc_init) VP8LColorCacheClear(&hashers);
   return 1;
 }
 
diff --git a/src/enc/config.c b/src/enc/config.c
index fa11e89a..1a261135 100644
--- a/src/enc/config.c
+++ b/src/enc/config.c
@@ -120,7 +120,7 @@ int WebPValidateConfig(const WebPConfig* config) {
     return 0;
   if (config->lossless < 0 || config->lossless > 1)
     return 0;
-  if (config->image_hint > WEBP_HINT_PHOTO)
+  if (config->image_hint >= WEBP_HINT_LAST)
     return 0;
   return 1;
 }
diff --git a/src/enc/histogram.c b/src/enc/histogram.c
index 6a35eda3..ca838e06 100644
--- a/src/enc/histogram.c
+++ b/src/enc/histogram.c
@@ -17,17 +17,7 @@
 #include "./backward_references.h"
 #include "./histogram.h"
 #include "../dsp/lossless.h"
-
-#if defined(_MSC_VER) && !defined(NOT_HAVE_LOG2)
-# define NOT_HAVE_LOG2 1
-#endif
-
-#ifdef NOT_HAVE_LOG2
-static WEBP_INLINE double log2(double d) {
-  const double kLog2Reciprocal = 1.442695040888963;
-  return log(d) * kLog2Reciprocal;
-}
-#endif
+#include "../utils/utils.h"
 
 static void HistogramClear(VP8LHistogram* const p) {
   memset(p->literal_, 0, sizeof(p->literal_));
@@ -65,10 +55,10 @@ VP8LHistogramSet* VP8LAllocateHistogramSet(int size, int cache_bits) {
   int i;
   VP8LHistogramSet* set;
   VP8LHistogram* bulk;
-  const size_t total_size = sizeof(*set)
-                          + size * sizeof(*set->histograms)
-                          + size * sizeof(**set->histograms);
-  uint8_t* memory = (uint8_t*)malloc(total_size);
+  const uint64_t total_size = (uint64_t)sizeof(*set)  // no 32-bit size_t wrap
+                            + (uint64_t)size * sizeof(*set->histograms)
+                            + (uint64_t)size * sizeof(**set->histograms);
+  uint8_t* memory = (uint8_t*)WebPSafeMalloc(total_size, sizeof(*memory));
   if (memory == NULL) return NULL;
 
   set = (VP8LHistogramSet*)memory;
@@ -87,33 +77,6 @@ VP8LHistogramSet* VP8LAllocateHistogramSet(int size, int cache_bits) {
 
 // -----------------------------------------------------------------------------
 
-void VP8LConvertPopulationCountTableToBitEstimates(
-    int num_symbols, const int population_counts[], double output[]) {
-  int sum = 0;
-  int nonzeros = 0;
-  int i;
-  for (i = 0; i < num_symbols; ++i) {
-    sum += population_counts[i];
-    if (population_counts[i] > 0) {
-      ++nonzeros;
-    }
-  }
-  if (nonzeros <= 1) {
-    memset(output, 0, num_symbols * sizeof(*output));
-    return;
-  }
-  {
-    const double log2sum = log2(sum);
-    for (i = 0; i < num_symbols; ++i) {
-      if (population_counts[i] == 0) {
-        output[i] = log2sum;
-      } else {
-        output[i] = log2sum - log2(population_counts[i]);
-      }
-    }
-  }
-}
-
 void VP8LHistogramAddSinglePixOrCopy(VP8LHistogram* const histo,
                                      const PixOrCopy* const v) {
   if (PixOrCopyIsLiteral(v)) {
@@ -138,7 +101,7 @@ void VP8LHistogramAddSinglePixOrCopy(VP8LHistogram* const histo,
 
 
 static double BitsEntropy(const int* const array, int n) {
-  double retval = 0;
+  double retval = 0.;
   int sum = 0;
   int nonzeros = 0;
   int max_val = 0;
@@ -148,15 +111,14 @@ static double BitsEntropy(const int* const array, int n) {
     if (array[i] != 0) {
       sum += array[i];
       ++nonzeros;
-      retval += array[i] * VP8LFastLog(array[i]);
+      retval -= VP8LFastSLog2(array[i]);
       if (max_val < array[i]) {
         max_val = array[i];
       }
     }
   }
-  retval -= sum * VP8LFastLog(sum);
-  retval *= -1.4426950408889634;  // 1.0 / -Log(2);
-  mix = 0.627;
+  retval += VP8LFastSLog2(sum);
+
   if (nonzeros < 5) {
     if (nonzeros <= 1) {
       return 0;
@@ -176,15 +138,15 @@ static double BitsEntropy(const int* const array, int n) {
     } else {
       mix = 0.7;  // nonzeros == 4.
     }
+  } else {
+    mix = 0.627;
   }
+
   {
     double min_limit = 2 * sum - max_val;
     min_limit = mix * min_limit + (1.0 - mix) * retval;
-    if (retval < min_limit) {
-      return min_limit;
-    }
+    return (retval < min_limit) ? min_limit : retval;
   }
-  return retval;
 }
 
 double VP8LHistogramEstimateBitsBulk(const VP8LHistogram* const p) {
diff --git a/src/enc/histogram.h b/src/enc/histogram.h
index 480aba81..ec573c5c 100644
--- a/src/enc/histogram.h
+++ b/src/enc/histogram.h
@@ -101,9 +101,6 @@ static WEBP_INLINE int VP8LHistogramNumCodes(const VP8LHistogram* const p) {
       ((p->palette_code_bits_ > 0) ? (1 << p->palette_code_bits_) : 0);
 }
 
-void VP8LConvertPopulationCountTableToBitEstimates(
-    int num_symbols, const int population_counts[], double output[]);
-
 // Builds the histogram image.
 int VP8LGetHistoImageSymbols(int xsize, int ysize,
                              const VP8LBackwardRefs* const refs,
diff --git a/src/enc/picture.c b/src/enc/picture.c
index f8ca19db..44eed060 100644
--- a/src/enc/picture.c
+++ b/src/enc/picture.c
@@ -15,7 +15,9 @@
 
 #include "./vp8enci.h"
 #include "../utils/rescaler.h"
+#include "../utils/utils.h"
 #include "../dsp/dsp.h"
+#include "../dsp/yuv.h"
 
 #if defined(__cplusplus) || defined(c_plusplus)
 extern "C" {
@@ -81,14 +83,12 @@ int WebPPictureAlloc(WebPPicture* picture) {
 
       // Security and validation checks
       if (width <= 0 || height <= 0 ||         // luma/alpha param error
-          uv_width < 0 || uv_height < 0 ||     // u/v param error
-          y_size >= (1ULL << 40) ||            // reasonable global size
-          (size_t)total_size != total_size) {  // overflow on 32bit
+          uv_width < 0 || uv_height < 0) {     // u/v param error
         return 0;
       }
       // Clear previous buffer and allocate a new one.
       WebPPictureFree(picture);   // erase previous buffer
-      mem = (uint8_t*)malloc((size_t)total_size);
+      mem = (uint8_t*)WebPSafeMalloc(total_size, sizeof(*mem));
       if (mem == NULL) return 0;
 
       // From now on, we're in the clear, we can no longer fail...
@@ -119,15 +119,12 @@ int WebPPictureAlloc(WebPPicture* picture) {
     } else {
       void* memory;
       const uint64_t argb_size = (uint64_t)width * height;
-      const uint64_t total_size = argb_size * sizeof(*picture->argb);
-      if (width <= 0 || height <= 0 ||
-          argb_size >= (1ULL << 40) ||
-          (size_t)total_size != total_size) {
+      if (width <= 0 || height <= 0) {
         return 0;
       }
       // Clear previous buffer and allocate a new one.
       WebPPictureFree(picture);   // erase previous buffer
-      memory = malloc((size_t)total_size);
+      memory = WebPSafeMalloc(argb_size, sizeof(*picture->argb));
       if (memory == NULL) return 0;
 
       // TODO(skal): align plane to cache line?
@@ -416,7 +413,7 @@ int WebPPictureRescale(WebPPicture* pic, int width, int height) {
   if (!WebPPictureAlloc(&tmp)) return 0;
 
   if (!pic->use_argb) {
-    work = (int32_t*)malloc(2 * width * sizeof(*work));
+    work = (int32_t*)WebPSafeMalloc(2ULL * width, sizeof(*work));
     if (work == NULL) {
       WebPPictureFree(&tmp);
       return 0;
@@ -449,7 +446,7 @@ int WebPPictureRescale(WebPPicture* pic, int width, int height) {
     }
 #endif
   } else {
-    work = (int32_t*)malloc(2 * width * 4 * sizeof(*work));
+    work = (int32_t*)WebPSafeMalloc(2ULL * width * 4, sizeof(*work));
     if (work == NULL) {
       WebPPictureFree(&tmp);
       return 0;
@@ -480,17 +477,17 @@ void WebPMemoryWriterInit(WebPMemoryWriter* writer) {
 int WebPMemoryWrite(const uint8_t* data, size_t data_size,
                     const WebPPicture* picture) {
   WebPMemoryWriter* const w = (WebPMemoryWriter*)picture->custom_ptr;
-  size_t next_size;
+  uint64_t next_size;
   if (w == NULL) {
     return 1;
   }
-  next_size = w->size + data_size;
+  next_size = (uint64_t)w->size + data_size;
   if (next_size > w->max_size) {
     uint8_t* new_mem;
-    size_t next_max_size = w->max_size * 2;
+    uint64_t next_max_size = 2ULL * w->max_size;
     if (next_max_size < next_size) next_max_size = next_size;
-    if (next_max_size < 8192) next_max_size = 8192;
-    new_mem = (uint8_t*)malloc(next_max_size);
+    if (next_max_size < 8192ULL) next_max_size = 8192ULL;
+    new_mem = (uint8_t*)WebPSafeMalloc(next_max_size, 1);
     if (new_mem == NULL) {
       return 0;
     }
@@ -499,7 +496,8 @@ int WebPMemoryWrite(const uint8_t* data, size_t data_size,
     }
     free(w->mem);
     w->mem = new_mem;
-    w->max_size = next_max_size;
+    // down-cast is ok, thanks to WebPSafeMalloc
+    w->max_size = (size_t)next_max_size;
   }
   if (data_size > 0) {
     memcpy(w->mem + w->size, data, data_size);
@@ -547,33 +545,6 @@ int WebPPictureHasTransparency(const WebPPicture* picture) {
 
 //------------------------------------------------------------------------------
 // RGB -> YUV conversion
-// The exact naming is Y'CbCr, following the ITU-R BT.601 standard.
-// More information at: http://en.wikipedia.org/wiki/YCbCr
-// Y = 0.2569 * R + 0.5044 * G + 0.0979 * B + 16
-// U = -0.1483 * R - 0.2911 * G + 0.4394 * B + 128
-// V = 0.4394 * R - 0.3679 * G - 0.0715 * B + 128
-// We use 16bit fixed point operations.
-
-enum { YUV_FRAC = 16 };
-
-static WEBP_INLINE int clip_uv(int v) {
-   v = (v + (257 << (YUV_FRAC + 2 - 1))) >> (YUV_FRAC + 2);
-   return ((v & ~0xff) == 0) ? v : (v < 0) ? 0 : 255;
-}
-
-static WEBP_INLINE int rgb_to_y(int r, int g, int b) {
-  const int kRound = (1 << (YUV_FRAC - 1)) + (16 << YUV_FRAC);
-  const int luma = 16839 * r + 33059 * g + 6420 * b;
-  return (luma + kRound) >> YUV_FRAC;  // no need to clip
-}
-
-static WEBP_INLINE int rgb_to_u(int r, int g, int b) {
-  return clip_uv(-9719 * r - 19081 * g + 28800 * b);
-}
-
-static WEBP_INLINE int rgb_to_v(int r, int g, int b) {
-  return clip_uv(+28800 * r - 24116 * g - 4684 * b);
-}
 
 // TODO: we can do better than simply 2x2 averaging on U/V samples.
 #define SUM4(ptr) ((ptr)[0] + (ptr)[step] + \
@@ -587,8 +558,8 @@ static WEBP_INLINE int rgb_to_v(int r, int g, int b) {
   const int r = SUM(r_ptr + src);                        \
   const int g = SUM(g_ptr + src);                        \
   const int b = SUM(b_ptr + src);                        \
-  picture->u[dst] = rgb_to_u(r, g, b);                   \
-  picture->v[dst] = rgb_to_v(r, g, b);                   \
+  picture->u[dst] = VP8RGBToU(r, g, b);                  \
+  picture->v[dst] = VP8RGBToV(r, g, b);                  \
 }
 
 #define RGB_TO_UV0(x_in, x_out, y, SUM) {                \
@@ -597,8 +568,8 @@ static WEBP_INLINE int rgb_to_v(int r, int g, int b) {
   const int r = SUM(r_ptr + src);                        \
   const int g = SUM(g_ptr + src);                        \
   const int b = SUM(b_ptr + src);                        \
-  picture->u0[dst] = rgb_to_u(r, g, b);                  \
-  picture->v0[dst] = rgb_to_v(r, g, b);                  \
+  picture->u0[dst] = VP8RGBToU(r, g, b);                 \
+  picture->v0[dst] = VP8RGBToV(r, g, b);                 \
 }
 
 static void MakeGray(WebPPicture* const picture) {
@@ -636,7 +607,7 @@ static int ImportYUVAFromRGBA(const uint8_t* const r_ptr,
     for (x = 0; x < width; ++x) {
       const int offset = step * x + y * rgb_stride;
       picture->y[x + y * picture->y_stride] =
-          rgb_to_y(r_ptr[offset], g_ptr[offset], b_ptr[offset]);
+          VP8RGBToY(r_ptr[offset], g_ptr[offset], b_ptr[offset]);
     }
   }
 
@@ -646,7 +617,7 @@ static int ImportYUVAFromRGBA(const uint8_t* const r_ptr,
       for (x = 0; x < (width >> 1); ++x) {
         RGB_TO_UV(x, y, SUM4);
       }
-      if (picture->width & 1) {
+      if (width & 1) {
         RGB_TO_UV(x, y, SUM2V);
       }
     }
diff --git a/src/enc/vp8enci.h b/src/enc/vp8enci.h
index 17ae2621..a77778c0 100644
--- a/src/enc/vp8enci.h
+++ b/src/enc/vp8enci.h
@@ -26,8 +26,8 @@ extern "C" {
 
 // version numbers
 #define ENC_MAJ_VERSION 0
-#define ENC_MIN_VERSION 1
-#define ENC_REV_VERSION 99
+#define ENC_MIN_VERSION 2
+#define ENC_REV_VERSION 0
 
 // size of histogram used by CollectHistogram.
 #define MAX_COEFF_THRESH   64
@@ -402,7 +402,7 @@ struct VP8Encoder {
 
   // probabilities and statistics
   VP8Proba proba_;
-  uint64_t sse_[3];        // sum of Y/U/V squared errors for all macroblocks
+  uint64_t sse_[4];        // sum of Y/U/V/A squared errors for all macroblocks
   uint64_t sse_count_;     // pixel count for the sse_[] stats
   int      coded_size_;
   int      residual_bytes_[3][4];
@@ -488,9 +488,9 @@ void VP8SetSegmentParams(VP8Encoder* const enc, float quality);
 int VP8Decimate(VP8EncIterator* const it, VP8ModeScore* const rd, int rd_opt);
 
   // in alpha.c
-void VP8EncInitAlpha(VP8Encoder* enc);           // initialize alpha compression
-int VP8EncFinishAlpha(VP8Encoder* enc);          // finalize compressed data
-void VP8EncDeleteAlpha(VP8Encoder* enc);         // delete compressed data
+void VP8EncInitAlpha(VP8Encoder* const enc);    // initialize alpha compression
+int VP8EncFinishAlpha(VP8Encoder* const enc);   // finalize compressed data
+void VP8EncDeleteAlpha(VP8Encoder* const enc);  // delete compressed data
 
   // in layer.c
 void VP8EncInitLayer(VP8Encoder* const enc);     // init everything
diff --git a/src/enc/vp8l.c b/src/enc/vp8l.c
index 68fb5f17..9c202f8d 100644
--- a/src/enc/vp8l.c
+++ b/src/enc/vp8l.c
@@ -20,6 +20,7 @@
 #include "../dsp/lossless.h"
 #include "../utils/bit_writer.h"
 #include "../utils/huffman_encode.h"
+#include "../utils/utils.h"
 #include "../webp/format_constants.h"
 
 #if defined(__cplusplus) || defined(c_plusplus)
@@ -28,6 +29,7 @@ extern "C" {
 
 #define PALETTE_KEY_RIGHT_SHIFT   22  // Key for 1K buffer.
 #define MAX_HUFF_IMAGE_SIZE       (16 * 1024 * 1024)
+#define MAX_COLORS_FOR_GRAPH      64
 
 // -----------------------------------------------------------------------------
 // Palette
@@ -97,23 +99,24 @@ static int AnalyzeAndCreatePalette(const WebPPicture* const pic,
   return 1;
 }
 
-static int AnalyzeEntropy(const WebPPicture* const pic,
+static int AnalyzeEntropy(const uint32_t* argb,
+                          int width, int height, int argb_stride,
                           double* const nonpredicted_bits,
                           double* const predicted_bits) {
   int x, y;
-  const uint32_t* argb = pic->argb;
   const uint32_t* last_line = NULL;
   uint32_t last_pix = argb[0];    // so we're sure that pix_diff == 0
 
   VP8LHistogram* nonpredicted = NULL;
-  VP8LHistogram* predicted = (VP8LHistogram*)malloc(2 * sizeof(*predicted));
+  VP8LHistogram* predicted =
+      (VP8LHistogram*)malloc(2 * sizeof(*predicted));
   if (predicted == NULL) return 0;
   nonpredicted = predicted + 1;
 
   VP8LHistogramInit(predicted, 0);
   VP8LHistogramInit(nonpredicted, 0);
-  for (y = 0; y < pic->height; ++y) {
-    for (x = 0; x < pic->width; ++x) {
+  for (y = 0; y < height; ++y) {
+    for (x = 0; x < width; ++x) {
       const uint32_t pix = argb[x];
       const uint32_t pix_diff = VP8LSubPixels(pix, last_pix);
       if (pix_diff == 0) continue;
@@ -129,7 +132,7 @@ static int AnalyzeEntropy(const WebPPicture* const pic,
       }
     }
     last_line = argb;
-    argb += pic->argb_stride;
+    argb += argb_stride;
   }
   *nonpredicted_bits = VP8LHistogramEstimateBitsBulk(nonpredicted);
   *predicted_bits = VP8LHistogramEstimateBitsBulk(predicted);
@@ -143,32 +146,42 @@ static int VP8LEncAnalyze(VP8LEncoder* const enc, WebPImageHint image_hint) {
 
   enc->use_palette_ =
       AnalyzeAndCreatePalette(pic, enc->palette_, &enc->palette_size_);
-  if (!enc->use_palette_) {
-    if (image_hint == WEBP_HINT_DEFAULT) {
-      double non_pred_entropy, pred_entropy;
-      if (!AnalyzeEntropy(pic, &non_pred_entropy, &pred_entropy)) {
-        return 0;
-      }
 
-      if (pred_entropy < 0.95 * non_pred_entropy) {
-        enc->use_predict_ = 1;
-        enc->use_cross_color_ = 1;
-      }
-    } else if (image_hint == WEBP_HINT_PHOTO) {
-      enc->use_predict_ = 1;
-      enc->use_cross_color_ = 1;
+  if (image_hint == WEBP_HINT_GRAPH) {
+    if (enc->use_palette_ && enc->palette_size_ < MAX_COLORS_FOR_GRAPH) {
+      enc->use_palette_ = 0;
     }
   }
+
+  if (!enc->use_palette_) {
+    if (image_hint == WEBP_HINT_PHOTO) {
+      enc->use_predict_ = 1;
+      enc->use_cross_color_ = 1;
+    } else {
+      double non_pred_entropy, pred_entropy;
+      if (!AnalyzeEntropy(pic->argb, pic->width, pic->height, pic->argb_stride,
+                          &non_pred_entropy, &pred_entropy)) {
+        return 0;
+      }
+      if (pred_entropy < 0.95 * non_pred_entropy) {
+        enc->use_predict_ = 1;
+        // TODO(vikasa): Observed some correlation of cross_color transform with
+        // predict. Need to investigate this further and add separate heuristic
+        // for setting use_cross_color flag.
+        enc->use_cross_color_ = 1;
+      }
+    }
+  }
+
   return 1;
 }
 
-
 static int GetHuffBitLengthsAndCodes(
     const VP8LHistogramSet* const histogram_image,
     HuffmanTreeCode* const huffman_codes) {
   int i, k;
   int ok = 1;
-  int total_length_size = 0;
+  uint64_t total_length_size = 0;
   uint8_t* mem_buf = NULL;
   const int histogram_image_size = histogram_image->size;
 
@@ -189,9 +202,8 @@ static int GetHuffBitLengthsAndCodes(
   {
     uint16_t* codes;
     uint8_t* lengths;
-    const size_t total_buf_size = total_length_size * sizeof(*lengths)
-                                + total_length_size * sizeof(*codes);
-    mem_buf = (uint8_t*)calloc(total_buf_size, 1);
+    mem_buf = (uint8_t*)WebPSafeCalloc(total_length_size,
+                                       sizeof(*lengths) + sizeof(*codes));
     if (mem_buf == NULL) {
       ok = 0;
       goto End;
@@ -293,7 +305,7 @@ static int StoreFullHuffmanCode(VP8LBitWriter* const bw,
   int num_tokens;
   HuffmanTreeCode huffman_code;
   HuffmanTreeToken* const tokens =
-      (HuffmanTreeToken*)malloc(max_tokens * sizeof(*tokens));
+      (HuffmanTreeToken*)WebPSafeMalloc((uint64_t)max_tokens, sizeof(*tokens));
   if (tokens == NULL) return 0;
 
   huffman_code.num_symbols = CODE_LENGTH_CODES;
@@ -500,21 +512,21 @@ static int EncodeImageInternal(VP8LBitWriter* const bw,
                                const uint32_t* const argb,
                                int width, int height, int quality,
                                int cache_bits, int histogram_bits) {
-  int i;
   int ok = 0;
   const int use_2d_locality = 1;
   const int use_color_cache = (cache_bits > 0);
-  const int histogram_image_xysize =
+  const uint32_t histogram_image_xysize =
       VP8LSubSampleSize(width, histogram_bits) *
       VP8LSubSampleSize(height, histogram_bits);
   VP8LHistogramSet* histogram_image =
       VP8LAllocateHistogramSet(histogram_image_xysize, 0);
   int histogram_image_size = 0;
-  int bit_array_size = 0;
+  size_t bit_array_size = 0;
   HuffmanTreeCode* huffman_codes = NULL;
   VP8LBackwardRefs refs;
   uint16_t* const histogram_symbols =
-      (uint16_t*)malloc(histogram_image_xysize * sizeof(*histogram_symbols));
+      (uint16_t*)WebPSafeMalloc((uint64_t)histogram_image_xysize,
+                                sizeof(*histogram_symbols));
   assert(histogram_bits >= MIN_HUFFMAN_BITS);
   assert(histogram_bits <= MAX_HUFFMAN_BITS);
   if (histogram_image == NULL || histogram_symbols == NULL) goto Error;
@@ -534,8 +546,8 @@ static int EncodeImageInternal(VP8LBitWriter* const bw,
   // Create Huffman bit lengths and codes for each histogram image.
   histogram_image_size = histogram_image->size;
   bit_array_size = 5 * histogram_image_size;
-  huffman_codes = (HuffmanTreeCode*)calloc(bit_array_size,
-                                           sizeof(*huffman_codes));
+  huffman_codes = (HuffmanTreeCode*)WebPSafeCalloc(bit_array_size,
+                                                   sizeof(*huffman_codes));
   if (huffman_codes == NULL ||
       !GetHuffBitLengthsAndCodes(histogram_image, huffman_codes)) {
     goto Error;
@@ -553,8 +565,10 @@ static int EncodeImageInternal(VP8LBitWriter* const bw,
     VP8LWriteBits(bw, 1, write_histogram_image);
     if (write_histogram_image) {
       uint32_t* const histogram_argb =
-          (uint32_t*)malloc(histogram_image_xysize * sizeof(*histogram_argb));
+          (uint32_t*)WebPSafeMalloc((uint64_t)histogram_image_xysize,
+                                    sizeof(*histogram_argb));
       int max_index = 0;
+      uint32_t i;
       if (histogram_argb == NULL) goto Error;
       for (i = 0; i < histogram_image_xysize; ++i) {
         const int index = histogram_symbols[i] & 0xffff;
@@ -576,14 +590,14 @@ static int EncodeImageInternal(VP8LBitWriter* const bw,
   }
 
   // Store Huffman codes.
-  for (i = 0; i < 5 * histogram_image_size; ++i) {
-    HuffmanTreeCode* const codes = &huffman_codes[i];
-    if (!StoreHuffmanCode(bw, codes)) {
-      goto Error;
+  {
+    int i;
+    for (i = 0; i < 5 * histogram_image_size; ++i) {
+      HuffmanTreeCode* const codes = &huffman_codes[i];
+      if (!StoreHuffmanCode(bw, codes)) goto Error;
+      ClearHuffmanTreeIfOnlyOneSymbol(codes);
     }
-    ClearHuffmanTreeIfOnlyOneSymbol(codes);
   }
-
   // Free combined histograms.
   free(histogram_image);
   histogram_image = NULL;
@@ -610,7 +624,7 @@ static int EncodeImageInternal(VP8LBitWriter* const bw,
 
 // Check if it would be a good idea to subtract green from red and blue. We
 // only impact entropy in red/blue components, don't bother to look at others.
-static int EvalAndApplySubtractGreen(const VP8LEncoder* const enc,
+static int EvalAndApplySubtractGreen(VP8LEncoder* const enc,
                                      int width, int height,
                                      VP8LBitWriter* const bw) {
   if (!enc->use_palette_) {
@@ -639,7 +653,8 @@ static int EvalAndApplySubtractGreen(const VP8LEncoder* const enc,
     free(histo);
 
     // Check if subtracting green yields low entropy.
-    if (bit_cost_after < bit_cost_before) {
+    enc->use_subtract_green_ = (bit_cost_after < bit_cost_before);
+    if (enc->use_subtract_green_) {
       VP8LWriteBits(bw, 1, TRANSFORM_PRESENT);
       VP8LWriteBits(bw, 2, SUBTRACT_GREEN);
       VP8LSubtractGreenFromBlueAndRed(enc->argb_, width * height);
@@ -754,7 +769,7 @@ static WebPEncodingError WriteImage(const WebPPicture* const pic,
       goto Error;
     }
   }
-  *coded_size = vp8l_size;
+  *coded_size = CHUNK_HEADER_SIZE + riff_size;
   return VP8_ENC_OK;
 
  Error:
@@ -769,14 +784,14 @@ static WebPEncodingError AllocateTransformBuffer(VP8LEncoder* const enc,
                                                  int width, int height) {
   WebPEncodingError err = VP8_ENC_OK;
   const int tile_size = 1 << enc->transform_bits_;
-  const size_t image_size = width * height;
-  const size_t argb_scratch_size = tile_size * width + width;
-  const size_t transform_data_size =
-      VP8LSubSampleSize(width, enc->transform_bits_) *
-      VP8LSubSampleSize(height, enc->transform_bits_);
-  const size_t total_size =
+  const uint64_t image_size = width * height;
+  const uint64_t argb_scratch_size = tile_size * width + width;
+  const uint64_t transform_data_size =
+      (uint64_t)VP8LSubSampleSize(width, enc->transform_bits_) *
+      (uint64_t)VP8LSubSampleSize(height, enc->transform_bits_);
+  const uint64_t total_size =
       image_size + argb_scratch_size + transform_data_size;
-  uint32_t* mem = (uint32_t*)malloc(total_size * sizeof(*mem));
+  uint32_t* mem = (uint32_t*)WebPSafeMalloc(total_size, sizeof(*mem));
   if (mem == NULL) {
     err = VP8_ENC_ERROR_OUT_OF_MEMORY;
     goto Error;
@@ -938,6 +953,7 @@ WebPEncodingError VP8LEncodeStream(const WebPConfig* const config,
   const int width = picture->width;
   const int height = picture->height;
   VP8LEncoder* const enc = VP8LEncoderNew(config, picture);
+  const size_t byte_position = VP8LBitWriterNumBytes(bw);
 
   if (enc == NULL) {
     err = VP8_ENC_ERROR_OUT_OF_MEMORY;
@@ -957,6 +973,7 @@ WebPEncodingError VP8LEncodeStream(const WebPConfig* const config,
   if (enc->use_palette_) {
     err = ApplyPalette(bw, enc, quality);
     if (err != VP8_ENC_OK) goto Error;
+    // Color cache is disabled for palette.
     enc->cache_bits_ = 0;
   }
 
@@ -1017,6 +1034,20 @@ WebPEncodingError VP8LEncodeStream(const WebPConfig* const config,
     goto Error;
   }
 
+  if (picture->stats != NULL) {
+    WebPAuxStats* const stats = picture->stats;
+    stats->lossless_features = 0;
+    if (enc->use_predict_) stats->lossless_features |= 1;
+    if (enc->use_cross_color_) stats->lossless_features |= 2;
+    if (enc->use_subtract_green_) stats->lossless_features |= 4;
+    if (enc->use_palette_) stats->lossless_features |= 8;
+    stats->histogram_bits = enc->histo_bits_;
+    stats->transform_bits = enc->transform_bits_;
+    stats->cache_bits = enc->cache_bits_;
+    stats->palette_size = enc->palette_size_;
+    stats->lossless_size = (int)(VP8LBitWriterNumBytes(bw) - byte_position);
+  }
+
  Error:
   VP8LEncoderDelete(enc);
   return err;
@@ -1035,19 +1066,34 @@ int VP8LEncodeImage(const WebPConfig* const config,
 
   if (config == NULL || picture->argb == NULL) {
     err = VP8_ENC_ERROR_NULL_PARAMETER;
-    goto Error;
+    WebPEncodingSetError(picture, err);
+    return 0;
   }
 
   width = picture->width;
   height = picture->height;
+  if (!VP8LBitWriterInit(&bw, (width * height) >> 1)) {
+    err = VP8_ENC_ERROR_OUT_OF_MEMORY;
+    goto Error;
+  }
+
   if (!WebPReportProgress(picture, 1, &percent)) {
  UserAbort:
     err = VP8_ENC_ERROR_USER_ABORT;
     goto Error;
   }
+  // Reset stats (for pure lossless coding)
+  if (picture->stats != NULL) {
+    WebPAuxStats* const stats = picture->stats;
+    memset(stats, 0, sizeof(*stats));
+    stats->PSNR[0] = 99.f;
+    stats->PSNR[1] = 99.f;
+    stats->PSNR[2] = 99.f;
+    stats->PSNR[3] = 99.f;
+    stats->PSNR[4] = 99.f;
+  }
 
   // Write image size.
-  VP8LBitWriterInit(&bw, (width * height) >> 1);
   if (!WriteImageSize(picture, &bw)) {
     err = VP8_ENC_ERROR_OUT_OF_MEMORY;
     goto Error;
@@ -1075,15 +1121,10 @@ int VP8LEncodeImage(const WebPConfig* const config,
 
   if (!WebPReportProgress(picture, 100, &percent)) goto UserAbort;
 
-  // Collect some stats if needed.
+  // Save size.
   if (picture->stats != NULL) {
-    WebPAuxStats* const stats = picture->stats;
-    memset(stats, 0, sizeof(*stats));
-    stats->PSNR[0] = 99.;
-    stats->PSNR[1] = 99.;
-    stats->PSNR[2] = 99.;
-    stats->PSNR[3] = 99.;
-    stats->coded_size = (int)coded_size;
+    picture->stats->coded_size += (int)coded_size;
+    picture->stats->lossless_size = (int)coded_size;
   }
 
   if (picture->extra_info != NULL) {
diff --git a/src/enc/vp8li.h b/src/enc/vp8li.h
index 083ff595..eae90dd6 100644
--- a/src/enc/vp8li.h
+++ b/src/enc/vp8li.h
@@ -38,6 +38,7 @@ typedef struct {
 
   // Encoding parameters derived from image characteristics.
   int use_cross_color_;
+  int use_subtract_green_;
   int use_predict_;
   int use_palette_;
   int palette_size_;
diff --git a/src/enc/webpenc.c b/src/enc/webpenc.c
index 99ab170b..3c275589 100644
--- a/src/enc/webpenc.c
+++ b/src/enc/webpenc.c
@@ -16,6 +16,7 @@
 
 #include "./vp8enci.h"
 #include "./vp8li.h"
+#include "../utils/utils.h"
 
 // #define PRINT_MEMORY_INFO
 
@@ -164,13 +165,14 @@ static VP8Encoder* InitVP8Encoder(const WebPConfig* const config,
       config->autofilter ? sizeof(LFStats) + ALIGN_CST : 0;
   VP8Encoder* enc;
   uint8_t* mem;
-  size_t size = sizeof(VP8Encoder) + ALIGN_CST  // main struct
-              + cache_size                      // working caches
-              + info_size                       // modes info
-              + preds_size                      // prediction modes
-              + samples_size                    // top/left samples
-              + nz_size                         // coeff context bits
-              + lf_stats_size;                  // autofilter stats
+  const uint64_t size = (uint64_t)sizeof(VP8Encoder)   // main struct
+                      + ALIGN_CST                      // cache alignment
+                      + cache_size                     // working caches
+                      + info_size                      // modes info
+                      + preds_size                     // prediction modes
+                      + samples_size                   // top/left samples
+                      + nz_size                        // coeff context bits
+                      + lf_stats_size;                 // autofilter stats
 
 #ifdef PRINT_MEMORY_INFO
   printf("===================================\n");
@@ -198,7 +200,7 @@ static VP8Encoder* InitVP8Encoder(const WebPConfig* const config,
          mb_w * mb_h * 384 * sizeof(uint8_t));
   printf("===================================\n");
 #endif
-  mem = (uint8_t*)malloc(size);
+  mem = (uint8_t*)WebPSafeMalloc(size, sizeof(*mem));
   if (mem == NULL) {
     WebPEncodingSetError(picture, VP8_ENC_ERROR_OUT_OF_MEMORY);
     return NULL;
@@ -284,6 +286,7 @@ static void FinalizePSNR(const VP8Encoder* const enc) {
   stats->PSNR[1] = (float)GetPSNR(sse[1], size / 4);
   stats->PSNR[2] = (float)GetPSNR(sse[2], size / 4);
   stats->PSNR[3] = (float)GetPSNR(sse[0] + sse[1] + sse[2], size * 3 / 2);
+  stats->PSNR[4] = (float)GetPSNR(sse[3], size);
 }
 
 static void StoreStats(VP8Encoder* const enc) {
@@ -343,6 +346,8 @@ int WebPEncode(const WebPConfig* config, WebPPicture* pic) {
   if (pic->width > WEBP_MAX_DIMENSION || pic->height > WEBP_MAX_DIMENSION)
     return WebPEncodingSetError(pic, VP8_ENC_ERROR_BAD_DIMENSION);
 
+  if (pic->stats != NULL) memset(pic->stats, 0, sizeof(*pic->stats));
+
   if (!config->lossless) {
     VP8Encoder* enc = NULL;
     if (pic->y == NULL || pic->u == NULL || pic->v == NULL) {
diff --git a/src/utils/Makefile.am b/src/utils/Makefile.am
index 96b2bd45..65054c03 100644
--- a/src/utils/Makefile.am
+++ b/src/utils/Makefile.am
@@ -20,6 +20,8 @@ libwebputils_la_SOURCES += rescaler.c
 libwebputils_la_SOURCES += rescaler.h
 libwebputils_la_SOURCES += thread.c
 libwebputils_la_SOURCES += thread.h
+libwebputils_la_SOURCES += utils.c
+libwebputils_la_SOURCES += utils.h
 
 libwebputilsinclude_HEADERS = ../webp/types.h
 libwebputilsincludedir = $(includedir)/webp
diff --git a/src/utils/bit_writer.c b/src/utils/bit_writer.c
index fcb0a3c9..671159ca 100644
--- a/src/utils/bit_writer.c
+++ b/src/utils/bit_writer.c
@@ -25,18 +25,23 @@ extern "C" {
 static int BitWriterResize(VP8BitWriter* const bw, size_t extra_size) {
   uint8_t* new_buf;
   size_t new_size;
-  const size_t needed_size = bw->pos_ + extra_size;
+  const uint64_t needed_size_64b = (uint64_t)bw->pos_ + extra_size;
+  const size_t needed_size = (size_t)needed_size_64b;
+  if (needed_size_64b != needed_size) {
+    bw->error_ = 1;
+    return 0;
+  }
   if (needed_size <= bw->max_pos_) return 1;
+  // If the following line wraps over 32bit, the test just after will catch it.
   new_size = 2 * bw->max_pos_;
-  if (new_size < needed_size)
-    new_size = needed_size;
+  if (new_size < needed_size) new_size = needed_size;
   if (new_size < 1024) new_size = 1024;
   new_buf = (uint8_t*)malloc(new_size);
   if (new_buf == NULL) {
     bw->error_ = 1;
     return 0;
   }
-  if (bw->pos_ > 0) memcpy(new_buf, bw->buf_, bw->pos_);
+  memcpy(new_buf, bw->buf_, bw->pos_);
   free(bw->buf_);
   bw->buf_ = new_buf;
   bw->max_pos_ = new_size;
@@ -51,10 +56,8 @@ static void kFlush(VP8BitWriter* const bw) {
   bw->nb_bits_ -= 8;
   if ((bits & 0xff) != 0xff) {
     size_t pos = bw->pos_;
-    if (pos + bw->run_ >= bw->max_pos_) {  // reallocate
-      if (!BitWriterResize(bw,  bw->run_ + 1)) {
-        return;
-      }
+    if (!BitWriterResize(bw, bw->run_ + 1)) {
+      return;
     }
     if (bits & 0x100) {  // overflow -> propagate carry over pending 0xff's
       if (pos > 0) bw->buf_[pos - 1]++;
@@ -194,23 +197,28 @@ void VP8BitWriterWipeOut(VP8BitWriter* const bw) {
 static int VP8LBitWriterResize(VP8LBitWriter* const bw, size_t extra_size) {
   uint8_t* allocated_buf;
   size_t allocated_size;
-  const size_t size_required = VP8LBitWriterNumBytes(bw) + extra_size;
-  if ((bw->max_bytes_ > 0) && (size_required <= bw->max_bytes_)) return 1;
-  allocated_size = (3 * bw->max_bytes_) >> 1;
-  if (allocated_size < size_required) {
-    allocated_size = size_required;
+  const size_t current_size = VP8LBitWriterNumBytes(bw);
+  const uint64_t size_required_64b = (uint64_t)current_size + extra_size;
+  const size_t size_required = (size_t)size_required_64b;
+  if (size_required != size_required_64b) {
+    bw->error_ = 1;
+    return 0;
   }
-  // Make Allocated size multiple of KBs
+  if (bw->max_bytes_ > 0 && size_required <= bw->max_bytes_) return 1;
+  allocated_size = (3 * bw->max_bytes_) >> 1;
+  if (allocated_size < size_required) allocated_size = size_required;
+  // make allocated size multiple of 1k
   allocated_size = (((allocated_size >> 10) + 1) << 10);
   allocated_buf = (uint8_t*)malloc(allocated_size);
-  if (allocated_buf == NULL) return 0;
-  memset(allocated_buf, 0, allocated_size);
-  if (bw->bit_pos_ > 0) {
-    memcpy(allocated_buf, bw->buf_, VP8LBitWriterNumBytes(bw));
+  if (allocated_buf == NULL) {
+    bw->error_ = 1;
+    return 0;
   }
+  memcpy(allocated_buf, bw->buf_, current_size);
   free(bw->buf_);
   bw->buf_ = allocated_buf;
   bw->max_bytes_ = allocated_size;
+  memset(allocated_buf + current_size, 0, allocated_size - current_size);
   return 1;
 }
 
@@ -232,33 +240,37 @@ void VP8LWriteBits(VP8LBitWriter* const bw, int n_bits, uint32_t bits) {
   // Technically, this branch of the code can write up to 25 bits at a time,
   // but in prefix encoding, the maximum number of bits written is 18 at a time.
   {
-    uint8_t* p = &bw->buf_[bw->bit_pos_ >> 3];
-    uint32_t v = *(const uint32_t*)(p);
+    uint8_t* const p = &bw->buf_[bw->bit_pos_ >> 3];
+    uint32_t v = *(const uint32_t*)p;
     v |= bits << (bw->bit_pos_ & 7);
-    *(uint32_t*)(p) = v;
+    *(uint32_t*)p = v;
     bw->bit_pos_ += n_bits;
   }
-#else  // LITTLE_ENDIAN
-  // implicit & 0xff is assumed for uint8_t arithmetics
+#else  // BIG_ENDIAN
   {
     uint8_t* p = &bw->buf_[bw->bit_pos_ >> 3];
-    const int bits_reserved_in_first_byte = (bw->bit_pos_ & 7);
-    *p++ |= (bits << bits_reserved_in_first_byte);
+    const int bits_reserved_in_first_byte = bw->bit_pos_ & 7;
     const int bits_left_to_write = n_bits - 8 + bits_reserved_in_first_byte;
+    // implicit & 0xff is assumed for uint8_t arithmetics
+    *p++ |= bits << bits_reserved_in_first_byte;
+    bits >>= 8 - bits_reserved_in_first_byte;
     if (bits_left_to_write >= 1) {
-      *p++ = bits >> (8 - bits_reserved_in_first_byte);
+      *p++ = bits;
+      bits >>= 8;
       if (bits_left_to_write >= 9) {
-        *p++ = bits >> (16 - bits_reserved_in_first_byte);
+        *p++ = bits;
+        bits >>= 8;
       }
     }
     assert(n_bits <= 25);
-    *p = bits >> (24 - bits_reserved_in_first_byte);
+    *p = bits;
     bw->bit_pos_ += n_bits;
   }
-#endif  // BIG_ENDIAN
+#endif
   if ((bw->bit_pos_ >> 3) > (bw->max_bytes_ - 8)) {
-    const size_t kAdditionalBuffer = 32768 + bw->max_bytes_;
-    if (!VP8LBitWriterResize(bw, kAdditionalBuffer)) {
+    const uint64_t extra_size = 32768ULL + bw->max_bytes_;
+    if (extra_size != (size_t)extra_size ||
+        !VP8LBitWriterResize(bw, (size_t)extra_size)) {
       bw->bit_pos_ = 0;
       bw->error_ = 1;
     }
diff --git a/src/utils/color_cache.c b/src/utils/color_cache.c
index 1bb360f1..560f81db 100644
--- a/src/utils/color_cache.c
+++ b/src/utils/color_cache.c
@@ -12,6 +12,7 @@
 #include <assert.h>
 #include <stdlib.h>
 #include "./color_cache.h"
+#include "../utils/utils.h"
 
 #if defined(__cplusplus) || defined(c_plusplus)
 extern "C" {
@@ -21,12 +22,11 @@ extern "C" {
 // VP8LColorCache.
 
 int VP8LColorCacheInit(VP8LColorCache* const cc, int hash_bits) {
-  int hash_size;
+  const int hash_size = 1 << hash_bits;
   assert(cc != NULL);
-
-  if (hash_bits == 0) hash_bits = 1;
-  hash_size = 1 << hash_bits;
-  cc->colors_ = (uint32_t*)calloc(hash_size, sizeof(*cc->colors_));
+  assert(hash_bits > 0);
+  cc->colors_ = (uint32_t*)WebPSafeCalloc((uint64_t)hash_size,
+                                          sizeof(*cc->colors_));
   if (cc->colors_ == NULL) return 0;
   cc->hash_shift_ = 32 - hash_bits;
   return 1;
diff --git a/src/utils/huffman.c b/src/utils/huffman.c
index 0ac8248e..41529cc9 100644
--- a/src/utils/huffman.c
+++ b/src/utils/huffman.c
@@ -12,6 +12,7 @@
 #include <assert.h>
 #include <stdlib.h>
 #include "./huffman.h"
+#include "../utils/utils.h"
 #include "../webp/format_constants.h"
 
 #if defined(__cplusplus) || defined(c_plusplus)
@@ -49,8 +50,8 @@ static int TreeInit(HuffmanTree* const tree, int num_leaves) {
   // Note that a Huffman tree is a full binary tree; and in a full binary tree
   // with L leaves, the total number of nodes N = 2 * L - 1.
   tree->max_nodes_ = 2 * num_leaves - 1;
-  tree->root_ =
-      (HuffmanTreeNode*)malloc(tree->max_nodes_ * sizeof(*tree->root_));
+  tree->root_ = (HuffmanTreeNode*)WebPSafeMalloc((uint64_t)tree->max_nodes_,
+                                                 sizeof(*tree->root_));
   if (tree->root_ == NULL) return 0;
   TreeNodeInit(tree->root_);  // Initialize root.
   tree->num_nodes_ = 1;
@@ -173,7 +174,8 @@ int HuffmanTreeBuildImplicit(HuffmanTree* const tree,
     int ok = 0;
 
     // Get Huffman codes from the code lengths.
-    int* const codes = (int*)malloc(code_lengths_size * sizeof(*codes));
+    int* const codes =
+        (int*)WebPSafeMalloc((uint64_t)code_lengths_size, sizeof(*codes));
     if (codes == NULL) goto End;
 
     if (!HuffmanCodeLengthsToCodes(code_lengths, code_lengths_size, codes)) {
diff --git a/src/utils/huffman_encode.c b/src/utils/huffman_encode.c
index a78874fc..8ccd291d 100644
--- a/src/utils/huffman_encode.c
+++ b/src/utils/huffman_encode.c
@@ -13,6 +13,7 @@
 #include <stdlib.h>
 #include <string.h>
 #include "./huffman_encode.h"
+#include "../utils/utils.h"
 #include "../webp/format_constants.h"
 
 // -----------------------------------------------------------------------------
@@ -196,7 +197,7 @@ static int GenerateOptimalTree(const int* const histogram, int histogram_size,
   // population and all the inserted nodes combining two existing nodes.
   // The tree pool needs 2 * (tree_size_orig - 1) entities, and the
   // tree needs exactly tree_size_orig entities.
-  tree = (HuffmanTree*)malloc(3 * tree_size_orig * sizeof(*tree));
+  tree = (HuffmanTree*)WebPSafeMalloc(3ULL * tree_size_orig, sizeof(*tree));
   if (tree == NULL) return 0;
   tree_pool = tree + tree_size_orig;
 
diff --git a/src/utils/utils.c b/src/utils/utils.c
new file mode 100644
index 00000000..673b7e28
--- /dev/null
+++ b/src/utils/utils.c
@@ -0,0 +1,44 @@
+// Copyright 2012 Google Inc. All Rights Reserved.
+//
+// This code is licensed under the same terms as WebM:
+//  Software License Agreement:  http://www.webmproject.org/license/software/
+//  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
+// -----------------------------------------------------------------------------
+//
+// Misc. common utility functions
+//
+// Author: Skal (pascal.massimino@gmail.com)
+
+#include <stdlib.h>
+#include "./utils.h"
+
+#if defined(__cplusplus) || defined(c_plusplus)
+extern "C" {
+#endif
+
+//------------------------------------------------------------------------------
+// Checked memory allocation
+
+static int CheckSizeArguments(uint64_t nmemb, size_t size) {
+  const uint64_t total_size = nmemb * size;
+  if (nmemb == 0) return 1;
+  if ((uint64_t)size > WEBP_MAX_ALLOCABLE_MEMORY / nmemb) return 0;
+  if (total_size != (size_t)total_size) return 0;
+  return 1;
+}
+
+void* WebPSafeMalloc(uint64_t nmemb, size_t size) {
+  if (!CheckSizeArguments(nmemb, size)) return NULL;
+  return malloc((size_t)(nmemb * size));
+}
+
+void* WebPSafeCalloc(uint64_t nmemb, size_t size) {
+  if (!CheckSizeArguments(nmemb, size)) return NULL;
+  return calloc((size_t)nmemb, size);
+}
+
+//------------------------------------------------------------------------------
+
+#if defined(__cplusplus) || defined(c_plusplus)
+}    // extern "C"
+#endif
diff --git a/src/utils/utils.h b/src/utils/utils.h
new file mode 100644
index 00000000..a0347625
--- /dev/null
+++ b/src/utils/utils.h
@@ -0,0 +1,44 @@
+// Copyright 2012 Google Inc. All Rights Reserved.
+//
+// This code is licensed under the same terms as WebM:
+//  Software License Agreement:  http://www.webmproject.org/license/software/
+//  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
+// -----------------------------------------------------------------------------
+//
+// Misc. common utility functions
+//
+// Author: Skal (pascal.massimino@gmail.com)
+
+#ifndef WEBP_UTILS_UTILS_H_
+#define WEBP_UTILS_UTILS_H_
+
+#include "../webp/types.h"
+
+#if defined(__cplusplus) || defined(c_plusplus)
+extern "C" {
+#endif
+
+//------------------------------------------------------------------------------
+// Memory allocation
+
+// This is the maximum memory amount that libwebp will ever try to allocate.
+#define WEBP_MAX_ALLOCABLE_MEMORY (1ULL << 40)
+
+// size-checking safe malloc/calloc: verify that the requested size is not too
+// large, or return NULL. You don't need to call these for constructs like
+// malloc(sizeof(foo)), but only if there's picture-dependent size involved
+// somewhere (like: malloc(num_pixels * sizeof(*something))). That's why this
+// safe malloc() borrows the signature from calloc(), pointing at the dangerous
+// underlying multiply involved.
+void* WebPSafeMalloc(uint64_t nmemb, size_t size);
+// Note that WebPSafeCalloc() expects the second argument type to be 'size_t'
+// in order to favor the "calloc(num_foo, sizeof(foo))" pattern.
+void* WebPSafeCalloc(uint64_t nmemb, size_t size);
+
+//------------------------------------------------------------------------------
+
+#if defined(__cplusplus) || defined(c_plusplus)
+}    // extern "C"
+#endif
+
+#endif  /* WEBP_UTILS_UTILS_H_ */
diff --git a/src/webp/decode.h b/src/webp/decode.h
index 3c007c52..43b6c58f 100644
--- a/src/webp/decode.h
+++ b/src/webp/decode.h
@@ -18,7 +18,7 @@
 extern "C" {
 #endif
 
-#define WEBP_DECODER_ABI_VERSION 0x0100    // MAJOR(8b) + MINOR(8b)
+#define WEBP_DECODER_ABI_VERSION 0x0200    // MAJOR(8b) + MINOR(8b)
 
 // Return the decoder's version number, packed in hexadecimal using 8bits for
 // each of major/minor/revision. E.g: v2.5.7 is 0x020507.
@@ -233,7 +233,7 @@ typedef enum {
 //
 //     // The above call decodes the current available buffer.
 //     // Part of the image can now be refreshed by calling to
-//     // WebPIDecGetRGB()/WebPIDecGetYUV() etc.
+//     // WebPIDecGetRGB()/WebPIDecGetYUVA() etc.
 //   }
 //   WebPIDelete(idec);
 
@@ -260,9 +260,18 @@ WEBP_EXTERN(WebPIDecoder*) WebPINewRGB(
 // will output the raw luma/chroma samples into a preallocated planes. The luma
 // plane is specified by its pointer 'luma', its size 'luma_size' and its stride
 // 'luma_stride'. Similarly, the chroma-u plane is specified by the 'u',
-// 'u_size' and 'u_stride' parameters, and the chroma-v plane by 'v', 'v_size'
-// and 'v_size'.
+// 'u_size' and 'u_stride' parameters, and the chroma-v plane by 'v', 'v_size'
+// and 'v_stride'. The same goes for the alpha plane. NULL can be passed for
+// the 'a' pointer if one is not interested in the transparency plane.
 // Returns NULL if the allocation failed.
+WEBP_EXTERN(WebPIDecoder*) WebPINewYUVA(
+    uint8_t* luma, size_t luma_size, int luma_stride,
+    uint8_t* u, size_t u_size, int u_stride,
+    uint8_t* v, size_t v_size, int v_stride,
+    uint8_t* a, size_t a_size, int a_stride);
+
+// Deprecated version of the above, without the alpha plane.
+// Kept for backward compatibility.
 WEBP_EXTERN(WebPIDecoder*) WebPINewYUV(
     uint8_t* luma, size_t luma_size, int luma_stride,
     uint8_t* u, size_t u_size, int u_stride,
@@ -296,12 +305,22 @@ WEBP_EXTERN(uint8_t*) WebPIDecGetRGB(
     const WebPIDecoder* idec, int* last_y,
     int* width, int* height, int* stride);
 
-// Same as above function to get YUV image. Returns pointer to the luma plane
-// or NULL in case of error.
-WEBP_EXTERN(uint8_t*) WebPIDecGetYUV(
+// Same as the above function, but to get a YUVA image. Returns a pointer to
+// the luma plane, or NULL in case of error. If there is no alpha information,
+// the alpha pointer '*a' will be set to NULL.
+WEBP_EXTERN(uint8_t*) WebPIDecGetYUVA(
     const WebPIDecoder* idec, int* last_y,
-    uint8_t** u, uint8_t** v,
-    int* width, int* height, int* stride, int* uv_stride);
+    uint8_t** u, uint8_t** v, uint8_t** a,
+    int* width, int* height, int* stride, int* uv_stride, int* a_stride);
+
+// Deprecated alpha-less version of WebPIDecGetYUVA(): it will ignore the
+// alpha information (if present). Kept for backward compatibility.
+static WEBP_INLINE uint8_t* WebPIDecGetYUV(
+    const WebPIDecoder* idec, int* last_y, uint8_t** u, uint8_t** v,
+    int* width, int* height, int* stride, int* uv_stride) {
+  return WebPIDecGetYUVA(idec, last_y, u, v, NULL, width, height,
+                         stride, uv_stride, NULL);
+}
 
 // Generic call to retrieve information about the displayable area.
 // If non NULL, the left/right/width/height pointers are filled with the visible
diff --git a/src/webp/encode.h b/src/webp/encode.h
index d2857659..2e37cfab 100644
--- a/src/webp/encode.h
+++ b/src/webp/encode.h
@@ -18,7 +18,7 @@
 extern "C" {
 #endif
 
-#define WEBP_ENCODER_ABI_VERSION 0x0100    // MAJOR(8b) + MINOR(8b)
+#define WEBP_ENCODER_ABI_VERSION 0x0200    // MAJOR(8b) + MINOR(8b)
 
 // Return the encoder's version number, packed in hexadecimal using 8bits for
 // each of major/minor/revision. E.g: v2.5.7 is 0x020507.
@@ -69,7 +69,9 @@ WEBP_EXTERN(size_t) WebPEncodeLosslessBGRA(const uint8_t* bgra,
 typedef enum {
   WEBP_HINT_DEFAULT = 0,  // default preset.
   WEBP_HINT_PICTURE,      // digital picture, like portrait, inner shot
-  WEBP_HINT_PHOTO         // outdoor photograph, with natural lighting
+  WEBP_HINT_PHOTO,        // outdoor photograph, with natural lighting
+  WEBP_HINT_GRAPH,        // Discrete tone image (graph, map tile, etc.).
+  WEBP_HINT_LAST
 } WebPImageHint;
 
 typedef struct {
@@ -157,7 +159,7 @@ typedef struct WebPPicture WebPPicture;   // main structure for I/O
 typedef struct {
   int coded_size;         // final size
 
-  float PSNR[4];          // peak-signal-to-noise ratio for Y/U/V/All
+  float PSNR[5];          // peak-signal-to-noise ratio for Y/U/V/All/Alpha
   int block_count[3];     // number of intra4/intra16/skipped macroblocks
   int header_bytes[2];    // approximate number of bytes spent for header
                           // and mode-partition #0
@@ -170,10 +172,16 @@ typedef struct {
   int alpha_data_size;    // size of the transparency data
   int layer_data_size;    // size of the enhancement layer data
 
-  void* user_data;        // this field is free to be set to any value and
-                          // used during callbacks (like progress-report e.g.).
+  // lossless encoder statistics
+  uint32_t lossless_features;  // bit0:predictor bit1:cross-color transform
+                               // bit2:subtract-green bit3:color indexing
+  int histogram_bits;          // number of precision bits of histogram
+  int transform_bits;          // precision bits for transform
+  int cache_bits;              // number of bits for color cache lookup
+  int palette_size;            // number of colors in palette, if used
+  int lossless_size;           // final lossless size
 
-  uint32_t pad[6];        // padding for later use
+  uint32_t pad[4];        // padding for later use
 } WebPAuxStats;
 
 // Signature for output function. Should return true if writing was successful.
@@ -290,6 +298,9 @@ struct WebPPicture {
   // If not NULL, report progress during encoding.
   WebPProgressHook progress_hook;
 
+  void* user_data;        // this field is free to be set to any value and
+                          // used during callbacks (e.g. for progress reports).
+
   uint32_t pad3[3];       // padding for later use
 
   // Unused for now: original samples (for non-YUV420 modes)