update ChangeLog

Change-Id: If51472e72adaec0a198a8b09becb8be192153ca8
update NEWS description with new general features
2025-07-15 21:39:59 +02:00 · 2013-12-20 00:49:40 -08:00 · 2013-12-20 00:36:47 -08:00 · 2013-12-20 00:06:33 -08:00 · 2013-12-19 18:52:48 -08:00 · 2013-12-19 10:17:08 -08:00
153 changed files with 25079 additions and 7848 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -1,8 +1,9 @@
 *.l[ao]
-*.o
+*.[ao]
 .deps
 .libs
 /aclocal.m4
+/ar-lib
 /autom4te.cache
 /compile
 /config.*
@@ -16,8 +17,12 @@
 /stamp-h1
 Makefile
 Makefile.in
-examples/[cd]webp
+examples/[cdv]webp
+examples/gif2webp
+examples/webpmux
 /output
 /doc/output
 *.idb
 *.pdb
+/iosbuild
+/WebP.framework
--- a/.mailmap
+++ b/.mailmap
@@ -1,2 +1,7 @@
+<johann.koenig@duck.com> <johannkoenig@google.com>
 Mikołaj Zalewski <mikolajz@google.com>
 Pascal Massimino <pascal.massimino@gmail.com>
+<pascal.massimino@gmail.com> <skal@google.com>
+Vikas Arora <vikasa@google.com>
+<vikasa@google.com> <vikasa@gmail.com>
+<vikasa@google.com> <vikaas.arora@gmail.com>
--- a/AUTHORS
+++ b/AUTHORS
@@ -1,8 +1,12 @@
 Contributors:
+- Charles Munger (clm at google dot com)
+- Christian Duvivier (cduvivier at google dot com)
 - James Zern (jzern at google dot com)
 - Jan Engelhardt (jengelh at medozas dot de)
-- Johann (johannkoenig at google dot com)
+- Johann (johann dot koenig at duck dot com)
+- Jyrki Alakuijala (jyrki at google dot com)
 - Lou Quillio (louquillio at google dot com)
+- Mans Rullgard (mans at mansr dot com)
 - Martin Olsson (mnemo at minimum dot se)
 - Mikołaj Zalewski (mikolajz at google dot com)
 - Noel Chromium (noel at chromium dot org)
--- a/Android.mk
+++ b/Android.mk
@@ -1,6 +1,14 @@
 LOCAL_PATH := $(call my-dir)

+WEBP_CFLAGS := -Wall -DANDROID -DHAVE_MALLOC_H -DHAVE_PTHREAD -DWEBP_USE_THREAD
+
+ifeq ($(APP_OPTIM),release)
+  WEBP_CFLAGS += -finline-functions -frename-registers -ffast-math -s \
+                 -ffunction-sections -fdata-sections
+endif
+
 include $(CLEAR_VARS)
+
 LOCAL_SRC_FILES := \
    src/dec/alpha.c \
    src/dec/buffer.c \
@@ -35,9 +43,11 @@ LOCAL_SRC_FILES := \
    src/enc/picture.c \
    src/enc/quant.c \
    src/enc/syntax.c \
+    src/enc/token.c \
    src/enc/tree.c \
    src/enc/vp8l.c \
    src/enc/webpenc.c \
+    src/utils/alpha_processing.c \
    src/utils/bit_reader.c \
    src/utils/bit_writer.c \
    src/utils/color_cache.c \
@@ -45,21 +55,25 @@ LOCAL_SRC_FILES := \
    src/utils/huffman.c \
    src/utils/huffman_encode.c \
    src/utils/quant_levels.c \
+    src/utils/quant_levels_dec.c \
+    src/utils/random.c \
    src/utils/rescaler.c \
    src/utils/thread.c \
+    src/utils/utils.c \

-LOCAL_CFLAGS := -Wall -DANDROID -DHAVE_MALLOC_H -DHAVE_PTHREAD \
-                -DNOT_HAVE_LOG2 -DWEBP_USE_THREAD \
-                -finline-functions -frename-registers -ffast-math \
-                -s -fomit-frame-pointer -Isrc/webp
-
+LOCAL_CFLAGS := $(WEBP_CFLAGS)
 LOCAL_C_INCLUDES += $(LOCAL_PATH)/src

+# prefer arm over thumb mode for performance gains
+LOCAL_ARM_MODE := arm
+
 ifeq ($(TARGET_ARCH_ABI),armeabi-v7a)
  # Setting LOCAL_ARM_NEON will enable -mfpu=neon which may cause illegal
  # instructions to be generated for armv7a code. Instead target the neon code
  # specifically.
  LOCAL_SRC_FILES += src/dsp/dec_neon.c.neon
+  LOCAL_SRC_FILES += src/dsp/upsampling_neon.c.neon
+  LOCAL_SRC_FILES += src/dsp/enc_neon.c.neon
 endif
 LOCAL_STATIC_LIBRARIES := cpufeatures

@@ -67,4 +81,18 @@ LOCAL_MODULE:= webp

 include $(BUILD_STATIC_LIBRARY)

+include $(CLEAR_VARS)
+
+LOCAL_SRC_FILES := \
+    examples/dwebp.c \
+    examples/example_util.c \
+
+LOCAL_CFLAGS := $(WEBP_CFLAGS)
+LOCAL_C_INCLUDES := $(LOCAL_PATH)/src
+LOCAL_STATIC_LIBRARIES := webp
+
+LOCAL_MODULE := dwebp
+
+include $(BUILD_EXECUTABLE)
+
 $(call import-module,android/cpufeatures)
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,903 @@
+256e433 update NEWS description with new general features
+2962534 Merge "gif2webp: don't use C99 %zu" into 0.4.0
+3b9f9dd gif2webp: don't use C99 %zu
+b5b2e3c cwebp: fix metadata output w/lossy+alpha
+ad26df1 makefile.unix: clean up libgif2webp_util.a
+c3b4557 update Changelog
+ca84112 Merge "bump version to 0.4.0" into 0.4.0
+8c524db bump version to 0.4.0
+eec2398 update AUTHORS & .mailmap
+b9bbf6a update NEWS for 0.4.0
+c72e081 Merge "dec/webp.c: don't wait for data before reporting w/h"
+5ad6531 dec/frame.c: fix formatting
+f7fc4bc dec/webp.c: don't wait for data before reporting w/h
+66a32af Merge "NEON speed up"
+26d842e NEON speed up
+f307f98 Merge "webpmux: let -- stop parameter parsing"
+fe051da Merge "README: add a section on gif2webp"
+6fd2bd6 Merge "manpage pedantry"
+4af1900 README: add a section on gif2webp
+6f36ade manpage pedantry
+f9016cb README: update dwebp options
+b4fa0a4 webpmux: let -- stop parameter parsing
+a9a20ac gif2webp: Add a multi-threaded encode option
+495bef4 fix bug in TrellisQuantize
+605a712 simplify __cplusplus ifdef
+33109f9 Merge "drop: ifdef __cplusplus checks from C files"
+7f9de0b Merge changes I994a5587,I8467bb71,I13b50688,I1e2c9c7b
+5459030 gif2webp: let -- stop parameter parsing
+a4b0aa0 vwebp: let -- stop parameter parsing
+98af68f cwebp: let -- stop parameter parsing
+a33831e dwebp: let -- stop parameter parsing
+3630124 add some checks on error paths
+ce4c713 Merge "autoconf: add --disable-wic"
+5227d99 drop: ifdef __cplusplus checks from C files
+f645355 dwebp.1: fix typo
+f91034f Merge "cwebp: print metadata stats when no output file is given"
+d493455 gif2webp: Backward compatibility for giflib version <= 4.1.3
+4c617d3 gif2webp: Disable output of ICC profile by default
+73b731f introduce a special quantization function for WHT
+41c0cc4 Make Forward WHT transform use 32bit fixed-point calculation
+a3359f5 Only compute quantization params once
+7049043 cwebp: print metadata stats when no output file is given
+d513bb6 * fix off-by-one zthresh calculation * remove the sharpening for non luma-AC coeffs * adjust the bias a little bit to compensate for this
+ad9dec0 Merge "cosmetics: dwebp: fix local function name format"
+f737f03 Merge "dwebp: remove a dead store"
+3c3a70d Merge "makefile.unix: install binaries in $(DESTDIR)/bin/"
+150b655 Merge "Android.mk: add some release compile flags"
+dbebd33 cosmetics: dwebp: fix local function name format
+2774995 dwebp: remove a dead store
+a01e04f autoconf: add --disable-wic
+5009b22 makefile.unix: install binaries in $(DESTDIR)/bin/
+bab30fc Merge "fix -print_psnr / ssim options"
+ebef7fb fix -print_psnr / ssim options
+cb63785 Merge "fix bug due to overzealous check in WebPPictureYUVAToARGB()"
+8189885 Merge "EstimateBestFilter: use an int to iterate WEBP_FILTER_TYPE"
+4ad7d33 Android.mk: add some release compile flags
+c12e236 cosmetics: fix a few typos
+6f10403 fix bug due to overzealous check in WebPPictureYUVAToARGB()
+3f6c35c EstimateBestFilter: use an int to iterate WEBP_FILTER_TYPE
+cc55790 Merge changes I8bb7a4dc,I2c180051,I021a014f,I8a224a62
+c536afb Merge "cosmetics: fix some typos"
+cbdd3e6 add a -dither dithering option to the decoder
+e812401 Updated iosbuild.sh for XCode 5.x
+4931c32 cosmetics: fix some typos
+05aacf7 mux: add some missing casts
+617d934 enc/vp8l: add a missing cast
+46db286 idec: add some missing casts
+b524e33 ErrorStatusLossless: correct return type
+cb261f7 fix a descaling bug for vertical/horizontal U/V interpolation
+bcb3955 Merge changes I48968468,I181bc736
+73f5213 gif2webp: Add a mixed compression mode
+6198715 demux: split chunk parsing from ParseVP8X
+d2e3f4e demux: add a tail pointer for chunks
+87cffcc demux: cosmetics: s/has_frames/is_animation/
+e18e667 demux: strictly enforce the animation flag
+c4f39f4 demux: cosmetics: remove a useless break
+61cb884 demux: (non-exp) fail if the fragmented flag is set
+ff379db few % speedup of lossless encoding
+df3649a remove all disabled code related to P-frames
+6d0cb3d Merge "gif2webp: kmin = 0 should suppress key-frame addition."
+3655598 gif2webp: kmin = 0 should suppress key-frame addition.
+7708e60 Merge "detect flatness in blocks and favor DC prediction"
+06b1503 Merge "add comment about the kLevelsFromDelta[][] LUT generation"
+5935259 add comment about the kLevelsFromDelta[][] LUT generation
+e3312ea detect flatness in blocks and favor DC prediction
+ebc9b1e Merge "VPLBitReader bugfix: Catch error if bit_pos > LBITS too."
+96ad0e0 VPLBitReader bugfix: Catch error if bit_pos > LBITS too.
+a014e9c tune quantization biases toward higher precision
+1e89861 add helpful PrintBlockInfo() function
+596a6d7 make use of 'extern' consistent in function declarations
+c8d48c6 Merge "extract random utils to their own file util/random.[ch]"
+98aa33c extract random utils to their own file util/random.[ch]
+432a723 Merge "swig: add basic go bindings"
+fab618b Merge "rename libwebp.i -> libwebp.swig"
+e4e7fcd swig: add basic go bindings
+d340872 Merge "fast auto-determined filtering strength"
+f8bfd5c fast auto-determined filtering strength
+ac0bf95 small clean-up in ExpandMatrix()
+1939607 rename libwebp.i -> libwebp.swig
+43148b6 filtering: precompute ilimit and hev_threshold
+18f992e simplify f_inner calculation a little
+241d11f add missing const
+86c0031 add a 'format' field to WebPBitstreamFeatures
+dde91fd Demux: Correct the extended format validation
+5d6c5bd add entry for '-resize' option in cwebp's man
+7c098d1 Use some gamma-curve range compression when computing U/V average
+0b2b050 Use deterministic random-dithering during RGB->YUV conversion
+8a2fa09 Add a second multi-thread method
+7d6f2da Merge "up to 20% faster multi-threaded decoding"
+266f63e Merge "libwebp.jar: build w/Java 1.6 for Android compat"
+0532149 up to 20% faster multi-threaded decoding
+38efdc2 Simplify the gif2webp tool: move the optimization details to util
+de89951 libwebp.jar: build w/Java 1.6 for Android compat
+cb22155 Decode a full row of bitstream before reconstructing
+dca8a4d Merge "NEON/simple loopfilter: avoid q4-q7 registers"
+9e84d90 Merge "NEON/TransformWHT: avoid q4-q7 registers"
+fc10249 NEON/simple loopfilter: avoid q4-q7 registers
+2f09d63 NEON/TransformWHT: avoid q4-q7 registers
+77585a2 Merge "use a macrofunc for setting NzCoeffs bits"
+d155507 Merge "use HINT_GRAPH as image_hint for gif source"
+9c56164 Merge "only print GIF_DISPOSE_WARNING once"
+0587986 use HINT_GRAPH as image_hint for gif source
+0b28d7a use a macrofunc for setting NzCoeffs bits
+f9bbc2a Special-case sparse transform
+0012519 gif2webp: detect and flatten uniformly similar blocks
+0deaf0f only print GIF_DISPOSE_WARNING once
+6a8c0eb Merge "small optimization in segment-smoothing loop"
+f7146bc small optimization in segment-smoothing loop
+5a7533c small gif2webp fix
+4df0c89 Merge changes Ic697660c,I27285521
+5b2e6bd Android.mk: add a dwebp target
+f910a84 Android.mk: update build flags
+63f9aba special-case WHT transform when there's only DC
+80911ae Merge "7-8% faster decoding by rewriting GetCoeffs()"
+606c430 gif2webp: Improved compression for lossy animated WebP
+fb887f7 gif2webp: Different kmin/kmax defaults for lossy and lossless
+2a98136 7-8% faster decoding by rewriting GetCoeffs()
+92d47e4 improve VP8L signature detection by checking the version bits too
+5cd43e4 Add -incremental option to dwebp
+54b8e3f webpmux: DisplayInfo(): remove unnecessary error checks.
+40ae352 fix memleak in WebPIDelete()
+d966265 mux.h doc: WebPMuxGetFrame() can return WEBP_MUX_MEMORY_ERROR too.
+0e6747f webpmux -info: display dimensions and has_alpha per frame
+d78a82c Sanity check for underflow
+8498f4b Merge "remove -Wshadow warnings"
+e89c6fc Avoid a potential memleak
+3ebe175 Merge "break down the proba 4D-array into some handy structs"
+6a44550 break down the proba 4D-array into some handy structs
+2f5e893 remove -Wshadow warnings
+bf3a29b Merge "add proper WEBP_HAVE_GIF and WEBP_HAVE_GL flags"
+2b0a759 Merge "fix some warnings from static analysis"
+22dd07c mux.h: Some doc corrections
+79ff034 add proper WEBP_HAVE_GIF and WEBP_HAVE_GL flags
+d51f45f fix some warnings from static analysis
+d134307 fix conversion warning on MSVC
+d538cea gif2webp: Support a 'min' and 'max'  key frame interval
+80b54e1 allow search with token buffer loop and fix PARTITION0 problem
+b7d4e04 add VP8EstimateTokenSize()
+10fddf5 enc/quant.c: silence a warning
+399cd45 Merge "fix compile error on ARM/gcc"
+9f24519 encoder: misc rate-related fixes
+c663bb2 Merge "simplify VP8IteratorSaveBoundary() arg passing"
+fa46b31 Demux.h: Correct a method name reference
+f8398c9 fix compile error on ARM/gcc
+f691f0e simplify VP8IteratorSaveBoundary() arg passing
+42542be up to 6% faster encoding with clang compiler
+93402f0 multi-threaded segment analysis
+7e2d659 Merge "remove the PACK() bit-packing tricks"
+c13fecf remove the PACK() bit-packing tricks
+2fd091c Merge "use NULL for lf_stats_ testing, not bool"
+b11c9d6 dwebp: use default dct_method
+4bb8465 Merge "(de)mux.h: wrap pseudo-code in /* */"
+cfb56b1 make -pass option work with token buffers
+5416aab (de)mux.h: wrap pseudo-code in /* */
+35dba33 use NULL for lf_stats_ testing, not bool
+733a7fa enc->Iterator memory cleanup
+e81fac8 Add support for "no blend" in webpmux binary
+3b80bc4 gif2webp: Separate out each step into a method
+bef7e9c Add doc precision about demux object keeping pointers to data.
+61405a1 dwebp: enable stdout output with WIC
+6eabb88 Merge "Animated WebP: add "do no blend" option to spec"
+be20dec fix compilation for BITS 24
+e58cc13 Merge "dwebp: s/unsigned char/uint8_t/"
+72501d4 dwebp: s/unsigned char/uint8_t/
+2c9633e Merge "gif2webp: Insert independent frames at regular intervals."
+f0d6a14 gif2webp: Insert independent frames at regular intervals.
+b25a6fb yuv.h: fix indent
+ede3602 Merge "cosmetics: fix indent"
+3a65122 dwebp: fix stdout related output
+388a724 cosmetics: fix indent
+4c7322c Merge "dsp: msvc compatibility"
+d50c7e3 Merge "5-7% faster SSE2 versions of YUV->RGB conversion functions"
+b8ab784 Merge "simplify upsampler calls: only allow 'bottom' to be NULL"
+df6cebf 5-7% faster SSE2 versions of YUV->RGB conversion functions
+ad6ac32 simplify upsampler calls: only allow 'bottom' to be NULL
+a5e8afa output to stdout if file name is "-"
+f358450 dsp: msvc compatibility
+43a7c8e Merge "cosmetics"
+4c5f19c Merge "bit_reader.h: cosmetics"
+f72fab7 cosmetics
+14dd5e7 fix const-ness
+b20aec4 Merge "Support for 'do not blend' option in vwebp"
+dcf6522 Support for 'do not blend' option in vwebp
+d5bad03 Animated WebP: add "do no blend" option to spec
+a2f5f73 Merge "Support for "Do not blend" in mux and demux libraries"
+e081f2f Pack code & extra_bits to Struct (VP8LPrefixCode).
+6284854 Support for "Do not blend" in mux and demux libraries
+f486aaa Merge "slightly faster ParseIntraMode"
+d171863 slightly faster ParseIntraMode
+3ceca8a bit_reader.h: cosmetics
+69257f7 Create LUT for PrefixEncode.
+988b708 add WebPWorkerExecute() for convenient bypass
+06e2498 Merge "VP8EncIterator clean-up"
+de4d4ad VP8EncIterator clean-up
+7bbe952 Merge "cosmetics: thread.c: drop a redundant comment"
+da41148 cosmetics: thread.c: drop a redundant comment
+feb4b6e thread.h: #ifdef when checking WEBP_USE_THREAD
+8924a3a thread.c: drop WebPWorker prefix from static funcs
+1aed8f2 Merge "fix indent"
+4038ed1 fix indent
+1693fd9 Demux: A new state WEBP_DEMUX_PARSE_ERROR
+8dcae8b fix rescaling-with-alpha inaccuracy
+11249ab Merge changes I9b4dc36c,I4e0eef4d
+52508a1 Mux: support parsing unknown chunks within a frame/fragment.
+05db057 WebPMuxSetChunk: remove unused variable
+8ba1bf6 Stricter check for presence of alpha when writing lossless images
+a03c351 Demux: WebPIterator now also denotes if the frame has alpha.
+6df743a Decoder: handle fragments case correctly too.
+faa4b07 Support for unknown chunks in mux library
+7d60bbc Speed up HashChainFindCopy function.
+6674014 Speedup Alpha plane encoding.
+b7346a1 0.1 % speedup to decoding
+c606182 webp-container-spec: Tighten language added by last
+a34a502 pngdec: output error messages from libpng
+e84c625 Merge "Detect canvas and image size mismatch in decoder."
+f626fe2 Detect canvas and image size mismatch in decoder.
+f5fbdee demux: stricter image bounds check
+30c8158 add extra assert in Huffman decode code
+8967b9f SSE2 for lossless decoding (critical) functions.
+699d80e Jump-lookup for Huffman coding
+c34307a fix some VS9 warnings about type conversion
+eeada35 pngdec: add missing include
+54b6510 gif2webp: If aligning to even offsets, extra pixels should be transparent
+0bcf5ce Merge "remove a malloc() in case we're using only FILTER_NONE for alpha"
+2c07143 remove a malloc() in case we're using only FILTER_NONE for alpha
+a4d5f59 Faster lossless decoding
+fd53bb7 Merge "alternate LUT-base reverse-bits code"
+d1c166e Merge "Container spec: a clarification on background color."
+fdb9177 Rename a method
+5e96753 Container spec: a clarification on background color.
+30e77d0 Merge branch '0.3.0'
+1b631e2 alternate LUT-base reverse-bits code
+24cc307 ~20% faster lossless decoding
+313d853 Speedup for decoding lossless WebP photographs:
+24ee098 change the bytes_per_pixels_ field into more evocative use_8b_decode
+2a04b03 update ChangeLog (tag: v0.3.1-rc2, tag: v0.3.1)
+7288950 Regression fix for alpha channels using color cache:
+2e377b5 wicdec: silence a format warning
+ad9e42a muxedit: silence some uninitialized warnings
+3307c16 Don't set alpha-channel to 0xff for alpha->green uplift
+5130770 Merge "wicdec: silence a format warning"
+a37eff4 Regression fix for alpha channels using color cache:
+241cf99 Merge "muxedit: silence some uninitialized warnings"
+c8f9c84 Regression fix for alpha unfiltering:
+14cd5c6 muxedit: silence some uninitialized warnings
+a368db8 dec/vp8l: quiet vs9 x64 type conversion warning
+ffae9f3 wicdec: silence a format warning
+8cf0701 Alpha encoding: never filter in case of NO_COMPRESSION
+825e73b update ChangeLog (tag: v0.3.1-rc1)
+abf6f69 update NEWS
+5a92c1a bump version to 0.3.1
+86daf77 store top Y/U/V samples in packed fashion
+67bc353 Revert "add WebPBlendAlpha() function to blend colors against background"
+068db59 Intertwined decoding of alpha and RGB
+38cc011 Simplify forward-WHT + SSE2 version
+3fa595a Support decoding upto given row in DECODE_DATA_FUNC
+520f005 DequantizeLevels(): Add 'row' and 'num_rows' args
+47374b8 Alpha unfilter for given set of rows
+f32097e probe input file and quick-check for WebP format.
+a2aed1d configure: improve gl/glut library test
+c7e89cb update copyright text
+a00380d configure: remove use of AS_VAR_APPEND
+a94a88d fix EXIF parsing in PNG
+a71e5d8 add doc precision for WebPPictureCopy() and WebPPictureView()
+8287012 remove datatype qualifier for vmnv
+e190843 fix a memory leak in gif2webp
+0b18b9e fix two minor memory leaks in webpmux
+db5095d remove some cruft from swig/libwebp.jar
+850e956 README: update swig notes
+bddd9b0 swig/python: add minimal documentation
+d573a8d swig: add python encode support
+6b93187 swig/java: reduce wrapper function code duplication
+6fe536f swig/java: rework uint8_t typemap
+a2ea464 Fix the bug in ApplyPalette.
+7bb28d2 webp/lossless: fix big endian BGRA output
+f036d4b Speed up ApplyPalette for ARGB pixels.
+8112c8c remove some warnings:
+cc128e0 Further reduce memory to decode lossy+alpha images
+07db70d fix for big-endian
+eda8a7d gif2webp: Fix signed/unsigned comparison mismatch
+31f346f Makefile.vc: fix libwebpdemux dll variable typo
+6c76d28 swig: add python (decode) support
+b4f5bb6 swig: cosmetics
+498d4dd WebP-Lossless encoding improvements.
+26e7244 swig: ifdef some Java specific code
+8ecec68 configure: add warning related flags
+e676b04 configure: add GLUT detection; build vwebp
+b0ffc43 Alpha decoding: significantly reduce memory usage
+20aa7a8 configure: add --enable-everything
+b8307cc configure.ac: add some helper macros
+980e7ae Remove the gcc compilation comments
+7f25ff9 gif2webp: Fix ICC and XMP support
+d8e5321 Add missing name to AUTHORS
+11edf5e Demux: Fix a potential memleak
+c7b9218 don't forward declare enums
+7a650c6 prevent signed int overflow in left shift ops
+31bea32 add precision about dynamic output reallocation with IDecoder
+c22877f Add incremental support for extended format files
+5051245 Makefile.vc: have 'all' target build everything
+8191dec Makefile.vc: flags cleanup
+b9d7473 Makefile.vc: drop /FD flag
+5568dbc update gitignore
+f4c7b65 WebPEncode: An additional check. Start VP8EncLoop/VP8EncTokenLoop only if VP8EncStartAlpha succeeded.
+1fb04be pngdec: Avoid a double-free.
+dcbb1ca add WebPBlendAlpha() function to blend colors against background
+bc9f5fb configure.ac: add AM_PROG_AR for automake >= 1.12
+bf867bf Tuned cross_color parameter (step) for lower qual
+90e2ec5 Merge "probe input file and quick-check for WebP format."
+7180d7f Merge "update copyright text"
+830f72b probe input file and quick-check for WebP format.
+2ccf58d configure: improve gl/glut library test
+d640614 update copyright text
+c2113ad Merge "configure: remove use of AS_VAR_APPEND"
+9326a56 configure: remove use of AS_VAR_APPEND
+ea63d61 fix a type warning on VS9 x86
+bec1109 fix EXIF parsing in PNG
+b6e65f3 Merge "fix warnings for vs9 x64"
+438946d fix warnings for vs9 x64
+f4710e3 collect macroblock reconstruction data in VP8MBData struct
+23d28e2 add doc precision for WebPPictureCopy() and WebPPictureView()
+518f2cd cosmetics: gif2webp: fix indent
+af358e6 Merge "remove datatype qualifier for vmnv"
+3fe9163 remove datatype qualifier for vmnv
+764fdff fix a memory leak in gif2webp
+3e59a74 fix two minor memory leaks in webpmux
+47b9862 Merge "README: update swig notes"
+325d15f remove some cruft from swig/libwebp.jar
+4a7627c README: update swig notes
+5da81e3 Merge "swig/python: add minimal documentation"
+f39e08f Merge "swig: add python encode support"
+6ca4a3e Merge "swig/java: reduce wrapper function code duplication"
+8f8702b Merge "swig/java: rework uint8_t typemap"
+91413be reduce memory for VP8MB and remove bitfields use
+7413394 Fix the memory leak in ApplyFilters.
+2053c2c simplify the alpha-filter testing loop
+825b64d swig/python: add minimal documentation
+14677e1 swig: add python encode support
+a5c297c swig/java: reduce wrapper function code duplication
+ad4a367 swig/java: rework uint8_t typemap
+0d25876 use uint8_t for inv_palette[]
+afa3450 Fix the bug in ApplyPalette.
+2d6ac42 Merge "webp/lossless: fix big endian BGRA output"
+2ca8396 webp/lossless: fix big endian BGRA output
+742110c Speed up ApplyPalette for ARGB pixels.
+2451e47 misc code cleanup
+83db404 Merge "swig: add python (decode) support"
+eeeea8b Merge "swig: cosmetics"
+d5f9b8f Merge "libwebp: fix vp8 encoder mem alloc offsetting"
+d8edd83 libwebp: fix vp8 encoder mem alloc offsetting
+8983b83 remove use of bit-fields in VP8FInfo
+87a4fca remove some warnings:
+ba8f74e Merge "fix for big-endian"
+a65067f Merge "Further reduce memory to decode lossy+alpha images"
+64c8448 Further reduce memory to decode lossy+alpha images
+332130b Mux: make a few methods static
+4437061 fix for big-endian
+5199eab Merge "add uncompressed TIFF output support"
+a3aede9 add uncompressed TIFF output support
+f975b67 Merge "gif2webp: Fix signed/unsigned comparison mismatch"
+5fbc734 Merge "GetFeatures: Detect invalid VP8X/VP8/VP8L data"
+d5060c8 Merge "mux.h: A comment fix + some consistency fixes"
+352d0de GetFeatures: Detect invalid VP8X/VP8/VP8L data
+3ef79fe Cosmetic: "width * height"
+043e1ae gif2webp: Fix signed/unsigned comparison mismatch
+5818cff mux.h: A comment fix + some consistency fixes
+1153f88 Merge "swig: ifdef some Java specific code"
+3eeedae Makefile.vc: fix libwebpdemux dll variable typo
+f980faf swig: add python (decode) support
+7f5f42b swig: cosmetics
+8eae188 WebP-Lossless encoding improvements.
+c7247c4 swig: ifdef some Java specific code
+4cb234d Merge "Mux: make ValidateForSingleImage() method static"
+ed6f530 Merge "Add GetCanvasSize() method to mux"
+1d530c9 Mux: make ValidateForSingleImage() method static
+bba4c2b configure: add warning related flags
+fffefd1 Add GetCanvasSize() method to mux
+732da8d Merge "configure: add GLUT detection; build vwebp"
+0e513f7 configure: add GLUT detection; build vwebp
+55d1c15 Merge "Alpha decoding: significantly reduce memory usage"
+13d99fb Merge "configure: add --enable-everything"
+2bf698f Merge "configure.ac: add some helper macros"
+edccd19 Alpha decoding: significantly reduce memory usage
+3cafcc9 configure: add --enable-everything
+4ef1447 configure.ac: add some helper macros
+a4e1cdb Remove the gcc compilation comments
+6393fe4 Cosmetic fixes
+9c4ce97 Simplify forward-WHT + SSE2 version
+878b9da fix missed optim
+0004617 VP8GetInfo(): Check for zero width or height.
+9bf3129 align VP8Encoder::nz_ allocation
+5da165c fix CheckMode() signature
+0ece07d Merge "explicitly pad bitfields to 32-bits"
+9dbc9d1 explicitly pad bitfields to 32-bits
+5369a80 Merge "prevent signed int overflow in left shift ops"
+70e3971 Merge "cosmetics: remove unnecessary ';'s"
+d3136ce Merge "don't forward declare enums"
+b26e5ad gif2webp: Fix ICC and XMP support
+46089b2 Add missing name to AUTHORS
+94328d6 Demux: Fix a potential memleak
+96e948d don't forward declare enums
+f4f9088 prevent signed int overflow in left shift ops
+0261545 cosmetics: remove unnecessary ';'s
+7ebdf11 Merge "Fix few missing comparisons to NULL"
+1579989 Fix few missing comparisons to NULL
+ea1b21c Cleaned up VP8GetHeaders() so that it parses only frame header
+b66caee dwebp: add support for BMP output
+ff885bf add precision about dynamic output reallocation with IDecoder
+79241d5 Merge "Makefile.vc: have 'all' target build everything"
+ac1c729 Merge "Makefile.vc: flags cleanup"
+118a055 Merge "Makefile.vc: drop /FD flag"
+ecad010 Merge "update gitignore"
+a681b4f Rename PRE_VP8 state to WEBP_HEADER
+ead4d47 Add incremental support for extended format files
+69d0f92 Makefile.vc: have 'all' target build everything
+5296749 Makefile.vc: flags cleanup
+c61baf0 Makefile.vc: drop /FD flag
+3a15125 update gitignore
+5167ca4 Merge "WebPEncode: An additional check. Start VP8EncLoop/VP8EncTokenLoop only if VP8EncStartAlpha succeeded."
+67708d6 WebPEncode: An additional check. Start VP8EncLoop/VP8EncTokenLoop only if VP8EncStartAlpha succeeded.
+b68912a pngdec: Avoid a double-free.
+82abbe1 Merge "configure.ac: add AM_PROG_AR for automake >= 1.12"
+e7d9548 add WebPBlendAlpha() function to blend colors against background
+ed4dc71 configure.ac: add AM_PROG_AR for automake >= 1.12
+df4a406 Merge branch '0.3.0'
+1e0d4b8 Update ChangeLog (tag: v0.3.0-rc7, tag: v0.3.0)
+d52b405 Cosmetic fixes
+6cb4a61 misc style fix
+68111ab add missing YUVA->ARGB automatic conversion in WebPEncode()
+e9a7990 Cosmetic fixes
+403bfe8 Container spec: Clarify frame disposal
+2aaa423 Merge "add missing YUVA->ARGB automatic conversion in WebPEncode()"
+07d87bd add missing YUVA->ARGB automatic conversion in WebPEncode()
+142c462 misc style fix
+3e7a13a Merge "Container spec: clarify the background color field" into 0.3.0
+14af774 container doc: add a note about the 'ANMF' payload
+cc635ef Container spec: clarify the background color field
+e3e3394 container doc: move RIFF description to own section
+4299f39 libwebp/mux: fix double free
+33f9a69 Merge "demux: keep a frame tail pointer; used in AddFrame" into 0.3.0
+a2a7b95 use WebPDataCopy() instead of re-coding it.
+6f18f12 demux: keep a frame tail pointer; used in AddFrame
+e5af49e add doc precision about WebPParseHeaders() return codes
+db46daa Merge "Makefile.vc: fix dynamic builds" into 0.3.0
+53c77af Merge "gif2webp: Bgcolor fix for a special case" into 0.3.0
+a5ebd14 gif2webp: Bgcolor fix for a special case
+6378f23 Merge "vwebp/animation: fix background dispose" into 0.3.0
+3c8eb9a fix bad saturation order in QuantizeBlock
+04c7a2e vwebp/animation: fix background dispose
+81a5069 Makefile.vc: fix dynamic builds
+5f25c39 update ChangeLog (tag: v0.3.0-rc6)
+14d42af examples: don't use C99 %zu
+5ccf1fe update ChangeLog
+2560c24 update NEWS
+f43bafc Merge changes Iecccb09c,If5ee9fd2,I3e181ce4 into 0.3.0
+a788644 dwebp: warn when decoding animated webp's
+302efcd Decode: return more meaningful error for animation
+ad45273 WebPBitstreamFeatures: add has_animation field
+783dfa4 disable FRGM decoding for good in libwebpmux
+4b956be Update ChangeLog
+ad8b86d update NEWS
+3e084f6 Merge "demux cosmetics: comments/rename internal function" into 0.3.0
+d3f8c62 Merge "move WebPFeatureFlags declaration" into 0.3.0
+7386fe5 Merge "libwebp{demux,mux}: install mux_types.h" into 0.3.0
+d6cd4e9 Merge "bump decode abi" into 0.3.0
+17f8da5 bump decode abi
+97684ae Merge "add doc precision about WebPDemuxPartial()" into 0.3.0
+f933fd2 move WebPFeatureFlags declaration
+289bc47 libwebp{demux,mux}: install mux_types.h
+224e8d4 add doc precision about WebPDemuxPartial()
+4c18e80 demux cosmetics: comments/rename internal function
+7cfd1bf update AUTHORS
+401f7b8 Merge "speed-up lossless (~3%) with ad-hoc histogram cost evaluation" into 0.3.0
+1fc8ffc Merge "makefile.unix: dist related changes" into 0.3.0
+8a89c6e Merge changes I466c377f,Ib761ebd3,I694857fc into 0.3.0
+f4ffb2d speed-up lossless (~3%) with ad-hoc histogram cost evaluation
+723847d gif2webp: only write error messages to stderr
+701b9e2 makefile.unix: dist related changes
+bb85b43 Merge "update NEWS" into 0.3.0
+59423a2 gif2webp: fix crash on open failure with libgif5
+9acb17d gif2webp: silence a unused param warning
+7d9fdc2 Merge "README updates" into 0.3.0
+5621934 Merge "build: fix install race on shared headers" into 0.3.0
+70809d8 Merge "bump version to 0.3.0" into 0.3.0
+d851cd1 demux: make the parse a bit more strict
+28bb410 update NEWS
+cef9388 bump version to 0.3.0
+9048494 build: fix install race on shared headers
+1e67e8e README updates
+42b611a Merge "configure: drop experimental from mux/demux" into 0.3.0
+096a8e3 Merge "vwebp: add color profile support" into 0.3.0
+ddfee5d vwebp: add color profile support
+0d6927d Merge "Mark fragment options as experimental in webpmux" into 0.3.0
+5dbd403 Mark fragment options as experimental in webpmux
+a0a6648 configure: drop experimental from mux/demux
+ee65bad Merge "add support for BITS > 32" into 0.3.0
+744930d add support for BITS > 32
+7dd288f cwebp: fix build
+19a8dd0 Merge "Makefile.vc: add vwebp.exe target" into 0.3.0
+50eedda Merge "examples: normalize icc related program arguments" into 0.3.0
+757f637 Merge "Makefile.vc: add libwebpdecoder target" into 0.3.0
+b65c4b7 Makefile.vc: add libwebpdecoder target
+f8db7b4 Merge "vwebp: replace doubles w/floats where appropriate" into 0.3.0
+d99aa56 Makefile.vc: add vwebp.exe target
+013023e vwebp: replace doubles w/floats where appropriate
+9b3db89 README.mux: add version reference
+7b6a26c Merge "cwebp: output metadata statistics" into 0.3.0
+d8dc72a examples: normalize icc related program arguments
+7bfc905 Merge "make alpha unfilter work in-place" into 0.3.0
+0037b2d Merge "add LUT-free reference code for YUV->RGB conversion." into 0.3.0
+166bf74 Merge "demux: disable fragment parsing" into 0.3.0
+126974b add LUT-free reference code for YUV->RGB conversion.
+0aef3eb make alpha unfilter work in-place
+14ef500 Merge "Remove 'status: experimental' from container spec" into 0.3.0
+d40c98e Merge "webpmux binary: tiny style fix" into 0.3.0
+0bc4268 cwebp: output metadata statistics
+bc03980 Merge "autoconf: normalize experimental define" into 0.3.0
+d1e21b1 Remove 'status: experimental' from container spec
+7681bb9 webpmux binary: tiny style fix
+a3dd3d0 avoid installing example_util.h
+252320e demux: disable fragment parsing
+537bde0 autoconf: normalize experimental define
+5e338e0 Merge changes I33e8a613,I8e8a7b44 into 0.3.0
+d9d0ea1 Merge changes If21e3ec7,I991fc30b into 0.3.0
+627f5ca automake: add reference to libwebp for mux/demux
+eef73d0 don't consolidate proba stats too often
+05ec4cc libwebp{,decoder}.pc: add pthread flags
+1bfcf5b add libwebpmux.pc
+26ca843 add libwebpdemux.pc
+69e2590 Merge "Tune Lossless compression for lower qualities."
+0478b5d Tune Lossless compression for lower qualities.
+39f7586 add a mention of parallel alpha encoding in the NEWS
+5a21d96 Merge "1.5x-2x faster encoding for method 3 and up"
+9bfbdd1 1.5x-2x faster encoding for method 3 and up
+27dc741 Correct frame options order in README.mux
+be2fd17 Mux: fix a scenario with bad ANMF/FRGM size
+19eb012 Merge "Demux: Add option to get frame count using GetI()"
+7368b8c Merge "WebPGetFeatures() out of if condition for clarity."
+f604c9a Merge "fix windows build"
+153f94e fix windows build
+847b492 Merge "vwebp: use magenta for 'i'nfo display"
+25ea46b Merge "vwebp: add keyboard shortcuts to help output"
+bea7cca vwebp: use magenta for 'i'nfo display
+8fab161 webpmux: correct -frame param order in help output
+03cc23d vwebp: add keyboard shortcuts to help output
+068eba8 Demux: Add option to get frame count using GetI()
+988b8f5 WebPGetFeatures() out of if condition for clarity.
+6933d91 Merge "gif2webp: Be lenient about background color index."
+4d0f7c5 Merge "WebPGetFeatures() behavior change:"
+fdeeb01 gif2webp: Be lenient about background color index.
+ad25032 Merge "multi-threaded alpha encoding for lossy"
+4e32d3e Merge "fix compilation of token.c"
+f817930 multi-threaded alpha encoding for lossy
+8805035 fix compilation of token.c
+fc81621 code using the actual values for num_parts_, not the ones from config
+7265535 Merge "move the config check from .c to .h"
+dd9e76f move the config check from .c to .h
+956b217 WebPGetFeatures() behavior change:
+df02e4c WebPDemuxGetI behavior change:
+633c004 Merge "rebalance method tools (-m) for methods [0..4]"
+58ca6f6 rebalance method tools (-m) for methods [0..4]
+7648c3c Merge "describe rd-opt levels introduce VP8RDLevel enum"
+67fb100 Merge "autoconf: enable silent-rules by default"
+a5042a3 GetVersion() methods for mux and demux
+5189957 describe rd-opt levels introduce VP8RDLevel enum
+4e094ac autoconf: enable silent-rules by default
+b7eaa85 inline VP8LFastLog2() and VP8LFastSLog2 for small values
+5cf7792 split quant_levels.c into decoder and encoder version
+e5d3ffe Merge "Update code example in README.mux"
+ac5a915 Update code example in README.mux
+38a91e9 Add example code snippet for demux API
+5f557f3 README.mux: add info about Demux API and vwebp
+c0ba090 backward_references: avoid signed integer overflow
+943386d disable SSE2 for now
+9479fb7 lossless encoding speedup
+ec2030a merge two lines together
+b67956c Merge "Remove ReadOneBit() and ReadSymbolUnsafe()"
+1667bde Remove ReadOneBit() and ReadSymbolUnsafe()
+3151669 wicdec + dwebp cosmetics: normalize formatting
+92668da change default filtering parameters:   * type is now 'strong'   * strength is now '60'
+b7490f8 introduce WEBP_REFERENCE_IMPLEMENTATION compile option
+3383885 faster decoding (3%-6%)
+5c3e381 Merge "add a -jpeg_like option"
+c231104 remove unused declaration of VP8Zigzag
+3615295 Merge "wicdec: add alpha support for paletted formats"
+c9f1649 wicdec: add alpha support for paletted formats
+1262f81 Merge "wicdec: silence some warnings"
+e7ea61e wicdec: silence some warnings
+23c0f35 fix missing intptr_t->int cast for MSVC
+e895059 add a -jpeg_like option
+1f803f6 Merge "Tune alpha quality mapping to more reasonable values."
+1267d49 Tune alpha quality mapping to more reasonable values.
+043076e Merge "speed-up lossless in BackwardTrace"
+f3a44dc remove one malloc from TraceBackwards()
+0fc1a3a speed-up lossless in BackwardTrace
+7c732e5 cwebp: centralize WebPCleanupTransparentArea()
+7381254 Merge "wicdec: add ICC profile extraction"
+e83ff7d wicdec: add ICC profile extraction
+146c6e3 Merge "cosmetics: pngdec: normalize default label location"
+a8f549d Merge "manpages: italicize option parameters"
+e118db8 Merge "encode.h: note the need to free() WebPMemoryWriter"
+1dfee6d cosmetics: pngdec: normalize default label location
+14c3820 manpages: italicize option parameters
+7defbfa encode.h: note the need to free() WebPMemoryWriter
+88d382a cwebp: cleanup after memory_writer
+12d6cec fix extra space in dwebp.1 man
+b01681a Fix for demuxer frame iteration:
+56c12aa Demuxer creation fix:
+66c810b add a -yuv option to dwebp (very similar to -pgm)
+841a3ba Merge "Remove -Wshadow warnings."
+8fd0252 Merge "upsampling_neon.c: fix build"
+6efed26 Remove -Wshadow warnings.
+60904aa Merge "allow WebPINewRGB/YUVA to be passed a NULL output buffer."
+b7adf37 allow WebPINewRGB/YUVA to be passed a NULL output buffer.
+27f8f74 upsampling_neon.c: fix build
+06b9cdf gitignore: add IOS related directories
+f112221 Merge "Fix more comments for iobuild.sh"
+fe4d25d Fix more comments for iobuild.sh
+1de3e25 Merge "NEON optimised yuv to rgb conversion"
+090b708 NEON optimised yuv to rgb conversion
+daa0647 Merge "Add ios build script for building iOS library."
+79fe39e Add ios build script for building iOS library.
+126c035 remove some more -Wshadow warnings
+522e9d6 Merge "cwebp: enable '-metadata'"
+76ec5fa cwebp: enable '-metadata'
+aeb91a9 Merge "cosmetics: break a few long lines"
+be7c96b cosmetics: break a few long lines
+cff8ddb Merge "add libwebpdecoder.pc"
+93148ab Merge "libwebp.pc.in: detab"
+6477f95 Merge "Makefile.vc: normalize path separator"
+bed1ed7 add libwebpdecoder.pc
+46168b2 libwebp.pc.in: detab
+a941a34 Fixed few nits in the build files.
+dd7a49b Makefile.vc: normalize path separator
+9161be8 Merge "cwebp: extract WIC decoding to its own module"
+08e7c58 Merge "Provide an option to build decoder library."
+0aeba52 Provide an option to build decoder library.
+757ebcb catch malloc(0)/calloc(0) with an assert
+152ec3d Merge "handle malloc(0) and calloc(0) uniformly on all platforms"
+a452a55 cwebp: extract WIC decoding to its own module
+2b252a5 Merge "Provide option to swap bytes for 16 bit colormodes"
+94a48b4 Provide option to swap bytes for 16 bit colormodes
+42f8f93 handle malloc(0) and calloc(0) uniformly on all platforms
+8b2152c Merge "add an extra assert to check memory bounds"
+0d19fbf remove some -Wshadow warnings
+cd22f65 add an extra assert to check memory bounds
+8189fed Merge "Add details and reference about the YUV->RGB conversion"
+1d2702b Merge "Formatting fixes in lossless bitstream spec"
+8425aae Formatting fixes in lossless bitstream spec
+a556cb1 Add details and reference about the YUV->RGB conversion
+d8f21e0 add link to SSIM description on Wikipedia
+18e9167 Merge "WebP-lossless spec clarifications:"
+98e25b9 Merge "cwebp: add -metadata option"
+f01c2a5 WebP-lossless spec clarifications:
+f4a9797 Merge "Disto4x4 and Disto16x16 in NEON"
+47b7b0b Disto4x4 and Disto16x16 in NEON
+7eaee9f cwebp: add -metadata option
+36c52c2 tiffdec: use toff_t for exif ifd offset
+7c8111e Merge "cwebp/tiffdec: add TIFF metadata extraction"
+e6409ad Remove redundant include from dsp/lossless code.
+1ab5b3a Merge "configure: fix --with-gifincludedir"
+03c749e configure: fix --with-gifincludedir
+8b65063 multiple libgif versions support for gif2webp
+476e293 gif2webp: Use DGifOpenFileName()
+b50f277 tiffdec: correct format string
+2b9048e Merge "tiffdec: check error returns for width/height"
+a1b5a9a Merge "cwebp/tiff: use the first image directory"
+079423f tiffdec: check error returns for width/height
+d62824a Merge "cwebp/jpegdec: add JPEG metadata extraction"
+03afaca Merge "cwebp: add PNG metadata extraction"
+2c72496 cwebp/jpegdec: add JPEG metadata extraction
+dba64d9 cwebp: add PNG metadata extraction
+1f075f8 Lossless spec corrections/rewording/clarifications
+2914ecf cwebp/tiffdec: add TIFF metadata extraction
+d82a3e3 More corrections/clarifications in lossless spec:
+bd00255 cwebp/tiff: use the first image directory
+df7aa07 Merge "Cleanup around jpegdec"
+0f57dcc decoding speed-up (~1%)
+bcec339 Lossless bitstream clarification:
+6bf2087 add examples/metadata.c
+207f89c Merge "configure: add libwebpdemux status to summary"
+1bd287a Cleanup around jpegdec
+9145567 Merge "cosmetics: use '== 0' in size checks"
+d6b88b7 cosmetics: use '== 0' in size checks
+d3dace2 cosmetics: jpegdec
+2f69af7 configure: add libwebpdemux status to summary
+1c1c564 cwebp: extract tiff decoding to its own module
+6a871d6 cwebp: extract jpeg decoding to its own module
+2ee228f cwebp: extract png decoding to its own module
+4679db0 Merge "cwebp: add metadata framework"
+63aba3a cwebp: add metadata framework
+931bd51 lossless bitstream: block size bits correction
+e4fc4c1 lossless bitstream: block size bits correction
+d65ec67 fix build, move token.c to src/enc/
+657f5c9 move token buffer to its own file (token.c)
+c34a375 introduce GetLargeValue() to slim-fast GetCoeffs().
+d5838cd faster non-transposing SSE2 4x4 FTransform
+f76191f speed up GetResidualCost()
+ba2aa0f Add support for BITS=24 case
+2e7f6e8 makefile.unix: Dependency on libraries
+dca8421 Merge "Separate out mux and demux code and libraries:"
+23782f9 Separate out mux and demux code and libraries:
+bd56a01 configure: add summary output
+90e5e31 dwebp manual: point to webpmux, gif2webp.
+540790c gif2webp.c: add a note about prerequisites
+d1edf69 cwebp man page: meaning of '-q' for lossy/lossless
+79efa1d Add man page for gif2webp utility
+2243e40 Merge "gif2webp build support with autoconf tools"
+c40efca gif2webp build support with autoconf tools
+6523e2d WebP Container:
+4da788d Merge "simplify the fwd transform"
+42c3b55 simplify the fwd transform
+41a6ced user GLfloat instead of float
+b542611 fix indentation
+68f282f * handle offset in anim viewer 'vwebp' * fix gif2webp to handle disposal method and odd offset correctly
+118cb31 Merge "add SSE2 version of Sum of Square error for 16x16, 16x8 and 8x8 case"
+8a7c3cc Merge "Change the order of -frame argument to be more natural"
+99e0a70 Merge "Simplify the texture evaluation Disto4x4()"
+0f923c3 make the bundling work in a tmp buffer
+e5c3b3f Simplify the texture evaluation Disto4x4()
+4860008 Change the order of -frame argument to be more natural
+35bfd4c add SSE2 version of Sum of Square error for 16x16, 16x8 and 8x8 case
+a7305c2 Clarification for unknown chunks
+4c4398e Refine WebP Container Spec wrt unknown chunks.
+2ca642e Rectify WebPMuxGetFeatures:
+7caab1d Some cosmetic/comment fixes.
+60b2651 Merge "Write a GIF to WebP converter based on libgif."
+c7127a4 Merge "Add NEON version of FTransformWHT"
+11b2721 Write a GIF to WebP converter based on libgif.
+e9a15a3 ExUtilWriteFile() to write memory segment to file
+74356eb Add a simple cleanup step in mux assembly:
+51bb1e5 mux.h: correct WebPDemuxSelectFragment() prototype
+22a0fd9 Add NEON version of FTransformWHT
+fa30c86 Update mux code to match the spec wrt animation
+d9c5fbe by-pass Analysis pass in case segments=1
+d2ad445 Merge changes Ibeccffc3,Id1585b16
+5c8be25 Merge "Chunk fourCCs for XMP/EXIF"
+a00a3da Use 'frgm' instead of 'tile' in webpmux parameters
+81b8a74 Design change in ANMF and FRGM chunks:
+f903cba Chunk fourCCs for XMP/EXIF
+812933d Tune performance of HistogramCombine
+52ad197 Animation specification in container spec
+001b930 Image fragment specification in container spec
+391f9db Ordering of description of bits in container spec
+d573577 Metadata specification in container spec
+1c4609b Merge commit 'v0.2.1'
+0ca584c Merge "Color profile specification in container spec"
+e8b41ad add NEON asm version for WHT inverse transform
+af6f0db Color profile specification in container spec
+a61a824 Merge "Add NULL check in chunk APIs"
+0e8b7ee fix WebPPictureView() unassigned strides
+75e5f17 ARM/NEON: 30% encoding speed-up
+02b4356 Add NULL check in chunk APIs
+a077072 mux struct naming
+6c66dde Merge "Tune Lossless encoder"
+ab5ea21 Tune Lossless encoder
+74fefc8 Update ChangeLog (tag: v0.2.1, origin/0.2.0, 0.2.0)
+92f8059 Rename some chunks:
+3bb4bbe Merge "Mux API change:"
+d0c79f0 Mux API change:
+abc0604 Merge "update NEWS" into 0.2.0
+57cf313 update NEWS
+25f585c bump version to 0.2.1
+fed7c04 libwebp: validate chunk size in ParseOptionalChunks
+552cd9b cwebp (windows): fix alpha image import on XP
+b14fea9 autoconf/libwebp: enable dll builds for mingw
+4a8fb27 [cd]webp: always output windows errors
+d662158 fix double to float conversion warning
+72b96a6 cwebp: fix jpg encodes on XP
+734f762 VP8LAllocateHistogramSet: fix overflow in size calculation
+f9cb58f GetHistoBits: fix integer overflow
+b30add2 EncodeImageInternal: fix uninitialized free
+3de58d7 fix the -g/O3 discrepancy for 32bit compile
+77aa7d5 fix the BITS=8 case
+e5970bd Make *InitSSE2() functions be empty on non-SSE2 platform
+ef5cc47 make *InitSSE2() functions be empty on non-SSE2 platform
+c4ea259 make VP8DspInitNEON() public
+8344ead Merge "libwebp: validate chunk size in ParseOptionalChunks"
+4828bb9 Merge "cwebp (windows): fix alpha image import on XP"
+3076333 libwebp: validate chunk size in ParseOptionalChunks
+7048189 AccumulateLSIM: fix double -> float warnings
+eda8ee4 cwebp (windows): fix alpha image import on XP
+c6e9865 Merge "add EXPERIMENTAL code for YUV-JPEG colorspace"
+f0360b4 add EXPERIMENTAL code for YUV-JPEG colorspace
+f86e6ab add LSIM metric to WebPPictureDistortion()
+c3aa215 Speed up HistogramCombine for lower qualities.
+1765cb1 Merge "autoconf/libwebp: enable dll builds for mingw"
+a13562e autoconf/libwebp: enable dll builds for mingw
+9f469b5 typo: no_fancy -> no_fancy_upsampling
+1a27f2f Merge "fix double to float conversion warning"
+cf1e90d Merge "cwebp: fix jpg encodes on XP"
+f2b5d19 [cd]webp: always output windows errors
+e855208 fix double to float conversion warning
+ecd66f7 cwebp: fix jpg encodes on XP
+7b3eb37 Tune lossless compression to get better gains.
+ce8bff4 Merge "VP8LAllocateHistogramSet: fix overflow in size calculation"
+ab5b67a Merge "EncodeImageInternal: fix uninitialized free"
+7fee5d1 Merge "GetHistoBits: fix integer overflow"
+a6ae04d VP8LAllocateHistogramSet: fix overflow in size calculation
+80237c4 GetHistoBits: fix integer overflow
+8a99723 EncodeImageInternal: fix uninitialized free
+0b9e682 minor cosmetics
+a792b91 fix the -g/O3 discrepancy for 32bit compile
+73ba435 Merge "detect and merge similar segments"
+fee6627 detect and merge similar segments
+0c44f41 src/webp/*.h: don't forward declare enums in C++
+d7a5ac8 vwebp: use demux interface
+931e0ea Merge "replace 'typedef struct {} X;" by "typedef struct X X; struct X {};""
+8f216f7 remove cases of equal comparison for qsort()
+28d25c8 replace 'typedef struct {} X;" by "typedef struct X X; struct X {};"
+2afee60 speed up for ARM using 8bit for boolean decoder
+5725cab new segmentation algorithm
+2cf1f81 Merge "fix the BITS=8 case"
+12f78ae fix the BITS=8 case
+6920c71 fix MSVC warnings regarding implicit uint64 to uint32 conversions
+f6c096a webpmux binary: Rename 'xmp' option to 'meta'
+ddfe871 webpmux help correction
+b7c5544 Merge "Make *InitSSE2() functions be empty on non-SSE2 platform"
+1c04a0d Common APIs for chunks metadata and color profile.
+2a3117a Merge "Create WebPMuxFrameInfo struct for Mux APIs"
+5c3a723 Make *InitSSE2() functions be empty on non-SSE2 platform
+7c6e60f make *InitSSE2() functions be empty on non-SSE2 platform
+c7eb457 make VP8DspInitNEON() public
+ab3234a Create WebPMuxFrameInfo struct for Mux APIs
+e3990fd Alignment fixes
+e55fbd6 Merge branch '0.2.0'
+4238bc0 Update ChangeLog (tag: v0.2.0)
+c655380 dec/io.c: cosmetics
+fe1958f RGBA4444: harmonize lossless/lossy alpha values
+681cb30 fix RGBA4444 output w/fancy upsampling
+f06c1d8 Merge "Alignment fix" into 0.2.0
+f56e98f Alignment fix
+6fe843b avoid rgb-premultiply if there's only trivial alpha values
+528a11a fix the ARGB4444 premultiply arithmetic
+a0a4885 Lossless decoder fix for a special transform order
+62dd9bb Update encoding heuristic w.r.t palette colors.
+6f4272b remove unused ApplyInverseTransform()
+93bf0fa Update ChangeLog (tag: v0.2.0-rc1)
+5934fc5 update AUTHORS
+014a711 update NEWS
+43b0d61 add support for ARGB -> YUVA conversion for lossless decoder
+33705ca bump version to 0.2.0
+c40d7ef fix alpha-plane check + add extra checks
+a06f802 MODE_YUVA: set alpha to opaque if the image has none
+52a87dd Merge "silence one more warning" into 0.2.0
+3b02309 silence one more warning
+f94b04f move some RGB->YUV functions to yuv.h
+4b71ba0 README: sync [cd]webp help output
+c9ae57f man/dwebp.1: add links to output file format details
+292ec5c quiet a few 'uninitialized' warnings
+4af3f6c fix indentation
+9b261bf remove the last NOT_HAVE_LOG2 instances
+323dc4d remove use of log2(). Use VP8LFastLog2() instead.
+8c515d5 Merge "harness some malloc/calloc to use WebPSafeMalloc and WebPSafeCalloc" into 0.2.0
+d4b4bb0 Merge changes I46090628,I1a41b2ce into 0.2.0
+bff34ac harness some malloc/calloc to use WebPSafeMalloc and WebPSafeCalloc
+a3c063c Merge "extra size check for security" into 0.2.0
+5e79630 Merge "WebPEncode: clear stats at the start of encode" into 0.2.0
+f1edf62 Merge "rationalize use of color-cache" into 0.2.0
+c193331 extra size check for security
+906be65 rationalize use of color-cache
+dd1c387 Add image-hint for low-color images.
+4eb7aa6 Merge "WebPCheckMalloc() and WebPCheckCalloc():" into 0.2.0
+80cc730 WebPCheckMalloc() and WebPCheckCalloc():
+183cba8 check VP8LBitWriterInit return
+cbfa9ee lossless: fix crash on user abort
+256afef cwebp: exit immediately on version mismatch
+475d87d WebPEncode: clear stats at the start of encode
+a7cc729 fix type and conversion warnings
+7d853d7 add stats for lossless
+d39177b make QuantizeLevels() store the sum of squared error
+5955cf5 replace x*155/100 by x*101581>>16
+7d732f9 make QuantizeLevels() store the sum of squared error
+e45a446 replace x*155/100 by x*101581>>16
+159b75d cwebp output size consistency:
+cbee59e Merge commit 'v0.1.99'
+1889e9b dwebp: report -alpha option
+3bc3f7c Merge "dwebp: add PAM output support" into 0.2.0
+d919ed0 dwebp: add PAM output support
+85e215d README/manpages/configure: update website link
+c3a207b Update ChangeLog (tag: v0.1.99)
 d1fd782 Merge "add extra precision about default values and behaviour" into 0.2.0
 efc826e add extra precision about default values and behaviour
 9f29635 header/doc clean up
@ -14,6 +914,7 @@ c37c23e README: cosmetics
 ce90847 Merge "add some padding bytes areas for later use" into 0.2.0
 2390dab Merge "fixing the findings by Frederic Kayser to the bitstream spec" into 0.2.0
 0275159 add a very crude progress report for lossless
+a4b9b1c Remove some unused enum values.
 dd10817 rename 'use_argb_input' to 'use_argb'
 90516ae add some padding bytes areas for later use
 d03b250 fixing the findings by Frederic Kayser to the bitstream spec
@ -46,6 +947,7 @@ c3b014d Android.mk: add missing lossless files
 8c1cc6b makefile.unix dist: explicitly name installed includes
 7f4647e Merge "clarify the colorspace naming and byte ordering of decoded samples" into 0.2.0
 cbf6972 clarify the colorspace naming and byte ordering of decoded samples
+857650c Mux: Add WebPDataInit() and remove WebPImageInfo
 ff771e7 don't install webp/decode_vp8.h
 596dff7 VP8LFillBitWindow: use 64-bit path for msvc x64 builds
 3ca7ce9 Merge "doc: remove non-finalized chunk references" into 0.2.0
@ -61,7 +963,7 @@ f0b5def bump versions
 05108f6 Merge "More spec/code matching in mux:"
 6808e69 More spec/code matching in mux:
 bd2b46f Merge "doc/webp-container-spec: light cosmetics"
-20ead32 doc/webp-container-spec: light cosmetics (full)
+20ead32 doc/webp-container-spec: light cosmetics
 1d40a8b configure: add pthread detection
 b5e9067 fix some int <-> size_t mix for buffer sizes
 e41a759 build: remove libwebpmux from default targets/config
@ -184,7 +1086,7 @@ f8f9408 libwebp: add WebPDemux stub functions
 fb47bb5 Merge "NumNamedElements() should take an enum param."
 7c68980 Fix asserts in Palette and BackwardReference code.
 fbdcb7e NumNamedElements() should take an enum param.
-fb4943b modify WebPParseHeaders to allow reuse by GetFeatures (old-decode-alph-3)
+fb4943b modify WebPParseHeaders to allow reuse by GetFeatures
 3697b5c write an ad-hoc EncodeImageInternal variant
 eaee9e7 Bug-Fix: Decode small (less than 32 bytes) images.
 0bceae4 Merge "cwebp: fix alpha reporting in stats output"
@ -620,7 +1522,7 @@ f3bf4c7 Added Mux Container Spec & README for MUX-API.
 9f761cf Changed function signature for WebPMuxCreate
 5f31b5e Merge "Add Mux library for manipulating WebP container."
 2315785 Add Mux library for manipulating WebP container.
-7e198ab update ChangeLog (v0.1.3)
+7e198ab update ChangeLog (tag: v0.1.3)
 dfc9c1e Harmonize the dates
 28ad70c Fix PNG decoding bug
 846e93c Update AUTHORS & add .mailmap
@ -761,7 +1663,7 @@ cfbf88a add SSE2 functions. ~2x faster encoding on average.
 e7ff3f9 merge two ITransforms together when applicable and change the TTransform to return the sum directly.
 ca55413 fix WebPIDecGetRGB() to accept any RGB(A) mode, not just MODE_RGB
 8aa50ef fix some 'man' typos
-d3f3bdd update ChangeLog (v0.1.2)
+d3f3bdd update ChangeLog (tag: v0.1.2)
 d7e9a69 update contributor list
 261abb8 add a 'superclean' section
 276ae82 Remove files not mean to be in git, and update .gitignore
--- a/Makefile.vc
+++ b/Makefile.vc
@ -1,8 +1,10 @@
 #
 # Stem for static libs and DLLs
 #
+LIBWEBPDECODER_BASENAME = libwebpdecoder
 LIBWEBP_BASENAME = libwebp
 LIBWEBPMUX_BASENAME = libwebpmux
+LIBWEBPDEMUX_BASENAME = libwebpdemux

 !IFNDEF ARCH
 !IF ! [ cl 2>&1 | find "x86" > NUL ]
@ -22,18 +24,18 @@ PLATFORM_LDFLAGS = /SAFESEH
 #############################################################
 ## Nothing more to do below this line!

-MT         = mt.exe
-CCNODBG    = cl.exe /nologo /O2 /DNDEBUG
-CCDEBUG    = cl.exe /nologo /Od /Gm /Zi /D_DEBUG /RTC1
-CFLAGS     = /Isrc /nologo /W3 /EHsc /FD /c /GS
+NOLOGO     = /nologo
+CCNODBG    = cl.exe $(NOLOGO) /O2 /DNDEBUG
+CCDEBUG    = cl.exe $(NOLOGO) /Od /Gm /Zi /D_DEBUG /RTC1
+CFLAGS     = /Isrc $(NOLOGO) /W3 /EHsc /c /GS
 CFLAGS     = $(CFLAGS) /DWIN32 /D_CRT_SECURE_NO_WARNINGS /DWIN32_LEAN_AND_MEAN
-CFLAGS     = $(CFLAGS) /DHAVE_WINCODEC_H /DWEBP_USE_THREAD /DNOT_HAVE_LOG2
+CFLAGS     = $(CFLAGS) /DHAVE_WINCODEC_H /DWEBP_USE_THREAD
 LDFLAGS    = /LARGEADDRESSAWARE /MANIFEST /NXCOMPAT /DYNAMICBASE
 LDFLAGS    = $(LDFLAGS) $(PLATFORM_LDFLAGS)
-LNKDLL     = link.exe /DLL
-LNKLIB     = link.exe /lib
-LNKEXE     = link.exe
-LFLAGS     = /nologo /machine:$(ARCH)
+LNKDLL     = link.exe /DLL $(NOLOGO)
+LNKEXE     = link.exe $(NOLOGO)
+LNKLIB     = lib.exe $(NOLOGO)
+MT         = mt.exe $(NOLOGO)

 CFGSET     = FALSE
 !IF "$(OBJDIR)" == ""
@ -59,6 +61,7 @@ DIRBIN = $(DIRBASE)\bin
 LIBWEBP_PDBNAME = $(DIROBJ)\$(LIBWEBP_BASENAME).pdb
 OUTPUT_DIRS = $(DIRBIN) $(DIRINC) $(DIRLIB) \
              $(DIROBJ)\dec \
+              $(DIROBJ)\demux \
              $(DIROBJ)\dsp \
              $(DIROBJ)\enc \
              $(DIROBJ)\examples \
@ -73,8 +76,10 @@ STATICLIBBUILD = TRUE
 CC             = $(CCDEBUG)
 RTLIB          = $(RTLIBD)
 STATICLIBBUILD = TRUE
+LIBWEBPDECODER_BASENAME = $(LIBWEBPDECODER_BASENAME)_debug
 LIBWEBP_BASENAME = $(LIBWEBP_BASENAME)_debug
 LIBWEBPMUX_BASENAME = $(LIBWEBPMUX_BASENAME)_debug
+LIBWEBPDEMUX_BASENAME = $(LIBWEBPDEMUX_BASENAME)_debug
 !ELSE IF "$(CFG)" == "release-dynamic"
 CC        = $(CCNODBG)
 DLLBUILD  = TRUE
@ -82,22 +87,28 @@ DLLBUILD  = TRUE
 CC        = $(CCDEBUG)
 RTLIB     = $(RTLIBD)
 DLLBUILD  = TRUE
+LIBWEBPDECODER_BASENAME = $(LIBWEBPDECODER_BASENAME)_debug
 LIBWEBP_BASENAME = $(LIBWEBP_BASENAME)_debug
 LIBWEBPMUX_BASENAME = $(LIBWEBPMUX_BASENAME)_debug
+LIBWEBPDEMUX_BASENAME = $(LIBWEBPDEMUX_BASENAME)_debug
 !ENDIF

 !IF "$(STATICLIBBUILD)" == "TRUE"
 CC     = $(CC) $(RTLIB)
 CFGSET = TRUE
+LIBWEBPDECODER = $(DIRLIB)\$(LIBWEBPDECODER_BASENAME).lib
 LIBWEBP = $(DIRLIB)\$(LIBWEBP_BASENAME).lib
 LIBWEBPMUX = $(DIRLIB)\$(LIBWEBPMUX_BASENAME).lib
+LIBWEBPDEMUX = $(DIRLIB)\$(LIBWEBPDEMUX_BASENAME).lib
 !ELSE IF "$(DLLBUILD)" == "TRUE"
 DLLC   = webp_dll.c
 DLLINC = webp_dll.h
+DLL_OBJS = $(DIROBJ)\$(DLLC:.c=.obj)
 CC     = $(CC) /I$(DIROBJ) /FI$(DLLINC) $(RTLIB) /DWEBP_DLL
+LIBWEBPDECODER = $(DIRLIB)\$(LIBWEBPDECODER_BASENAME)_dll.lib
 LIBWEBP = $(DIRLIB)\$(LIBWEBP_BASENAME)_dll.lib
 LIBWEBPMUX = $(DIRLIB)\$(LIBWEBPMUX_BASENAME)_dll.lib
-LIBWEBP_OBJS = $(DIROBJ)\$(DLLC:.c=.obj)
+LIBWEBPDEMUX = $(DIRLIB)\$(LIBWEBPDEMUX_BASENAME)_dll.lib
 LIBWEBP_PDBNAME = $(DIROBJ)\$(LIBWEBP_BASENAME)_dll.pdb
 CFGSET = TRUE
 !ENDIF
@ -119,7 +130,8 @@ CFGSET = TRUE
 !MESSAGE -  clean                         - perform a clean for CFG
 !MESSAGE -  experimental                  - build CFG with experimental
 !MESSAGE .                                  features enabled.
-!MESSAGE - (empty) or all                 - build all targets for CFG
+!MESSAGE - (empty)                        - build libwebp-based targets for CFG
+!MESSAGE - all                            - build (de)mux-based targets for CFG
 !MESSAGE
 !MESSAGE RTLIBCFG controls the runtime library linkage - 'static' or 'dynamic'.
 !MESSAGE OBJDIR is the path where you like to build (obj, bins, etc.),
@ -151,17 +163,32 @@ DEC_OBJS = \
    $(DIROBJ)\dec\vp8l.obj \
    $(DIROBJ)\dec\webp.obj \

-DSP_OBJS = \
+DEMUX_OBJS = \
+    $(DIROBJ)\demux\demux.obj \
+
+DSP_DEC_OBJS = \
    $(DIROBJ)\dsp\cpu.obj \
    $(DIROBJ)\dsp\dec.obj \
+    $(DIROBJ)\dsp\dec_neon.obj \
    $(DIROBJ)\dsp\dec_sse2.obj \
-    $(DIROBJ)\dsp\enc.obj \
-    $(DIROBJ)\dsp\enc_sse2.obj \
    $(DIROBJ)\dsp\lossless.obj \
    $(DIROBJ)\dsp\upsampling.obj \
+    $(DIROBJ)\dsp\upsampling_neon.obj \
    $(DIROBJ)\dsp\upsampling_sse2.obj \
    $(DIROBJ)\dsp\yuv.obj \

+DSP_ENC_OBJS = \
+    $(DIROBJ)\dsp\enc.obj \
+    $(DIROBJ)\dsp\enc_neon.obj \
+    $(DIROBJ)\dsp\enc_sse2.obj \
+
+EX_FORMAT_DEC_OBJS = \
+    $(DIROBJ)\examples\jpegdec.obj \
+    $(DIROBJ)\examples\metadata.obj \
+    $(DIROBJ)\examples\pngdec.obj \
+    $(DIROBJ)\examples\tiffdec.obj \
+    $(DIROBJ)\examples\wicdec.obj \
+
 EX_UTIL_OBJS = \
    $(DIROBJ)\examples\example_util.obj \

@ -179,67 +206,85 @@ ENC_OBJS = \
    $(DIROBJ)\enc\picture.obj \
    $(DIROBJ)\enc\quant.obj \
    $(DIROBJ)\enc\syntax.obj \
+    $(DIROBJ)\enc\token.obj \
    $(DIROBJ)\enc\tree.obj \
    $(DIROBJ)\enc\vp8l.obj \
    $(DIROBJ)\enc\webpenc.obj \

 MUX_OBJS = \
-    $(DIROBJ)\mux\demux.obj \
    $(DIROBJ)\mux\muxedit.obj \
    $(DIROBJ)\mux\muxinternal.obj \
    $(DIROBJ)\mux\muxread.obj \

-UTILS_OBJS = \
+UTILS_DEC_OBJS = \
+    $(DIROBJ)\utils\alpha_processing.obj \
    $(DIROBJ)\utils\bit_reader.obj \
-    $(DIROBJ)\utils\bit_writer.obj \
    $(DIROBJ)\utils\color_cache.obj \
    $(DIROBJ)\utils\filters.obj \
    $(DIROBJ)\utils\huffman.obj \
+    $(DIROBJ)\utils\quant_levels_dec.obj \
+    $(DIROBJ)\utils\rescaler.obj \
+    $(DIROBJ)\utils\random.obj \
+    $(DIROBJ)\utils\thread.obj \
+    $(DIROBJ)\utils\utils.obj \
+
+UTILS_ENC_OBJS = \
+    $(DIROBJ)\utils\bit_writer.obj \
    $(DIROBJ)\utils\huffman_encode.obj \
    $(DIROBJ)\utils\quant_levels.obj \
-    $(DIROBJ)\utils\rescaler.obj \
-    $(DIROBJ)\utils\thread.obj \

-LIBWEBP_OBJS = $(DEC_OBJS) $(DSP_OBJS) $(ENC_OBJS) $(UTILS_OBJS) $(LIBWEBP_OBJS)
+LIBWEBPDECODER_OBJS = $(DEC_OBJS) $(DSP_DEC_OBJS) $(UTILS_DEC_OBJS)
+LIBWEBP_OBJS = $(LIBWEBPDECODER_OBJS) $(ENC_OBJS) $(DSP_ENC_OBJS) \
+               $(UTILS_ENC_OBJS) $(DLL_OBJS)
 LIBWEBPMUX_OBJS = $(MUX_OBJS) $(LIBWEBPMUX_OBJS)
+LIBWEBPDEMUX_OBJS = $(DEMUX_OBJS) $(LIBWEBPDEMUX_OBJS)

-OUT_LIBS = $(LIBWEBP)
+OUT_LIBS = $(LIBWEBPDECODER) $(LIBWEBP)
 OUT_EXAMPLES = $(DIRBIN)\cwebp.exe $(DIRBIN)\dwebp.exe
+EXTRA_EXAMPLES = $(DIRBIN)\vwebp.exe $(DIRBIN)\webpmux.exe

-all: $(OUT_LIBS) $(OUT_EXAMPLES)
-$(DIRBIN)\cwebp.exe: $(DIROBJ)\examples\cwebp.obj
+ex: $(OUT_LIBS) $(OUT_EXAMPLES)
+all: ex $(EXTRA_EXAMPLES)
+$(DIRBIN)\cwebp.exe: $(DIROBJ)\examples\cwebp.obj $(EX_FORMAT_DEC_OBJS)
 $(DIRBIN)\dwebp.exe: $(DIROBJ)\examples\dwebp.obj
+$(DIRBIN)\vwebp.exe: $(DIROBJ)\examples\vwebp.obj
+$(DIRBIN)\vwebp.exe: $(EX_UTIL_OBJS) $(LIBWEBPDEMUX) $(LIBWEBP)
 $(DIRBIN)\webpmux.exe: $(DIROBJ)\examples\webpmux.obj $(LIBWEBPMUX)
 $(DIRBIN)\webpmux.exe: $(EX_UTIL_OBJS) $(LIBWEBP)
 $(OUT_EXAMPLES): $(EX_UTIL_OBJS) $(LIBWEBP)
+$(EX_UTIL_OBJS) $(EX_FORMAT_DEC_OBJS): $(OUTPUT_DIRS)

 experimental:
 	$(MAKE) /f Makefile.vc \
 	    CFG=$(CFG) \
 	    CFLAGS="$(CFLAGS) /DWEBP_EXPERIMENTAL_FEATURES" /$(MAKEFLAGS)

+$(LIBWEBPDECODER): $(LIBWEBPDECODER_OBJS)
 $(LIBWEBP): $(LIBWEBP_OBJS)
 $(LIBWEBPMUX): $(LIBWEBPMUX_OBJS)
+$(LIBWEBPDEMUX): $(LIBWEBPDEMUX_OBJS)

-$(LIBWEBP_OBJS) $(LIBWEBPMUX_OBJS): $(OUTPUT_DIRS)
+$(LIBWEBP_OBJS) $(LIBWEBPMUX_OBJS) $(LIBWEBPDEMUX_OBJS): $(OUTPUT_DIRS)

 !IF "$(DLLBUILD)" == "TRUE"
-$(LIBWEBP_OBJS) $(LIBWEBPMUX_OBJS): $(DIROBJ)\$(DLLINC) $(DIROBJ)\$(DLLC)
+$(LIBWEBP_OBJS) $(LIBWEBPMUX_OBJS) $(LIBWEBPDEMUX_OBJS): \
+    $(DIROBJ)\$(DLLINC) $(DIROBJ)\$(DLLC)

 {$(DIROBJ)}.c{$(DIROBJ)}.obj:
 	$(CC) $(CFLAGS) /Fd$(LIBWEBP_PDBNAME) /Fo$@  $<

 $(LIBWEBPMUX): $(LIBWEBP)
+$(LIBWEBPDEMUX): $(LIBWEBP)

-$(LIBWEBP) $(LIBWEBPMUX):
+$(LIBWEBPDECODER) $(LIBWEBP) $(LIBWEBPMUX) $(LIBWEBPDEMUX):
 	$(LNKDLL) /out:$(DIRBIN)\$(@B:_dll=.dll) /implib:$@ $(LFLAGS) $**
 	-xcopy $(DIROBJ)\*.pdb $(DIRLIB) /y

 clean::
 	@-erase /s $(DIROBJ)\$(DLLC) $(DIROBJ)\$(DLLINC) 2> NUL
 !ELSE
-$(LIBWEBP) $(LIBWEBPMUX):
-	$(LNKLIB) /out:$@ $(LFLAGS) $**
+$(LIBWEBPDECODER) $(LIBWEBP) $(LIBWEBPMUX) $(LIBWEBPDEMUX):
+	$(LNKLIB) /out:$@ $**
 	-xcopy $(DIROBJ)\*.pdb $(DIRLIB) /y
 !ENDIF

@ -268,6 +313,8 @@ $(DIROBJ)\$(DLLC): $(DIROBJ)\$(DLLINC)
 	$(CC) $(CFLAGS) /Fd$(DIROBJ)\examples\ /Fo$(DIROBJ)\examples\  $<
 {src\dec}.c{$(DIROBJ)\dec}.obj::
 	$(CC) $(CFLAGS) /Fd$(LIBWEBP_PDBNAME) /Fo$(DIROBJ)\dec\ $<
+{src\demux}.c{$(DIROBJ)\demux}.obj::
+	$(CC) $(CFLAGS) /Fd$(LIBWEBP_PDBNAME) /Fo$(DIROBJ)\demux\ $<
 {src\dsp}.c{$(DIROBJ)\dsp}.obj::
 	$(CC) $(CFLAGS) /Fd$(LIBWEBP_PDBNAME) /Fo$(DIROBJ)\dsp\ $<
 {src\enc}.c{$(DIROBJ)\enc}.obj::
--- a/48
+++ b/48
@ -1,3 +1,51 @@
+- 12/19/13: version 0.4.0
+  * improved gif2webp tool
+  * numerous fixes, compression improvement and speed-up
+  * dither option added to decoder (dwebp -dither 50 ...)
+  * improved multi-threaded modes (-mt option)
+  * improved filtering strength determination
+  * New function: WebPMuxGetCanvasSize
+  * BMP and TIFF format output added to 'dwebp'
+  * Significant memory reduction for decoding lossy images with alpha.
+  * Intertwined decoding of RGB and alpha for a shorter
+    time-to-first-decoded-pixel.
+  * WebPIterator has a new member 'has_alpha' denoting whether the frame
+    contains transparency.
+  * Container spec amended with new 'blending method' for animation.
+
+- 6/13/13: version 0.3.1
+  This is a binary compatible release.
+  * Add incremental decoding support for images containing ALPH and ICCP chunks.
+  * Python bindings via swig for the simple encode/decode interfaces similar to
+    Java.
+
+- 3/20/13: version 0.3.0
+  This is a binary compatible release.
+  * WebPINewRGB/WebPINewYUVA accept being passed a NULL output buffer
+    and will perform auto-allocation.
+  * default filter option is now '-strong -f 60'
+  * encoding speed-up for lossy methods 3 to 6
+  * alpha encoding can be done in parallel to lossy using 'cwebp -mt ...'
+  * color profile, metadata (XMP/EXIF) and animation support finalized in the
+    container.
+  * various NEON assembly additions
+  Tool updates / additions:
+    * gif2webp added
+    * vwebp given color profile & animation support
+    * cwebp can preserve color profile / metadata with '-metadata'
+
+- 10/30/12: version 0.2.1
+  * Various security related fixes
+  * cwebp.exe: fix import errors on Windows XP
+  * enable DLL builds for mingw targets
+
+- 8/3/12: version 0.2.0
+  * Add support for ARGB -> YUVA conversion for lossless decoder
+    New functions: WebPINewYUVA, WebPIDecGetYUVA
+  * Add stats for lossless and alpha encoding
+  * Security related hardening: allocation and size checks
+  * Add PAM output support to dwebp
+
 - 7/19/12: version 0.1.99
  * This is a pre-release of 0.2.0, not an rc to allow for further
    incompatible changes based on user feedback.
--- a/152
+++ b/152
@ -4,7 +4,7 @@
          \__\__/\____/\_____/__/ ____  ___
                / _/ /    \    \ /  _ \/ _/
               /  \_/   / /   \ \   __/  \__
-               \____/____/\_____/_____/____/v0.1.99
+               \____/____/\_____/_____/____/v0.4.0

 Description:
 ============
@ -13,7 +13,7 @@ WebP codec: library to encode and decode images in WebP format. This package
 contains the library that can be used in other programs to add WebP support,
 as well as the command line tools 'cwebp' and 'dwebp'.

-See http://code.google.com/speed/webp
+See http://developers.google.com/speed/webp

 Latest sources are available from http://www.webmproject.org/code/

@ -49,7 +49,7 @@ will build the binaries examples/cwebp and examples/dwebp, along
 with the static library src/libwebp.a. No system-wide installation
 is supplied, as this is a simple alternative to the full installation
 system based on the autoconf tools (see below).
-Please refer to the makefile.unix for additional details and customizations.
+Please refer to makefile.unix for additional details and customizations.

 Using autoconf tools:
 ---------------------
@ -71,16 +71,17 @@ should be all you need to have the following files

 installed.

-Note: The encoding and decoding libraries are compiled separately
-(as src/dec/libwebpdecode.* and src/dec/libwebpencode.*). They
-can be installed independently using a minor modification in the
-corresponding Makefile.am configure files (see comments there).
+Note: A decode-only library, libwebpdecoder, is available using the
+'--enable-libwebpdecoder' flag. The encode library is built separately and can
+be installed independently using a minor modification in the corresponding
+Makefile.am configure files (see comments there). See './configure --help' for
+more options.

 SWIG bindings:
 --------------

-To generate language bindings from swig/libwebp.i swig-1.3
-(http://www.swig.org) is required. 2.0 may work, but has not been tested.
+To generate language bindings from swig/libwebp.swig at least swig-1.3
+(http://www.swig.org) is required.

 Currently the following functions are mapped:
 Decode:
@ -103,12 +104,20 @@ Encode:
  WebPEncodeLosslessRGB
  WebPEncodeLosslessBGR

+See swig/README for more detailed build instructions.
+
 Java bindings:

 To build the swig-generated JNI wrapper code at least JDK-1.5 (or equivalent)
 is necessary for enum support. The output is intended to be a shared object /
 DLL that can be loaded via System.loadLibrary("webp_jni").

+Python bindings:
+
+To build the swig-generated Python extension code at least Python 2.6 is
+required. Python < 2.6 may build with some minor changes to libwebp.swig or the
+generated code, but is untested.
+
 Encoding tool:
 ==============

@ -151,24 +160,36 @@ options:
  -sns <int> ............. Spatial Noise Shaping (0:off, 100:max)
  -f <int> ............... filter strength (0=off..100)
  -sharpness <int> ....... filter sharpness (0:most .. 7:least sharp)
-  -strong ................ use strong filter instead of simple.
+  -strong ................ use strong filter instead of simple (default).
+  -nostrong .............. use simple filter instead of strong.
  -partition_limit <int> . limit quality to fit the 512k limit on
                           the first partition (0=no degradation ... 100=full)
  -pass <int> ............ analysis pass number (1..10)
  -crop <x> <y> <w> <h> .. crop picture with the given rectangle
  -resize <w> <h> ........ resize picture (after any cropping)
+  -mt .................... use multi-threading if available
+  -low_memory ............ reduce memory usage (slower encoding)
  -map <int> ............. print map of extra info.
-  -print_ssim ............ prints averaged SSIM distortion.
  -print_psnr ............ prints averaged PSNR distortion.
+  -print_ssim ............ prints averaged SSIM distortion.
+  -print_lsim ............ prints local-similarity distortion.
  -d <file.pgm> .......... dump the compressed output (PGM file).
  -alpha_method <int> .... Transparency-compression method (0..1)
  -alpha_filter <string> . predictive filtering for alpha plane.
                           One of: none, fast (default) or best.
  -alpha_cleanup ......... Clean RGB values in transparent area.
+  -blend_alpha <hex> ..... Blend colors against background color
+                           expressed as RGB values written in
+                           hexadecimal, e.g. 0xc0e0d0 for red=0xc0
+                           green=0xe0 and blue=0xd0.
  -noalpha ............... discard any transparency information.
  -lossless .............. Encode image losslessly.
  -hint <string> ......... Specify image characteristics hint.
-                           One of: photo or picture
+                           One of: photo, picture or graph
+
+  -metadata <string> ..... comma separated list of metadata to
+                           copy from the input to the output if present.
+                           Valid values: all, none (default), exif, icc, xmp

  -short ................. condense printed message
  -quiet ................. don't print anything.
@ -178,6 +199,7 @@ options:
  -progress .............. report encoding progress

 Experimental Options:
+  -jpeg_like ............. Roughly match expected JPEG size.
  -af .................... auto-adjust filter strength.
  -pre <int> ............. pre-processing filter

@ -205,8 +227,8 @@ Namely:
     in-loop processing. The higher the value, the smoother the
     highly-compressed area will look. This is particularly useful when aiming
     at very small files. Typical values are around 20-30. Note that using the
-     option -strong will change the type of filtering. Use "-f 0" to turn
-     filtering off.
+     option -strong/-nostrong will change the type of filtering. Use "-f 0" to
+     turn filtering off.
  * 'm' controls the trade-off between encoding speed and quality. Default is 4.
     You can try -m 5 or -m 6 to explore more (time-consuming) encoding
     possibilities. A lower value will result in faster encoding at the expense
@ -231,16 +253,25 @@ Usage: dwebp in_file [options] [-o out_file]

 Decodes the WebP image file to PNG format [Default]
 Use following options to convert into alternate image formats:
-  -ppm ......... save the raw RGB samples as color PPM
+  -pam ......... save the raw RGBA samples as a color PAM
+  -ppm ......... save the raw RGB samples as a color PPM
+  -bmp ......... save as uncompressed BMP format
+  -tiff ........ save as uncompressed TIFF format
  -pgm ......... save the raw YUV samples as a grayscale PGM
-                 file with IMC4 layout.
+                 file with IMC4 layout
+  -yuv ......... save the raw YUV samples in flat layout
+
 Other options are:
  -version  .... print version number and exit.
  -nofancy ..... don't use the fancy YUV420 upscaler.
  -nofilter .... disable in-loop filtering.
+  -nodither .... disable dithering.
+  -dither <d> .. dithering strength (in 0..100)
  -mt .......... use multi-threading
  -crop <x> <y> <w> <h> ... crop output with the given rectangle
  -scale <w> <h> .......... scale the output (*after* any cropping)
+  -alpha ....... only save the alpha plane.
+  -incremental . use incremental decoding (useful for tests)
  -h     ....... this help message.
  -v     ....... verbose (e.g. print encoding/decoding times)
  -noasm ....... disable all assembly optimizations.
@ -250,12 +281,89 @@ Visualization tool:

 There's a little self-serve visualization tool called 'vwebp' under the
 examples/ directory. It uses OpenGL to open a simple drawing window and show
-a decoded WebP file. It's not yet integrated in the automake or makefile.unix
-build system, but you can try to manually compile it using the recommendations
-at the top of the source file.
+a decoded WebP file. It's not yet integrated in the automake build system, but
+you can try to manually compile it using the recommendations below.

-Usage: 'vwebp my_picture.webp'
+Usage: vwebp in_file [options]

+Decodes the WebP image file and visualizes it using OpenGL
+Options are:
+  -version  .... print version number and exit.
+  -noicc ....... don't use the icc profile if present.
+  -nofancy ..... don't use the fancy YUV420 upscaler.
+  -nofilter .... disable in-loop filtering.
+  -dither <int>  dithering strength (0..100). Default=50.
+  -mt .......... use multi-threading.
+  -info ........ print info.
+  -h     ....... this help message.
+
+Keyboard shortcuts:
+  'c' ................ toggle use of color profile.
+  'i' ................ overlay file information.
+  'q' / 'Q' / ESC .... quit.
+
+Building:
+---------
+
+Prerequisites:
+1) OpenGL & OpenGL Utility Toolkit (GLUT)
+  Linux:
+    $ sudo apt-get install freeglut3-dev mesa-common-dev
+  Mac + XCode:
+    - These libraries should be available in the OpenGL / GLUT frameworks.
+  Windows:
+    http://freeglut.sourceforge.net/index.php#download
+
+2) (Optional) qcms (Quick Color Management System)
+  i. Download qcms from Mozilla / Chromium:
+    http://hg.mozilla.org/mozilla-central/file/0e7639e3bdfb/gfx/qcms
+    http://src.chromium.org/viewvc/chrome/trunk/src/third_party/qcms
+  ii. Build and archive the source files as libqcms.a / qcms.lib
+  iii. Update makefile.unix / Makefile.vc
+    a) Define WEBP_HAVE_QCMS
+    b) Update include / library paths to reference the qcms directory.
+
+Build using makefile.unix / Makefile.vc:
+$ make -f makefile.unix examples/vwebp
+> nmake /f Makefile.vc CFG=release-static \
+    ../obj/x64/release-static/bin/vwebp.exe
+
+Animated GIF conversion:
+========================
+Animated GIF files can be converted to WebP files with animation using the
+gif2webp utility available under examples/. The files can then be viewed using
+vwebp.
+
+Usage:
+ gif2webp [options] gif_file -o webp_file
+options:
+  -h / -help  ............ this help
+  -lossy ................. Encode image using lossy compression.
+  -mixed ................. For each frame in the image, pick lossy
+                           or lossless compression heuristically.
+  -q <float> ............. quality factor (0:small..100:big)
+  -m <int> ............... compression method (0=fast, 6=slowest)
+  -kmin <int> ............ Min distance between key frames
+  -kmax <int> ............ Max distance between key frames
+  -f <int> ............... filter strength (0=off..100)
+  -metadata <string> ..... comma separated list of metadata to
+                           copy from the input to the output if present.
+                           Valid values: all, none, icc, xmp (default)
+  -mt .................... use multi-threading if available
+
+  -version ............... print version number and exit.
+  -v ..................... verbose.
+  -quiet ................. don't print anything.
+
+Building:
+---------
+With the libgif development files installed, gif2webp can be built using
+makefile.unix:
+$ make -f makefile.unix examples/gif2webp
+
+or using autoconf:
+$ ./configure --enable-everything
+$ make

 Encoding API:
 =============
@ -403,12 +511,12 @@ The 'idec' object must always be released (even upon an error condition) by
 calling: WebPDelete(idec).

 To retrieve partially decoded picture samples, one must use the corresponding
-method: WebPIDecGetRGB or WebPIDecGetYUV.
+method: WebPIDecGetRGB or WebPIDecGetYUVA.
 It will return the last displayable pixel row.

 Lastly, note that decoding can also be performed into a pre-allocated pixel
 buffer. This buffer must be passed when creating a WebPIDecoder, calling
-WebPINewRGB() or WebPINewYUV().
+WebPINewRGB() or WebPINewYUVA().

 Please have a look at the src/webp/decode.h header for further details.

--- a/README.mux
+++ b/README.mux
@ -1,23 +1,24 @@
          __   __  ____  ____  ____  __ __  _     __ __
         /  \\/  \/  _ \/  _ \/  _ \/  \  \/ \___/_ / _\
         \       /   __/  _  \   __/      /  /  (_/  /__
-          \__\__/\_____/_____/__/  \__//_/\_____/__/___/
+          \__\__/\_____/_____/__/  \__//_/\_____/__/___/v0.2.0


 Description:
 ============

-WebP Mux: library to create a WebP container object for features like
-color profile, XMP metadata, animation & tiling. A reference command line
-tool 'webpmux' and WebP container specification 'doc/webp-container-spec.txt'
-are also provided in this package.
+WebPMux: set of two libraries 'Mux' and 'Demux' for creation, extraction and
+manipulation of an extended format WebP file, which can have features like
+color profile, metadata and animation. Reference command-line tools 'webpmux'
+and 'vwebp' as well as the WebP container specification
+'doc/webp-container-spec.txt' are also provided in this package.

 WebP Mux tool:
 ==============

 The examples/ directory contains a tool (webpmux) for manipulating WebP
-files. The webpmux tool can be used to create a WebP container file and to
-extract or strip relevant data from the container file.
+files. The webpmux tool can be used to create an extended format WebP file and
+also to extract or strip relevant data from such a file.

 A list of options is available using the -help command line flag:

@ -25,82 +26,150 @@ A list of options is available using the -help command line flag:
 Usage: webpmux -get GET_OPTIONS INPUT -o OUTPUT
       webpmux -set SET_OPTIONS INPUT -o OUTPUT
       webpmux -strip STRIP_OPTIONS INPUT -o OUTPUT
-       webpmux -tile TILE_OPTIONS [-tile...] -o OUTPUT
-       webpmux -frame FRAME_OPTIONS [-frame...] -loop LOOP_COUNT -o OUTPUT
+       webpmux -frame FRAME_OPTIONS [-frame...] [-loop LOOP_COUNT]
+               [-bgcolor BACKGROUND_COLOR] -o OUTPUT
       webpmux -info INPUT
       webpmux [-h|-help]
+       webpmux -version

 GET_OPTIONS:
 Extract relevant data.
-   icc       Get ICCP Color profile.
+   icc       Get ICC profile.
+   exif      Get EXIF metadata.
   xmp       Get XMP metadata.
-   tile n    Get nth tile.
   frame n   Get nth frame.

 SET_OPTIONS:
 Set color profile/metadata.
-   icc       Set ICC Color profile.
-   xmp       Set XMP metadata.
+   icc  file.icc     Set ICC profile.
+   exif file.exif    Set EXIF metadata.
+   xmp  file.xmp     Set XMP metadata.
+   where:    'file.icc' contains the ICC profile to be set,
+             'file.exif' contains the EXIF metadata to be set,
+             'file.xmp' contains the XMP metadata to be set

 STRIP_OPTIONS:
 Strip color profile/metadata.
-   icc       Strip ICCP color profile.
+   icc       Strip ICC profile.
+   exif      Strip EXIF metadata.
   xmp       Strip XMP metadata.

-TILE_OPTIONS(i):
- Create tiled image.
-   file_i +xi+yi
-   where:    'file_i' is the i'th tile (webp format),
-             'xi','yi' specify the image offset for this tile.
-
 FRAME_OPTIONS(i):
 Create animation.
-   file_i +xi+yi+di
-   where:    'file_i' is the i'th animation frame (webp format),
-             'xi','yi' specify the image offset for this frame.
+   file_i +di+[xi+yi[+mi[bi]]]
+   where:    'file_i' is the i'th animation frame (WebP format),
             'di' is the pause duration before next frame.
+             'xi','yi' specify the image offset for this frame.
+             'mi' is the dispose method for this frame (0 or 1).
+             'bi' is the blending method for this frame (+b or -b).

-INPUT & OUTPUT are in webp format.
+LOOP_COUNT:
+ Number of times to repeat the animation.
+ Valid range is 0 to 65535 [Default: 0 (infinite)].

-WebP Mux API:
-==============
-The WebP Mux API contains methods for adding data to and reading data from
-WebPMux (a WebP container object). This API currently supports XMP metadata,
-color profile, animation & tiling. Other features will be added in subsequent
-releases.
+BACKGROUND_COLOR:
+ Background color of the canvas.
+  A,R,G,B
+  where:    'A', 'R', 'G' and 'B' are integers in the range 0 to 255 specifying
+            the Alpha, Red, Green and Blue component values respectively
+            [Default: 255,255,255,255].
+
+INPUT & OUTPUT are in WebP format.
+
+Note: The nature of EXIF, XMP and ICC data is not checked and is assumed to be
+valid.
+
+Visualization tool:
+===================
+
+The examples/ directory also contains a tool (vwebp) for viewing WebP files.
+It decodes the image and visualizes it using OpenGL. See the libwebp README
+for details on building and running this program.
+
+Mux API:
+========
+The Mux API contains methods for adding data to and reading data from WebP
+files. This API currently supports XMP/EXIF metadata, ICC profile and animation.
+Other features may be added in subsequent releases.

 Example#1 (pseudo code): Creating a WebPMux object with image data, color
-profile & XMP metadata.
+profile and XMP metadata.

  int copy_data = 0;
  WebPMux* mux = WebPMuxNew();
  // ... (Prepare image data).
  WebPMuxSetImage(mux, &image, copy_data);
-  // ... (Prepare ICCP color profile data).
-  WebPMuxSetColorProfile(mux, &icc_profile, copy_data);
+  // ... (Prepare ICC profile data).
+  WebPMuxSetChunk(mux, "ICCP", &icc_profile, copy_data);
  // ... (Prepare XMP metadata).
-  WebPMuxSetMetadata(mux, &xmp, copy_data);
+  WebPMuxSetChunk(mux, "XMP ", &xmp, copy_data);
  // Get data from mux in WebP RIFF format.
  WebPMuxAssemble(mux, &output_data);
  WebPMuxDelete(mux);
-  // ... (Consume output_data; e.g. write output_data.bytes_ to file).
+  // ... (Consume output_data; e.g. write output_data.bytes to file).
  WebPDataClear(&output_data);


-Example#2 (pseudo code): Get image & color profile data from a WebP file.
+Example#2 (pseudo code): Get image and color profile data from a WebP file.

  int copy_data = 0;
  // ... (Read data from file).
  WebPMux* mux = WebPMuxCreate(&data, copy_data);
-  WebPMuxGetImage(mux, &image);
+  WebPMuxGetFrame(mux, 1, &image);
  // ... (Consume image; e.g. call WebPDecode() to decode the data).
-  WebPMuxGetColorProfile(mux, &icc_profile);
+  WebPMuxGetChunk(mux, "ICCP", &icc_profile);
  // ... (Consume icc_profile).
  WebPMuxDelete(mux);
  free(data);


-For detailed Mux API reference, please refer to the header file (src/webp/mux.h)
+For a detailed Mux API reference, please refer to the header file
+(src/webp/mux.h).
+
+Demux API:
+==========
+The Demux API enables extraction of images and extended format data from
+WebP files. This API currently supports reading of XMP/EXIF metadata, ICC
+profile and animated images. Other features may be added in subsequent
+releases.
+
+Code Example: Demuxing WebP data to extract all the frames, ICC profile
+and EXIF/XMP metadata.
+
+  WebPDemuxer* demux = WebPDemux(&webp_data);
+  uint32_t width = WebPDemuxGetI(demux, WEBP_FF_CANVAS_WIDTH);
+  uint32_t height = WebPDemuxGetI(demux, WEBP_FF_CANVAS_HEIGHT);
+  // ... (Get information about the features present in the WebP file).
+  uint32_t flags = WebPDemuxGetI(demux, WEBP_FF_FORMAT_FLAGS);
+
+  // ... (Iterate over all frames).
+  WebPIterator iter;
+  if (WebPDemuxGetFrame(demux, 1, &iter)) {
+    do {
+      // ... (Consume 'iter'; e.g. Decode 'iter.fragment' with WebPDecode(),
+      // ... and get other frame properties like width, height, offsets etc.
+      // ... see 'struct WebPIterator' below for more info).
+    } while (WebPDemuxNextFrame(&iter));
+    WebPDemuxReleaseIterator(&iter);
+  }
+
+  // ... (Extract metadata).
+  WebPChunkIterator chunk_iter;
+  if (flags & ICCP_FLAG) WebPDemuxGetChunk(demux, "ICCP", 1, &chunk_iter);
+  // ... (Consume the ICC profile in 'chunk_iter.chunk').
+  WebPDemuxReleaseChunkIterator(&chunk_iter);
+  if (flags & EXIF_FLAG) WebPDemuxGetChunk(demux, "EXIF", 1, &chunk_iter);
+  // ... (Consume the EXIF metadata in 'chunk_iter.chunk').
+  WebPDemuxReleaseChunkIterator(&chunk_iter);
+  if (flags & XMP_FLAG) WebPDemuxGetChunk(demux, "XMP ", 1, &chunk_iter);
+  // ... (Consume the XMP metadata in 'chunk_iter.chunk').
+  WebPDemuxReleaseChunkIterator(&chunk_iter);
+  WebPDemuxDelete(demux);
+
+
+For a detailed Demux API reference, please refer to the header file
+(src/webp/demux.h).
+

 Bugs:
 =====
--- a/configure.ac
+++ b/configure.ac
@ -1,21 +1,67 @@
-AC_INIT([libwebp], [0.1.99],
+AC_INIT([libwebp], [0.4.0],
        [http://code.google.com/p/webp/issues],,
-        [http://code.google.com/speed/webp])
+        [http://developers.google.com/speed/webp])
 AC_CANONICAL_TARGET
 AM_INIT_AUTOMAKE([-Wall foreign subdir-objects])
+
+dnl === automake >= 1.12 requires this for 'unusual archivers' support.
+dnl === it must occur before LT_INIT (AC_PROG_LIBTOOL).
+m4_ifdef([AM_PROG_AR], [AM_PROG_AR])
+
 AC_PROG_LIBTOOL
 AM_PROG_CC_C_O

 dnl === Enable less verbose output when building.
-dnl === If an older aclocal exits with an error comment these lines out.
-m4_define_default([AM_SILENT_RULES], [])
-AM_SILENT_RULES
+m4_ifdef([AM_SILENT_RULES], [AM_SILENT_RULES([yes])])
+
+dnl === SET_IF_UNSET(shell_var, value)
+dnl ===   Set the shell variable 'shell_var' to 'value' if it is unset.
+AC_DEFUN([SET_IF_UNSET], [test "${$1+set}" = "set" || $1=$2])
+
+AC_ARG_ENABLE([everything],
+              AS_HELP_STRING([--enable-everything],
+                             [Enable all optional targets. These can still be
+                              disabled with --disable-target]),
+              [SET_IF_UNSET([enable_libwebpdecoder], [$enableval])
+               SET_IF_UNSET([enable_libwebpdemux], [$enableval])
+               SET_IF_UNSET([enable_libwebpmux], [$enableval])])

 AC_ARG_WITH([pkgconfigdir], AS_HELP_STRING([--with-pkgconfigdir=DIR],
            [Path to the pkgconfig directory @<:@LIBDIR/pkgconfig@:>@]),
            [pkgconfigdir="$withval"], [pkgconfigdir='${libdir}/pkgconfig'])
 AC_SUBST([pkgconfigdir])

+dnl === TEST_AND_ADD_CFLAGS(flag)
+dnl ===   Checks whether $CC supports 'flag' and adds it to AM_CFLAGS on success.
+AC_DEFUN([TEST_AND_ADD_CFLAGS],
+         [SAVED_CFLAGS="$CFLAGS"
+          CFLAGS="-Werror $1"
+          AC_MSG_CHECKING([whether $CC supports $1])
+          dnl Note AC_LANG_PROGRAM([]) uses an old-style main definition.
+          AC_COMPILE_IFELSE([AC_LANG_SOURCE([int main(void) { return 0; }])],
+                            [AC_MSG_RESULT([yes])]
+                            dnl Simply append the variable avoiding a
+                            dnl compatibility ifdef for AS_VAR_APPEND as this
+                            dnl variable shouldn't grow all that large.
+                            [AM_CFLAGS="$AM_CFLAGS $1"],
+                            [AC_MSG_RESULT([no])])
+          CFLAGS="$SAVED_CFLAGS"])
+TEST_AND_ADD_CFLAGS([-Wall])
+TEST_AND_ADD_CFLAGS([-Wdeclaration-after-statement])
+TEST_AND_ADD_CFLAGS([-Wextra])
+TEST_AND_ADD_CFLAGS([-Wmissing-declarations])
+TEST_AND_ADD_CFLAGS([-Wmissing-prototypes])
+TEST_AND_ADD_CFLAGS([-Wold-style-definition])
+TEST_AND_ADD_CFLAGS([-Wshadow])
+TEST_AND_ADD_CFLAGS([-Wunused-but-set-variable])
+TEST_AND_ADD_CFLAGS([-Wunused])
+TEST_AND_ADD_CFLAGS([-Wvla])
+AC_SUBST([AM_CFLAGS])
+
+dnl === CLEAR_LIBVARS([var_pfx])
+dnl ===   Clears <var_pfx>_{INCLUDES,LIBS}.
+AC_DEFUN([CLEAR_LIBVARS], [$1_INCLUDES=""; $1_LIBS=""])
+
 dnl === WITHLIB_OPTION([opt_pfx], [outvar_pfx])
 dnl ===   Defines --with-<opt_pfx>{include,lib}dir options which set
 dnl ===   the variables <outvar_pfx>_{INCLUDES,LIBS}.
@ -29,10 +75,23 @@ AC_DEFUN([WITHLIB_OPTION],
                              [use $2 libraries from DIR]),
               [$2_LIBS="-L$withval"])])

-dnl === Check for native log2
-AC_SEARCH_LIBS([log2], [m],,
-               [AC_DEFINE([NOT_HAVE_LOG2], [1],
-                          [Undefine this if you have log2().])])
+dnl === LIBCHECK_PROLOGUE([var_pfx])
+dnl ===   Caches the current values of CPPFLAGS/LIBS in SAVED_* then
+dnl ===   prepends the current values with <var_pfx>_{INCLUDES,LIBS}.
+AC_DEFUN([LIBCHECK_PROLOGUE],
+         [SAVED_CPPFLAGS=$CPPFLAGS
+          SAVED_LIBS=$LIBS
+          CPPFLAGS="$$1_INCLUDES $CPPFLAGS"
+          LIBS="$$1_LIBS $LIBS"])
+
+dnl === LIBCHECK_EPILOGUE([var_pfx])
+dnl ===   Restores the values of CPPFLAGS/LIBS from SAVED_* and exports
+dnl ===   <var_pfx>_{INCLUDES,LIBS} with AC_SUBST.
+AC_DEFUN([LIBCHECK_EPILOGUE],
+         [AC_SUBST($1_LIBS)
+          AC_SUBST($1_INCLUDES)
+          CPPFLAGS=$SAVED_CPPFLAGS
+          LIBS=$SAVED_LIBS])

 dnl === Check for pthread support
 AC_ARG_ENABLE([threading],
@ -51,10 +110,110 @@ if test "$enable_threading" = "yes"; then
 fi
 AC_MSG_NOTICE([checking if threading is enabled... ${enable_threading-no}])

+dnl === check for OpenGL/GLUT support ===
+CLEAR_LIBVARS([GL])
+WITHLIB_OPTION([gl], [GL])
+
+LIBCHECK_PROLOGUE([GL])
+
+glut_cflags="none"
+glut_ldflags="none"
+case $host_os in
+  darwin*)
+    # Special case for OSX builds. Append these to give the user a chance to
+    # override with --with-gl*
+    glut_cflags="$glut_cflags|-framework GLUT -framework OpenGL"
+    glut_ldflags="$glut_ldflags|-framework GLUT -framework OpenGL"
+    ;;
+esac
+
+GLUT_SAVED_CPPFLAGS="$CPPFLAGS"
+SAVED_IFS="$IFS"
+IFS="|"
+for flag in $glut_cflags; do
+  # restore IFS immediately as the autoconf macros may need the default.
+  IFS="$SAVED_IFS"
+  unset ac_cv_header_GL_glut_h
+  unset ac_cv_header_OpenGL_glut_h
+
+  case $flag in
+    none) ;;
+    *) CPPFLAGS="$flag $CPPFLAGS";;
+  esac
+  AC_CHECK_HEADERS([GL/glut.h GLUT/glut.h OpenGL/glut.h],
+                   [glut_headers=yes;
+                    test "$flag" = "none" || GL_INCLUDES="$CPPFLAGS";
+                    break])
+  CPPFLAGS="$GLUT_SAVED_CPPFLAGS"
+  test "$glut_headers" = "yes" && break
+done
+IFS="$SAVED_IFS"
+
+if test "$glut_headers" = "yes"; then
+  AC_LANG_PUSH([C])
+  GLUT_SAVED_LDFLAGS="$LDFLAGS"
+  SAVED_IFS="$IFS"
+  IFS="|"
+  for flag in $glut_ldflags; do
+    # restore IFS immediately as the autoconf macros may need the default.
+    IFS="$SAVED_IFS"
+    unset ac_cv_search_glBegin
+
+    case $flag in
+      none) ;;
+      *) LDFLAGS="$flag $LDFLAGS";;
+    esac
+
+    # find libGL
+    GL_SAVED_LIBS="$LIBS"
+    AC_SEARCH_LIBS([glBegin], [GL OpenGL])
+    LIBS="$GL_SAVED_LIBS"
+
+    # A direct link to libGL may not be necessary on e.g., linux.
+    GLUT_SAVED_LIBS="$LIBS"
+    for lib in "" "-lglut" "-lglut $ac_cv_search_glBegin"; do
+      LIBS="$lib"
+      AC_LINK_IFELSE(
+        [AC_LANG_PROGRAM([
+           #ifdef __cplusplus
+           # define EXTERN_C extern "C"
+           #else
+           # define EXTERN_C
+           #endif
+           EXTERN_C char glOrtho();
+           EXTERN_C char glutMainLoop();
+          ],[
+           glOrtho();
+           glutMainLoop();
+          ])
+        ],
+        AC_DEFINE(WEBP_HAVE_GL, [1],
+                  [Set to 1 if OpenGL is supported])
+        [glut_support=yes], []
+      )
+      if test "$glut_support" = "yes"; then
+        GL_LIBS="$LDFLAGS $lib"
+        break
+      fi
+    done
+    LIBS="$GLUT_SAVED_LIBS"
+    LDFLAGS="$GLUT_SAVED_LDFLAGS"
+    test "$glut_support" = "yes" && break
+  done
+  IFS="$SAVED_IFS"
+  AC_LANG_POP
+fi
+
+LIBCHECK_EPILOGUE([GL])
+
+if test "$glut_support" = "yes" -a "$enable_libwebpdemux" = "yes"; then
+  build_vwebp=yes
+fi
+AM_CONDITIONAL([BUILD_VWEBP], [test "$build_vwebp" = "yes"])
+
 dnl === check for PNG support ===

-PNG_INCLUDES=""
-PNG_LIBS=""
+CLEAR_LIBVARS([PNG])
 AC_PATH_PROGS(LIBPNG_CONFIG,
              [libpng-config libpng15-config libpng14-config libpng12-config])
 if test -n "$LIBPNG_CONFIG"; then
@ -68,11 +227,7 @@ fi

 WITHLIB_OPTION([png], [PNG])

-SAVED_CPPFLAGS=$CPPFLAGS
-SAVED_LIBS=$LIBS
-CPPFLAGS="$PNG_INCLUDES $CPPFLAGS"
-LIBS="$PNG_LIBS $LIBS"
-
+LIBCHECK_PROLOGUE([PNG])
 AC_CHECK_HEADER(png.h,
  AC_SEARCH_LIBS(png_get_libpng_ver, [png],
                 [test "$ac_cv_search_png_get_libpng_ver" = "none required" \
@ -80,6 +235,7 @@ AC_CHECK_HEADER(png.h,
                  PNG_INCLUDES="$PNG_INCLUDES -DWEBP_HAVE_PNG"
                  AC_DEFINE(WEBP_HAVE_PNG, [1],
                            [Set to 1 if PNG library is installed])
+                  png_support=yes
                 ],
                 [AC_MSG_WARN(Optional png library not found)
                  PNG_LIBS=""
@ -91,71 +247,82 @@ AC_CHECK_HEADER(png.h,
   PNG_INCLUDES=""
  ],
 )
-AC_SUBST(PNG_LIBS)
-AC_SUBST(PNG_INCLUDES)
-
-CPPFLAGS=$SAVED_CPPFLAGS
-LIBS=$SAVED_LIBS
+LIBCHECK_EPILOGUE([PNG])

 dnl === check for JPEG support ===

-JPEG_INCLUDES=""
-JPEG_LIBS=""
+CLEAR_LIBVARS([JPEG])
 WITHLIB_OPTION([jpeg], [JPEG])

-SAVED_CPPFLAGS=$CPPFLAGS
-SAVED_LIBS=$LIBS
-CPPFLAGS="$JPEG_INCLUDES $CPPFLAGS"
-LIBS="$JPEG_LIBS $LIBS"
-
+LIBCHECK_PROLOGUE([JPEG])
 AC_CHECK_HEADER(jpeglib.h,
  AC_CHECK_LIB(jpeg, jpeg_set_defaults,
               [JPEG_LIBS="$JPEG_LIBS -ljpeg"
                JPEG_INCLUDES="$JPEG_INCLUDES -DWEBP_HAVE_JPEG"
                AC_DEFINE(WEBP_HAVE_JPEG, [1],
                          [Set to 1 if JPEG library is installed])
+                jpeg_support=yes
               ],
               AC_MSG_WARN(Optional jpeg library not found),
               [$MATH_LIBS]),
  AC_MSG_WARN(jpeg library not available - no jpeglib.h)
 )
-AC_SUBST(JPEG_LIBS)
-AC_SUBST(JPEG_INCLUDES)
-
-CPPFLAGS=$SAVED_CPPFLAGS
-LIBS=$SAVED_LIBS
+LIBCHECK_EPILOGUE([JPEG])

 dnl === check for TIFF support ===

-TIFF_INCLUDES=""
-TIFF_LIBS=""
+CLEAR_LIBVARS([TIFF])
 WITHLIB_OPTION([tiff], [TIFF])

-SAVED_CPPFLAGS=$CPPFLAGS
-SAVED_LIBS=$LIBS
-CPPFLAGS="$TIFF_INCLUDES $CPPFLAGS"
-LIBS="$TIFF_LIBS $LIBS"
-
+LIBCHECK_PROLOGUE([TIFF])
 AC_CHECK_HEADER(tiffio.h,
  AC_CHECK_LIB(tiff, TIFFGetVersion,
               [TIFF_LIBS="$TIFF_LIBS -ltiff"
                TIFF_INCLUDES="$TIFF_INCLUDES -DWEBP_HAVE_TIFF"
                AC_DEFINE(WEBP_HAVE_TIFF, [1],
                          [Set to 1 if TIFF library is installed])
+                tiff_support=yes
               ],
               AC_MSG_WARN(Optional tiff library not found),
               [$MATH_LIBS]),
  AC_MSG_WARN(tiff library not available - no tiffio.h)
 )
-AC_SUBST(TIFF_LIBS)
-AC_SUBST(TIFF_INCLUDES)
+LIBCHECK_EPILOGUE([TIFF])

-CPPFLAGS=$SAVED_CPPFLAGS
-LIBS=$SAVED_LIBS
+dnl === check for GIF support ===
+
+CLEAR_LIBVARS([GIF])
+WITHLIB_OPTION([gif], [GIF])
+
+LIBCHECK_PROLOGUE([GIF])
+AC_CHECK_HEADER(gif_lib.h,
+  AC_CHECK_LIB([gif], [DGifOpenFileHandle],
+               [GIF_LIBS="$GIF_LIBS -lgif"
+                AC_DEFINE(WEBP_HAVE_GIF, [1],
+                          [Set to 1 if GIF library is installed])
+                gif_support=yes
+               ],
+               AC_MSG_WARN(Optional gif library not found),
+               [$MATH_LIBS]),
+  AC_MSG_WARN(gif library not available - no gif_lib.h)
+)
+LIBCHECK_EPILOGUE([GIF])
+
+if test "$gif_support" = "yes" -a \
+        "$enable_libwebpmux" = "yes"; then
+  build_gif2webp=yes
+fi
+AM_CONDITIONAL([BUILD_GIF2WEBP], [test "${build_gif2webp}" = "yes"])

 dnl === check for WIC support ===

-if test "$target_os" = "mingw32"; then
+AC_ARG_ENABLE([wic],
+              AS_HELP_STRING([--disable-wic],
+                             [Disable Windows Imaging Component (WIC) detection.
+                              @<:@default=auto@:>@]),,
+              [enable_wic=yes])
+
+if test "$target_os" = "mingw32" -a "$enable_wic" = "yes"; then
  AC_CHECK_HEADERS([wincodec.h shlwapi.h windows.h])
  if test "$ac_cv_header_wincodec_h" = "yes"; then
    AC_MSG_CHECKING(for Windows Imaging Component support)
@ -196,6 +363,19 @@ if test "$target_os" = "mingw32"; then
  fi
 fi

+dnl === If --enable-swap-16bit-csp is defined, add -DWEBP_SWAP_16BIT_CSP
+
+USE_SWAP_16BIT_CSP=""
+AC_MSG_CHECKING(if --enable-swap-16bit-csp option is specified)
+AC_ARG_ENABLE([swap-16bit-csp],
+              AS_HELP_STRING([--enable-swap-16bit-csp],
+                             [Enable byte swap for 16 bit colorspaces]))
+if test "$enable_swap_16bit_csp" = "yes"; then
+  USE_SWAP_16BIT_CSP="-DWEBP_SWAP_16BIT_CSP"
+fi
+AC_MSG_RESULT(${enable_swap_16bit_csp-no})
+AC_SUBST(USE_SWAP_16BIT_CSP)
+
 dnl === If --enable-experimental is defined, add -DWEBP_EXPERIMENTAL_FEATURES

 USE_EXPERIMENTAL_CODE=""
@ -203,7 +383,7 @@ AC_MSG_CHECKING(if --enable-experimental option is specified)
 AC_ARG_ENABLE([experimental], AS_HELP_STRING([--enable-experimental],
                                             [Activate experimental features]))
 if test "$enable_experimental" = "yes"; then
-        AC_DEFINE(EXPERIMENTAL,,[Enable experimental code])
+  AC_DEFINE(WEBP_EXPERIMENTAL_FEATURES, [1], [Enable experimental code])
  USE_EXPERIMENTAL_CODE="-DWEBP_EXPERIMENTAL_FEATURES"
 fi
 AC_MSG_RESULT(${enable_experimental-no})
@ -211,11 +391,27 @@ AC_SUBST(USE_EXPERIMENTAL_CODE)

 dnl === Check whether libwebpmux should be built
 AC_MSG_CHECKING(whether libwebpmux is to be built)
-AC_ARG_ENABLE([experimental-libwebpmux],
-              AS_HELP_STRING([--enable-experimental-libwebpmux],
+AC_ARG_ENABLE([libwebpmux],
+              AS_HELP_STRING([--enable-libwebpmux],
                             [Build libwebpmux @<:@default=no@:>@]))
-AC_MSG_RESULT(${enable_experimental_libwebpmux-no})
-AM_CONDITIONAL([WANT_MUX], [test "$enable_experimental_libwebpmux" = "yes"])
+AC_MSG_RESULT(${enable_libwebpmux-no})
+AM_CONDITIONAL([WANT_MUX], [test "$enable_libwebpmux" = "yes"])
+
+dnl === Check whether libwebpdemux should be built
+AC_MSG_CHECKING(whether libwebpdemux is to be built)
+AC_ARG_ENABLE([libwebpdemux],
+              AS_HELP_STRING([--enable-libwebpdemux],
+                             [Build libwebpdemux @<:@default=no@:>@]))
+AC_MSG_RESULT(${enable_libwebpdemux-no})
+AM_CONDITIONAL([WANT_DEMUX], [test "$enable_libwebpdemux" = "yes"])
+
+dnl === Check whether decoder library should be built.
+AC_MSG_CHECKING(whether decoder library is to be built)
+AC_ARG_ENABLE([libwebpdecoder],
+              AS_HELP_STRING([--enable-libwebpdecoder],
+                             [Build libwebpdecoder @<:@default=no@:>@]))
+AC_MSG_RESULT(${enable_libwebpdecoder-no})
+AM_CONDITIONAL([BUILD_LIBWEBPDECODER], [test "$enable_libwebpdecoder" = "yes"])

 dnl =========================

@ -224,9 +420,41 @@ AC_CONFIG_HEADERS([config.h])
 AC_CONFIG_FILES([Makefile src/Makefile man/Makefile \
                 examples/Makefile src/dec/Makefile \
                 src/enc/Makefile src/dsp/Makefile \
+                 src/demux/Makefile src/mux/Makefile \
                 src/utils/Makefile \
-                 src/mux/Makefile \
-                 src/libwebp.pc])
+                 src/libwebp.pc src/libwebpdecoder.pc \
+                 src/demux/libwebpdemux.pc src/mux/libwebpmux.pc])


 AC_OUTPUT
+
+AC_MSG_NOTICE([
+WebP Configuration Summary
+--------------------------
+
+Shared libraries: ${enable_shared}
+Static libraries: ${enable_static}
+Threaded decode: ${enable_threading-no}
+libwebp: yes
+libwebpdecoder: ${enable_libwebpdecoder-no}
+libwebpdemux: ${enable_libwebpdemux-no}
+libwebpmux: ${enable_libwebpmux-no}
+
+Tools:
+cwebp : yes
+  Input format support
+  ====================
+  JPEG : ${jpeg_support-no}
+  PNG  : ${png_support-no}
+  TIFF : ${tiff_support-no}
+  WIC  : ${wic_support-no}
+dwebp : yes
+  Output format support
+  =====================
+  PNG  : ${png_support-no}
+  WIC  : ${wic_support-no}
+GIF support : ${gif_support-no}
+gif2webp    : ${build_gif2webp-no}
+webpmux     : ${enable_libwebpmux-no}
+vwebp       : ${build_vwebp-no}
+])
--- a/doc/webp-container-spec.txt
+++ b/doc/webp-container-spec.txt
@ -13,9 +13,6 @@ end of this file.
 WebP Container Specification
 ============================

-_Working Draft, v0.5, 20120713_
-
-
 * TOC placeholder
 {:toc}

@ -27,8 +24,9 @@ WebP is an image format that uses either (i) the VP8 key frame encoding
 to compress image data in a lossy way, or (ii) the WebP lossless encoding
 (and possibly other encodings in the future). These encoding schemes should
 make it more efficient than currently used formats. It is optimized for fast
-image transfer over the network (e.g., for websites). This document describes
-the structure of a WebP file.
+image transfer over the network (e.g., for websites). The WebP format has
+feature parity (color profile, metadata, animation, etc.) with other formats as
+well. This document describes the structure of a WebP file.

 The WebP container (i.e., RIFF container for WebP) allows feature support over
 and above the basic use case of WebP (i.e., a file containing a single image
@ -38,24 +36,50 @@ for:
  * **Lossless compression.** An image can be losslessly compressed, using the
    WebP Lossless Format.

+  * **Metadata.** An image may have metadata stored in EXIF or XMP formats.
+
  * **Transparency.** An image may have transparency, i.e., an alpha channel.

+  * **Color Profile.** An image may have an embedded ICC profile as described
+    by the [International Color Consortium][iccspec].
+
+  * **Animation.** An image may have multiple frames with pauses between them,
+    making it an animation.
+
+  * **Image Fragmentation.** A single bitstream in WebP has an inherent
+    limitation for width or height of 2^14 pixels, and, when using VP8, a 512
+    KiB limit on the size of the first compressed partition. To support larger
+    images, the format supports images that are composed of multiple fragments,
+    each encoded as a separate bitstream. All fragments logically form a single
+    image: they have common metadata, color profile, etc. Image fragmentation
+    may also improve efficiency for larger images, e.g., grass can be encoded
+    differently than sky.
+
 The key words "MUST", "MUST NOT", "REQUIRED", "SHALL", "SHALL NOT",
 "SHOULD", "SHOULD NOT", "RECOMMENDED", "MAY", and "OPTIONAL" in this
 document are to be interpreted as described in [RFC 2119][].

+**Note:** Out of the features mentioned above, lossy compression, lossless
+compression, transparency, metadata, color profile and animation are finalized
+and are to be considered stable. On the other hand, image fragmentation is
+experimental as of now, and is open to discussion, feedback and comments.
+This is indicated by the annotation "_\[status: experimental\]_" in the relevant
+sections of this document.

 Terminology &amp; Basics
 ------------------------

-A WebP file contains a still image (i.e., an encoded matrix of pixels) and,
-optionally, transparency information. In case we need to refer only to the
+A WebP file contains either a still image (i.e., an encoded matrix of pixels)
+or an [animation](#animation). Optionally, it can also contain transparency
+information, color profile and metadata. In case we need to refer only to the
 matrix of pixels, we will call it the _canvas_ of the image.

 Below are additional terms used throughout this document:

-Code that reads WebP files is referred to as a _reader_, while
-code that writes them is referred to as a _writer_.
+_Reader/Writer_
+
+: Code that reads WebP files is referred to as a _reader_, while code that
+writes them is referred to as a _writer_.

 _uint16_

@ -69,10 +93,21 @@ _uint32_

 : A 32-bit, little-endian, unsigned integer.

+_FourCC_
+
+: A _FourCC_ (four-character code) is a _uint32_ created by concatenating four
+  ASCII characters in little-endian order.
+
 _1-based_
+
 : An unsigned integer field storing values offset by `-1`. e.g., Such a field
 would store value _25_ as _24_.

+RIFF file format
+----------------
+The WebP file format is based on the RIFF (resource interchange file format)
+document format.
+
 The basic element of a RIFF file is a _chunk_. It consists of:

     0                   1                   2                   3
@ -87,57 +122,27 @@ The basic element of a RIFF file is a _chunk_. It consists of:

 Chunk FourCC: 32 bits

-: ASCII four character code or _chunk tag_ used for chunk identification.
+: ASCII four-character code used for chunk identification.

 Chunk Size: 32 bits (_uint32_)

-: The size of the chunk (_ckSize_) not including this field, the chunk
-  identifier and padding.
+: The size of the chunk not including this field, the chunk identifier or
+  padding.

 Chunk Payload: _Chunk Size_ bytes

-: The data payload. If _Chunk Size_ is odd a single padding byte that
-  SHOULD be `0` is added.
+: The data payload. If _Chunk Size_ is odd, a single padding byte -- that
+  SHOULD be `0` -- is added.

 _ChunkHeader('ABCD')_

-: This is used to describe the fourcc and size header of individual
-  chunks, where 'ABCD' is the fourcc for the chunk. This element's
+: This is used to describe the _FourCC_ and _Chunk Size_ header of individual
+  chunks, where 'ABCD' is the FourCC for the chunk. This element's
  size is 8 bytes.

-: Note that, in this specification, all chunk tag characters are in
-  file order, not in byte order of a uint32 of any particular
-  architecture.
-
-_list of chunks_
-
-: A concatenation of multiple chunks.
-
-: We will refer to the first chunk as having _position_ 0, the second
-  as position 1, etc. By _chunk with index 0 among "ABCD"_ we mean
-  the first chunk among the chunks of type "ABCD" in the list, the
-  _chunk with index 1 among "ABCD"_ is the second such chunk, etc.
-
-A WebP file MUST begin with a single chunk with a tag 'RIFF'. All
-other defined chunks are contained within this chunk. The file SHOULD
-NOT contain anything after it.
-
-The maximum size of RIFF's _ckSize_ is 2^32 minus 10 bytes. The size
-of the whole file is at most 4GiB minus 2 bytes.
-
-**Note:** some RIFF libraries are said to have bugs when handling files
-larger than 1GiB or 2GiB. If you are using an existing library, check
-that it handles large files correctly.
-
-The first four bytes of the RIFF chunk contents (i.e., bytes 8-11 of the file)
-MUST be the ASCII string "WEBP". They are followed by a list of chunks. As the
-size of any chunk is even, the size of the RIFF chunk is also even.  The
-contents of the chunks in that list will be described in the following sections.
-
-**Note:** RIFF has a convention that all-uppercase chunks are standard
-chunks that apply to any RIFF file format, while chunks specific to a
-file format are all lowercase. WebP does not follow this convention.
-
+**Note:** RIFF has a convention that all-uppercase chunk FourCCs are standard
+chunks that apply to any RIFF file format, while FourCCs specific to a file
+format are all lowercase. WebP does not follow this convention.

 WebP file header
 ----------------
@ -158,12 +163,20 @@ WebP file header

 File Size: 32 bits (_uint32_)

-: The size of the file in bytes starting at offset 8.
+: The size of the file in bytes starting at offset 8. The maximum value of
+this field is 2^32 minus 10 bytes and thus the size of the whole file is at
+most 4GiB minus 2 bytes.

 'WEBP': 32 bits

 : The ASCII characters 'W' 'E' 'B' 'P'.

+A WebP file MUST begin with a RIFF header with the FourCC 'WEBP'. The file size
+in the header is the total size of the chunks that follow plus `4` bytes for
+the 'WEBP' FourCC. The file SHOULD NOT contain any data after the data
+covered by _File Size_. As the size of any chunk is even, the size given by
+the RIFF header is also even. The contents of individual chunks will be
+described in the following sections.
+
 Simple file format (lossy)
 --------------------------

@ -249,9 +262,25 @@ An extended format file consists of:

  * A 'VP8X' chunk with information about features used in the file.

-  * An optional 'ALPH' chunk with transparency information.
+  * An optional 'ICCP' chunk with color profile.

-  * The image bitstream contained in either a 'VP8 ' or 'VP8L' chunk.
+  * An optional 'ANIM' chunk with animation control data.
+
+  * Image data.
+
+  * An optional 'EXIF' chunk with EXIF metadata.
+
+  * An optional 'XMP ' chunk with XMP metadata.
+
+  * An optional list of [unknown chunks](#unknown-chunks). _\[status: experimental\]_
+
+For a _still image_, the _image data_ consists of a single frame, whereas for
+an _animated image_, it consists of multiple frames. More details about frames
+can be found in the [Animation](#animation) section.
+
+Moreover, each frame can be fragmented or non-fragmented, as will be described
+in the [Extended WebP file header](#extended_header) section. More details about
+fragments can be found in the [Fragments](#fragments) section.

 All chunks SHOULD be placed in the same order as listed above. If a chunk
 appears in the wrong place, the file is invalid, but readers MAY parse the
@ -264,6 +293,7 @@ ignoring late chunks should make programs that need to do a full search
 give the same results as the ones stopping early.

 Extended WebP file header:
+{:#extended_header}

     0                   1                   2                   3
     0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
@ -272,25 +302,42 @@ Extended WebP file header:
    +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
    |                      ChunkHeader('VP8X')                      |
    +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
-    | Rsv |L|  Rsv  |                   Reserved                    |
+    |Rsv|I|L|E|X|A|F|                   Reserved                    |
    +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
    |          Canvas Width Minus One               |             ...
    +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
    ...  Canvas Height Minus One    |
    +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+

-Reserved (Rsv): 4 bits
+Reserved (Rsv): 2 bits

 : SHOULD be `0`.

+ICC profile (I): 1 bit
+
+: Set if the file contains an ICC profile.
+
 Alpha (L): 1 bit

-: Set if the file contains some (or all) images with transparency information
+: Set if any of the frames of the image contain transparency information
 ("alpha").

-Reserved (Rsv): 3 bits
+EXIF metadata (E): 1 bit

-: SHOULD be `0`.
+: Set if the file contains EXIF metadata.
+
+XMP metadata (X): 1 bit
+
+: Set if the file contains XMP metadata.
+
+Animation (A): 1 bit
+
+: Set if this is an animated image. Data in 'ANIM' and 'ANMF' chunks should be
+used to control the animation.
+
+Image Fragmentation (F): 1 bit _\[status: experimental\]_
+
+: Set if any of the frames in the image are represented by fragments.

 Reserved: 24 bits

@ -312,6 +359,209 @@ Future specifications MAY add more fields.

 ### Chunks

+#### Animation
+
+An animation is controlled by ANIM and ANMF chunks.
+
+ANIM Chunk:
+{:#anim_chunk}
+
+For an animated image, this chunk contains the _global parameters_ of the
+animation.
+
+     0                   1                   2                   3
+     0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+    +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+    |                      ChunkHeader('ANIM')                      |
+    +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+    |                       Background Color                        |
+    +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+    |          Loop Count           |
+    +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+
+Background Color: 32 bits (_uint32_)
+
+: The default background color of the canvas in \[Blue, Green, Red, Alpha\]
+byte order. This color MAY be used to fill the unused space on the canvas around
+the frames, as well as the transparent pixels of the first frame. Background
+color is also used when disposal method is `1`.
+
+**Note**:
+
+  * Background color MAY contain a transparency value (alpha), even if the
+    _Alpha_ flag in [VP8X chunk](#extended_header) is unset.
+
+  * Viewer applications SHOULD treat the background color value as a hint, and
+    are not required to use it.
+
+Loop Count: 16 bits (_uint16_)
+
+: The number of times to loop the animation. `0` means infinitely.
+
+This chunk MUST appear if the _Animation_ flag in the VP8X chunk is set.
+If the _Animation_ flag is not set and this chunk is present, it
+SHOULD be ignored.
+
+
+ANMF chunk:
+
+For animated images, this chunk contains information about a _single_ frame.
+If the _Animation_ flag is not set, then this chunk SHOULD NOT be present.
+
+     0                   1                   2                   3
+     0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+    +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+    |                      ChunkHeader('ANMF')                      |
+    +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+    |                        Frame X                |             ...
+    +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+    ...          Frame Y            |   Frame Width Minus One     ...
+    +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+    ...             |           Frame Height Minus One              |
+    +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+    |                 Frame Duration                |  Reserved |B|D|
+    +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+    |                         Frame Data                            |
+    +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+
+Frame X: 24 bits (_uint24_)
+
+: The X coordinate of the upper left corner of the frame is `Frame X * 2`
+
+Frame Y: 24 bits (_uint24_)
+
+: The Y coordinate of the upper left corner of the frame is `Frame Y * 2`
+
+Frame Width Minus One: 24 bits (_uint24_)
+
+: The _1-based_ width of the frame.
+  The frame width is `1 + Frame Width Minus One`
+
+Frame Height Minus One: 24 bits (_uint24_)
+
+: The _1-based_ height of the frame.
+  The frame height is `1 + Frame Height Minus One`
+
+Frame Duration: 24 bits (_uint24_)
+
+: The time to wait before displaying the next frame, in 1 millisecond units.
+In particular, frame duration of 0 is useful when one wants to update multiple
+areas of the canvas at once during the animation.
+
+Reserved: 6 bits
+
+: SHOULD be 0.
+
+Blending method (B): 1 bit
+
+: Indicates how transparent pixels of _the current frame_ are to be blended with
+corresponding pixels of the previous canvas:
+
+  * `0`: Use alpha blending. After disposing of the previous frame, render the
+    current frame on the canvas using [alpha-blending](#alpha-blending). If the
+    current frame does not have an alpha channel, assume alpha value of 255,
+    effectively replacing the rectangle.
+
+  * `1`: Do not blend. After disposing of the previous frame, render the
+    current frame on the canvas by overwriting the rectangle covered by the
+    current frame.
+
+Disposal method (D): 1 bit
+
+: Indicates how _the current frame_ is to be treated after it has been displayed
+(before rendering the next frame) on the canvas:
+
+  * `0`: Do not dispose. Leave the canvas as is.
+
+  * `1`: Dispose to background color. Fill the _rectangle_ on the canvas covered
+    by the _current frame_ with background color specified in the
+    [ANIM chunk](#anim_chunk).
+
+**Notes**:
+
+  * The frame disposal only applies to the _frame rectangle_, that is, the
+    rectangle defined by _Frame X_, _Frame Y_, _frame width_ and _frame height_.
+    It may or may not cover the whole canvas.
+
+{:#alpha-blending}
+  * **Alpha-blending**:
+
+    Given that each of the R, G, B and A channels is 8-bit, and the RGB
+    channels are _not premultiplied_ by alpha, the formula for blending
+    'dst' onto 'src' is:
+
+~~~~~
+    blend.A = src.A + dst.A * (1 - src.A / 255)
+    if blend.A = 0 then
+      blend.RGB = 0
+    else
+      blend.RGB = (src.RGB * src.A +
+                   dst.RGB * dst.A * (1 - src.A / 255)) / blend.A
+~~~~~
+
+  * Alpha-blending SHOULD be done in linear color space, by taking into account
+    the [color profile](#color-profile) of the image. If the color profile is
+    not present, sRGB is to be assumed. (Note that sRGB also needs to be
+    linearized due to a gamma of ~2.2).
+
+Frame Data: _Chunk Size_ - `16` bytes
+
+: For a fragmented frame, it consists of multiple [fragment chunks](#fragments).
+
+: For a non-fragmented frame, it consists of:
+
+  * An optional [alpha subchunk](#alpha) for the frame.
+
+  * A [bitstream subchunk](#bitstream-vp8vp8l) for the frame.
+
+  * An optional list of [unknown chunks](#unknown-chunks).
+
+**Note**: The 'ANMF' payload, _Frame Data_ above, consists of individual
+_padded_ chunks as described by the [RIFF file format](#riff-file-format).
+
+#### Fragments _\[status: experimental\]_
+
+For images that are represented by fragments, this chunk contains data for
+a single fragment. If the _Image Fragmentation_ flag is not set, then this chunk
+SHOULD NOT be present.
+
+     0                   1                   2                   3
+     0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+    +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+    |                      ChunkHeader('FRGM')                      |
+    +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+    |                  Fragment X                   |             ...
+    +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+    ...       Fragment Y            |         Fragment Data         |
+    +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+
+Fragment X: 24 bits (_uint24_)
+
+: The X coordinate of the upper left corner of the fragment is `Fragment X * 2`
+
+Fragment Y: 24 bits (_uint24_)
+
+: The Y coordinate of the upper left corner of the fragment is `Fragment Y * 2`
+
+Fragment Data: _Chunk Size_ - `6` bytes
+
+: It contains:
+
+  * An optional [alpha subchunk](#alpha) for the fragment.
+  * The [bitstream subchunk](#bitstream-vp8vp8l) for the fragment.
+  * An optional list of [unknown chunks](#unknown-chunks).
+
+Note: The width and height of the fragment are obtained from the bitstream
+subchunk.
+
+The fragments of a frame SHOULD have the following properties:
+
+  * They collectively cover the whole frame.
+
+  * No pair of fragments have any overlapping region on the frame.
+
+  * No portion of any fragment should be located outside of the canvas.
+
 #### Alpha

     0                   1                   2                   3
@ -322,12 +572,18 @@ Future specifications MAY add more fields.
    |Rsv| P | F | C |     Alpha Bitstream...                        |
    +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+

-Compression method (C): 2 bits
+Reserved (Rsv): 2 bits

-: The compression method used:
+: SHOULD be `0`.

-  * `0`: No compression.
-  * `1`: Compressed using the WebP lossless format.
+Pre-processing (P): 2 bits
+
+: These INFORMATIVE bits are used to signal the pre-processing that has
+been performed during compression. The decoder can use this information to
+e.g. dither the values or smooth the gradients prior to display.
+
+  * `0`: no pre-processing
+  * `1`: level reduction

 Filtering method (F): 2 bits

@ -360,8 +616,8 @@ where `clip(v)` is equal to:
  * v    otherwise

 The final value is derived by adding the decompressed value `X` to the
-predictor and using modulo-256 arithmetic to wrap the [256-511] range
-into the [0-255] one:
+predictor and using modulo-256 arithmetic to wrap the \[256-511\] range
+into the \[0-255\] one:

 `alpha = (predictor + X) % 256`

@ -374,30 +630,24 @@ There are special cases for left-most and top-most pixel positions:
    location (x, 0) are predicted using the location (x-1, 0) on the left.


-Pre-processing (P): 2 bits
-
-: These INFORMATIVE bits are used to signal the pre-processing that has
-been performed during compression. The decoder can use this information to
-e.g. dither the values or smooth the gradients prior to display.
-
-  * `0`: no pre-processing
-  * `1`: level reduction
-
 Decoders are not required to use this information in any specified way.

-Reserved (Rsv): 2 bits
+Compression method (C): 2 bits

-: SHOULD be `0`.
+: The compression method used:
+
+  * `0`: No compression.
+  * `1`: Compressed using the WebP lossless format.

 Alpha bitstream: _Chunk Size_ - `1` bytes

 : Encoded alpha bitstream.

-This optional chunk contains encoded alpha data for the image. An image
-containing a 'VP8L' chunk SHOULD NOT contain this chunk.
+This optional chunk contains encoded alpha data for this frame/fragment. A
+frame/fragment containing a 'VP8L' chunk SHOULD NOT contain this chunk.

-**Rationale**: The transparency information of the image is already part
-of the 'VP8L' chunk.
+**Rationale**: The transparency information is already part of the 'VP8L'
+chunk.

 The alpha channel data is stored as uncompressed raw data (when
 compression method is '0') or compressed using the lossless format
@ -425,7 +675,7 @@ compression method is '0') or compressed using the lossless format

 #### Bitstream (VP8/VP8L)

-This chunk contains compressed image data.
+This chunk contains compressed bitstream data for a single frame/fragment.

 A bitstream chunk may be either (i) a VP8 chunk, using "VP8 " (note the
 significant fourth-character space) as its tag _or_ (ii) a VP8L chunk, using
@ -435,10 +685,166 @@ The formats of VP8 and VP8L chunks are as described in sections
 [Simple file format (lossy)](#simple-file-format-lossy)
 and [Simple file format (lossless)](#simple-file-format-lossless) respectively.

-#### Unknown Chunks
+#### Color profile

-A file MAY contain other unknown chunks. Readers SHOULD ignore these chunks.
-Writers SHOULD preserve them in their original order.
+     0                   1                   2                   3
+     0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+    +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+    |                      ChunkHeader('ICCP')                      |
+    +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+    |                       Color Profile                           |
+    +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+
+Color Profile: _Chunk Size_ bytes
+
+: ICC profile.
+
+This chunk MUST appear before the image data.
+
+There SHOULD be at most one such chunk. If there are more such chunks, readers
+MAY ignore all except the first one.
+See the [ICC Specification][iccspec] for details.
+
+If this chunk is not present, sRGB SHOULD be assumed.
+
+#### Metadata
+
+Metadata can be stored in 'EXIF' or 'XMP ' chunks.
+
+There SHOULD be at most one chunk of each type ('EXIF' and 'XMP '). If there
+are more such chunks, readers MAY ignore all except the first one. Also, a file
+may contain both 'EXIF' and 'XMP ' chunks.
+
+The chunks are defined as follows:
+
+EXIF chunk:
+
+     0                   1                   2                   3
+     0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+    +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+    |                      ChunkHeader('EXIF')                      |
+    +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+    |                        EXIF Metadata                          |
+    +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+
+EXIF Metadata: _Chunk Size_ bytes
+
+: image metadata in EXIF format.
+
+
+XMP chunk:
+
+     0                   1                   2                   3
+     0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+    +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+    |                      ChunkHeader('XMP ')                      |
+    +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+    |                        XMP Metadata                           |
+    +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+
+XMP Metadata: _Chunk Size_ bytes
+
+: image metadata in XMP format.
+
+Additional guidance about handling metadata can be found in the
+Metadata Working Group's [Guidelines for Handling Metadata][metadata].
+
+#### Unknown Chunks _\[status: experimental\]_
+
+A RIFF chunk (described in the [RIFF file format](#riff-file-format) section)
+whose _Chunk FourCC_ is different from that of any of the chunks described in
+this document is considered an _unknown chunk_.
+
+**Rationale**: Allowing unknown chunks gives a provision for future extension
+of the format, and also allows storage of any application-specific data.
+
+A file MAY contain unknown chunks:
+
+  * At the end of the file as described in [Extended WebP file
+    header](#extended_header) section.
+  * At the end of FRGM and ANMF chunks as described in [Fragments](#fragments)
+    and [Animation](#animation) sections.
+
+Readers SHOULD ignore these chunks. Writers SHOULD preserve them in their
+original order (unless they specifically intend to modify these chunks).
+
+### Assembling the Canvas from fragments/frames
+
+Here we provide an overview of how a reader should assemble a canvas in case
+of a fragmented-image and in case of an animated image. The notation
+_VP8X.field_ means the field in the 'VP8X' chunk with the same description.
+
+Displaying a _fragmented image_ canvas MUST be equivalent to the following
+pseudocode: _\[status: experimental\]_
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+assert VP8X.flags.hasFragments
+canvas ← new black image of size VP8X.canvasWidth x VP8X.canvasHeight.
+frgm_params ← nil
+for chunk in image_data:
+    assert chunk.tag is "FRGM"
+    frgm_params.fragmentX = Fragment X
+    frgm_params.fragmentY = Fragment Y
+    for subchunk in 'Fragment Data':
+        if subchunk.tag == "ALPH":
+            assert alpha subchunks not found in 'Fragment Data' earlier
+            frgm_params.alpha = alpha_data
+        else if subchunk.tag == "VP8 " OR subchunk.tag == "VP8L":
+            assert bitstream subchunks not found in 'Fragment Data' earlier
+            frgm_params.bitstream = bitstream_data
+    frgm_params.fragmentWidth = Width extracted from bitstream subchunk
+    frgm_params.fragmentHeight = Height extracted from bitstream subchunk
+    assert VP8X.canvasWidth >=
+        frgm_params.fragmentX + frgm_params.fragmentWidth
+    assert VP8X.canvasHeight >=
+        frgm_params.fragmentY + frgm_params.fragmentHeight
+    assert fragment has the properties mentioned in the "Fragments" section.
+    render fragment with frgm_params.alpha and frgm_params.bitstream on canvas
+    with top-left corner in (frgm_params.fragmentX, frgm_params.fragmentY).
+canvas contains the decoded canvas.
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Displaying an _animated image_ canvas MUST be equivalent to the following
+pseudocode:
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+assert VP8X.flags.hasAnimation
+canvas ← new image of size VP8X.canvasWidth x VP8X.canvasHeight with
+background color ANIM.background_color.
+loop_count ← ANIM.loopCount
+dispose_method ← ANIM.disposeMethod
+if loop_count == 0:
+    loop_count = ∞
+frame_params ← nil
+for loop = 0, ..., loop_count - 1
+    assert next chunk in image_data is ANMF
+    frame_params.frameX = Frame X
+    frame_params.frameY = Frame Y
+    frame_params.frameWidth = Frame Width Minus One + 1
+    frame_params.frameHeight = Frame Height Minus One + 1
+    frame_params.frameDuration = Frame Duration
+    assert VP8X.canvasWidth >= frame_params.frameX + frame_params.frameWidth
+    assert VP8X.canvasHeight >= frame_params.frameY + frame_params.frameHeight
+    if VP8X.flags.hasFragments and first subchunk in 'Frame Data' is FRGM
+        // Fragmented frame.
+        frame_params.{bitstream,alpha} = canvas decoded from subchunks in
+                                         'Frame Data' as per the pseudocode for
+                                         _fragmented image_ above.
+    else
+        // Non-fragmented frame.
+        for subchunk in 'Frame Data':
+            if subchunk.tag == "ALPH":
+                assert alpha subchunks not found in 'Frame Data' earlier
+                frame_params.alpha = alpha_data
+            else if subchunk.tag == "VP8 " OR subchunk.tag == "VP8L":
+                assert bitstream subchunks not found in 'Frame Data' earlier
+                frame_params.bitstream = bitstream_data
+    render frame with frame_params.alpha and frame_params.bitstream on canvas
+    with top-left corner in (frame_params.frameX, frame_params.frameY), using
+    dispose method dispose_method.
+    Show the contents of the image for frame_params.frameDuration * 1ms.
+canvas contains the decoded canvas.
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

 Example file layouts
 --------------------
@ -461,7 +867,43 @@ RIFF/WEBP
 +- VP8L (lossless bitstream)
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

+A lossless image with ICC profile and XMP metadata may
+look as follows:
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+RIFF/WEBP
++- VP8X (descriptions of features used)
++- ICCP (color profile)
++- VP8L (lossless bitstream)
++- XMP  (metadata)
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+A fragmented image may look as follows:
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+RIFF/WEBP
++- VP8X (descriptions of features used)
++- FRGM (fragment1 parameters + data)
++- FRGM (fragment2 parameters + data)
++- FRGM (fragment3 parameters + data)
++- FRGM (fragment4 parameters + data)
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+An animated image with EXIF metadata may look as follows:
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+RIFF/WEBP
++- VP8X (descriptions of features used)
++- ANIM (global animation parameters)
++- ANMF (frame1 parameters + data)
++- ANMF (frame2 parameters + data)
++- ANMF (frame3 parameters + data)
++- ANMF (frame4 parameters + data)
++- EXIF (metadata)
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
 [vp8spec]:  http://tools.ietf.org/html/rfc6386
 [webpllspec]: https://gerrit.chromium.org/gerrit/gitweb?p=webm/libwebp.git;a=blob;f=doc/webp-lossless-bitstream-spec.txt;hb=master
+[iccspec]: http://www.color.org/icc_specs2.xalter
 [metadata]: http://www.metadataworkinggroup.org/pdf/mwg_guidance.pdf
 [rfc 2119]: http://tools.ietf.org/html/rfc2119
--- a/doc/webp-lossless-bitstream-spec.txt
+++ b/doc/webp-lossless-bitstream-spec.txt
@ -236,7 +236,7 @@ predicted) is encoded. The _prediction mode_ determines the type of
 prediction to use. We divide the image into squares and all the pixels
 in a square use same prediction mode.

-The first 4 bits of prediction data define the block width and height in
+The first 3 bits of prediction data define the block width and height in
 number of bits. The number of block columns, `block_xsize`, is used in
 indexing two-dimensionally.

@ -361,14 +361,14 @@ int ClampAddSubtractHalf(int a, int b) {
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

 There are special handling rules for some border pixels. If there is a
-prediction transform, regardless of the mode [0..13] for these pixels,
+prediction transform, regardless of the mode \[0..13\] for these pixels,
 the predicted value for the left-topmost pixel of the image is
 0xff000000, L-pixel for all pixels on the top row, and T-pixel for all
 pixels on the leftmost column.

 Addressing the TR-pixel for pixels on the rightmost column is
 exceptional. The pixels on the rightmost column are predicted by using
-the modes [0..13] just like pixels not on border, but by using the
+the modes \[0..13\] just like pixels not on border, but by using the
 leftmost pixel on the same row as the current TR-pixel. The TR-pixel
 offset in memory is the same for border and non-border pixels.

@ -420,7 +420,7 @@ void ColorTransform(uint8 red, uint8 blue, uint8 green,

 `ColorTransformDelta` is computed using a signed 8-bit integer
 representing a 3.5-fixed-point number, and a signed 8-bit RGB color
-channel (c) [-128..127] and is defined as follows:
+channel (c) \[-128..127\] and is defined as follows:

 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 int8 ColorTransformDelta(int8 t, int8 c) {
@ -436,12 +436,12 @@ consistent with each other.

 Now we describe the contents of color transform data so that decoding
 can apply the inverse color transform and recover the original red and
-blue values. The first 4 bits of the color transform data contain the
+blue values. The first 3 bits of the color transform data contain the
 width and height of the image block in number of bits, just like the
 predictor transform:

 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-int size_bits = ReadStream(3) + 2;
+int size_bits = ReadBits(3) + 2;
 int block_width = 1 << size_bits;
 int block_height = 1 << size_bits;
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@ -518,7 +518,7 @@ follows:

 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 // 8 bit value for color table size
-int color_table_size = ReadStream(8) + 1;
+int color_table_size = ReadBits(8) + 1;
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

 The color table is stored using the image storage format itself. The
@ -567,10 +567,10 @@ if (color_table_size <= 2) {

 `width_bits` has a value of 0, 1, 2 or 3. A value of 0 indicates no
 pixel bundling to be done for the image. A value of 1 indicates that two
-pixels are combined together, and each pixel has a range of [0..15]. A
+pixels are combined together, and each pixel has a range of \[0..15\]. A
 value of 2 indicates that four pixels are combined together, and each
-pixel has a range of [0..3]. A value of 3 indicates that eight pixels
-are combined together and each pixel has a range of [0..1], i.e., a
+pixel has a range of \[0..3\]. A value of 3 indicates that eight pixels
+are combined together and each pixel has a range of \[0..1\], i.e., a
 binary value.

 The values are packed into the green component as follows:
@ -592,80 +592,107 @@ The values are packed into the green component as follows:
 4 Image Data
 ------------

-Image data is an array of pixel values in scan-line order. We use image
-data in five different roles: The main role, an auxiliary role related
-to entropy coding, and three further roles related to transforms.
+Image data is an array of pixel values in scan-line order.

-  1. ARGB image.
-  2. Entropy image. The red and green components define the meta Huffman
-     code used in a particular area of the image.
-  3. Predictor image. The green component defines which of the 14 values
-     is used within a particular square of the image.
-  4. Color indexing image. An array of up to 256 ARGB colors is used for
-     transforming a green-only image, using the green value as an index
-     to this one-dimensional array.
-  5. Color transformation image. Defines signed 3.5 fixed-point
-     multipliers that are used to predict the red, green, and blue
-     components, to reduce entropy.
+### 4.1 Roles of Image Data

-To divide the image into multiple regions, the image is first divided
-into a set of fixed-size blocks (typically 16x16 blocks). Each of these
-blocks can be modeled using an entropy code, in a way where several
-blocks can share the same entropy code. There is a cost in transmitting
-an entropy code, and in order to minimize this cost, statistically
-similar blocks can share an entropy code. The blocks sharing an entropy
-code can be found by clustering their statistical properties, or by
-repeatedly joining two randomly selected clusters when it reduces the
-overall amount of bits needed to encode the image. See the section
-[Decoding of Meta Huffman Codes](#decoding-of-meta-huffman-codes) in
-[Chapter 5](#entropy-code) for an explanation of how this entropy image
-is stored.
+We use image data in five different roles:

-Each pixel is encoded using one of three possible methods:
+  1. ARGB image: Stores the actual pixels of the image.
+  1. Entropy image: Stores the
+     [meta Huffman codes](#decoding-of-meta-huffman-codes). The red and green
+     components of a pixel define the meta Huffman code used in a particular
+     block of the ARGB image.
+  1. Predictor image: Stores the metadata for [Predictor
+     Transform](#predictor-transform). The green component of a pixel defines
+     which of the 14 predictors is used within a particular block of the
+     ARGB image.
+  1. Color transform image. It is created by `ColorTransformElement` values
+     (defined in [Color Transform](#color-transform)) for different blocks of
+     the image. Each `ColorTransformElement` `'cte'` is treated as a pixel whose
+     alpha component is `255`, red component is `cte.red_to_blue`, green
+     component is `cte.green_to_blue` and blue component is `cte.green_to_red`.
+  1. Color indexing image: An array of size `color_table_size` (up to 256
+     ARGB values) storing the metadata for the
+     [Color Indexing Transform](#color-indexing-transform). This is stored as an
+     image of width `color_table_size` and height `1`.

-  1. Huffman coded literals, where each channel (green, alpha, red,
-     blue) is entropy-coded independently;
-  2. LZ77, a sequence of pixels in scan-line order copied from elsewhere
+### 4.2 Encoding of Image data
+
+The encoding of image data is independent of its role.
+
+The image is first divided into a set of fixed-size blocks (typically 16x16
+blocks). Each of these blocks is modeled using its own entropy codes. Also,
+several blocks may share the same entropy codes.
+
+**Rationale:** Storing an entropy code incurs a cost. This cost can be minimized
+if statistically similar blocks share an entropy code, thereby storing that code
+only once. For example, an encoder can find similar blocks by clustering them
+using their statistical properties, or by repeatedly joining a pair of randomly
+selected clusters when it reduces the overall amount of bits needed to encode
+the image.
+
+Each pixel is encoded using one of the three possible methods:
+
+  1. Huffman coded literal: each channel (green, red, blue and alpha) is
+     entropy-coded independently;
+  2. LZ77 backward reference: a sequence of pixels is copied from elsewhere
     in the image; or
-  3. Color cache, using a short multiplicative hash code (color cache
+  3. Color cache code: using a short multiplicative hash code (color cache
     index) of a recently seen color.

-In the following sections we introduce the main concepts in LZ77 prefix
-coding, LZ77 entropy coding, LZ77 distance mapping, and color cache
-codes. The actual details of the entropy code are described in more
-detail in [Chapter 5](#entropy-code).
+The following sub-sections describe each of these in detail.

+#### 4.2.1 Huffman Coded Literals

-### LZ77 Prefix Coding
+The pixel is stored as Huffman coded values of green, red, blue and alpha (in
+that order). See [this section](#decoding-entropy-coded-image-data) for details.

-Prefix coding divides large integer values into two parts: the prefix
-code and the extra bits. The benefit of this approach is that entropy
-coding is later used only for the prefix code, reducing the resources
-needed by the entropy code. The extra bits are stored as they are,
-without an entropy code.
+#### 4.2.2 LZ77 Backward Reference

-This prefix code is used for coding backward reference lengths and
-distances. The extra bits form an integer that is added to the lower
-value of the range. Hence the LZ77 lengths and distances are divided
-into prefix codes and extra bits. Performing the Huffman coding only on
-the prefixes reduces the size of the Huffman codes to tens of values
-instead of a million (distance) or several thousands (length).
+Backward references are tuples of _length_ and _distance code_:

-| Prefix code | Value range     | Extra bits |
-| ----------- | --------------- | ---------- |
-| 0           | 1               | 0          |
-| 1           | 2               | 0          |
-| 2           | 3               | 0          |
-| 3           | 4               | 0          |
-| 4           | 5..6            | 1          |
-| 5           | 7..8            | 1          |
-| 6           | 9..12           | 2          |
-| 7           | 13..16          | 2          |
+  * Length indicates how many pixels in scan-line order are to be copied.
+  * Distance code is a number indicating the position of a previously seen
+    pixel, from which the pixels are to be copied. The exact mapping is
+    described [below](#distance-mapping).
+
+The length and distance values are stored using **LZ77 prefix coding**.
+
+LZ77 prefix coding divides large integer values into two parts: the _prefix
+code_ and the _extra bits_: the prefix code is stored using an entropy code,
+while the extra bits are stored as they are (without an entropy code).
+
+**Rationale**: This approach reduces the storage requirement for the entropy
+code. Also, large values are usually rare, and so extra bits would be used for
+very few values in the image. Thus, this approach results in a better
+compression overall.
+
+The following table denotes the prefix codes and extra bits used for storing
+different ranges of values.
+
+Note: The maximum backward reference length is limited to 4096. Hence, only the
+first 24 prefix codes (with the respective extra bits) are meaningful for length
+values. For distance values, however, all the 40 prefix codes are valid.
+
+| Value range     | Prefix code | Extra bits |
+| --------------- | ----------- | ---------- |
+| 1               | 0           | 0          |
+| 2               | 1           | 0          |
+| 3               | 2           | 0          |
+| 4               | 3           | 0          |
+| 5..6            | 4           | 1          |
+| 7..8            | 5           | 1          |
+| 9..12           | 6           | 2          |
+| 13..16          | 7           | 2          |
 | ...             | ...         | ...        |
-| 38          | 262145..524288  | 18         |
-| 39          | 524289..1048576 | 18         |
+| 3073..4096      | 23          | 10         |
+| ...             | ...         | ...        |
+| 524289..786432  | 38          | 18         |
+| 786433..1048576 | 39          | 18         |

-The code to obtain a value from the prefix code is as follows:
+The pseudocode to obtain a (length or distance) value from the prefix code is
+as follows:

 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 if (prefix_code < 4) {
@ -676,26 +703,28 @@ int offset = (2 + (prefix_code & 1)) << extra_bits;
 return offset + ReadBits(extra_bits) + 1;
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

+**Distance Mapping:**
+{:#distance-mapping}

-### LZ77 Backward Reference Entropy Coding
+As noted previously, distance code is a number indicating the position of a
+previously seen pixel, from which the pixels are to be copied. This sub-section
+defines the mapping between a distance code and the position of a previous
+pixel.

-Backward references are tuples of length and distance. Length indicates
-how many pixels in scan-line order are to be copied. The length is
-codified in two steps: prefix and extra bits. Only the first 24 prefix
-codes with their respective extra bits are used for length codes,
-limiting the maximum length to 4096. For distances, all 40 prefix codes
-are used.
+The distance codes larger than 120 denote the pixel-distance in scan-line
+order, offset by 120.

+The smallest distance codes \[1..120\] are special, and are reserved for a close
+neighborhood of the current pixel. This neighborhood consists of 120 pixels:

-### LZ77 Distance Mapping
+  * Pixels that are 1 to 7 rows above the current pixel, and are up to 8 columns
+    to the left or up to 7 columns to the right of the current pixel. \[Total
+    such pixels = `7 * (8 + 1 + 7) = 112`\].
+  * Pixels that are in same row as the current pixel, and are up to 8 columns to
+    the left of the current pixel. \[`8` such pixels\].

-120 smallest distance codes [1..120] are reserved for a close
-neighborhood within the current pixel. The rest are pure distance codes
-in scan-line order, just offset by 120. The smallest codes are coded
-into x and y offsets by the following table. Each tuple shows the x and
-the y coordinates in 2D offsets -- for example the first tuple (0, 1)
-means 0 for no difference in x, and 1 pixel difference in y (indicating
-previous row).
+The mapping between distance code `i` and the neighboring pixel offset
+`(xi, yi)` is as follows:

 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 (0, 1),  (1, 0),  (1, 1),  (-1, 1), (0, 2),  (2, 0),  (1, 2),  (-1, 2),
@ -715,38 +744,51 @@ previous row).
 (-6, 7), (7, 6),  (-7, 6), (8, 5),  (7, 7),  (-7, 7), (8, 6),  (8, 7)
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

-The distances codes that map into these tuples are changes into
-scan-line order distances using the following formula:
-_dist = x + y * xsize_, where _xsize_ is the width of the image in
-pixels. If a decoder detects a computed _dist_ value smaller than 1,
-the value of 1 is used instead.
+For example, distance code `1` indicates offset of `(0, 1)` for the neighboring
+pixel, that is, the pixel above the current pixel (0-pixel difference in
+X-direction and 1 pixel difference in Y-direction). Similarly, distance code
+`3` indicates left-top pixel.
+
+The decoder can convert a distance code `i` to a scan-line order distance
+`dist` as follows:
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+(xi, yi) = distance_map[i]
+dist = xi + yi * xsize
+if (dist < 1) {
+  dist = 1
+}
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+where 'distance_map' is the mapping noted above and `xsize` is the width of the
+image in pixels.


-### Color Cache Code
+#### 4.2.3 Color Cache Coding

-Color cache stores a set of colors that have been recently used in the
-image. Using the color cache code, the color cache colors can be
-referred to more efficiently than emitting the respective ARGB values
-independently or sending them as backward references with a length of
-one pixel.
+Color cache stores a set of colors that have been recently used in the image.

-Color cache codes are coded as follows. First, there is a bit that
-indicates if the color cache is used or not. If this bit is 0, no color
-cache codes exist, and they are not transmitted in the Huffman code that
-decodes the green symbols and the length prefix codes. However, if this
-bit is 1, the color cache size is read:
+**Rationale:** This way, the recently used colors can sometimes be referred to
+more efficiently than emitting them using the other two methods (described in
+[4.2.1](#huffman-coded-literals) and [4.2.2](#lz77-backward-reference)).
+
+Color cache codes are stored as follows. First, there is a 1-bit value that
+indicates if the color cache is used. If this bit is 0, no color cache codes
+exist, and they are not transmitted in the Huffman code that decodes the green
+symbols and the length prefix codes. However, if this bit is 1, the color cache
+size is read next:

 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-int color_cache_code_bits = ReadBits(br, 4);
+int color_cache_code_bits = ReadBits(4);
 int color_cache_size = 1 << color_cache_code_bits;
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

 `color_cache_code_bits` defines the size of the color_cache by (1 <<
 `color_cache_code_bits`). The range of allowed values for
-`color_cache_code_bits` is [1..11]. Compliant decoders must indicate a
+`color_cache_code_bits` is \[1..11\]. Compliant decoders must indicate a
 corrupted bitstream for other values.

-A color cache is an array of the size `color_cache_size`. Each entry
+A color cache is an array of size `color_cache_size`. Each entry
 stores one ARGB color. Colors are looked up by indexing them by
 (0x1e35a7bd * `color`) >> (32 - `color_cache_code_bits`). Only one
 lookup is done in a color cache; there is no conflict resolution.
@ -761,91 +803,188 @@ literals, into the cache in the order they appear in the stream.
 5 Entropy Code
 --------------

-### Huffman Coding
+### 5.1 Overview

-Most of the data is coded using a canonical Huffman code. This includes
-the following:
+Most of the data is coded using [canonical Huffman code][canonical_huff]. Hence,
+the codes are transmitted by sending the _Huffman code lengths_, as opposed to
+the actual _Huffman codes_.

-  * a combined code that defines either the value of the green
-    component, a color cache code, or a prefix of the length codes;
-  * the data for alpha, red and blue components; and
-  * prefixes of the distance codes.
+In particular, the format uses **spatially-variant Huffman coding**. In other
+words, different blocks of the image can potentially use different entropy
+codes.

-The Huffman codes are transmitted by sending the code lengths; the
-actual symbols are implicit and done in order for each length. The
-Huffman code lengths are run-length-encoded using three different
-prefixes, and the result of this coding is further Huffman coded.
+**Rationale**: Different areas of the image may have different characteristics. So, allowing them to use different entropy codes provides more flexibility and
+potentially a better compression.

+### 5.2 Details

-### Spatially-variant Huffman Coding
+The encoded image data consists of two parts:

-For every pixel (x, y) in the image, there is a definition of which
-entropy code to use. First, there is an integer called 'meta Huffman
-code' that can be obtained from the entropy image. This
-meta Huffman code identifies a set of five Huffman codes, one for green
-(along with length codes and color cache codes), one for each of red,
-blue and alpha, and one for distance. The Huffman codes are identified
-by their position in a table by an integer.
+  1. Meta Huffman codes
+  1. Entropy-coded image data

+#### 5.2.1 Decoding of Meta Huffman Codes

-### Decoding Flow of Image Data
+As noted earlier, the format allows the use of different Huffman codes for
+different blocks of the image. _Meta Huffman codes_ are indexes identifying
+which Huffman codes to use in different parts of the image.

-Read next symbol S
+Meta Huffman codes may be used _only_ when the image is being used in the
+[role](#roles-of-image-data) of an _ARGB image_.

-  1. S < 256
-     1. Use S as green component
-     2. read alpha
-     3. read red
-     4. read blue
-  2. S < 256 + 24
+There are two possibilities for the meta Huffman codes, indicated by a 1-bit
+value:
+
+  * If this bit is zero, there is only one meta Huffman code used everywhere in
+    the image. No more data is stored.
+  * If this bit is one, the image uses multiple meta Huffman codes. These meta
+    Huffman codes are stored as an _entropy image_ (described below).
+
+**Entropy image:**
+
+The entropy image defines which Huffman codes are used in different parts of the
+image, as described below.
+
+The first 3 bits contain the `huffman_bits` value. The dimensions of the entropy
+image are derived from `huffman_bits`.
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+int huffman_bits = ReadBits(3) + 2;
+int huffman_xsize = DIV_ROUND_UP(xsize, 1 << huffman_bits);
+int huffman_ysize = DIV_ROUND_UP(ysize, 1 << huffman_bits);
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+where `DIV_ROUND_UP` is as defined [earlier](#predictor-transform).
+
+Next bits contain an entropy image of width `huffman_xsize` and height
+`huffman_ysize`.
+
+**Interpretation of Meta Huffman Codes:**
+
+For any given pixel (x, y), there is a set of five Huffman codes associated with
+it. These codes are (in bitstream order):
+
+  * **Huffman code #1**: used for green channel, backward-reference length and
+    color cache
+  * **Huffman code #2, #3 and #4**: used for red, blue and alpha channels
+    respectively.
+  * **Huffman code #5**: used for backward-reference distance.
+
+From here on, we refer to this set as a **Huffman code group**.
+
+The number of Huffman code groups in the ARGB image can be obtained by finding
+the _largest meta Huffman code_ from the entropy image:
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+int num_huff_groups = max(entropy image) + 1;
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+where `max(entropy image)` indicates the largest Huffman code stored in the
+entropy image.
+
+As each Huffman code group contains five Huffman codes, the total number of
+Huffman codes is:
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+int num_huff_codes = 5 * num_huff_groups;
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Given a pixel (x, y) in the ARGB image, we can obtain the corresponding Huffman
+codes to be used as follows:
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+int position = (y >> huffman_bits) * huffman_xsize + (x >> huffman_bits);
+int meta_huff_code = (entropy_image[position] >> 8) & 0xffff;
+HuffmanCodeGroup huff_group = huffman_code_groups[meta_huff_code];
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+where we have assumed the existence of `HuffmanCodeGroup` structure, which
+represents a set of five Huffman codes. Also, `huffman_code_groups` is an array
+of `HuffmanCodeGroup` (of size `num_huff_groups`).
+
+The decoder then uses Huffman code group `huff_group` to decode the pixel
+(x, y) as explained in the [next section](#decoding-entropy-coded-image-data).
+
+#### 5.2.2 Decoding Entropy-coded Image Data
+
+For the current position (x, y) in the image, the decoder first identifies the
+corresponding Huffman code group (as explained in the last section). Given the
+Huffman code group, the pixel is read and decoded as follows:
+
+Read next symbol S from the bitstream using Huffman code #1. \[See
+[next section](#decoding-the-code-lengths) for details on decoding the Huffman
+code lengths\]. Note that S is any integer in the range `0` to
+`(256 + 24 + ` [`color_cache_size`](#color-cache-coding)`- 1)`.
+
+The interpretation of S depends on its value:
+
+  1. if S < 256
+     1. Use S as the green component
+     1. Read red from the bitstream using Huffman code #2
+     1. Read blue from the bitstream using Huffman code #3
+     1. Read alpha from the bitstream using Huffman code #4
+  1. if S < 256 + 24
     1. Use S - 256 as a length prefix code
-     2. read length extra bits
-     3. read distance prefix code
-     4. read distance extra bits
-  3. S >= 256 + 24
-     1. Use ARGB color from the color cache, at index S - 256 + 24
+     1. Read extra bits for length from the bitstream
+     1. Determine backward-reference length L from length prefix code and the
+        extra bits read.
+     1. Read distance prefix code from the bitstream using Huffman code #5
+     1. Read extra bits for distance from the bitstream
+     1. Determine backward-reference distance D from distance prefix code and
+        the extra bits read.
+     1. Copy the L pixels (in scan-line order) from the sequence of pixels
+        prior to them by D pixels.
+  1. if S >= 256 + 24
+     1. Use S - (256 + 24) as the index into the color cache.
+     1. Get ARGB color from the color cache at that index.


-### Decoding the Code Lengths
+**Decoding the Code Lengths:**
+{:#decoding-the-code-lengths}

-There are two different ways to encode the code lengths of a Huffman
-code, indicated by the first bit of the code: _simple code length code_
-(1), and _normal code length code_ (0).
+This section describes the details about reading a symbol from the bitstream by
+decoding the Huffman code length.

+The Huffman code lengths can be coded in two ways. The method used is specified
+by a 1-bit value.

-#### Simple Code Length Code
+  * If this bit is 1, it is a _simple code length code_, and
+  * If this bit is 0, it is a _normal code length code_.

-This variant can codify 1 or 2 non-zero length codes in the range of [0,
-255]. All other code lengths are implicitly zeros.
+**(i) Simple Code Length Code:**

-The first bit indicates the number of codes:
+This variant is used in the special case when only 1 or 2 Huffman code lengths
+are non-zero, and are in the range of \[0, 255\]. All other Huffman code lengths
+are implicitly zeros.
+
+The first bit indicates the number of non-zero code lengths:

 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-int num_symbols = ReadBits(1) + 1;
+int num_code_lengths = ReadBits(1) + 1;
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

-The first symbol is stored either using a 1-bit code for values of 0 and
-1, or using a 8-bit code for values in range [0, 255]. The second
-symbol, when present, is coded as an 8-bit code.
+The first code length is stored either using a 1-bit code for values of 0 and 1,
+or using an 8-bit code for values in range \[0, 255\]. The second code length,
+when present, is coded as an 8-bit code.

 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-int first_symbol_len_code = VP8LReadBits(br, 1);
-symbols[0] = ReadBits(1 + 7 * first_symbol_len_code);
-if (num_symbols == 2) {
-  symbols[1] = ReadBits(8);
+int is_first_8bits = ReadBits(1);
+code_lengths[0] = ReadBits(1 + 7 * is_first_8bits);
+if (num_code_lengths == 2) {
+  code_lengths[1] = ReadBits(8);
 }
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

-Empty trees can be coded as trees that contain one 0 symbol, and can be
-codified using four bits. For example, a distance tree can be empty if
-there are no backward references. Similarly, alpha, red, and blue trees
-can be empty if all pixels within the same meta Huffman code are
-produced using the color cache.
+**Note:** Another special case is when _all_ Huffman code lengths are _zeros_
+(an empty Huffman code). For example, a Huffman code for distance can be empty
+if there are no backward references. Similarly, Huffman codes for alpha, red,
+and blue can be empty if all pixels within the same meta Huffman code are
+produced using the color cache. However, this case doesn't need special
+handling, as empty Huffman codes can be coded as those containing a single
+symbol `0`.

+**(ii) Normal Code Length Code:**

-#### Normal Code Length Code
-
-The code lengths of a Huffman code are read as follows: `num_codes`
+The code lengths of a Huffman code are read as follows: `num_code_lengths`
 specifies the number of code lengths; the rest of the code lengths
 (according to the order in `kCodeLengthCodeOrder`) are zeros.

@ -854,91 +993,23 @@ int kCodeLengthCodes = 19;
 int kCodeLengthCodeOrder[kCodeLengthCodes] = {
  17, 18, 0, 1, 2, 3, 4, 5, 16, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
 };
-int num_codes = 4 + ReadStream(4);
-for (i = 0; i < num_codes; ++i) {
+int code_lengths[kCodeLengthCodes] = { 0 };  // All zeros.
+int num_code_lengths = 4 + ReadBits(4);
+for (i = 0; i < num_code_lengths; ++i) {
  code_lengths[kCodeLengthCodeOrder[i]] = ReadBits(3);
 }
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

-  * Code length code [0..15] indicates literal code lengths.
+  * Code length code \[0..15\] indicates literal code lengths.
    * Value 0 means no symbols have been coded.
-    * Values [1..15] indicate the bit length of the respective code.
-  * Code 16 repeats the previous non-zero value [3..6] times, i.e.,
-    3 + `ReadStream(2)` times.  If code 16 is used before a non-zero
+    * Values \[1..15\] indicate the bit length of the respective code.
+  * Code 16 repeats the previous non-zero value \[3..6\] times, i.e.,
+    3 + `ReadBits(2)` times.  If code 16 is used before a non-zero
    value has been emitted, a value of 8 is repeated.
-  * Code 17 emits a streak of zeros [3..10], i.e., 3 + `ReadStream(3)`
+  * Code 17 emits a streak of zeros \[3..10\], i.e., 3 + `ReadBits(3)`
    times.
-  * Code 18 emits a streak of zeros of length [11..138], i.e.,
-    11 + `ReadStream(7)` times.
-
-The entropy codes for alpha, red and blue have a total of 256 symbols.
-The entropy code for distance prefix codes has 40 symbols. The entropy
-code for green has 256 + 24 + `color_cache_size`, 256 symbols for
-different green symbols, 24 length code prefix symbols, and symbols for
-the color cache.
-
-The meta Huffman code, specified in the next section, defines how many
-Huffman codes there are. There are always 5 times the number of Huffman
-codes to the number of meta Huffman codes.
-
-
-### Decoding of Meta Huffman Codes
-
-There are two ways to code the meta Huffman codes, indicated by one bit
-for the ARGB image and is an implicit zero, i.e., not present in the
-stream for all transform images and the entropy image itself.
-
-If this bit is zero, there is only one meta Huffman code, using Huffman
-codes 0, 1, 2, 3 and 4 for green, alpha, red, blue and distance,
-respectively. This meta Huffman code is used everywhere in the image.
-
-If this bit is one, the meta Huffman codes are controlled by the entropy
-image, where the index of the meta Huffman code is codified in the red
-and green components. The index can be obtained from the uint32 value by
-_((pixel >> 8) & 0xffff)_, thus there can be up to 65536 unique meta
-Huffman codes. When decoding a Huffman encoded symbol at a pixel x, y,
-one chooses the meta Huffman code respective to these coordinates.
-However, not all bits of the coordinates are used for choosing the meta
-Huffman code, i.e., the entropy image is of subresolution to the real
-image.
-
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-int huffman_bits = ReadBits(3) + 2;
-int huffman_xsize = DIV_ROUND_UP(xsize, 1 << huffman_bits);
-int huffman_ysize = DIV_ROUND_UP(ysize, 1 << huffman_bits);
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-`huffman_bits` gives the amount of subsampling in the entropy image.
-
-After reading the `huffman_bits`, an entropy image stream of size
-`huffman_xsize`, `huffman_ysize` is read.
-
-The meta Huffman code, identifying the five Huffman codes per meta
-Huffman code, is coded only by the number of codes:
-
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-int num_meta_codes = max(entropy_image) + 1;
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-Now, we can obtain the five Huffman codes for green, alpha, red, blue
-and distance for a given (x, y) by the following expression:
-
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-meta_codes[(entropy_image[(y >> huffman_bits) * huffman_xsize +
-                          (x >> huffman_bits)] >> 8) & 0xffff]
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-The `huffman_code[5 * meta_code + k]`, codes with _k_ == 0 are for the
-green & length code, _k_ == 4 for the distance code, and the codes at
-_k_ == 1, 2, and 3, are for codes of length 256 for red, blue and alpha,
-respectively.
-
-The value of _k_ for the reference position in `meta_code` determines the
-length of the Huffman code:
-
-  * k = 0; length = 256 + 24 + cache_size
-  * k = 1, 2, or 3;  length = 256
-  * k = 4, length = 40.
+  * Code 18 emits a streak of zeros of length \[11..138\], i.e.,
+    11 + `ReadBits(7)` times.


 6 Overall Structure of the Format
@ -953,21 +1024,21 @@ of pixels (xsize * ysize).

 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 <format> ::= <RIFF header><image size><image stream>
-<image stream> ::= (<optional-transform><image stream>);
-                    <spatially-coded image>
+<image stream> ::= <optional-transform><spatially-coded image>
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~


 #### Structure of Transforms

 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-<optional-transform> ::= 1-bit <transform> <optional-transform> | 0-bit
+<optional-transform> ::= (1-bit value 1; <transform> <optional-transform>) |
+                         1-bit value 0
 <transform> ::= <predictor-tx> | <color-tx> | <subtract-green-tx> |
                <color-indexing-tx>
 <predictor-tx> ::= 2-bit value 0; <predictor image>
-<predictor image> ::= 3-bit sub-pixel code | <entropy-coded image>
+<predictor image> ::= 3-bit sub-pixel code ; <entropy-coded image>
 <color-tx> ::= 2-bit value 1; <color image>
-<color image> ::= 3-bit sub-pixel code | <entropy-coded image>
+<color image> ::= 3-bit sub-pixel code ; <entropy-coded image>
 <subtract-green-tx> ::= 2-bit value 2
 <color-indexing-tx> ::= 2-bit value 3; <color-indexing image>
 <color-indexing image> ::= 8-bit color count; <entropy-coded image>
@ -984,13 +1055,18 @@ of pixels (xsize * ysize).
 <entropy image> ::= 3-bit subsample value; <entropy-coded image>
 <color cache info> ::= 1 bit value 0 |
                       (1-bit value 1; 4-bit value for color cache size)
-<huffman codes> ::= <huffman code> | <huffman code><huffman codes>
+<huffman codes> ::= <huffman code group> | <huffman code group><huffman codes>
+<huffman code group> ::= <huffman code><huffman code><huffman code>
+                         <huffman code><huffman code>
+                         See "Interpretation of Meta Huffman codes" to
+                         understand what each of these five Huffman codes is
+                         for.
 <huffman code> ::= <simple huffman code> | <normal huffman code>
 <simple huffman code> ::= see "Simple code length code" for details
 <normal huffman code> ::= <code length code>; encoded code lengths
 <code length code> ::= see section "Normal code length code"
-<lz77-coded image> ::= (<argb-pixel> | <color-cache-code> | <lz77-copy>) |
-                       (<lz77-coded image> | "")
+<lz77-coded image> ::= ((<argb-pixel> | <lz77-copy> | <color-cache-code>)
+                       <lz77-coded image>) | ""
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

 A possible example sequence:
@ -1001,3 +1077,5 @@ A possible example sequence:
 <color cache info><huffman codes>
 <lz77-coded image>
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+[canonical_huff]: http://en.wikipedia.org/wiki/Canonical_Huffman_code
--- a/examples/Makefile.am
+++ b/examples/Makefile.am
@ -1,25 +1,52 @@
 AM_CPPFLAGS = -I$(top_srcdir)/src

 bin_PROGRAMS = dwebp cwebp
+if BUILD_VWEBP
+  bin_PROGRAMS += vwebp
+endif
 if WANT_MUX
  bin_PROGRAMS += webpmux
 endif
+
+if BUILD_GIF2WEBP
+  bin_PROGRAMS += gif2webp
+endif
+
 noinst_LTLIBRARIES = libexampleutil.la

-libexampleutil_la_SOURCES = example_util.c
-libexampleutilinclude_HEADERS = example_util.h
-libexampleutilincludedir =
+libexampleutil_la_SOURCES = example_util.c example_util.h

 dwebp_SOURCES = dwebp.c stopwatch.h
 dwebp_CPPFLAGS  = $(AM_CPPFLAGS) $(USE_EXPERIMENTAL_CODE)
 dwebp_CPPFLAGS += $(JPEG_INCLUDES) $(PNG_INCLUDES)
-dwebp_LDADD = libexampleutil.la ../src/libwebp.la $(PNG_LIBS) $(JPEG_LIBS)
+dwebp_LDADD = libexampleutil.la $(PNG_LIBS) $(JPEG_LIBS)

-cwebp_SOURCES = cwebp.c stopwatch.h
+cwebp_SOURCES  = cwebp.c metadata.c metadata.h stopwatch.h
+cwebp_SOURCES += jpegdec.c jpegdec.h
+cwebp_SOURCES += pngdec.c pngdec.h
+cwebp_SOURCES += tiffdec.c tiffdec.h
+cwebp_SOURCES += wicdec.c wicdec.h
 cwebp_CPPFLAGS  = $(AM_CPPFLAGS) $(USE_EXPERIMENTAL_CODE)
 cwebp_CPPFLAGS += $(JPEG_INCLUDES) $(PNG_INCLUDES) $(TIFF_INCLUDES)
 cwebp_LDADD = ../src/libwebp.la $(JPEG_LIBS) $(PNG_LIBS) $(TIFF_LIBS)

+gif2webp_SOURCES = gif2webp.c gif2webp_util.c
+gif2webp_CPPFLAGS = $(AM_CPPFLAGS) $(USE_EXPERIMENTAL_CODE) $(GIF_INCLUDES)
+gif2webp_LDADD  = libexampleutil.la ../src/mux/libwebpmux.la ../src/libwebp.la
+gif2webp_LDADD += $(GIF_LIBS)
+
 webpmux_SOURCES = webpmux.c
 webpmux_CPPFLAGS = $(AM_CPPFLAGS) $(USE_EXPERIMENTAL_CODE)
 webpmux_LDADD = libexampleutil.la ../src/mux/libwebpmux.la ../src/libwebp.la
+
+vwebp_SOURCES = vwebp.c
+vwebp_CPPFLAGS = $(AM_CPPFLAGS) $(USE_EXPERIMENTAL_CODE) $(GL_INCLUDES)
+vwebp_LDADD = libexampleutil.la ../src/demux/libwebpdemux.la $(GL_LIBS)
+
+if BUILD_LIBWEBPDECODER
+  dwebp_LDADD += ../src/libwebpdecoder.la
+  vwebp_LDADD += ../src/libwebpdecoder.la
+else
+  dwebp_LDADD += ../src/libwebp.la
+  vwebp_LDADD += ../src/libwebp.la
+endif
--- a/examples/cwebp.c
+++ b/examples/cwebp.c
--- a/examples/dwebp.c
+++ b/examples/dwebp.c
@ -1,13 +1,13 @@
 // Copyright 2010 Google Inc. All Rights Reserved.
 //
-// This code is licensed under the same terms as WebM:
-//  Software License Agreement:  http://www.webmproject.org/license/software/
-//  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
 // -----------------------------------------------------------------------------
 //
-//  Command-line tool for decoding a WebP image
-//
-//  Compile with:     gcc -o dwebp dwebp.c -lwebpdecode
+//  Command-line tool for decoding a WebP image.
 //
 // Author: Skal (pascal.massimino@gmail.com)

@ -32,24 +32,30 @@
 #define COBJMACROS
 #define _WIN32_IE 0x500  // Workaround bug in shlwapi.h when compiling C++
                         // code with COBJMACROS.
+#include <ole2.h>  // CreateStreamOnHGlobal()
 #include <shlwapi.h>
 #include <windows.h>
 #include <wincodec.h>
 #endif

+#if defined(_WIN32)
+#include <fcntl.h>   // for _O_BINARY
+#include <io.h>      // for _setmode()
+#endif
+
 #include "webp/decode.h"
 #include "./example_util.h"
 #include "./stopwatch.h"

 static int verbose = 0;
 #ifndef WEBP_DLL
-#if defined(__cplusplus) || defined(c_plusplus)
+#ifdef __cplusplus
 extern "C" {
 #endif

 extern void* VP8GetCPUInfo;   // opaque forward declaration.

-#if defined(__cplusplus) || defined(c_plusplus)
+#ifdef __cplusplus
 }    // extern "C"
 #endif
 #endif  // WEBP_DLL
@ -59,8 +65,12 @@ extern void* VP8GetCPUInfo;   // opaque forward declaration.
 // Output types
 typedef enum {
  PNG = 0,
+  PAM,
  PPM,
  PGM,
+  BMP,
+  TIFF,
+  YUV,
  ALPHA_PLANE_ONLY  // this is for experimenting only
 } OutputFileFormat;

@ -68,11 +78,9 @@ typedef enum {

 #define IFS(fn)                                                     \
  do {                                                              \
-     if (SUCCEEDED(hr))        \
-     {                         \
+    if (SUCCEEDED(hr)) {                                            \
      hr = (fn);                                                    \
-        if (FAILED(hr) && verbose)           \
-          fprintf(stderr, #fn " failed %08x\n", hr);  \
+      if (FAILED(hr)) fprintf(stderr, #fn " failed %08lx\n", hr);   \
    }                                                               \
  } while (0)

@ -83,71 +91,102 @@ typedef enum {
 #endif

 static HRESULT CreateOutputStream(const char* out_file_name,
-                                  IStream** ppStream) {
+                                  int write_to_mem, IStream** stream) {
  HRESULT hr = S_OK;
-  IFS(SHCreateStreamOnFileA(out_file_name, STGM_WRITE | STGM_CREATE, ppStream));
-  if (FAILED(hr))
-    fprintf(stderr, "Error opening output file %s (%08x)\n", out_file_name, hr);
+  if (write_to_mem) {
+    // Output to a memory buffer. This is freed when 'stream' is released.
+    IFS(CreateStreamOnHGlobal(NULL, TRUE, stream));
+  } else {
+    IFS(SHCreateStreamOnFileA(out_file_name, STGM_WRITE | STGM_CREATE, stream));
+  }
+  if (FAILED(hr)) {
+    fprintf(stderr, "Error opening output file %s (%08lx)\n",
+            out_file_name, hr);
+  }
  return hr;
 }

-static HRESULT WriteUsingWIC(const char* out_file_name, REFGUID container_guid,
-                             unsigned char* rgb, int stride,
+static HRESULT WriteUsingWIC(const char* out_file_name, int use_stdout,
+                             REFGUID container_guid,
+                             uint8_t* rgb, int stride,
                             uint32_t width, uint32_t height, int has_alpha) {
  HRESULT hr = S_OK;
-  IWICImagingFactory* pFactory = NULL;
-  IWICBitmapFrameEncode* pFrame = NULL;
-  IWICBitmapEncoder* pEncoder = NULL;
-  IStream* pStream = NULL;
+  IWICImagingFactory* factory = NULL;
+  IWICBitmapFrameEncode* frame = NULL;
+  IWICBitmapEncoder* encoder = NULL;
+  IStream* stream = NULL;
  WICPixelFormatGUID pixel_format = has_alpha ? GUID_WICPixelFormat32bppBGRA
                                              : GUID_WICPixelFormat24bppBGR;

  IFS(CoInitialize(NULL));
  IFS(CoCreateInstance(MAKE_REFGUID(CLSID_WICImagingFactory), NULL,
-          CLSCTX_INPROC_SERVER, MAKE_REFGUID(IID_IWICImagingFactory),
-          (LPVOID*)&pFactory));
+                       CLSCTX_INPROC_SERVER,
+                       MAKE_REFGUID(IID_IWICImagingFactory),
+                       (LPVOID*)&factory));
  if (hr == REGDB_E_CLASSNOTREG) {
    fprintf(stderr,
            "Couldn't access Windows Imaging Component (are you running "
            "Windows XP SP3 or newer?). PNG support not available. "
            "Use -ppm or -pgm for available PPM and PGM formats.\n");
  }
-  IFS(CreateOutputStream(out_file_name, &pStream));
-  IFS(IWICImagingFactory_CreateEncoder(pFactory, container_guid, NULL,
-          &pEncoder));
-  IFS(IWICBitmapEncoder_Initialize(pEncoder, pStream,
+  IFS(CreateOutputStream(out_file_name, use_stdout, &stream));
+  IFS(IWICImagingFactory_CreateEncoder(factory, container_guid, NULL,
+                                       &encoder));
+  IFS(IWICBitmapEncoder_Initialize(encoder, stream,
                                   WICBitmapEncoderNoCache));
-  IFS(IWICBitmapEncoder_CreateNewFrame(pEncoder, &pFrame, NULL));
-  IFS(IWICBitmapFrameEncode_Initialize(pFrame, NULL));
-  IFS(IWICBitmapFrameEncode_SetSize(pFrame, width, height));
-  IFS(IWICBitmapFrameEncode_SetPixelFormat(pFrame, &pixel_format));
-  IFS(IWICBitmapFrameEncode_WritePixels(pFrame, height, stride,
+  IFS(IWICBitmapEncoder_CreateNewFrame(encoder, &frame, NULL));
+  IFS(IWICBitmapFrameEncode_Initialize(frame, NULL));
+  IFS(IWICBitmapFrameEncode_SetSize(frame, width, height));
+  IFS(IWICBitmapFrameEncode_SetPixelFormat(frame, &pixel_format));
+  IFS(IWICBitmapFrameEncode_WritePixels(frame, height, stride,
                                        height * stride, rgb));
-  IFS(IWICBitmapFrameEncode_Commit(pFrame));
-  IFS(IWICBitmapEncoder_Commit(pEncoder));
+  IFS(IWICBitmapFrameEncode_Commit(frame));
+  IFS(IWICBitmapEncoder_Commit(encoder));

-  if (pFrame != NULL) IUnknown_Release(pFrame);
-  if (pEncoder != NULL) IUnknown_Release(pEncoder);
-  if (pFactory != NULL) IUnknown_Release(pFactory);
-  if (pStream != NULL) IUnknown_Release(pStream);
+  if (SUCCEEDED(hr) && use_stdout) {
+    HGLOBAL image;
+    IFS(GetHGlobalFromStream(stream, &image));
+    if (SUCCEEDED(hr)) {
+      HANDLE std_output = GetStdHandle(STD_OUTPUT_HANDLE);
+      DWORD mode;
+      const BOOL update_mode = GetConsoleMode(std_output, &mode);
+      const void* const image_mem = GlobalLock(image);
+      DWORD bytes_written = 0;
+
+      // Clear output processing if necessary, then output the image.
+      if (update_mode) SetConsoleMode(std_output, 0);
+      if (!WriteFile(std_output, image_mem, (DWORD)GlobalSize(image),
+                     &bytes_written, NULL) ||
+          bytes_written != GlobalSize(image)) {
+        hr = E_FAIL;
+      }
+      if (update_mode) SetConsoleMode(std_output, mode);
+      GlobalUnlock(image);
+    }
+  }
+
+  if (frame != NULL) IUnknown_Release(frame);
+  if (encoder != NULL) IUnknown_Release(encoder);
+  if (factory != NULL) IUnknown_Release(factory);
+  if (stream != NULL) IUnknown_Release(stream);
  return hr;
 }

-static int WritePNG(const char* out_file_name,
+static int WritePNG(const char* out_file_name, int use_stdout,
                    const WebPDecBuffer* const buffer) {
  const uint32_t width = buffer->width;
  const uint32_t height = buffer->height;
-  unsigned char* const rgb = buffer->u.RGBA.rgba;
+  uint8_t* const rgb = buffer->u.RGBA.rgba;
  const int stride = buffer->u.RGBA.stride;
  const int has_alpha = (buffer->colorspace == MODE_BGRA);

-  return SUCCEEDED(WriteUsingWIC(out_file_name,
-             MAKE_REFGUID(GUID_ContainerFormatPng), rgb, stride, width,
-             height, has_alpha));
+  return SUCCEEDED(WriteUsingWIC(out_file_name, use_stdout,
+                                 MAKE_REFGUID(GUID_ContainerFormatPng),
+                                 rgb, stride, width, height, has_alpha));
 }

 #elif defined(WEBP_HAVE_PNG)    // !HAVE_WINCODEC_H
-static void PNGAPI error_function(png_structp png, png_const_charp dummy) {
+static void PNGAPI PNGErrorFunction(png_structp png, png_const_charp dummy) {
  (void)dummy;  // remove variable-unused warning
  longjmp(png_jmpbuf(png), 1);
 }
@ -155,7 +194,7 @@ static void PNGAPI error_function(png_structp png, png_const_charp dummy) {
 static int WritePNG(FILE* out_file, const WebPDecBuffer* const buffer) {
  const uint32_t width = buffer->width;
  const uint32_t height = buffer->height;
-  unsigned char* const rgb = buffer->u.RGBA.rgba;
+  uint8_t* const rgb = buffer->u.RGBA.rgba;
  const int stride = buffer->u.RGBA.stride;
  const int has_alpha = (buffer->colorspace == MODE_RGBA);
  png_structp png;
@ -163,7 +202,7 @@ static int WritePNG(FILE* out_file, const WebPDecBuffer* const buffer) {
  png_uint_32 y;

  png = png_create_write_struct(PNG_LIBPNG_VER_STRING,
-                                NULL, error_function, NULL);
+                                NULL, PNGErrorFunction, NULL);
  if (png == NULL) {
    return 0;
  }
@ -201,25 +240,172 @@ static int WritePNG(FILE* out_file, const WebPDecBuffer* const buffer) {
 }
 #endif

-static int WritePPM(FILE* fout, const WebPDecBuffer* const buffer) {
+static int WritePPM(FILE* fout, const WebPDecBuffer* const buffer, int alpha) {
  const uint32_t width = buffer->width;
  const uint32_t height = buffer->height;
-  const unsigned char* const rgb = buffer->u.RGBA.rgba;
+  const uint8_t* const rgb = buffer->u.RGBA.rgba;
  const int stride = buffer->u.RGBA.stride;
+  const size_t bytes_per_px = alpha ? 4 : 3;
  uint32_t y;
+
+  if (alpha) {
+    fprintf(fout, "P7\nWIDTH %d\nHEIGHT %d\nDEPTH 4\nMAXVAL 255\n"
+                  "TUPLTYPE RGB_ALPHA\nENDHDR\n", width, height);
+  } else {
    fprintf(fout, "P6\n%d %d\n255\n", width, height);
+  }
  for (y = 0; y < height; ++y) {
-    if (fwrite(rgb + y * stride, width, 3, fout) != 3) {
+    if (fwrite(rgb + y * stride, width, bytes_per_px, fout) != bytes_per_px) {
      return 0;
    }
  }
  return 1;
 }

+static void PutLE16(uint8_t* const dst, uint32_t value) {
+  dst[0] = (value >> 0) & 0xff;
+  dst[1] = (value >> 8) & 0xff;
+}
+
+static void PutLE32(uint8_t* const dst, uint32_t value) {
+  PutLE16(dst + 0, (value >>  0) & 0xffff);
+  PutLE16(dst + 2, (value >> 16) & 0xffff);
+}
+
+#define BMP_HEADER_SIZE 54
+static int WriteBMP(FILE* fout, const WebPDecBuffer* const buffer) {
+  const int has_alpha = (buffer->colorspace != MODE_BGR);
+  const uint32_t width = buffer->width;
+  const uint32_t height = buffer->height;
+  const uint8_t* const rgba = buffer->u.RGBA.rgba;
+  const int stride = buffer->u.RGBA.stride;
+  const uint32_t bytes_per_px = has_alpha ? 4 : 3;
+  uint32_t y;
+  const uint32_t line_size = bytes_per_px * width;
+  const uint32_t bmp_stride = (line_size + 3) & ~3;   // pad to 4
+  const uint32_t total_size = bmp_stride * height + BMP_HEADER_SIZE;
+  uint8_t bmp_header[BMP_HEADER_SIZE] = { 0 };
+
+  // bitmap file header
+  PutLE16(bmp_header + 0, 0x4d42);                // signature 'BM'
+  PutLE32(bmp_header + 2, total_size);            // size including header
+  PutLE32(bmp_header + 6, 0);                     // reserved
+  PutLE32(bmp_header + 10, BMP_HEADER_SIZE);      // offset to pixel array
+  // bitmap info header
+  PutLE32(bmp_header + 14, 40);                   // DIB header size
+  PutLE32(bmp_header + 18, width);                // dimensions
+  PutLE32(bmp_header + 22, -(int)height);         // vertical flip!
+  PutLE16(bmp_header + 26, 1);                    // number of planes
+  PutLE16(bmp_header + 28, bytes_per_px * 8);     // bits per pixel
+  PutLE32(bmp_header + 30, 0);                    // no compression (BI_RGB)
+  PutLE32(bmp_header + 34, 0);                    // image size (dummy)
+  PutLE32(bmp_header + 38, 2400);                 // x pixels/meter
+  PutLE32(bmp_header + 42, 2400);                 // y pixels/meter
+  PutLE32(bmp_header + 46, 0);                    // number of palette colors
+  PutLE32(bmp_header + 50, 0);                    // important color count
+
+  // TODO(skal): color profile
+
+  // write header
+  if (fwrite(bmp_header, sizeof(bmp_header), 1, fout) != 1) {
+    return 0;
+  }
+
+  // write pixel array
+  for (y = 0; y < height; ++y) {
+    if (fwrite(rgba + y * stride, line_size, 1, fout) != 1) {
+      return 0;
+    }
+    // write padding zeroes
+    if (bmp_stride != line_size) {
+      const uint8_t zeroes[3] = { 0 };
+      if (fwrite(zeroes, bmp_stride - line_size, 1, fout) != 1) {
+        return 0;
+      }
+    }
+  }
+  return 1;
+}
+#undef BMP_HEADER_SIZE
+
+#define NUM_IFD_ENTRIES 15
+#define EXTRA_DATA_SIZE 16
+// 10b for signature/header + n * 12b entries + 4b for IFD terminator:
+#define EXTRA_DATA_OFFSET (10 + 12 * NUM_IFD_ENTRIES + 4)
+#define TIFF_HEADER_SIZE (EXTRA_DATA_OFFSET + EXTRA_DATA_SIZE)
+
+static int WriteTIFF(FILE* fout, const WebPDecBuffer* const buffer) {
+  const int has_alpha = (buffer->colorspace != MODE_RGB);
+  const uint32_t width = buffer->width;
+  const uint32_t height = buffer->height;
+  const uint8_t* const rgba = buffer->u.RGBA.rgba;
+  const int stride = buffer->u.RGBA.stride;
+  const uint8_t bytes_per_px = has_alpha ? 4 : 3;
+  // For non-alpha case, we omit tag 0x152 (ExtraSamples).
+  const uint8_t num_ifd_entries = has_alpha ? NUM_IFD_ENTRIES
+                                            : NUM_IFD_ENTRIES - 1;
+  uint8_t tiff_header[TIFF_HEADER_SIZE] = {
+    0x49, 0x49, 0x2a, 0x00,   // little endian signature
+    8, 0, 0, 0,               // offset to the unique IFD that follows
+    // IFD (offset = 8). Entries must be written in increasing tag order.
+    num_ifd_entries, 0,       // Number of entries in the IFD (12 bytes each).
+    0x00, 0x01, 3, 0, 1, 0, 0, 0, 0, 0, 0, 0,    //  10: Width  (TBD)
+    0x01, 0x01, 3, 0, 1, 0, 0, 0, 0, 0, 0, 0,    //  22: Height (TBD)
+    0x02, 0x01, 3, 0, bytes_per_px, 0, 0, 0,     //  34: BitsPerSample: 8888
+        EXTRA_DATA_OFFSET + 0, 0, 0, 0,
+    0x03, 0x01, 3, 0, 1, 0, 0, 0, 1, 0, 0, 0,    //  46: Compression: none
+    0x06, 0x01, 3, 0, 1, 0, 0, 0, 2, 0, 0, 0,    //  58: Photometric: RGB
+    0x11, 0x01, 4, 0, 1, 0, 0, 0,                //  70: Strips offset:
+        TIFF_HEADER_SIZE, 0, 0, 0,               //      data follows header
+    0x12, 0x01, 3, 0, 1, 0, 0, 0, 1, 0, 0, 0,    //  82: Orientation: topleft
+    0x15, 0x01, 3, 0, 1, 0, 0, 0,                //  94: SamplesPerPixels
+        bytes_per_px, 0, 0, 0,
+    0x16, 0x01, 3, 0, 1, 0, 0, 0, 0, 0, 0, 0,    // 106: Rows per strip (TBD)
+    0x17, 0x01, 4, 0, 1, 0, 0, 0, 0, 0, 0, 0,    // 118: StripByteCount (TBD)
+    0x1a, 0x01, 5, 0, 1, 0, 0, 0,                // 130: X-resolution
+        EXTRA_DATA_OFFSET + 8, 0, 0, 0,
+    0x1b, 0x01, 5, 0, 1, 0, 0, 0,                // 142: Y-resolution
+        EXTRA_DATA_OFFSET + 8, 0, 0, 0,
+    0x1c, 0x01, 3, 0, 1, 0, 0, 0, 1, 0, 0, 0,    // 154: PlanarConfiguration
+    0x28, 0x01, 3, 0, 1, 0, 0, 0, 2, 0, 0, 0,    // 166: ResolutionUnit (inch)
+    0x52, 0x01, 3, 0, 1, 0, 0, 0, 1, 0, 0, 0,    // 178: ExtraSamples: rgbA
+    0, 0, 0, 0,                                  // 190: IFD terminator
+    // EXTRA_DATA_OFFSET:
+    8, 0, 8, 0, 8, 0, 8, 0,      // BitsPerSample
+    72, 0, 0, 0, 1, 0, 0, 0      // 72 pixels/inch, for X/Y-resolution
+  };
+  uint32_t y;
+
+  // Fill placeholders in IFD:
+  PutLE32(tiff_header + 10 + 8, width);
+  PutLE32(tiff_header + 22 + 8, height);
+  PutLE32(tiff_header + 106 + 8, height);
+  PutLE32(tiff_header + 118 + 8, width * bytes_per_px * height);
+  if (!has_alpha) PutLE32(tiff_header + 178, 0);  // IFD terminator
+
+  // write header
+  if (fwrite(tiff_header, sizeof(tiff_header), 1, fout) != 1) {
+    return 0;
+  }
+  // write pixel values
+  for (y = 0; y < height; ++y) {
+    if (fwrite(rgba + y * stride, bytes_per_px, width, fout) != width) {
+      return 0;
+    }
+  }
+
+  return 1;
+}
+
+#undef TIFF_HEADER_SIZE
+#undef EXTRA_DATA_OFFSET
+#undef EXTRA_DATA_SIZE
+#undef NUM_IFD_ENTRIES
+
 static int WriteAlphaPlane(FILE* fout, const WebPDecBuffer* const buffer) {
  const uint32_t width = buffer->width;
  const uint32_t height = buffer->height;
-  const unsigned char* const a = buffer->u.YUVA.a;
+  const uint8_t* const a = buffer->u.YUVA.a;
  const int a_stride = buffer->u.YUVA.a_stride;
  uint32_t y;
  assert(a != NULL);
@ -232,101 +418,150 @@ static int WriteAlphaPlane(FILE* fout, const WebPDecBuffer* const buffer) {
  return 1;
 }

-static int WritePGM(FILE* fout, const WebPDecBuffer* const buffer) {
+// format=PGM: save a grayscale PGM file using the IMC4 layout
+// (http://www.fourcc.org/yuv.php#IMC4). This is a very convenient format for
+// viewing the samples, esp. for odd dimensions.
+// format=YUV: just save the Y/U/V/A planes sequentially without header.
+static int WritePGMOrYUV(FILE* fout, const WebPDecBuffer* const buffer,
+                         OutputFileFormat format) {
  const int width = buffer->width;
  const int height = buffer->height;
  const WebPYUVABuffer* const yuv = &buffer->u.YUVA;
-  // Save a grayscale PGM file using the IMC4 layout
-  // (http://www.fourcc.org/yuv.php#IMC4). This is a very
-  // convenient format for viewing the samples, esp. for
-  // odd dimensions.
  int ok = 1;
  int y;
+  const int pad = (format == YUV) ? 0 : 1;
  const int uv_width = (width + 1) / 2;
  const int uv_height = (height + 1) / 2;
-  const int out_stride = (width + 1) & ~1;
+  const int out_stride = (width + pad) & ~pad;
  const int a_height = yuv->a ? height : 0;
-  fprintf(fout, "P5\n%d %d\n255\n", out_stride, height + uv_height + a_height);
+  if (format == PGM) {
+    fprintf(fout, "P5\n%d %d\n255\n",
+            out_stride, height + uv_height + a_height);
+  }
  for (y = 0; ok && y < height; ++y) {
    ok &= (fwrite(yuv->y + y * yuv->y_stride, width, 1, fout) == 1);
+    if (format == PGM) {
      if (width & 1) fputc(0, fout);    // padding byte
    }
+  }
+  if (format == PGM) {   // IMC4 layout
    for (y = 0; ok && y < uv_height; ++y) {
      ok &= (fwrite(yuv->u + y * yuv->u_stride, uv_width, 1, fout) == 1);
      ok &= (fwrite(yuv->v + y * yuv->v_stride, uv_width, 1, fout) == 1);
    }
+  } else {
+    for (y = 0; ok && y < uv_height; ++y) {
+      ok &= (fwrite(yuv->u + y * yuv->u_stride, uv_width, 1, fout) == 1);
+    }
+    for (y = 0; ok && y < uv_height; ++y) {
+      ok &= (fwrite(yuv->v + y * yuv->v_stride, uv_width, 1, fout) == 1);
+    }
+  }
  for (y = 0; ok && y < a_height; ++y) {
    ok &= (fwrite(yuv->a + y * yuv->a_stride, width, 1, fout) == 1);
+    if (format == PGM) {
      if (width & 1) fputc(0, fout);    // padding byte
    }
+  }
  return ok;
 }

-static void SaveOutput(const WebPDecBuffer* const buffer,
+static int SaveOutput(const WebPDecBuffer* const buffer,
                      OutputFileFormat format, const char* const out_file) {
  FILE* fout = NULL;
  int needs_open_file = 1;
+  const int use_stdout = !strcmp(out_file, "-");
  int ok = 1;
  Stopwatch stop_watch;

-  if (verbose)
-    StopwatchReadAndReset(&stop_watch);
+  if (verbose) {
+    StopwatchReset(&stop_watch);
+  }

 #ifdef HAVE_WINCODEC_H
  needs_open_file = (format != PNG);
 #endif
+
+#if defined(_WIN32)
+  if (use_stdout && _setmode(_fileno(stdout), _O_BINARY) == -1) {
+    fprintf(stderr, "Failed to reopen stdout in O_BINARY mode.\n");
+    return 0;  // 0 == failure, same as the fopen() error path below
+  }
+#endif
+
  if (needs_open_file) {
-    fout = fopen(out_file, "wb");
-    if (!fout) {
+    fout = use_stdout ? stdout : fopen(out_file, "wb");
+    if (fout == NULL) {
      fprintf(stderr, "Error opening output file %s\n", out_file);
-      return;
+      return 0;
    }
  }

  if (format == PNG) {
 #ifdef HAVE_WINCODEC_H
-    ok &= WritePNG(out_file, buffer);
+    ok &= WritePNG(out_file, use_stdout, buffer);
 #else
    ok &= WritePNG(fout, buffer);
 #endif
+  } else if (format == PAM) {
+    ok &= WritePPM(fout, buffer, 1);
  } else if (format == PPM) {
-    ok &= WritePPM(fout, buffer);
-  } else if (format == PGM) {
-    ok &= WritePGM(fout, buffer);
+    ok &= WritePPM(fout, buffer, 0);
+  } else if (format == BMP) {
+    ok &= WriteBMP(fout, buffer);
+  } else if (format == TIFF) {
+    ok &= WriteTIFF(fout, buffer);
+  } else if (format == PGM || format == YUV) {
+    ok &= WritePGMOrYUV(fout, buffer, format);
  } else if (format == ALPHA_PLANE_ONLY) {
    ok &= WriteAlphaPlane(fout, buffer);
  }
-  if (fout) {
+  if (fout != NULL && fout != stdout) {
    fclose(fout);
  }
  if (ok) {
-    printf("Saved file %s\n", out_file);
-    if (verbose) {
-      const double time = StopwatchReadAndReset(&stop_watch);
-      printf("Time to write output: %.3fs\n", time);
+    if (use_stdout) {
+      fprintf(stderr, "Saved to stdout\n");
+    } else {
+      fprintf(stderr, "Saved file %s\n", out_file);
    }
+    if (verbose) {
+      const double write_time = StopwatchReadAndReset(&stop_watch);
+      fprintf(stderr, "Time to write output: %.3fs\n", write_time);
+    }
+  } else {
+    if (use_stdout) {
+      fprintf(stderr, "Error writing to stdout !!\n");
    } else {
      fprintf(stderr, "Error writing file %s !!\n", out_file);
    }
  }
+  return ok;  // non-zero on success, 0 if opening or writing failed
+}

 static void Help(void) {
  printf("Usage: dwebp in_file [options] [-o out_file]\n\n"
         "Decodes the WebP image file to PNG format [Default]\n"
         "Use following options to convert into alternate image formats:\n"
-         "  -ppm ......... save the raw RGB samples as color PPM\n"
+         "  -pam ......... save the raw RGBA samples as a color PAM\n"
+         "  -ppm ......... save the raw RGB samples as a color PPM\n"
+         "  -bmp ......... save as uncompressed BMP format\n"
+         "  -tiff ........ save as uncompressed TIFF format\n"
         "  -pgm ......... save the raw YUV samples as a grayscale PGM\n"
-         "                 file with IMC4 layout.\n"
+         "                 file with IMC4 layout\n"
+         "  -yuv ......... save the raw YUV samples in flat layout\n"
+         "\n"
         " Other options are:\n"
         "  -version  .... print version number and exit.\n"
         "  -nofancy ..... don't use the fancy YUV420 upscaler.\n"
         "  -nofilter .... disable in-loop filtering.\n"
+         "  -nodither .... disable dithering.\n"
+         "  -dither <d> .. dithering strength (in 0..100)\n"
         "  -mt .......... use multi-threading\n"
         "  -crop <x> <y> <w> <h> ... crop output with the given rectangle\n"
         "  -scale <w> <h> .......... scale the output (*after* any cropping)\n"
-#ifdef WEBP_EXPERIMENTAL_FEATURES
         "  -alpha ....... only save the alpha plane.\n"
-#endif
+         "  -incremental . use incremental decoding (useful for tests)\n"
         "  -h     ....... this help message.\n"
         "  -v     ....... verbose (e.g. print encoding/decoding times)\n"
 #ifndef WEBP_DLL
@ -340,7 +575,12 @@ static const char* const kStatusMessages[] = {
  "UNSUPPORTED_FEATURE", "SUSPENDED", "USER_ABORT", "NOT_ENOUGH_DATA"
 };

+static const char* const kFormatType[] = {
+  "unspecified", "lossy", "lossless"
+};
+
 int main(int argc, const char *argv[]) {
+  int ok = 0;
  const char *in_file = NULL;
  const char *out_file = NULL;

@ -348,6 +588,7 @@ int main(int argc, const char *argv[]) {
  WebPDecBuffer* const output_buffer = &config.output;
  WebPBitstreamFeatures* const bitstream = &config.input;
  OutputFileFormat format = PNG;
+  int incremental = 0;
  int c;

  if (!WebPInitDecoderConfig(&config)) {
@ -367,8 +608,14 @@ int main(int argc, const char *argv[]) {
      config.options.no_fancy_upsampling = 1;
    } else if (!strcmp(argv[c], "-nofilter")) {
      config.options.bypass_filtering = 1;
+    } else if (!strcmp(argv[c], "-pam")) {
+      format = PAM;
    } else if (!strcmp(argv[c], "-ppm")) {
      format = PPM;
+    } else if (!strcmp(argv[c], "-bmp")) {
+      format = BMP;
+    } else if (!strcmp(argv[c], "-tiff")) {
+      format = TIFF;
    } else if (!strcmp(argv[c], "-version")) {
      const int version = WebPGetDecoderVersion();
      printf("%d.%d.%d\n",
@ -376,8 +623,14 @@ int main(int argc, const char *argv[]) {
      return 0;
    } else if (!strcmp(argv[c], "-pgm")) {
      format = PGM;
+    } else if (!strcmp(argv[c], "-yuv")) {
+      format = YUV;
    } else if (!strcmp(argv[c], "-mt")) {
      config.options.use_threads = 1;
+    } else if (!strcmp(argv[c], "-nodither")) {
+      config.options.dithering_strength = 0;
+    } else if (!strcmp(argv[c], "-dither") && c < argc - 1) {
+      config.options.dithering_strength = strtol(argv[++c], NULL, 0);
    } else if (!strcmp(argv[c], "-crop") && c < argc - 4) {
      config.options.use_cropping = 1;
      config.options.crop_left   = strtol(argv[++c], NULL, 0);
@ -394,6 +647,11 @@ int main(int argc, const char *argv[]) {
    } else if (!strcmp(argv[c], "-noasm")) {
      VP8GetCPUInfo = NULL;
 #endif
+    } else if (!strcmp(argv[c], "-incremental")) {
+      incremental = 1;
+    } else if (!strcmp(argv[c], "--")) {
+      if (c < argc - 1) in_file = argv[++c];
+      break;
    } else if (argv[c][0] == '-') {
      fprintf(stderr, "Unknown option '%s'\n", argv[c]);
      Help();
@ -412,20 +670,27 @@ int main(int argc, const char *argv[]) {
  {
    Stopwatch stop_watch;
    VP8StatusCode status = VP8_STATUS_OK;
-    int ok;
    size_t data_size = 0;
    const uint8_t* data = NULL;

    if (!ExUtilReadFile(in_file, &data, &data_size)) return -1;

-    if (verbose)
-      StopwatchReadAndReset(&stop_watch);
+    if (verbose) {
+      StopwatchReset(&stop_watch);
+    }

    status = WebPGetFeatures(data, data_size, bitstream);
    if (status != VP8_STATUS_OK) {
      goto end;
    }

+    if (bitstream->has_animation) {
+      fprintf(stderr,
+              "Error! Decoding of an animated WebP file is not supported.\n"
+              "       Use webpmux to extract the individual frames or\n"
+              "       vwebp to view this image.\n");
+    }
+
    switch (format) {
      case PNG:
 #ifdef HAVE_WINCODEC_H
@ -434,10 +699,21 @@ int main(int argc, const char *argv[]) {
        output_buffer->colorspace = bitstream->has_alpha ? MODE_RGBA : MODE_RGB;
 #endif
        break;
+      case PAM:
+        output_buffer->colorspace = MODE_RGBA;
+        break;
      case PPM:
        output_buffer->colorspace = MODE_RGB;  // drops alpha for PPM
        break;
+      case BMP:
+        output_buffer->colorspace = bitstream->has_alpha ? MODE_BGRA : MODE_BGR;
+        break;
+      case TIFF:    // note: force pre-multiplied alpha
+        output_buffer->colorspace =
+            bitstream->has_alpha ? MODE_rgbA : MODE_RGB;
+        break;
      case PGM:
+      case YUV:
        output_buffer->colorspace = bitstream->has_alpha ? MODE_YUVA : MODE_YUV;
        break;
      case ALPHA_PLANE_ONLY:
@ -447,11 +723,25 @@ int main(int argc, const char *argv[]) {
        free((void*)data);
        return -1;
    }
+
+    // Decoding call.
+    if (!incremental) {
      status = WebPDecode(data, data_size, &config);
+    } else {
+      WebPIDecoder* const idec = WebPIDecode(data, data_size, &config);
+      if (idec == NULL) {
+        fprintf(stderr, "Failed during WebPINewDecoder().\n");
+        status = VP8_STATUS_OUT_OF_MEMORY;
+        goto end;
+      } else {
+        status = WebPIUpdate(idec, data, data_size);
+        WebPIDelete(idec);
+      }
+    }

    if (verbose) {
-      const double time = StopwatchReadAndReset(&stop_watch);
-      printf("Time to decode picture: %.3fs\n", time);
+      const double decode_time = StopwatchReadAndReset(&stop_watch);
+      fprintf(stderr, "Time to decode picture: %.3fs\n", decode_time);
    }
 end:
    free((void*)data);
@ -459,24 +749,29 @@ int main(int argc, const char *argv[]) {
    if (!ok) {
      fprintf(stderr, "Decoding of %s failed.\n", in_file);
      fprintf(stderr, "Status: %d (%s)\n", status, kStatusMessages[status]);
-      return -1;
+      goto Exit;
    }
  }

-  if (out_file) {
-    printf("Decoded %s. Dimensions: %d x %d%s. Now saving...\n", in_file,
-           output_buffer->width, output_buffer->height,
-           bitstream->has_alpha ? " (with alpha)" : "");
-    SaveOutput(output_buffer, format, out_file);
-  } else {
-    printf("File %s can be decoded (dimensions: %d x %d)%s.\n",
+  if (out_file != NULL) {
+    fprintf(stderr, "Decoded %s. Dimensions: %d x %d %s. Format: %s. "
+                    "Now saving...\n",
            in_file, output_buffer->width, output_buffer->height,
-           bitstream->has_alpha ? " (with alpha)" : "");
-    printf("Nothing written; use -o flag to save the result as e.g. PNG.\n");
+            bitstream->has_alpha ? " (with alpha)" : "",
+            kFormatType[bitstream->format]);
+    ok = SaveOutput(output_buffer, format, out_file);
+  } else {
+    fprintf(stderr, "File %s can be decoded "
+                    "(dimensions: %d x %d %s. Format: %s).\n",
+            in_file, output_buffer->width, output_buffer->height,
+            bitstream->has_alpha ? " (with alpha)" : "",
+            kFormatType[bitstream->format]);
+    fprintf(stderr, "Nothing written; "
+                    "use -o flag to save the result as e.g. PNG.\n");
  }
+ Exit:
  WebPFreeDecBuffer(output_buffer);
-
-  return 0;
+  return ok ? 0 : -1;
 }

 //------------------------------------------------------------------------------
--- a/examples/example_util.c
+++ b/examples/example_util.c
@ -1,8 +1,10 @@
 // Copyright 2012 Google Inc. All Rights Reserved.
 //
-// This code is licensed under the same terms as WebM:
-//  Software License Agreement:  http://www.webmproject.org/license/software/
-//  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
 // -----------------------------------------------------------------------------
 //
 //  Utility functions used by the example programs.
@ -12,10 +14,6 @@
 #include <stdio.h>
 #include <stdlib.h>

-#if defined(__cplusplus) || defined(c_plusplus)
-extern "C" {
-#endif
-
 // -----------------------------------------------------------------------------
 // File I/O

@ -44,8 +42,8 @@ int ExUtilReadFile(const char* const file_name,
  fclose(in);

  if (!ok) {
-    fprintf(stderr, "Could not read %zu bytes of data from file %s\n",
-            file_size, file_name);
+    fprintf(stderr, "Could not read %d bytes of data from file %s\n",
+            (int)file_size, file_name);
    free(file_data);
    return 0;
  }
@ -54,6 +52,21 @@ int ExUtilReadFile(const char* const file_name,
  return 1;
 }

-#if defined(__cplusplus) || defined(c_plusplus)
-}    // extern "C"
-#endif
+// Writes the 'data_size' bytes found at 'data' into the file named
+// 'file_name', opened in "wb" mode. Returns 1 on success. Returns 0 if
+// 'file_name' or 'data' is NULL, if the file cannot be opened (a message is
+// then printed on stderr), or if the data could not be written and flushed.
+int ExUtilWriteFile(const char* const file_name,
+                    const uint8_t* data, size_t data_size) {
+  int ok;
+  FILE* out;
+
+  if (file_name == NULL || data == NULL) {
+    return 0;
+  }
+  out = fopen(file_name, "wb");
+  if (out == NULL) {
+    fprintf(stderr, "Error! Cannot open output file '%s'\n", file_name);
+    return 0;
+  }
+  // fwrite() returns 0 when the item size is 0, so an empty payload must not
+  // be mistaken for a write failure.
+  ok = (data_size == 0) || (fwrite(data, data_size, 1, out) == 1);
+  // Buffered bytes are flushed by fclose(): a failure there means the file
+  // on disk is incomplete.
+  if (fclose(out) != 0) ok = 0;
+  return ok;
+}
+
--- a/examples/example_util.h
+++ b/examples/example_util.h
@ -1,8 +1,10 @@
 // Copyright 2012 Google Inc. All Rights Reserved.
 //
-// This code is licensed under the same terms as WebM:
-//  Software License Agreement:  http://www.webmproject.org/license/software/
-//  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
 // -----------------------------------------------------------------------------
 //
 //  Utility functions used by the example programs.
@ -13,7 +15,7 @@

 #include "webp/types.h"

-#if defined(__cplusplus) || defined(c_plusplus)
+#ifdef __cplusplus
 extern "C" {
 #endif

@ -23,7 +25,11 @@ extern "C" {
 int ExUtilReadFile(const char* const file_name,
                   const uint8_t** data, size_t* data_size);

-#if defined(__cplusplus) || defined(c_plusplus)
+// Write a data segment into a file named 'file_name'. Returns true if ok.
+int ExUtilWriteFile(const char* const file_name,
+                    const uint8_t* data, size_t data_size);
+
+#ifdef __cplusplus
 }    // extern "C"
 #endif

--- a/examples/gif2webp.c
+++ b/examples/gif2webp.c
@ -0,0 +1,663 @@
+// Copyright 2012 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+//  simple tool to convert animated GIFs to WebP
+//
+// Authors: Skal (pascal.massimino@gmail.com)
+//          Urvang (urvang@google.com)
+
+#include <assert.h>
+#include <stdio.h>
+#include <string.h>
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#ifdef WEBP_HAVE_GIF
+
+#include <gif_lib.h>
+#include "webp/encode.h"
+#include "webp/mux.h"
+#include "./example_util.h"
+#include "./gif2webp_util.h"
+
+// Bit of the graphics-extension flags byte telling whether a transparent
+// color index is present.
+#define GIF_TRANSPARENT_MASK 0x01
+// The disposal method is extracted from the graphics-extension flags byte as
+// (flags >> GIF_DISPOSE_SHIFT) & GIF_DISPOSE_MASK.
+#define GIF_DISPOSE_MASK     0x07
+#define GIF_DISPOSE_SHIFT    2
+// Initial background color of the animation parameters.
+#define WHITE_COLOR          0xffffffff
+// Upper bound enforced on 'kmax - kmin' by SanitizeKeyFrameIntervals().
+#define MAX_CACHE_SIZE       30
+
+//------------------------------------------------------------------------------
+
+static int transparent_index = -1;  // Index of transparent color in the map.
+
+// Adjusts, in place, the key-frame distances pointed to by 'kmin_ptr' and
+// 'kmax_ptr' so that on return:
+//   kmin < kmax,  kmin >= kmax / 2 + 1  and  kmax - kmin <= MAX_CACHE_SIZE.
+// A kmin of 0 disables key-frame insertion; a kmax of 0 is replaced by the
+// largest size_t value. In these two cases the adjustments are silent,
+// otherwise each one is reported as a warning on stderr.
+static void SanitizeKeyFrameIntervals(size_t* const kmin_ptr,
+                                      size_t* const kmax_ptr) {
+  const size_t kUnbounded = ~(size_t)0;
+  size_t lo = *kmin_ptr;
+  size_t hi = *kmax_ptr;
+  int warn = 1;
+
+  if (lo == 0) {          // Disable keyframe insertion.
+    hi = kUnbounded;
+    lo = hi - 1;
+    warn = 0;
+  } else if (hi == 0) {   // No explicit maximum requested.
+    hi = kUnbounded;
+    warn = 0;
+  }
+
+  {
+    // Keeping kmin above this threshold guarantees that
+    // cache.keyframe + kmin >= kmax always holds, which allows flushing all
+    // the frames in the 'count_since_key_frame == kmax' case.
+    const size_t lo_floor = hi / 2 + 1;
+    if (lo >= hi) {
+      lo = hi - 1;
+      if (warn) {
+        fprintf(stderr,
+                "WARNING: Setting kmin = %d, so that kmin < kmax.\n", (int)lo);
+      }
+    } else if (lo < lo_floor) {
+      lo = lo_floor;
+      if (warn) {
+        fprintf(stderr,
+                "WARNING: Setting kmin = %d, so that kmin >= kmax / 2 + 1.\n",
+                (int)lo);
+      }
+    }
+  }
+
+  // Bound the number of frames that get allocated.
+  if (hi - lo > MAX_CACHE_SIZE) {
+    lo = hi - MAX_CACHE_SIZE;
+    if (warn) {
+      fprintf(stderr,
+              "WARNING: Setting kmin = %d, so that kmax - kmin <= 30.\n",
+              (int)lo);
+    }
+  }
+
+  *kmin_ptr = lo;
+  *kmax_ptr = hi;
+}
+
+// Converts 'len' palette indices read from 'src' into ARGB pixels written to
+// 'dst'. The image's own color map is used when present, the screen color map
+// otherwise; 'dst' is left untouched if no usable color map is available.
+// Pixels equal to 'transparent_index', and indices lying outside of the color
+// map, are written as WEBP_UTIL_TRANSPARENT_COLOR.
+static void Remap(const uint8_t* const src, const GifFileType* const gif,
+                  uint32_t* dst, int len) {
+  int i;
+  const GifColorType* colors;
+  const ColorMapObject* const cmap =
+      gif->Image.ColorMap ? gif->Image.ColorMap : gif->SColorMap;
+  if (cmap == NULL || cmap->Colors == NULL) return;
+  colors = cmap->Colors;
+
+  for (i = 0; i < len; ++i) {
+    const int index = src[i];
+    if (index == transparent_index || index >= cmap->ColorCount) {
+      // Out-of-range indices must never be used to read past the palette.
+      dst[i] = WEBP_UTIL_TRANSPARENT_COLOR;
+    } else {
+      const GifColorType c = colors[index];
+      // Unsigned arithmetic on purpose: '0xff << 24' overflows a signed int.
+      dst[i] = (uint32_t)c.Blue | ((uint32_t)c.Green << 8) |
+               ((uint32_t)c.Red << 16) | (0xffu << 24);
+    }
+  }
+}
+
+// Reads the pixels of the current GIF image into 'webp_frame'.
+// The image rectangle taken from the GIF image descriptor is stored in
+// '*gif_rect', and the decoded rows are written through a view of
+// 'webp_frame' covering that rectangle. Returns 1 on success. Returns 0 if
+// the rectangle is rejected by WebPPictureView(), if the row buffer cannot be
+// allocated or if a row cannot be read; in the last two cases the view's
+// error code is copied to 'webp_frame'.
+static int ReadFrame(GifFileType* const gif, WebPFrameRect* const gif_rect,
+                     WebPPicture* const webp_frame) {
+  // Row schedule of an interlaced image: first row and step of each pass.
+  static const int kPassStart[] = { 0, 4, 2, 1 };
+  static const int kPassStep[]  = { 8, 8, 4, 2 };
+  const GifImageDesc desc = gif->Image;
+  // A non-interlaced image is a single pass over consecutive rows.
+  const int num_passes = desc.Interlace ? 4 : 1;
+  WebPFrameRect area = { desc.Left, desc.Top, desc.Width, desc.Height };
+  WebPPicture view;
+  uint8_t* row = NULL;
+  int success = 0;
+
+  *gif_rect = area;
+
+  // Decode through a view restricted to the image rectangle.
+  if (!WebPPictureView(webp_frame, area.x_offset, area.y_offset,
+                       area.width, area.height, &view)) {
+    fprintf(stderr, "Sub-image %dx%d at position %d,%d is invalid!\n",
+            area.width, area.height, area.x_offset, area.y_offset);
+    return 0;
+  }
+
+  row = (uint8_t*)malloc(area.width * sizeof(*row));
+  if (row != NULL) {
+    int pass;
+    success = 1;
+    for (pass = 0; success && pass < num_passes; ++pass) {
+      const int first = desc.Interlace ? kPassStart[pass] : 0;
+      const int step = desc.Interlace ? kPassStep[pass] : 1;
+      int y;
+      for (y = first; success && y < area.height; y += step) {
+        if (DGifGetLine(gif, row, area.width) == GIF_ERROR) {
+          success = 0;
+        } else {
+          Remap(row, gif, view.argb + y * view.argb_stride, area.width);
+        }
+      }
+    }
+  }
+
+  if (!success) webp_frame->error_code = view.error_code;
+  WebPPictureFree(&view);
+  free(row);
+  return success;
+}
+
+// Looks up the ARGB value of the background color 'bgcolor_idx'.
+// If a transparent index is set and 'bgcolor_idx' equals it, '*bgcolor' is set
+// to WEBP_UTIL_TRANSPARENT_COLOR. Otherwise the opaque color found at
+// 'bgcolor_idx' in 'color_map' is stored. Returns 1 on success, or 0 (leaving
+// '*bgcolor' untouched) if the color map is missing or the index is outside
+// of it.
+static int GetBackgroundColor(const ColorMapObject* const color_map,
+                              int bgcolor_idx, uint32_t* const bgcolor) {
+  if (transparent_index != -1 && bgcolor_idx == transparent_index) {
+    *bgcolor = WEBP_UTIL_TRANSPARENT_COLOR;  // Special case.
+    return 1;
+  } else if (color_map == NULL || color_map->Colors == NULL
+             || bgcolor_idx < 0 || bgcolor_idx >= color_map->ColorCount) {
+    return 0;  // Invalid color map or index.
+  } else {
+    const GifColorType color = color_map->Colors[bgcolor_idx];
+    // Unsigned arithmetic on purpose: '0xff << 24' overflows a signed int.
+    *bgcolor = (0xffu                 << 24)
+             | ((uint32_t)color.Red   << 16)
+             | ((uint32_t)color.Green <<  8)
+             | ((uint32_t)color.Blue  <<  0);
+    return 1;
+  }
+}
+
+// Prints a "GIFLib Error <gif_error>: <description>" message on stderr.
+// How the description is obtained depends on the giflib version:
+//  - giflib >= 5: GifErrorString() is given 'gif->Error', or 'gif_error' when
+//    'gif' is NULL. Note that the printed number is always 'gif_error'.
+//  - giflib 4.2.x: GifErrorString() takes no argument; 'gif' is unused.
+//  - older versions: PrintGifError() is called; 'gif' is unused.
+static void DisplayGifError(const GifFileType* const gif, int gif_error) {
+  // GIFLIB_MAJOR is only defined in libgif >= 4.2.0.
+  // libgif 4.2.0 has retired PrintGifError() and added GifErrorString().
+#if defined(GIFLIB_MAJOR) && defined(GIFLIB_MINOR) && \
+        ((GIFLIB_MAJOR == 4 && GIFLIB_MINOR >= 2) || GIFLIB_MAJOR > 4)
+#if GIFLIB_MAJOR >= 5
+  // Static string actually, hence the const char* cast.
+  const char* error_str = (const char*)GifErrorString(
+      (gif == NULL) ? gif_error : gif->Error);
+#else
+  const char* error_str = (const char*)GifErrorString();
+  (void)gif;
+#endif
+  // Fall back to a generic text when no description is returned.
+  if (error_str == NULL) error_str = "Unknown error";
+  fprintf(stderr, "GIFLib Error %d: %s\n", gif_error, error_str);
+#else
+  (void)gif;
+  fprintf(stderr, "GIFLib Error %d: ", gif_error);
+  PrintGifError();
+  fprintf(stderr, "\n");
+#endif
+}
+
+// Names of the WebPMuxError failure codes, stored so that the entry for an
+// error 'err' is found at index '-err'.
+static const char* const kErrorMessages[] = {
+  "WEBP_MUX_NOT_FOUND",
+  "WEBP_MUX_INVALID_ARGUMENT",
+  "WEBP_MUX_BAD_DATA",
+  "WEBP_MUX_MEMORY_ERROR",
+  "WEBP_MUX_NOT_ENOUGH_DATA"
+};
+
+// Returns the name of the error code 'err', which must lie within
+// [WEBP_MUX_NOT_ENOUGH_DATA, WEBP_MUX_NOT_FOUND] (only checked by an assert).
+static const char* ErrorString(WebPMuxError err) {
+  const int slot = -err;
+  assert(err >= WEBP_MUX_NOT_ENOUGH_DATA && err <= WEBP_MUX_NOT_FOUND);
+  return kErrorMessages[slot];
+}
+
+// Bit flags selecting which metadata is copied from the GIF to the WebP
+// output; combined into 'keep_metadata' by the '-metadata' option.
+enum {
+  METADATA_ICC  = (1 << 0),   // "icc" token
+  METADATA_XMP  = (1 << 1),   // "xmp" token
+  METADATA_ALL  = METADATA_ICC | METADATA_XMP   // "all" token
+};
+
+//------------------------------------------------------------------------------
+
+// Prints the command-line usage of gif2webp on stdout.
+static void Help(void) {
+  // None of these strings contains a conversion specifier, so they are
+  // emitted verbatim, in order.
+  static const char* const kUsage[] = {
+    "Usage:\n",
+    " gif2webp [options] gif_file -o webp_file\n",
+    "options:\n",
+    "  -h / -help  ............ this help\n",
+    "  -lossy ................. Encode image using lossy compression.\n",
+    "  -mixed ................. For each frame in the image, pick lossy\n"
+    "                           or lossless compression heuristically.\n",
+    "  -q <float> ............. quality factor (0:small..100:big)\n",
+    "  -m <int> ............... compression method (0=fast, 6=slowest)\n",
+    "  -kmin <int> ............ Min distance between key frames\n",
+    "  -kmax <int> ............ Max distance between key frames\n",
+    "  -f <int> ............... filter strength (0=off..100)\n",
+    "  -metadata <string> ..... comma separated list of metadata to\n",
+    "                           ",
+    "copy from the input to the output if present.\n",
+    "                           "
+    "Valid values: all, none, icc, xmp (default)\n",
+    "  -mt .................... use multi-threading if available\n",
+    "\n",
+    "  -version ............... print version number and exit.\n",
+    "  -v ..................... verbose.\n",
+    "  -quiet ................. don't print anything.\n",
+    "\n"
+  };
+  size_t i;
+  for (i = 0; i < sizeof(kUsage) / sizeof(kUsage[0]); ++i) {
+    fputs(kUsage[i], stdout);
+  }
+}
+
+//------------------------------------------------------------------------------
+
+// Entry point of the gif2webp tool.
+// Parses the command line, reads the GIF file record by record, adds every
+// image to a frame cache that feeds a WebP mux object, copies the requested
+// metadata (XMP and/or ICC) and the loop count, then assembles the result and
+// writes it to the file given with '-o' (if any).
+// Returns 0 on success (as well as for the help / version requests) and a
+// non-zero value on failure.
+int main(int argc, const char *argv[]) {
+  int verbose = 0;
+  // Starts out as GIF_ERROR and only becomes GIF_OK on complete success, so
+  // every 'goto End' failure path below also prints a GIFLib error.
+  int gif_error = GIF_ERROR;
+  WebPMuxError err = WEBP_MUX_OK;
+  int ok = 0;
+  const char *in_file = NULL, *out_file = NULL;
+  GifFileType* gif = NULL;
+  WebPConfig config;
+  WebPPicture frame;
+  WebPMuxFrameInfo info;
+  WebPMuxAnimParams anim = { WHITE_COLOR, 0 };
+  WebPFrameCache* cache = NULL;
+
+  int is_first_frame = 1;     // Whether we are processing the first frame.
+  int done;
+  int c;
+  int quiet = 0;
+  WebPMux* mux = NULL;
+  WebPData webp_data = { NULL, 0 };
+  int keep_metadata = METADATA_XMP;  // ICC not output by default.
+  int stored_icc = 0;  // Whether we have already stored an ICC profile.
+  int stored_xmp = 0;
+
+  int default_kmin = 1;  // Whether to use default kmin value.
+  int default_kmax = 1;
+  size_t kmin = 0;
+  size_t kmax = 0;
+  int allow_mixed = 0;   // If true, each frame can be lossy or lossless.
+
+  memset(&info, 0, sizeof(info));
+  info.id = WEBP_CHUNK_ANMF;
+  info.dispose_method = WEBP_MUX_DISPOSE_BACKGROUND;
+  info.blend_method = WEBP_MUX_BLEND;
+
+  if (!WebPConfigInit(&config) || !WebPPictureInit(&frame)) {
+    fprintf(stderr, "Error! Version mismatch!\n");
+    return -1;
+  }
+  config.lossless = 1;  // Use lossless compression by default.
+  config.image_hint = WEBP_HINT_GRAPH;   // always low-color
+
+  if (argc == 1) {
+    Help();
+    return 0;
+  }
+
+  for (c = 1; c < argc; ++c) {
+    if (!strcmp(argv[c], "-h") || !strcmp(argv[c], "-help")) {
+      Help();
+      return 0;
+    } else if (!strcmp(argv[c], "-o") && c < argc - 1) {
+      out_file = argv[++c];
+    } else if (!strcmp(argv[c], "-lossy")) {
+      config.lossless = 0;
+    } else if (!strcmp(argv[c], "-mixed")) {
+      allow_mixed = 1;
+      config.lossless = 0;
+    } else if (!strcmp(argv[c], "-q") && c < argc - 1) {
+      config.quality = (float)strtod(argv[++c], NULL);
+    } else if (!strcmp(argv[c], "-m") && c < argc - 1) {
+      config.method = strtol(argv[++c], NULL, 0);
+    } else if (!strcmp(argv[c], "-kmax") && c < argc - 1) {
+      kmax = strtoul(argv[++c], NULL, 0);
+      default_kmax = 0;
+    } else if (!strcmp(argv[c], "-kmin") && c < argc - 1) {
+      kmin = strtoul(argv[++c], NULL, 0);
+      default_kmin = 0;
+    } else if (!strcmp(argv[c], "-f") && c < argc - 1) {
+      config.filter_strength = strtol(argv[++c], NULL, 0);
+    } else if (!strcmp(argv[c], "-metadata") && c < argc - 1) {
+      static const struct {
+        const char* option;
+        int flag;
+      } kTokens[] = {
+        { "all",  METADATA_ALL },
+        { "none", 0 },
+        { "icc",  METADATA_ICC },
+        { "xmp",  METADATA_XMP },
+      };
+      const size_t kNumTokens = sizeof(kTokens) / sizeof(*kTokens);
+      const char* start = argv[++c];
+      const char* const end = start + strlen(start);
+
+      // The explicit list replaces the default selection.
+      keep_metadata = 0;
+      while (start < end) {
+        size_t i;
+        const char* token = strchr(start, ',');
+        if (token == NULL) token = end;
+
+        for (i = 0; i < kNumTokens; ++i) {
+          if ((size_t)(token - start) == strlen(kTokens[i].option) &&
+              !strncmp(start, kTokens[i].option, strlen(kTokens[i].option))) {
+            if (kTokens[i].flag != 0) {
+              keep_metadata |= kTokens[i].flag;
+            } else {
+              keep_metadata = 0;
+            }
+            break;
+          }
+        }
+        if (i == kNumTokens) {
+          fprintf(stderr, "Error! Unknown metadata type '%.*s'\n",
+                  (int)(token - start), start);
+          Help();
+          return -1;
+        }
+        start = token + 1;
+      }
+    } else if (!strcmp(argv[c], "-mt")) {
+      ++config.thread_level;
+    } else if (!strcmp(argv[c], "-version")) {
+      const int enc_version = WebPGetEncoderVersion();
+      const int mux_version = WebPGetMuxVersion();
+      printf("WebP Encoder version: %d.%d.%d\nWebP Mux version: %d.%d.%d\n",
+             (enc_version >> 16) & 0xff, (enc_version >> 8) & 0xff,
+             enc_version & 0xff, (mux_version >> 16) & 0xff,
+             (mux_version >> 8) & 0xff, mux_version & 0xff);
+      return 0;
+    } else if (!strcmp(argv[c], "-quiet")) {
+      quiet = 1;
+    } else if (!strcmp(argv[c], "-v")) {
+      verbose = 1;
+    } else if (!strcmp(argv[c], "--")) {
+      // Everything after '--' is the input file name, even if it starts
+      // with a dash.
+      if (c < argc - 1) in_file = argv[++c];
+      break;
+    } else if (argv[c][0] == '-') {
+      fprintf(stderr, "Error! Unknown option '%s'\n", argv[c]);
+      Help();
+      return -1;
+    } else {
+      in_file = argv[c];
+    }
+  }
+
+  // Appropriate default kmin, kmax values for lossy and lossless.
+  if (default_kmin) {
+    kmin = config.lossless ? 9 : 3;
+  }
+  if (default_kmax) {
+    kmax = config.lossless ? 17 : 5;
+  }
+  SanitizeKeyFrameIntervals(&kmin, &kmax);
+
+  if (!WebPValidateConfig(&config)) {
+    fprintf(stderr, "Error! Invalid configuration.\n");
+    goto End;
+  }
+
+  if (in_file == NULL) {
+    fprintf(stderr, "No input file specified!\n");
+    Help();
+    goto End;
+  }
+
+  // Start the decoder object
+#if defined(GIFLIB_MAJOR) && (GIFLIB_MAJOR >= 5)
+  // There was an API change in version 5.0.0.
+  gif = DGifOpenFileName(in_file, &gif_error);
+#else
+  gif = DGifOpenFileName(in_file);
+#endif
+  if (gif == NULL) goto End;
+
+  // Allocate current buffer
+  frame.width = gif->SWidth;
+  frame.height = gif->SHeight;
+  frame.use_argb = 1;
+  if (!WebPPictureAlloc(&frame)) goto End;
+
+  // Initialize cache
+  cache = WebPFrameCacheNew(frame.width, frame.height, kmin, kmax, allow_mixed);
+  if (cache == NULL) goto End;
+
+  mux = WebPMuxNew();
+  if (mux == NULL) {
+    fprintf(stderr, "ERROR: could not create a mux object.\n");
+    goto End;
+  }
+
+  // Loop over GIF images
+  done = 0;
+  do {
+    GifRecordType type;
+    if (DGifGetRecordType(gif, &type) == GIF_ERROR) goto End;
+
+    switch (type) {
+      case IMAGE_DESC_RECORD_TYPE: {
+        WebPFrameRect gif_rect;
+
+        if (!DGifGetImageDesc(gif)) goto End;
+        if (!ReadFrame(gif, &gif_rect, &frame)) {
+          goto End;
+        }
+
+        // NOTE(review): a failure here is only reported and the conversion
+        // carries on -- confirm that this is intended.
+        if (!WebPFrameCacheAddFrame(cache, &config, &gif_rect, &frame, &info)) {
+          fprintf(stderr, "Error! Cannot encode frame as WebP\n");
+          fprintf(stderr, "Error code: %d\n", frame.error_code);
+        }
+
+        err = WebPFrameCacheFlush(cache, verbose, mux);
+        if (err != WEBP_MUX_OK) {
+          fprintf(stderr, "ERROR (%s): Could not add animation frame.\n",
+                  ErrorString(err));
+          goto End;
+        }
+        is_first_frame = 0;
+        break;
+      }
+      case EXTENSION_RECORD_TYPE: {
+        int extension;
+        GifByteType *data = NULL;
+        if (DGifGetExtension(gif, &extension, &data) == GIF_ERROR) {
+          goto End;
+        }
+        // An extension without any data block has nothing to parse, and
+        // 'data' must not be dereferenced below.
+        if (data == NULL) break;
+        switch (extension) {
+          case COMMENT_EXT_FUNC_CODE: {
+            break;  // Do nothing for now.
+          }
+          case GRAPHICS_EXT_FUNC_CODE: {
+            int flags, dispose, delay;
+            // Validate the block size before reading its payload.
+            if (data[0] != 4) goto End;
+            flags = data[1];
+            dispose = (flags >> GIF_DISPOSE_SHIFT) & GIF_DISPOSE_MASK;
+            delay = data[2] | (data[3] << 8);  // In 10 ms units.
+            info.duration = delay * 10;  // Duration is in 1 ms units for WebP.
+            if (dispose == 3) {
+              static int warning_printed = 0;
+              if (!warning_printed) {
+                fprintf(stderr, "WARNING: GIF_DISPOSE_RESTORE unsupported.\n");
+                warning_printed = 1;
+              }
+              // failsafe. TODO(urvang): emulate the correct behaviour by
+              // recoding the whole frame.
+              info.dispose_method = WEBP_MUX_DISPOSE_BACKGROUND;
+            } else {
+              info.dispose_method =
+                  (dispose == 2) ? WEBP_MUX_DISPOSE_BACKGROUND
+                                 : WEBP_MUX_DISPOSE_NONE;
+            }
+            transparent_index = (flags & GIF_TRANSPARENT_MASK) ? data[4] : -1;
+            if (is_first_frame) {
+              if (!GetBackgroundColor(gif->SColorMap, gif->SBackGroundColor,
+                                      &anim.bgcolor)) {
+                fprintf(stderr, "GIF decode warning: invalid background color "
+                                "index. Assuming white background.\n");
+              }
+              WebPUtilClearPic(&frame, NULL);
+            }
+            break;
+          }
+          case PLAINTEXT_EXT_FUNC_CODE: {
+            break;
+          }
+          case APPLICATION_EXT_FUNC_CODE: {
+            if (data[0] != 11) break;    // Chunk is too short
+            if (!memcmp(data + 1, "NETSCAPE2.0", 11)) {
+              // Recognize and parse Netscape2.0 NAB extension for loop count.
+              if (DGifGetExtensionNext(gif, &data) == GIF_ERROR) goto End;
+              if (data == NULL) goto End;  // Loop count sub-block missing.
+              // Both conditions are required: at least 3 bytes of payload
+              // and the loop-count marker.
+              if (data[0] < 3 || data[1] != 1) break;   // wrong size/marker
+              anim.loop_count = data[2] | (data[3] << 8);
+              if (verbose) printf("Loop count: %d\n", anim.loop_count);
+            } else {  // An extension containing metadata.
+              // We only store the first encountered chunk of each type, and
+              // only if requested by the user.
+              const int is_xmp = (keep_metadata & METADATA_XMP) &&
+                                 !stored_xmp &&
+                                 !memcmp(data + 1, "XMP DataXMP", 11);
+              const int is_icc = (keep_metadata & METADATA_ICC) &&
+                                 !stored_icc &&
+                                 !memcmp(data + 1, "ICCRGBG1012", 11);
+              if (is_xmp || is_icc) {
+                // Both tables are indexed by 'is_icc': 0 -> XMP, 1 -> ICC.
+                const char* const fourccs[2] = { "XMP " , "ICCP" };
+                const char* const features[2] = { "XMP" , "ICC" };
+                WebPData metadata = { NULL, 0 };
+                // Construct metadata from sub-blocks.
+                // Usual case (including ICC profile): In each sub-block, the
+                // first byte specifies its size in bytes (0 to 255) and the
+                // rest of the bytes contain the data.
+                // Special case for XMP data: In each sub-block, the first byte
+                // is also part of the XMP payload. XMP in GIF also has a 257
+                // byte padding data. See the XMP specification for details.
+                while (1) {
+                  // Copy kept so that the buffer can still be released if
+                  // realloc() fails.
+                  WebPData prev_metadata = metadata;
+                  WebPData subblock;
+                  if (DGifGetExtensionNext(gif, &data) == GIF_ERROR) {
+                    WebPDataClear(&metadata);
+                    goto End;
+                  }
+                  if (data == NULL) break;  // Finished.
+                  subblock.size = is_xmp ? data[0] + 1 : data[0];
+                  assert(subblock.size > 0);
+                  subblock.bytes = is_xmp ? data : data + 1;
+                  metadata.bytes =
+                      (uint8_t*)realloc((void*)metadata.bytes,
+                                        prev_metadata.size + subblock.size);
+                  if (metadata.bytes == NULL) {
+                    WebPDataClear(&prev_metadata);
+                    goto End;
+                  }
+                  metadata.size += subblock.size;
+                  memcpy((void*)(metadata.bytes + prev_metadata.size),
+                         subblock.bytes, subblock.size);
+                }
+                if (is_xmp) {
+                  // XMP padding data is 0x01, 0xff, 0xfe ... 0x01, 0x00.
+                  const size_t xmp_padding_size = 257;
+                  if (metadata.size > xmp_padding_size) {
+                    metadata.size -= xmp_padding_size;
+                  }
+                }
+
+                // Add metadata chunk.
+                err = WebPMuxSetChunk(mux, fourccs[is_icc], &metadata, 1);
+                if (verbose) {
+                  printf("%s size: %d\n", features[is_icc], (int)metadata.size);
+                }
+                WebPDataClear(&metadata);
+                if (err != WEBP_MUX_OK) {
+                  fprintf(stderr, "ERROR (%s): Could not set %s chunk.\n",
+                          ErrorString(err), features[is_icc]);
+                  goto End;
+                }
+                if (is_icc) {
+                  stored_icc = 1;
+                } else if (is_xmp) {
+                  stored_xmp = 1;
+                }
+              }
+            }
+            break;
+          }
+          default: {
+            break;  // skip
+          }
+        }
+        // Skip whatever sub-blocks of this extension were not consumed.
+        while (data != NULL) {
+          if (DGifGetExtensionNext(gif, &data) == GIF_ERROR) goto End;
+        }
+        break;
+      }
+      case TERMINATE_RECORD_TYPE: {
+        done = 1;
+        break;
+      }
+      default: {
+        if (verbose) {
+          fprintf(stderr, "Skipping over unknown record type %d\n", type);
+        }
+        break;
+      }
+    }
+  } while (!done);
+
+  // Flush any pending frames.
+  err = WebPFrameCacheFlushAll(cache, verbose, mux);
+  if (err != WEBP_MUX_OK) {
+    fprintf(stderr, "ERROR (%s): Could not add animation frame.\n",
+            ErrorString(err));
+    goto End;
+  }
+
+  // Finish muxing
+  err = WebPMuxSetAnimationParams(mux, &anim);
+  if (err != WEBP_MUX_OK) {
+    fprintf(stderr, "ERROR (%s): Could not set animation parameters.\n",
+            ErrorString(err));
+    goto End;
+  }
+
+  err = WebPMuxAssemble(mux, &webp_data);
+  if (err != WEBP_MUX_OK) {
+    fprintf(stderr, "ERROR (%s) assembling the WebP file.\n", ErrorString(err));
+    goto End;
+  }
+  if (out_file != NULL) {
+    if (!ExUtilWriteFile(out_file, webp_data.bytes, webp_data.size)) {
+      fprintf(stderr, "Error writing output file: %s\n", out_file);
+      goto End;
+    }
+    if (!quiet) {
+      printf("Saved output file: %s\n", out_file);
+    }
+  } else {
+    if (!quiet) {
+      printf("Nothing written; use -o flag to save the result.\n");
+    }
+  }
+
+  // All OK.
+  ok = 1;
+  gif_error = GIF_OK;
+
+ End:
+  WebPDataClear(&webp_data);
+  WebPMuxDelete(mux);
+  WebPPictureFree(&frame);
+  WebPFrameCacheDelete(cache);
+
+  if (gif_error != GIF_OK) {
+    DisplayGifError(gif, gif_error);
+  }
+  if (gif != NULL) {
+    DGifCloseFile(gif);
+  }
+
+  return !ok;
+}
+
+#else  // !WEBP_HAVE_GIF
+
+// Fallback entry point used when the tool is built without GIF support:
+// reports that fact on stderr and returns 0.
+int main(int argc, const char *argv[]) {
+  const char* const program = argv[0];
+  (void)argc;
+  fprintf(stderr, "GIF support not enabled in %s.\n", program);
+  return 0;
+}
+
+#endif
+
+//------------------------------------------------------------------------------
--- a/examples/gif2webp_util.c
+++ b/examples/gif2webp_util.c
@ -0,0 +1,667 @@
+// Copyright 2013 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+//  Helper structs and methods for gif2webp tool.
+//
+
+#include <assert.h>
+#include <stdio.h>
+
+#include "webp/encode.h"
+#include "./gif2webp_util.h"
+
+// DELTA_INFINITY: value 'best_delta' is reset to before any key-frame
+// candidate has been measured.
+// KEYFRAME_NONE: value of 'keyframe' when no key frame is currently selected.
+// Both expansions are parenthesized so they stay intact inside expressions.
+#define DELTA_INFINITY      (1ULL << 32)
+#define KEYFRAME_NONE       (-1)
+
+//------------------------------------------------------------------------------
+// Helper utilities.
+
+// Sets every pixel of the 'width' x 'height' rectangle whose top-left corner
+// is at ('left', 'top') in 'picture' to WEBP_UTIL_TRANSPARENT_COLOR.
+static void ClearRectangle(WebPPicture* const picture,
+                           int left, int top, int width, int height) {
+  const int right = left + width;
+  const int bottom = top + height;
+  int y;
+  for (y = top; y < bottom; ++y) {
+    uint32_t* const row = picture->argb + y * picture->argb_stride;
+    int x;
+    for (x = left; x < right; ++x) {
+      row[x] = WEBP_UTIL_TRANSPARENT_COLOR;
+    }
+  }
+}
+
+// Clears 'picture' to the transparent color: only the area described by
+// 'rect' when it is non-NULL, the whole picture otherwise.
+void WebPUtilClearPic(WebPPicture* const picture,
+                      const WebPFrameRect* const rect) {
+  if (rect == NULL) {
+    ClearRectangle(picture, 0, 0, picture->width, picture->height);
+    return;
+  }
+  ClearRectangle(picture, rect->x_offset, rect->y_offset,
+                 rect->width, rect->height);
+}
+
+// TODO: Also used in picture.c. Move to a common location?
+// Copies 'height' rows of 'width' bytes each from 'src' to 'dst'. After each
+// row the pointers advance by 'src_stride' and 'dst_stride' bytes respectively.
+static void CopyPlane(const uint8_t* src, int src_stride,
+                      uint8_t* dst, int dst_stride, int width, int height) {
+  int row;
+  for (row = 0; row < height; ++row) {
+    memcpy(dst, src, width);
+    src += src_stride;
+    dst += dst_stride;
+  }
+}
+
+// Copies the whole ARGB plane of 'src' into 'dst', row by row, honoring each
+// picture's own stride. Both pictures must already be allocated and have the
+// same dimensions.
+static void CopyPixels(const WebPPicture* const src, WebPPicture* const dst) {
+  const int src_stride_bytes = 4 * src->argb_stride;
+  const int dst_stride_bytes = 4 * dst->argb_stride;
+  const int row_bytes = 4 * src->width;
+  assert(src->width == dst->width && src->height == dst->height);
+  CopyPlane((uint8_t*)src->argb, src_stride_bytes,
+            (uint8_t*)dst->argb, dst_stride_bytes,
+            row_bytes, src->height);
+}
+
+// Blends the area 'rect' of 'src' into 'dst': every 'src' pixel with non-zero
+// alpha overwrites the 'dst' pixel at the same position; pixels with zero
+// alpha leave 'dst' untouched.
+static void BlendPixels(const WebPPicture* const src,
+                        const WebPFrameRect* const rect,
+                        WebPPicture* const dst) {
+  int x, y;
+  assert(src->width == dst->width && src->height == dst->height);
+  for (y = rect->y_offset; y < rect->y_offset + rect->height; ++y) {
+    for (x = rect->x_offset; x < rect->x_offset + rect->width; ++x) {
+      const uint32_t pixel = src->argb[y * src->argb_stride + x];
+      const int is_visible = ((pixel >> 24) != 0);
+      if (!is_visible) continue;
+      dst->argb[y * dst->argb_stride + x] = pixel;
+    }
+  }
+}
+
+// Within 'rect', replaces each pixel of 'dst' whose alpha is 0 by the pixel of
+// 'src' at the same position, provided that 'src' pixel is fully opaque.
+static void ReduceTransparency(const WebPPicture* const src,
+                               const WebPFrameRect* const rect,
+                               WebPPicture* const dst) {
+  int x, y;
+  assert(src != NULL && dst != NULL && rect != NULL);
+  assert(src->width == dst->width && src->height == dst->height);
+  for (y = rect->y_offset; y < rect->y_offset + rect->height; ++y) {
+    for (x = rect->x_offset; x < rect->x_offset + rect->width; ++x) {
+      const uint32_t from = src->argb[y * src->argb_stride + x];
+      uint32_t* const to = &dst->argb[y * dst->argb_stride + x];
+      const int from_is_opaque = ((from >> 24) == 0xff);
+      const int to_is_transparent = ((*to >> 24) == 0);
+      if (to_is_transparent && from_is_opaque) {
+        *to = from;
+      }
+    }
+  }
+}
+
+// Replace similar blocks of pixels by a 'see-through' transparent block
+// with uniform average color.
+// Only whole 8x8 blocks aligned on multiples of 8 and lying inside 'rect' are
+// examined. A block of 'dst' is rewritten only when every one of its pixels is
+// fully opaque in 'src' and identical in 'src' and 'dst'.
+static void FlattenSimilarBlocks(const WebPPicture* const src,
+                                 const WebPFrameRect* const rect,
+                                 WebPPicture* const dst) {
+  int i, j;
+  const int block_size = 8;
+  // Start bounds: (offset + block_size) rounded down to a multiple of
+  // block_size, so an already-aligned offset still advances by a full block.
+  // End bounds: rounded down, so a partial trailing block is never visited.
+  const int y_start = (rect->y_offset + block_size) & ~(block_size - 1);
+  const int y_end = (rect->y_offset + rect->height) & ~(block_size - 1);
+  const int x_start = (rect->x_offset + block_size) & ~(block_size - 1);
+  const int x_end = (rect->x_offset + rect->width) & ~(block_size - 1);
+  assert(src != NULL && dst != NULL && rect != NULL);
+  assert(src->width == dst->width && src->height == dst->height);
+  assert((block_size & (block_size - 1)) == 0);  // must be a power of 2
+  // Iterate over each block and count similar pixels.
+  for (j = y_start; j < y_end; j += block_size) {
+    for (i = x_start; i < x_end; i += block_size) {
+      int cnt = 0;  // Number of opaque pixels equal in 'src' and 'dst'.
+      int avg_r = 0, avg_g = 0, avg_b = 0;  // Channel sums over those pixels.
+      int x, y;
+      // Top-left corner of the current block in each picture.
+      const uint32_t* const psrc = src->argb + j * src->argb_stride + i;
+      uint32_t* const pdst = dst->argb + j * dst->argb_stride + i;
+      for (y = 0; y < block_size; ++y) {
+        for (x = 0; x < block_size; ++x) {
+          const uint32_t src_pixel = psrc[x + y * src->argb_stride];
+          const int alpha = src_pixel >> 24;
+          if (alpha == 0xff &&
+              src_pixel == pdst[x + y * dst->argb_stride]) {
+              ++cnt;
+              avg_r += (src_pixel >> 16) & 0xff;
+              avg_g += (src_pixel >>  8) & 0xff;
+              avg_b += (src_pixel >>  0) & 0xff;
+          }
+        }
+      }
+      // If we have a fully similar block, we replace it with an
+      // average transparent block. This compresses better in lossy mode.
+      if (cnt == block_size * block_size) {
+        // Alpha 0 with the per-channel average of the block as RGB.
+        const uint32_t color = (0x00          << 24) |
+                               ((avg_r / cnt) << 16) |
+                               ((avg_g / cnt) <<  8) |
+                               ((avg_b / cnt) <<  0);
+        for (y = 0; y < block_size; ++y) {
+          for (x = 0; x < block_size; ++x) {
+            pdst[x + y * dst->argb_stride] = color;
+          }
+        }
+      }
+    }
+  }
+}
+
+//------------------------------------------------------------------------------
+// Key frame related utilities.
+
+// Returns 1 if the 'curr' frame (with frame rectangle 'curr_rect') can be
+// decoded without the 'prev' canvas, 0 otherwise. That is the case when:
+//  - 'prev' (with previous frame disposed) is entirely transparent, or
+//  - 'curr_rect' covers the whole canvas and no pixel of 'curr' that is not
+//    fully opaque sits on top of a non-transparent pixel of 'prev'.
+static int IsKeyFrame(const WebPPicture* const curr,
+                      const WebPFrameRect* const curr_rect,
+                      const WebPPicture* const prev) {
+  int x, y;
+  int prev_is_blank = 1;
+
+  // First criterion: look for any non-transparent pixel in 'prev'.
+  for (x = 0; prev_is_blank && x < prev->width; ++x) {
+    for (y = 0; y < prev->height; ++y) {
+      if ((prev->argb[y * prev->argb_stride + x] >> 24) != 0) {
+        prev_is_blank = 0;
+        break;
+      }
+    }
+  }
+  if (prev_is_blank) return 1;
+
+  // Second criterion only applies to frames covering the whole canvas.
+  if (curr_rect->width != curr->width || curr_rect->height != curr->height) {
+    return 0;
+  }
+  assert(curr_rect->x_offset == 0 && curr_rect->y_offset == 0);
+  for (y = 0; y < prev->height; ++y) {
+    for (x = 0; x < prev->width; ++x) {
+      const uint32_t below = (prev->argb[y * prev->argb_stride + x]) >> 24;
+      const uint32_t above = (curr->argb[y * curr->argb_stride + x]) >> 24;
+      // A see-through pixel over visible content depends on 'prev'.
+      if (above != 0xff && below != 0) return 0;
+    }
+  }
+  return 1;
+}
+
+// Turns 'curr' into a key frame: every pixel of 'curr' with zero alpha is
+// replaced by the pixel of 'prev' (previous canvas, with previous frame
+// disposed) at the same position, and 'rect' is reset to span the whole canvas.
+static void ConvertToKeyFrame(const WebPPicture* const prev,
+                              WebPFrameRect* const rect,
+                              WebPPicture* const curr) {
+  int x, y;
+  assert(curr->width == prev->width && curr->height == prev->height);
+
+  for (y = 0; y < curr->height; ++y) {
+    for (x = 0; x < curr->width; ++x) {
+      uint32_t* const pixel = &curr->argb[y * curr->argb_stride + x];
+      if ((*pixel >> 24) != 0) continue;  // Keep non-transparent pixels.
+      *pixel = prev->argb[y * prev->argb_stride + x];
+    }
+  }
+
+  // The frame rectangle is now the full canvas.
+  rect->width = curr->width;
+  rect->height = curr->height;
+  rect->x_offset = 0;
+  rect->y_offset = 0;
+}
+
+//------------------------------------------------------------------------------
+// Encoded frame.
+
+// Used to store two candidates of encoded data for an animation frame. One of
+// the two will be chosen later.
+// The bitstreams of both candidates are released by FrameRelease().
+typedef struct {
+  WebPMuxFrameInfo sub_frame;  // Encoded frame rectangle.
+  WebPMuxFrameInfo key_frame;  // Encoded frame if it was converted to keyframe.
+} EncodedFrame;
+
+// Frees both candidate bitstreams held by 'encoded_frame' and zeroes the whole
+// struct. Does nothing when 'encoded_frame' is NULL.
+static void FrameRelease(EncodedFrame* const encoded_frame) {
+  if (encoded_frame == NULL) return;
+  WebPDataClear(&encoded_frame->sub_frame.bitstream);
+  WebPDataClear(&encoded_frame->key_frame.bitstream);
+  memset(encoded_frame, 0, sizeof(*encoded_frame));
+}
+
+//------------------------------------------------------------------------------
+// Frame cache.
+
+// Used to store encoded frames that haven't been output yet.
+struct WebPFrameCache {
+  EncodedFrame* encoded_frames;  // Array of encoded frames.
+  size_t size;               // Number of allocated data elements.
+  size_t start;              // Start index.
+  size_t count;              // Number of valid data elements.
+  int flush_count;           // If >0, 'flush_count' frames starting from
+                             // 'start' are ready to be added to mux.
+  int64_t best_delta;        // min(canvas size - frame size) over the frames.
+                             // Can be negative in certain cases due to
+                             // transparent pixels in a frame.
+  int keyframe;              // Index of selected keyframe relative to 'start'.
+
+  size_t kmin;                   // Min distance between key frames.
+  size_t kmax;                   // Max distance between key frames.
+  size_t count_since_key_frame;  // Frames seen since the last key frame.
+  int allow_mixed;           // If true, each frame can be lossy or lossless.
+  WebPPicture prev_canvas;   // Previous canvas (properly disposed).
+  WebPPicture curr_canvas;   // Current canvas (temporary buffer).
+  int is_first_frame;        // True if no frames have been added to the cache
+                             // since WebPFrameCacheNew().
+};
+
+// Puts the bookkeeping fields of 'cache' back to their "empty" values: no
+// frames stored, nothing to flush, no key frame selected. The frame array
+// ('cache->encoded_frames') and its capacity ('cache->size') are left alone.
+static void CacheReset(WebPFrameCache* const cache) {
+  // Key frame selection state.
+  cache->keyframe = KEYFRAME_NONE;
+  cache->best_delta = DELTA_INFINITY;
+  // Occupancy counters.
+  cache->flush_count = 0;
+  cache->count = 0;
+  cache->start = 0;
+}
+
+// Allocates and initializes a frame cache for a canvas of 'width' x 'height'
+// pixels. 'kmin' and 'kmax' are the minimum and maximum distances between key
+// frames ('kmax' must be greater than 'kmin'); the cache has room for
+// 'kmax - kmin' encoded frames. 'allow_mixed' is stored for use when frames
+// are added.
+// Returns NULL on failure. On success, the returned object must be released
+// with WebPFrameCacheDelete().
+WebPFrameCache* WebPFrameCacheNew(int width, int height,
+                                  size_t kmin, size_t kmax, int allow_mixed) {
+  WebPFrameCache* cache = (WebPFrameCache*)malloc(sizeof(*cache));
+  if (cache == NULL) return NULL;
+  CacheReset(cache);
+  // sanity init, so we can call WebPFrameCacheDelete():
+  cache->encoded_frames = NULL;
+
+  cache->is_first_frame = 1;
+
+  // Picture buffers.
+  if (!WebPPictureInit(&cache->prev_canvas) ||
+      !WebPPictureInit(&cache->curr_canvas)) {
+    // The pictures are not both initialized here, so WebPFrameCacheDelete()
+    // (which frees them) can't be used: release the struct directly instead
+    // of leaking it.
+    free(cache);
+    return NULL;
+  }
+  cache->prev_canvas.width = width;
+  cache->prev_canvas.height = height;
+  cache->prev_canvas.use_argb = 1;
+  // 'curr_canvas' starts out as a copy of the freshly allocated 'prev_canvas'.
+  if (!WebPPictureAlloc(&cache->prev_canvas) ||
+      !WebPPictureCopy(&cache->prev_canvas, &cache->curr_canvas)) {
+    goto Err;
+  }
+  WebPUtilClearPic(&cache->prev_canvas, NULL);
+
+  // Cache data.
+  cache->allow_mixed = allow_mixed;
+  cache->kmin = kmin;
+  cache->kmax = kmax;
+  cache->count_since_key_frame = 0;
+  assert(kmax > kmin);
+  cache->size = kmax - kmin;
+  cache->encoded_frames =
+      (EncodedFrame*)calloc(cache->size, sizeof(*cache->encoded_frames));
+  if (cache->encoded_frames == NULL) goto Err;
+
+  return cache;  // All OK.
+
+ Err:
+  WebPFrameCacheDelete(cache);
+  return NULL;
+}
+
+// Releases every encoded frame stored in 'cache', both canvases, and finally
+// 'cache' itself. Does nothing when 'cache' is NULL.
+void WebPFrameCacheDelete(WebPFrameCache* const cache) {
+  EncodedFrame* frames;
+  if (cache == NULL) return;
+
+  frames = cache->encoded_frames;
+  if (frames != NULL) {
+    size_t n;
+    for (n = 0; n < cache->size; ++n) {
+      FrameRelease(frames + n);
+    }
+    free(frames);
+  }
+  WebPPictureFree(&cache->prev_canvas);
+  WebPPictureFree(&cache->curr_canvas);
+  free(cache);
+}
+
+// Encodes 'pic' (as ARGB) with 'config', directing the output to 'memory'
+// through WebPMemoryWrite. Returns 1 on success, 0 if WebPEncode() fails.
+static int EncodeFrame(const WebPConfig* const config, WebPPicture* const pic,
+                       WebPMemoryWriter* const memory) {
+  pic->use_argb = 1;
+  pic->custom_ptr = memory;
+  pic->writer = WebPMemoryWrite;
+  return WebPEncode(config, pic) ? 1 : 0;
+}
+
+// Makes 'encoded_data' refer to the buffer accumulated in 'memory'. No bytes
+// are copied: only the pointer and the size are transferred.
+static void GetEncodedData(const WebPMemoryWriter* const memory,
+                           WebPData* const encoded_data) {
+  encoded_data->size  = memory->size;
+  encoded_data->bytes = memory->mem;
+}
+
+#define MIN_COLORS_LOSSY     31  // Don't try lossy below this threshold.
+#define MAX_COLORS_LOSSLESS 194  // Don't try lossless above this threshold.
+#define MAX_COLOR_COUNT     256  // Power of 2 greater than MAX_COLORS_LOSSLESS.
+#define HASH_SIZE (MAX_COLOR_COUNT * 4)
+#define HASH_RIGHT_SHIFT     22  // 32 - log2(HASH_SIZE).
+
+// TODO(urvang): Also used in enc/vp8l.c. Move to utils.
+// If the number of colors in the 'pic' is at least MAX_COLOR_COUNT, return
+// MAX_COLOR_COUNT. Otherwise, return the exact number of colors in the 'pic'.
+// Distinct ARGB values are tracked in an open-addressing hash table of
+// HASH_SIZE slots with linear probing.
+static int GetColorCount(const WebPPicture* const pic) {
+  int x, y;
+  int num_colors = 0;
+  uint8_t in_use[HASH_SIZE] = { 0 };  // Non-zero if the slot holds a color.
+  uint32_t colors[HASH_SIZE];         // Color stored in each used slot.
+  static const uint32_t kHashMul = 0x1e35a7bd;
+  const uint32_t* argb = pic->argb;   // Start of the current row.
+  const int width = pic->width;
+  const int height = pic->height;
+  uint32_t last_pix = ~argb[0];   // so we're sure that last_pix != argb[0]
+
+  for (y = 0; y < height; ++y) {
+    for (x = 0; x < width; ++x) {
+      int key;
+      // Skip the table lookup for runs of identical pixels.
+      if (argb[x] == last_pix) {
+        continue;
+      }
+      last_pix = argb[x];
+      // Multiplicative hash: keep the top bits of the 32-bit product.
+      key = (kHashMul * last_pix) >> HASH_RIGHT_SHIFT;
+      while (1) {
+        if (!in_use[key]) {
+          // Free slot: this is a color we have not seen before.
+          colors[key] = last_pix;
+          in_use[key] = 1;
+          ++num_colors;
+          if (num_colors >= MAX_COLOR_COUNT) {
+            return MAX_COLOR_COUNT;  // Exact count not needed.
+          }
+          break;
+        } else if (colors[key] == last_pix) {
+          break;  // The color is already there.
+        } else {
+          // Some other color sits here, so do linear conflict resolution.
+          ++key;
+          key &= (HASH_SIZE - 1);  // Key mask.
+        }
+      }
+    }
+    argb += pic->argb_stride;
+  }
+  return num_colors;
+}
+
+#undef MAX_COLOR_COUNT
+#undef HASH_SIZE
+#undef HASH_RIGHT_SHIFT
+
+// Encodes 'sub_frame' and stores the result, together with a copy of 'info',
+// in 'encoded_frame->key_frame' (if 'is_key_frame' is true) or in
+// 'encoded_frame->sub_frame' (otherwise).
+// If 'allow_mixed' is false, 'config->lossless' decides between lossless and
+// lossy. Otherwise the color count of 'sub_frame' decides which of the two are
+// tried; when both are tried, the smaller output is kept (lossless on a tie).
+// For a lossy, non-key frame, 'frame' is modified in place within 'rect' using
+// 'prev_canvas' before encoding. 'prev_canvas', 'frame' and 'rect' are only
+// used in that case (callers in this file pass NULL for key frames).
+// Returns 1 on success. Returns 0 if an encoding fails, in which case both
+// output buffers are freed.
+static int SetFrame(const WebPConfig* const config, int allow_mixed,
+                    int is_key_frame, const WebPPicture* const prev_canvas,
+                    WebPPicture* const frame, const WebPFrameRect* const rect,
+                    const WebPMuxFrameInfo* const info,
+                    WebPPicture* const sub_frame, EncodedFrame* encoded_frame) {
+  int try_lossless;
+  int try_lossy;
+  int try_both;
+  WebPMemoryWriter mem1, mem2;  // mem1: lossless output, mem2: lossy output.
+  WebPData* encoded_data;
+  WebPMuxFrameInfo* const dst =
+      is_key_frame ? &encoded_frame->key_frame : &encoded_frame->sub_frame;
+  *dst = *info;
+  encoded_data = &dst->bitstream;
+  WebPMemoryWriterInit(&mem1);
+  WebPMemoryWriterInit(&mem2);
+
+  if (!allow_mixed) {
+    try_lossless = config->lossless;
+    try_lossy = !try_lossless;
+  } else {  // Use a heuristic for trying lossless and/or lossy compression.
+    const int num_colors = GetColorCount(sub_frame);
+    try_lossless = (num_colors < MAX_COLORS_LOSSLESS);
+    try_lossy = (num_colors >= MIN_COLORS_LOSSY);
+  }
+  try_both = try_lossless && try_lossy;
+
+  // The lossless attempt runs first, before 'frame' is altered for lossy.
+  if (try_lossless) {
+    WebPConfig config_ll = *config;
+    config_ll.lossless = 1;
+    if (!EncodeFrame(&config_ll, sub_frame, &mem1)) {
+      goto Err;
+    }
+  }
+
+  if (try_lossy) {
+    WebPConfig config_lossy = *config;
+    config_lossy.lossless = 0;
+    if (!is_key_frame) {
+      // For lossy compression of a frame, it's better to replace transparent
+      // pixels of 'curr' with actual RGB values, whenever possible.
+      ReduceTransparency(prev_canvas, rect, frame);
+      // TODO(later): Investigate if this helps lossless compression as well.
+      FlattenSimilarBlocks(prev_canvas, rect, frame);
+    }
+    if (!EncodeFrame(&config_lossy, sub_frame, &mem2)) {
+      goto Err;
+    }
+  }
+
+  if (try_both) {  // Pick the encoding with smallest size.
+    // TODO(later): Perhaps a rough SSIM/PSNR produced by the encoder should
+    // also be a criteria, in addition to sizes.
+    if (mem1.size <= mem2.size) {
+      free(mem2.mem);
+      GetEncodedData(&mem1, encoded_data);
+    } else {
+      free(mem1.mem);
+      GetEncodedData(&mem2, encoded_data);
+    }
+  } else {
+    // Only one encoding was produced: hand its buffer over as is.
+    GetEncodedData(try_lossless ? &mem1 : &mem2, encoded_data);
+  }
+  return 1;
+
+ Err:
+  free(mem1.mem);
+  free(mem2.mem);
+  return 0;
+}
+
+#undef MIN_COLORS_LOSSY
+#undef MAX_COLORS_LOSSLESS
+
+// Returns the cached frame located 'position' slots after 'cache->start'.
+static EncodedFrame* CacheGetFrame(const WebPFrameCache* const cache,
+                                   size_t position) {
+  const size_t index = cache->start + position;
+  assert(index < cache->size);
+  return cache->encoded_frames + index;
+}
+
+// Calculate the penalty incurred if we encode given frame as a key frame
+// instead of a sub-frame: size of the key-frame bitstream minus size of the
+// sub-frame bitstream. The result is negative when the key frame is smaller.
+static int64_t KeyFramePenalty(const EncodedFrame* const encoded_frame) {
+  // Convert both (unsigned) sizes before subtracting, so that the difference
+  // is computed in signed 64-bit arithmetic whatever the width of size_t.
+  const int64_t key_size = (int64_t)encoded_frame->key_frame.bitstream.size;
+  const int64_t sub_size = (int64_t)encoded_frame->sub_frame.bitstream.size;
+  return key_size - sub_size;
+}
+
+// Applies 'dispose_method' after a frame has been handled. For
+// WEBP_MUX_DISPOSE_BACKGROUND, 'frame' is cleared entirely and 'canvas' is
+// cleared within 'gif_rect'. Any other method leaves both untouched.
+static void DisposeFrame(WebPMuxAnimDispose dispose_method,
+                         const WebPFrameRect* const gif_rect,
+                         WebPPicture* const frame, WebPPicture* const canvas) {
+  if (dispose_method != WEBP_MUX_DISPOSE_BACKGROUND) return;
+  WebPUtilClearPic(frame, NULL);
+  WebPUtilClearPic(canvas, gif_rect);
+}
+
+// Encodes 'frame' (whose meaningful area is 'orig_rect') and appends it to
+// 'cache' at the next free position. Depending on the cache state, the frame is
+// stored as a key frame, as a sub-frame, or as both candidates so that the
+// choice can be made later, when flushing. 'info->x_offset' / 'info->y_offset'
+// are overwritten with the (even-aligned) offsets actually used.
+// 'cache->prev_canvas' is updated, and disposal is applied according to
+// 'info->dispose_method'.
+// Returns 1 on success. Returns 0 on failure; if the failure happens after the
+// frame's slot was reserved, the slot is released and 'cache->count' restored.
+int WebPFrameCacheAddFrame(WebPFrameCache* const cache,
+                           const WebPConfig* const config,
+                           const WebPFrameRect* const orig_rect,
+                           WebPPicture* const frame,
+                           WebPMuxFrameInfo* const info) {
+  int ok = 0;
+  WebPFrameRect rect = *orig_rect;
+  WebPPicture sub_image;  // View extracted from 'frame' with rectangle 'rect'.
+  WebPPicture* const prev_canvas = &cache->prev_canvas;
+  const size_t position = cache->count;
+  const int allow_mixed = cache->allow_mixed;
+  EncodedFrame* const encoded_frame = CacheGetFrame(cache, position);
+  assert(position < cache->size);
+
+  // Snap to even offsets (and adjust dimensions if needed).
+  rect.width += (rect.x_offset & 1);
+  rect.height += (rect.y_offset & 1);
+  rect.x_offset &= ~1;
+  rect.y_offset &= ~1;
+
+  if (!WebPPictureView(frame, rect.x_offset, rect.y_offset,
+                       rect.width, rect.height, &sub_image)) {
+    return 0;
+  }
+  info->x_offset = rect.x_offset;
+  info->y_offset = rect.y_offset;
+
+  ++cache->count;
+
+  if (cache->is_first_frame || IsKeyFrame(frame, &rect, prev_canvas)) {
+    // Add this as a key frame.
+    if (!SetFrame(config, allow_mixed, 1, NULL, NULL, NULL, info, &sub_image,
+                  encoded_frame)) {
+      goto End;
+    }
+    cache->keyframe = position;
+    cache->flush_count = cache->count;  // Everything cached can be flushed.
+    cache->count_since_key_frame = 0;
+    // Update prev_canvas by simply copying from 'curr'.
+    CopyPixels(frame, prev_canvas);
+  } else {
+    ++cache->count_since_key_frame;
+    if (cache->count_since_key_frame <= cache->kmin) {
+      // Add this as a frame rectangle.
+      if (!SetFrame(config, allow_mixed, 0, prev_canvas, frame, &rect, info,
+                    &sub_image, encoded_frame)) {
+        goto End;
+      }
+      cache->flush_count = cache->count;
+      // Update prev_canvas by blending 'curr' into it.
+      BlendPixels(frame, orig_rect, prev_canvas);
+    } else {
+      // More than 'kmin' frames since the last key frame: encode both
+      // candidates and remember which cached frame makes the cheapest key frame.
+      WebPPicture full_image;
+      WebPMuxFrameInfo full_image_info;
+      int frame_added;
+      int64_t curr_delta;
+
+      // Add frame rectangle to cache.
+      if (!SetFrame(config, allow_mixed, 0, prev_canvas, frame, &rect, info,
+                    &sub_image, encoded_frame)) {
+        goto End;
+      }
+
+      // Convert to a key frame.
+      CopyPixels(frame, &cache->curr_canvas);
+      // Note: this also resets 'rect' to the whole canvas.
+      ConvertToKeyFrame(prev_canvas, &rect, &cache->curr_canvas);
+      if (!WebPPictureView(&cache->curr_canvas, rect.x_offset, rect.y_offset,
+                           rect.width, rect.height, &full_image)) {
+        goto End;
+      }
+      full_image_info = *info;
+      full_image_info.x_offset = rect.x_offset;
+      full_image_info.y_offset = rect.y_offset;
+
+      // Add key frame to cache, too.
+      frame_added = SetFrame(config, allow_mixed, 1, NULL, NULL, NULL,
+                             &full_image_info, &full_image, encoded_frame);
+      WebPPictureFree(&full_image);
+      if (!frame_added) goto End;
+
+      // Analyze size difference of the two variants.
+      curr_delta = KeyFramePenalty(encoded_frame);
+      if (curr_delta <= cache->best_delta) {  // Pick this as keyframe.
+        cache->keyframe = position;
+        cache->best_delta = curr_delta;
+        cache->flush_count = cache->count - 1;  // We can flush previous frames.
+      }
+      // 'kmax' frames since the last key frame: flush everything cached.
+      if (cache->count_since_key_frame == cache->kmax) {
+        cache->flush_count = cache->count;
+        cache->count_since_key_frame = 0;
+      }
+
+      // Update prev_canvas by simply copying from 'curr_canvas'.
+      CopyPixels(&cache->curr_canvas, prev_canvas);
+    }
+  }
+
+  DisposeFrame(info->dispose_method, orig_rect, frame, prev_canvas);
+
+  cache->is_first_frame = 0;
+  ok = 1;
+
+ End:
+  WebPPictureFree(&sub_image);
+  if (!ok) {
+    FrameRelease(encoded_frame);
+    --cache->count;  // We reset the count, as the frame addition failed.
+  }
+  return ok;
+}
+
+// Pushes the first 'cache->flush_count' cached frames to 'mux', in order, and
+// removes them from 'cache'. The frame selected as key frame is pushed in its
+// key-frame variant without blending; every other frame is pushed as a
+// sub-frame with blending. If 'verbose' is true, a line describing each added
+// frame is printed to stdout.
+// Returns WEBP_MUX_OK on success. If adding a frame to 'mux' fails, the error
+// is returned immediately and that frame remains in the cache.
+WebPMuxError WebPFrameCacheFlush(WebPFrameCache* const cache, int verbose,
+                                 WebPMux* const mux) {
+  while (cache->flush_count > 0) {
+    WebPMuxFrameInfo* info;
+    WebPMuxError err;
+    // Always work on the oldest cached frame.
+    EncodedFrame* const curr = CacheGetFrame(cache, 0);
+    // Pick frame or full canvas.
+    if (cache->keyframe == 0) {
+      info = &curr->key_frame;
+      info->blend_method = WEBP_MUX_NO_BLEND;
+      // The selected key frame is consumed: restart the selection.
+      cache->keyframe = KEYFRAME_NONE;
+      cache->best_delta = DELTA_INFINITY;
+    } else {
+      info = &curr->sub_frame;
+      info->blend_method = WEBP_MUX_BLEND;
+    }
+    // Add to mux.
+    err = WebPMuxPushFrame(mux, info, 1);
+    if (err != WEBP_MUX_OK) return err;
+    if (verbose) {
+      printf("Added frame. offset:%d,%d duration:%d dispose:%d blend:%d\n",
+             info->x_offset, info->y_offset, info->duration,
+             info->dispose_method, info->blend_method);
+    }
+    FrameRelease(curr);
+    ++cache->start;
+    --cache->flush_count;
+    --cache->count;
+    // 'keyframe' is relative to 'start', which has just moved forward.
+    if (cache->keyframe != KEYFRAME_NONE) --cache->keyframe;
+  }
+
+  // An empty cache restarts from index 0.
+  if (cache->count == 0) CacheReset(cache);
+  return WEBP_MUX_OK;
+}
+
+// Marks every frame currently in 'cache' as ready, then flushes them to 'mux'
+// through WebPFrameCacheFlush(), whose result is returned.
+WebPMuxError WebPFrameCacheFlushAll(WebPFrameCache* const cache, int verbose,
+                                    WebPMux* const mux) {
+  cache->flush_count = cache->count;  // Force flushing of all frames.
+  return WebPFrameCacheFlush(cache, verbose, mux);
+}
+
+//------------------------------------------------------------------------------
--- a/examples/gif2webp_util.h
+++ b/examples/gif2webp_util.h
@ -0,0 +1,80 @@
+// Copyright 2013 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+//  Helper structs and methods for gif2webp tool.
+//
+// Author: Urvang (urvang@google.com)
+
+#ifndef WEBP_EXAMPLES_GIF2WEBP_UTIL_H_
+#define WEBP_EXAMPLES_GIF2WEBP_UTIL_H_
+
+#include <stdlib.h>
+
+#include "webp/mux.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+//------------------------------------------------------------------------------
+// Helper utilities.
+
+#define WEBP_UTIL_TRANSPARENT_COLOR 0x00ffffff
+
+struct WebPPicture;
+
+// Rectangle within a picture: offsets of its top-left corner, and its
+// dimensions.
+typedef struct {
+  int x_offset, y_offset, width, height;
+} WebPFrameRect;
+
+// Clear pixels in 'picture' within given 'rect' to transparent color.
+void WebPUtilClearPic(struct WebPPicture* const picture,
+                      const WebPFrameRect* const rect);
+
+//------------------------------------------------------------------------------
+// Frame cache.
+
+typedef struct WebPFrameCache WebPFrameCache;
+
+// Given the minimum distance between key frames 'kmin' and maximum distance
+// between key frames 'kmax', returns an appropriately allocated cache object.
+// If 'allow_mixed' is true, the subsequent calls to WebPFrameCacheAddFrame()
+// will heuristically pick lossy or lossless compression for each frame.
+// Use WebPFrameCacheDelete() to deallocate the 'cache'.
+WebPFrameCache* WebPFrameCacheNew(int width, int height,
+                                  size_t kmin, size_t kmax, int allow_mixed);
+
+// Release all the frame data from 'cache' and free 'cache'.
+void WebPFrameCacheDelete(WebPFrameCache* const cache);
+
+// Given an image described by 'frame', 'info' and 'orig_rect', optimize it for
+// WebP, encode it and add it to 'cache'.
+// This takes care of frame disposal too, according to 'info->dispose_method'.
+int WebPFrameCacheAddFrame(WebPFrameCache* const cache,
+                           const WebPConfig* const config,
+                           const WebPFrameRect* const orig_rect,
+                           WebPPicture* const frame,
+                           WebPMuxFrameInfo* const info);
+
+// Flush the *ready* frames from cache and add them to 'mux'. If 'verbose' is
+// true, prints the information about these frames.
+WebPMuxError WebPFrameCacheFlush(WebPFrameCache* const cache, int verbose,
+                                 WebPMux* const mux);
+
+// Similar to 'WebPFrameCacheFlush()', but flushes *all* the frames.
+WebPMuxError WebPFrameCacheFlushAll(WebPFrameCache* const cache, int verbose,
+                                    WebPMux* const mux);
+
+//------------------------------------------------------------------------------
+
+#ifdef __cplusplus
+}    // extern "C"
+#endif
+
+#endif  // WEBP_EXAMPLES_GIF2WEBP_UTIL_H_
--- a/examples/jpegdec.c
+++ b/examples/jpegdec.c
@ -0,0 +1,293 @@
+// Copyright 2012 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// JPEG decode.
+
+#include "./jpegdec.h"
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include <stdio.h>
+
+#ifdef WEBP_HAVE_JPEG
+#include <jpeglib.h>
+#include <setjmp.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "webp/encode.h"
+#include "./metadata.h"
+
+// -----------------------------------------------------------------------------
+// Metadata processing
+
+// Marker codes of the APP1 / APP2 application segments, expressed relative to
+// JPEG_APP0 when the included headers do not already define them.
+#ifndef JPEG_APP1
+# define JPEG_APP1 (JPEG_APP0 + 1)
+#endif
+#ifndef JPEG_APP2
+# define JPEG_APP2 (JPEG_APP0 + 2)
+#endif
+
+// One segment of an ICC profile as found in a saved APP2 marker.
+// StoreICCP() keeps these in a zero-initialized table indexed by 'seq' - 1.
+typedef struct {
+  const uint8_t* data;  // segment payload; points into the marker's data.
+  size_t data_length;   // payload size in bytes; 0 means the slot is unused.
+  int seq;  // this segment's sequence number [1, 255] for use in reassembly.
+} ICCPSegment;
+
+// Requests that the APP1 (Exif/XMP) and APP2 (ICC profile) markers be saved by
+// libjpeg, each up to the maximum marker length of 0xffff bytes.
+static void SaveMetadataMarkers(j_decompress_ptr dinfo) {
+  static const int kSavedMarkers[] = {
+    JPEG_APP1,  // Exif/XMP
+    JPEG_APP2   // ICC profile
+  };
+  const unsigned int kMaxMarkerLength = 0xffff;
+  size_t i;
+  for (i = 0; i < sizeof(kSavedMarkers) / sizeof(kSavedMarkers[0]); ++i) {
+    jpeg_save_markers(dinfo, kSavedMarkers[i], kMaxMarkerLength);
+  }
+}
+
+// qsort() comparator: orders ICCPSegment entries by ascending sequence number.
+static int CompareICCPSegments(const void* a, const void* b) {
+  const int seq_a = ((const ICCPSegment*)a)->seq;
+  const int seq_b = ((const ICCPSegment*)b)->seq;
+  return seq_a - seq_b;
+}
+
+// Extract ICC profile segments from the marker list in 'dinfo', reassembling
+// and storing them in 'iccp'.
+// Each APP2 marker carrying the "ICC_PROFILE\0" signature holds one segment,
+// tagged with a 1-based sequence number and the total segment count. Segments
+// are slotted by sequence number, validated (no zero fields, no duplicates,
+// no gaps, consistent count) and then concatenated in order.
+// If no ICC segment is present, 'iccp' is left untouched and true is returned.
+// Returns true on success and false for memory errors and corrupt profiles.
+static int StoreICCP(j_decompress_ptr dinfo, MetadataPayload* const iccp) {
+  // ICC.1:2010-12 (4.3.0.0) Annex B.4 Embedding ICC Profiles in JPEG files
+  static const char kICCPSignature[] = "ICC_PROFILE";
+  static const size_t kICCPSignatureLength = 12;  // signature includes '\0'
+  static const size_t kICCPSkipLength = 14;  // signature + seq & count
+  int expected_count = 0;
+  int actual_count = 0;
+  int seq_max = 0;
+  size_t total_size = 0;
+  ICCPSegment iccp_segments[255];
+  jpeg_saved_marker_ptr marker;
+
+  // A zero 'data_length' marks a slot as not yet seen.
+  memset(iccp_segments, 0, sizeof(iccp_segments));
+  for (marker = dinfo->marker_list; marker != NULL; marker = marker->next) {
+    if (marker->marker == JPEG_APP2 &&
+        marker->data_length > kICCPSkipLength &&
+        !memcmp(marker->data, kICCPSignature, kICCPSignatureLength)) {
+      // ICC_PROFILE\0<seq><count>; 'seq' starts at 1.
+      const int seq = marker->data[kICCPSignatureLength];
+      const int count = marker->data[kICCPSignatureLength + 1];
+      const size_t segment_size = marker->data_length - kICCPSkipLength;
+      ICCPSegment* segment;
+
+      if (segment_size == 0 || count == 0 || seq == 0) {
+        // The arguments follow the order announced by the message:
+        // size, then count, then sequence number.
+        fprintf(stderr, "[ICCP] size (%d) / count (%d) / sequence number (%d)"
+                        " cannot be 0!\n",
+                (int)segment_size, count, seq);
+        return 0;
+      }
+
+      if (expected_count == 0) {
+        expected_count = count;
+      } else if (expected_count != count) {
+        fprintf(stderr, "[ICCP] Inconsistent segment count (%d / %d)!\n",
+                expected_count, count);
+        return 0;
+      }
+
+      // 'seq' is a single byte in [1, 255], so the index stays in bounds.
+      segment = iccp_segments + seq - 1;
+      if (segment->data_length != 0) {
+        fprintf(stderr, "[ICCP] Duplicate segment number (%d)!\n" , seq);
+        return 0;
+      }
+
+      segment->data = marker->data + kICCPSkipLength;
+      segment->data_length = segment_size;
+      segment->seq = seq;
+      total_size += segment_size;
+      if (seq > seq_max) seq_max = seq;
+      ++actual_count;
+    }
+  }
+
+  if (actual_count == 0) return 1;
+  // Without duplicates, 'seq_max == actual_count' implies that slots
+  // [0, actual_count) are all populated.
+  if (seq_max != actual_count) {
+    fprintf(stderr, "[ICCP] Discontinuous segments, expected: %d actual: %d!\n",
+            actual_count, seq_max);
+    return 0;
+  }
+  if (expected_count != actual_count) {
+    fprintf(stderr, "[ICCP] Segment count: %d does not match expected: %d!\n",
+            actual_count, expected_count);
+    return 0;
+  }
+
+  // The segments may appear out of order in the file, sort them based on
+  // sequence number before assembling the payload.
+  qsort(iccp_segments, actual_count, sizeof(*iccp_segments),
+        CompareICCPSegments);
+
+  iccp->bytes = (uint8_t*)malloc(total_size);
+  if (iccp->bytes == NULL) return 0;
+  iccp->size = total_size;
+
+  {
+    int i;
+    size_t offset = 0;
+    for (i = 0; i < seq_max; ++i) {
+      memcpy(iccp->bytes + offset,
+             iccp_segments[i].data, iccp_segments[i].data_length);
+      offset += iccp_segments[i].data_length;
+    }
+  }
+  return 1;
+}
+
+// Fills 'metadata' from the markers saved in 'dinfo': the ICC profile via
+// StoreICCP(), then the first Exif and the first XMP APP1 payloads found (the
+// identifying signature is stripped; later duplicates are reported and
+// skipped).
+// Returns true on success and false for memory errors and corrupt profiles.
+// The caller must use MetadataFree() on 'metadata' in all cases.
+static int ExtractMetadataFromJPEG(j_decompress_ptr dinfo,
+                                   Metadata* const metadata) {
+  static const struct {
+    int marker;
+    const char* signature;
+    size_t signature_length;
+    size_t storage_offset;
+  } kJPEGMetadataMap[] = {
+    // Exif 2.2 Section 4.7.2 Interoperability Structure of APP1 ...
+    { JPEG_APP1, "Exif\0",                        6, METADATA_OFFSET(exif) },
+    // XMP Specification Part 3 Section 3 Embedding XMP Metadata ... #JPEG
+    // TODO(jzern) Add support for 'ExtendedXMP'
+    { JPEG_APP1, "http://ns.adobe.com/xap/1.0/", 29, METADATA_OFFSET(xmp) },
+    { 0, NULL, 0, 0 },
+  };
+  jpeg_saved_marker_ptr cur;
+
+  // Treat ICC profiles separately as they may be segmented and out of order.
+  if (!StoreICCP(dinfo, &metadata->iccp)) return 0;
+
+  for (cur = dinfo->marker_list; cur != NULL; cur = cur->next) {
+    int k;
+    for (k = 0; kJPEGMetadataMap[k].marker != 0; ++k) {
+      const size_t sig_len = kJPEGMetadataMap[k].signature_length;
+      MetadataPayload* payload;
+
+      // Skip map entries that do not describe this marker.
+      if (cur->marker != kJPEGMetadataMap[k].marker) continue;
+      if (cur->data_length <= sig_len) continue;
+      if (memcmp(cur->data, kJPEGMetadataMap[k].signature, sig_len) != 0) {
+        continue;
+      }
+
+      payload = (MetadataPayload*)((uint8_t*)metadata +
+                                   kJPEGMetadataMap[k].storage_offset);
+      if (payload->bytes != NULL) {
+        // Only the first occurrence of each metadata type is kept.
+        fprintf(stderr, "Ignoring additional '%s' marker\n",
+                kJPEGMetadataMap[k].signature);
+        continue;
+      }
+      if (!MetadataCopy((const char*)cur->data + sig_len,
+                        cur->data_length - sig_len, payload)) {
+        return 0;
+      }
+    }
+  }
+  return 1;
+}
+
+#undef JPEG_APP1
+#undef JPEG_APP2
+
+// -----------------------------------------------------------------------------
+// JPEG decoding
+
+// libjpeg error manager extended with a jump buffer so that my_error_exit()
+// can unwind back into ReadJPEG(). 'pub' has to stay the first member:
+// my_error_exit() casts the jpeg_error_mgr pointer back to this struct.
+struct my_error_mgr {
+  struct jpeg_error_mgr pub;
+  jmp_buf setjmp_buffer;  // armed by setjmp() in ReadJPEG().
+};
+
+// error_exit handler installed by ReadJPEG(): emits the pending libjpeg
+// message, then jumps back to the setjmp() point instead of returning.
+static void my_error_exit(j_common_ptr dinfo) {
+  struct my_error_mgr* const mgr = (struct my_error_mgr*)dinfo->err;
+  (*mgr->pub.output_message)(dinfo);
+  longjmp(mgr->setjmp_buffer, 1);
+}
+
+// Decodes the JPEG read from 'in_file' to RGB and imports it into 'pic'.
+// If 'metadata' is non-NULL, APP1/APP2 markers are saved while parsing and the
+// Exif / XMP / ICC payloads are extracted into it.
+// Returns true on success. On any failure (including libjpeg fatal errors,
+// which arrive through longjmp()) the decompressor is destroyed, 'metadata' is
+// freed and false is returned.
+int ReadJPEG(FILE* in_file, WebPPicture* const pic, Metadata* const metadata) {
+  // 'ok' and 'rgb' are modified after setjmp() and read after a potential
+  // longjmp(): they must be volatile or their values would be indeterminate.
+  volatile int ok = 0;
+  int stride, width, height;
+  uint64_t rgb_size;
+  struct jpeg_decompress_struct dinfo;
+  struct my_error_mgr jerr;
+  uint8_t* volatile rgb = NULL;
+  JSAMPROW buffer[1];
+
+  dinfo.err = jpeg_std_error(&jerr.pub);
+  jerr.pub.error_exit = my_error_exit;
+
+  if (setjmp(jerr.setjmp_buffer)) {
+ Error:
+    // Reached both by 'goto Error' and by my_error_exit()'s longjmp().
+    // 'ok' may already have been set by the metadata extraction.
+    ok = 0;
+    MetadataFree(metadata);
+    jpeg_destroy_decompress(&dinfo);
+    goto End;
+  }
+
+  jpeg_create_decompress(&dinfo);
+  jpeg_stdio_src(&dinfo, in_file);
+  if (metadata != NULL) SaveMetadataMarkers(&dinfo);
+  jpeg_read_header(&dinfo, TRUE);
+
+  dinfo.out_color_space = JCS_RGB;
+  dinfo.do_fancy_upsampling = TRUE;
+
+  jpeg_start_decompress(&dinfo);
+
+  if (dinfo.output_components != 3) {
+    goto Error;
+  }
+
+  width = dinfo.output_width;
+  height = dinfo.output_height;
+  stride = dinfo.output_width * dinfo.output_components * sizeof(*rgb);
+
+  // Compute the buffer size in 64 bits: 'stride * height' can exceed the range
+  // of 'int' for large images.
+  rgb_size = (uint64_t)stride * height;
+  if (rgb_size != (uint64_t)(size_t)rgb_size) {
+    goto Error;
+  }
+  rgb = (uint8_t*)malloc((size_t)rgb_size);
+  if (rgb == NULL) {
+    goto Error;  // not 'End': the decompressor must be destroyed.
+  }
+  buffer[0] = (JSAMPLE*)rgb;
+
+  while (dinfo.output_scanline < dinfo.output_height) {
+    if (jpeg_read_scanlines(&dinfo, buffer, 1) != 1) {
+      goto Error;  // not 'End': the decompressor must be destroyed.
+    }
+    buffer[0] += stride;
+  }
+
+  if (metadata != NULL) {
+    ok = ExtractMetadataFromJPEG(&dinfo, metadata);
+    if (!ok) {
+      fprintf(stderr, "Error extracting JPEG metadata!\n");
+      goto Error;
+    }
+  }
+
+  jpeg_finish_decompress(&dinfo);
+  jpeg_destroy_decompress(&dinfo);
+
+  // WebP conversion.
+  pic->width = width;
+  pic->height = height;
+  ok = WebPPictureImportRGB(pic, rgb, stride);
+  if (!ok) goto Error;
+
+ End:
+  free(rgb);
+  return ok;
+}
+#else  // !WEBP_HAVE_JPEG
+// Fallback used when the build lacks libjpeg: reports the missing support on
+// stderr and always fails. All the arguments are ignored.
+int ReadJPEG(FILE* in_file, struct WebPPicture* const pic,
+             struct Metadata* const metadata) {
+  fputs("JPEG support not compiled. Please install the libjpeg "
+        "development package before building.\n", stderr);
+  (void)metadata;
+  (void)pic;
+  (void)in_file;
+  return 0;
+}
+#endif  // WEBP_HAVE_JPEG
+
+// -----------------------------------------------------------------------------
--- a/examples/jpegdec.h
+++ b/examples/jpegdec.h
@ -0,0 +1,35 @@
+// Copyright 2012 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// JPEG decode.
+
+#ifndef WEBP_EXAMPLES_JPEGDEC_H_
+#define WEBP_EXAMPLES_JPEGDEC_H_
+
+#include <stdio.h>
+#include "webp/types.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// Forward declarations: only pointers to these types are needed here.
+struct Metadata;
+struct WebPPicture;
+
+// Reads a JPEG from 'in_file', returning the decoded output in 'pic'.
+// The output is RGB.
+// If 'metadata' is non-NULL, the Exif, XMP and ICC profile data found in the
+// file are stored into it; it is freed again if the read fails.
+// Returns true on success. When built without JPEG support, an error message
+// is printed and false is returned.
+int ReadJPEG(FILE* in_file, struct WebPPicture* const pic,
+             struct Metadata* const metadata);
+
+#ifdef __cplusplus
+}    // extern "C"
+#endif
+
+#endif  // WEBP_EXAMPLES_JPEGDEC_H_
--- a/examples/metadata.c
+++ b/examples/metadata.c
@ -0,0 +1,49 @@
+// Copyright 2012 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+//  Metadata types and functions.
+//
+
+#include "./metadata.h"
+
+#include <stdlib.h>
+#include <string.h>
+
+#include "webp/types.h"
+
+// Zeroes every field of 'metadata'. A NULL 'metadata' is ignored.
+void MetadataInit(Metadata* const metadata) {
+  if (metadata != NULL) {
+    memset(metadata, 0, sizeof(*metadata));
+  }
+}
+
+// Frees the buffer owned by 'payload' and resets it to the empty state
+// (NULL bytes, zero size). A NULL 'payload' is ignored.
+void MetadataPayloadDelete(MetadataPayload* const payload) {
+  if (payload != NULL) {
+    uint8_t* const old_bytes = payload->bytes;
+    payload->size = 0;
+    payload->bytes = NULL;
+    free(old_bytes);
+  }
+}
+
+// Releases the Exif, ICC profile and XMP payloads held by 'metadata'; the
+// struct itself is not freed. A NULL 'metadata' is ignored.
+void MetadataFree(Metadata* const metadata) {
+  if (metadata != NULL) {
+    MetadataPayloadDelete(&metadata->exif);
+    MetadataPayloadDelete(&metadata->iccp);
+    MetadataPayloadDelete(&metadata->xmp);
+  }
+}
+
+// Duplicates the 'metadata_len' bytes at 'metadata' into a newly allocated
+// buffer owned by 'payload'. Any buffer previously held by 'payload' is
+// overwritten without being freed.
+// Returns false for NULL arguments, a zero length, or an allocation failure.
+int MetadataCopy(const char* metadata, size_t metadata_len,
+                 MetadataPayload* const payload) {
+  uint8_t* copy;
+  if (payload == NULL || metadata == NULL || metadata_len == 0) return 0;
+
+  copy = (uint8_t*)malloc(metadata_len);
+  payload->bytes = copy;
+  if (copy == NULL) return 0;
+
+  memcpy(copy, metadata, metadata_len);
+  payload->size = metadata_len;
+  return 1;
+}
+
+// -----------------------------------------------------------------------------
--- a/examples/metadata.h
+++ b/examples/metadata.h
@ -0,0 +1,47 @@
+// Copyright 2012 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+//  Metadata types and functions.
+//
+
+#ifndef WEBP_EXAMPLES_METADATA_H_
+#define WEBP_EXAMPLES_METADATA_H_
+
+#include "webp/types.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// A heap-allocated blob of raw metadata. 'bytes' is released with free() by
+// MetadataPayloadDelete(); an empty payload has NULL 'bytes' and zero 'size'.
+typedef struct MetadataPayload {
+  uint8_t* bytes;
+  size_t size;  // length of 'bytes'.
+} MetadataPayload;
+
+// The metadata payloads that the image readers can extract from a file.
+typedef struct Metadata {
+  MetadataPayload exif;
+  MetadataPayload iccp;  // ICC profile.
+  MetadataPayload xmp;
+} Metadata;
+
+// Byte offset of payload member 'x' (exif, iccp or xmp) within Metadata; used
+// by the readers' lookup tables to locate the destination payload.
+#define METADATA_OFFSET(x) offsetof(Metadata, x)
+
+// Zeroes 'metadata'. NULL is ignored.
+void MetadataInit(Metadata* const metadata);
+// Frees 'payload->bytes' and resets 'payload' to empty. NULL is ignored.
+void MetadataPayloadDelete(MetadataPayload* const payload);
+// Frees all the payloads of 'metadata' (not the struct itself). NULL is
+// ignored.
+void MetadataFree(Metadata* const metadata);
+
+// Stores 'metadata' to 'payload->bytes', returns false on allocation error.
+// False is also returned if 'metadata' or 'payload' is NULL or if
+// 'metadata_len' is 0.
+int MetadataCopy(const char* metadata, size_t metadata_len,
+                 MetadataPayload* const payload);
+
+#ifdef __cplusplus
+}    // extern "C"
+#endif
+
+#endif  // WEBP_EXAMPLES_METADATA_H_
--- a/examples/pngdec.c
+++ b/examples/pngdec.c
@ -0,0 +1,299 @@
+// Copyright 2012 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// PNG decode.
+
+#include "./pngdec.h"
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include <stdio.h>
+
+#ifdef WEBP_HAVE_PNG
+#include <png.h>
+#include <setjmp.h>   // note: this must be included *after* png.h
+#include <stdlib.h>
+#include <string.h>
+
+#include "webp/encode.h"
+#include "./metadata.h"
+
+// libpng error callback installed by ReadPNG(): prints 'error' (if any) to
+// stderr, then jumps back to the setjmp() point registered for 'png'.
+// This function does not return.
+static void PNGAPI error_function(png_structp png, png_const_charp error) {
+  if (error != NULL) {
+    fprintf(stderr, "libpng error: %s\n", error);
+  }
+  longjmp(png_jmpbuf(png), 1);
+}
+
+// Returns the value [0, 15] of the hex digit 'c', or -1 if 'c' is not one of
+// [0-9][a-f][A-F].
+static int HexDigitValue(char c) {
+  if (c >= '0' && c <= '9') return c - '0';
+  if (c >= 'a' && c <= 'f') return c - 'a' + 10;
+  if (c >= 'A' && c <= 'F') return c - 'A' + 10;
+  return -1;
+}
+
+// Converts the NULL terminated 'hexstring' which contains 2-byte character
+// representations of hex values to raw data.
+// 'hexstring' may contain values consisting of [A-F][a-f][0-9] in pairs,
+// e.g., 7af2..., separated by any number of newlines.
+// 'expected_length' is the anticipated processed size.
+// On success the raw buffer is returned with its length equivalent to
+// 'expected_length'. NULL is returned if the processed length is less than
+// 'expected_length' or any character aside from those above is encountered.
+// Digits are decoded explicitly rather than with strtol(), which would also
+// accept leading whitespace and a sign within a pair.
+// The returned buffer must be freed by the caller.
+static uint8_t* HexStringToBytes(const char* hexstring,
+                                 size_t expected_length) {
+  const char* src = hexstring;
+  size_t actual_length = 0;
+  uint8_t* const raw_data = (uint8_t*)malloc(expected_length);
+  uint8_t* dst;
+
+  if (raw_data == NULL) return NULL;
+
+  for (dst = raw_data; actual_length < expected_length && *src != '\0'; ++src) {
+    int hi, lo;
+    if (*src == '\n') continue;
+    hi = HexDigitValue(src[0]);
+    // Only look at the second character once the first is known to be a hex
+    // digit (and therefore not the terminating '\0').
+    lo = (hi >= 0) ? HexDigitValue(src[1]) : -1;
+    if (lo < 0) break;
+    *dst++ = (uint8_t)((hi << 4) | lo);
+    ++src;  // skip the first digit; the loop increment skips the second.
+    ++actual_length;
+  }
+
+  if (actual_length != expected_length) {
+    free(raw_data);
+    return NULL;
+  }
+  return raw_data;
+}
+
+// Decodes an ImageMagick-style 'raw profile' text chunk into 'payload'.
+// 'profile' is the NUL-terminated chunk text; 'profile_len' is only checked
+// for being non-zero. The declared length must be non-negative and the hex
+// payload must supply exactly that many bytes.
+// Returns true on success, false for malformed input or allocation failure.
+static int ProcessRawProfile(const char* profile, size_t profile_len,
+                             MetadataPayload* const payload) {
+  const char* src = profile;
+  char* end;
+  long expected_length;
+
+  if (profile == NULL || profile_len == 0) return 0;
+
+  // ImageMagick formats 'raw profiles' as
+  // '\n<name>\n<length>(%8lu)\n<hex payload>\n'.
+  if (*src != '\n') {
+    // %X expects an unsigned value: go through unsigned char so that bytes
+    // >= 0x80 are not sign-extended.
+    fprintf(stderr, "Malformed raw profile, expected '\\n' got '\\x%.2X'\n",
+            (unsigned int)(unsigned char)*src);
+    return 0;
+  }
+  ++src;
+  // skip the profile name and extract the length.
+  while (*src != '\0' && *src++ != '\n') {}
+  expected_length = strtol(src, &end, 10);
+  if (*end != '\n') {
+    fprintf(stderr, "Malformed raw profile, expected '\\n' got '\\x%.2X'\n",
+            (unsigned int)(unsigned char)*end);
+    return 0;
+  }
+  // A negative length would otherwise be converted to a huge size_t below.
+  if (expected_length < 0) {
+    fprintf(stderr, "Malformed raw profile, invalid length %ld\n",
+            expected_length);
+    return 0;
+  }
+  ++end;
+
+  // 'end' now points to the profile payload.
+  payload->bytes = HexStringToBytes(end, (size_t)expected_length);
+  if (payload->bytes == NULL) return 0;
+  payload->size = (size_t)expected_length;
+  return 1;
+}
+
+// Maps PNG text chunk keywords to the function decoding their text and to the
+// Metadata member (as a byte offset) receiving the result. The table is
+// terminated by an entry with a NULL 'name'.
+static const struct {
+  const char* name;
+  int (*process)(const char* profile, size_t profile_len,
+                 MetadataPayload* const payload);
+  size_t storage_offset;
+} kPNGMetadataMap[] = {
+  // http://www.sno.phy.queensu.ca/~phil/exiftool/TagNames/PNG.html#TextualData
+  // See also: ExifTool on CPAN.
+  { "Raw profile type exif", ProcessRawProfile, METADATA_OFFSET(exif) },
+  { "Raw profile type xmp",  ProcessRawProfile, METADATA_OFFSET(xmp) },
+  // Exiftool puts exif data in APP1 chunk, too.
+  { "Raw profile type APP1", ProcessRawProfile, METADATA_OFFSET(exif) },
+  // XMP Specification Part 3, Section 3 #PNG
+  { "XML:com.adobe.xmp",     MetadataCopy,      METADATA_OFFSET(xmp) },
+  { NULL, NULL, 0 },
+};
+
+// Looks for metadata at both the beginning and end of the PNG file, giving
+// preference to the head.
+// Text chunks whose keyword appears in kPNGMetadataMap are decoded into the
+// matching payload of 'metadata'; an ICC profile, if reported by libpng, is
+// copied to 'metadata->iccp'.
+// Returns true on success. The caller must use MetadataFree() on 'metadata' in
+// all cases.
+static int ExtractMetadataFromPNG(png_structp png,
+                                  png_infop const head_info,
+                                  png_infop const end_info,
+                                  Metadata* const metadata) {
+  int p;
+
+  // Pass 0 inspects 'head_info', pass 1 inspects 'end_info'.
+  for (p = 0; p < 2; ++p)  {
+    png_infop const info = (p == 0) ? head_info : end_info;
+    png_textp text = NULL;
+    const int num = png_get_text(png, info, &text, NULL);
+    int i;
+    // Look for EXIF / XMP metadata.
+    for (i = 0; i < num; ++i, ++text) {
+      int j;
+      for (j = 0; kPNGMetadataMap[j].name != NULL; ++j) {
+        if (!strcmp(text->key, kPNGMetadataMap[j].name)) {
+          MetadataPayload* const payload =
+              (MetadataPayload*)((uint8_t*)metadata +
+                                 kPNGMetadataMap[j].storage_offset);
+          png_size_t text_length;
+          // The length field to read depends on the kind of text chunk.
+          switch (text->compression) {
+#ifdef PNG_iTXt_SUPPORTED
+            case PNG_ITXT_COMPRESSION_NONE:
+            case PNG_ITXT_COMPRESSION_zTXt:
+              text_length = text->itxt_length;
+              break;
+#endif
+            case PNG_TEXT_COMPRESSION_NONE:
+            case PNG_TEXT_COMPRESSION_zTXt:
+            default:
+              text_length = text->text_length;
+              break;
+          }
+          // The first payload stored for a given type wins; later ones are
+          // only reported.
+          if (payload->bytes != NULL) {
+            fprintf(stderr, "Ignoring additional '%s'\n", text->key);
+          } else if (!kPNGMetadataMap[j].process(text->text, text_length,
+                                                 payload)) {
+            fprintf(stderr, "Failed to process: '%s'\n", text->key);
+            return 0;
+          }
+          // A keyword matches at most one table entry.
+          break;
+        }
+      }
+    }
+    // Look for an ICC profile.
+    // NOTE(review): unlike the text chunks above, 'metadata->iccp' is not
+    // checked for an existing payload before copying; if both infos reported
+    // a profile the first copy would be overwritten without being freed --
+    // confirm whether libpng can report iCCP for 'end_info'.
+    {
+      png_charp name;
+      int comp_type;
+      // The type of the profile pointer changed in libpng 1.5.0.
+#if ((PNG_LIBPNG_VER_MAJOR << 8) | PNG_LIBPNG_VER_MINOR << 0) < \
+    ((1 << 8) | (5 << 0))
+      png_charp profile;
+#else  // >= libpng 1.5.0
+      png_bytep profile;
+#endif
+      png_uint_32 len;
+
+      if (png_get_iCCP(png, info,
+                       &name, &comp_type, &profile, &len) == PNG_INFO_iCCP) {
+        if (!MetadataCopy((const char*)profile, len, &metadata->iccp)) return 0;
+      }
+    }
+  }
+
+  return 1;
+}
+
+// Decodes the PNG read from 'in_file' to 8-bit RGB(A) and imports it into
+// 'pic' (with 'use_argb' set). Alpha is kept only if 'keep_alpha' is true and
+// the image has an alpha channel or a tRNS chunk.
+// If 'metadata' is non-NULL, the text-chunk and ICC metadata are extracted
+// into it.
+// Returns true on success. On any failure (including libpng errors, which
+// arrive through longjmp()) the libpng structures are destroyed, 'metadata' is
+// freed and false is returned.
+int ReadPNG(FILE* in_file, WebPPicture* const pic, int keep_alpha,
+            Metadata* const metadata) {
+  // These are assigned after setjmp() and read in the error path reached by
+  // longjmp(): they must be volatile or their values would be indeterminate.
+  volatile png_structp png = NULL;
+  volatile png_infop info = NULL;
+  volatile png_infop end_info = NULL;
+  uint8_t* volatile rgb = NULL;
+  int color_type, bit_depth, interlaced;
+  int has_alpha;
+  int num_passes;
+  int p;
+  int ok = 0;
+  png_uint_32 width, height, y;
+  int stride;
+  uint64_t stride64, rgb_size;
+
+  png = png_create_read_struct(PNG_LIBPNG_VER_STRING, 0, 0, 0);
+  if (png == NULL) {
+    goto End;
+  }
+
+  png_set_error_fn(png, 0, error_function, NULL);
+  if (setjmp(png_jmpbuf(png))) {
+ Error:
+    // Reached both by 'goto Error' and by error_function()'s longjmp().
+    MetadataFree(metadata);
+    png_destroy_read_struct((png_structpp)&png,
+                            (png_infopp)&info, (png_infopp)&end_info);
+    goto End;
+  }
+
+  info = png_create_info_struct(png);
+  if (info == NULL) goto Error;
+  end_info = png_create_info_struct(png);
+  if (end_info == NULL) goto Error;
+
+  png_init_io(png, in_file);
+  png_read_info(png, info);
+  if (!png_get_IHDR(png, info,
+                    &width, &height, &bit_depth, &color_type, &interlaced,
+                    NULL, NULL)) goto Error;
+
+  // Normalize every input to 8-bit RGB or RGBA samples.
+  png_set_strip_16(png);
+  png_set_packing(png);
+  if (color_type == PNG_COLOR_TYPE_PALETTE) png_set_palette_to_rgb(png);
+  if (color_type == PNG_COLOR_TYPE_GRAY ||
+      color_type == PNG_COLOR_TYPE_GRAY_ALPHA) {
+    if (bit_depth < 8) {
+      png_set_expand_gray_1_2_4_to_8(png);
+    }
+    png_set_gray_to_rgb(png);
+  }
+  if (png_get_valid(png, info, PNG_INFO_tRNS)) {
+    png_set_tRNS_to_alpha(png);
+    has_alpha = 1;
+  } else {
+    has_alpha = !!(color_type & PNG_COLOR_MASK_ALPHA);
+  }
+
+  if (!keep_alpha) {
+    png_set_strip_alpha(png);
+    has_alpha = 0;
+  }
+
+  num_passes = png_set_interlace_handling(png);
+  png_read_update_info(png, info);
+
+  // Compute the sizes in 64 bits and reject images whose stride does not fit
+  // an 'int' or whose buffer size does not fit a 'size_t'.
+  stride64 = (uint64_t)(has_alpha ? 4 : 3) * width * sizeof(*rgb);
+  stride = (int)stride64;
+  if (stride < 0 || (uint64_t)stride != stride64) goto Error;
+  rgb_size = stride64 * height;
+  if (rgb_size != (uint64_t)(size_t)rgb_size) goto Error;
+
+  rgb = (uint8_t*)malloc((size_t)rgb_size);
+  if (rgb == NULL) goto Error;
+  // Interlaced images need 'num_passes' sweeps over all the rows.
+  for (p = 0; p < num_passes; ++p) {
+    for (y = 0; y < height; ++y) {
+      png_bytep row = (png_bytep)(rgb + (size_t)y * stride);
+      png_read_rows(png, &row, NULL, 1);
+    }
+  }
+  png_read_end(png, end_info);
+
+  if (metadata != NULL &&
+      !ExtractMetadataFromPNG(png, info, end_info, metadata)) {
+    fprintf(stderr, "Error extracting PNG metadata!\n");
+    goto Error;
+  }
+
+  png_destroy_read_struct((png_structpp)&png,
+                          (png_infopp)&info, (png_infopp)&end_info);
+
+  pic->width = width;
+  pic->height = height;
+  pic->use_argb = 1;
+  ok = has_alpha ? WebPPictureImportRGBA(pic, rgb, stride)
+                 : WebPPictureImportRGB(pic, rgb, stride);
+
+  if (!ok) {
+    goto Error;
+  }
+
+ End:
+  free(rgb);
+  return ok;
+}
+#else  // !WEBP_HAVE_PNG
+// Fallback used when the build lacks libpng: reports the missing support on
+// stderr and always fails. All the arguments are ignored.
+int ReadPNG(FILE* in_file, struct WebPPicture* const pic, int keep_alpha,
+            struct Metadata* const metadata) {
+  fputs("PNG support not compiled. Please install the libpng "
+        "development package before building.\n", stderr);
+  (void)metadata;
+  (void)keep_alpha;
+  (void)pic;
+  (void)in_file;
+  return 0;
+}
+#endif  // WEBP_HAVE_PNG
+
+// -----------------------------------------------------------------------------
--- a/examples/pngdec.h
+++ b/examples/pngdec.h
@ -0,0 +1,35 @@
+// Copyright 2012 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// PNG decode.
+
+#ifndef WEBP_EXAMPLES_PNGDEC_H_
+#define WEBP_EXAMPLES_PNGDEC_H_
+
+#include <stdio.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// Forward declarations: only pointers to these types are needed here.
+struct Metadata;
+struct WebPPicture;
+
+// Reads a PNG from 'in_file', returning the decoded output in 'pic'.
+// If 'keep_alpha' is true and the PNG has an alpha channel, the output is RGBA
+// otherwise it will be RGB.
+// If 'metadata' is non-NULL, the metadata found in the file is stored into it;
+// it is freed again if the read fails.
+// Returns true on success. When built without PNG support, an error message is
+// printed and false is returned.
+int ReadPNG(FILE* in_file, struct WebPPicture* const pic, int keep_alpha,
+            struct Metadata* const metadata);
+
+#ifdef __cplusplus
+}    // extern "C"
+#endif
+
+#endif  // WEBP_EXAMPLES_PNGDEC_H_
--- a/examples/stopwatch.h
+++ b/examples/stopwatch.h
@ -1,8 +1,10 @@
 // Copyright 2011 Google Inc. All Rights Reserved.
 //
-// This code is licensed under the same terms as WebM:
-//  Software License Agreement:  http://www.webmproject.org/license/software/
-//  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
 // -----------------------------------------------------------------------------
 //
 //  Helper functions to measure elapsed time.
@ -17,6 +19,10 @@

 typedef LARGE_INTEGER Stopwatch;

+// Stores the current performance counter value into 'watch'.
+static WEBP_INLINE void StopwatchReset(Stopwatch* watch) {
+  QueryPerformanceCounter(watch);
+}
+
 static WEBP_INLINE double StopwatchReadAndReset(Stopwatch* watch) {
  const LARGE_INTEGER old_value = *watch;
  LARGE_INTEGER freq;
@ -35,6 +41,10 @@ static WEBP_INLINE double StopwatchReadAndReset(Stopwatch* watch) {

 typedef struct timeval Stopwatch;

+// Stores the current time of day into 'watch'.
+static WEBP_INLINE void StopwatchReset(Stopwatch* watch) {
+  gettimeofday(watch, NULL);
+}
+
 static WEBP_INLINE double StopwatchReadAndReset(Stopwatch* watch) {
  const struct timeval old_value = *watch;
  gettimeofday(watch, NULL);
--- a/examples/tiffdec.c
+++ b/examples/tiffdec.c
@ -0,0 +1,141 @@
+// Copyright 2012 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// TIFF decode.
+
+#include "./tiffdec.h"
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include <stdio.h>
+
+#ifdef WEBP_HAVE_TIFF
+#include <tiffio.h>
+
+#include "webp/encode.h"
+#include "./metadata.h"
+
+// Maps TIFF tags holding metadata to the Metadata member (as a byte offset)
+// receiving their content. The table is terminated by an entry with tag 0.
+static const struct {
+  ttag_t tag;
+  size_t storage_offset;
+} kTIFFMetadataMap[] = {
+  { TIFFTAG_ICCPROFILE, METADATA_OFFSET(iccp) },
+  { TIFFTAG_XMLPACKET,  METADATA_OFFSET(xmp) },
+  { 0, 0 },
+};
+
+// Copies the content of each tag listed in kTIFFMetadataMap that is present in
+// 'tif' into the matching payload of 'metadata'. An EXIF directory is only
+// reported with a warning, not extracted.
+// Returns true on success. The caller must use MetadataFree() on 'metadata' in
+// all cases.
+static int ExtractMetadataFromTIFF(TIFF* const tif, Metadata* const metadata) {
+  toff_t exif_ifd_offset;
+  int idx = 0;
+
+  while (kTIFFMetadataMap[idx].tag != 0) {
+    void* data;
+    uint32 data_len;
+
+    if (TIFFGetField(tif, kTIFFMetadataMap[idx].tag, &data_len, &data)) {
+      MetadataPayload* const payload =
+          (MetadataPayload*)((uint8_t*)metadata +
+                             kTIFFMetadataMap[idx].storage_offset);
+      if (!MetadataCopy((const char*)data, data_len, payload)) {
+        return 0;
+      }
+    }
+    ++idx;
+  }
+
+  // TODO(jzern): To extract the raw EXIF directory some parsing of it would be
+  // necessary to determine the overall size. In addition, value offsets in
+  // individual directory entries may need to be updated as, depending on the
+  // type, they are file based.
+  // Exif 2.2 Section 4.6.2 Tag Structure
+  // TIFF Revision 6.0 Part 1 Section 2 TIFF Structure #Image File Directory
+  if (TIFFGetField(tif, TIFFTAG_EXIFIFD, &exif_ifd_offset)) {
+    fprintf(stderr, "Warning: EXIF extraction from TIFF is unsupported.\n");
+  }
+  return 1;
+}
+
+// Decodes the first directory of the TIFF file 'filename' to RGBA and imports
+// it into 'pic' (with 'use_argb' set); the alpha channel is dropped on import
+// unless 'keep_alpha' is true.
+// If 'metadata' is non-NULL, the ICC profile and XMP packet are extracted into
+// it; if that fails, both 'metadata' and 'pic' are freed.
+// Returns true on success. The file is closed on every path once opened.
+int ReadTIFF(const char* const filename,
+             WebPPicture* const pic, int keep_alpha,
+             Metadata* const metadata) {
+  TIFF* const tif = TIFFOpen(filename, "r");
+  uint32 width, height;
+  uint32* raster;
+  uint64_t raster_size;
+  tsize_t alloc_size;
+  int ok = 0;
+  tdir_t dircount;
+
+  if (tif == NULL) {
+    fprintf(stderr, "Error! Cannot open TIFF file '%s'\n", filename);
+    return 0;
+  }
+
+  dircount = TIFFNumberOfDirectories(tif);
+  if (dircount > 1) {
+    fprintf(stderr, "Warning: multi-directory TIFF files are not supported.\n"
+                    "Only the first will be used, %d will be ignored.\n",
+                    dircount - 1);
+  }
+
+  if (!(TIFFGetField(tif, TIFFTAG_IMAGEWIDTH, &width) &&
+        TIFFGetField(tif, TIFFTAG_IMAGELENGTH, &height))) {
+    fprintf(stderr, "Error! Cannot retrieve TIFF image dimensions.\n");
+    goto End;  // 'tif' must still be closed.
+  }
+
+  // The stride and the height are handed over as 'int': bound them first, which
+  // also guarantees that the 64-bit size computation below cannot wrap.
+  if (width > (uint32)0x7fffffff / sizeof(*raster) ||
+      height > (uint32)0x7fffffff) {
+    fprintf(stderr, "Error! TIFF image dimension (%u x %u) is too large.\n",
+            (unsigned int)width, (unsigned int)height);
+    goto End;
+  }
+  raster_size = (uint64_t)width * height * sizeof(*raster);
+  alloc_size = (tsize_t)raster_size;
+  if (alloc_size < 0 || (uint64_t)alloc_size != raster_size) {
+    fprintf(stderr, "Error! TIFF image dimension (%u x %u) is too large.\n",
+            (unsigned int)width, (unsigned int)height);
+    goto End;
+  }
+
+  raster = (uint32*)_TIFFmalloc(alloc_size);
+  if (raster != NULL) {
+    if (TIFFReadRGBAImageOriented(tif, width, height, raster,
+                                  ORIENTATION_TOPLEFT, 1)) {
+      const int stride = width * sizeof(*raster);
+      pic->width = width;
+      pic->height = height;
+      // TIFF data is ABGR
+#ifdef __BIG_ENDIAN__
+      TIFFSwabArrayOfLong(raster, width * height);
+#endif
+      pic->use_argb = 1;
+      ok = keep_alpha
+         ? WebPPictureImportRGBA(pic, (const uint8_t*)raster, stride)
+         : WebPPictureImportRGBX(pic, (const uint8_t*)raster, stride);
+    }
+    _TIFFfree(raster);
+  } else {
+    fprintf(stderr, "Error allocating TIFF RGBA memory!\n");
+  }
+
+  if (ok) {
+    if (metadata != NULL) {
+      ok = ExtractMetadataFromTIFF(tif, metadata);
+      if (!ok) {
+        fprintf(stderr, "Error extracting TIFF metadata!\n");
+        MetadataFree(metadata);
+        WebPPictureFree(pic);
+      }
+    }
+  }
+
+ End:
+  TIFFClose(tif);
+  return ok;
+}
+#else  // !WEBP_HAVE_TIFF
+// Fallback used when the build lacks libtiff: reports the missing support on
+// stderr and always fails. All the arguments are ignored.
+int ReadTIFF(const char* const filename,
+             struct WebPPicture* const pic, int keep_alpha,
+             struct Metadata* const metadata) {
+  fputs("TIFF support not compiled. Please install the libtiff "
+        "development package before building.\n", stderr);
+  (void)metadata;
+  (void)keep_alpha;
+  (void)pic;
+  (void)filename;
+  return 0;
+}
+#endif  // WEBP_HAVE_TIFF
+
+// -----------------------------------------------------------------------------
--- a/examples/tiffdec.h
+++ b/examples/tiffdec.h
@ -0,0 +1,34 @@
+// Copyright 2012 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// TIFF decode.
+
+#ifndef WEBP_EXAMPLES_TIFFDEC_H_
+#define WEBP_EXAMPLES_TIFFDEC_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// Forward declarations: only pointers to these types are needed here.
+struct Metadata;
+struct WebPPicture;
+
+// Reads a TIFF from 'filename', returning the decoded output in 'pic'.
+// If 'keep_alpha' is true and the TIFF has an alpha channel, the output is RGBA
+// otherwise it will be RGB.
+// Only the first directory of a multi-directory file is used.
+// If 'metadata' is non-NULL, the ICC profile and XMP packet found in the file
+// are stored into it; EXIF extraction is not supported.
+// Returns true on success. When built without TIFF support, an error message
+// is printed and false is returned.
+int ReadTIFF(const char* const filename,
+             struct WebPPicture* const pic, int keep_alpha,
+             struct Metadata* const metadata);
+
+#ifdef __cplusplus
+}    // extern "C"
+#endif
+
+#endif  // WEBP_EXAMPLES_TIFFDEC_H_
--- a/examples/vwebp.c
+++ b/examples/vwebp.c
@ -1,28 +1,26 @@
 // Copyright 2011 Google Inc. All Rights Reserved.
 //
-// This code is licensed under the same terms as WebM:
-//  Software License Agreement:  http://www.webmproject.org/license/software/
-//  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
 // -----------------------------------------------------------------------------
 //
-//  Simple WebP file viewer.
-//
-// Compiling on linux:
-//   sudo apt-get install libglut3-dev mesa-common-dev
-//   gcc -o vwebp vwebp.c -O3 -lwebp -lwebpmux -lglut -lGL -lpthread -lm
-// Compiling on Mac + XCode:
-//   gcc -o vwebp vwebp.c -lwebp -lwebpmux -framework GLUT -framework OpenGL
+//  Simple OpenGL-based WebP file viewer.
 //
 // Author: Skal (pascal.massimino@gmail.com)
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif

 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>

-#include "webp/decode.h"
-#include "webp/mux.h"
+#if defined(WEBP_HAVE_GL)

-#ifdef __APPLE__
+#if defined(HAVE_GLUT_GLUT_H)
 #include <GLUT/glut.h>
 #else
 #include <GL/glut.h>
@ -31,6 +29,13 @@
 #endif
 #endif

+#ifdef WEBP_HAVE_QCMS
+#include <qcms.h>
+#endif
+
+#include "webp/decode.h"
+#include "webp/demux.h"
+
 #include "./example_util.h"

 #ifdef _MSC_VER
@ -42,26 +47,25 @@ static void Help(void);
 // Unfortunate global variables. Gathered into a struct for comfort.
 static struct {
  int has_animation;
+  int has_color_profile;  // set in main() from ICCP_FLAG in the format flags
  int done;
  int decoding_error;
  int print_info;
+  int use_color_profile;  // on by default; '-noicc' clears it, 'c' toggles it

-  uint32_t flags;
+  int canvas_width, canvas_height;  // WEBP_FF_CANVAS_WIDTH / _HEIGHT
  int loop_count;
-  int frame_num;
-  int frame_max;
+  uint32_t bg_color;  // WEBP_FF_BACKGROUND_COLOR, used as the GL clear color

  const char* file_name;
  WebPData data;
-  WebPMux* mux;
-  WebPDecoderConfig* config;
+  WebPDecoderConfig config;  // decoder settings; Decode() writes config.output
  const WebPDecBuffer* pic;
-} kParams = {
-  0, 0, 0, 0,         // has_animation, ...
-  0, 1, 1, 0,         // flags, ...
-  NULL, { NULL, 0 },  // file_name, ...
-  NULL, NULL, NULL    // mux, ...
-};
+  WebPDemuxer* dmux;  // demuxer created from 'data' in main()
+  WebPIterator curr_frame;  // frame that Decode() decodes
+  WebPIterator prev_frame;  // last displayed frame, kept for its disposal
+  WebPChunkIterator iccp;  // "ICCP" chunk, looked up only with WEBP_HAVE_QCMS
+} kParams;

 static void ClearPreviousPic(void) {
  WebPFreeDecBuffer((WebPDecBuffer*)kParams.pic);
@ -71,8 +75,128 @@ static void ClearPreviousPic(void) {
+// Frees the decoded picture and the file data, releases the frame and chunk
+// iterators, then deletes the demuxer and resets kParams.dmux to NULL.
 static void ClearParams(void) {
  ClearPreviousPic();
  WebPDataClear(&kParams.data);
-  WebPMuxDelete(kParams.mux);
-  kParams.mux = NULL;
+  WebPDemuxReleaseIterator(&kParams.curr_frame);
+  WebPDemuxReleaseIterator(&kParams.prev_frame);
+  WebPDemuxReleaseChunkIterator(&kParams.iccp);
+  WebPDemuxDelete(kParams.dmux);
+  kParams.dmux = NULL;
+}
+
+// -----------------------------------------------------------------------------
+// Color profile handling
+// Converts the pixels of 'rgba' in place, row by row, from the color space
+// described by the ICC 'profile' to sRGB using qcms.
+// Returns false if either argument is NULL or if qcms rejects the profile or
+// cannot build the transform. A profile without data or shorter than 10 bytes
+// is skipped and reported as success.
+// Without WEBP_HAVE_QCMS this is a no-op that returns true.
+static int ApplyColorProfile(const WebPData* const profile,
+                             WebPDecBuffer* const rgba) {
+#ifdef WEBP_HAVE_QCMS
+  qcms_profile* src_profile = NULL;
+  qcms_profile* dst_profile = NULL;
+  qcms_transform* xform = NULL;
+  uint8_t* row;
+  int y;
+  int ok = 0;
+
+  if (profile == NULL || rgba == NULL) return 0;
+  if (profile->bytes == NULL || profile->size < 10) return 1;
+
+  qcms_enable_iccv4();
+  src_profile = qcms_profile_from_memory(profile->bytes, profile->size);
+  if (src_profile == NULL) {
+    fprintf(stderr, "Color profile is bogus!\n");
+    goto Error;
+  }
+  // qcms_profile_is_bogus() is broken with ICCv4, so it is only consulted
+  // when the major revision (byte 8 of the profile) is below 4.
+  if (profile->bytes[8] < 4 && qcms_profile_is_bogus(src_profile)) {
+    fprintf(stderr, "Color profile is bogus!\n");
+    goto Error;
+  }
+
+  dst_profile = qcms_profile_sRGB();
+  if (dst_profile == NULL) {
+    fprintf(stderr, "Error creating output color profile!\n");
+    goto Error;
+  }
+
+  qcms_profile_precache_output_transform(dst_profile);
+  xform = qcms_transform_create(src_profile, QCMS_DATA_RGBA_8,
+                                dst_profile, QCMS_DATA_RGBA_8,
+                                QCMS_INTENT_DEFAULT);
+  if (xform == NULL) {
+    fprintf(stderr, "Error creating color transform!\n");
+    goto Error;
+  }
+
+  // Source and destination are the same row: the conversion is in place.
+  row = rgba->u.RGBA.rgba;
+  y = 0;
+  while (y < rgba->height) {
+    qcms_transform_data(xform, row, row, rgba->width);
+    row += rgba->u.RGBA.stride;
+    ++y;
+  }
+  ok = 1;
+
+ Error:
+  if (src_profile != NULL) qcms_profile_release(src_profile);
+  if (dst_profile != NULL) qcms_profile_release(dst_profile);
+  if (xform != NULL) qcms_transform_release(xform);
+  return ok;
+#else
+  (void)rgba;
+  (void)profile;
+  return 1;
+#endif  // WEBP_HAVE_QCMS
+}
+
+//------------------------------------------------------------------------------
+// File decoding
+
+// Decodes the frame referenced by kParams.curr_frame as RGBA into the output
+// buffer of kParams.config and publishes that buffer through kParams.pic.
+// When kParams.use_color_profile is set, the ICC profile is applied afterwards.
+// Returns true on success; failures are reported on stderr.
+static int Decode(void) {
+  const WebPIterator* const frame = &kParams.curr_frame;
+  WebPDecBuffer* const rgba = &kParams.config.output;
+
+  ClearPreviousPic();
+  rgba->colorspace = MODE_RGBA;
+  if (WebPDecode(frame->fragment.bytes, frame->fragment.size,
+                 &kParams.config) != VP8_STATUS_OK) {
+    fprintf(stderr, "Decoding of frame #%d failed!\n", frame->frame_num);
+    return 0;
+  }
+
+  kParams.pic = rgba;
+  if (!kParams.use_color_profile) return 1;
+
+  if (!ApplyColorProfile(&kParams.iccp.chunk, rgba)) {
+    fprintf(stderr, "Applying color profile to frame #%d failed!\n",
+            frame->frame_num);
+    return 0;
+  }
+  return 1;
+}
+
+// GLUT timer callback driving the animation. It advances kParams.curr_frame
+// (wrapping to frame #1 and counting down kParams.loop_count at the end),
+// decodes it, requests a redisplay and re-arms itself with the frame duration.
+// Any failure sets kParams.decoding_error and kParams.done and stops the loop.
+static void decode_callback(int what) {
+  int delay = 0;
+  if (what != 0 || kParams.done) return;
+
+  if (kParams.dmux != NULL) {
+    WebPIterator* const iter = &kParams.curr_frame;
+    if (!WebPDemuxNextFrame(iter)) {
+      // No next frame: rewind to the first one.
+      WebPDemuxReleaseIterator(iter);
+      if (!WebPDemuxGetFrame(kParams.dmux, 1, iter)) {
+        kParams.decoding_error = 1;
+        kParams.done = 1;
+        return;
+      }
+      --kParams.loop_count;
+      kParams.done = (kParams.loop_count == 0);
+    }
+    delay = iter->duration;
+  }
+
+  if (Decode()) {
+    glutPostRedisplay();
+    glutTimerFunc(delay, decode_callback, what);
+    return;
+  }
+  kParams.decoding_error = 1;
+  kParams.done = 1;
 }

 //------------------------------------------------------------------------------
@ -88,6 +212,24 @@ static void HandleKey(unsigned char key, int pos_x, int pos_y) {
    ClearParams();
    exit(0);
 #endif
+  } else if (key == 'c') {
+    if (kParams.has_color_profile && !kParams.decoding_error) {
+      kParams.use_color_profile = 1 - kParams.use_color_profile;
+
+      if (kParams.has_animation) {
+        // Restart the completed animation to pickup the color profile change.
+        if (kParams.done && kParams.loop_count == 0) {
+          kParams.loop_count =
+              (int)WebPDemuxGetI(kParams.dmux, WEBP_FF_LOOP_COUNT) + 1;
+          kParams.done = 0;
+          // Start the decode loop immediately.
+          glutTimerFunc(0, decode_callback, 0);
+        }
+      } else {
+        Decode();
+        glutPostRedisplay();
+      }
+    }
  } else if (key == 'i') {
    kParams.print_info = 1 - kParams.print_info;
    glutPostRedisplay();
@ -112,6 +254,10 @@ static void PrintString(const char* const text) {
  }
 }

+// Returns the 8-bit channel of 'color' that starts at bit 'shift', scaled to
+// the [0, 1] range.
+static float GetColorf(uint32_t color, int shift) {
+  // Mask to a single byte so the higher-order channels don't leak into the
+  // result (without it only shift == 24 stayed within [0, 1]).
+  return ((color >> shift) & 0xff) / 255.f;
+}
+
 static void DrawCheckerBoard(void) {
  const int square_size = 8;  // must be a power of 2
  int x, y;
@ -133,104 +279,90 @@ static void DrawCheckerBoard(void) {
 }

 static void HandleDisplay(void) {
-  const WebPDecBuffer* pic = kParams.pic;
+  const WebPDecBuffer* const pic = kParams.pic;
+  const WebPIterator* const curr = &kParams.curr_frame;
+  WebPIterator* const prev = &kParams.prev_frame;
+  GLfloat xoff, yoff;
  if (pic == NULL) return;
-  glClear(GL_COLOR_BUFFER_BIT);
  glPushMatrix();
  glPixelZoom(1, -1);
-  glRasterPos2f(-1, 1);
+  xoff = (GLfloat)(2. * curr->x_offset / kParams.canvas_width);
+  yoff = (GLfloat)(2. * curr->y_offset / kParams.canvas_height);
+  glRasterPos2f(-1.f + xoff, 1.f - yoff);
  glPixelStorei(GL_UNPACK_ALIGNMENT, 1);
  glPixelStorei(GL_UNPACK_ROW_LENGTH, pic->u.RGBA.stride / 4);
+
+  if (prev->dispose_method == WEBP_MUX_DISPOSE_BACKGROUND ||
+      curr->blend_method == WEBP_MUX_NO_BLEND) {
+    // TODO(later): these offsets and those above should factor in window size.
+    //              they will be incorrect if the window is resized.
+    // glScissor() takes window coordinates (0,0 at bottom left).
+    int window_x, window_y;
+    if (prev->dispose_method == WEBP_MUX_DISPOSE_BACKGROUND) {
+      // Clear the previous frame rectangle.
+      window_x = prev->x_offset;
+      window_y = kParams.canvas_height - prev->y_offset - prev->height;
+    } else {  // curr->blend_method == WEBP_MUX_NO_BLEND.
+      // We simulate no-blending behavior by first clearing the current frame
+      // rectangle (to a checker-board) and then alpha-blending against it.
+      window_x = curr->x_offset;
+      window_y = kParams.canvas_height - curr->y_offset - curr->height;
+    }
+    glEnable(GL_SCISSOR_TEST);
+    // Only update the requested area, not the whole canvas.
+    glScissor(window_x, window_y, prev->width, prev->height);
+
+    glClear(GL_COLOR_BUFFER_BIT);  // use clear color
    DrawCheckerBoard();
+
+    glDisable(GL_SCISSOR_TEST);
+  }
+
+  *prev = *curr;
+
  glDrawPixels(pic->width, pic->height,
               GL_RGBA, GL_UNSIGNED_BYTE,
               (GLvoid*)pic->u.RGBA.rgba);
  if (kParams.print_info) {
    char tmp[32];

-    glColor4f(0.0, 0.0, 0.0, 1.0);
+    glColor4f(0.90f, 0.0f, 0.90f, 1.0f);
    glRasterPos2f(-0.95f, 0.90f);
    PrintString(kParams.file_name);

    snprintf(tmp, sizeof(tmp), "Dimension:%d x %d", pic->width, pic->height);
-    glColor4f(0.0, 0.0, 0.0, 1.0);
+    glColor4f(0.90f, 0.0f, 0.90f, 1.0f);
    glRasterPos2f(-0.95f, 0.80f);
    PrintString(tmp);
+    if (curr->x_offset != 0 || curr->y_offset != 0) {
+      snprintf(tmp, sizeof(tmp), " (offset:%d,%d)",
+               curr->x_offset, curr->y_offset);
+      glRasterPos2f(-0.95f, 0.70f);
+      PrintString(tmp);
+    }
  }
  glPopMatrix();
  glFlush();
 }

-static void StartDisplay(const WebPDecBuffer* const pic) {
+static void StartDisplay(void) {
+  const int width = kParams.canvas_width;
+  const int height = kParams.canvas_height;
  glutInitDisplayMode(GLUT_RGBA);
-  glutInitWindowSize(pic->width, pic->height);
+  glutInitWindowSize(width, height);
  glutCreateWindow("WebP viewer");
  glutDisplayFunc(HandleDisplay);
  glutIdleFunc(NULL);
  glutKeyboardFunc(HandleKey);
  glBlendFunc(GL_SRC_ALPHA, GL_ONE_MINUS_SRC_ALPHA);
  glEnable(GL_BLEND);
-  glClearColor(0.0, 0.0, 0.0, 0.0);
-  HandleReshape(pic->width, pic->height);
-}
-
-//------------------------------------------------------------------------------
-// File decoding
-
-static int Decode(const int frame_number, int* const duration) {
-  WebPDecoderConfig* const config = kParams.config;
-  WebPData *data, image_data;
-  int x_off = 0, y_off = 0;
-  WebPDecBuffer* const output_buffer = &config->output;
-  int ok = 0;
-
-  ClearPreviousPic();
-  if (kParams.has_animation) {
-    if (WebPMuxGetFrame(kParams.mux, frame_number, &image_data,
-                        &x_off, &y_off, duration) != WEBP_MUX_OK) {
-      goto end;
-    }
-    if (x_off != 0 || y_off != 0) {
-      fprintf(stderr,
-              "Frame offsets not yet supported! Forcing offset to 0,0\n");
-      x_off = y_off = 0;
-    }
-    data = &image_data;
-  } else {
-    data = &kParams.data;
-  }
-
-  output_buffer->colorspace = MODE_RGBA;
-  ok = (WebPDecode(data->bytes_, data->size_, config) == VP8_STATUS_OK);
-
- end:
-  if (!ok) {
-    fprintf(stderr, "Decoding of frame #%d failed!\n", frame_number);
-  } else {
-    kParams.pic = output_buffer;
-  }
-  return ok;
-}
-
-static void decode_callback(int what) {
-  if (what == 0 && !kParams.done) {
-    int duration = 0;
-    if (kParams.mux != NULL) {
-      if (!Decode(kParams.frame_num, &duration)) {
-        kParams.decoding_error = 1;
-        kParams.done = 1;
-      } else {
-        ++kParams.frame_num;
-        if (kParams.frame_num > kParams.frame_max) {
-          kParams.frame_num = 1;
-          --kParams.loop_count;
-          kParams.done = (kParams.loop_count == 0);
-        }
-      }
-    }
-    glutPostRedisplay();
-    glutTimerFunc(duration, decode_callback, what);
-  }
+  glClearColor(GetColorf(kParams.bg_color, 0),
+               GetColorf(kParams.bg_color, 8),
+               GetColorf(kParams.bg_color, 16),
+               GetColorf(kParams.bg_color, 24));
+  HandleReshape(width, height);
+  glClear(GL_COLOR_BUFFER_BIT);
+  DrawCheckerBoard();
 }

 //------------------------------------------------------------------------------
@ -241,51 +373,61 @@ static void Help(void) {
         "Decodes the WebP image file and visualize it using OpenGL\n"
         "Options are:\n"
         "  -version  .... print version number and exit.\n"
+         "  -noicc ....... don't use the icc profile if present.\n"
         "  -nofancy ..... don't use the fancy YUV420 upscaler.\n"
         "  -nofilter .... disable in-loop filtering.\n"
-         "  -mt .......... use multi-threading\n"
-         "  -crop <x> <y> <w> <h> ... crop output with the given rectangle\n"
-         "  -scale <w> <h> .......... scale the output (*after* any cropping)\n"
+         "  -dither <int>  dithering strength (0..100). Default=50.\n"
+         "  -mt .......... use multi-threading.\n"
+         "  -info ........ print info.\n"
         "  -h     ....... this help message.\n"
+         "\n"
+         "Keyboard shortcuts:\n"
+         "  'c' ................ toggle use of color profile.\n"
+         "  'i' ................ overlay file information.\n"
+         "  'q' / 'Q' / ESC .... quit.\n"
        );
 }

 int main(int argc, char *argv[]) {
-  WebPDecoderConfig config;
-  WebPMuxError mux_err;
  int c;
+  WebPDecoderConfig* const config = &kParams.config;
+  WebPIterator* const curr = &kParams.curr_frame;
+  WebPIterator* const prev = &kParams.prev_frame;

-  if (!WebPInitDecoderConfig(&config)) {
+  if (!WebPInitDecoderConfig(config)) {
    fprintf(stderr, "Library version mismatch!\n");
    return -1;
  }
-  kParams.config = &config;
+  config->options.dithering_strength = 50;
+  kParams.use_color_profile = 1;

  for (c = 1; c < argc; ++c) {
    if (!strcmp(argv[c], "-h") || !strcmp(argv[c], "-help")) {
      Help();
      return 0;
+    } else if (!strcmp(argv[c], "-noicc")) {
+      kParams.use_color_profile = 0;
    } else if (!strcmp(argv[c], "-nofancy")) {
-      config.options.no_fancy_upsampling = 1;
+      config->options.no_fancy_upsampling = 1;
    } else if (!strcmp(argv[c], "-nofilter")) {
-      config.options.bypass_filtering = 1;
+      config->options.bypass_filtering = 1;
+    } else if (!strcmp(argv[c], "-dither") && c + 1 < argc) {
+      config->options.dithering_strength = strtol(argv[++c], NULL, 0);
+    } else if (!strcmp(argv[c], "-info")) {
+      kParams.print_info = 1;
    } else if (!strcmp(argv[c], "-version")) {
-      const int version = WebPGetDecoderVersion();
-      printf("%d.%d.%d\n",
-        (version >> 16) & 0xff, (version >> 8) & 0xff, version & 0xff);
+      const int dec_version = WebPGetDecoderVersion();
+      const int dmux_version = WebPGetDemuxVersion();
+      printf("WebP Decoder version: %d.%d.%d\nWebP Demux version: %d.%d.%d\n",
+             (dec_version >> 16) & 0xff, (dec_version >> 8) & 0xff,
+             dec_version & 0xff, (dmux_version >> 16) & 0xff,
+             (dmux_version >> 8) & 0xff, dmux_version & 0xff);
      return 0;
    } else if (!strcmp(argv[c], "-mt")) {
-      config.options.use_threads = 1;
-    } else if (!strcmp(argv[c], "-crop") && c < argc - 4) {
-      config.options.use_cropping = 1;
-      config.options.crop_left   = strtol(argv[++c], NULL, 0);
-      config.options.crop_top    = strtol(argv[++c], NULL, 0);
-      config.options.crop_width  = strtol(argv[++c], NULL, 0);
-      config.options.crop_height = strtol(argv[++c], NULL, 0);
-    } else if (!strcmp(argv[c], "-scale") && c < argc - 2) {
-      config.options.use_scaling = 1;
-      config.options.scaled_width  = strtol(argv[++c], NULL, 0);
-      config.options.scaled_height = strtol(argv[++c], NULL, 0);
+      config->options.use_threads = 1;
+    } else if (!strcmp(argv[c], "--")) {
+      if (c < argc - 1) kParams.file_name = argv[++c];
+      break;
    } else if (argv[c][0] == '-') {
      printf("Unknown option '%s'\n", argv[c]);
      Help();
@ -302,53 +444,73 @@ int main(int argc, char *argv[]) {
  }

  if (!ExUtilReadFile(kParams.file_name,
-                      &kParams.data.bytes_, &kParams.data.size_)) {
+                      &kParams.data.bytes, &kParams.data.size)) {
    goto Error;
  }

-  kParams.mux = WebPMuxCreate(&kParams.data, 0);
-  if (kParams.mux == NULL) {
+  if (!WebPGetInfo(kParams.data.bytes, kParams.data.size, NULL, NULL)) {
+    fprintf(stderr, "Input file doesn't appear to be WebP format.\n");
+    goto Error;
+  }
+
+  kParams.dmux = WebPDemux(&kParams.data);
+  if (kParams.dmux == NULL) {
    fprintf(stderr, "Could not create demuxing object!\n");
    goto Error;
  }

-  mux_err = WebPMuxGetFeatures(kParams.mux, &kParams.flags);
-  if (mux_err != WEBP_MUX_OK) {
+  if (WebPDemuxGetI(kParams.dmux, WEBP_FF_FORMAT_FLAGS) & FRAGMENTS_FLAG) {
+    fprintf(stderr, "Image fragments are not supported for now!\n");
    goto Error;
  }
-  if (kParams.flags & TILE_FLAG) {
-    fprintf(stderr, "Tiling is not supported for now!\n");
-    goto Error;
+  kParams.canvas_width = WebPDemuxGetI(kParams.dmux, WEBP_FF_CANVAS_WIDTH);
+  kParams.canvas_height = WebPDemuxGetI(kParams.dmux, WEBP_FF_CANVAS_HEIGHT);
+  if (kParams.print_info) {
+    printf("Canvas: %d x %d\n", kParams.canvas_width, kParams.canvas_height);
  }

-  kParams.has_animation = !!(kParams.flags & ANIMATION_FLAG);
+  prev->width = kParams.canvas_width;
+  prev->height = kParams.canvas_height;
+  prev->x_offset = prev->y_offset = 0;
+  prev->dispose_method = WEBP_MUX_DISPOSE_BACKGROUND;

-  if (kParams.has_animation) {
-    mux_err = WebPMuxGetLoopCount(kParams.mux, &kParams.loop_count);
-    if (mux_err != WEBP_MUX_OK && mux_err != WEBP_MUX_NOT_FOUND) {
-      goto Error;
-    }
-    mux_err = WebPMuxNumChunks(kParams.mux, WEBP_CHUNK_IMAGE,
-                                      &kParams.frame_max);
-    if (mux_err != WEBP_MUX_OK) {
-      goto Error;
+  memset(&kParams.iccp, 0, sizeof(kParams.iccp));
+  kParams.has_color_profile =
+      !!(WebPDemuxGetI(kParams.dmux, WEBP_FF_FORMAT_FLAGS) & ICCP_FLAG);
+  if (kParams.has_color_profile) {
+#ifdef WEBP_HAVE_QCMS
+    if (!WebPDemuxGetChunk(kParams.dmux, "ICCP", 1, &kParams.iccp)) goto Error;
+    printf("VP8X: Found color profile\n");
+#else
+    fprintf(stderr, "Warning: color profile present, but qcms is unavailable!\n"
+            "Build libqcms from Mozilla or Chromium and define WEBP_HAVE_QCMS "
+            "before building.\n");
+#endif
  }
+
+  if (!WebPDemuxGetFrame(kParams.dmux, 1, curr)) goto Error;
+
+  kParams.has_animation = (curr->num_frames > 1);
+  kParams.loop_count = (int)WebPDemuxGetI(kParams.dmux, WEBP_FF_LOOP_COUNT);
+  kParams.bg_color = WebPDemuxGetI(kParams.dmux, WEBP_FF_BACKGROUND_COLOR);
  printf("VP8X: Found %d images in file (loop count = %d)\n",
-           kParams.frame_max, kParams.loop_count);
-  }
+         curr->num_frames, kParams.loop_count);

  // Decode first frame
-  {
-    int duration;
-    if (!Decode(1, &duration)) goto Error;
-  }
+  if (!Decode()) goto Error;
+
+  // Position iterator to last frame. Next call to HandleDisplay will wrap over.
+  // We take this into account by bumping up loop_count.
+  WebPDemuxGetFrame(kParams.dmux, 0, curr);
+  if (kParams.loop_count) ++kParams.loop_count;

  // Start display (and timer)
  glutInit(&argc, argv);
 #ifdef FREEGLUT
  glutSetOption(GLUT_ACTION_ON_WINDOW_CLOSE, GLUT_ACTION_CONTINUE_EXECUTION);
 #endif
-  StartDisplay(kParams.pic);
+  StartDisplay();
+
  if (kParams.has_animation) glutTimerFunc(0, decode_callback, 0);
  glutMainLoop();

@ -361,4 +523,14 @@ int main(int argc, char *argv[]) {
  return -1;
 }

+#else   // !WEBP_HAVE_GL
+
+// Stand-in entry point for builds without WEBP_HAVE_GL: reports on stderr that
+// OpenGL support is missing from this program and exits with status 0.
+int main(int argc, const char *argv[]) {
+  const char* const program = argv[0];
+  (void)argc;
+  fprintf(stderr, "OpenGL support not enabled in %s.\n", program);
+  return 0;
+}
+
+#endif
+
 //------------------------------------------------------------------------------
--- a/examples/webpmux.c
+++ b/examples/webpmux.c
--- a/examples/wicdec.c
+++ b/examples/wicdec.c
@ -0,0 +1,349 @@
+// Copyright 2013 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// Windows Imaging Component (WIC) decode.
+
+#include "./wicdec.h"
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include <stdio.h>
+
+#ifdef HAVE_WINCODEC_H
+#ifdef __MINGW32__
+#define INITGUID  // Without this GUIDs are declared extern and fail to link
+#endif
+#define CINTERFACE
+#define COBJMACROS
+#define _WIN32_IE 0x500  // Workaround bug in shlwapi.h when compiling C++
+                         // code with COBJMACROS.
+#include <shlwapi.h>
+#include <windows.h>
+#include <wincodec.h>
+
+#include "webp/encode.h"
+#include "./metadata.h"
+
+// "If succeeded": evaluates 'fn' only while the in-scope HRESULT 'hr' still
+// holds a success code, stores the result back into 'hr' and logs the
+// stringified call to stderr when it fails. Consecutive IFS() calls therefore
+// stop doing work at the first failure. Requires a local variable named 'hr'.
+#define IFS(fn)                                                     \
+  do {                                                              \
+    if (SUCCEEDED(hr)) {                                            \
+      hr = (fn);                                                    \
+      if (FAILED(hr)) fprintf(stderr, #fn " failed %08lx\n", hr);   \
+    }                                                               \
+  } while (0)
+
+// modified version of DEFINE_GUID from guiddef.h.
+#define WEBP_DEFINE_GUID(name, l, w1, w2, b1, b2, b3, b4, b5, b6, b7, b8) \
+  static const GUID name = \
+      { l, w1, w2, { b1, b2,  b3,  b4,  b5,  b6,  b7,  b8 } }
+
+#ifdef __cplusplus
+#define MAKE_REFGUID(x) (x)
+#else
+#define MAKE_REFGUID(x) &(x)
+#endif
+
+// Pairs a WIC pixel format with the WebPPicture import function able to
+// consume pixels in that layout. Tables of these end with an all-NULL entry.
+typedef struct WICFormatImporter {
+  const GUID* pixel_format;  // target format requested from the converter
+  int bytes_per_pixel;       // used to compute the row stride
+  int (*import)(WebPPicture* const, const uint8_t* const, int);
+} WICFormatImporter;
+
+// From Microsoft SDK 7.0a -- wincodec.h
+// Create local copies for compatibility when building against earlier
+// versions of the SDK.
+WEBP_DEFINE_GUID(GUID_WICPixelFormat24bppBGR_,
+                 0x6fddc324, 0x4e03, 0x4bfe,
+                 0xb1, 0x85, 0x3d, 0x77, 0x76, 0x8d, 0xc9, 0x0c);
+WEBP_DEFINE_GUID(GUID_WICPixelFormat24bppRGB_,
+                 0x6fddc324, 0x4e03, 0x4bfe,
+                 0xb1, 0x85, 0x3d, 0x77, 0x76, 0x8d, 0xc9, 0x0d);
+WEBP_DEFINE_GUID(GUID_WICPixelFormat32bppBGRA_,
+                 0x6fddc324, 0x4e03, 0x4bfe,
+                 0xb1, 0x85, 0x3d, 0x77, 0x76, 0x8d, 0xc9, 0x0f);
+WEBP_DEFINE_GUID(GUID_WICPixelFormat32bppRGBA_,
+                 0xf5c7ad2d, 0x6a8d, 0x43dd,
+                 0xa7, 0xa8, 0xa2, 0x99, 0x35, 0x26, 0x1a, 0xe9);
+
+// Opens 'filename' for reading and returns the resulting COM stream through
+// 'stream'. The HRESULT of the open is returned; a failure is also reported on
+// stderr together with the file name.
+static HRESULT OpenInputStream(const char* filename, IStream** stream) {
+  HRESULT hr = S_OK;
+  IFS(SHCreateStreamOnFileA(filename, STGM_READ, stream));
+  if (SUCCEEDED(hr)) return hr;
+
+  fprintf(stderr, "Error opening input file %s (%08lx)\n", filename, hr);
+  return hr;
+}
+
+// -----------------------------------------------------------------------------
+// Metadata processing
+
+// Stores the first non-zero sized color profile from 'frame' to 'iccp'.
+// Returns an HRESULT to indicate success or failure. The caller is responsible
+// for freeing 'iccp->bytes' in either case.
+// A frame without any color context is not an error: 'iccp' is left untouched
+// and the (successful) HRESULT of the count query is returned.
+static HRESULT ExtractICCP(IWICImagingFactory* const factory,
+                           IWICBitmapFrameDecode* const frame,
+                           MetadataPayload* const iccp) {
+  HRESULT hr = S_OK;
+  UINT i, count = 0;
+  IWICColorContext** color_contexts;
+
+  // First call only queries how many color contexts the frame carries.
+  IFS(IWICBitmapFrameDecode_GetColorContexts(frame, 0, NULL, &count));
+  if (FAILED(hr) || count == 0) return hr;
+
+  // calloc() keeps unused slots NULL so the cleanup loop below can skip them.
+  color_contexts = (IWICColorContext**)calloc(count, sizeof(*color_contexts));
+  if (color_contexts == NULL) return E_OUTOFMEMORY;
+  for (i = 0; SUCCEEDED(hr) && i < count; ++i) {
+    IFS(IWICImagingFactory_CreateColorContext(factory, &color_contexts[i]));
+  }
+
+  if (SUCCEEDED(hr)) {
+    UINT num_color_contexts = 0;
+    IFS(IWICBitmapFrameDecode_GetColorContexts(frame,
+                                               count, color_contexts,
+                                               &num_color_contexts));
+    for (i = 0; SUCCEEDED(hr) && i < num_color_contexts; ++i) {
+      WICColorContextType type;
+      IFS(IWICColorContext_GetType(color_contexts[i], &type));
+      if (SUCCEEDED(hr) && type == WICColorContextProfile) {
+        // 'size' is only meaningful after a successful query: start from 0
+        // and require success so a failed call never drives the allocation.
+        UINT size = 0;
+        IFS(IWICColorContext_GetProfileBytes(color_contexts[i],
+                                             0, NULL, &size));
+        if (SUCCEEDED(hr) && size > 0) {
+          iccp->bytes = (uint8_t*)malloc(size);
+          if (iccp->bytes == NULL) {
+            hr = E_OUTOFMEMORY;
+            break;
+          }
+          iccp->size = size;
+          IFS(IWICColorContext_GetProfileBytes(color_contexts[i],
+                                               (UINT)iccp->size, iccp->bytes,
+                                               &size));
+          if (SUCCEEDED(hr) && size != iccp->size) {
+            fprintf(stderr, "Warning! ICC profile size (%u) != expected (%u)\n",
+                    size, (uint32_t)iccp->size);
+            iccp->size = size;
+          }
+          // Only the first non-empty profile is kept.
+          break;
+        }
+      }
+    }
+  }
+  for (i = 0; i < count; ++i) {
+    if (color_contexts[i] != NULL) IUnknown_Release(color_contexts[i]);
+  }
+  free(color_contexts);
+  return hr;
+}
+
+// Fills 'metadata' from 'frame'; for now only the ICC profile is extracted.
+// If that fails, MetadataFree() is called on 'metadata' before the failing
+// HRESULT is returned.
+static HRESULT ExtractMetadata(IWICImagingFactory* const factory,
+                               IWICBitmapFrameDecode* const frame,
+                               Metadata* const metadata) {
+  // TODO(jzern): add XMP/EXIF extraction.
+  MetadataPayload* const iccp = &metadata->iccp;
+  const HRESULT hr = ExtractICCP(factory, frame, iccp);
+  if (SUCCEEDED(hr)) return hr;
+
+  MetadataFree(metadata);
+  return hr;
+}
+
+// -----------------------------------------------------------------------------
+
+// Returns true if 'pixel_format' is one of the WIC indexed pixel formats
+// (1, 2, 4 or 8 bits per pixel).
+static int HasPalette(GUID pixel_format) {
+  static const GUID* kIndexedFormats[] = {
+    &GUID_WICPixelFormat1bppIndexed,
+    &GUID_WICPixelFormat2bppIndexed,
+    &GUID_WICPixelFormat4bppIndexed,
+    &GUID_WICPixelFormat8bppIndexed,
+    NULL
+  };
+  const GUID** candidate;
+  for (candidate = kIndexedFormats; *candidate != NULL; ++candidate) {
+    if (IsEqualGUID(MAKE_REFGUID(pixel_format),
+                    MAKE_REFGUID(**candidate))) {
+      return 1;
+    }
+  }
+  return 0;
+}
+
+// Returns true if the image described by 'pixel_format' carries alpha.
+// For non-indexed formats only 32bpp RGBA and BGRA qualify. For indexed
+// formats the frame-level and the container-level palettes are both queried
+// and alpha in either one is enough.
+static int HasAlpha(IWICImagingFactory* const factory,
+                    IWICBitmapDecoder* const decoder,
+                    IWICBitmapFrameDecode* const frame,
+                    GUID pixel_format) {
+  IWICPalette* frame_pal = NULL;
+  IWICPalette* global_pal = NULL;
+  BOOL frame_pal_alpha = FALSE;
+  BOOL global_pal_alpha = FALSE;
+
+  if (!HasPalette(pixel_format)) {
+    return IsEqualGUID(MAKE_REFGUID(pixel_format),
+                       MAKE_REFGUID(GUID_WICPixelFormat32bppRGBA_)) ||
+           IsEqualGUID(MAKE_REFGUID(pixel_format),
+                       MAKE_REFGUID(GUID_WICPixelFormat32bppBGRA_));
+  }
+
+  // A palette may exist at the frame or container level,
+  // check IWICPalette::HasAlpha() for both if present.
+  if (SUCCEEDED(IWICImagingFactory_CreatePalette(factory, &frame_pal)) &&
+      SUCCEEDED(IWICBitmapFrameDecode_CopyPalette(frame, frame_pal))) {
+    IWICPalette_HasAlpha(frame_pal, &frame_pal_alpha);
+  }
+  if (SUCCEEDED(IWICImagingFactory_CreatePalette(factory, &global_pal)) &&
+      SUCCEEDED(IWICBitmapDecoder_CopyPalette(decoder, global_pal))) {
+    IWICPalette_HasAlpha(global_pal, &global_pal_alpha);
+  }
+
+  if (frame_pal != NULL) IUnknown_Release(frame_pal);
+  if (global_pal != NULL) IUnknown_Release(global_pal);
+  return frame_pal_alpha || global_pal_alpha;
+}
+
+// Decodes the first frame of the image in 'filename' with WIC and imports it
+// into 'pic' (with use_argb set). Alpha is kept only when 'keep_alpha' is
+// non-zero, the container is BMP, PNG or TIFF, and HasAlpha() reports alpha
+// for the source pixel format. If 'metadata' is non-NULL, ExtractMetadata()
+// is run on the frame and its failure fails the whole read.
+// Returns true on success.
+int ReadPictureWithWIC(const char* const filename,
+                       WebPPicture* const pic, int keep_alpha,
+                       Metadata* const metadata) {
+  // From Microsoft SDK 6.0a -- ks.h
+  // Define a local copy to avoid link errors under mingw.
+  WEBP_DEFINE_GUID(GUID_NULL_, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
+  // Candidate target formats, tried in order; each table ends with a sentinel.
+  static const WICFormatImporter kAlphaFormatImporters[] = {
+    { &GUID_WICPixelFormat32bppBGRA_, 4, WebPPictureImportBGRA },
+    { &GUID_WICPixelFormat32bppRGBA_, 4, WebPPictureImportRGBA },
+    { NULL, 0, NULL },
+  };
+  static const WICFormatImporter kNonAlphaFormatImporters[] = {
+    { &GUID_WICPixelFormat24bppBGR_, 3, WebPPictureImportBGR },
+    { &GUID_WICPixelFormat24bppRGB_, 3, WebPPictureImportRGB },
+    { NULL, 0, NULL },
+  };
+  HRESULT hr = S_OK;
+  IWICBitmapFrameDecode* frame = NULL;
+  IWICFormatConverter* converter = NULL;
+  IWICImagingFactory* factory = NULL;
+  IWICBitmapDecoder* decoder = NULL;
+  IStream* stream = NULL;
+  UINT frame_count = 0;
+  UINT width = 0, height = 0;
+  BYTE* rgb = NULL;
+  WICPixelFormatGUID src_pixel_format = GUID_WICPixelFormatUndefined;
+  const WICFormatImporter* importer = NULL;
+  GUID src_container_format = GUID_NULL_;
+  // Containers for which alpha detection is attempted.
+  static const GUID* kAlphaContainers[] = {
+    &GUID_ContainerFormatBmp,
+    &GUID_ContainerFormatPng,
+    &GUID_ContainerFormatTiff,
+    NULL
+  };
+  int has_alpha = 0;
+  int stride;
+
+  // Every IFS() step below is skipped once 'hr' holds a failure, so the calls
+  // form a chain that stops doing work at the first error.
+  // NOTE(review): no matching CoUninitialize() is visible in this function.
+  IFS(CoInitialize(NULL));
+  IFS(CoCreateInstance(MAKE_REFGUID(CLSID_WICImagingFactory), NULL,
+                       CLSCTX_INPROC_SERVER,
+                       MAKE_REFGUID(IID_IWICImagingFactory),
+                       (LPVOID*)&factory));
+  if (hr == REGDB_E_CLASSNOTREG) {
+    fprintf(stderr,
+            "Couldn't access Windows Imaging Component (are you running "
+            "Windows XP SP3 or newer?). Most formats not available. "
+            "Use -s for the available YUV input.\n");
+  }
+  // Prepare for image decoding.
+  IFS(OpenInputStream(filename, &stream));
+  IFS(IWICImagingFactory_CreateDecoderFromStream(
+          factory, stream, NULL,
+          WICDecodeMetadataCacheOnDemand, &decoder));
+  IFS(IWICBitmapDecoder_GetFrameCount(decoder, &frame_count));
+  if (SUCCEEDED(hr) && frame_count == 0) {
+    fprintf(stderr, "No frame found in input file.\n");
+    hr = E_FAIL;
+  }
+  IFS(IWICBitmapDecoder_GetFrame(decoder, 0, &frame));
+  IFS(IWICBitmapFrameDecode_GetPixelFormat(frame, &src_pixel_format));
+  IFS(IWICBitmapDecoder_GetContainerFormat(decoder, &src_container_format));
+
+  if (keep_alpha) {
+    const GUID** guid;
+    for (guid = kAlphaContainers; *guid != NULL; ++guid) {
+      if (IsEqualGUID(MAKE_REFGUID(src_container_format),
+                      MAKE_REFGUID(**guid))) {
+        has_alpha = HasAlpha(factory, decoder, frame, src_pixel_format);
+        break;
+      }
+    }
+  }
+
+  // Prepare for pixel format conversion (if necessary).
+  IFS(IWICImagingFactory_CreateFormatConverter(factory, &converter));
+
+  // Pick the first importer whose pixel format the converter can produce from
+  // the source format. The loop is not entered if 'hr' is not S_OK.
+  for (importer = has_alpha ? kAlphaFormatImporters : kNonAlphaFormatImporters;
+       hr == S_OK && importer->import != NULL; ++importer) {
+    BOOL can_convert;
+    const HRESULT cchr = IWICFormatConverter_CanConvert(
+        converter,
+        MAKE_REFGUID(src_pixel_format),
+        MAKE_REFGUID(*importer->pixel_format),
+        &can_convert);
+    if (SUCCEEDED(cchr) && can_convert) break;
+  }
+  // Sentinel reached: no usable target format.
+  if (importer->import == NULL) hr = E_FAIL;
+
+  IFS(IWICFormatConverter_Initialize(converter, (IWICBitmapSource*)frame,
+                                     importer->pixel_format,
+                                     WICBitmapDitherTypeNone,
+                                     NULL, 0.0, WICBitmapPaletteTypeCustom));
+
+  // Decode.
+  IFS(IWICFormatConverter_GetSize(converter, &width, &height));
+  stride = importer->bytes_per_pixel * width * sizeof(*rgb);
+  if (SUCCEEDED(hr)) {
+    rgb = (BYTE*)malloc(stride * height);
+    if (rgb == NULL)
+      hr = E_OUTOFMEMORY;
+  }
+  IFS(IWICFormatConverter_CopyPixels(converter, NULL,
+                                     stride, stride * height, rgb));
+
+  // WebP conversion.
+  if (SUCCEEDED(hr)) {
+    int ok;
+    pic->width = width;
+    pic->height = height;
+    pic->use_argb = 1;
+    ok = importer->import(pic, rgb, stride);
+    if (!ok) hr = E_FAIL;
+  }
+  if (SUCCEEDED(hr)) {
+    if (metadata != NULL) {
+      hr = ExtractMetadata(factory, frame, metadata);
+      if (FAILED(hr)) {
+        fprintf(stderr, "Error extracting image metadata using WIC!\n");
+      }
+    }
+  }
+
+  // Cleanup.
+  if (converter != NULL) IUnknown_Release(converter);
+  if (frame != NULL) IUnknown_Release(frame);
+  if (decoder != NULL) IUnknown_Release(decoder);
+  if (factory != NULL) IUnknown_Release(factory);
+  if (stream != NULL) IUnknown_Release(stream);
+  free(rgb);
+  return SUCCEEDED(hr);
+}
+#else  // !HAVE_WINCODEC_H
+// Fallback used when HAVE_WINCODEC_H is not defined: nothing is decoded, a
+// diagnostic is written to stderr and failure (0) is always returned.
+int ReadPictureWithWIC(const char* const filename,
+                       struct WebPPicture* const pic, int keep_alpha,
+                       struct Metadata* const metadata) {
+  static const char kNoWicSupport[] =
+      "Windows Imaging Component (WIC) support not compiled. "
+      "Visual Studio and mingw-w64 builds support WIC. Make sure "
+      "wincodec.h detection is working correctly if using autoconf "
+      "and HAVE_WINCODEC_H is defined before building.\n";
+  // None of the arguments are consulted by this stub.
+  (void)metadata;
+  (void)keep_alpha;
+  (void)pic;
+  (void)filename;
+  fprintf(stderr, "%s", kNoWicSupport);
+  return 0;
+}
+#endif  // HAVE_WINCODEC_H
+
+// -----------------------------------------------------------------------------
--- a/examples/wicdec.h
+++ b/examples/wicdec.h
@ -0,0 +1,34 @@
+// Copyright 2013 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// Windows Imaging Component (WIC) decode.
+
+#ifndef WEBP_EXAMPLES_WICDEC_H_
+#define WEBP_EXAMPLES_WICDEC_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct Metadata;
+struct WebPPicture;
+
+// Reads an image from 'filename', returning the decoded output in 'pic'.
+// If 'keep_alpha' is true and the image has an alpha channel, the output is
+// RGBA, otherwise it will be RGB. If 'metadata' is non-NULL, metadata is also
+// extracted from the image into it. Returns true on success.
+int ReadPictureWithWIC(const char* const filename,
+                       struct WebPPicture* const pic, int keep_alpha,
+                       struct Metadata* const metadata);
+
+#ifdef __cplusplus
+}    // extern "C"
+#endif
+
+#endif  // WEBP_EXAMPLES_WICDEC_H_
--- a/iosbuild.sh
+++ b/iosbuild.sh
@ -0,0 +1,108 @@
+#!/bin/bash
+#
+# This script generates 'WebP.framework'. An iOS app can decode WebP images
+# by including 'WebP.framework'.
+#
+# Run ./iosbuild.sh to generate 'WebP.framework' under the current directory
+# (previous build will be erased if it exists).
+#
+# This script is inspired by the build script written by Carson McDonald.
+# (http://www.ioncannon.net/programming/1483/using-webp-to-reduce-native-ios-app-size/).
+
+set -e
+
+# Extract the latest SDK version from the final field of the form: iphoneosX.Y
+# (sorted numerically by major then minor, so that e.g. 10.0 ranks above 9.3).
+declare -r SDK=$(xcodebuild -showsdks | grep iphoneos \
+  | awk '{print substr($NF, 9)}' | sort -t. -k1,1n -k2,2n | tail -n 1)
+# Extract Xcode version.
+declare -r XCODE=$(xcodebuild -version | grep Xcode | cut -d " " -f2)
+
+declare -r OLDPATH=${PATH}
+
+# Add iPhoneOS-V6 to the list of platforms below if you need armv6 support.
+# Note that iPhoneOS-V6 support is not available with the iOS6 SDK.
+declare -r PLATFORMS="iPhoneSimulator iPhoneOS-V7 iPhoneOS-V7s"
+declare -r SRCDIR=$(dirname "$0")
+declare -r TOPDIR=$(pwd)
+declare -r BUILDDIR="${TOPDIR}/iosbuild"
+declare -r TARGETDIR="${TOPDIR}/WebP.framework"
+declare -r DEVELOPER=$(xcode-select --print-path)
+declare -r PLATFORMSROOT="${DEVELOPER}/Platforms"
+declare -r LIPO=$(xcrun -sdk iphoneos${SDK} -find lipo)
+LIBLIST=''
+
+if [[ -z "${SDK}" ]]; then
+  echo "iOS SDK not available"
+  exit 1
+elif [[ ${SDK%%.*} -lt 4 ]]; then  # numeric test on the major version
+  echo "You need iOS SDK version 4.0 or above"
+  exit 1
+else
+  echo "iOS SDK Version ${SDK}"
+fi
+
+rm -rf "${BUILDDIR}"  # quoted: the working directory may contain spaces
+rm -rf "${TARGETDIR}"
+mkdir -p "${BUILDDIR}"
+mkdir -p "${TARGETDIR}/Headers/"
+# Generate the configure script first if the source tree does not have one.
+[[ -e "${SRCDIR}/configure" ]] || (cd "${SRCDIR}" && sh autogen.sh)
+
+for PLATFORM in ${PLATFORMS}; do  # one static libwebpdecoder.a per platform
+  if [[ "${PLATFORM}" == "iPhoneOS-V7s" ]]; then
+    PLATFORM="iPhoneOS"
+    ARCH="armv7s"
+  elif [[ "${PLATFORM}" == "iPhoneOS-V7" ]]; then
+    PLATFORM="iPhoneOS"
+    ARCH="armv7"
+  elif [[ "${PLATFORM}" == "iPhoneOS-V6" ]]; then
+    PLATFORM="iPhoneOS"
+    ARCH="armv6"
+  else
+    ARCH="i386"
+  fi
+
+  ROOTDIR="${BUILDDIR}/${PLATFORM}-${SDK}-${ARCH}"
+  mkdir -p "${ROOTDIR}"
+
+  SDKROOT="${PLATFORMSROOT}/${PLATFORM}.platform/Developer/SDKs/${PLATFORM}${SDK}.sdk/"
+  CFLAGS="-arch ${ARCH} -pipe -isysroot ${SDKROOT}"
+  LDFLAGS="-arch ${ARCH} -pipe -isysroot ${SDKROOT}"
+  # Numeric major-version test; "5.0" keeps the pre-existing (lexical) outcome.
+  if [[ -z "${XCODE}" ]]; then
+    echo "XCODE not available"
+    exit 1
+  elif [[ ${SDK%%.*} -lt 5 || "${SDK}" == "5.0" ]]; then
+    DEVROOT="${PLATFORMSROOT}/${PLATFORM}.platform/Developer/"
+  else
+    DEVROOT="${DEVELOPER}/Toolchains/XcodeDefault.xctoolchain"
+    CFLAGS+=" -miphoneos-version-min=5.0"
+    LDFLAGS+=" -miphoneos-version-min=5.0"
+  fi
+
+  export CFLAGS
+  export LDFLAGS
+  export CXXFLAGS=${CFLAGS}
+  export PATH="${DEVROOT}/usr/bin:${OLDPATH}"
+
+  "${SRCDIR}/configure" --host=${ARCH}-apple-darwin --prefix="${ROOTDIR}" \
+    --build=$("${SRCDIR}/config.guess") \
+    --disable-shared --enable-static \
+    --enable-libwebpdecoder --enable-swap-16bit-csp
+
+  # run make only in the src/ directory to create libwebpdecoder.a
+  cd src/
+  make V=0
+  make install
+
+  LIBLIST+=" ${ROOTDIR}/lib/libwebpdecoder.a"
+
+  make clean
+  cd ..
+
+  export PATH=${OLDPATH}
+done
+
+cp -a "${SRCDIR}"/src/webp/* "${TARGETDIR}/Headers/"
+"${LIPO}" -create ${LIBLIST} -output "${TARGETDIR}/WebP"  # LIBLIST is a list
--- a/makefile.unix
+++ b/makefile.unix
@ -2,7 +2,8 @@
 # system, for simple local building of the libraries and tools.
 # It will not install the libraries system-wide, but just create the 'cwebp'
 # and 'dwebp' tools in the examples/ directory, along with the static
-# libraries 'src/libwebp.a' and 'src/mux/libwebpmux.a'.
+# libraries 'src/libwebp.a', 'src/libwebpdecoder.a', 'src/mux/libwebpmux.a' and
+# 'src/demux/libwebpdemux.a'.
 #
 # To build the library and examples, use:
 #    make -f makefile.unix
@ -10,15 +11,20 @@

 #### Customizable part ####

-# These flag assume you have libpng and libjpeg installed. If not, either
-# follow below install instructions or just comment out the next lines.
+# These flags assume you have libpng, libjpeg, libtiff and libgif installed. If
+# not, either follow the install instructions below or just comment out the next
+# four lines.
 EXTRA_FLAGS= -DWEBP_HAVE_PNG -DWEBP_HAVE_JPEG -DWEBP_HAVE_TIFF
-EXTRA_LIBS= -lpng -ltiff -ljpeg -lz
+DWEBP_LIBS= -lpng -lz
+CWEBP_LIBS= $(DWEBP_LIBS) -ljpeg -ltiff
+GIF_LIBS = -lgif
+
 ifeq ($(strip $(shell uname)), Darwin)
  # Work around a problem linking tables marked as common symbols,
  # cf., src/enc/yuv.[hc]
  # Failure observed with: gcc 4.2.1 and 4.0.1.
  EXTRA_FLAGS += -fno-common
+  EXTRA_FLAGS += -DHAVE_GLUT_GLUT_H
  EXTRA_FLAGS += -I/opt/local/include
  EXTRA_LIBS  += -L/opt/local/lib
  GL_LIBS = -framework GLUT -framework OpenGL
@ -26,16 +32,19 @@ else
  GL_LIBS = -lglut -lGL
 endif

+
 # To install libraries on Mac OS X:
 # 1. Install MacPorts (http://www.macports.org/install.php)
 # 2. Run "sudo port install jpeg"
 # 3. Run "sudo port install libpng"
 # 4. Run "sudo port install tiff"
+# 5. Run "sudo port install giflib"

 # To install libraries on Linux:
 # 1. Run "sudo apt-get install libjpeg62-dev"
 # 2. Run "sudo apt-get install libpng12-dev"
 # 3. Run "sudo apt-get install libtiff4-dev"
+# 4. Run "sudo apt-get install libgif-dev"

 # Uncomment for build for 32bit platform
 # Alternatively, you can just use the command
@ -45,6 +54,9 @@ endif
 # Extra flags to enable experimental features and code
 # EXTRA_FLAGS += -DWEBP_EXPERIMENTAL_FEATURES

+# Extra flags to enable byte swap for 16 bit colorspaces.
+# EXTRA_FLAGS += -DWEBP_SWAP_16BIT_CSP
+
 # Extra flags to enable multi-threading
 EXTRA_FLAGS += -DWEBP_USE_THREAD
 EXTRA_LIBS += -lpthread
@ -54,6 +66,7 @@ EXTRA_FLAGS += -Wextra -Wold-style-definition
 EXTRA_FLAGS += -Wmissing-prototypes
 EXTRA_FLAGS += -Wmissing-declarations
 EXTRA_FLAGS += -Wdeclaration-after-statement
+EXTRA_FLAGS += -Wshadow
 # EXTRA_FLAGS += -Wvla

 #### Nothing should normally be changed below this line ####
@ -66,7 +79,7 @@ CFLAGS = -O3 -DNDEBUG $(EXTRA_FLAGS)
 INSTALL = install
 GROFF = /usr/bin/groff
 COL = /usr/bin/col
-LDFLAGS = $(EXTRA_LIBS) -lm
+LDFLAGS = $(EXTRA_LIBS) $(EXTRA_FLAGS) -lm

 DEC_OBJS = \
    src/dec/alpha.o \
@ -81,18 +94,25 @@ DEC_OBJS = \
    src/dec/vp8l.o \
    src/dec/webp.o \

-DSP_OBJS = \
+DEMUX_OBJS = \
+    src/demux/demux.o \
+
+DSP_DEC_OBJS = \
    src/dsp/cpu.o \
    src/dsp/dec.o \
    src/dsp/dec_neon.o \
    src/dsp/dec_sse2.o \
-    src/dsp/enc.o \
-    src/dsp/enc_sse2.o \
    src/dsp/lossless.o \
    src/dsp/upsampling.o \
+    src/dsp/upsampling_neon.o \
    src/dsp/upsampling_sse2.o \
    src/dsp/yuv.o \

+DSP_ENC_OBJS = \
+    src/dsp/enc.o \
+    src/dsp/enc_neon.o \
+    src/dsp/enc_sse2.o \
+
 ENC_OBJS = \
    src/enc/alpha.o \
    src/enc/analysis.o \
@ -107,39 +127,61 @@ ENC_OBJS = \
    src/enc/picture.o \
    src/enc/quant.o \
    src/enc/syntax.o \
+    src/enc/token.o \
    src/enc/tree.o \
    src/enc/vp8l.o \
    src/enc/webpenc.o \

+EX_FORMAT_DEC_OBJS = \
+    examples/jpegdec.o \
+    examples/metadata.o \
+    examples/pngdec.o \
+    examples/tiffdec.o \
+
 EX_UTIL_OBJS = \
    examples/example_util.o \

+GIF2WEBP_UTIL_OBJS = \
+    examples/gif2webp_util.o \
+
 MUX_OBJS = \
-    src/mux/demux.o \
    src/mux/muxedit.o \
    src/mux/muxinternal.o \
    src/mux/muxread.o \

-UTILS_OBJS = \
+UTILS_DEC_OBJS = \
+    src/utils/alpha_processing.o \
    src/utils/bit_reader.o \
-    src/utils/bit_writer.o \
    src/utils/color_cache.o \
    src/utils/filters.o \
    src/utils/huffman.o \
-    src/utils/huffman_encode.o \
-    src/utils/quant_levels.o \
+    src/utils/quant_levels_dec.o \
+    src/utils/random.o \
    src/utils/rescaler.o \
    src/utils/thread.o \
+    src/utils/utils.o \

-LIBWEBP_OBJS = $(DEC_OBJS) $(DSP_OBJS) $(ENC_OBJS) $(UTILS_OBJS)
+UTILS_ENC_OBJS = \
+    src/utils/bit_writer.o \
+    src/utils/huffman_encode.o \
+    src/utils/quant_levels.o \
+
+LIBWEBPDECODER_OBJS = $(DEC_OBJS) $(DSP_DEC_OBJS) $(UTILS_DEC_OBJS)
+LIBWEBP_OBJS = $(LIBWEBPDECODER_OBJS) $(ENC_OBJS) $(DSP_ENC_OBJS) \
+               $(UTILS_ENC_OBJS)
 LIBWEBPMUX_OBJS = $(MUX_OBJS)
+LIBWEBPDEMUX_OBJS = $(DEMUX_OBJS)

 HDRS_INSTALLED = \
    src/webp/decode.h \
+    src/webp/demux.h \
    src/webp/encode.h \
+    src/webp/mux.h \
+    src/webp/mux_types.h \
    src/webp/types.h \

 HDRS = \
+    src/dec/alphai.h \
    src/dec/decode_vp8.h \
    src/dec/vp8i.h \
    src/dec/vp8li.h \
@ -149,6 +191,7 @@ HDRS = \
    src/dsp/yuv.h \
    src/enc/cost.h \
    src/enc/vp8enci.h \
+    src/utils/alpha_processing.h \
    src/utils/bit_reader.h \
    src/utils/bit_writer.h \
    src/utils/color_cache.h \
@ -156,57 +199,78 @@ HDRS = \
    src/utils/huffman.h \
    src/utils/huffman_encode.h \
    src/utils/quant_levels.h \
+    src/utils/quant_levels_dec.h \
+    src/utils/random.h \
    src/utils/rescaler.h \
    src/utils/thread.h \
    src/webp/format_constants.h \
-    src/webp/mux.h \
    $(HDRS_INSTALLED) \

-OUT_LIBS = examples/libexample_util.a src/libwebp.a
+OUT_LIBS = examples/libexample_util.a src/libwebpdecoder.a src/libwebp.a
 OUT_EXAMPLES = examples/cwebp examples/dwebp
+EXTRA_EXAMPLES = examples/gif2webp examples/vwebp examples/webpmux

 OUTPUT = $(OUT_LIBS) $(OUT_EXAMPLES)
 ifeq ($(MAKECMDGOALS),clean)
-  OUTPUT += examples/vwebp examples/webpmux src/mux/libwebpmux.a
+  OUTPUT += $(EXTRA_EXAMPLES)
+  OUTPUT += src/demux/libwebpdemux.a src/mux/libwebpmux.a
+  OUTPUT += examples/libgif2webp_util.a
 endif

-all: ex
+ex: $(OUT_EXAMPLES)
+all: ex $(EXTRA_EXAMPLES)
+
+$(EX_FORMAT_DEC_OBJS): %.o: %.h

 %.o: %.c $(HDRS)
 	$(CC) $(CFLAGS) $(CPPFLAGS) -c $< -o $@

 examples/libexample_util.a: $(EX_UTIL_OBJS)
+examples/libgif2webp_util.a: $(GIF2WEBP_UTIL_OBJS)
+src/libwebpdecoder.a: $(LIBWEBPDECODER_OBJS)
 src/libwebp.a: $(LIBWEBP_OBJS)
 src/mux/libwebpmux.a: $(LIBWEBPMUX_OBJS)
+src/demux/libwebpdemux.a: $(LIBWEBPDEMUX_OBJS)

 %.a:
 	$(AR) $(ARFLAGS) $@ $^

-ex: $(OUT_EXAMPLES)
-
-examples/cwebp: examples/cwebp.o
+examples/cwebp: examples/cwebp.o $(EX_FORMAT_DEC_OBJS)
 examples/dwebp: examples/dwebp.o
+examples/gif2webp: examples/gif2webp.o
 examples/vwebp: examples/vwebp.o
 examples/webpmux: examples/webpmux.o

 examples/cwebp: src/libwebp.a
-examples/dwebp: examples/libexample_util.a src/libwebp.a
-examples/vwebp: examples/libexample_util.a src/mux/libwebpmux.a src/libwebp.a
+examples/cwebp: EXTRA_LIBS += $(CWEBP_LIBS)
+examples/dwebp: examples/libexample_util.a src/libwebpdecoder.a
+examples/dwebp: EXTRA_LIBS += $(DWEBP_LIBS)
+examples/gif2webp: examples/libexample_util.a examples/libgif2webp_util.a
+examples/gif2webp: src/mux/libwebpmux.a src/libwebp.a
+examples/gif2webp: EXTRA_LIBS += $(GIF_LIBS)
+examples/gif2webp: EXTRA_FLAGS += -DWEBP_HAVE_GIF
+examples/vwebp: examples/libexample_util.a src/demux/libwebpdemux.a
+examples/vwebp: src/libwebp.a
 examples/vwebp: EXTRA_LIBS += $(GL_LIBS)
-examples/webpmux: examples/libexample_util.a src/mux/libwebpmux.a src/libwebp.a
+examples/vwebp: EXTRA_FLAGS += -DWEBP_HAVE_GL
+examples/webpmux: examples/libexample_util.a src/mux/libwebpmux.a
+examples/webpmux: src/libwebpdecoder.a

-$(OUT_EXAMPLES) examples/vwebp examples/webpmux:
+$(OUT_EXAMPLES) $(EXTRA_EXAMPLES):
 	$(CC) -o $@ $^ $(LDFLAGS)

 dist: DESTDIR := dist
+dist: OUT_EXAMPLES += $(EXTRA_EXAMPLES)
 dist: all
 	$(INSTALL) -m755 -d $(DESTDIR)/include/webp \
-	           $(DESTDIR)/doc $(DESTDIR)/lib
-	$(INSTALL) -m755 -s $(OUT_EXAMPLES) $(DESTDIR)
+	           $(DESTDIR)/bin $(DESTDIR)/doc $(DESTDIR)/lib
+	$(INSTALL) -m755 -s $(OUT_EXAMPLES) $(DESTDIR)/bin
 	$(INSTALL) -m644 $(HDRS_INSTALLED) $(DESTDIR)/include/webp
 	$(INSTALL) -m644 src/libwebp.a $(DESTDIR)/lib
+	$(INSTALL) -m644 src/demux/libwebpdemux.a $(DESTDIR)/lib
+	$(INSTALL) -m644 src/mux/libwebpmux.a $(DESTDIR)/lib
 	umask 022; \
-	for m in man/[cd]webp.1; do \
+	for m in man/[cd]webp.1 man/gif2webp.1 man/webpmux.1; do \
 	  basenam=$$(basename $$m .1); \
 	  $(GROFF) -t -e -man -T utf8 $$m \
 	    | $(COL) -bx >$(DESTDIR)/doc/$${basenam}.txt; \
@ -218,6 +282,7 @@ clean:
 	$(RM) $(OUTPUT) *~ \
              examples/*.o examples/*~ \
              src/dec/*.o src/dec/*~ \
+              src/demux/*.o src/demux/*~ \
              src/dsp/*.o src/dsp/*~ \
              src/enc/*.o src/enc/*~ \
              src/mux/*.o src/mux/*~ \
@ -233,7 +298,8 @@ superclean: clean
 	$(RM) Makefile */Makefile */*/Makefile
 	$(RM) Makefile.in */Makefile.in */*/Makefile.in
 	$(RM) config.log autom4te.cache libtool config.h stamp-h1
-	$(RM) aclocal.m4 compile config.guess config.h.in config.sub config.status
+	$(RM) aclocal.m4 compile
+	$(RM) config.guess config.h.in config.sub config.status
 	$(RM) configure depcomp install-sh ltmain.sh missing src/libwebp.pc
 	$(RM) m4/*

--- a/man/Makefile.am
+++ b/man/Makefile.am
@ -2,4 +2,7 @@ man_MANS = cwebp.1 dwebp.1
 if WANT_MUX
  man_MANS += webpmux.1
 endif
+if BUILD_GIF2WEBP
+  man_MANS += gif2webp.1
+endif
 EXTRA_DIST = $(man_MANS)
--- a/man/cwebp.1
+++ b/man/cwebp.1
@ -1,5 +1,5 @@
 .\"                                      Hey, EMACS: -*- nroff -*-
-.TH CWEBP 1 "July 19, 2012"
+.TH CWEBP 1 "December 12, 2013"
 .SH NAME
 cwebp \- compress an image file to a WebP file
 .SH SYNOPSIS
@ -16,7 +16,7 @@ Input format can be either PNG, JPEG, TIFF or raw Y'CbCr samples.
 .SH OPTIONS
 The basic options are:
 .TP
-.B \-o string
+.BI \-o " string
 Specify the name of the output WebP file. If omitted, \fBcwebp\fP will
 perform compression but only report statistics.
 .TP
@ -29,24 +29,30 @@ A summary of all the possible options.
 .B \-version
 Print the version number (as major.minor.revision) and exit.
 .TP
-.B \-q float
-Specify the compression factor for RGB channels between 0 and 100. A small
-factor produces a smaller file with lower quality. Best quality is achieved
-using a value of 100. The default is 75.
+.BI \-q " float
+Specify the compression factor for RGB channels between 0 and 100. The default
+is 75.
+.br
+In case of lossy compression (default), a small factor produces a smaller file
+with lower quality. Best quality is achieved by using a value of 100.
+.br
+In case of lossless compression (specified by the \-lossless option), a small
+factor enables faster compression speed, but produces a larger file. Maximum
+compression is achieved by using a value of 100.
 .TP
-.B \-alpha_q int
+.BI \-alpha_q " int
 Specify the compression factor for alpha compression between 0 and 100.
 Lossless compression of alpha is achieved using a value of 100, while the lower
 values result in a lossy compression. The default is 100.
 .TP
-.B \-f int
+.BI \-f " int
 Specify the strength of the deblocking filter, between 0 (no filtering)
 and 100 (maximum filtering). A value of 0 will turn off any filtering.
 Higher value will increase the strength of the filtering process applied
 after decoding the picture. The higher the value the smoother the picture will
 appear. Typical values are usually in the range of 20 to 50.
 .TP
-.B \-preset string
+.BI \-preset " string
 Specify a set of pre-defined parameters to suit a particular type of
 source material. Possible values are:  \fBdefault\fP, \fBphoto\fP,
 \fBpicture\fP, \fBdrawing\fP, \fBicon\fP, \fBtext\fP. Since
@ -54,22 +60,41 @@ source material. Possible values are:  \fBdefault\fP, \fBphoto\fP,
 \fB\-q\fP one), this option should preferably appear first in the
 order of the arguments.
 .TP
-.B \-sns int
+.BI \-sns " int
 Specify the amplitude of the spatial noise shaping. Spatial noise shaping
 (or \fBsns\fP for short) refers to a general collection of built-in algorithms
 used to decide which area of the picture should use relatively less bits,
 and where else to better transfer these bits. The possible range goes from
 0 (algorithm is off) to 100 (the maximal effect). The default value is 80.
 .TP
-.B \-m int
+.BI \-m " int
 Specify the compression method to use. This parameter controls the
 trade off between encoding speed and the compressed file size and quality.
 Possible values range from 0 to 6. Default value is 4.
 When higher values are used, the encoder will spend more time inspecting
 additional encoding possibilities and decide on the quality gain.
-Lower value can result is faster processing time at the expense of
+Lower value can result in faster processing time at the expense of
 larger file size and lower compression quality.
 .TP
+.B \-jpeg_like
+Change the internal parameter mapping to better match the expected size
+of JPEG compression. This flag will generally produce an output file of
+similar size to its JPEG equivalent (for the same \fB\-q\fP setting), but
+with less visual distortion.
+.TP
+.B \-mt
+Use multi-threading for encoding, if possible. This option is only effective
+when using lossy compression on a source with a transparency channel.
+.TP
+.B \-low_memory
+Reduce memory usage of lossy encoding by saving four times the compressed
+size (typically). This will make the encoding slower and the output slightly
+different in size and distortion. This flag is only effective for methods
+3 and up, and is off by default. Note that leaving this flag off will have
+some side effects on the bitstream: it forces certain bitstream features
+like number of partitions (forced to 1). Note that a more detailed report
+of bitstream size is printed by \fBcwebp\fP when using this option.
+.TP
 .B \-af
 Turns auto-filter on. This algorithm will spend additional time optimizing
 the filtering strength to reach a well-balanced quality.
@ -77,19 +102,25 @@ the filtering strength to reach a well-balanced quality.
 .SH ADDITIONAL OPTIONS
 More advanced options are:
 .TP
-.B \-sharpness int
+.BI \-sharpness " int
 Specify the sharpness of the filtering (if used).
 Range is 0 (sharpest) to 7 (least sharp). Default is 0.
 .TP
 .B \-strong
-Use a stronger filtering than the default one (if filtering is being
-used thanks to the \fB\-f\fP option). Strong filtering is off by default.
+Use strong filtering (if filtering is being used thanks to the
+\fB\-f\fP option). Strong filtering is on by default.
 .TP
-.B \-segments int
+.B \-nostrong
+Disable strong filtering (if filtering is being used thanks to the
+\fB\-f\fP option) and use simple filtering instead.
+.TP
+.BI \-segments " int
 Change the number of partitions to use during the segmentation of the
 sns algorithm. Segments should be in range 1 to 4. Default value is 4.
+This option has no effect for methods 3 and up, unless \fB\-low_memory\fP
+is used.
 .TP
-.B \-partition_limit int
+.BI \-partition_limit " int
 Degrade quality by limiting the number of bits used by some macroblocks.
 Range is 0 (no degradation, the default) to 100 (full degradation).
 Useful values are usually around 30-70 for moderately large images.
@ -108,39 +139,45 @@ If using \fB-partition_limit\fP is not enough to meet the 512k constraint, one
 should use less segments in order to save more header bits per macroblock.
 See the \fB-segments\fP option.
 .TP
-.B \-size int
+.BI \-size " int
 Specify a target size (in bytes) to try and reach for the compressed output.
 Compressor will make several pass of partial encoding in order to get as
 close as possible to this target.
 .TP
-.B \-psnr float
+.BI \-psnr " float
 Specify a target PSNR (in dB) to try and reach for the compressed output.
 Compressor will make several pass of partial encoding in order to get as
 close as possible to this target.
 .TP
-.B \-pass int
+.BI \-pass " int
 Set a maximum number of passes to use during the dichotomy used by
 options \fB\-size\fP or \fB\-psnr\fP. Maximum value is 10.
 .TP
-.B \-crop x_position y_position width height
+.BI \-resize " width height
+Resize the source to a rectangle with size \fBwidth\fP x \fBheight\fP.
+If either (but not both) of the \fBwidth\fP or \fBheight\fP parameters is 0,
+the value will be calculated preserving the aspect-ratio.
+.TP
+.BI \-crop " x_position y_position width height
 Crop the source to a rectangle with top-left corner at coordinates
 (\fBx_position\fP, \fBy_position\fP) and size \fBwidth\fP x \fBheight\fP.
 This cropping area must be fully contained within the source rectangle.
 .TP
-.B \-s width height
+.BI \-s " width height
 Specify that the input file actually consists of raw Y'CbCr samples following
 the ITU-R BT.601 recommendation, in 4:2:0 linear format.
 The luma plane has size \fBwidth\fP x \fBheight\fP.
 .TP
-.B \-map int
+.BI \-map " int
 Output additional ASCII-map of encoding information. Possible map values
 range from 1 to 6. This is only meant to help debugging.
 .TP
-.B \-pre int
-Specify a pre-processing filter. This option is a placeholder
-and has currently no effect.
+.BI \-pre " int
+Specify some pre-processing steps. Using a value of '2' will trigger
+quality-dependent pseudo-random dithering during RGBA->YUVA conversion
+(lossy compression only).
 .TP
-.B \-alpha_filter string
+.BI \-alpha_filter " string
 Specify the predictive filtering method for the alpha plane. One of 'none',
 \&'fast' or 'best', in increasing complexity and slowness order. Default is
 \&'fast'. Internally, alpha filtering is performed using four possible
@ -148,7 +185,7 @@ predictions (none, horizontal, vertical, gradient). The 'best' mode will try
 each mode in turn and pick the one which gives the smaller size. The 'fast'
 mode will just try to form an a-priori guess without testing all modes.
 .TP
-.B \-alpha_method int
+.BI \-alpha_method " int
 Specify the algorithm used for alpha compression: 0 or 1. Algorithm 0 denotes
 no compression, 1 uses WebP lossless format for compression. The default is 1.
 .TP
@ -156,15 +193,28 @@ no compression, 1 uses WebP lossless format for compression. The default is 1.
 Modify unseen RGB values under fully transparent area, to help compressibility.
 The default is off.
 .TP
+.BI \-blend_alpha " int
+This option blends the alpha channel (if present) with the source using the
+background color specified in hexadecimal as 0xrrggbb. The alpha channel is
+afterward reset to the opaque value 255.
+.TP
 .B \-noalpha
 Using this option will discard the alpha channel.
 .TP
 .B \-lossless
 Encode the image without any loss.
 .TP
-.B \-hint string
+.BI \-hint " string
 Specify the hint about input image type. Possible values are:
-\fBphoto\fP, and \fBpicture\fP.
+\fBphoto\fP, \fBpicture\fP or \fBgraph\fP.
+.TP
+.BI \-metadata " string
+A comma separated list of metadata to copy from the input to the output if
+present.
+Valid values: \fBall\fP, \fBnone\fP, \fBexif\fP, \fBicc\fP, \fBxmp\fP.
+The default is \fBnone\fP.
+
+Note: each input format may not support all combinations.
 .TP
 .B \-noasm
 Disable all assembly optimizations.
@ -176,7 +226,12 @@ Print extra information (encoding time in particular).
 Compute and report average PSNR (Peak-Signal-To-Noise ratio).
 .TP
 .B \-print_ssim
-Compute and report average SSIM (structural similarity metric)
+Compute and report average SSIM (structural similarity
+metric, see http://en.wikipedia.org/wiki/SSIM for additional details).
+.TP
+.B \-print_lsim
+Compute and report local similarity metric (sum of lowest error amongst the
+collocated pixel neighbors).
 .TP
 .B \-progress
 Report encoding progress in percent.
@ -199,7 +254,9 @@ cwebp \-q 50 -lossless picture.png \-o picture_lossless.webp
 .br
 cwebp \-q 70 picture_with_alpha.png \-o picture_with_alpha.webp
 .br
-cwebp \-sns 70 \-f 50 \-strong \-af \-size 60000 picture.png \-o picture.webp
+cwebp \-sns 70 \-f 50 \-size 60000 picture.png \-o picture.webp
+.br
+cwebp \-o picture.webp \-\- \-\-\-picture.png

 .SH AUTHORS
 \fBcwebp\fP was written by the WebP team.
@ -210,6 +267,8 @@ This manual page was written by Pascal Massimino <pascal.massimino@gmail.com>,
 for the Debian project (and may be used by others).

 .SH SEE ALSO
-.BR dwebp (1).
+.BR dwebp (1),
+.BR gif2webp (1)
 .br
-Please refer to http://code.google.com/speed/webp/ for additional information.
+Please refer to http://developers.google.com/speed/webp/ for additional
+information.
--- a/man/dwebp.1
+++ b/man/dwebp.1
@ -1,5 +1,5 @@
 .\"                                      Hey, EMACS: -*- nroff -*-
-.TH DWEBP 1 "January 24, 2012"
+.TH DWEBP 1 "December 12, 2013"
 .SH NAME
 dwebp \- decompress a WebP file to an image file
 .SH SYNOPSIS
@ -11,7 +11,7 @@ This manual page documents the
 .B dwebp
 command.
 .PP
-\fBdwebp\fP decompresses WebP files into PNG, PPM or PGM images.
+\fBdwebp\fP decompresses WebP files into PNG, PAM, PPM or PGM images.
 .SH OPTIONS
 The basic options are:
 .TP
@ -21,16 +21,32 @@ Print usage summary.
 .B \-version
 Print the version number (as major.minor.revision) and exit.
 .TP
-.B \-o string
+.BI \-o " string
 Specify the name of the output file (as PNG format by default).
+Using "-" as output name will direct output to 'stdout'.
+.TP
+.B \-bmp
+Change the output format to uncompressed BMP.
+.TP
+.B \-tiff
+Change the output format to uncompressed TIFF.
+.TP
+.B \-pam
+Change the output format to PAM (retains alpha).
 .TP
 .B \-ppm
-Change the output format to PPM.
+Change the output format to PPM (discards alpha).
 .TP
 .B \-pgm
-Change the output format to PGM. The output consist of luma/chroma
-samples instead of RGB, using the ICM4 layout. This option is mainly
-for verification and debugging purpose.
+Change the output format to PGM. The output consists of luma/chroma
+samples instead of RGB, using the IMC4 layout. This option is mainly
+for verification and debugging purposes.
+.TP
+.B \-yuv
+Change the output format to raw YUV. The output consists of
+luma/chroma-U/chroma-V samples instead of RGB, saved sequentially as
+individual planes. This option is mainly for verification and debugging
+purposes.
 .TP
 .B \-nofancy
 Don't use the fancy upscaler for YUV420. This may lead to jaggy
@ -39,12 +55,20 @@ edges (especially the red ones), but should be faster.
 .B \-nofilter
 Don't use the in-loop filtering process even if it is required by
 the bitstream. This may produce visible blocks on the non-compliant output,
-but will make the decoding faster.
+but it will make the decoding faster.
+.TP
+.BI \-dither " strength
+Specify a dithering \fBstrength\fP between 0 and 100. Dithering is a
+post-processing effect applied to chroma components in lossy compression.
+It helps by smoothing gradients and avoiding banding artifacts.
+.TP
+.B \-nodither
+Disable all dithering (default).
 .TP
 .B \-mt
 Use multi-threading for decoding, if possible.
 .TP
-.B \-crop x_position y_position width height
+.BI \-crop " x_position y_position width height
 Crop the decoded picture to a rectangle with top-left corner at coordinates
 (\fBx_position\fP, \fBy_position\fP) and size \fBwidth\fP x \fBheight\fP.
 This cropping area must be fully contained within the source rectangle.
@ -52,7 +76,7 @@ The top-left corner will be snapped to even coordinates if needed.
 This option is meant to reduce the memory needed for cropping large images.
 Note: the cropping is applied \fIbefore\fP any scaling.
 .TP
-.B \-scale width height
+.BI \-scale " width height
 Rescale the decoded picture to dimension \fBwidth\fP x \fBheight\fP. This
 option is mostly intended to reducing the memory needed to decode large images,
 when only a small version is needed (thumbnail, preview, etc.).  Note: scaling
@ -75,6 +99,8 @@ http://www.webmproject.org/code/contribute/submitting-patches/
 dwebp picture.webp \-o output.png
 .br
 dwebp picture.webp \-ppm \-o output.ppm
+.br
+dwebp \-o output.ppm \-\- \-\-\-picture.webp

 .SH AUTHORS
 \fBdwebp\fP was written by the WebP team.
@ -85,6 +111,17 @@ This manual page was written by Pascal Massimino <pascal.massimino@gmail.com>,
 for the Debian project (and may be used by others).

 .SH SEE ALSO
-.BR cwebp (1).
+.BR cwebp (1),
+.BR gif2webp (1),
+.BR webpmux (1)
 .br
-Please refer to http://code.google.com/speed/webp/ for additional information.
+Please refer to http://developers.google.com/speed/webp/ for additional
+information.
+.SS Output file format details
+PAM: http://netpbm.sourceforge.net/doc/pam.html
+.br
+PGM: http://netpbm.sourceforge.net/doc/pgm.html
+.br
+PPM: http://netpbm.sourceforge.net/doc/ppm.html
+.br
+PNG: http://www.libpng.org/pub/png/png-sitemap.html#info
--- a/man/gif2webp.1
+++ b/man/gif2webp.1
@ -0,0 +1,143 @@
+.\"                                      Hey, EMACS: -*- nroff -*-
+.TH GIF2WEBP 1 "December 17, 2013"
+.SH NAME
+gif2webp \- Convert a GIF image to WebP
+.SH SYNOPSIS
+.B gif2webp
+.RI [ options ] " input_file.gif \-o output_file.webp
+.br
+.SH DESCRIPTION
+This manual page documents the
+.B gif2webp
+command.
+.PP
+\fBgif2webp\fP converts a GIF image to a WebP image.
+.SH OPTIONS
+The basic options are:
+.TP
+.BI \-o " string
+Specify the name of the output WebP file. If omitted, \fBgif2webp\fP will
+perform conversion but only report statistics.
+.TP
+.B \-h, \-help
+Usage information.
+.TP
+.B \-version
+Print the version number (as major.minor.revision) and exit.
+.TP
+.B \-lossy
+Encode the image using lossy compression.
+.TP
+.B \-mixed
+Mixed compression mode: optimize compression of the image by picking either
+lossy or lossless compression for each frame heuristically.
+.TP
+.BI \-q " float
+Specify the compression factor for RGB channels between 0 and 100. The default
+is 75.
+.br
+In case of lossless compression (default), a small factor enables faster
+compression speed, but produces a larger file. Maximum compression is achieved
+by using a value of 100.
+.br
+In case of lossy compression (specified by the \-lossy option), a small factor
+produces a smaller file with lower quality. Best quality is achieved by using a
+value of 100.
+.TP
+.BI \-m " int
+Specify the compression method to use. This parameter controls the
+trade off between encoding speed and the compressed file size and quality.
+Possible values range from 0 to 6. Default value is 4.
+When higher values are used, the encoder will spend more time inspecting
+additional encoding possibilities and decide on the quality gain.
+Lower values can result in faster processing time at the expense of
+larger file size and lower compression quality.
+.TP
+.BI \-kmin " int
+.TP
+.BI \-kmax " int
+Specify the minimum and maximum distance between consecutive key frames
+(independently decodable frames) in the output animation. The tool will insert
+some key frames into the output animation as needed so that these criteria are
+satisfied.
+.br
+A 'kmin' value of 0 will turn off insertion of key frames.
+Typical values are in the range 3 to 30. Default values are kmin = 9,
+kmax = 17 for lossless compression and kmin = 3, kmax = 5 for lossy compression.
+.br
+These two options are relevant only for animated images with a large number
+of frames (>50).
+.br
+When lower values are used, more frames will be converted to key frames. This
+may lead to a smaller number of frames required to decode a frame on average,
+thereby improving the decoding performance. But this may lead to slightly bigger
+file sizes.
+Higher values may lead to worse decoding performance, but smaller file sizes.
+.br
+Some restrictions:
+.br
+(i) kmin < kmax,
+.br
+(ii) kmin >= kmax / 2 + 1 and
+.br
+(iii) kmax - kmin <= 30.
+.br
+If any of these restrictions are not met, they will be enforced automatically.
+.TP
+.BI \-metadata " string
+A comma separated list of metadata to copy from the input to the output if
+present.
+Valid values: \fBall\fP, \fBnone\fP, \fBicc\fP, \fBxmp\fP.
+The default is \fBxmp\fP.
+.TP
+.BI \-f " int
+For lossy encoding only (specified by the \-lossy option). Specify the strength
+of the deblocking filter, between 0 (no filtering) and 100 (maximum filtering).
+A value of 0 will turn off any filtering. A higher value will increase the
+strength of the filtering process applied after decoding the picture. The higher
+the value the smoother the picture will appear. Typical values are usually in
+the range of 20 to 50.
+.TP
+.B \-mt
+Use multi-threading for encoding, if possible. This option is only effective
+when using lossy compression.
+.TP
+.B \-v
+Print extra information.
+.TP
+.B \-quiet
+Do not print anything.
+
+.SH BUGS
+Please report all bugs to our issue tracker:
+http://code.google.com/p/webp/issues
+.br
+Patches welcome! See this page to get started:
+http://www.webmproject.org/code/contribute/submitting-patches/
+
+.SH EXAMPLES
+gif2webp picture.gif \-o picture.webp
+.br
+gif2webp \-q 70 picture.gif \-o picture.webp
+.br
+gif2webp \-lossy \-m 3 picture.gif \-o picture_lossy.webp
+.br
+gif2webp \-lossy \-f 50 picture.gif \-o picture.webp
+.br
+gif2webp \-q 70 \-o picture.webp \-\- \-\-\-picture.gif
+
+.SH AUTHORS
+\fBgif2webp\fP was written by the WebP team.
+.br
+The latest source tree is available at http://www.webmproject.org/code
+.PP
+This manual page was written by Urvang Joshi <urvang@google.com>, for the
+Debian project (and may be used by others).
+
+.SH SEE ALSO
+.BR cwebp (1),
+.BR dwebp (1),
+.BR webpmux (1)
+.br
+Please refer to http://developers.google.com/speed/webp/ for additional
+information.
--- a/man/webpmux.1
+++ b/man/webpmux.1
@ -1,5 +1,5 @@
 .\"                                      Hey, EMACS: -*- nroff -*-
-.TH WEBPMUX 1 "January 24, 2012"
+.TH WEBPMUX 1 "December 17, 2013"
 .SH NAME
 webpmux \- command line tool to create WebP Mux/container file.
 .SH SYNOPSIS
@ -21,22 +21,25 @@ webpmux \- command line tool to create WebP Mux/container file.
 .B \-o
 .I OUTPUT
 .br
-.B webpmux \-tile
-.I TILE_OPTIONS
-.B [\-tile...] \-o
-.I OUTPUT
-.br
 .B webpmux \-frame
 .I FRAME_OPTIONS
-.B [\-frame...] \-loop
+.B [ \-frame ... ] [ \-loop
 .I LOOP_COUNT
-.B \-o
+.B ]
+.br
+.RS 8
+.B [ \-bgcolor
+.I BACKGROUND_COLOR
+.B ] \-o
 .I OUTPUT
+.RE
 .br
 .B webpmux \-info
 .I INPUT
 .br
 .B webpmux [\-h|\-help]
+.br
+.B webpmux \-version
 .SH DESCRIPTION
 This manual page documents the
 .B webpmux
@ -48,47 +51,67 @@ and extract/strip relevant data from the container file.
 .SS GET_OPTIONS (\-get):
 .TP
 .B icc
-Get ICC Color profile.
+Get ICC profile.
+.TP
+.B exif
+Get EXIF metadata.
 .TP
 .B xmp
 Get XMP metadata.
 .TP
-.B tile n
-Get nth tile.
-.TP
-.B frame n
+.BI frame " n
 Get nth frame.

 .SS SET_OPTIONS (\-set)
 .TP
-.B icc
-Set ICC Color profile.
+.BI icc " file.icc
+Set ICC profile.
+.P
+Where: 'file.icc' contains the ICC profile to be set.
 .TP
-.B xmp
+.BI exif " file.exif
+Set EXIF metadata.
+.P
+Where: 'file.exif' contains the EXIF metadata to be set.
+.TP
+.BI xmp " file.xmp
 Set XMP metadata.
+.P
+Where: 'file.xmp' contains the XMP metadata to be set.

 .SS STRIP_OPTIONS (\-strip)
 .TP
 .B icc
-Strip ICC Color profile.
+Strip ICC profile.
+.TP
+.B exif
+Strip EXIF metadata.
 .TP
 .B xmp
 Strip XMP metadata.

-.SS TILE_OPTIONS (\-tile)
-.TP
-.B file_i +xi+yi
-Where: 'file_i' is the i'th tile (webp format) and 'xi','yi' specify the image
-offset for this tile.
-
 .SS FRAME_OPTIONS (\-frame)
 .TP
-.B file_i +xi+yi+di
-Where: 'file_i' is the i'th frame (webp format), 'xi','yi' specify the image
-offset for this frame and 'di' is the pause duration before next frame.
+.I file_i +di[+xi+yi[+mi[bi]]]
+Where: 'file_i' is the i'th frame (WebP format), 'xi','yi' specify the image
+offset for this frame, 'di' is the pause duration before next frame, 'mi' is
+the dispose method for this frame (0 for NONE or 1 for BACKGROUND) and 'bi' is
+the blending method for this frame (+b for BLEND or -b for NO_BLEND).
+Argument 'bi' can be omitted and will default to +b (BLEND).
+Also, 'mi' can be omitted if 'bi' is omitted and will default to 0 (NONE).
+Finally, if 'mi' and 'bi' are omitted then 'xi' and 'yi' can be omitted and will
+default to +0+0.
 .TP
-.B \-loop n
+.BI \-loop " n
 Loop the frames n number of times. 0 indicates the frames should loop forever.
+Valid range is 0 to 65535 [Default: 0 (infinite)].
+.TP
+.BI \-bgcolor " A,R,G,B
+Background color of the canvas.
+.br
+Where: 'A', 'R', 'G' and 'B' are integers in the range 0 to 255 specifying the
+Alpha, Red, Green and Blue component values respectively.
+[Default: 255,255,255,255].

 .SS INPUT
 .TP
@ -98,6 +121,10 @@ Input file in WebP format.
 .TP
 Output file in WebP format.

+.SS Note:
+.TP
+The nature of EXIF, XMP and ICC data is not checked and is assumed to be valid.
+
 .SH BUGS
 Please report all bugs to our issue tracker:
 http://code.google.com/p/webp/issues
@ -110,14 +137,36 @@ webpmux \-set icc image_profile.icc in.webp \-o icc_container.webp
 .br
 webpmux \-get icc icc_container.webp \-o image_profile.icc
 .br
+webpmux \-strip icc icc_container.webp \-o without_icc.webp
+.br
 webpmux \-set xmp image_metadata.xmp in.webp \-o xmp_container.webp
 .br
 webpmux \-get xmp xmp_container.webp \-o image_metadata.xmp
 .br
-webpmux \-frame anim_1.webp +0+0+0 \-frame anim_2.webp +50+50+0 \-loop 10
+webpmux \-strip xmp xmp_container.webp \-o without_xmp.webp
+.br
+webpmux \-set exif image_metadata.exif in.webp \-o exif_container.webp
+.br
+webpmux \-get exif exif_container.webp \-o image_metadata.exif
+.br
+webpmux \-strip exif exif_container.webp \-o without_exif.webp
+.br
+webpmux \-frame anim_1.webp +100 \-frame anim_2.webp +100+50+50
+.br
+.RS 8
+\-frame anim_2.webp +100+50+50+1+b \-loop 10 \-bgcolor 255,255,255,255
+.br
+.RS 8
 \-o anim_container.webp
+.RE
+.RE
 .br
 webpmux \-get frame 2 anim_container.webp \-o frame_2.webp
+.br
+webpmux \-set icc image_profile.icc \-o icc_container.webp \-\- \-\-\-in.webp
+.br
+webpmux \-get icc \-o image_profile.icc \-\- \-\-\-icc_container.webp
+.br
+webpmux \-strip icc \-o without_icc.webp \-\- \-\-\-icc_container.webp

 .SH AUTHORS
 \fBwebpmux\fP is written by the WebP team.
@ -128,7 +177,9 @@ This manual page was written by Vikas Arora <vikaas.arora@gmail.com>,
 for the Debian project (and may be used by others).

 .SH SEE ALSO
+.BR cwebp (1),
 .BR dwebp (1),
-.BR cwebp (1).
+.BR gif2webp (1)
 .br
-Please refer to http://code.google.com/speed/webp/ for additional information.
+Please refer to http://developers.google.com/speed/webp/ for additional
+information.
--- a/src/Makefile.am
+++ b/src/Makefile.am
@ -1,16 +1,28 @@
-SUBDIRS = dec enc dsp utils
+# The mux and demux libraries depend on libwebp, thus the '.' to force the
+# build order so it's available to them.
+SUBDIRS = dec enc dsp utils .
 if WANT_MUX
  SUBDIRS += mux
 endif
+if WANT_DEMUX
+  SUBDIRS += demux
+endif

 AM_CPPFLAGS = -I$(top_srcdir)/src
 lib_LTLIBRARIES = libwebp.la

+if BUILD_LIBWEBPDECODER
+  lib_LTLIBRARIES += libwebpdecoder.la
+endif
+
+common_HEADERS =
+common_HEADERS += webp/decode.h
+common_HEADERS += webp/types.h
+commondir = $(includedir)/webp
+
 libwebp_la_SOURCES =
 libwebpinclude_HEADERS =
-libwebpinclude_HEADERS += webp/decode.h
 libwebpinclude_HEADERS += webp/encode.h
-libwebpinclude_HEADERS += webp/types.h
 noinst_HEADERS =
 noinst_HEADERS += webp/format_constants.h

@ -20,8 +32,24 @@ libwebp_la_LIBADD += dsp/libwebpdsp.la
 libwebp_la_LIBADD += enc/libwebpencode.la
 libwebp_la_LIBADD += utils/libwebputils.la

-libwebp_la_LDFLAGS = -version-info 3:0:0
+# Use '-no-undefined' to declare that libwebp does not depend on any libraries
+# other than the ones listed on the command line, i.e., after linking, it will
+# not have unresolved symbols. Some platforms (Windows among them) require all
+# symbols in shared libraries to be resolved at library creation.
+libwebp_la_LDFLAGS = -no-undefined -version-info 5:0:0
 libwebpincludedir = $(includedir)/webp
-
 pkgconfig_DATA = libwebp.pc
+
+if BUILD_LIBWEBPDECODER
+  libwebpdecoder_la_SOURCES =
+
+  libwebpdecoder_la_LIBADD =
+  libwebpdecoder_la_LIBADD += dec/libwebpdecode.la
+  libwebpdecoder_la_LIBADD += dsp/libwebpdspdecode.la
+  libwebpdecoder_la_LIBADD += utils/libwebputilsdecode.la
+
+  libwebpdecoder_la_LDFLAGS = -no-undefined -version-info 1:0:0
+  pkgconfig_DATA += libwebpdecoder.pc
+endif
+
 ${pkgconfig_DATA}: ${top_builddir}/config.status
--- a/src/dec/Makefile.am
+++ b/src/dec/Makefile.am
@ -3,6 +3,7 @@ noinst_LTLIBRARIES = libwebpdecode.la

 libwebpdecode_la_SOURCES =
 libwebpdecode_la_SOURCES += alpha.c
+libwebpdecode_la_SOURCES += alphai.h
 libwebpdecode_la_SOURCES += buffer.c
 libwebpdecode_la_SOURCES += decode_vp8.h
 libwebpdecode_la_SOURCES += frame.c
--- a/src/dec/alpha.c
+++ b/src/dec/alpha.c
@ -1,8 +1,10 @@
 // Copyright 2011 Google Inc. All Rights Reserved.
 //
-// This code is licensed under the same terms as WebM:
-//  Software License Agreement:  http://www.webmproject.org/license/software/
-//  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
 // -----------------------------------------------------------------------------
 //
 // Alpha-plane decompression.
@ -10,131 +12,150 @@
 // Author: Skal (pascal.massimino@gmail.com)

 #include <stdlib.h>
+#include "./alphai.h"
 #include "./vp8i.h"
 #include "./vp8li.h"
-#include "../utils/filters.h"
-#include "../utils/quant_levels.h"
+#include "../utils/quant_levels_dec.h"
 #include "../webp/format_constants.h"

-#if defined(__cplusplus) || defined(c_plusplus)
-extern "C" {
-#endif
+//------------------------------------------------------------------------------
+// ALPHDecoder object.

-// TODO(skal): move to dsp/ ?
-static void CopyPlane(const uint8_t* src, int src_stride,
-                      uint8_t* dst, int dst_stride, int width, int height) {
-  while (height-- > 0) {
-    memcpy(dst, src, width);
-    src += src_stride;
-    dst += dst_stride;
+ALPHDecoder* ALPHNew(void) {
+  ALPHDecoder* const dec = (ALPHDecoder*)calloc(1, sizeof(*dec));
+  return dec;
+}
+
+void ALPHDelete(ALPHDecoder* const dec) {
+  if (dec != NULL) {
+    VP8LDelete(dec->vp8l_dec_);
+    dec->vp8l_dec_ = NULL;
+    free(dec);
  }
 }

 //------------------------------------------------------------------------------
-// Decodes the compressed data 'data' of size 'data_size' into the 'output'.
-// The 'output' buffer should be pre-allocated and must be of the same
-// dimension 'height'x'stride', as that of the image.
-//
-// Returns 1 on successfully decoding the compressed alpha and
-//         0 if either:
-//           error in bit-stream header (invalid compression mode or filter), or
-//           error returned by appropriate compression method.
+// Decoding.

-static int DecodeAlpha(const uint8_t* data, size_t data_size,
-                       int width, int height, int stride, uint8_t* output) {
-  uint8_t* decoded_data = NULL;
-  const size_t decoded_size = height * width;
-  uint8_t* unfiltered_data = NULL;
-  WEBP_FILTER_TYPE filter;
-  int pre_processing;
-  int rsrv;
+// Initialize alpha decoding by parsing the alpha header and decoding the image
+// header for alpha data stored using lossless compression.
+// Returns false in case of error in alpha header (data too short, invalid
+// compression method or filter, error in lossless header data etc).
+static int ALPHInit(ALPHDecoder* const dec, const uint8_t* data,
+                    size_t data_size, int width, int height, uint8_t* output) {
  int ok = 0;
-  int method;
+  const uint8_t* const alpha_data = data + ALPHA_HEADER_LEN;
+  const size_t alpha_data_size = data_size - ALPHA_HEADER_LEN;
+  int rsrv;

-  assert(width > 0 && height > 0 && stride >= width);
+  assert(width > 0 && height > 0);
  assert(data != NULL && output != NULL);

+  dec->width_ = width;
+  dec->height_ = height;
+
  if (data_size <= ALPHA_HEADER_LEN) {
    return 0;
  }

-  method = (data[0] >> 0) & 0x03;
-  filter = (data[0] >> 2) & 0x03;
-  pre_processing = (data[0] >> 4) & 0x03;
+  dec->method_ = (data[0] >> 0) & 0x03;
+  dec->filter_ = (data[0] >> 2) & 0x03;
+  dec->pre_processing_ = (data[0] >> 4) & 0x03;
  rsrv = (data[0] >> 6) & 0x03;
-  if (method < ALPHA_NO_COMPRESSION ||
-      method > ALPHA_LOSSLESS_COMPRESSION ||
-      filter >= WEBP_FILTER_LAST ||
-      pre_processing > ALPHA_PREPROCESSED_LEVELS ||
+  if (dec->method_ < ALPHA_NO_COMPRESSION ||
+      dec->method_ > ALPHA_LOSSLESS_COMPRESSION ||
+      dec->filter_ >= WEBP_FILTER_LAST ||
+      dec->pre_processing_ > ALPHA_PREPROCESSED_LEVELS ||
      rsrv != 0) {
    return 0;
  }

-  if (method == ALPHA_NO_COMPRESSION) {
-    ok = (data_size >= decoded_size);
-    decoded_data = (uint8_t*)data + ALPHA_HEADER_LEN;
+  if (dec->method_ == ALPHA_NO_COMPRESSION) {
+    const size_t alpha_decoded_size = dec->width_ * dec->height_;
+    ok = (alpha_data_size >= alpha_decoded_size);
  } else {
-    decoded_data = (uint8_t*)malloc(decoded_size);
-    if (decoded_data == NULL) return 0;
-    ok = VP8LDecodeAlphaImageStream(width, height,
-                                    data + ALPHA_HEADER_LEN,
-                                    data_size - ALPHA_HEADER_LEN,
-                                    decoded_data);
-  }
-
-  if (ok) {
-    WebPFilterFunc unfilter_func = WebPUnfilters[filter];
-    if (unfilter_func != NULL) {
-      unfiltered_data = (uint8_t*)malloc(decoded_size);
-      if (unfiltered_data == NULL) {
-        ok = 0;
-        goto Error;
-      }
-      // TODO(vikas): Implement on-the-fly decoding & filter mechanism to decode
-      // and apply filter per image-row.
-      unfilter_func(decoded_data, width, height, 1, width, unfiltered_data);
-      // Construct raw_data (height x stride) from alpha data (height x width).
-      CopyPlane(unfiltered_data, width, output, stride, width, height);
-      free(unfiltered_data);
-    } else {
-      // Construct raw_data (height x stride) from alpha data (height x width).
-      CopyPlane(decoded_data, width, output, stride, width, height);
-    }
-    if (pre_processing == ALPHA_PREPROCESSED_LEVELS) {
-      ok = DequantizeLevels(decoded_data, width, height);
-    }
-  }
-
- Error:
-  if (method != ALPHA_NO_COMPRESSION) {
-    free(decoded_data);
+    assert(dec->method_ == ALPHA_LOSSLESS_COMPRESSION);
+    ok = VP8LDecodeAlphaHeader(dec, alpha_data, alpha_data_size, output);
  }
  return ok;
 }

+// Decodes, unfilters and dequantizes *at least* 'num_rows' rows of alpha
+// starting from row number 'row'. It assumes that rows up to (row - 1) have
+// already been decoded.
+// Returns false in case of bitstream error.
+static int ALPHDecode(VP8Decoder* const dec, int row, int num_rows) {
+  ALPHDecoder* const alph_dec = dec->alph_dec_;
+  const int width = alph_dec->width_;
+  const int height = alph_dec->height_;
+  WebPUnfilterFunc unfilter_func = WebPUnfilters[alph_dec->filter_];
+  uint8_t* const output = dec->alpha_plane_;
+  if (alph_dec->method_ == ALPHA_NO_COMPRESSION) {
+    const size_t offset = row * width;
+    const size_t num_pixels = num_rows * width;
+    assert(dec->alpha_data_size_ >= ALPHA_HEADER_LEN + offset + num_pixels);
+    memcpy(dec->alpha_plane_ + offset,
+           dec->alpha_data_ + ALPHA_HEADER_LEN + offset, num_pixels);
+  } else {  // alph_dec->method_ == ALPHA_LOSSLESS_COMPRESSION
+    assert(alph_dec->vp8l_dec_ != NULL);
+    if (!VP8LDecodeAlphaImageStream(alph_dec, row + num_rows)) {
+      return 0;
+    }
+  }
+
+  if (unfilter_func != NULL) {
+    unfilter_func(width, height, width, row, num_rows, output);
+  }
+
+  if (alph_dec->pre_processing_ == ALPHA_PREPROCESSED_LEVELS) {
+    if (!DequantizeLevels(output, width, height, row, num_rows)) {
+      return 0;
+    }
+  }
+
+  if (row + num_rows == dec->pic_hdr_.height_) {
+    dec->is_alpha_decoded_ = 1;
+  }
+  return 1;
+}
+
 //------------------------------------------------------------------------------
+// Main entry point.

 const uint8_t* VP8DecompressAlphaRows(VP8Decoder* const dec,
                                      int row, int num_rows) {
-  const int stride = dec->pic_hdr_.width_;
+  const int width = dec->pic_hdr_.width_;
+  const int height = dec->pic_hdr_.height_;

-  if (row < 0 || num_rows < 0 || row + num_rows > dec->pic_hdr_.height_) {
+  if (row < 0 || num_rows <= 0 || row + num_rows > height) {
    return NULL;    // sanity check.
  }

  if (row == 0) {
-    // Decode everything during the first call.
-    if (!DecodeAlpha(dec->alpha_data_, (size_t)dec->alpha_data_size_,
-                     dec->pic_hdr_.width_, dec->pic_hdr_.height_, stride,
-                     dec->alpha_plane_)) {
-      return NULL;  // Error.
+    // Initialize decoding.
+    assert(dec->alpha_plane_ != NULL);
+    dec->alph_dec_ = ALPHNew();
+    if (dec->alph_dec_ == NULL) return NULL;
+    if (!ALPHInit(dec->alph_dec_, dec->alpha_data_, dec->alpha_data_size_,
+                  width, height, dec->alpha_plane_)) {
+      ALPHDelete(dec->alph_dec_);
+      dec->alph_dec_ = NULL;
+      return NULL;
    }
  }

+  if (!dec->is_alpha_decoded_) {
+    int ok = 0;
+    assert(dec->alph_dec_ != NULL);
+    ok = ALPHDecode(dec, row, num_rows);
+    if (!ok || dec->is_alpha_decoded_) {
+      ALPHDelete(dec->alph_dec_);
+      dec->alph_dec_ = NULL;
+    }
+    if (!ok) return NULL;  // Error.
+  }
+
  // Return a pointer to the current decoded row.
-  return dec->alpha_plane_ + row * stride;
+  return dec->alpha_plane_ + row * width;
 }

-#if defined(__cplusplus) || defined(c_plusplus)
-}    // extern "C"
-#endif
--- a/src/dec/alphai.h
+++ b/src/dec/alphai.h
@ -0,0 +1,55 @@
+// Copyright 2013 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// Alpha decoder: internal header.
+//
+// Author: Urvang (urvang@google.com)
+
+#ifndef WEBP_DEC_ALPHAI_H_
+#define WEBP_DEC_ALPHAI_H_
+
+#include "./webpi.h"
+#include "../utils/filters.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct VP8LDecoder;  // Defined in dec/vp8li.h.
+
+typedef struct ALPHDecoder ALPHDecoder;
+struct ALPHDecoder {
+  int width_;
+  int height_;
+  int method_;
+  WEBP_FILTER_TYPE filter_;
+  int pre_processing_;
+  struct VP8LDecoder* vp8l_dec_;
+  VP8Io io_;
+  int use_8b_decode;  // Although alpha channel requires only 1 byte per
+                      // pixel, sometimes VP8LDecoder may need to allocate
+                      // 4 bytes per pixel internally during decode.
+};
+
+//------------------------------------------------------------------------------
+// internal functions. Not public.
+
+// Allocates a new alpha decoder instance.
+ALPHDecoder* ALPHNew(void);
+
+// Clears and deallocates an alpha decoder instance.
+void ALPHDelete(ALPHDecoder* const dec);
+
+//------------------------------------------------------------------------------
+
+#ifdef __cplusplus
+}    // extern "C"
+#endif
+
+#endif  /* WEBP_DEC_ALPHAI_H_ */
--- a/src/dec/buffer.c
+++ b/src/dec/buffer.c
@ -1,8 +1,10 @@
 // Copyright 2011 Google Inc. All Rights Reserved.
 //
-// This code is licensed under the same terms as WebM:
-//  Software License Agreement:  http://www.webmproject.org/license/software/
-//  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
 // -----------------------------------------------------------------------------
 //
 // Everything about WebPDecBuffer
@ -13,10 +15,7 @@

 #include "./vp8i.h"
 #include "./webpi.h"
-
-#if defined(__cplusplus) || defined(c_plusplus)
-extern "C" {
-#endif
+#include "../utils/utils.h"

 //------------------------------------------------------------------------------
 // WebPDecBuffer
@ -50,18 +49,23 @@ static VP8StatusCode CheckDecBuffer(const WebPDecBuffer* const buffer) {
    ok &= (y_size <= buf->y_size);
    ok &= (u_size <= buf->u_size);
    ok &= (v_size <= buf->v_size);
-    ok &= (a_size <= buf->a_size);
    ok &= (buf->y_stride >= width);
    ok &= (buf->u_stride >= (width + 1) / 2);
    ok &= (buf->v_stride >= (width + 1) / 2);
-    if (buf->a) {
+    ok &= (buf->y != NULL);
+    ok &= (buf->u != NULL);
+    ok &= (buf->v != NULL);
+    if (mode == MODE_YUVA) {
      ok &= (buf->a_stride >= width);
+      ok &= (a_size <= buf->a_size);
+      ok &= (buf->a != NULL);
    }
  } else {    // RGB checks
    const WebPRGBABuffer* const buf = &buffer->u.RGBA;
    const uint64_t size = (uint64_t)buf->stride * height;
    ok &= (size <= buf->size);
    ok &= (buf->stride >= width * kModeBpp[mode]);
+    ok &= (buf->rgba != NULL);
  }
  return ok ? VP8_STATUS_OK : VP8_STATUS_INVALID_PARAM;
 }
@ -95,14 +99,11 @@ static VP8StatusCode AllocateBuffer(WebPDecBuffer* const buffer) {
    total_size = size + 2 * uv_size + a_size;

    // Security/sanity checks
-    if (((size_t)total_size != total_size) || (total_size >= (1ULL << 40))) {
-      return VP8_STATUS_INVALID_PARAM;
-    }
-
-    buffer->private_memory = output = (uint8_t*)malloc((size_t)total_size);
+    output = (uint8_t*)WebPSafeMalloc(total_size, sizeof(*output));
    if (output == NULL) {
      return VP8_STATUS_OUT_OF_MEMORY;
    }
+    buffer->private_memory = output;

    if (!WebPIsRGBMode(mode)) {   // YUVA initialization
      WebPYUVABuffer* const buf = &buffer->u.YUVA;
@ -207,6 +208,3 @@ void WebPGrabDecBuffer(WebPDecBuffer* const src, WebPDecBuffer* const dst) {

 //------------------------------------------------------------------------------

-#if defined(__cplusplus) || defined(c_plusplus)
-}    // extern "C"
-#endif
--- a/src/dec/decode_vp8.h
+++ b/src/dec/decode_vp8.h
@ -1,8 +1,10 @@
 // Copyright 2010 Google Inc. All Rights Reserved.
 //
-// This code is licensed under the same terms as WebM:
-//  Software License Agreement:  http://www.webmproject.org/license/software/
-//  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
 // -----------------------------------------------------------------------------
 //
 //  Low-level API for VP8 decoder
@ -14,7 +16,7 @@

 #include "../webp/decode.h"

-#if defined(__cplusplus) || defined(c_plusplus)
+#ifdef __cplusplus
 extern "C" {
 #endif

@ -130,7 +132,8 @@ static WEBP_INLINE int VP8InitIo(VP8Io* const io) {
  return VP8InitIoInternal(io, WEBP_DECODER_ABI_VERSION);
 }

-// Start decoding a new picture. Returns true if ok.
+// Decode the VP8 frame header. Returns true if ok.
+// Note: 'io->data' must be pointing to the start of the VP8 frame header.
 int VP8GetHeaders(VP8Decoder* const dec, VP8Io* const io);

 // Decode a picture. Will call VP8GetHeaders() if it wasn't done already.
@ -175,7 +178,7 @@ WEBP_EXTERN(int) VP8LGetInfo(
    const uint8_t* data, size_t data_size,  // data available so far
    int* const width, int* const height, int* const has_alpha);

-#if defined(__cplusplus) || defined(c_plusplus)
+#ifdef __cplusplus
 }    // extern "C"
 #endif

--- a/src/dec/frame.c
+++ b/src/dec/frame.c
@ -1,8 +1,10 @@
 // Copyright 2010 Google Inc. All Rights Reserved.
 //
-// This code is licensed under the same terms as WebM:
-//  Software License Agreement:  http://www.webmproject.org/license/software/
-//  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
 // -----------------------------------------------------------------------------
 //
 // Frame-reconstruction function. Memory allocation.
@ -11,13 +13,13 @@

 #include <stdlib.h>
 #include "./vp8i.h"
-
-#if defined(__cplusplus) || defined(c_plusplus)
-extern "C" {
-#endif
+#include "../utils/utils.h"

 #define ALIGN_MASK (32 - 1)

+static void ReconstructRow(const VP8Decoder* const dec,
+                           const VP8ThreadContext* ctx);  // TODO(skal): remove
+
 //------------------------------------------------------------------------------
 // Filtering

@ -28,25 +30,18 @@ extern "C" {
 //                 U/V, so it's 8 samples total (because of the 2x upsampling).
 static const uint8_t kFilterExtraRows[3] = { 0, 2, 8 };

-static WEBP_INLINE int hev_thresh_from_level(int level, int keyframe) {
-  if (keyframe) {
-    return (level >= 40) ? 2 : (level >= 15) ? 1 : 0;
-  } else {
-    return (level >= 40) ? 3 : (level >= 20) ? 2 : (level >= 15) ? 1 : 0;
-  }
-}
-
 static void DoFilter(const VP8Decoder* const dec, int mb_x, int mb_y) {
  const VP8ThreadContext* const ctx = &dec->thread_ctx_;
+  const int cache_id = ctx->id_;
  const int y_bps = dec->cache_y_stride_;
-  VP8FInfo* const f_info = ctx->f_info_ + mb_x;
-  uint8_t* const y_dst = dec->cache_y_ + ctx->id_ * 16 * y_bps + mb_x * 16;
-  const int level = f_info->f_level_;
+  const VP8FInfo* const f_info = ctx->f_info_ + mb_x;
+  uint8_t* const y_dst = dec->cache_y_ + cache_id * 16 * y_bps + mb_x * 16;
  const int ilevel = f_info->f_ilevel_;
-  const int limit = 2 * level + ilevel;
-  if (level == 0) {
+  const int limit = f_info->f_limit_;
+  if (limit == 0) {
    return;
  }
+  assert(limit >= 3);
  if (dec->filter_type_ == 1) {   // simple
    if (mb_x > 0) {
      VP8SimpleHFilter16(y_dst, y_bps, limit + 4);
@ -62,10 +57,9 @@ static void DoFilter(const VP8Decoder* const dec, int mb_x, int mb_y) {
    }
  } else {    // complex
    const int uv_bps = dec->cache_uv_stride_;
-    uint8_t* const u_dst = dec->cache_u_ + ctx->id_ * 8 * uv_bps + mb_x * 8;
-    uint8_t* const v_dst = dec->cache_v_ + ctx->id_ * 8 * uv_bps + mb_x * 8;
-    const int hev_thresh =
-        hev_thresh_from_level(level, dec->frm_hdr_.key_frame_);
+    uint8_t* const u_dst = dec->cache_u_ + cache_id * 8 * uv_bps + mb_x * 8;
+    uint8_t* const v_dst = dec->cache_v_ + cache_id * 8 * uv_bps + mb_x * 8;
+    const int hev_thresh = f_info->hev_thresh_;
    if (mb_x > 0) {
      VP8HFilter16(y_dst, y_bps, limit + 4, ilevel, hev_thresh);
      VP8HFilter8(u_dst, v_dst, uv_bps, limit + 4, ilevel, hev_thresh);
@ -96,53 +90,132 @@ static void FilterRow(const VP8Decoder* const dec) {
 }

 //------------------------------------------------------------------------------
+// Precompute the filtering strength for each segment and each i4x4/i16x16 mode.

-void VP8StoreBlock(VP8Decoder* const dec) {
+static void PrecomputeFilterStrengths(VP8Decoder* const dec) {
  if (dec->filter_type_ > 0) {
-    VP8FInfo* const info = dec->f_info_ + dec->mb_x_;
-    const int skip = dec->mb_info_[dec->mb_x_].skip_;
-    int level = dec->filter_levels_[dec->segment_];
-    if (dec->filter_hdr_.use_lf_delta_) {
+    int s;
+    const VP8FilterHeader* const hdr = &dec->filter_hdr_;
+    for (s = 0; s < NUM_MB_SEGMENTS; ++s) {
+      int i4x4;
+      // First, compute the initial level
+      int base_level;
+      if (dec->segment_hdr_.use_segment_) {
+        base_level = dec->segment_hdr_.filter_strength_[s];
+        if (!dec->segment_hdr_.absolute_delta_) {
+          base_level += hdr->level_;
+        }
+      } else {
+        base_level = hdr->level_;
+      }
+      for (i4x4 = 0; i4x4 <= 1; ++i4x4) {
+        VP8FInfo* const info = &dec->fstrengths_[s][i4x4];
+        int level = base_level;
+        if (hdr->use_lf_delta_) {
          // TODO(skal): only CURRENT is handled for now.
-      level += dec->filter_hdr_.ref_lf_delta_[0];
-      if (dec->is_i4x4_) {
-        level += dec->filter_hdr_.mode_lf_delta_[0];
+          level += hdr->ref_lf_delta_[0];
+          if (i4x4) {
+            level += hdr->mode_lf_delta_[0];
          }
        }
        level = (level < 0) ? 0 : (level > 63) ? 63 : level;
-    info->f_level_ = level;
-
-    if (dec->filter_hdr_.sharpness_ > 0) {
-      if (dec->filter_hdr_.sharpness_ > 4) {
-        level >>= 2;
+        if (level > 0) {
+          int ilevel = level;
+          if (hdr->sharpness_ > 0) {
+            if (hdr->sharpness_ > 4) {
+              ilevel >>= 2;
            } else {
-        level >>= 1;
+              ilevel >>= 1;
+            }
+            if (ilevel > 9 - hdr->sharpness_) {
+              ilevel = 9 - hdr->sharpness_;
+            }
+          }
+          if (ilevel < 1) ilevel = 1;
+          info->f_ilevel_ = ilevel;
+          info->f_limit_ = 2 * level + ilevel;
+          info->hev_thresh_ = (level >= 40) ? 2 : (level >= 15) ? 1 : 0;
+        } else {
+          info->f_limit_ = 0;  // no filtering
+        }
+        info->f_inner_ = i4x4;
+      }
    }
-      if (level > 9 - dec->filter_hdr_.sharpness_) {
-        level = 9 - dec->filter_hdr_.sharpness_;
  }
 }

-    info->f_ilevel_ = (level < 1) ? 1 : level;
-    info->f_inner_ = (!skip || dec->is_i4x4_);
+//------------------------------------------------------------------------------
+// Dithering
+
+#define DITHER_AMP_TAB_SIZE 12
+static const int kQuantToDitherAmp[DITHER_AMP_TAB_SIZE] = {
+  // roughly, it's dqm->uv_mat_[1]
+  8, 7, 6, 4, 4, 2, 2, 2, 1, 1, 1, 1
+};
+
+void VP8InitDithering(const WebPDecoderOptions* const options,
+                      VP8Decoder* const dec) {
+  assert(dec != NULL);
+  if (options != NULL) {
+    const int d = options->dithering_strength;
+    const int max_amp = (1 << VP8_RANDOM_DITHER_FIX) - 1;
+    const int f = (d < 0) ? 0 : (d > 100) ? max_amp : (d * max_amp / 100);
+    if (f > 0) {
+      int s;
+      int all_amp = 0;
+      for (s = 0; s < NUM_MB_SEGMENTS; ++s) {
+        VP8QuantMatrix* const dqm = &dec->dqm_[s];
+        if (dqm->uv_quant_ < DITHER_AMP_TAB_SIZE) {
+          // TODO(skal): should we specially dither more for uv_quant_ < 0?
+          const int idx = (dqm->uv_quant_ < 0) ? 0 : dqm->uv_quant_;
+          dqm->dither_ = (f * kQuantToDitherAmp[idx]) >> 3;
        }
-  {
-    // Transfer samples to row cache
-    int y;
-    const int y_offset = dec->cache_id_ * 16 * dec->cache_y_stride_;
-    const int uv_offset = dec->cache_id_ * 8 * dec->cache_uv_stride_;
-    uint8_t* const ydst = dec->cache_y_ + dec->mb_x_ * 16 + y_offset;
-    uint8_t* const udst = dec->cache_u_ + dec->mb_x_ * 8 + uv_offset;
-    uint8_t* const vdst = dec->cache_v_ + dec->mb_x_ * 8 + uv_offset;
-    for (y = 0; y < 16; ++y) {
-      memcpy(ydst + y * dec->cache_y_stride_,
-             dec->yuv_b_ + Y_OFF + y * BPS, 16);
+        all_amp |= dqm->dither_;
      }
-    for (y = 0; y < 8; ++y) {
-      memcpy(udst + y * dec->cache_uv_stride_,
-           dec->yuv_b_ + U_OFF + y * BPS, 8);
-      memcpy(vdst + y * dec->cache_uv_stride_,
-           dec->yuv_b_ + V_OFF + y * BPS, 8);
+      if (all_amp != 0) {
+        VP8InitRandom(&dec->dithering_rg_, 1.0f);
+        dec->dither_ = 1;
+      }
+    }
+  }
+}
+
+// minimal amp that will provide a non-zero dithering effect
+#define MIN_DITHER_AMP 4
+#define DITHER_DESCALE 4
+#define DITHER_DESCALE_ROUNDER (1 << (DITHER_DESCALE - 1))
+#define DITHER_AMP_BITS 8
+#define DITHER_AMP_CENTER (1 << DITHER_AMP_BITS)
+
+static void Dither8x8(VP8Random* const rg, uint8_t* dst, int bps, int amp) {
+  int i, j;
+  for (j = 0; j < 8; ++j) {
+    for (i = 0; i < 8; ++i) {
+      // TODO: could be made faster with SSE2
+      const int bits =
+          VP8RandomBits2(rg, DITHER_AMP_BITS + 1, amp) - DITHER_AMP_CENTER;
+      // Convert to range: [-2,2] for dither=50, [-4,4] for dither=100
+      const int delta = (bits + DITHER_DESCALE_ROUNDER) >> DITHER_DESCALE;
+      const int v = (int)dst[i] + delta;
+      dst[i] = (v < 0) ? 0 : (v > 255) ? 255u : (uint8_t)v;
+    }
+    dst += bps;
+  }
+}
+
+static void DitherRow(VP8Decoder* const dec) {
+  int mb_x;
+  assert(dec->dither_);
+  for (mb_x = dec->tl_mb_x_; mb_x < dec->br_mb_x_; ++mb_x) {
+    const VP8ThreadContext* const ctx = &dec->thread_ctx_;
+    const VP8MBData* const data = ctx->mb_data_ + mb_x;
+    const int cache_id = ctx->id_;
+    const int uv_bps = dec->cache_uv_stride_;
+    if (data->dither_ >= MIN_DITHER_AMP) {
+      uint8_t* const u_dst = dec->cache_u_ + cache_id * 8 * uv_bps + mb_x * 8;
+      uint8_t* const v_dst = dec->cache_v_ + cache_id * 8 * uv_bps + mb_x * 8;
+      Dither8x8(&dec->dithering_rg_, u_dst, uv_bps, data->dither_);
+      Dither8x8(&dec->dithering_rg_, v_dst, uv_bps, data->dither_);
    }
  }
 }
@ -164,25 +237,35 @@ void VP8StoreBlock(VP8Decoder* const dec) {
 static int FinishRow(VP8Decoder* const dec, VP8Io* const io) {
  int ok = 1;
  const VP8ThreadContext* const ctx = &dec->thread_ctx_;
+  const int cache_id = ctx->id_;
  const int extra_y_rows = kFilterExtraRows[dec->filter_type_];
  const int ysize = extra_y_rows * dec->cache_y_stride_;
  const int uvsize = (extra_y_rows / 2) * dec->cache_uv_stride_;
-  const int y_offset = ctx->id_ * 16 * dec->cache_y_stride_;
-  const int uv_offset = ctx->id_ * 8 * dec->cache_uv_stride_;
+  const int y_offset = cache_id * 16 * dec->cache_y_stride_;
+  const int uv_offset = cache_id * 8 * dec->cache_uv_stride_;
  uint8_t* const ydst = dec->cache_y_ - ysize + y_offset;
  uint8_t* const udst = dec->cache_u_ - uvsize + uv_offset;
  uint8_t* const vdst = dec->cache_v_ - uvsize + uv_offset;
-  const int first_row = (ctx->mb_y_ == 0);
-  const int last_row = (ctx->mb_y_ >= dec->br_mb_y_ - 1);
-  int y_start = MACROBLOCK_VPOS(ctx->mb_y_);
-  int y_end = MACROBLOCK_VPOS(ctx->mb_y_ + 1);
+  const int mb_y = ctx->mb_y_;
+  const int is_first_row = (mb_y == 0);
+  const int is_last_row = (mb_y >= dec->br_mb_y_ - 1);
+
+  if (dec->mt_method_ == 2) {
+    ReconstructRow(dec, ctx);
+  }

  if (ctx->filter_row_) {
    FilterRow(dec);
  }

-  if (io->put) {
-    if (!first_row) {
+  if (dec->dither_) {
+    DitherRow(dec);
+  }
+
+  if (io->put != NULL) {
+    int y_start = MACROBLOCK_VPOS(mb_y);
+    int y_end = MACROBLOCK_VPOS(mb_y + 1);
+    if (!is_first_row) {
      y_start -= extra_y_rows;
      io->y = ydst;
      io->u = udst;
@ -193,7 +276,7 @@ static int FinishRow(VP8Decoder* const dec, VP8Io* const io) {
      io->v = dec->cache_v_ + uv_offset;
    }

-    if (!last_row) {
+    if (!is_last_row) {
      y_end -= extra_y_rows;
    }
    if (y_end > io->crop_bottom) {
@ -201,11 +284,8 @@ static int FinishRow(VP8Decoder* const dec, VP8Io* const io) {
    }
    io->a = NULL;
    if (dec->alpha_data_ != NULL && y_start < y_end) {
-      // TODO(skal): several things to correct here:
-      // * testing presence of alpha with dec->alpha_data_ is not a good idea
-      // * we're actually decompressing the full plane only once. It should be
-      //   more obvious from signature.
-      // * we could free alpha_data_ right after this call, but we don't own.
+      // TODO(skal): testing presence of alpha with dec->alpha_data_ is not a
+      // good idea.
      io->a = VP8DecompressAlphaRows(dec, y_start, y_end - y_start);
      if (io->a == NULL) {
        return VP8SetError(dec, VP8_STATUS_BITSTREAM_ERROR,
@ -237,8 +317,8 @@ static int FinishRow(VP8Decoder* const dec, VP8Io* const io) {
    }
  }
  // rotate top samples if needed
-  if (ctx->id_ + 1 == dec->num_caches_) {
-    if (!last_row) {
+  if (cache_id + 1 == dec->num_caches_) {
+    if (!is_last_row) {
      memcpy(dec->cache_y_ - ysize, ydst + 16 * dec->cache_y_stride_, ysize);
      memcpy(dec->cache_u_ - uvsize, udst + 8 * dec->cache_uv_stride_, uvsize);
      memcpy(dec->cache_v_ - uvsize, vdst + 8 * dec->cache_uv_stride_, uvsize);
@ -255,10 +335,14 @@ static int FinishRow(VP8Decoder* const dec, VP8Io* const io) {
 int VP8ProcessRow(VP8Decoder* const dec, VP8Io* const io) {
  int ok = 1;
  VP8ThreadContext* const ctx = &dec->thread_ctx_;
-  if (!dec->use_threads_) {
+  const int filter_row =
+      (dec->filter_type_ > 0) &&
+      (dec->mb_y_ >= dec->tl_mb_y_) && (dec->mb_y_ <= dec->br_mb_y_);
+  if (dec->mt_method_ == 0) {
    // ctx->id_ and ctx->f_info_ are already set
    ctx->mb_y_ = dec->mb_y_;
-    ctx->filter_row_ = dec->filter_row_;
+    ctx->filter_row_ = filter_row;
+    ReconstructRow(dec, ctx);
    ok = FinishRow(dec, io);
  } else {
    WebPWorker* const worker = &dec->worker_;
@ -269,13 +353,21 @@ int VP8ProcessRow(VP8Decoder* const dec, VP8Io* const io) {
      ctx->io_ = *io;
      ctx->id_ = dec->cache_id_;
      ctx->mb_y_ = dec->mb_y_;
-      ctx->filter_row_ = dec->filter_row_;
-      if (ctx->filter_row_) {    // just swap filter info
+      ctx->filter_row_ = filter_row;
+      if (dec->mt_method_ == 2) {  // swap macroblock data
+        VP8MBData* const tmp = ctx->mb_data_;
+        ctx->mb_data_ = dec->mb_data_;
+        dec->mb_data_ = tmp;
+      } else {
+        // perform reconstruction directly in main thread
+        ReconstructRow(dec, ctx);
+      }
+      if (filter_row) {            // swap filter info
        VP8FInfo* const tmp = ctx->f_info_;
        ctx->f_info_ = dec->f_info_;
        dec->f_info_ = tmp;
      }
-      WebPWorkerLaunch(worker);
+      WebPWorkerLaunch(worker);    // (reconstruct)+filter in parallel
      if (++dec->cache_id_ == dec->num_caches_) {
        dec->cache_id_ = 0;
      }
@ -289,8 +381,8 @@ int VP8ProcessRow(VP8Decoder* const dec, VP8Io* const io) {

 VP8StatusCode VP8EnterCritical(VP8Decoder* const dec, VP8Io* const io) {
  // Call setup() first. This may trigger additional decoding features on 'io'.
-  // Note: Afterward, we must call teardown() not matter what.
-  if (io->setup && !io->setup(io)) {
+  // Note: Afterward, we must call teardown() no matter what.
+  if (io->setup != NULL && !io->setup(io)) {
    VP8SetError(dec, VP8_STATUS_USER_ABORT, "Frame setup failed");
    return dec->status_;
  }
@ -303,7 +395,7 @@ VP8StatusCode VP8EnterCritical(VP8Decoder* const dec, VP8Io* const io) {

  // Define the area where we can skip in-loop filtering, in case of cropping.
  //
-  // 'Simple' filter reads two luma samples outside of the macroblock and
+  // 'Simple' filter reads two luma samples outside of the macroblock
  // and filters one. It doesn't filter the chroma samples. Hence, we can
  // avoid doing the in-loop filtering before crop_top/crop_left position.
  // For the 'Complex' filter, 3 samples are read and up to 3 are filtered.
@ -338,16 +430,17 @@ VP8StatusCode VP8EnterCritical(VP8Decoder* const dec, VP8Io* const io) {
      dec->br_mb_y_ = dec->mb_h_;
    }
  }
+  PrecomputeFilterStrengths(dec);
  return VP8_STATUS_OK;
 }

 int VP8ExitCritical(VP8Decoder* const dec, VP8Io* const io) {
  int ok = 1;
-  if (dec->use_threads_) {
+  if (dec->mt_method_ > 0) {
    ok = WebPWorkerSync(&dec->worker_);
  }

-  if (io->teardown) {
+  if (io->teardown != NULL) {
    io->teardown(io);
  }
  return ok;
@ -383,7 +476,7 @@ int VP8ExitCritical(VP8Decoder* const dec, VP8Io* const io) {
 // Initialize multi/single-thread worker
 static int InitThreadContext(VP8Decoder* const dec) {
  dec->cache_id_ = 0;
-  if (dec->use_threads_) {
+  if (dec->mt_method_ > 0) {
    WebPWorker* const worker = &dec->worker_;
    if (!WebPWorkerReset(worker)) {
      return VP8SetError(dec, VP8_STATUS_OUT_OF_MEMORY,
@ -400,6 +493,28 @@ static int InitThreadContext(VP8Decoder* const dec) {
  return 1;
 }

+// Returns 0 (no threading) if threads are not requested, not compiled in, or
+// the picture is too narrow; returns 2 otherwise. 'headers' may be NULL.
+int VP8GetThreadMethod(const WebPDecoderOptions* const options,
+                       const WebPHeaderStructure* const headers,
+                       int width, int height) {
+  if (options == NULL || options->use_threads == 0) return 0;
+  (void)headers;   // only read by the assert below
+  (void)width;     // only read when WEBP_USE_THREAD is defined
+  (void)height;    // only read by the disabled '#if 0' heuristic
+  assert(headers == NULL || !headers->is_lossless);   // don't deref NULL
+#if defined(WEBP_USE_THREAD)
+  if (width < MIN_WIDTH_FOR_THREADS) return 0;
+  // TODO(skal): tune the heuristic further
+#if 0
+  if (height < 2 * width) return 2;
+#endif
+  return 2;
+#else   // !WEBP_USE_THREAD
+  return 0;
+#endif
+}
+
 #undef MT_CACHE_LINES
 #undef ST_CACHE_LINES

@ -411,14 +526,15 @@ static int AllocateMemory(VP8Decoder* const dec) {
  const int mb_w = dec->mb_w_;
  // Note: we use 'size_t' when there's no overflow risk, uint64_t otherwise.
  const size_t intra_pred_mode_size = 4 * mb_w * sizeof(uint8_t);
-  const size_t top_size = (16 + 8 + 8) * mb_w;
+  const size_t top_size = sizeof(VP8TopSamples) * mb_w;
  const size_t mb_info_size = (mb_w + 1) * sizeof(VP8MB);
  const size_t f_info_size =
      (dec->filter_type_ > 0) ?
-          mb_w * (dec->use_threads_ ? 2 : 1) * sizeof(VP8FInfo)
+          mb_w * (dec->mt_method_ > 0 ? 2 : 1) * sizeof(VP8FInfo)
        : 0;
  const size_t yuv_size = YUV_SIZE * sizeof(*dec->yuv_b_);
-  const size_t coeffs_size = 384 * sizeof(*dec->coeffs_);
+  const size_t mb_data_size =
+      (dec->mt_method_ == 2 ? 2 : 1) * mb_w * sizeof(*dec->mb_data_);
  const size_t cache_height = (16 * num_caches
                            + kFilterExtraRows[dec->filter_type_]) * 3 / 2;
  const size_t cache_size = top_size * cache_height;
@ -427,7 +543,7 @@ static int AllocateMemory(VP8Decoder* const dec) {
      (uint64_t)dec->pic_hdr_.width_ * dec->pic_hdr_.height_ : 0ULL;
  const uint64_t needed = (uint64_t)intra_pred_mode_size
                        + top_size + mb_info_size + f_info_size
-                        + yuv_size + coeffs_size
+                        + yuv_size + mb_data_size
                        + cache_size + alpha_size + ALIGN_MASK;
  uint8_t* mem;

@ -435,11 +551,12 @@ static int AllocateMemory(VP8Decoder* const dec) {
  if (needed > dec->mem_size_) {
    free(dec->mem_);
    dec->mem_size_ = 0;
-    dec->mem_ = (uint8_t*)malloc((size_t)needed);
+    dec->mem_ = WebPSafeMalloc(needed, sizeof(uint8_t));
    if (dec->mem_ == NULL) {
      return VP8SetError(dec, VP8_STATUS_OUT_OF_MEMORY,
                         "no memory during frame initialization.");
    }
+    // down-cast is ok, thanks to WebPSafeMalloc() above.
    dec->mem_size_ = (size_t)needed;
  }

@ -447,12 +564,8 @@ static int AllocateMemory(VP8Decoder* const dec) {
  dec->intra_t_ = (uint8_t*)mem;
  mem += intra_pred_mode_size;

-  dec->y_t_ = (uint8_t*)mem;
-  mem += 16 * mb_w;
-  dec->u_t_ = (uint8_t*)mem;
-  mem += 8 * mb_w;
-  dec->v_t_ = (uint8_t*)mem;
-  mem += 8 * mb_w;
+  dec->yuv_t_ = (VP8TopSamples*)mem;
+  mem += top_size;

  dec->mb_info_ = ((VP8MB*)mem) + 1;
  mem += mb_info_size;
@ -461,7 +574,7 @@ static int AllocateMemory(VP8Decoder* const dec) {
  mem += f_info_size;
  dec->thread_ctx_.id_ = 0;
  dec->thread_ctx_.f_info_ = dec->f_info_;
-  if (dec->use_threads_) {
+  if (dec->mt_method_ > 0) {
    // secondary cache line. The deblocking process need to make use of the
    // filtering strength from previous macroblock row, while the new ones
    // are being decoded in parallel. We'll just swap the pointers.
@ -473,8 +586,12 @@ static int AllocateMemory(VP8Decoder* const dec) {
  dec->yuv_b_ = (uint8_t*)mem;
  mem += yuv_size;

-  dec->coeffs_ = (int16_t*)mem;
-  mem += coeffs_size;
+  dec->mb_data_ = (VP8MBData*)mem;
+  dec->thread_ctx_.mb_data_ = (VP8MBData*)mem;
+  if (dec->mt_method_ == 2) {
+    dec->thread_ctx_.mb_data_ += mb_w;
+  }
+  mem += mb_data_size;

  dec->cache_y_stride_ = 16 * mb_w;
  dec->cache_uv_stride_ = 8 * mb_w;
@ -494,9 +611,11 @@ static int AllocateMemory(VP8Decoder* const dec) {
  // alpha plane
  dec->alpha_plane_ = alpha_size ? (uint8_t*)mem : NULL;
  mem += alpha_size;
+  assert(mem <= (uint8_t*)dec->mem_ + dec->mem_size_);

-  // note: left-info is initialized once for all.
+  // note: left/top-info is initialized once for all.
  memset(dec->mb_info_ - 1, 0, mb_info_size);
+  VP8InitScanline(dec);   // initialize left too.

  // initialize top
  memset(dec->intra_t_, B_DC_PRED, intra_pred_mode_size);
@ -533,30 +652,64 @@ static const int kScan[16] = {
  0 + 12 * BPS,  4 + 12 * BPS, 8 + 12 * BPS, 12 + 12 * BPS
 };

-static WEBP_INLINE int CheckMode(VP8Decoder* const dec, int mode) {
+static int CheckMode(int mb_x, int mb_y, int mode) {
  if (mode == B_DC_PRED) {
-    if (dec->mb_x_ == 0) {
-      return (dec->mb_y_ == 0) ? B_DC_PRED_NOTOPLEFT : B_DC_PRED_NOLEFT;
+    if (mb_x == 0) {
+      return (mb_y == 0) ? B_DC_PRED_NOTOPLEFT : B_DC_PRED_NOLEFT;
    } else {
-      return (dec->mb_y_ == 0) ? B_DC_PRED_NOTOP : B_DC_PRED;
+      return (mb_y == 0) ? B_DC_PRED_NOTOP : B_DC_PRED;
    }
  }
  return mode;
 }

-static WEBP_INLINE void Copy32b(uint8_t* dst, uint8_t* src) {
-  *(uint32_t*)dst = *(uint32_t*)src;
+static void Copy32b(uint8_t* dst, uint8_t* src) {
+  memcpy(dst, src, 4);
 }

-void VP8ReconstructBlock(VP8Decoder* const dec) {
+static WEBP_INLINE void DoTransform(uint32_t bits, const int16_t* const src,
+                                    uint8_t* const dst) {
+  switch (bits >> 30) {
+    case 3:
+      VP8Transform(src, dst, 0);
+      break;
+    case 2:
+      VP8TransformAC3(src, dst);
+      break;
+    case 1:
+      VP8TransformDC(src, dst);
+      break;
+    default:
+      break;
+  }
+}
+
+static void DoUVTransform(uint32_t bits, const int16_t* const src,
+                          uint8_t* const dst) {
+  if (bits & 0xff) {    // any non-zero coeff at all?
+    if (bits & 0xaa) {  // any non-zero AC coefficient?
+      VP8TransformUV(src, dst);   // note we don't use the AC3 variant for U/V
+    } else {
+      VP8TransformDCUV(src, dst);
+    }
+  }
+}
+
+static void ReconstructRow(const VP8Decoder* const dec,
+                           const VP8ThreadContext* ctx) {
+  int j;
+  int mb_x;
+  const int mb_y = ctx->mb_y_;
+  const int cache_id = ctx->id_;
  uint8_t* const y_dst = dec->yuv_b_ + Y_OFF;
  uint8_t* const u_dst = dec->yuv_b_ + U_OFF;
  uint8_t* const v_dst = dec->yuv_b_ + V_OFF;
+  for (mb_x = 0; mb_x < dec->mb_w_; ++mb_x) {
+    const VP8MBData* const block = ctx->mb_data_ + mb_x;

    // Rotate in the left samples from previously decoded block. We move four
    // pixels at a time for alignment reason, and because of in-loop filter.
-  if (dec->mb_x_ > 0) {
-    int j;
+    if (mb_x > 0) {
      for (j = -1; j < 16; ++j) {
        Copy32b(&y_dst[j * BPS - 4], &y_dst[j * BPS + 12]);
      }
@ -565,7 +718,6 @@ void VP8ReconstructBlock(VP8Decoder* const dec) {
        Copy32b(&v_dst[j * BPS - 4], &v_dst[j * BPS + 4]);
      }
    } else {
-    int j;
      for (j = 0; j < 16; ++j) {
        y_dst[j * BPS - 1] = 129;
      }
@ -574,23 +726,22 @@ void VP8ReconstructBlock(VP8Decoder* const dec) {
        v_dst[j * BPS - 1] = 129;
      }
      // Init top-left sample on left column too
-    if (dec->mb_y_ > 0) {
+      if (mb_y > 0) {
        y_dst[-1 - BPS] = u_dst[-1 - BPS] = v_dst[-1 - BPS] = 129;
      }
    }
    {
      // bring top samples into the cache
-    uint8_t* const top_y = dec->y_t_ + dec->mb_x_ * 16;
-    uint8_t* const top_u = dec->u_t_ + dec->mb_x_ * 8;
-    uint8_t* const top_v = dec->v_t_ + dec->mb_x_ * 8;
-    const int16_t* coeffs = dec->coeffs_;
+      VP8TopSamples* const top_yuv = dec->yuv_t_ + mb_x;
+      const int16_t* const coeffs = block->coeffs_;
+      uint32_t bits = block->non_zero_y_;
      int n;

-    if (dec->mb_y_ > 0) {
-      memcpy(y_dst - BPS, top_y, 16);
-      memcpy(u_dst - BPS, top_u, 8);
-      memcpy(v_dst - BPS, top_v, 8);
-    } else if (dec->mb_x_ == 0) {
+      if (mb_y > 0) {
+        memcpy(y_dst - BPS, top_yuv[0].y, 16);
+        memcpy(u_dst - BPS, top_yuv[0].u, 8);
+        memcpy(v_dst - BPS, top_yuv[0].v, 8);
+      } else if (mb_x == 0) {
        // we only need to do this init once at block (0,0).
        // Afterward, it remains valid for the whole topmost row.
        memset(y_dst - BPS - 1, 127, 16 + 4 + 1);
@ -599,72 +750,65 @@ void VP8ReconstructBlock(VP8Decoder* const dec) {
      }

      // predict and add residuals
-
-    if (dec->is_i4x4_) {   // 4x4
+      if (block->is_i4x4_) {   // 4x4
        uint32_t* const top_right = (uint32_t*)(y_dst - BPS + 16);

-      if (dec->mb_y_ > 0) {
-        if (dec->mb_x_ >= dec->mb_w_ - 1) {    // on rightmost border
-          top_right[0] = top_y[15] * 0x01010101u;
+        if (mb_y > 0) {
+          if (mb_x >= dec->mb_w_ - 1) {    // on rightmost border
+            memset(top_right, top_yuv[0].y[15], sizeof(*top_right));
          } else {
-          memcpy(top_right, top_y + 16, sizeof(*top_right));
+            memcpy(top_right, top_yuv[1].y, sizeof(*top_right));
          }
        }
        // replicate the top-right pixels below
        top_right[BPS] = top_right[2 * BPS] = top_right[3 * BPS] = top_right[0];

-      // predict and add residues for all 4x4 blocks in turn.
-      for (n = 0; n < 16; n++) {
+        // predict and add residuals for all 4x4 blocks in turn.
+        for (n = 0; n < 16; ++n, bits <<= 2) {
          uint8_t* const dst = y_dst + kScan[n];
-        VP8PredLuma4[dec->imodes_[n]](dst);
-        if (dec->non_zero_ac_ & (1 << n)) {
-          VP8Transform(coeffs + n * 16, dst, 0);
-        } else if (dec->non_zero_ & (1 << n)) {  // only DC is present
-          VP8TransformDC(coeffs + n * 16, dst);
-        }
+          VP8PredLuma4[block->imodes_[n]](dst);
+          DoTransform(bits, coeffs + n * 16, dst);
        }
      } else {    // 16x16
-      const int pred_func = CheckMode(dec, dec->imodes_[0]);
+        const int pred_func = CheckMode(mb_x, mb_y,
+                                        block->imodes_[0]);
        VP8PredLuma16[pred_func](y_dst);
-      if (dec->non_zero_) {
-        for (n = 0; n < 16; n++) {
-          uint8_t* const dst = y_dst + kScan[n];
-          if (dec->non_zero_ac_ & (1 << n)) {
-            VP8Transform(coeffs + n * 16, dst, 0);
-          } else if (dec->non_zero_ & (1 << n)) {  // only DC is present
-            VP8TransformDC(coeffs + n * 16, dst);
-          }
+        if (bits != 0) {
+          for (n = 0; n < 16; ++n, bits <<= 2) {
+            DoTransform(bits, coeffs + n * 16, y_dst + kScan[n]);
          }
        }
      }
      {
        // Chroma
-      const int pred_func = CheckMode(dec, dec->uvmode_);
+        const uint32_t bits_uv = block->non_zero_uv_;
+        const int pred_func = CheckMode(mb_x, mb_y, block->uvmode_);
        VP8PredChroma8[pred_func](u_dst);
        VP8PredChroma8[pred_func](v_dst);
-
-      if (dec->non_zero_ & 0x0f0000) {   // chroma-U
-        const int16_t* const u_coeffs = dec->coeffs_ + 16 * 16;
-        if (dec->non_zero_ac_ & 0x0f0000) {
-          VP8TransformUV(u_coeffs, u_dst);
-        } else {
-          VP8TransformDCUV(u_coeffs, u_dst);
-        }
-      }
-      if (dec->non_zero_ & 0xf00000) {   // chroma-V
-        const int16_t* const v_coeffs = dec->coeffs_ + 20 * 16;
-        if (dec->non_zero_ac_ & 0xf00000) {
-          VP8TransformUV(v_coeffs, v_dst);
-        } else {
-          VP8TransformDCUV(v_coeffs, v_dst);
-        }
+        DoUVTransform(bits_uv >> 0, coeffs + 16 * 16, u_dst);
+        DoUVTransform(bits_uv >> 8, coeffs + 20 * 16, v_dst);
      }

      // stash away top samples for next block
-      if (dec->mb_y_ < dec->mb_h_ - 1) {
-        memcpy(top_y, y_dst + 15 * BPS, 16);
-        memcpy(top_u, u_dst +  7 * BPS,  8);
-        memcpy(top_v, v_dst +  7 * BPS,  8);
+      if (mb_y < dec->mb_h_ - 1) {
+        memcpy(top_yuv[0].y, y_dst + 15 * BPS, 16);
+        memcpy(top_yuv[0].u, u_dst +  7 * BPS,  8);
+        memcpy(top_yuv[0].v, v_dst +  7 * BPS,  8);
+      }
+    }
+    // Transfer reconstructed samples from yuv_b_ cache to final destination.
+    {
+      const int y_offset = cache_id * 16 * dec->cache_y_stride_;
+      const int uv_offset = cache_id * 8 * dec->cache_uv_stride_;
+      uint8_t* const y_out = dec->cache_y_ + mb_x * 16 + y_offset;
+      uint8_t* const u_out = dec->cache_u_ + mb_x * 8 + uv_offset;
+      uint8_t* const v_out = dec->cache_v_ + mb_x * 8 + uv_offset;
+      for (j = 0; j < 16; ++j) {
+        memcpy(y_out + j * dec->cache_y_stride_, y_dst + j * BPS, 16);
+      }
+      for (j = 0; j < 8; ++j) {
+        memcpy(u_out + j * dec->cache_uv_stride_, u_dst + j * BPS, 8);
+        memcpy(v_out + j * dec->cache_uv_stride_, v_dst + j * BPS, 8);
      }
    }
  }
@ -672,6 +816,3 @@ void VP8ReconstructBlock(VP8Decoder* const dec) {

 //------------------------------------------------------------------------------

-#if defined(__cplusplus) || defined(c_plusplus)
-}    // extern "C"
-#endif
--- a/src/dec/idec.c
+++ b/src/dec/idec.c
@ -1,8 +1,10 @@
 // Copyright 2011 Google Inc. All Rights Reserved.
 //
-// This code is licensed under the same terms as WebM:
-//  Software License Agreement:  http://www.webmproject.org/license/software/
-//  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
 // -----------------------------------------------------------------------------
 //
 // Incremental decoding
@ -13,12 +15,10 @@
 #include <string.h>
 #include <stdlib.h>

+#include "./alphai.h"
 #include "./webpi.h"
 #include "./vp8i.h"
-
-#if defined(__cplusplus) || defined(c_plusplus)
-extern "C" {
-#endif
+#include "../utils/utils.h"

 // In append mode, buffer allocations increase as multiples of this value.
 // Needs to be a power of 2.
@ -28,11 +28,13 @@ extern "C" {
 //------------------------------------------------------------------------------
 // Data structures for memory and states

-// Decoding states. State normally flows like HEADER->PARTS0->DATA->DONE.
+// Decoding states. State normally flows as:
+// WEBP_HEADER->VP8_HEADER->VP8_PARTS0->VP8_DATA->DONE for a lossy image, and
+// WEBP_HEADER->VP8L_HEADER->VP8L_DATA->DONE for a lossless image.
 // If there is any error the decoder goes into state ERROR.
 typedef enum {
-  STATE_PRE_VP8,  // All data before that of the first VP8 chunk.
-  STATE_VP8_FRAME_HEADER,  // For VP8 Frame header (within VP8 chunk).
+  STATE_WEBP_HEADER,  // All the data before that of the VP8/VP8L chunk.
+  STATE_VP8_HEADER,   // The VP8 Frame header (within the VP8 chunk).
  STATE_VP8_PARTS0,
  STATE_VP8_DATA,
  STATE_VP8L_HEADER,
@ -96,6 +98,23 @@ static WEBP_INLINE size_t MemDataSize(const MemBuffer* mem) {
  return (mem->end_ - mem->start_);
 }

+// Check if we need to preserve the compressed alpha data, as it may not have
+// been decoded yet.
+static int NeedCompressedAlpha(const WebPIDecoder* const idec) {
+  if (idec->state_ == STATE_WEBP_HEADER) {
+    // We haven't parsed the headers yet, so we don't know whether the image is
+    // lossy or lossless. This also means that we haven't parsed the ALPH chunk.
+    return 0;
+  }
+  if (idec->is_lossless_) {
+    return 0;  // ALPH chunk is not present for lossless images.
+  } else {
+    const VP8Decoder* const dec = (VP8Decoder*)idec->dec_;
+    assert(dec != NULL);  // Must be true as idec->state_ != STATE_WEBP_HEADER.
+    return (dec->alpha_data_ != NULL) && !dec->is_alpha_decoded_;
+  }
+}
+
 static void DoRemap(WebPIDecoder* const idec, ptrdiff_t offset) {
  MemBuffer* const mem = &idec->mem_;
  const uint8_t* const new_base = mem->buf_ + mem->start_;
@ -121,6 +140,22 @@ static void DoRemap(WebPIDecoder* const idec, ptrdiff_t offset) {
      }
      assert(last_part >= 0);
      dec->parts_[last_part].buf_end_ = mem->buf_ + mem->end_;
+      if (NeedCompressedAlpha(idec)) {
+        ALPHDecoder* const alph_dec = dec->alph_dec_;
+        dec->alpha_data_ += offset;
+        if (alph_dec != NULL) {
+          if (alph_dec->method_ == ALPHA_LOSSLESS_COMPRESSION) {
+            VP8LDecoder* const alph_vp8l_dec = alph_dec->vp8l_dec_;
+            assert(alph_vp8l_dec != NULL);
+            assert(dec->alpha_data_size_ >= ALPHA_HEADER_LEN);
+            VP8LBitReaderSetBuffer(&alph_vp8l_dec->br_,
+                                   dec->alpha_data_ + ALPHA_HEADER_LEN,
+                                   dec->alpha_data_size_ - ALPHA_HEADER_LEN);
+          } else {  // alph_dec->method_ == ALPHA_NO_COMPRESSION
+            // Nothing special to do in this case.
+          }
+        }
+      }
    } else {    // Resize lossless bitreader
      VP8LDecoder* const dec = (VP8LDecoder*)idec->dec_;
      VP8LBitReaderSetBuffer(&dec->br_, new_base, MemDataSize(mem));
@ -132,8 +167,12 @@ static void DoRemap(WebPIDecoder* const idec, ptrdiff_t offset) {
 // size if required and also updates VP8BitReader's if new memory is allocated.
 static int AppendToMemBuffer(WebPIDecoder* const idec,
                             const uint8_t* const data, size_t data_size) {
+  VP8Decoder* const dec = (VP8Decoder*)idec->dec_;
  MemBuffer* const mem = &idec->mem_;
-  const uint8_t* const old_base = mem->buf_ + mem->start_;
+  const int need_compressed_alpha = NeedCompressedAlpha(idec);
+  const uint8_t* const old_start = mem->buf_ + mem->start_;
+  const uint8_t* const old_base =
+      need_compressed_alpha ? dec->alpha_data_ : old_start;
  assert(mem->mode_ == MEM_MODE_APPEND);
  if (data_size > MAX_CHUNK_PAYLOAD) {
    // security safeguard: trying to allocate more than what the format
@ -142,16 +181,18 @@ static int AppendToMemBuffer(WebPIDecoder* const idec,
  }

  if (mem->end_ + data_size > mem->buf_size_) {  // Need some free memory
-    const size_t current_size = MemDataSize(mem);
-    const size_t new_size = current_size + data_size;
-    const size_t extra_size = (new_size + CHUNK_SIZE - 1) & ~(CHUNK_SIZE - 1);
-    uint8_t* const new_buf = (uint8_t*)malloc(extra_size);
+    const size_t new_mem_start = old_start - old_base;
+    const size_t current_size = MemDataSize(mem) + new_mem_start;
+    const uint64_t new_size = (uint64_t)current_size + data_size;
+    const uint64_t extra_size = (new_size + CHUNK_SIZE - 1) & ~(CHUNK_SIZE - 1);
+    uint8_t* const new_buf =
+        (uint8_t*)WebPSafeMalloc(extra_size, sizeof(*new_buf));
    if (new_buf == NULL) return 0;
    memcpy(new_buf, old_base, current_size);
    free(mem->buf_);
    mem->buf_ = new_buf;
-    mem->buf_size_ = extra_size;
-    mem->start_ = 0;
+    mem->buf_size_ = (size_t)extra_size;
+    mem->start_ = new_mem_start;
    mem->end_ = current_size;
  }

@ -159,14 +200,15 @@ static int AppendToMemBuffer(WebPIDecoder* const idec,
  mem->end_ += data_size;
  assert(mem->end_ <= mem->buf_size_);

-  DoRemap(idec, mem->buf_ + mem->start_ - old_base);
+  DoRemap(idec, mem->buf_ + mem->start_ - old_start);
  return 1;
 }

 static int RemapMemBuffer(WebPIDecoder* const idec,
                          const uint8_t* const data, size_t data_size) {
  MemBuffer* const mem = &idec->mem_;
-  const uint8_t* const old_base = mem->buf_ + mem->start_;
+  const uint8_t* const old_buf = mem->buf_;
+  const uint8_t* const old_start = old_buf + mem->start_;
  assert(mem->mode_ == MEM_MODE_MAP);

  if (data_size < mem->buf_size_) return 0;  // can't remap to a shorter buffer!
@ -174,7 +216,7 @@ static int RemapMemBuffer(WebPIDecoder* const idec,
  mem->buf_ = (uint8_t*)data;
  mem->end_ = mem->buf_size_ = data_size;

-  DoRemap(idec, mem->buf_ + mem->start_ - old_base);
+  DoRemap(idec, mem->buf_ + mem->start_ - old_start);
  return 1;
 }

@ -240,7 +282,7 @@ static void RestoreContext(const MBContext* context, VP8Decoder* const dec,
 static VP8StatusCode IDecError(WebPIDecoder* const idec, VP8StatusCode error) {
  if (idec->state_ == STATE_VP8_DATA) {
    VP8Io* const io = &idec->io_;
-    if (io->teardown) {
+    if (io->teardown != NULL) {
      io->teardown(io);
    }
  }
@ -283,15 +325,9 @@ static VP8StatusCode DecodeWebPHeaders(WebPIDecoder* const idec) {
      return VP8_STATUS_OUT_OF_MEMORY;
    }
    idec->dec_ = dec;
-#ifdef WEBP_USE_THREAD
-    dec->use_threads_ = (idec->params_.options != NULL) &&
-                        (idec->params_.options->use_threads > 0);
-#else
-    dec->use_threads_ = 0;
-#endif
    dec->alpha_data_ = headers.alpha_data;
    dec->alpha_data_size_ = headers.alpha_data_size;
-    ChangeState(idec, STATE_VP8_FRAME_HEADER, headers.offset);
+    ChangeState(idec, STATE_VP8_HEADER, headers.offset);
  } else {
    VP8LDecoder* const dec = VP8LNew();
    if (dec == NULL) {
@ -306,13 +342,14 @@ static VP8StatusCode DecodeWebPHeaders(WebPIDecoder* const idec) {
 static VP8StatusCode DecodeVP8FrameHeader(WebPIDecoder* const idec) {
  const uint8_t* data = idec->mem_.buf_ + idec->mem_.start_;
  const size_t curr_size = MemDataSize(&idec->mem_);
+  int width, height;
  uint32_t bits;

  if (curr_size < VP8_FRAME_HEADER_SIZE) {
    // Not enough data bytes to extract VP8 Frame Header.
    return VP8_STATUS_SUSPENDED;
  }
-  if (!VP8GetInfo(data, curr_size, idec->chunk_size_, NULL, NULL)) {
+  if (!VP8GetInfo(data, curr_size, idec->chunk_size_, &width, &height)) {
    return IDecError(idec, VP8_STATUS_BITSTREAM_ERROR);
  }

@ -379,7 +416,10 @@ static VP8StatusCode DecodePartition0(WebPIDecoder* const idec) {
  if (dec->status_ != VP8_STATUS_OK) {
    return IDecError(idec, dec->status_);
  }
-
+  // This change must be done before calling VP8InitFrame()
+  dec->mt_method_ = VP8GetThreadMethod(params->options, NULL,
+                                       io->width, io->height);
+  VP8InitDithering(params->options, dec);
  if (!CopyParts0Data(idec)) {
    return IDecError(idec, VP8_STATUS_OUT_OF_MEMORY);
  }
@ -405,16 +445,11 @@ static VP8StatusCode DecodeRemaining(WebPIDecoder* const idec) {
  VP8Io* const io = &idec->io_;

  assert(dec->ready_);
-
  for (; dec->mb_y_ < dec->mb_h_; ++dec->mb_y_) {
    VP8BitReader* token_br = &dec->parts_[dec->mb_y_ & (dec->num_parts_ - 1)];
-    if (dec->mb_x_ == 0) {
-      VP8InitScanline(dec);
-    }
-    for (; dec->mb_x_ < dec->mb_w_;  dec->mb_x_++) {
+    for (; dec->mb_x_ < dec->mb_w_; ++dec->mb_x_) {
      MBContext context;
      SaveContext(dec, token_br, &context);
-
      if (!VP8DecodeMB(dec, token_br)) {
        RestoreContext(&context, dec, token_br);
        // We shouldn't fail when MAX_MB data was available
@ -423,20 +458,18 @@ static VP8StatusCode DecodeRemaining(WebPIDecoder* const idec) {
        }
        return VP8_STATUS_SUSPENDED;
      }
-      VP8ReconstructBlock(dec);
-      // Store data and save block's filtering params
-      VP8StoreBlock(dec);
-
      // Release buffer only if there is only one partition
      if (dec->num_parts_ == 1) {
        idec->mem_.start_ = token_br->buf_ - idec->mem_.buf_;
        assert(idec->mem_.start_ <= idec->mem_.end_);
      }
    }
+    VP8InitScanline(dec);   // Prepare for next scanline
+
+    // Reconstruct, filter and emit the row.
    if (!VP8ProcessRow(dec, io)) {
      return IDecError(idec, VP8_STATUS_USER_ABORT);
    }
-    dec->mb_x_ = 0;
  }
  // Synchronize the thread and check for errors.
  if (!VP8ExitCritical(dec, io)) {
@ -448,7 +481,8 @@ static VP8StatusCode DecodeRemaining(WebPIDecoder* const idec) {
  return VP8_STATUS_OK;
 }

-static int ErrorStatusLossless(WebPIDecoder* const idec, VP8StatusCode status) {
+static VP8StatusCode ErrorStatusLossless(WebPIDecoder* const idec,
+                                         VP8StatusCode status) {
  if (status == VP8_STATUS_SUSPENDED || status == VP8_STATUS_NOT_ENOUGH_DATA) {
    return VP8_STATUS_SUSPENDED;
  }
@ -505,14 +539,14 @@ static VP8StatusCode DecodeVP8LData(WebPIDecoder* const idec) {
 static VP8StatusCode IDecode(WebPIDecoder* idec) {
  VP8StatusCode status = VP8_STATUS_SUSPENDED;

-  if (idec->state_ == STATE_PRE_VP8) {
+  if (idec->state_ == STATE_WEBP_HEADER) {
    status = DecodeWebPHeaders(idec);
  } else {
    if (idec->dec_ == NULL) {
      return VP8_STATUS_SUSPENDED;    // can't continue if we have no decoder.
    }
  }
-  if (idec->state_ == STATE_VP8_FRAME_HEADER) {
+  if (idec->state_ == STATE_VP8_HEADER) {
    status = DecodeVP8FrameHeader(idec);
  }
  if (idec->state_ == STATE_VP8_PARTS0) {
@ -534,12 +568,12 @@ static VP8StatusCode IDecode(WebPIDecoder* idec) {
 // Public functions

 WebPIDecoder* WebPINewDecoder(WebPDecBuffer* output_buffer) {
-  WebPIDecoder* idec = (WebPIDecoder*)calloc(1, sizeof(WebPIDecoder));
+  WebPIDecoder* idec = (WebPIDecoder*)calloc(1, sizeof(*idec));
  if (idec == NULL) {
    return NULL;
  }

-  idec->state_ = STATE_PRE_VP8;
+  idec->state_ = STATE_WEBP_HEADER;
  idec->chunk_size_ = 0;

  InitMemBuffer(&idec->mem_);
@ -547,7 +581,8 @@ WebPIDecoder* WebPINewDecoder(WebPDecBuffer* output_buffer) {
  VP8InitIo(&idec->io_);

  WebPResetDecParams(&idec->params_);
-  idec->params_.output = output_buffer ? output_buffer : &idec->output_;
+  idec->params_.output = (output_buffer != NULL) ? output_buffer
+                                                 : &idec->output_;
  WebPInitCustomIo(&idec->params_, &idec->io_);  // Plug the I/O functions.

  return idec;
@ -565,7 +600,7 @@ WebPIDecoder* WebPIDecode(const uint8_t* data, size_t data_size,
  }
  // Create an instance of the incremental decoder
  idec = WebPINewDecoder(config ? &config->output : NULL);
-  if (!idec) {
+  if (idec == NULL) {
    return NULL;
  }
  // Finish initialization
@ -579,9 +614,13 @@ void WebPIDelete(WebPIDecoder* idec) {
  if (idec == NULL) return;
  if (idec->dec_ != NULL) {
    if (!idec->is_lossless_) {
-      VP8Delete(idec->dec_);
+      if (idec->state_ == STATE_VP8_DATA) {
+        // Synchronize the thread, clean-up and check for errors.
+        VP8ExitCritical((VP8Decoder*)idec->dec_, &idec->io_);
+      }
+      VP8Delete((VP8Decoder*)idec->dec_);
    } else {
-      VP8LDelete(idec->dec_);
+      VP8LDelete((VP8LDecoder*)idec->dec_);
    }
  }
  ClearMemBuffer(&idec->mem_);
@ -594,25 +633,56 @@ void WebPIDelete(WebPIDecoder* idec) {

 WebPIDecoder* WebPINewRGB(WEBP_CSP_MODE mode, uint8_t* output_buffer,
                          size_t output_buffer_size, int output_stride) {
+  const int is_external_memory = (output_buffer != NULL);
  WebPIDecoder* idec;
+
  if (mode >= MODE_YUV) return NULL;
+  if (!is_external_memory) {    // Overwrite parameters to sane values.
+    output_buffer_size = 0;
+    output_stride = 0;
+  } else {  // A buffer was passed. Validate the other params.
+    if (output_stride == 0 || output_buffer_size == 0) {
+      return NULL;   // invalid parameter.
+    }
+  }
  idec = WebPINewDecoder(NULL);
-  if (!idec) return NULL;
+  if (idec == NULL) return NULL;
  idec->output_.colorspace = mode;
-  idec->output_.is_external_memory = 1;
+  idec->output_.is_external_memory = is_external_memory;
  idec->output_.u.RGBA.rgba = output_buffer;
  idec->output_.u.RGBA.stride = output_stride;
  idec->output_.u.RGBA.size = output_buffer_size;
  return idec;
 }

-WebPIDecoder* WebPINewYUV(uint8_t* luma, size_t luma_size, int luma_stride,
+WebPIDecoder* WebPINewYUVA(uint8_t* luma, size_t luma_size, int luma_stride,
                           uint8_t* u, size_t u_size, int u_stride,
-                          uint8_t* v, size_t v_size, int v_stride) {
-  WebPIDecoder* const idec = WebPINewDecoder(NULL);
-  if (!idec) return NULL;
-  idec->output_.colorspace = MODE_YUV;
-  idec->output_.is_external_memory = 1;
+                           uint8_t* v, size_t v_size, int v_stride,
+                           uint8_t* a, size_t a_size, int a_stride) {
+  const int is_external_memory = (luma != NULL);
+  WebPIDecoder* idec;
+  WEBP_CSP_MODE colorspace;
+
+  if (!is_external_memory) {    // Overwrite parameters to sane values.
+    luma_size = u_size = v_size = a_size = 0;
+    luma_stride = u_stride = v_stride = a_stride = 0;
+    u = v = a = NULL;
+    colorspace = MODE_YUVA;
+  } else {  // A luma buffer was passed. Validate the other parameters.
+    if (u == NULL || v == NULL) return NULL;
+    if (luma_size == 0 || u_size == 0 || v_size == 0) return NULL;
+    if (luma_stride == 0 || u_stride == 0 || v_stride == 0) return NULL;
+    if (a != NULL) {
+      if (a_size == 0 || a_stride == 0) return NULL;
+    }
+    colorspace = (a == NULL) ? MODE_YUV : MODE_YUVA;
+  }
+
+  idec = WebPINewDecoder(NULL);
+  if (idec == NULL) return NULL;
+
+  idec->output_.colorspace = colorspace;
+  idec->output_.is_external_memory = is_external_memory;
  idec->output_.u.YUVA.y = luma;
  idec->output_.u.YUVA.y_stride = luma_stride;
  idec->output_.u.YUVA.y_size = luma_size;
@ -622,9 +692,21 @@ WebPIDecoder* WebPINewYUV(uint8_t* luma, size_t luma_size, int luma_stride,
  idec->output_.u.YUVA.v = v;
  idec->output_.u.YUVA.v_stride = v_stride;
  idec->output_.u.YUVA.v_size = v_size;
+  idec->output_.u.YUVA.a = a;
+  idec->output_.u.YUVA.a_stride = a_stride;
+  idec->output_.u.YUVA.a_size = a_size;
  return idec;
 }

+WebPIDecoder* WebPINewYUV(uint8_t* luma, size_t luma_size, int luma_stride,
+                          uint8_t* u, size_t u_size, int u_stride,
+                          uint8_t* v, size_t v_size, int v_stride) {
+  return WebPINewYUVA(luma, luma_size, luma_stride,
+                      u, u_size, u_stride,
+                      v, v_size, v_stride,
+                      NULL, 0, 0);
+}
+
 //------------------------------------------------------------------------------

 static VP8StatusCode IDecCheckStatus(const WebPIDecoder* const idec) {
@ -696,15 +778,15 @@ const WebPDecBuffer* WebPIDecodedArea(const WebPIDecoder* idec,
                                      int* left, int* top,
                                      int* width, int* height) {
  const WebPDecBuffer* const src = GetOutputBuffer(idec);
-  if (left) *left = 0;
-  if (top) *top = 0;
+  if (left != NULL) *left = 0;
+  if (top != NULL) *top = 0;
  // TODO(skal): later include handling of rotations.
  if (src) {
-    if (width) *width = src->width;
-    if (height) *height = idec->params_.last_y;
+    if (width != NULL) *width = src->width;
+    if (height != NULL) *height = idec->params_.last_y;
  } else {
-    if (width) *width = 0;
-    if (height) *height = 0;
+    if (width != NULL) *width = 0;
+    if (height != NULL) *height = 0;
  }
  return src;
 }
@ -712,35 +794,38 @@ const WebPDecBuffer* WebPIDecodedArea(const WebPIDecoder* idec,
 uint8_t* WebPIDecGetRGB(const WebPIDecoder* idec, int* last_y,
                        int* width, int* height, int* stride) {
  const WebPDecBuffer* const src = GetOutputBuffer(idec);
-  if (!src) return NULL;
+  if (src == NULL) return NULL;
  if (src->colorspace >= MODE_YUV) {
    return NULL;
  }

-  if (last_y) *last_y = idec->params_.last_y;
-  if (width) *width = src->width;
-  if (height) *height = src->height;
-  if (stride) *stride = src->u.RGBA.stride;
+  if (last_y != NULL) *last_y = idec->params_.last_y;
+  if (width != NULL) *width = src->width;
+  if (height != NULL) *height = src->height;
+  if (stride != NULL) *stride = src->u.RGBA.stride;

  return src->u.RGBA.rgba;
 }

-uint8_t* WebPIDecGetYUV(const WebPIDecoder* idec, int* last_y,
-                        uint8_t** u, uint8_t** v,
-                        int* width, int* height, int *stride, int* uv_stride) {
+uint8_t* WebPIDecGetYUVA(const WebPIDecoder* idec, int* last_y,
+                         uint8_t** u, uint8_t** v, uint8_t** a,
+                         int* width, int* height,
+                         int* stride, int* uv_stride, int* a_stride) {
  const WebPDecBuffer* const src = GetOutputBuffer(idec);
-  if (!src) return NULL;
+  if (src == NULL) return NULL;
  if (src->colorspace < MODE_YUV) {
    return NULL;
  }

-  if (last_y) *last_y = idec->params_.last_y;
-  if (u) *u = src->u.YUVA.u;
-  if (v) *v = src->u.YUVA.v;
-  if (width) *width = src->width;
-  if (height) *height = src->height;
-  if (stride) *stride = src->u.YUVA.y_stride;
-  if (uv_stride) *uv_stride = src->u.YUVA.u_stride;
+  if (last_y != NULL) *last_y = idec->params_.last_y;
+  if (u != NULL) *u = src->u.YUVA.u;
+  if (v != NULL) *v = src->u.YUVA.v;
+  if (a != NULL) *a = src->u.YUVA.a;
+  if (width != NULL) *width = src->width;
+  if (height != NULL) *height = src->height;
+  if (stride != NULL) *stride = src->u.YUVA.y_stride;
+  if (uv_stride != NULL) *uv_stride = src->u.YUVA.u_stride;
+  if (a_stride != NULL) *a_stride = src->u.YUVA.a_stride;

  return src->u.YUVA.y;
 }
@ -750,7 +835,7 @@ int WebPISetIOHooks(WebPIDecoder* const idec,
                    VP8IoSetupHook setup,
                    VP8IoTeardownHook teardown,
                    void* user_data) {
-  if (idec == NULL || idec->state_ > STATE_PRE_VP8) {
+  if (idec == NULL || idec->state_ > STATE_WEBP_HEADER) {
    return 0;
  }

@ -762,6 +847,3 @@ int WebPISetIOHooks(WebPIDecoder* const idec,
  return 1;
 }

-#if defined(__cplusplus) || defined(c_plusplus)
-}    // extern "C"
-#endif
--- a/src/dec/io.c
+++ b/src/dec/io.c
@ -1,8 +1,10 @@
 // Copyright 2011 Google Inc. All Rights Reserved.
 //
-// This code is licensed under the same terms as WebM:
-//  Software License Agreement:  http://www.webmproject.org/license/software/
-//  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
 // -----------------------------------------------------------------------------
 //
 // functions for sample output.
@ -16,10 +18,6 @@
 #include "../dsp/dsp.h"
 #include "../dsp/yuv.h"

-#if defined(__cplusplus) || defined(c_plusplus)
-extern "C" {
-#endif
-
 //------------------------------------------------------------------------------
 // Main YUV<->RGB conversion functions

@ -111,13 +109,13 @@ static int EmitFancyRGB(const VP8Io* const io, WebPDecParams* const p) {
  const uint8_t* top_u = p->tmp_u;
  const uint8_t* top_v = p->tmp_v;
  int y = io->mb_y;
-  int y_end = io->mb_y + io->mb_h;
+  const int y_end = io->mb_y + io->mb_h;
  const int mb_w = io->mb_w;
  const int uv_w = (mb_w + 1) / 2;

  if (y == 0) {
    // First line is special cased. We mirror the u/v samples at boundary.
-    upsample(NULL, cur_y, cur_u, cur_v, cur_u, cur_v, NULL, dst, mb_w);
+    upsample(cur_y, NULL, cur_u, cur_v, cur_u, cur_v, dst, NULL, mb_w);
  } else {
    // We can finish the left-over line from previous call.
    upsample(p->tmp_y, cur_y, top_u, top_v, cur_u, cur_v,
@ -162,66 +160,84 @@ static int EmitFancyRGB(const VP8Io* const io, WebPDecParams* const p) {

 static int EmitAlphaYUV(const VP8Io* const io, WebPDecParams* const p) {
  const uint8_t* alpha = io->a;
-  if (alpha != NULL) {
-    int j;
+  const WebPYUVABuffer* const buf = &p->output->u.YUVA;
  const int mb_w = io->mb_w;
  const int mb_h = io->mb_h;
-    const WebPYUVABuffer* const buf = &p->output->u.YUVA;
  uint8_t* dst = buf->a + io->mb_y * buf->a_stride;
+  int j;
+
+  if (alpha != NULL) {
    for (j = 0; j < mb_h; ++j) {
      memcpy(dst, alpha, mb_w * sizeof(*dst));
      alpha += io->width;
      dst += buf->a_stride;
    }
+  } else if (buf->a != NULL) {
+    // the user requested alpha, but there is none, set it to opaque.
+    for (j = 0; j < mb_h; ++j) {
+      memset(dst, 0xff, mb_w * sizeof(*dst));
+      dst += buf->a_stride;
+    }
  }
  return 0;
 }

-static int EmitAlphaRGB(const VP8Io* const io, WebPDecParams* const p) {
-  const uint8_t* alpha = io->a;
-  if (alpha != NULL) {
-    const int mb_w = io->mb_w;
-    const int mb_h = io->mb_h;
-    int i, j;
-    const WEBP_CSP_MODE colorspace = p->output->colorspace;
-    const int alpha_first =
-        (colorspace == MODE_ARGB || colorspace == MODE_Argb);
-    const WebPRGBABuffer* const buf = &p->output->u.RGBA;
+static int GetAlphaSourceRow(const VP8Io* const io,
+                             const uint8_t** alpha, int* const num_rows) {
  int start_y = io->mb_y;
-    int num_rows = mb_h;
+  *num_rows = io->mb_h;

-    // We compensate for the 1-line delay of fancy upscaler.
+  // Compensate for the 1-line delay of the fancy upscaler.
  // This is similar to EmitFancyRGB().
  if (io->fancy_upsampling) {
    if (start_y == 0) {
-        // We don't process the last row yet. It'll be done during next call.
-        --num_rows;
+      // We don't process the last row yet. It'll be done during the next call.
+      --*num_rows;
    } else {
      --start_y;
      // Fortunately, *alpha data is persistent, so we can go back
      // one row and finish alpha blending, now that the fancy upscaler
      // completed the YUV->RGB interpolation.
-        alpha -= io->width;
+      *alpha -= io->width;
    }
-      if (io->crop_top + io->mb_y + mb_h == io->crop_bottom) {
-        // If it's the very last call, we process all the remaing rows!
-        num_rows = io->crop_bottom - io->crop_top - start_y;
+    if (io->crop_top + io->mb_y + io->mb_h == io->crop_bottom) {
+      // If it's the very last call, we process all the remaining rows!
+      *num_rows = io->crop_bottom - io->crop_top - start_y;
    }
  }
-    {
+  return start_y;
+}
+
+static int EmitAlphaRGB(const VP8Io* const io, WebPDecParams* const p) {
+  const uint8_t* alpha = io->a;
+  if (alpha != NULL) {
+    const int mb_w = io->mb_w;
+    const WEBP_CSP_MODE colorspace = p->output->colorspace;
+    const int alpha_first =
+        (colorspace == MODE_ARGB || colorspace == MODE_Argb);
+    const WebPRGBABuffer* const buf = &p->output->u.RGBA;
+    int num_rows;
+    const int start_y = GetAlphaSourceRow(io, &alpha, &num_rows);
    uint8_t* const base_rgba = buf->rgba + start_y * buf->stride;
    uint8_t* dst = base_rgba + (alpha_first ? 0 : 3);
+    uint32_t alpha_mask = 0xff;
+    int i, j;
+
    for (j = 0; j < num_rows; ++j) {
-        for (i = 0; i < mb_w; ++i) dst[4 * i] = alpha[i];
+      for (i = 0; i < mb_w; ++i) {
+        const uint32_t alpha_value = alpha[i];
+        dst[4 * i] = alpha_value;
+        alpha_mask &= alpha_value;
+      }
      alpha += io->width;
      dst += buf->stride;
    }
-      if (WebPIsPremultipliedMode(colorspace)) {
+    // alpha_mask is < 0xff if there's non-trivial alpha to premultiply with.
+    if (alpha_mask != 0xff && WebPIsPremultipliedMode(colorspace)) {
      WebPApplyAlphaMultiply(base_rgba, alpha_first,
                             mb_w, num_rows, buf->stride);
    }
  }
-  }
  return 0;
 }

@ -229,22 +245,27 @@ static int EmitAlphaRGBA4444(const VP8Io* const io, WebPDecParams* const p) {
  const uint8_t* alpha = io->a;
  if (alpha != NULL) {
    const int mb_w = io->mb_w;
-    const int mb_h = io->mb_h;
-    int i, j;
+    const WEBP_CSP_MODE colorspace = p->output->colorspace;
    const WebPRGBABuffer* const buf = &p->output->u.RGBA;
-    uint8_t* const base_rgba = buf->rgba + io->mb_y * buf->stride;
+    int num_rows;
+    const int start_y = GetAlphaSourceRow(io, &alpha, &num_rows);
+    uint8_t* const base_rgba = buf->rgba + start_y * buf->stride;
    uint8_t* alpha_dst = base_rgba + 1;
-    for (j = 0; j < mb_h; ++j) {
+    uint32_t alpha_mask = 0x0f;
+    int i, j;
+
+    for (j = 0; j < num_rows; ++j) {
      for (i = 0; i < mb_w; ++i) {
        // Fill in the alpha value (converted to 4 bits).
-        const uint32_t alpha_val = VP8Clip4Bits(alpha[i]);
-        alpha_dst[2 * i] = (alpha_dst[2 * i] & 0xf0) | alpha_val;
+        const uint32_t alpha_value = alpha[i] >> 4;
+        alpha_dst[2 * i] = (alpha_dst[2 * i] & 0xf0) | alpha_value;
+        alpha_mask &= alpha_value;
      }
      alpha += io->width;
      alpha_dst += buf->stride;
    }
-    if (p->output->colorspace == MODE_rgbA_4444) {
-      WebPApplyAlphaMultiply4444(base_rgba, mb_w, mb_h, buf->stride);
+    if (alpha_mask != 0x0f && WebPIsPremultipliedMode(colorspace)) {
+      WebPApplyAlphaMultiply4444(base_rgba, mb_w, num_rows, buf->stride);
    }
  }
  return 0;
@ -389,17 +410,22 @@ static int ExportAlpha(WebPDecParams* const p, int y_pos) {
  uint8_t* dst = base_rgba + (alpha_first ? 0 : 3);
  int num_lines_out = 0;
  const int is_premult_alpha = WebPIsPremultipliedMode(colorspace);
+  uint32_t alpha_mask = 0xff;
  const int width = p->scaler_a.dst_width;

  while (WebPRescalerHasPendingOutput(&p->scaler_a)) {
    int i;
    assert(p->last_y + y_pos + num_lines_out < p->output->height);
    WebPRescalerExportRow(&p->scaler_a);
-    for (i = 0; i < width; ++i) dst[4 * i] = p->scaler_a.dst[i];
+    for (i = 0; i < width; ++i) {
+      const uint32_t alpha_value = p->scaler_a.dst[i];
+      dst[4 * i] = alpha_value;
+      alpha_mask &= alpha_value;
+    }
    dst += buf->stride;
    ++num_lines_out;
  }
-  if (is_premult_alpha) {
+  if (is_premult_alpha && alpha_mask != 0xff) {
    WebPApplyAlphaMultiply(base_rgba, alpha_first,
                           width, num_lines_out, buf->stride);
  }
@ -414,6 +440,7 @@ static int ExportAlphaRGBA4444(WebPDecParams* const p, int y_pos) {
  const WEBP_CSP_MODE colorspace = p->output->colorspace;
  const int width = p->scaler_a.dst_width;
  const int is_premult_alpha = WebPIsPremultipliedMode(colorspace);
+  uint32_t alpha_mask = 0x0f;

  while (WebPRescalerHasPendingOutput(&p->scaler_a)) {
    int i;
@ -421,13 +448,14 @@ static int ExportAlphaRGBA4444(WebPDecParams* const p, int y_pos) {
    WebPRescalerExportRow(&p->scaler_a);
    for (i = 0; i < width; ++i) {
      // Fill in the alpha value (converted to 4 bits).
-      const uint32_t alpha_val = VP8Clip4Bits(p->scaler_a.dst[i]);
-      alpha_dst[2 * i] = (alpha_dst[2 * i] & 0xf0) | alpha_val;
+      const uint32_t alpha_value = p->scaler_a.dst[i] >> 4;
+      alpha_dst[2 * i] = (alpha_dst[2 * i] & 0xf0) | alpha_value;
+      alpha_mask &= alpha_value;
    }
    alpha_dst += buf->stride;
    ++num_lines_out;
  }
-  if (is_premult_alpha) {
+  if (is_premult_alpha && alpha_mask != 0x0f) {
    WebPApplyAlphaMultiply4444(base_rgba, width, num_lines_out, buf->stride);
  }
  return num_lines_out;
@ -464,8 +492,7 @@ static int InitRGBRescaler(const VP8Io* const io, WebPDecParams* const p) {
    tmp_size1 += work_size;
    tmp_size2 += out_width;
  }
-  p->memory =
-      calloc(1, tmp_size1 * sizeof(*work) + tmp_size2 * sizeof(*tmp));
+  p->memory = calloc(1, tmp_size1 * sizeof(*work) + tmp_size2 * sizeof(*tmp));
  if (p->memory == NULL) {
    return 0;   // memory error
  }
@ -562,7 +589,7 @@ static int CustomSetup(VP8Io* io) {
 //------------------------------------------------------------------------------

 static int CustomPut(const VP8Io* io) {
-  WebPDecParams* p = (WebPDecParams*)io->opaque;
+  WebPDecParams* const p = (WebPDecParams*)io->opaque;
  const int mb_w = io->mb_w;
  const int mb_h = io->mb_h;
  int num_lines_out;
@ -572,7 +599,7 @@ static int CustomPut(const VP8Io* io) {
    return 0;
  }
  num_lines_out = p->emit(io, p);
-  if (p->emit_alpha) {
+  if (p->emit_alpha != NULL) {
    p->emit_alpha(io, p);
  }
  p->last_y += num_lines_out;
@ -599,6 +626,3 @@ void WebPInitCustomIo(WebPDecParams* const params, VP8Io* const io) {

 //------------------------------------------------------------------------------

-#if defined(__cplusplus) || defined(c_plusplus)
-}    // extern "C"
-#endif
--- a/src/dec/layer.c
+++ b/src/dec/layer.c
@ -1,8 +1,10 @@
 // Copyright 2011 Google Inc. All Rights Reserved.
 //
-// This code is licensed under the same terms as WebM:
-//  Software License Agreement:  http://www.webmproject.org/license/software/
-//  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
 // -----------------------------------------------------------------------------
 //
 // Enhancement layer (for YUV444/422)
@ -14,10 +16,6 @@

 #include "./vp8i.h"

-#if defined(__cplusplus) || defined(c_plusplus)
-extern "C" {
-#endif
-
 //------------------------------------------------------------------------------

 int VP8DecodeLayer(VP8Decoder* const dec) {
@ -30,6 +28,3 @@ int VP8DecodeLayer(VP8Decoder* const dec) {
  return 1;
 }

-#if defined(__cplusplus) || defined(c_plusplus)
-}    // extern "C"
-#endif
--- a/src/dec/quant.c
+++ b/src/dec/quant.c
@ -1,8 +1,10 @@
 // Copyright 2010 Google Inc. All Rights Reserved.
 //
-// This code is licensed under the same terms as WebM:
-//  Software License Agreement:  http://www.webmproject.org/license/software/
-//  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
 // -----------------------------------------------------------------------------
 //
 // Quantizer initialization
@ -11,10 +13,6 @@

 #include "./vp8i.h"

-#if defined(__cplusplus) || defined(c_plusplus)
-extern "C" {
-#endif
-
 static WEBP_INLINE int clip(int v, int M) {
  return v < 0 ? 0 : v > M ? M : v;
 }
@ -94,18 +92,19 @@ void VP8ParseQuant(VP8Decoder* const dec) {
      m->y1_mat_[1] = kAcTable[clip(q + 0,       127)];

      m->y2_mat_[0] = kDcTable[clip(q + dqy2_dc, 127)] * 2;
-      // TODO(skal): make it another table?
-      m->y2_mat_[1] = kAcTable[clip(q + dqy2_ac, 127)] * 155 / 100;
+      // For all x in [0..284], x*155/100 is bitwise equal to (x*101581) >> 16.
+      // The smallest precision for that is '(x*6349) >> 12' but 16 is a good
+      // word size.
+      m->y2_mat_[1] = (kAcTable[clip(q + dqy2_ac, 127)] * 101581) >> 16;
      if (m->y2_mat_[1] < 8) m->y2_mat_[1] = 8;

      m->uv_mat_[0] = kDcTable[clip(q + dquv_dc, 117)];
      m->uv_mat_[1] = kAcTable[clip(q + dquv_ac, 127)];
+
+      m->uv_quant_ = q + dquv_ac;   // for dithering strength evaluation
    }
  }
 }

 //------------------------------------------------------------------------------

-#if defined(__cplusplus) || defined(c_plusplus)
-}    // extern "C"
-#endif
--- a/src/dec/tree.c
+++ b/src/dec/tree.c
@ -1,8 +1,10 @@
 // Copyright 2010 Google Inc. All Rights Reserved.
 //
-// This code is licensed under the same terms as WebM:
-//  Software License Agreement:  http://www.webmproject.org/license/software/
-//  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
 // -----------------------------------------------------------------------------
 //
 // Coding trees and probas
@ -13,10 +15,6 @@

 #define USE_GENERIC_TREE

-#if defined(__cplusplus) || defined(c_plusplus)
-extern "C" {
-#endif
-
 #ifdef USE_GENERIC_TREE
 static const int8_t kYModesIntra4[18] = {
  -B_DC_PRED, 1,
@ -31,61 +29,12 @@ static const int8_t kYModesIntra4[18] = {
 };
 #endif

-#ifndef ONLY_KEYFRAME_CODE
-
-// inter prediction modes
-enum {
-  LEFT4 = 0, ABOVE4 = 1, ZERO4 = 2, NEW4 = 3,
-  NEARESTMV, NEARMV, ZEROMV, NEWMV, SPLITMV };
-
-static const int8_t kYModesInter[8] = {
-  -DC_PRED, 1,
-    2, 3,
-      -V_PRED, -H_PRED,
-      -TM_PRED, -B_PRED
-};
-
-static const int8_t kMBSplit[6] = {
-  -3, 1,
-    -2, 2,
-      -0, -1
-};
-
-static const int8_t kMVRef[8] = {
-  -ZEROMV, 1,
-    -NEARESTMV, 2,
-      -NEARMV, 3,
-        -NEWMV, -SPLITMV
-};
-
-static const int8_t kMVRef4[6] = {
-  -LEFT4, 1,
-    -ABOVE4, 2,
-      -ZERO4, -NEW4
-};
-#endif
-
 //------------------------------------------------------------------------------
 // Default probabilities

-// Inter
-#ifndef ONLY_KEYFRAME_CODE
-static const uint8_t kYModeProbaInter0[4] = { 112, 86, 140, 37 };
-static const uint8_t kUVModeProbaInter0[3] = { 162, 101, 204 };
-static const uint8_t kMVProba0[2][NUM_MV_PROBAS] = {
-  { 162, 128, 225, 146, 172, 147, 214,  39,
-    156, 128, 129, 132,  75, 145, 178, 206,
-    239, 254, 254 },
-  { 164, 128, 204, 170, 119, 235, 140, 230,
-    228, 128, 130, 130,  74, 148, 180, 203,
-    236, 254, 254 }
-};
-#endif
-
 // Paragraph 13.5
 static const uint8_t
  CoeffsProba0[NUM_TYPES][NUM_BANDS][NUM_CTX][NUM_PROBAS] = {
-  // genereated using vp8_default_coef_probs() in entropy.c:129
  { { { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }
@ -326,28 +275,25 @@ static const uint8_t kBModesProba[NUM_BMODES][NUM_BMODES][NUM_BMODES - 1] = {

 void VP8ResetProba(VP8Proba* const proba) {
  memset(proba->segments_, 255u, sizeof(proba->segments_));
-  memcpy(proba->coeffs_, CoeffsProba0, sizeof(CoeffsProba0));
-#ifndef ONLY_KEYFRAME_CODE
-  memcpy(proba->mv_, kMVProba0, sizeof(kMVProba0));
-  memcpy(proba->ymode_, kYModeProbaInter0, sizeof(kYModeProbaInter0));
-  memcpy(proba->uvmode_, kUVModeProbaInter0, sizeof(kUVModeProbaInter0));
-#endif
+  // proba->bands_[][] is initialized later
 }

 void VP8ParseIntraMode(VP8BitReader* const br, VP8Decoder* const dec) {
  uint8_t* const top = dec->intra_t_ + 4 * dec->mb_x_;
  uint8_t* const left = dec->intra_l_;
+  VP8MBData* const block = dec->mb_data_ + dec->mb_x_;
+
+  block->is_i4x4_ = !VP8GetBit(br, 145);   // decide for B_PRED first
+  if (!block->is_i4x4_) {
    // Hardcoded 16x16 intra-mode decision tree.
-  dec->is_i4x4_ = !VP8GetBit(br, 145);   // decide for B_PRED first
-  if (!dec->is_i4x4_) {
    const int ymode =
        VP8GetBit(br, 156) ? (VP8GetBit(br, 128) ? TM_PRED : H_PRED)
                           : (VP8GetBit(br, 163) ? V_PRED : DC_PRED);
-    dec->imodes_[0] = ymode;
-    memset(top, ymode, 4 * sizeof(top[0]));
-    memset(left, ymode, 4 * sizeof(left[0]));
+    block->imodes_[0] = ymode;
+    memset(top, ymode, 4 * sizeof(*top));
+    memset(left, ymode, 4 * sizeof(*left));
  } else {
-    uint8_t* modes = dec->imodes_;
+    uint8_t* modes = block->imodes_;
    int y;
    for (y = 0; y < 4; ++y) {
      int ymode = left[y];
@ -356,10 +302,10 @@ void VP8ParseIntraMode(VP8BitReader* const br,  VP8Decoder* const dec) {
        const uint8_t* const prob = kBModesProba[top[x]][ymode];
 #ifdef USE_GENERIC_TREE
        // Generic tree-parsing
-        int i = 0;
-        do {
+        int i = kYModesIntra4[VP8GetBit(br, prob[0])];
+        while (i > 0) {
          i = kYModesIntra4[2 * i + VP8GetBit(br, prob[i])];
-        } while (i > 0);
+        }
        ymode = -i;
 #else
        // Hardcoded tree parsing
@ -374,13 +320,14 @@ void VP8ParseIntraMode(VP8BitReader* const br,  VP8Decoder* const dec) {
                            (!VP8GetBit(br, prob[8]) ? B_HD_PRED : B_HU_PRED)));
 #endif    // USE_GENERIC_TREE
        top[x] = ymode;
-        *modes++ = ymode;
      }
+      memcpy(modes, top, 4 * sizeof(*top));
+      modes += 4;
      left[y] = ymode;
    }
  }
  // Hardcoded UVMode decision tree
-  dec->uvmode_ = !VP8GetBit(br, 142) ? DC_PRED
+  block->uvmode_ = !VP8GetBit(br, 142) ? DC_PRED
                 : !VP8GetBit(br, 114) ? V_PRED
                 : VP8GetBit(br, 183) ? TM_PRED : H_PRED;
 }
@ -524,17 +471,6 @@ static const uint8_t
  }
 };

-#ifndef ONLY_KEYFRAME_CODE
-static const uint8_t MVUpdateProba[2][NUM_MV_PROBAS] = {
-  { 237, 246, 253, 253, 254, 254, 254, 254,
-    254, 254, 254, 254, 254, 254, 250, 250,
-    252, 254, 254 },
-  { 231, 243, 245, 253, 254, 254, 254, 254,
-    254, 254, 254, 254, 254, 254, 251, 251,
-    254, 254, 254 }
-};
-#endif
-
 // Paragraph 9.9
 void VP8ParseProba(VP8BitReader* const br, VP8Decoder* const dec) {
  VP8Proba* const proba = &dec->proba_;
@ -543,9 +479,9 @@ void VP8ParseProba(VP8BitReader* const br, VP8Decoder* const dec) {
    for (b = 0; b < NUM_BANDS; ++b) {
      for (c = 0; c < NUM_CTX; ++c) {
        for (p = 0; p < NUM_PROBAS; ++p) {
-          if (VP8GetBit(br, CoeffsUpdateProba[t][b][c][p])) {
-            proba->coeffs_[t][b][c][p] = VP8GetValue(br, 8);
-          }
+          const int v = VP8GetBit(br, CoeffsUpdateProba[t][b][c][p]) ?
+                        VP8GetValue(br, 8) : CoeffsProba0[t][b][c][p];
+          proba->bands_[t][b].probas_[c][p] = v;
        }
      }
    }
@ -554,36 +490,5 @@ void VP8ParseProba(VP8BitReader* const br, VP8Decoder* const dec) {
  if (dec->use_skip_proba_) {
    dec->skip_p_ = VP8GetValue(br, 8);
  }
-#ifndef ONLY_KEYFRAME_CODE
-  if (!dec->frm_hdr_.key_frame_) {
-    int i;
-    dec->intra_p_ = VP8GetValue(br, 8);
-    dec->last_p_ = VP8GetValue(br, 8);
-    dec->golden_p_ = VP8GetValue(br, 8);
-    if (VP8Get(br)) {   // update y-mode
-      for (i = 0; i < 4; ++i) {
-        proba->ymode_[i] = VP8GetValue(br, 8);
-      }
-    }
-    if (VP8Get(br)) {   // update uv-mode
-      for (i = 0; i < 3; ++i) {
-        proba->uvmode_[i] = VP8GetValue(br, 8);
-      }
-    }
-    // update MV
-    for (i = 0; i < 2; ++i) {
-      int k;
-      for (k = 0; k < NUM_MV_PROBAS; ++k) {
-        if (VP8GetBit(br, MVUpdateProba[i][k])) {
-          const int v = VP8GetValue(br, 7);
-          proba->mv_[i][k] = v ? v << 1 : 1;
-        }
-      }
-    }
-  }
-#endif
 }

-#if defined(__cplusplus) || defined(c_plusplus)
-}    // extern "C"
-#endif
--- a/src/dec/vp8.c
+++ b/src/dec/vp8.c
@ -1,8 +1,10 @@
 // Copyright 2010 Google Inc. All Rights Reserved.
 //
-// This code is licensed under the same terms as WebM:
-//  Software License Agreement:  http://www.webmproject.org/license/software/
-//  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
 // -----------------------------------------------------------------------------
 //
 // main entry for the decoder
@ -11,15 +13,12 @@

 #include <stdlib.h>

+#include "./alphai.h"
 #include "./vp8i.h"
 #include "./vp8li.h"
 #include "./webpi.h"
 #include "../utils/bit_reader.h"

-#if defined(__cplusplus) || defined(c_plusplus)
-extern "C" {
-#endif
-
 //------------------------------------------------------------------------------

 int WebPGetDecoderVersion(void) {
@ -45,7 +44,7 @@ int VP8InitIoInternal(VP8Io* const io, int version) {
 }

 VP8Decoder* VP8New(void) {
-  VP8Decoder* const dec = (VP8Decoder*)calloc(1, sizeof(VP8Decoder));
+  VP8Decoder* const dec = (VP8Decoder*)calloc(1, sizeof(*dec));
  if (dec != NULL) {
    SetOk(dec);
    WebPWorkerInit(&dec->worker_);
@ -121,6 +120,9 @@ int VP8GetInfo(const uint8_t* data, size_t data_size, size_t chunk_size,
    if (((bits >> 5)) >= chunk_size) {  // partition_length
      return 0;         // inconsistent size information.
    }
+    if (w == 0 || h == 0) {
+      return 0;         // We don't support either width or height being zero.
+    }

    if (width) {
      *width = w;
@ -236,20 +238,6 @@ static int ParseFilterHeader(VP8BitReader* br, VP8Decoder* const dec) {
    }
  }
  dec->filter_type_ = (hdr->level_ == 0) ? 0 : hdr->simple_ ? 1 : 2;
-  if (dec->filter_type_ > 0) {    // precompute filter levels per segment
-    if (dec->segment_hdr_.use_segment_) {
-      int s;
-      for (s = 0; s < NUM_MB_SEGMENTS; ++s) {
-        int strength = dec->segment_hdr_.filter_strength_[s];
-        if (!dec->segment_hdr_.absolute_delta_) {
-          strength += hdr->level_;
-        }
-        dec->filter_levels_[s] = strength;
-      }
-    } else {
-      dec->filter_levels_[0] = hdr->level_;
-    }
-  }
  return !br->eof_;
 }

@ -261,7 +249,6 @@ int VP8GetHeaders(VP8Decoder* const dec, VP8Io* const io) {
  VP8PictureHeader* pic_hdr;
  VP8BitReader* br;
  VP8StatusCode status;
-  WebPHeaderStructure headers;

  if (dec == NULL) {
    return 0;
@ -271,33 +258,8 @@ int VP8GetHeaders(VP8Decoder* const dec, VP8Io* const io) {
    return VP8SetError(dec, VP8_STATUS_INVALID_PARAM,
                       "null VP8Io passed to VP8GetHeaders()");
  }
-
-  // Process Pre-VP8 chunks.
-  headers.data = io->data;
-  headers.data_size = io->data_size;
-  status = WebPParseHeaders(&headers);
-  if (status != VP8_STATUS_OK) {
-    return VP8SetError(dec, status, "Incorrect/incomplete header.");
-  }
-  if (headers.is_lossless) {
-    return VP8SetError(dec, VP8_STATUS_BITSTREAM_ERROR,
-                       "Unexpected lossless format encountered.");
-  }
-
-  if (dec->alpha_data_ == NULL) {
-    assert(dec->alpha_data_size_ == 0);
-    // We have NOT set alpha data yet. Set it now.
-    // (This is to ensure that dec->alpha_data_ is NOT reset to NULL if
-    // WebPParseHeaders() is called more than once, as in incremental decoding
-    // case.)
-    dec->alpha_data_ = headers.alpha_data;
-    dec->alpha_data_size_ = headers.alpha_data_size;
-  }
-
-  // Process the VP8 frame header.
-  buf = headers.data + headers.offset;
-  buf_size = headers.data_size - headers.offset;
-  assert(headers.data_size >= headers.offset);  // WebPParseHeaders' guarantee
+  buf = io->data;
+  buf_size = io->data_size;
  if (buf_size < 4) {
    return VP8SetError(dec, VP8_STATUS_NOT_ENOUGH_DATA,
                       "Truncated header.");
@ -393,38 +355,11 @@ int VP8GetHeaders(VP8Decoder* const dec, VP8Io* const io) {

  // Frame buffer marking
  if (!frm_hdr->key_frame_) {
-    // Paragraph 9.7
-#ifndef ONLY_KEYFRAME_CODE
-    dec->buffer_flags_ = VP8Get(br) << 0;   // update golden
-    dec->buffer_flags_ |= VP8Get(br) << 1;  // update alt ref
-    if (!(dec->buffer_flags_ & 1)) {
-      dec->buffer_flags_ |= VP8GetValue(br, 2) << 2;
-    }
-    if (!(dec->buffer_flags_ & 2)) {
-      dec->buffer_flags_ |= VP8GetValue(br, 2) << 4;
-    }
-    dec->buffer_flags_ |= VP8Get(br) << 6;    // sign bias golden
-    dec->buffer_flags_ |= VP8Get(br) << 7;    // sign bias alt ref
-#else
    return VP8SetError(dec, VP8_STATUS_UNSUPPORTED_FEATURE,
                       "Not a key frame.");
-#endif
-  } else {
-    dec->buffer_flags_ = 0x003 | 0x100;
  }

-  // Paragraph 9.8
-#ifndef ONLY_KEYFRAME_CODE
-  dec->update_proba_ = VP8Get(br);
-  if (!dec->update_proba_) {    // save for later restore
-    dec->proba_saved_ = dec->proba_;
-  }
-  dec->buffer_flags_ &= 1 << 8;
-  dec->buffer_flags_ |=
-      (frm_hdr->key_frame_ || VP8Get(br)) << 8;    // refresh last frame
-#else
-  VP8Get(br);   // just ignore the value of update_proba_
-#endif
+  VP8Get(br);   // ignore the value of update_proba_

  VP8ParseProba(br, dec);

@ -458,7 +393,7 @@ int VP8GetHeaders(VP8Decoder* const dec, VP8Io* const io) {
 //------------------------------------------------------------------------------
 // Residual decoding (Paragraph 13.2 / 13.3)

-static const uint8_t kBands[16 + 1] = {
+static const int kBands[16 + 1] = {
  0, 1, 2, 3, 6, 4, 5, 6, 6, 6, 6, 6, 6, 6, 6, 7,
  0  // extra entry as sentinel
 };
@ -473,27 +408,9 @@ static const uint8_t kZigzag[16] = {
  0, 1, 4, 8,  5, 2, 3, 6,  9, 12, 13, 10,  7, 11, 14, 15
 };

-typedef const uint8_t (*ProbaArray)[NUM_CTX][NUM_PROBAS];  // for const-casting
-
-// Returns the position of the last non-zero coeff plus one
-// (and 0 if there's no coeff at all)
-static int GetCoeffs(VP8BitReader* const br, ProbaArray prob,
-                     int ctx, const quant_t dq, int n, int16_t* out) {
-  // n is either 0 or 1 here. kBands[n] is not necessary for extracting '*p'.
-  const uint8_t* p = prob[n][ctx];
-  if (!VP8GetBit(br, p[0])) {   // first EOB is more a 'CBP' bit.
-    return 0;
-  }
-  while (1) {
-    ++n;
-    if (!VP8GetBit(br, p[1])) {
-      p = prob[kBands[n]][0];
-    } else {  // non zero coeff
-      int v, j;
-      if (!VP8GetBit(br, p[2])) {
-        p = prob[kBands[n]][1];
-        v = 1;
-      } else {
+// See section 13-2: http://tools.ietf.org/html/rfc6386#section-13.2
+static int GetLargeValue(VP8BitReader* const br, const uint8_t* const p) {
+  int v;
  if (!VP8GetBit(br, p[3])) {
    if (!VP8GetBit(br, p[4])) {
      v = 2;
@ -520,121 +437,134 @@ static int GetCoeffs(VP8BitReader* const br, ProbaArray prob,
      v += 3 + (8 << cat);
    }
  }
-        p = prob[kBands[n]][2];
+  return v;
 }
-      j = kZigzag[n - 1];
-      out[j] = VP8GetSigned(br, v) * dq[j > 0];
-      if (n == 16 || !VP8GetBit(br, p[0])) {   // EOB
-        return n;
+
+// Returns the position of the last non-zero coeff plus one
+static int GetCoeffs(VP8BitReader* const br, const VP8BandProbas* const prob,
+                     int ctx, const quant_t dq, int n, int16_t* out) {
+  // n is either 0 or 1 here. kBands[n] is not necessary for extracting '*p'.
+  const uint8_t* p = prob[n].probas_[ctx];
+  for (; n < 16; ++n) {
+    if (!VP8GetBit(br, p[0])) {
+      return n;  // previous coeff was last non-zero coeff
+    }
+    while (!VP8GetBit(br, p[1])) {       // sequence of zero coeffs
+      p = prob[kBands[++n]].probas_[0];
+      if (n == 16) return 16;
+    }
+    {        // non zero coeff
+      const VP8ProbaArray* const p_ctx = &prob[kBands[n + 1]].probas_[0];
+      int v;
+      if (!VP8GetBit(br, p[2])) {
+        v = 1;
+        p = p_ctx[1];
+      } else {
+        v = GetLargeValue(br, p);
+        p = p_ctx[2];
+      }
+      out[kZigzag[n]] = VP8GetSigned(br, v) * dq[n > 0];
    }
  }
-    if (n == 16) {
  return 16;
 }
-  }
+
+static WEBP_INLINE uint32_t NzCodeBits(uint32_t nz_coeffs, int nz, int dc_nz) {
+  nz_coeffs <<= 2;
+  nz_coeffs |= (nz > 3) ? 3 : (nz > 1) ? 2 : dc_nz;
+  return nz_coeffs;
 }

-// Alias-safe way of converting 4bytes to 32bits.
-typedef union {
-  uint8_t  i8[4];
-  uint32_t i32;
-} PackedNz;
-
-// Table to unpack four bits into four bytes
-static const PackedNz kUnpackTab[16] = {
-  {{0, 0, 0, 0}},  {{1, 0, 0, 0}},  {{0, 1, 0, 0}},  {{1, 1, 0, 0}},
-  {{0, 0, 1, 0}},  {{1, 0, 1, 0}},  {{0, 1, 1, 0}},  {{1, 1, 1, 0}},
-  {{0, 0, 0, 1}},  {{1, 0, 0, 1}},  {{0, 1, 0, 1}},  {{1, 1, 0, 1}},
-  {{0, 0, 1, 1}},  {{1, 0, 1, 1}},  {{0, 1, 1, 1}},  {{1, 1, 1, 1}} };
-
-// Macro to pack four LSB of four bytes into four bits.
-#if defined(__PPC__) || defined(_M_PPC) || defined(_ARCH_PPC) || \
-    defined(__BIG_ENDIAN__)
-#define PACK_CST 0x08040201U
-#else
-#define PACK_CST 0x01020408U
-#endif
-#define PACK(X, S) ((((X).i32 * PACK_CST) & 0xff000000) >> (S))
-
-static void ParseResiduals(VP8Decoder* const dec,
+static int ParseResiduals(VP8Decoder* const dec,
                          VP8MB* const mb, VP8BitReader* const token_br) {
-  int out_t_nz, out_l_nz, first;
-  ProbaArray ac_prob;
-  const VP8QuantMatrix* q = &dec->dqm_[dec->segment_];
-  int16_t* dst = dec->coeffs_;
+  VP8BandProbas (* const bands)[NUM_BANDS] = dec->proba_.bands_;
+  const VP8BandProbas* ac_proba;
+  const VP8QuantMatrix* const q = &dec->dqm_[dec->segment_];
+  VP8MBData* const block = dec->mb_data_ + dec->mb_x_;
+  int16_t* dst = block->coeffs_;
  VP8MB* const left_mb = dec->mb_info_ - 1;
-  PackedNz nz_ac, nz_dc;
-  PackedNz tnz, lnz;
-  uint32_t non_zero_ac = 0;
-  uint32_t non_zero_dc = 0;
+  uint8_t tnz, lnz;
+  uint32_t non_zero_y = 0;
+  uint32_t non_zero_uv = 0;
  int x, y, ch;
+  uint32_t out_t_nz, out_l_nz;
+  int first;

-  nz_dc.i32 = nz_ac.i32 = 0;
  memset(dst, 0, 384 * sizeof(*dst));
-  if (!dec->is_i4x4_) {    // parse DC
+  if (!block->is_i4x4_) {    // parse DC
    int16_t dc[16] = { 0 };
-    const int ctx = mb->dc_nz_ + left_mb->dc_nz_;
-    mb->dc_nz_ = left_mb->dc_nz_ =
-        (GetCoeffs(token_br, (ProbaArray)dec->proba_.coeffs_[1],
-                   ctx, q->y2_mat_, 0, dc) > 0);
-    first = 1;
-    ac_prob = (ProbaArray)dec->proba_.coeffs_[0];
+    const int ctx = mb->nz_dc_ + left_mb->nz_dc_;
+    const int nz = GetCoeffs(token_br, bands[1], ctx, q->y2_mat_, 0, dc);
+    mb->nz_dc_ = left_mb->nz_dc_ = (nz > 0);
+    if (nz > 1) {   // more than just the DC -> perform the full transform
      VP8TransformWHT(dc, dst);
+    } else {        // only DC is non-zero -> inlined simplified transform
+      int i;
+      const int dc0 = (dc[0] + 3) >> 3;
+      for (i = 0; i < 16 * 16; i += 16) dst[i] = dc0;
+    }
+    first = 1;
+    ac_proba = bands[0];
  } else {
    first = 0;
-    ac_prob = (ProbaArray)dec->proba_.coeffs_[3];
+    ac_proba = bands[3];
  }

-  tnz = kUnpackTab[mb->nz_ & 0xf];
-  lnz = kUnpackTab[left_mb->nz_ & 0xf];
+  tnz = mb->nz_ & 0x0f;
+  lnz = left_mb->nz_ & 0x0f;
  for (y = 0; y < 4; ++y) {
-    int l = lnz.i8[y];
+    int l = lnz & 1;
+    uint32_t nz_coeffs = 0;
    for (x = 0; x < 4; ++x) {
-      const int ctx = l + tnz.i8[x];
-      const int nz = GetCoeffs(token_br, ac_prob, ctx,
-                               q->y1_mat_, first, dst);
-      tnz.i8[x] = l = (nz > 0);
-      nz_dc.i8[x] = (dst[0] != 0);
-      nz_ac.i8[x] = (nz > 1);
+      const int ctx = l + (tnz & 1);
+      const int nz = GetCoeffs(token_br, ac_proba, ctx, q->y1_mat_, first, dst);
+      l = (nz > first);
+      tnz = (tnz >> 1) | (l << 7);
+      nz_coeffs = NzCodeBits(nz_coeffs, nz, dst[0] != 0);
      dst += 16;
    }
-    lnz.i8[y] = l;
-    non_zero_dc |= PACK(nz_dc, 24 - y * 4);
-    non_zero_ac |= PACK(nz_ac, 24 - y * 4);
+    tnz >>= 4;
+    lnz = (lnz >> 1) | (l << 7);
+    non_zero_y = (non_zero_y << 8) | nz_coeffs;
  }
-  out_t_nz = PACK(tnz, 24);
-  out_l_nz = PACK(lnz, 24);
+  out_t_nz = tnz;
+  out_l_nz = lnz >> 4;

-  tnz = kUnpackTab[mb->nz_ >> 4];
-  lnz = kUnpackTab[left_mb->nz_ >> 4];
  for (ch = 0; ch < 4; ch += 2) {
+    uint32_t nz_coeffs = 0;
+    tnz = mb->nz_ >> (4 + ch);
+    lnz = left_mb->nz_ >> (4 + ch);
    for (y = 0; y < 2; ++y) {
-      int l = lnz.i8[ch + y];
+      int l = lnz & 1;
      for (x = 0; x < 2; ++x) {
-        const int ctx = l + tnz.i8[ch + x];
-        const int nz =
-            GetCoeffs(token_br, (ProbaArray)dec->proba_.coeffs_[2],
-                      ctx, q->uv_mat_, 0, dst);
-        tnz.i8[ch + x] = l = (nz > 0);
-        nz_dc.i8[y * 2 + x] = (dst[0] != 0);
-        nz_ac.i8[y * 2 + x] = (nz > 1);
+        const int ctx = l + (tnz & 1);
+        const int nz = GetCoeffs(token_br, bands[2], ctx, q->uv_mat_, 0, dst);
+        l = (nz > 0);
+        tnz = (tnz >> 1) | (l << 3);
+        nz_coeffs = NzCodeBits(nz_coeffs, nz, dst[0] != 0);
        dst += 16;
      }
-      lnz.i8[ch + y] = l;
+      tnz >>= 2;
+      lnz = (lnz >> 1) | (l << 5);
    }
-    non_zero_dc |= PACK(nz_dc, 8 - ch * 2);
-    non_zero_ac |= PACK(nz_ac, 8 - ch * 2);
+    // Note: we don't really need the per-4x4 details for U/V blocks.
+    non_zero_uv |= nz_coeffs << (4 * ch);
+    out_t_nz |= (tnz << 4) << ch;
+    out_l_nz |= (lnz & 0xf0) << ch;
  }
-  out_t_nz |= PACK(tnz, 20);
-  out_l_nz |= PACK(lnz, 20);
  mb->nz_ = out_t_nz;
  left_mb->nz_ = out_l_nz;

-  dec->non_zero_ac_ = non_zero_ac;
-  dec->non_zero_ = non_zero_ac | non_zero_dc;
-  mb->skip_ = !dec->non_zero_;
+  block->non_zero_y_ = non_zero_y;
+  block->non_zero_uv_ = non_zero_uv;
+
+  // We look at the 2-bit code of each U/V block: if any of them has bit #1 set
+  // (code >= 2, i.e. more than just a DC coefficient), dithering is disabled.
+  // Only macroblocks whose U/V blocks are all flat or empty keep q->dither_.
+  block->dither_ = (non_zero_uv & 0xaaaa) ? 0 : q->dither_;
+
+  return !(non_zero_y | non_zero_uv);  // will be used for further optimization
 }
-#undef PACK

 //------------------------------------------------------------------------------
 // Main loop
@ -642,7 +572,9 @@ static void ParseResiduals(VP8Decoder* const dec,
 int VP8DecodeMB(VP8Decoder* const dec, VP8BitReader* const token_br) {
  VP8BitReader* const br = &dec->br_;
  VP8MB* const left = dec->mb_info_ - 1;
-  VP8MB* const info = dec->mb_info_ + dec->mb_x_;
+  VP8MB* const mb = dec->mb_info_ + dec->mb_x_;
+  VP8MBData* const block = dec->mb_data_ + dec->mb_x_;
+  int skip;

  // Note: we don't save segment map (yet), as we don't expect
  // to decode more than 1 keyframe.
@ -652,67 +584,64 @@ int VP8DecodeMB(VP8Decoder* const dec, VP8BitReader* const token_br) {
        VP8GetBit(br, dec->proba_.segments_[1]) :
        2 + VP8GetBit(br, dec->proba_.segments_[2]);
  }
-  info->skip_ = dec->use_skip_proba_ ? VP8GetBit(br, dec->skip_p_) : 0;
+  skip = dec->use_skip_proba_ ? VP8GetBit(br, dec->skip_p_) : 0;

  VP8ParseIntraMode(br, dec);
  if (br->eof_) {
    return 0;
  }

-  if (!info->skip_) {
-    ParseResiduals(dec, info, token_br);
+  if (!skip) {
+    skip = ParseResiduals(dec, mb, token_br);
  } else {
-    left->nz_ = info->nz_ = 0;
-    if (!dec->is_i4x4_) {
-      left->dc_nz_ = info->dc_nz_ = 0;
+    left->nz_ = mb->nz_ = 0;
+    if (!block->is_i4x4_) {
+      left->nz_dc_ = mb->nz_dc_ = 0;
    }
-    dec->non_zero_ = 0;
-    dec->non_zero_ac_ = 0;
+    block->non_zero_y_ = 0;
+    block->non_zero_uv_ = 0;
  }

-  return (!token_br->eof_);
+  if (dec->filter_type_ > 0) {  // store filter info
+    VP8FInfo* const finfo = dec->f_info_ + dec->mb_x_;
+    *finfo = dec->fstrengths_[dec->segment_][block->is_i4x4_];
+    finfo->f_inner_ |= !skip;
+  }
+
+  return !token_br->eof_;
 }

 void VP8InitScanline(VP8Decoder* const dec) {
  VP8MB* const left = dec->mb_info_ - 1;
  left->nz_ = 0;
-  left->dc_nz_ = 0;
+  left->nz_dc_ = 0;
  memset(dec->intra_l_, B_DC_PRED, sizeof(dec->intra_l_));
-  dec->filter_row_ =
-    (dec->filter_type_ > 0) &&
-    (dec->mb_y_ >= dec->tl_mb_y_) && (dec->mb_y_ <= dec->br_mb_y_);
+  dec->mb_x_ = 0;
 }

 static int ParseFrame(VP8Decoder* const dec, VP8Io* io) {
  for (dec->mb_y_ = 0; dec->mb_y_ < dec->br_mb_y_; ++dec->mb_y_) {
+    // Parse bitstream for this row.
    VP8BitReader* const token_br =
        &dec->parts_[dec->mb_y_ & (dec->num_parts_ - 1)];
-    VP8InitScanline(dec);
-    for (dec->mb_x_ = 0; dec->mb_x_ < dec->mb_w_;  dec->mb_x_++) {
+    for (; dec->mb_x_ < dec->mb_w_; ++dec->mb_x_) {
      if (!VP8DecodeMB(dec, token_br)) {
        return VP8SetError(dec, VP8_STATUS_NOT_ENOUGH_DATA,
                           "Premature end-of-file encountered.");
      }
-      VP8ReconstructBlock(dec);
-
-      // Store data and save block's filtering params
-      VP8StoreBlock(dec);
    }
+    VP8InitScanline(dec);   // Prepare for next scanline
+
+    // Reconstruct, filter and emit the row.
    if (!VP8ProcessRow(dec, io)) {
      return VP8SetError(dec, VP8_STATUS_USER_ABORT, "Output aborted.");
    }
  }
-  if (dec->use_threads_ && !WebPWorkerSync(&dec->worker_)) {
-    return 0;
+  if (dec->mt_method_ > 0) {
+    if (!WebPWorkerSync(&dec->worker_)) return 0;
  }

  // Finish
-#ifndef ONLY_KEYFRAME_CODE
-  if (!dec->update_proba_) {
-    dec->proba_ = dec->proba_saved_;
-  }
-#endif
-
 #ifdef WEBP_EXPERIMENTAL_FEATURES
  if (dec->layer_data_size_ > 0) {
    if (!VP8DecodeLayer(dec)) {
@ -768,12 +697,12 @@ void VP8Clear(VP8Decoder* const dec) {
  if (dec == NULL) {
    return;
  }
-  if (dec->use_threads_) {
+  if (dec->mt_method_ > 0) {
    WebPWorkerEnd(&dec->worker_);
  }
-  if (dec->mem_) {
+  ALPHDelete(dec->alph_dec_);
+  dec->alph_dec_ = NULL;
  free(dec->mem_);
-  }
  dec->mem_ = NULL;
  dec->mem_size_ = 0;
  memset(&dec->br_, 0, sizeof(dec->br_));
@ -782,6 +711,3 @@ void VP8Clear(VP8Decoder* const dec) {

 //------------------------------------------------------------------------------

-#if defined(__cplusplus) || defined(c_plusplus)
-}    // extern "C"
-#endif
--- a/src/dec/vp8i.h
+++ b/src/dec/vp8i.h
@ -1,8 +1,10 @@
 // Copyright 2010 Google Inc. All Rights Reserved.
 //
-// This code is licensed under the same terms as WebM:
-//  Software License Agreement:  http://www.webmproject.org/license/software/
-//  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
 // -----------------------------------------------------------------------------
 //
 // VP8 decoder: internal header.
@ -15,10 +17,11 @@
 #include <string.h>     // for memcpy()
 #include "./vp8li.h"
 #include "../utils/bit_reader.h"
+#include "../utils/random.h"
 #include "../utils/thread.h"
 #include "../dsp/dsp.h"

-#if defined(__cplusplus) || defined(c_plusplus)
+#ifdef __cplusplus
 extern "C" {
 #endif

@ -27,10 +30,8 @@ extern "C" {

 // version numbers
 #define DEC_MAJ_VERSION 0
-#define DEC_MIN_VERSION 1
-#define DEC_REV_VERSION 99
-
-#define ONLY_KEYFRAME_CODE      // to remove any code related to P-Frames
+#define DEC_MIN_VERSION 4
+#define DEC_REV_VERSION 0

 // intra prediction modes
 enum { B_DC_PRED = 0,   // 4x4 modes
@ -98,6 +99,9 @@ enum { MB_FEATURE_TREE_PROBS = 3,
 #define U_OFF    (Y_OFF + BPS * 16 + BPS)
 #define V_OFF    (U_OFF + 16)

+// minimal width under which lossy multi-threading is always disabled
+#define MIN_WIDTH_FOR_THREADS 512
+
 //------------------------------------------------------------------------------
 // Headers

@ -126,15 +130,19 @@ typedef struct {
  int8_t filter_strength_[NUM_MB_SEGMENTS];  // filter strength for segments
 } VP8SegmentHeader;

+
+// probas associated to one of the contexts
+typedef uint8_t VP8ProbaArray[NUM_PROBAS];
+
+typedef struct {   // all the probas associated to one band
+  VP8ProbaArray probas_[NUM_CTX];
+} VP8BandProbas;
+
 // Struct collecting all frame-persistent probabilities.
 typedef struct {
  uint8_t segments_[MB_FEATURE_TREE_PROBS];
  // Type: 0:Intra16-AC  1:Intra16-DC   2:Chroma   3:Intra4
-  uint8_t coeffs_[NUM_TYPES][NUM_BANDS][NUM_CTX][NUM_PROBAS];
-#ifndef ONLY_KEYFRAME_CODE
-  uint8_t ymode_[4], uvmode_[3];
-  uint8_t mv_[2][NUM_MV_PROBAS];
-#endif
+  VP8BandProbas bands_[NUM_TYPES][NUM_BANDS];
 } VP8Proba;

 // Filter parameters
@ -151,32 +159,59 @@ typedef struct {
 // Informations about the macroblocks.

 typedef struct {  // filter specs
-  unsigned int f_level_:6;      // filter strength: 0..63
-  unsigned int f_ilevel_:6;     // inner limit: 1..63
-  unsigned int f_inner_:1;      // do inner filtering?
+  uint8_t f_limit_;      // filter limit in [3..189], or 0 if no filtering
+  uint8_t f_ilevel_;     // inner limit in [1..63]
+  uint8_t f_inner_;      // do inner filtering?
+  uint8_t hev_thresh_;   // high edge variance threshold in [0..2]
 } VP8FInfo;

-typedef struct {  // used for syntax-parsing
-  unsigned int nz_;          // non-zero AC/DC coeffs
-  unsigned int dc_nz_:1;     // non-zero DC coeffs
-  unsigned int skip_:1;      // block type
+typedef struct {  // Top/Left Contexts used for syntax-parsing
+  uint8_t nz_;        // non-zero AC/DC coeffs (4bit for luma + 4bit for chroma)
+  uint8_t nz_dc_;     // non-zero DC coeff (1bit)
 } VP8MB;

 // Dequantization matrices
 typedef int quant_t[2];      // [DC / AC].  Can be 'uint16_t[2]' too (~slower).
 typedef struct {
  quant_t y1_mat_, y2_mat_, uv_mat_;
+
+  int uv_quant_;   // U/V quantizer value
+  int dither_;     // dithering amplitude (0 = off, max=255)
 } VP8QuantMatrix;

+// Data needed to reconstruct a macroblock
+typedef struct {
+  int16_t coeffs_[384];   // 384 coeffs = (16+4+4) * 4*4
+  uint8_t is_i4x4_;       // true if intra4x4
+  uint8_t imodes_[16];    // one 16x16 mode (#0) or sixteen 4x4 modes
+  uint8_t uvmode_;        // chroma prediction mode
+  // bit-wise info about the content of each sub-4x4 blocks (in decoding order).
+  // Each of the 4x4 blocks for y/u/v is associated with a 2b code according to:
+  //   code=0 -> no coefficient
+  //   code=1 -> only DC
+  //   code=2 -> non-zero coefficients only within the first three
+  //   code=3 -> non-zero coefficients beyond the first three
+  // This allows calling specialized transform functions.
+  uint32_t non_zero_y_;
+  uint32_t non_zero_uv_;
+  uint8_t dither_;      // local dithering strength (deduced from non_zero_*)
+} VP8MBData;
+
 // Persistent information needed by the parallel processing
 typedef struct {
  int id_;              // cache row to process (in [0..2])
  int mb_y_;            // macroblock position of the row
  int filter_row_;      // true if row-filtering is needed
-  VP8FInfo* f_info_;  // filter strengths
+  VP8FInfo* f_info_;    // filter strengths (swapped with dec->f_info_)
+  VP8MBData* mb_data_;  // reconstruction data (swapped with dec->mb_data_)
  VP8Io io_;            // copy of the VP8Io to pass to put()
 } VP8ThreadContext;

+// Saved top samples, per macroblock. Fits into a cache-line.
+typedef struct {
+  uint8_t y[16], u[8], v[8];
+} VP8TopSamples;
+
 //------------------------------------------------------------------------------
 // VP8Decoder: the main opaque structure handed over to user

@ -196,7 +231,8 @@ struct VP8Decoder {

  // Worker
  WebPWorker worker_;
-  int use_threads_;    // use multi-thread
+  int mt_method_;      // multi-thread method: 0=off, 1=[parse+recon][filter]
+                       // 2=[parse][recon+filter]
  int cache_id_;       // current cache row
  int num_caches_;     // number of cached rows of 16 pixels (1, 2 or 3)
  VP8ThreadContext thread_ctx_;  // Thread context
@ -213,12 +249,9 @@ struct VP8Decoder {
  // per-partition boolean decoders.
  VP8BitReader parts_[MAX_NUM_PARTITIONS];

-  // buffer refresh flags
-  //   bit 0: refresh Gold, bit 1: refresh Alt
-  //   bit 2-3: copy to Gold, bit 4-5: copy to Alt
-  //   bit 6: Gold sign bias, bit 7: Alt sign bias
-  //   bit 8: refresh last frame
-  uint32_t buffer_flags_;
+  // Dithering strength, deduced from decoding options
+  int dither_;                // whether to use dithering or not
+  VP8Random dithering_rg_;    // random generator for dithering

  // dequantization (one set of DC/AC dequant factor per segment)
  VP8QuantMatrix dqm_[NUM_MB_SEGMENTS];
@ -227,22 +260,17 @@ struct VP8Decoder {
  VP8Proba proba_;
  int use_skip_proba_;
  uint8_t skip_p_;
-#ifndef ONLY_KEYFRAME_CODE
-  uint8_t intra_p_, last_p_, golden_p_;
-  VP8Proba proba_saved_;
-  int update_proba_;
-#endif

  // Boundary data cache and persistent buffers.
  uint8_t* intra_t_;      // top intra modes values: 4 * mb_w_
  uint8_t  intra_l_[4];   // left intra modes values
-  uint8_t* y_t_;         // top luma samples: 16 * mb_w_
-  uint8_t* u_t_, *v_t_;  // top u/v samples: 8 * mb_w_ each
+
+  uint8_t segment_;       // segment of the currently parsed block
+  VP8TopSamples* yuv_t_;  // top y/u/v samples

  VP8MB* mb_info_;        // contextual macroblock info (mb_w_ + 1)
  VP8FInfo* f_info_;      // filter strength info
  uint8_t* yuv_b_;        // main block for Y/U/V (size = YUV_SIZE)
-  int16_t* coeffs_;      // 384 coeffs = (16+8+8) * 4*4

  uint8_t* cache_y_;      // macroblock row for storing unfiltered samples
  uint8_t* cache_u_;
@ -256,28 +284,20 @@ struct VP8Decoder {

  // Per macroblock non-persistent infos.
  int mb_x_, mb_y_;       // current position, in macroblock units
-  uint8_t is_i4x4_;       // true if intra4x4
-  uint8_t imodes_[16];    // one 16x16 mode (#0) or sixteen 4x4 modes
-  uint8_t uvmode_;        // chroma prediction mode
-  uint8_t segment_;       // block's segment
-
-  // bit-wise info about the content of each sub-4x4 blocks: there are 16 bits
-  // for luma (bits #0->#15), then 4 bits for chroma-u (#16->#19) and 4 bits for
-  // chroma-v (#20->#23), each corresponding to one 4x4 block in decoding order.
-  // If the bit is set, the 4x4 block contains some non-zero coefficients.
-  uint32_t non_zero_;
-  uint32_t non_zero_ac_;
+  VP8MBData* mb_data_;    // parsed reconstruction data

  // Filtering side-info
  int filter_type_;                          // 0=off, 1=simple, 2=complex
-  int filter_row_;                          // per-row flag
-  uint8_t filter_levels_[NUM_MB_SEGMENTS];  // precalculated per-segment
+  VP8FInfo fstrengths_[NUM_MB_SEGMENTS][2];  // precalculated per-segment/type

-  // extensions
+  // Alpha
+  struct ALPHDecoder* alph_dec_;  // alpha-plane decoder object
  const uint8_t* alpha_data_;     // compressed alpha data (if present)
  size_t alpha_data_size_;
+  int is_alpha_decoded_;  // true if alpha_data_ is decoded in alpha_plane_
  uint8_t* alpha_plane_;        // output. Persistent, contains the whole data.

+  // extensions
  int layer_colorspace_;
  const uint8_t* layer_data_;   // compressed layer data (if present)
  size_t layer_data_size_;
@ -300,8 +320,6 @@ void VP8ParseQuant(VP8Decoder* const dec);

 // in frame.c
 int VP8InitFrame(VP8Decoder* const dec, VP8Io* io);
-// Predict a block and add residual
-void VP8ReconstructBlock(VP8Decoder* const dec);
 // Call io->setup() and finish setting up scan parameters.
 // After this call returns, one must always call VP8ExitCritical() with the
 // same parameters. Both functions should be used in pair. Returns VP8_STATUS_OK
@ -310,10 +328,16 @@ VP8StatusCode VP8EnterCritical(VP8Decoder* const dec, VP8Io* const io);
 // Must always be called in pair with VP8EnterCritical().
 // Returns false in case of error.
 int VP8ExitCritical(VP8Decoder* const dec, VP8Io* const io);
-// Process the last decoded row (filtering + output)
+// Return the multi-threading method to use (0=off), depending
+// on options and bitstream size. Only for lossy decoding.
+int VP8GetThreadMethod(const WebPDecoderOptions* const options,
+                       const WebPHeaderStructure* const headers,
+                       int width, int height);
+// Initialize dithering post-process if needed.
+void VP8InitDithering(const WebPDecoderOptions* const options,
+                      VP8Decoder* const dec);
+// Process the last decoded row (filtering + output).
 int VP8ProcessRow(VP8Decoder* const dec, VP8Io* const io);
-// Store a block, along with filtering params
-void VP8StoreBlock(VP8Decoder* const dec);
 // To be called at the start of a new scanline, to initialize predictors.
 void VP8InitScanline(VP8Decoder* const dec);
 // Decode one macroblock. Returns false if there is not enough data.
@ -328,7 +352,7 @@ int VP8DecodeLayer(VP8Decoder* const dec);

 //------------------------------------------------------------------------------

-#if defined(__cplusplus) || defined(c_plusplus)
+#ifdef __cplusplus
 }    // extern "C"
 #endif

--- a/src/dec/vp8l.c
+++ b/src/dec/vp8l.c
--- a/src/dec/vp8li.h
+++ b/src/dec/vp8li.h
@ -1,8 +1,10 @@
 // Copyright 2012 Google Inc. All Rights Reserved.
 //
-// This code is licensed under the same terms as WebM:
-//  Software License Agreement:  http://www.webmproject.org/license/software/
-//  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
 // -----------------------------------------------------------------------------
 //
 // Lossless decoder: internal header.
@ -20,7 +22,7 @@
 #include "../utils/huffman.h"
 #include "../webp/format_constants.h"

-#if defined(__cplusplus) || defined(c_plusplus)
+#ifdef __cplusplus
 extern "C" {
 #endif

@ -55,13 +57,17 @@ typedef struct {
  HTreeGroup     *htree_groups_;
 } VP8LMetadata;

-typedef struct {
+typedef struct VP8LDecoder VP8LDecoder;
+struct VP8LDecoder {
  VP8StatusCode    status_;
  VP8LDecodeState  action_;
  VP8LDecodeState  state_;
  VP8Io           *io_;

-  uint32_t        *argb_;          // Internal data: always in BGRA color mode.
+  const WebPDecBuffer *output_;    // shortcut to io->opaque->output
+
+  uint32_t        *pixels_;        // Internal data: either uint8_t* for alpha
+                                   // or uint32_t* for BGRA.
  uint32_t        *argb_cache_;    // Scratch buffer for temporary BGRA storage.

  VP8LBitReader    br_;
@ -69,6 +75,9 @@ typedef struct {
  int              width_;
  int              height_;
  int              last_row_;      // last input row decoded so far.
+  int              last_pixel_;    // last pixel decoded so far. However, it may
+                                   // not be transformed, scaled and
+                                   // color-converted yet.
  int              last_out_row_;  // last row output so far.

  VP8LMetadata     hdr_;
@ -80,18 +89,27 @@ typedef struct {

  uint8_t         *rescaler_memory;  // Working memory for rescaling work.
  WebPRescaler    *rescaler;         // Common rescaler for all channels.
-} VP8LDecoder;
+};

 //------------------------------------------------------------------------------
 // internal functions. Not public.

+struct ALPHDecoder;  // Defined in dec/alphai.h.
+
 // in vp8l.c

-// Decodes a raw image stream (without header) and store the alpha data
-// into *output, which must be of size width x height. Returns false in case
-// of error.
-int VP8LDecodeAlphaImageStream(int width, int height, const uint8_t* const data,
-                               size_t data_size, uint8_t* const output);
+// Decodes image header for alpha data stored using lossless compression.
+// Returns false in case of error.
+int VP8LDecodeAlphaHeader(struct ALPHDecoder* const alph_dec,
+                          const uint8_t* const data, size_t data_size,
+                          uint8_t* const output);
+
+// Decodes *at least* 'last_row' rows of alpha. If some of the initial rows are
+// already decoded in previous call(s), it will resume decoding from where it
+// was paused.
+// Returns false in case of bitstream error.
+int VP8LDecodeAlphaImageStream(struct ALPHDecoder* const alph_dec,
+                               int last_row);

 // Allocates and initialize a new lossless decoder instance.
 VP8LDecoder* VP8LNew(void);
@ -112,7 +130,7 @@ void VP8LDelete(VP8LDecoder* const dec);

 //------------------------------------------------------------------------------

-#if defined(__cplusplus) || defined(c_plusplus)
+#ifdef __cplusplus
 }    // extern "C"
 #endif

--- a/src/dec/webp.c
+++ b/src/dec/webp.c
@ -1,8 +1,10 @@
 // Copyright 2010 Google Inc. All Rights Reserved.
 //
-// This code is licensed under the same terms as WebM:
-//  Software License Agreement:  http://www.webmproject.org/license/software/
-//  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
 // -----------------------------------------------------------------------------
 //
 // Main decoding functions for WEBP images.
@ -14,11 +16,7 @@
 #include "./vp8i.h"
 #include "./vp8li.h"
 #include "./webpi.h"
-#include "../webp/format_constants.h"
-
-#if defined(__cplusplus) || defined(c_plusplus)
-extern "C" {
-#endif
+#include "../webp/mux_types.h"  // ALPHA_FLAG

 //------------------------------------------------------------------------------
 // RIFF layout is:
@ -40,8 +38,8 @@ extern "C" {
 //   20..23  VP8X flags bit-map corresponding to the chunk-types present.
 //   24..26  Width of the Canvas Image.
 //   27..29  Height of the Canvas Image.
-// There can be extra chunks after the "VP8X" chunk (ICCP, TILE, FRM, VP8,
-// META  ...)
+// There can be extra chunks after the "VP8X" chunk (ICCP, FRGM, ANMF, VP8,
+// VP8L, XMP, EXIF  ...)
 // All sizes are in little-endian order.
 // Note: chunk data size must be padded to multiple of 2 when written.

@ -76,6 +74,9 @@ static VP8StatusCode ParseRIFF(const uint8_t** const data,
      if (size < TAG_SIZE + CHUNK_HEADER_SIZE) {
        return VP8_STATUS_BITSTREAM_ERROR;
      }
+      if (size > MAX_CHUNK_PAYLOAD) {
+        return VP8_STATUS_BITSTREAM_ERROR;
+      }
      // We have a RIFF container. Skip it.
      *riff_size = size;
      *data += RIFF_HEADER_SIZE;
@ -177,6 +178,9 @@ static VP8StatusCode ParseOptionalChunks(const uint8_t** const data,
    }

    chunk_size = get_le32(buf + TAG_SIZE);
+    if (chunk_size > MAX_CHUNK_PAYLOAD) {
+      return VP8_STATUS_BITSTREAM_ERROR;          // Not a valid chunk size.
+    }
    // For odd-sized chunk-payload, there's one byte padding at the end.
    disk_chunk_size = (CHUNK_HEADER_SIZE + chunk_size + 1) & ~1;
    total_size += disk_chunk_size;
@ -186,6 +190,15 @@ static VP8StatusCode ParseOptionalChunks(const uint8_t** const data,
      return VP8_STATUS_BITSTREAM_ERROR;          // Not a valid chunk size.
    }

+    // Start of a (possibly incomplete) VP8/VP8L chunk implies that we have
+    // parsed all the optional chunks.
+    // Note: This check must occur before the check 'buf_size < disk_chunk_size'
+    // below to allow incomplete VP8/VP8L chunks.
+    if (!memcmp(buf, "VP8 ", TAG_SIZE) ||
+        !memcmp(buf, "VP8L", TAG_SIZE)) {
+      return VP8_STATUS_OK;
+    }
+
    if (buf_size < disk_chunk_size) {             // Insufficient data.
      return VP8_STATUS_NOT_ENOUGH_DATA;
    }
@ -193,9 +206,6 @@ static VP8StatusCode ParseOptionalChunks(const uint8_t** const data,
    if (!memcmp(buf, "ALPH", TAG_SIZE)) {         // A valid ALPH header.
      *alpha_data = buf + CHUNK_HEADER_SIZE;
      *alpha_size = chunk_size;
-    } else if (!memcmp(buf, "VP8 ", TAG_SIZE) ||
-               !memcmp(buf, "VP8L", TAG_SIZE)) {  // A valid VP8/VP8L header.
-      return VP8_STATUS_OK;  // Found.
    }

    // We have a full and valid chunk; skip it.
@ -270,9 +280,18 @@ static VP8StatusCode ParseHeadersInternal(const uint8_t* data,
                                          int* const width,
                                          int* const height,
                                          int* const has_alpha,
+                                          int* const has_animation,
+                                          int* const format,
                                          WebPHeaderStructure* const headers) {
+  int canvas_width = 0;
+  int canvas_height = 0;
+  int image_width = 0;
+  int image_height = 0;
  int found_riff = 0;
  int found_vp8x = 0;
+  int animation_present = 0;
+  int fragments_present = 0;
+
  VP8StatusCode status;
  WebPHeaderStructure hdrs;

@ -293,22 +312,35 @@ static VP8StatusCode ParseHeadersInternal(const uint8_t* data,
  // Skip over VP8X.
  {
    uint32_t flags = 0;
-    status = ParseVP8X(&data, &data_size, &found_vp8x, width, height, &flags);
+    status = ParseVP8X(&data, &data_size, &found_vp8x,
+                       &canvas_width, &canvas_height, &flags);
    if (status != VP8_STATUS_OK) {
      return status;  // Wrong VP8X / insufficient data.
    }
+    animation_present = !!(flags & ANIMATION_FLAG);
+    fragments_present = !!(flags & FRAGMENTS_FLAG);
    if (!found_riff && found_vp8x) {
      // Note: This restriction may be removed in the future, if it becomes
      // necessary to send VP8X chunk to the decoder.
      return VP8_STATUS_BITSTREAM_ERROR;
    }
-    if (has_alpha != NULL) *has_alpha = !!(flags & ALPHA_FLAG_BIT);
-    if (found_vp8x && headers == NULL) {
-      return VP8_STATUS_OK;  // Return features from VP8X header.
+    if (has_alpha != NULL) *has_alpha = !!(flags & ALPHA_FLAG);
+    if (has_animation != NULL) *has_animation = animation_present;
+    if (format != NULL) *format = 0;   // default = undefined
+
+    image_width = canvas_width;
+    image_height = canvas_height;
+    if (found_vp8x && (animation_present || fragments_present) &&
+        headers == NULL) {
+      status = VP8_STATUS_OK;
+      goto ReturnWidthHeight;  // Just return features from VP8X header.
    }
  }

-  if (data_size < TAG_SIZE) return VP8_STATUS_NOT_ENOUGH_DATA;
+  if (data_size < TAG_SIZE) {
+    status = VP8_STATUS_NOT_ENOUGH_DATA;
+    goto ReturnWidthHeight;
+  }

  // Skip over optional chunks if data started with "RIFF + VP8X" or "ALPH".
  if ((found_riff && found_vp8x) ||
@ -316,7 +348,7 @@ static VP8StatusCode ParseHeadersInternal(const uint8_t* data,
    status = ParseOptionalChunks(&data, &data_size, hdrs.riff_size,
                                 &hdrs.alpha_data, &hdrs.alpha_data_size);
    if (status != VP8_STATUS_OK) {
-      return status;  // Found an invalid chunk size / insufficient data.
+      goto ReturnWidthHeight;  // Invalid chunk size / insufficient data.
    }
  }

@ -324,35 +356,41 @@ static VP8StatusCode ParseHeadersInternal(const uint8_t* data,
  status = ParseVP8Header(&data, &data_size, hdrs.riff_size,
                          &hdrs.compressed_size, &hdrs.is_lossless);
  if (status != VP8_STATUS_OK) {
-    return status;  // Wrong VP8/VP8L chunk-header / insufficient data.
+    goto ReturnWidthHeight;  // Wrong VP8/VP8L chunk-header / insufficient data.
  }
  if (hdrs.compressed_size > MAX_CHUNK_PAYLOAD) {
    return VP8_STATUS_BITSTREAM_ERROR;
  }

+  if (format != NULL && !(animation_present || fragments_present)) {
+    *format = hdrs.is_lossless ? 2 : 1;
+  }
+
  if (!hdrs.is_lossless) {
    if (data_size < VP8_FRAME_HEADER_SIZE) {
-      return VP8_STATUS_NOT_ENOUGH_DATA;
+      status = VP8_STATUS_NOT_ENOUGH_DATA;
+      goto ReturnWidthHeight;
    }
    // Validates raw VP8 data.
-    if (!VP8GetInfo(data, data_size,
-                    (uint32_t)hdrs.compressed_size, width, height)) {
+    if (!VP8GetInfo(data, data_size, (uint32_t)hdrs.compressed_size,
+                    &image_width, &image_height)) {
      return VP8_STATUS_BITSTREAM_ERROR;
    }
  } else {
    if (data_size < VP8L_FRAME_HEADER_SIZE) {
-      return VP8_STATUS_NOT_ENOUGH_DATA;
+      status = VP8_STATUS_NOT_ENOUGH_DATA;
+      goto ReturnWidthHeight;
    }
    // Validates raw VP8L data.
-    if (!VP8LGetInfo(data, data_size, width, height, has_alpha)) {
+    if (!VP8LGetInfo(data, data_size, &image_width, &image_height, has_alpha)) {
      return VP8_STATUS_BITSTREAM_ERROR;
    }
  }
-
-  if (has_alpha != NULL) {
-    // If the data did not contain a VP8X/VP8L chunk the only definitive way
-    // to set this is by looking for alpha data (from an ALPH chunk).
-    *has_alpha |= (hdrs.alpha_data != NULL);
+  // Validates image size coherency.
+  if (found_vp8x) {
+    if (canvas_width != image_width || canvas_height != image_height) {
+      return VP8_STATUS_BITSTREAM_ERROR;
+    }
  }
  if (headers != NULL) {
    *headers = hdrs;
@ -360,21 +398,44 @@ static VP8StatusCode ParseHeadersInternal(const uint8_t* data,
    assert((uint64_t)(data - headers->data) < MAX_CHUNK_PAYLOAD);
    assert(headers->offset == headers->data_size - data_size);
  }
-  return VP8_STATUS_OK;  // Return features from VP8 header.
+ ReturnWidthHeight:
+  if (status == VP8_STATUS_OK ||
+      (status == VP8_STATUS_NOT_ENOUGH_DATA && found_vp8x && headers == NULL)) {
+    if (has_alpha != NULL) {
+      // If the data did not contain a VP8X/VP8L chunk the only definitive way
+      // to set this is by looking for alpha data (from an ALPH chunk).
+      *has_alpha |= (hdrs.alpha_data != NULL);
+    }
+    if (width != NULL) *width = image_width;
+    if (height != NULL) *height = image_height;
+    return VP8_STATUS_OK;
+  } else {
+    return status;
+  }
 }

 VP8StatusCode WebPParseHeaders(WebPHeaderStructure* const headers) {
+  VP8StatusCode status;
+  int has_animation = 0;
  assert(headers != NULL);
  // fill out headers, ignore width/height/has_alpha.
-  return ParseHeadersInternal(headers->data, headers->data_size,
-                              NULL, NULL, NULL, headers);
+  status = ParseHeadersInternal(headers->data, headers->data_size,
+                                NULL, NULL, NULL, &has_animation,
+                                NULL, headers);
+  if (status == VP8_STATUS_OK || status == VP8_STATUS_NOT_ENOUGH_DATA) {
+    // TODO(jzern): full support of animation frames will require API additions.
+    if (has_animation) {
+      status = VP8_STATUS_UNSUPPORTED_FEATURE;
+    }
+  }
+  return status;
 }

 //------------------------------------------------------------------------------
 // WebPDecParams

 void WebPResetDecParams(WebPDecParams* const params) {
-  if (params) {
+  if (params != NULL) {
    memset(params, 0, sizeof(*params));
  }
 }
@ -407,11 +468,6 @@ static VP8StatusCode DecodeInto(const uint8_t* const data, size_t data_size,
    if (dec == NULL) {
      return VP8_STATUS_OUT_OF_MEMORY;
    }
-#ifdef WEBP_USE_THREAD
-    dec->use_threads_ = params->options && (params->options->use_threads > 0);
-#else
-    dec->use_threads_ = 0;
-#endif
    dec->alpha_data_ = headers.alpha_data;
    dec->alpha_data_size_ = headers.alpha_data_size;

@ -423,6 +479,10 @@ static VP8StatusCode DecodeInto(const uint8_t* const data, size_t data_size,
      status = WebPAllocateDecBuffer(io.width, io.height, params->options,
                                     params->output);
      if (status == VP8_STATUS_OK) {  // Decode
+        // Threading and dithering must be set up before calling VP8Decode().
+        dec->mt_method_ = VP8GetThreadMethod(params->options, &headers,
+                                             io.width, io.height);
+        VP8InitDithering(params->options, dec);
        if (!VP8Decode(dec, &io)) {
          status = dec->status_;
        }
@ -609,7 +669,6 @@ uint8_t* WebPDecodeYUV(const uint8_t* data, size_t data_size,
 static void DefaultFeatures(WebPBitstreamFeatures* const features) {
  assert(features != NULL);
  memset(features, 0, sizeof(*features));
-  features->bitstream_version = 0;
 }

 static VP8StatusCode GetFeatures(const uint8_t* const data, size_t data_size,
@ -619,10 +678,11 @@ static VP8StatusCode GetFeatures(const uint8_t* const data, size_t data_size,
  }
  DefaultFeatures(features);

-  // Only parse enough of the data to retrieve width/height/has_alpha.
+  // Only parse enough of the data to retrieve the features.
  return ParseHeadersInternal(data, data_size,
                              &features->width, &features->height,
-                              &features->has_alpha, NULL);
+                              &features->has_alpha, &features->has_animation,
+                              &features->format, NULL);
 }

 //------------------------------------------------------------------------------
@ -666,19 +726,13 @@ int WebPInitDecoderConfigInternal(WebPDecoderConfig* config,
 VP8StatusCode WebPGetFeaturesInternal(const uint8_t* data, size_t data_size,
                                      WebPBitstreamFeatures* features,
                                      int version) {
-  VP8StatusCode status;
  if (WEBP_ABI_IS_INCOMPATIBLE(version, WEBP_DECODER_ABI_VERSION)) {
    return VP8_STATUS_INVALID_PARAM;   // version mismatch
  }
  if (features == NULL) {
    return VP8_STATUS_INVALID_PARAM;
  }
-
-  status = GetFeatures(data, data_size, features);
-  if (status == VP8_STATUS_NOT_ENOUGH_DATA) {
-    return VP8_STATUS_BITSTREAM_ERROR;  // Not-enough-data treated as error.
-  }
-  return status;
+  return GetFeatures(data, data_size, features);
 }

 VP8StatusCode WebPDecode(const uint8_t* data, size_t data_size,
@ -766,6 +820,3 @@ int WebPIoInitFromOptions(const WebPDecoderOptions* const options,

 //------------------------------------------------------------------------------

-#if defined(__cplusplus) || defined(c_plusplus)
-}    // extern "C"
-#endif
--- a/src/dec/webpi.h
+++ b/src/dec/webpi.h
@ -1,8 +1,10 @@
 // Copyright 2011 Google Inc. All Rights Reserved.
 //
-// This code is licensed under the same terms as WebM:
-//  Software License Agreement:  http://www.webmproject.org/license/software/
-//  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
 // -----------------------------------------------------------------------------
 //
 // Internal header: WebP decoding parameters and custom IO on buffer
@ -12,7 +14,7 @@
 #ifndef WEBP_DEC_WEBPI_H_
 #define WEBP_DEC_WEBPI_H_

-#if defined(__cplusplus) || defined(c_plusplus)
+#ifdef __cplusplus
 extern "C" {
 #endif

@ -61,10 +63,10 @@ typedef struct {
 } WebPHeaderStructure;

 // Skips over all valid chunks prior to the first VP8/VP8L frame header.
-// Returns VP8_STATUS_OK on success,
-//         VP8_STATUS_BITSTREAM_ERROR if an invalid header/chunk is found, and
-//         VP8_STATUS_NOT_ENOUGH_DATA if case of insufficient data.
-// In 'headers', compressed_size, offset, alpha_data, alpha_size and lossless
+// Returns: VP8_STATUS_OK, VP8_STATUS_BITSTREAM_ERROR (invalid header/chunk),
+// VP8_STATUS_NOT_ENOUGH_DATA (partial input) or VP8_STATUS_UNSUPPORTED_FEATURE
+// in the case of non-decodable features (animation for instance).
+// In 'headers', compressed_size, offset, alpha_data, alpha_size, and lossless
 // fields are updated appropriately upon success.
 VP8StatusCode WebPParseHeaders(WebPHeaderStructure* const headers);

@ -107,7 +109,7 @@ void WebPGrabDecBuffer(WebPDecBuffer* const src, WebPDecBuffer* const dst);

 //------------------------------------------------------------------------------

-#if defined(__cplusplus) || defined(c_plusplus)
+#ifdef __cplusplus
 }    // extern "C"
 #endif

--- a/src/demux/Makefile.am
+++ b/src/demux/Makefile.am
@ -0,0 +1,15 @@
+AM_CPPFLAGS = -I$(top_srcdir)/src
+lib_LTLIBRARIES = libwebpdemux.la
+
+libwebpdemux_la_SOURCES =
+libwebpdemux_la_SOURCES += demux.c
+
+libwebpdemuxinclude_HEADERS =
+libwebpdemuxinclude_HEADERS += ../webp/demux.h
+libwebpdemuxinclude_HEADERS += ../webp/mux_types.h
+libwebpdemuxinclude_HEADERS += ../webp/types.h
+
+libwebpdemux_la_LIBADD = ../libwebp.la
+libwebpdemux_la_LDFLAGS = -no-undefined -version-info 1:0:0
+libwebpdemuxincludedir = $(includedir)/webp
+pkgconfig_DATA = libwebpdemux.pc
--- a/src/demux/demux.c
+++ b/src/demux/demux.c
@ -1,26 +1,31 @@
 // Copyright 2012 Google Inc. All Rights Reserved.
 //
-// This code is licensed under the same terms as WebM:
-//  Software License Agreement:  http://www.webmproject.org/license/software/
-//  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
 // -----------------------------------------------------------------------------
 //
 //  WebP container demux.
 //

-#include "../webp/mux.h"
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif

+#include <assert.h>
 #include <stdlib.h>
 #include <string.h>

-#include "../webp/decode.h"  // WebPGetInfo
+#include "../utils/utils.h"
+#include "../webp/decode.h"     // WebPGetFeatures
+#include "../webp/demux.h"
 #include "../webp/format_constants.h"

-#if defined(__cplusplus) || defined(c_plusplus)
-extern "C" {
-#endif
-
-#define MKFOURCC(a, b, c, d) ((uint32_t)(a) | (b) << 8 | (c) << 16 | (d) << 24)
+#define DMUX_MAJ_VERSION 0
+#define DMUX_MIN_VERSION 2
+#define DMUX_REV_VERSION 0

 typedef struct {
  size_t start_;        // start location of the data
@ -38,9 +43,12 @@ typedef struct {
 typedef struct Frame {
  int x_offset_, y_offset_;
  int width_, height_;
+  int has_alpha_;
  int duration_;
-  int is_tile_;    // this is an image fragment from a 'TILE'.
-  int frame_num_;  // the referent frame number for use in assembling tiles.
+  WebPMuxAnimDispose dispose_method_;
+  WebPMuxAnimBlend blend_method_;
+  int is_fragment_;  // this is a frame fragment (and not a full frame).
+  int frame_num_;  // the referent frame number for use in assembling fragments.
  int complete_;   // img_components_ contains a full image.
  ChunkData img_components_[2];  // 0=VP8{,L} 1=ALPH
  struct Frame* next_;
@ -58,9 +66,12 @@ struct WebPDemuxer {
  uint32_t feature_flags_;
  int canvas_width_, canvas_height_;
  int loop_count_;
+  uint32_t bgcolor_;
  int num_frames_;
  Frame* frames_;
+  Frame** frames_tail_;
  Chunk* chunks_;  // non-image chunks
+  Chunk** chunks_tail_;
 };

 typedef enum {
@ -87,6 +98,12 @@ static const ChunkParser kMasterChunks[] = {
  { { '0', '0', '0', '0' }, NULL,             NULL },
 };

+//------------------------------------------------------------------------------
+
+int WebPGetDemuxVersion(void) {
+  return (DMUX_MAJ_VERSION << 16) | (DMUX_MIN_VERSION << 8) | DMUX_REV_VERSION;
+}
+
 // -----------------------------------------------------------------------------
 // MemBuffer

@ -127,43 +144,30 @@ static WEBP_INLINE const uint8_t* GetBuffer(MemBuffer* const mem) {
  return mem->buf_ + mem->start_;
 }

-static WEBP_INLINE uint8_t GetByte(MemBuffer* const mem) {
+// Read from 'mem' and skip the read bytes.
+static WEBP_INLINE uint8_t ReadByte(MemBuffer* const mem) {
  const uint8_t byte = mem->buf_[mem->start_];
  Skip(mem, 1);
  return byte;
 }

-// Read 16, 24 or 32 bits stored in little-endian order.
-static WEBP_INLINE int ReadLE16s(const uint8_t* const data) {
-  return (int)(data[0] << 0) | (data[1] << 8);
-}
-
-static WEBP_INLINE int ReadLE24s(const uint8_t* const data) {
-  return ReadLE16s(data) | (data[2] << 16);
-}
-
-static WEBP_INLINE uint32_t ReadLE32(const uint8_t* const data) {
-  return (uint32_t)ReadLE24s(data) | (data[3] << 24);
-}
-
-// In addition to reading, skip the read bytes.
-static WEBP_INLINE int GetLE16s(MemBuffer* const mem) {
+static WEBP_INLINE int ReadLE16s(MemBuffer* const mem) {
  const uint8_t* const data = mem->buf_ + mem->start_;
-  const int val = ReadLE16s(data);
+  const int val = GetLE16(data);
  Skip(mem, 2);
  return val;
 }

-static WEBP_INLINE int GetLE24s(MemBuffer* const mem) {
+static WEBP_INLINE int ReadLE24s(MemBuffer* const mem) {
  const uint8_t* const data = mem->buf_ + mem->start_;
-  const int val = ReadLE24s(data);
+  const int val = GetLE24(data);
  Skip(mem, 3);
  return val;
 }

-static WEBP_INLINE uint32_t GetLE32(MemBuffer* const mem) {
+static WEBP_INLINE uint32_t ReadLE32(MemBuffer* const mem) {
  const uint8_t* const data = mem->buf_ + mem->start_;
-  const uint32_t val = ReadLE32(data);
+  const uint32_t val = GetLE32(data);
  Skip(mem, 4);
  return val;
 }
@ -172,41 +176,37 @@ static WEBP_INLINE uint32_t GetLE32(MemBuffer* const mem) {
 // Secondary chunk parsing

 static void AddChunk(WebPDemuxer* const dmux, Chunk* const chunk) {
-  Chunk** c = &dmux->chunks_;
-  while (*c != NULL) c = &(*c)->next_;
-  *c = chunk;
+  *dmux->chunks_tail_ = chunk;
  chunk->next_ = NULL;
+  dmux->chunks_tail_ = &chunk->next_;
 }

 // Add a frame to the end of the list, ensuring the last frame is complete.
 // Returns true on success, false otherwise.
 static int AddFrame(WebPDemuxer* const dmux, Frame* const frame) {
-  const Frame* last_frame = NULL;
-  Frame** f = &dmux->frames_;
-  while (*f != NULL) {
-    last_frame = *f;
-    f = &(*f)->next_;
-  }
+  const Frame* const last_frame = *dmux->frames_tail_;
  if (last_frame != NULL && !last_frame->complete_) return 0;
-  *f = frame;
+
+  *dmux->frames_tail_ = frame;
  frame->next_ = NULL;
+  dmux->frames_tail_ = &frame->next_;
  return 1;
 }

 // Store image bearing chunks to 'frame'.
-static ParseStatus StoreFrame(int frame_num, MemBuffer* const mem,
-                              Frame* const frame) {
+static ParseStatus StoreFrame(int frame_num, uint32_t min_size,
+                              MemBuffer* const mem, Frame* const frame) {
  int alpha_chunks = 0;
  int image_chunks = 0;
-  int done = (MemDataSize(mem) < CHUNK_HEADER_SIZE);
+  int done = (MemDataSize(mem) < min_size);
  ParseStatus status = PARSE_OK;

  if (done) return PARSE_NEED_MORE_DATA;

  do {
    const size_t chunk_start_offset = mem->start_;
-    const uint32_t fourcc = GetLE32(mem);
-    const uint32_t payload_size = GetLE32(mem);
+    const uint32_t fourcc = ReadLE32(mem);
+    const uint32_t payload_size = ReadLE32(mem);
    const uint32_t payload_size_padded = payload_size + (payload_size & 1);
    const size_t payload_available = (payload_size_padded > MemDataSize(mem))
                                   ? MemDataSize(mem) : payload_size_padded;
@ -222,29 +222,37 @@ static ParseStatus StoreFrame(int frame_num, MemBuffer* const mem,
          ++alpha_chunks;
          frame->img_components_[1].offset_ = chunk_start_offset;
          frame->img_components_[1].size_ = chunk_size;
+          frame->has_alpha_ = 1;
          frame->frame_num_ = frame_num;
          Skip(mem, payload_available);
        } else {
          goto Done;
        }
        break;
-      case MKFOURCC('V', 'P', '8', ' '):
      case MKFOURCC('V', 'P', '8', 'L'):
+        if (alpha_chunks > 0) return PARSE_ERROR;  // VP8L has its own alpha
+        // fall through
+      case MKFOURCC('V', 'P', '8', ' '):
        if (image_chunks == 0) {
-          int width = 0, height = 0;
+          // Extract the bitstream features, tolerating failures when the data
+          // is incomplete.
+          WebPBitstreamFeatures features;
+          const VP8StatusCode vp8_status =
+              WebPGetFeatures(mem->buf_ + chunk_start_offset, chunk_size,
+                              &features);
+          if (status == PARSE_NEED_MORE_DATA &&
+              vp8_status == VP8_STATUS_NOT_ENOUGH_DATA) {
+            return PARSE_NEED_MORE_DATA;
+          } else if (vp8_status != VP8_STATUS_OK) {
+            // We have enough data, and yet WebPGetFeatures() failed.
+            return PARSE_ERROR;
+          }
          ++image_chunks;
          frame->img_components_[0].offset_ = chunk_start_offset;
          frame->img_components_[0].size_ = chunk_size;
-          // Extract the width and height from the bitstream, tolerating
-          // failures when the data is incomplete.
-          if (!WebPGetInfo(mem->buf_ + frame->img_components_[0].offset_,
-                           frame->img_components_[0].size_, &width, &height) &&
-              status != PARSE_NEED_MORE_DATA) {
-            return PARSE_ERROR;
-          }
-
-          frame->width_ = width;
-          frame->height_ = height;
+          frame->width_ = features.width;
+          frame->height_ = features.height;
+          frame->has_alpha_ |= features.has_alpha;
          frame->frame_num_ = frame_num;
          frame->complete_ = (status == PARSE_OK);
          Skip(mem, payload_available);
@ -275,43 +283,48 @@ static ParseStatus StoreFrame(int frame_num, MemBuffer* const mem,
 // Returns PARSE_OK on success with *frame pointing to the new Frame.
 // Returns PARSE_NEED_MORE_DATA with insufficient data, PARSE_ERROR otherwise.
 static ParseStatus NewFrame(const MemBuffer* const mem,
-                            uint32_t min_size, uint32_t expected_size,
-                            uint32_t actual_size, Frame** frame) {
+                            uint32_t min_size, uint32_t actual_size,
+                            Frame** frame) {
  if (SizeIsInvalid(mem, min_size)) return PARSE_ERROR;
-  if (actual_size < expected_size) return PARSE_ERROR;
+  if (actual_size < min_size) return PARSE_ERROR;
  if (MemDataSize(mem) < min_size)  return PARSE_NEED_MORE_DATA;

  *frame = (Frame*)calloc(1, sizeof(**frame));
  return (*frame == NULL) ? PARSE_ERROR : PARSE_OK;
 }

-// Parse a 'FRM ' chunk and any image bearing chunks that immediately follow.
+// Parse an 'ANMF' chunk and any image bearing chunks that immediately follow.
 // 'frame_chunk_size' is the previously validated, padded chunk size.
-static ParseStatus ParseFrame(
+static ParseStatus ParseAnimationFrame(
    WebPDemuxer* const dmux, uint32_t frame_chunk_size) {
-  const int has_frames = !!(dmux->feature_flags_ & ANIMATION_FLAG);
-  const uint32_t min_size = frame_chunk_size + CHUNK_HEADER_SIZE;
+  const int is_animation = !!(dmux->feature_flags_ & ANIMATION_FLAG);
+  const uint32_t anmf_payload_size = frame_chunk_size - ANMF_CHUNK_SIZE;
  int added_frame = 0;
+  int bits;
  MemBuffer* const mem = &dmux->mem_;
  Frame* frame;
  ParseStatus status =
-      NewFrame(mem, min_size, FRAME_CHUNK_SIZE, frame_chunk_size, &frame);
+      NewFrame(mem, ANMF_CHUNK_SIZE, frame_chunk_size, &frame);
  if (status != PARSE_OK) return status;

-  frame->x_offset_ = 2 * GetLE24s(mem);
-  frame->y_offset_ = 2 * GetLE24s(mem);
-  frame->width_    = 1 + GetLE24s(mem);
-  frame->height_   = 1 + GetLE24s(mem);
-  frame->duration_ = 1 + GetLE24s(mem);
-  Skip(mem, frame_chunk_size - FRAME_CHUNK_SIZE);  // skip any trailing data.
+  frame->x_offset_       = 2 * ReadLE24s(mem);
+  frame->y_offset_       = 2 * ReadLE24s(mem);
+  frame->width_          = 1 + ReadLE24s(mem);
+  frame->height_         = 1 + ReadLE24s(mem);
+  frame->duration_       = ReadLE24s(mem);
+  bits = ReadByte(mem);
+  frame->dispose_method_ =
+      (bits & 1) ? WEBP_MUX_DISPOSE_BACKGROUND : WEBP_MUX_DISPOSE_NONE;
+  frame->blend_method_ = (bits & 2) ? WEBP_MUX_NO_BLEND : WEBP_MUX_BLEND;
  if (frame->width_ * (uint64_t)frame->height_ >= MAX_IMAGE_AREA) {
+    free(frame);
    return PARSE_ERROR;
  }

-  // Store a (potentially partial) frame only if the animation flag is set
-  // and there is some data in 'frame'.
-  status = StoreFrame(dmux->num_frames_ + 1, mem, frame);
-  if (status != PARSE_ERROR && has_frames && frame->frame_num_ > 0) {
+  // Store a frame only if the animation flag is set and there is some data
+  // available for this frame.
+  status = StoreFrame(dmux->num_frames_ + 1, anmf_payload_size, mem, frame);
+  if (status != PARSE_ERROR && is_animation && frame->frame_num_ > 0) {
    added_frame = AddFrame(dmux, frame);
    if (added_frame) {
      ++dmux->num_frames_;
@ -324,38 +337,43 @@ static ParseStatus ParseFrame(
  return status;
 }

-// Parse a 'TILE' chunk and any image bearing chunks that immediately follow.
-// 'tile_chunk_size' is the previously validated, padded chunk size.
-static ParseStatus ParseTile(WebPDemuxer* const dmux,
-                             uint32_t tile_chunk_size) {
-  const int has_tiles = !!(dmux->feature_flags_ & TILE_FLAG);
-  const uint32_t min_size = tile_chunk_size + CHUNK_HEADER_SIZE;
-  int added_tile = 0;
+#ifdef WEBP_EXPERIMENTAL_FEATURES
+// Parse a 'FRGM' chunk and any image bearing chunks that immediately follow.
+// 'fragment_chunk_size' is the previously validated, padded chunk size.
+static ParseStatus ParseFragment(WebPDemuxer* const dmux,
+                                 uint32_t fragment_chunk_size) {
+  const int frame_num = 1;  // All fragments belong to the 1st (and only) frame.
+  const int is_fragmented = !!(dmux->feature_flags_ & FRAGMENTS_FLAG);
+  const uint32_t frgm_payload_size = fragment_chunk_size - FRGM_CHUNK_SIZE;
+  int added_fragment = 0;
  MemBuffer* const mem = &dmux->mem_;
  Frame* frame;
  ParseStatus status =
-      NewFrame(mem, min_size, TILE_CHUNK_SIZE, tile_chunk_size, &frame);
+      NewFrame(mem, FRGM_CHUNK_SIZE, fragment_chunk_size, &frame);
  if (status != PARSE_OK) return status;

-  frame->is_tile_  = 1;
-  frame->x_offset_ = 2 * GetLE24s(mem);
-  frame->y_offset_ = 2 * GetLE24s(mem);
-  Skip(mem, tile_chunk_size - TILE_CHUNK_SIZE);  // skip any trailing data.
+  frame->is_fragment_ = 1;
+  frame->x_offset_ = 2 * ReadLE24s(mem);
+  frame->y_offset_ = 2 * ReadLE24s(mem);

-  // Store a (potentially partial) tile only if the tile flag is set
-  // and the tile contains some data.
-  status = StoreFrame(dmux->num_frames_, mem, frame);
-  if (status != PARSE_ERROR && has_tiles && frame->frame_num_ > 0) {
-    // Note num_frames_ is incremented only when all tiles have been consumed.
-    added_tile = AddFrame(dmux, frame);
-    if (!added_tile) status = PARSE_ERROR;
+  // Store a fragment only if the 'fragments' flag is set and there is some
+  // data available.
+  status = StoreFrame(frame_num, frgm_payload_size, mem, frame);
+  if (status != PARSE_ERROR && is_fragmented && frame->frame_num_ > 0) {
+    added_fragment = AddFrame(dmux, frame);
+    if (!added_fragment) {
+      status = PARSE_ERROR;
+    } else {
+      dmux->num_frames_ = 1;
+    }
  }

-  if (!added_tile) free(frame);
+  if (!added_fragment) free(frame);
  return status;
 }
+#endif  // WEBP_EXPERIMENTAL_FEATURES

-// General chunk storage starting with the header at 'start_offset' allowing
+// General chunk storage, starting with the header at 'start_offset', allowing
 // the user to request the payload via a fourcc string. 'size' includes the
 // header and the unpadded payload size.
 // Returns true on success, false otherwise.
@ -373,20 +391,20 @@ static int StoreChunk(WebPDemuxer* const dmux,
 // -----------------------------------------------------------------------------
 // Primary chunk parsing

-static int ReadHeader(MemBuffer* const mem) {
+static ParseStatus ReadHeader(MemBuffer* const mem) {
  const size_t min_size = RIFF_HEADER_SIZE + CHUNK_HEADER_SIZE;
  uint32_t riff_size;

  // Basic file level validation.
-  if (MemDataSize(mem) < min_size) return 0;
+  if (MemDataSize(mem) < min_size) return PARSE_NEED_MORE_DATA;
  if (memcmp(GetBuffer(mem), "RIFF", CHUNK_SIZE_BYTES) ||
      memcmp(GetBuffer(mem) + CHUNK_HEADER_SIZE, "WEBP", CHUNK_SIZE_BYTES)) {
-    return 0;
+    return PARSE_ERROR;
  }

-  riff_size = ReadLE32(GetBuffer(mem) + TAG_SIZE);
-  if (riff_size < CHUNK_HEADER_SIZE) return 0;
-  if (riff_size > MAX_CHUNK_PAYLOAD) return 0;
+  riff_size = GetLE32(GetBuffer(mem) + TAG_SIZE);
+  if (riff_size < CHUNK_HEADER_SIZE) return PARSE_ERROR;
+  if (riff_size > MAX_CHUNK_PAYLOAD) return PARSE_ERROR;

  // There's no point in reading past the end of the RIFF chunk
  mem->riff_end_ = riff_size + CHUNK_HEADER_SIZE;
@ -395,7 +413,7 @@ static int ReadHeader(MemBuffer* const mem) {
  }

  Skip(mem, RIFF_HEADER_SIZE);
-  return 1;
+  return PARSE_OK;
 }

 static ParseStatus ParseSingleImage(WebPDemuxer* const dmux) {
@ -403,6 +421,7 @@ static ParseStatus ParseSingleImage(WebPDemuxer* const dmux) {
  MemBuffer* const mem = &dmux->mem_;
  Frame* frame;
  ParseStatus status;
+  int image_added = 0;

  if (dmux->frames_ != NULL) return PARSE_ERROR;
  if (SizeIsInvalid(mem, min_size)) return PARSE_ERROR;
@ -411,65 +430,49 @@ static ParseStatus ParseSingleImage(WebPDemuxer* const dmux) {
  frame = (Frame*)calloc(1, sizeof(*frame));
  if (frame == NULL) return PARSE_ERROR;

-  status = StoreFrame(1, &dmux->mem_, frame);
+  // For the single image case we allow parsing of a partial frame, but we need
+  // at least CHUNK_HEADER_SIZE for parsing.
+  status = StoreFrame(1, CHUNK_HEADER_SIZE, &dmux->mem_, frame);
  if (status != PARSE_ERROR) {
    const int has_alpha = !!(dmux->feature_flags_ & ALPHA_FLAG);
    // Clear any alpha when the alpha flag is missing.
    if (!has_alpha && frame->img_components_[1].size_ > 0) {
      frame->img_components_[1].offset_ = 0;
      frame->img_components_[1].size_ = 0;
+      frame->has_alpha_ = 0;
    }

    // Use the frame width/height as the canvas values for non-vp8x files.
+    // Also, set ALPHA_FLAG if this is a lossless image with alpha.
    if (!dmux->is_ext_format_ && frame->width_ > 0 && frame->height_ > 0) {
      dmux->state_ = WEBP_DEMUX_PARSED_HEADER;
      dmux->canvas_width_ = frame->width_;
      dmux->canvas_height_ = frame->height_;
+      dmux->feature_flags_ |= frame->has_alpha_ ? ALPHA_FLAG : 0;
    }
-    AddFrame(dmux, frame);
-    dmux->num_frames_ = 1;
+    if (!AddFrame(dmux, frame)) {
+      status = PARSE_ERROR;  // last frame was left incomplete
    } else {
-    free(frame);
+      image_added = 1;
+      dmux->num_frames_ = 1;
+    }
  }

+  if (!image_added) free(frame);
  return status;
 }

-static ParseStatus ParseVP8X(WebPDemuxer* const dmux) {
+static ParseStatus ParseVP8XChunks(WebPDemuxer* const dmux) {
+  const int is_animation = !!(dmux->feature_flags_ & ANIMATION_FLAG);
  MemBuffer* const mem = &dmux->mem_;
-  int loop_chunks = 0;
-  uint32_t vp8x_size;
+  int anim_chunks = 0;
  ParseStatus status = PARSE_OK;

-  if (MemDataSize(mem) < CHUNK_HEADER_SIZE) return PARSE_NEED_MORE_DATA;
-
-  dmux->is_ext_format_ = 1;
-  Skip(mem, TAG_SIZE);  // VP8X
-  vp8x_size = GetLE32(mem);
-  if (vp8x_size > MAX_CHUNK_PAYLOAD) return PARSE_ERROR;
-  if (vp8x_size < VP8X_CHUNK_SIZE) return PARSE_ERROR;
-  vp8x_size += vp8x_size & 1;
-  if (SizeIsInvalid(mem, vp8x_size)) return PARSE_ERROR;
-  if (MemDataSize(mem) < vp8x_size) return PARSE_NEED_MORE_DATA;
-
-  dmux->feature_flags_ = GetByte(mem);
-  Skip(mem, 3);  // Reserved.
-  dmux->canvas_width_  = 1 + GetLE24s(mem);
-  dmux->canvas_height_ = 1 + GetLE24s(mem);
-  if (dmux->canvas_width_ * (uint64_t)dmux->canvas_height_ >= MAX_IMAGE_AREA) {
-    return PARSE_ERROR;  // image final dimension is too large
-  }
-  Skip(mem, vp8x_size - VP8X_CHUNK_SIZE);  // skip any trailing data.
-  dmux->state_ = WEBP_DEMUX_PARSED_HEADER;
-
-  if (SizeIsInvalid(mem, CHUNK_HEADER_SIZE)) return PARSE_ERROR;
-  if (MemDataSize(mem) < CHUNK_HEADER_SIZE) return PARSE_NEED_MORE_DATA;
-
  do {
    int store_chunk = 1;
    const size_t chunk_start_offset = mem->start_;
-    const uint32_t fourcc = GetLE32(mem);
-    const uint32_t chunk_size = GetLE32(mem);
+    const uint32_t fourcc = ReadLE32(mem);
+    const uint32_t chunk_size = ReadLE32(mem);
    const uint32_t chunk_size_padded = chunk_size + (chunk_size & 1);

    if (chunk_size > MAX_CHUNK_PAYLOAD) return PARSE_ERROR;
@ -482,40 +485,50 @@ static ParseStatus ParseVP8X(WebPDemuxer* const dmux) {
      case MKFOURCC('A', 'L', 'P', 'H'):
      case MKFOURCC('V', 'P', '8', ' '):
      case MKFOURCC('V', 'P', '8', 'L'): {
+        // check that this isn't an animation (all frames should be in an ANMF).
+        if (anim_chunks > 0 || is_animation) return PARSE_ERROR;
+
        Rewind(mem, CHUNK_HEADER_SIZE);
        status = ParseSingleImage(dmux);
        break;
      }
-      case MKFOURCC('L', 'O', 'O', 'P'): {
-        if (chunk_size_padded < LOOP_CHUNK_SIZE) return PARSE_ERROR;
+      case MKFOURCC('A', 'N', 'I', 'M'): {
+        if (chunk_size_padded < ANIM_CHUNK_SIZE) return PARSE_ERROR;

        if (MemDataSize(mem) < chunk_size_padded) {
          status = PARSE_NEED_MORE_DATA;
-        } else if (loop_chunks == 0) {
-          ++loop_chunks;
-          dmux->loop_count_ = GetLE16s(mem);
-          Skip(mem, chunk_size_padded - LOOP_CHUNK_SIZE);
+        } else if (anim_chunks == 0) {
+          ++anim_chunks;
+          dmux->bgcolor_ = ReadLE32(mem);
+          dmux->loop_count_ = ReadLE16s(mem);
+          Skip(mem, chunk_size_padded - ANIM_CHUNK_SIZE);
        } else {
          store_chunk = 0;
          goto Skip;
        }
        break;
      }
-      case MKFOURCC('F', 'R', 'M', ' '): {
-        status = ParseFrame(dmux, chunk_size_padded);
+      case MKFOURCC('A', 'N', 'M', 'F'): {
+        if (anim_chunks == 0) return PARSE_ERROR;  // 'ANIM' precedes frames.
+        status = ParseAnimationFrame(dmux, chunk_size_padded);
        break;
      }
-      case MKFOURCC('T', 'I', 'L', 'E'): {
-        if (dmux->num_frames_ == 0) dmux->num_frames_ = 1;
-        status = ParseTile(dmux, chunk_size_padded);
+#ifdef WEBP_EXPERIMENTAL_FEATURES
+      case MKFOURCC('F', 'R', 'G', 'M'): {
+        status = ParseFragment(dmux, chunk_size_padded);
        break;
      }
+#endif
      case MKFOURCC('I', 'C', 'C', 'P'): {
        store_chunk = !!(dmux->feature_flags_ & ICCP_FLAG);
        goto Skip;
      }
-      case MKFOURCC('M', 'E', 'T', 'A'): {
-        store_chunk = !!(dmux->feature_flags_ & META_FLAG);
+      case MKFOURCC('E', 'X', 'I', 'F'): {
+        store_chunk = !!(dmux->feature_flags_ & EXIF_FLAG);
+        goto Skip;
+      }
+      case MKFOURCC('X', 'M', 'P', ' '): {
+        store_chunk = !!(dmux->feature_flags_ & XMP_FLAG);
        goto Skip;
      }
 Skip:
@ -546,6 +559,37 @@ static ParseStatus ParseVP8X(WebPDemuxer* const dmux) {
  return status;
 }

+static ParseStatus ParseVP8X(WebPDemuxer* const dmux) {
+  MemBuffer* const mem = &dmux->mem_;
+  uint32_t vp8x_size;
+
+  if (MemDataSize(mem) < CHUNK_HEADER_SIZE) return PARSE_NEED_MORE_DATA;
+
+  dmux->is_ext_format_ = 1;
+  Skip(mem, TAG_SIZE);  // VP8X
+  vp8x_size = ReadLE32(mem);
+  if (vp8x_size > MAX_CHUNK_PAYLOAD) return PARSE_ERROR;
+  if (vp8x_size < VP8X_CHUNK_SIZE) return PARSE_ERROR;
+  vp8x_size += vp8x_size & 1;
+  if (SizeIsInvalid(mem, vp8x_size)) return PARSE_ERROR;
+  if (MemDataSize(mem) < vp8x_size) return PARSE_NEED_MORE_DATA;
+
+  dmux->feature_flags_ = ReadByte(mem);
+  Skip(mem, 3);  // Reserved.
+  dmux->canvas_width_  = 1 + ReadLE24s(mem);
+  dmux->canvas_height_ = 1 + ReadLE24s(mem);
+  if (dmux->canvas_width_ * (uint64_t)dmux->canvas_height_ >= MAX_IMAGE_AREA) {
+    return PARSE_ERROR;  // image final dimension is too large
+  }
+  Skip(mem, vp8x_size - VP8X_CHUNK_SIZE);  // skip any trailing data.
+  dmux->state_ = WEBP_DEMUX_PARSED_HEADER;
+
+  if (SizeIsInvalid(mem, CHUNK_HEADER_SIZE)) return PARSE_ERROR;
+  if (MemDataSize(mem) < CHUNK_HEADER_SIZE) return PARSE_NEED_MORE_DATA;
+
+  return ParseVP8XChunks(dmux);
+}
+
 // -----------------------------------------------------------------------------
 // Format validation

@ -560,30 +604,55 @@ static int IsValidSimpleFormat(const WebPDemuxer* const dmux) {
  return 1;
 }

+// If 'exact' is true, check that the image resolution matches the canvas.
+// If 'exact' is false, check that the x/y offsets do not exceed the canvas.
+// TODO(jzern): this is insufficient in the fragmented image case if the
+// expectation is that the fragments completely cover the canvas.
+static int CheckFrameBounds(const Frame* const frame, int exact,
+                            int canvas_width, int canvas_height) {
+  if (exact) {
+    if (frame->x_offset_ != 0 || frame->y_offset_ != 0) {
+      return 0;
+    }
+    if (frame->width_ != canvas_width || frame->height_ != canvas_height) {
+      return 0;
+    }
+  } else {
+    if (frame->x_offset_ < 0 || frame->y_offset_ < 0) return 0;
+    if (frame->width_ + frame->x_offset_ > canvas_width) return 0;
+    if (frame->height_ + frame->y_offset_ > canvas_height) return 0;
+  }
+  return 1;
+}
+
 static int IsValidExtendedFormat(const WebPDemuxer* const dmux) {
-  const int has_tiles = !!(dmux->feature_flags_ & TILE_FLAG);
-  const int has_frames = !!(dmux->feature_flags_ & ANIMATION_FLAG);
-  const Frame* f;
+  const int is_animation = !!(dmux->feature_flags_ & ANIMATION_FLAG);
+  const int is_fragmented = !!(dmux->feature_flags_ & FRAGMENTS_FLAG);
+  const Frame* f = dmux->frames_;

  if (dmux->state_ == WEBP_DEMUX_PARSING_HEADER) return 1;

  if (dmux->canvas_width_ <= 0 || dmux->canvas_height_ <= 0) return 0;
  if (dmux->loop_count_ < 0) return 0;
  if (dmux->state_ == WEBP_DEMUX_DONE && dmux->frames_ == NULL) return 0;
+#ifndef WEBP_EXPERIMENTAL_FEATURES
+  if (is_fragmented) return 0;
+#endif

-  for (f = dmux->frames_; f != NULL; f = f->next_) {
+  while (f != NULL) {
    const int cur_frame_set = f->frame_num_;
-    int frame_count = 0, tile_count = 0;
+    int frame_count = 0, fragment_count = 0;

-    // Check frame properties and if the image is composed of tiles that each
-    // fragment came from a 'TILE'.
+    // Check frame properties and, if the image is composed of fragments,
+    // that each fragment came from a 'FRGM' chunk.
    for (; f != NULL && f->frame_num_ == cur_frame_set; f = f->next_) {
      const ChunkData* const image = f->img_components_;
      const ChunkData* const alpha = f->img_components_ + 1;

-      if (!has_tiles && f->is_tile_) return 0;
-      if (!has_frames && f->frame_num_ > 1) return 0;
-      if (f->x_offset_ < 0 || f->y_offset_ < 0) return 0;
+      if (is_fragmented && !f->is_fragment_) return 0;
+      if (!is_fragmented && f->is_fragment_) return 0;
+      if (!is_animation && f->frame_num_ > 1) return 0;
+
      if (f->complete_) {
        if (alpha->size_ == 0 && image->size_ == 0) return 0;
        // Ensure alpha precedes image bitstream.
@ -593,6 +662,9 @@ static int IsValidExtendedFormat(const WebPDemuxer* const dmux) {

        if (f->width_ <= 0 || f->height_ <= 0) return 0;
      } else {
+        // There shouldn't be a partial frame in a complete file.
+        if (dmux->state_ == WEBP_DEMUX_DONE) return 0;
+
        // Ensure alpha precedes image bitstream.
        if (alpha->size_ > 0 && image->size_ > 0 &&
            alpha->offset_ > image->offset_) {
@ -602,12 +674,17 @@ static int IsValidExtendedFormat(const WebPDemuxer* const dmux) {
        if (f->next_ != NULL) return 0;
      }

-      tile_count += f->is_tile_;
+      if (f->width_ > 0 && f->height_ > 0 &&
+          !CheckFrameBounds(f, !(is_animation || is_fragmented),
+                            dmux->canvas_width_, dmux->canvas_height_)) {
+        return 0;
+      }
+
+      fragment_count += f->is_fragment_;
      ++frame_count;
    }
-    if (!has_tiles && frame_count > 1) return 0;
-    if (tile_count > 0 && frame_count != tile_count) return 0;
-    if (f == NULL) break;
+    if (!is_fragmented && frame_count > 1) return 0;
+    if (fragment_count > 0 && frame_count != fragment_count) return 0;
  }
  return 1;
 }
@ -618,8 +695,11 @@ static int IsValidExtendedFormat(const WebPDemuxer* const dmux) {
 static void InitDemux(WebPDemuxer* const dmux, const MemBuffer* const mem) {
  dmux->state_ = WEBP_DEMUX_PARSING_HEADER;
  dmux->loop_count_ = 1;
+  dmux->bgcolor_ = 0xFFFFFFFF;  // White background by default.
  dmux->canvas_width_ = -1;
  dmux->canvas_height_ = -1;
+  dmux->frames_tail_ = &dmux->frames_;
+  dmux->chunks_tail_ = &dmux->chunks_;
  dmux->mem_ = *mem;
 }

@ -631,11 +711,20 @@ WebPDemuxer* WebPDemuxInternal(const WebPData* data, int allow_partial,
  MemBuffer mem;
  WebPDemuxer* dmux;

-  if (WEBP_ABI_IS_INCOMPATIBLE(version, WEBP_DEMUX_ABI_VERSION)) return NULL;
-  if (data == NULL || data->bytes_ == NULL || data->size_ == 0) return NULL;
+  if (state != NULL) *state = WEBP_DEMUX_PARSE_ERROR;

-  if (!InitMemBuffer(&mem, data->bytes_, data->size_)) return NULL;
-  if (!ReadHeader(&mem)) return NULL;
+  if (WEBP_ABI_IS_INCOMPATIBLE(version, WEBP_DEMUX_ABI_VERSION)) return NULL;
+  if (data == NULL || data->bytes == NULL || data->size == 0) return NULL;
+
+  if (!InitMemBuffer(&mem, data->bytes, data->size)) return NULL;
+  status = ReadHeader(&mem);
+  if (status != PARSE_OK) {
+    if (state != NULL) {
+      *state = (status == PARSE_NEED_MORE_DATA) ? WEBP_DEMUX_PARSING_HEADER
+                                                : WEBP_DEMUX_PARSE_ERROR;
+    }
+    return NULL;
+  }

  partial = (mem.buf_size_ < mem.riff_end_);
  if (!allow_partial && partial) return NULL;
@ -644,15 +733,18 @@ WebPDemuxer* WebPDemuxInternal(const WebPData* data, int allow_partial,
  if (dmux == NULL) return NULL;
  InitDemux(dmux, &mem);

+  status = PARSE_ERROR;
  for (parser = kMasterChunks; parser->parse != NULL; ++parser) {
    if (!memcmp(parser->id, GetBuffer(&dmux->mem_), TAG_SIZE)) {
      status = parser->parse(dmux);
      if (status == PARSE_OK) dmux->state_ = WEBP_DEMUX_DONE;
+      if (status == PARSE_NEED_MORE_DATA && !partial) status = PARSE_ERROR;
      if (status != PARSE_ERROR && !parser->valid(dmux)) status = PARSE_ERROR;
+      if (status == PARSE_ERROR) dmux->state_ = WEBP_DEMUX_PARSE_ERROR;
      break;
    }
  }
-  if (state) *state = dmux->state_;
+  if (state != NULL) *state = dmux->state_;

  if (status == PARSE_ERROR) {
    WebPDemuxDelete(dmux);
@ -689,6 +781,8 @@ uint32_t WebPDemuxGetI(const WebPDemuxer* dmux, WebPFormatFeature feature) {
    case WEBP_FF_CANVAS_WIDTH:     return (uint32_t)dmux->canvas_width_;
    case WEBP_FF_CANVAS_HEIGHT:    return (uint32_t)dmux->canvas_height_;
    case WEBP_FF_LOOP_COUNT:       return (uint32_t)dmux->loop_count_;
+    case WEBP_FF_BACKGROUND_COLOR: return dmux->bgcolor_;
+    case WEBP_FF_FRAME_COUNT:      return (uint32_t)dmux->num_frames_;
  }
  return 0;
 }
@ -696,7 +790,8 @@ uint32_t WebPDemuxGetI(const WebPDemuxer* dmux, WebPFormatFeature feature) {
 // -----------------------------------------------------------------------------
 // Frame iteration

-// Find the first 'frame_num' frame. There may be multiple in a tiled frame.
+// Find the first 'frame_num' frame. There may be multiple such frames in a
+// fragmented frame.
 static const Frame* GetFrame(const WebPDemuxer* const dmux, int frame_num) {
  const Frame* f;
  for (f = dmux->frames_; f != NULL; f = f->next_) {
@ -705,19 +800,19 @@ static const Frame* GetFrame(const WebPDemuxer* const dmux, int frame_num) {
  return f;
 }

-// Returns tile 'tile_num' and the total count.
-static const Frame* GetTile(
-    const Frame* const frame_set, int tile_num, int* const count) {
+// Returns fragment 'fragment_num' and the total count.
+static const Frame* GetFragment(
+    const Frame* const frame_set, int fragment_num, int* const count) {
  const int this_frame = frame_set->frame_num_;
  const Frame* f = frame_set;
-  const Frame* tile = NULL;
+  const Frame* fragment = NULL;
  int total;

  for (total = 0; f != NULL && f->frame_num_ == this_frame; f = f->next_) {
-    if (++total == tile_num) tile = f;
+    if (++total == fragment_num) fragment = f;
  }
  *count = total;
-  return tile;
+  return fragment;
 }

 static const uint8_t* GetFramePayload(const uint8_t* const mem_buf,
@ -747,27 +842,33 @@ static const uint8_t* GetFramePayload(const uint8_t* const mem_buf,
 // Create a whole 'frame' from VP8 (+ alpha) or lossless.
 static int SynthesizeFrame(const WebPDemuxer* const dmux,
                           const Frame* const first_frame,
-                           int tile_num, WebPIterator* const iter) {
+                           int fragment_num, WebPIterator* const iter) {
  const uint8_t* const mem_buf = dmux->mem_.buf_;
-  int num_tiles;
+  int num_fragments;
  size_t payload_size = 0;
-  const Frame* const tile = GetTile(first_frame, tile_num, &num_tiles);
-  const uint8_t* const payload = GetFramePayload(mem_buf, tile, &payload_size);
+  const Frame* const fragment =
+      GetFragment(first_frame, fragment_num, &num_fragments);
+  const uint8_t* const payload =
+      GetFramePayload(mem_buf, fragment, &payload_size);
  if (payload == NULL) return 0;
+  assert(first_frame != NULL);

-  iter->frame_num_   = first_frame->frame_num_;
-  iter->num_frames_  = dmux->num_frames_;
-  iter->tile_num_    = tile_num;
-  iter->num_tiles_   = num_tiles;
-  iter->x_offset_    = tile->x_offset_;
-  iter->y_offset_    = tile->y_offset_;
-  iter->width_       = tile->width_;
-  iter->height_      = tile->height_;
-  iter->duration_    = tile->duration_;
-  iter->complete_    = tile->complete_;
-  iter->tile_.bytes_ = payload;
-  iter->tile_.size_  = payload_size;
-  // TODO(jzern): adjust offsets for 'TILE's embedded in 'FRM 's
+  iter->frame_num      = first_frame->frame_num_;
+  iter->num_frames     = dmux->num_frames_;
+  iter->fragment_num   = fragment_num;
+  iter->num_fragments  = num_fragments;
+  iter->x_offset       = fragment->x_offset_;
+  iter->y_offset       = fragment->y_offset_;
+  iter->width          = fragment->width_;
+  iter->height         = fragment->height_;
+  iter->has_alpha      = fragment->has_alpha_;
+  iter->duration       = fragment->duration_;
+  iter->dispose_method = fragment->dispose_method_;
+  iter->blend_method   = fragment->blend_method_;
+  iter->complete       = fragment->complete_;
+  iter->fragment.bytes = payload;
+  iter->fragment.size  = payload_size;
+  // TODO(jzern): adjust offsets for 'FRGM's embedded in 'ANMF's
  return 1;
 }

@ -779,6 +880,8 @@ static int SetFrame(int frame_num, WebPIterator* const iter) {
  if (frame_num == 0) frame_num = dmux->num_frames_;

  frame = GetFrame(dmux, frame_num);
+  if (frame == NULL) return 0;
+
  return SynthesizeFrame(dmux, frame, 1, iter);
 }

@ -792,22 +895,22 @@ int WebPDemuxGetFrame(const WebPDemuxer* dmux, int frame, WebPIterator* iter) {

 int WebPDemuxNextFrame(WebPIterator* iter) {
  if (iter == NULL) return 0;
-  return SetFrame(iter->frame_num_ + 1, iter);
+  return SetFrame(iter->frame_num + 1, iter);
 }

 int WebPDemuxPrevFrame(WebPIterator* iter) {
  if (iter == NULL) return 0;
-  if (iter->frame_num_ <= 1) return 0;
-  return SetFrame(iter->frame_num_ - 1, iter);
+  if (iter->frame_num <= 1) return 0;
+  return SetFrame(iter->frame_num - 1, iter);
 }

-int WebPDemuxSelectTile(WebPIterator* iter, int tile) {
-  if (iter != NULL && iter->private_ != NULL && tile > 0) {
+int WebPDemuxSelectFragment(WebPIterator* iter, int fragment_num) {
+  if (iter != NULL && iter->private_ != NULL && fragment_num > 0) {
    const WebPDemuxer* const dmux = (WebPDemuxer*)iter->private_;
-    const Frame* const frame = GetFrame(dmux, iter->frame_num_);
+    const Frame* const frame = GetFrame(dmux, iter->frame_num);
    if (frame == NULL) return 0;

-    return SynthesizeFrame(dmux, frame, tile, iter);
+    return SynthesizeFrame(dmux, frame, fragment_num, iter);
  }
  return 0;
 }
@ -856,10 +959,10 @@ static int SetChunk(const char fourcc[4], int chunk_num,
  if (chunk_num <= count) {
    const uint8_t* const mem_buf = dmux->mem_.buf_;
    const Chunk* const chunk = GetChunk(dmux, fourcc, chunk_num);
-    iter->chunk_.bytes_ = mem_buf + chunk->data_.offset_ + CHUNK_HEADER_SIZE;
-    iter->chunk_.size_  = chunk->data_.size_ - CHUNK_HEADER_SIZE;
-    iter->num_chunks_   = count;
-    iter->chunk_num_    = chunk_num;
+    iter->chunk.bytes = mem_buf + chunk->data_.offset_ + CHUNK_HEADER_SIZE;
+    iter->chunk.size  = chunk->data_.size_ - CHUNK_HEADER_SIZE;
+    iter->num_chunks  = count;
+    iter->chunk_num   = chunk_num;
    return 1;
  }
  return 0;
@ -878,17 +981,17 @@ int WebPDemuxGetChunk(const WebPDemuxer* dmux,
 int WebPDemuxNextChunk(WebPChunkIterator* iter) {
  if (iter != NULL) {
    const char* const fourcc =
-        (const char*)iter->chunk_.bytes_ - CHUNK_HEADER_SIZE;
-    return SetChunk(fourcc, iter->chunk_num_ + 1, iter);
+        (const char*)iter->chunk.bytes - CHUNK_HEADER_SIZE;
+    return SetChunk(fourcc, iter->chunk_num + 1, iter);
  }
  return 0;
 }

 int WebPDemuxPrevChunk(WebPChunkIterator* iter) {
-  if (iter != NULL && iter->chunk_num_ > 1) {
+  if (iter != NULL && iter->chunk_num > 1) {
    const char* const fourcc =
-        (const char*)iter->chunk_.bytes_ - CHUNK_HEADER_SIZE;
-    return SetChunk(fourcc, iter->chunk_num_ - 1, iter);
+        (const char*)iter->chunk.bytes - CHUNK_HEADER_SIZE;
+    return SetChunk(fourcc, iter->chunk_num - 1, iter);
  }
  return 0;
 }
@ -897,6 +1000,3 @@ void WebPDemuxReleaseChunkIterator(WebPChunkIterator* iter) {
  (void)iter;
 }

-#if defined(__cplusplus) || defined(c_plusplus)
-}  // extern "C"
-#endif
--- a/src/demux/libwebpdemux.pc.in
+++ b/src/demux/libwebpdemux.pc.in
@ -0,0 +1,11 @@
+prefix=@prefix@
+exec_prefix=@exec_prefix@
+libdir=@libdir@
+includedir=@includedir@
+
+Name: libwebpdemux
+Description: Library for parsing the WebP graphics format container
+Version: @PACKAGE_VERSION@
+Requires: libwebp >= 0.2.0
+Cflags: -I${includedir}
+Libs: -L${libdir} -lwebpdemux
--- a/src/dsp/Makefile.am
+++ b/src/dsp/Makefile.am
@ -1,26 +1,44 @@
 AM_CPPFLAGS = -I$(top_srcdir)/src
 noinst_LTLIBRARIES = libwebpdsp.la

-libwebpdsp_la_SOURCES =
-libwebpdsp_la_SOURCES += cpu.c
-libwebpdsp_la_SOURCES += dec.c
-libwebpdsp_la_SOURCES += dec_neon.c
-libwebpdsp_la_SOURCES += dec_sse2.c
-libwebpdsp_la_SOURCES += dsp.h
-libwebpdsp_la_SOURCES += enc.c
-libwebpdsp_la_SOURCES += enc_sse2.c
-libwebpdsp_la_SOURCES += lossless.c
-libwebpdsp_la_SOURCES += lossless.h
-libwebpdsp_la_SOURCES += upsampling.c
-libwebpdsp_la_SOURCES += upsampling_sse2.c
-libwebpdsp_la_SOURCES += yuv.c
-libwebpdsp_la_SOURCES += yuv.h
+if BUILD_LIBWEBPDECODER
+  noinst_LTLIBRARIES += libwebpdspdecode.la
+endif
+
+common_HEADERS = ../webp/types.h
+commondir = $(includedir)/webp
+
+COMMON_SOURCES =
+COMMON_SOURCES += cpu.c
+COMMON_SOURCES += dec.c
+COMMON_SOURCES += dec_neon.c
+COMMON_SOURCES += dec_sse2.c
+COMMON_SOURCES += dsp.h
+COMMON_SOURCES += lossless.c
+COMMON_SOURCES += lossless.h
+COMMON_SOURCES += upsampling.c
+COMMON_SOURCES += upsampling_neon.c
+COMMON_SOURCES += upsampling_sse2.c
+COMMON_SOURCES += yuv.c
+COMMON_SOURCES += yuv.h
+
+ENC_SOURCES =
+ENC_SOURCES += enc.c
+ENC_SOURCES += enc_neon.c
+ENC_SOURCES += enc_sse2.c
+
+libwebpdsp_la_SOURCES = $(COMMON_SOURCES) $(ENC_SOURCES)

-libwebpdspinclude_HEADERS = ../webp/types.h
 noinst_HEADERS =
 noinst_HEADERS += ../dec/decode_vp8.h
 noinst_HEADERS += ../webp/decode.h

 libwebpdsp_la_LDFLAGS = -lm
-libwebpdsp_la_CPPFLAGS = $(USE_EXPERIMENTAL_CODE)
-libwebpdspincludedir = $(includedir)/webp
+libwebpdsp_la_CPPFLAGS = $(USE_EXPERIMENTAL_CODE) $(USE_SWAP_16BIT_CSP)
+
+if BUILD_LIBWEBPDECODER
+  libwebpdspdecode_la_SOURCES = $(COMMON_SOURCES)
+
+  libwebpdspdecode_la_LDFLAGS = $(libwebpdsp_la_LDFLAGS)
+  libwebpdspdecode_la_CPPFLAGS = $(libwebpdsp_la_CPPFLAGS)
+endif
--- a/src/dsp/cpu.c
+++ b/src/dsp/cpu.c
@ -1,8 +1,10 @@
 // Copyright 2011 Google Inc. All Rights Reserved.
 //
-// This code is licensed under the same terms as WebM:
-//  Software License Agreement:  http://www.webmproject.org/license/software/
-//  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
 // -----------------------------------------------------------------------------
 //
 // CPU detection
@ -15,10 +17,6 @@
 #include <cpu-features.h>
 #endif

-#if defined(__cplusplus) || defined(c_plusplus)
-extern "C" {
-#endif
-
 //------------------------------------------------------------------------------
 // SSE2 detection.
 //
@ -80,6 +78,3 @@ VP8CPUInfo VP8GetCPUInfo = armCPUInfo;
 VP8CPUInfo VP8GetCPUInfo = NULL;
 #endif

-#if defined(__cplusplus) || defined(c_plusplus)
-}    // extern "C"
-#endif
--- a/src/dsp/dec.c
+++ b/src/dsp/dec.c
@ -1,8 +1,10 @@
 // Copyright 2010 Google Inc. All Rights Reserved.
 //
-// This code is licensed under the same terms as WebM:
-//  Software License Agreement:  http://www.webmproject.org/license/software/
-//  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
 // -----------------------------------------------------------------------------
 //
 // Speed-critical decoding functions.
@ -12,10 +14,6 @@
 #include "./dsp.h"
 #include "../dec/vp8i.h"

-#if defined(__cplusplus) || defined(c_plusplus)
-extern "C" {
-#endif
-
 //------------------------------------------------------------------------------
 // run-time tables (~4k)

@ -59,6 +57,14 @@ static WEBP_INLINE uint8_t clip_8b(int v) {
 #define STORE(x, y, v) \
  dst[x + y * BPS] = clip_8b(dst[x + y * BPS] + ((v) >> 3))

+#define STORE2(y, dc, d, c) do {    \
+  const int DC = (dc);              \
+  STORE(0, y, DC + (d));            \
+  STORE(1, y, DC + (c));            \
+  STORE(2, y, DC - (c));            \
+  STORE(3, y, DC - (d));            \
+} while (0)
+
 static const int kC1 = 20091 + (1 << 16);
 static const int kC2 = 35468;
 #define MUL(a, b) (((a) * (b)) >> 16)
@ -101,7 +107,21 @@ static void TransformOne(const int16_t* in, uint8_t* dst) {
    dst += BPS;
  }
 }
+
+// Simplified transform when only in[0], in[1] and in[4] are non-zero
+static void TransformAC3(const int16_t* in, uint8_t* dst) {
+  const int a = in[0] + 4;
+  const int c4 = MUL(in[4], kC2);
+  const int d4 = MUL(in[4], kC1);
+  const int c1 = MUL(in[1], kC2);
+  const int d1 = MUL(in[1], kC1);
+  STORE2(0, a + d4, d1, c1);
+  STORE2(1, a + c4, d1, c1);
+  STORE2(2, a - c4, d1, c1);
+  STORE2(3, a - d4, d1, c1);
+}
 #undef MUL
+#undef STORE2

 static void TransformTwo(const int16_t* in, uint8_t* dst, int do_two) {
  TransformOne(in, dst);
@ -426,11 +446,16 @@ static void HE8uv(uint8_t *dst) {    // horizontal
 }

 // helper for chroma-DC predictions
-static WEBP_INLINE void Put8x8uv(uint64_t v, uint8_t* dst) {
+static WEBP_INLINE void Put8x8uv(uint8_t value, uint8_t* dst) {
  int j;
+#ifndef WEBP_REFERENCE_IMPLEMENTATION
+  const uint64_t v = (uint64_t)value * 0x0101010101010101ULL;
  for (j = 0; j < 8; ++j) {
    *(uint64_t*)(dst + j * BPS) = v;
  }
+#else
+  for (j = 0; j < 8; ++j) memset(dst + j * BPS, value, 8);
+#endif
 }

 static void DC8uv(uint8_t *dst) {     // DC
@ -439,7 +464,7 @@ static void DC8uv(uint8_t *dst) {     // DC
  for (i = 0; i < 8; ++i) {
    dc0 += dst[i - BPS] + dst[-1 + i * BPS];
  }
-  Put8x8uv((uint64_t)((dc0 >> 4) * 0x0101010101010101ULL), dst);
+  Put8x8uv(dc0 >> 4, dst);
 }

 static void DC8uvNoLeft(uint8_t *dst) {   // DC with no left samples
@ -448,7 +473,7 @@ static void DC8uvNoLeft(uint8_t *dst) {   // DC with no left samples
  for (i = 0; i < 8; ++i) {
    dc0 += dst[i - BPS];
  }
-  Put8x8uv((uint64_t)((dc0 >> 3) * 0x0101010101010101ULL), dst);
+  Put8x8uv(dc0 >> 3, dst);
 }

 static void DC8uvNoTop(uint8_t *dst) {  // DC with no top samples
@ -457,11 +482,11 @@ static void DC8uvNoTop(uint8_t *dst) {  // DC with no top samples
  for (i = 0; i < 8; ++i) {
    dc0 += dst[-1 + i * BPS];
  }
-  Put8x8uv((uint64_t)((dc0 >> 3) * 0x0101010101010101ULL), dst);
+  Put8x8uv(dc0 >> 3, dst);
 }

 static void DC8uvNoTopLeft(uint8_t *dst) {    // DC with nothing
-  Put8x8uv(0x8080808080808080ULL, dst);
+  Put8x8uv(0x80, dst);
 }

 //------------------------------------------------------------------------------
@ -672,6 +697,7 @@ static void HFilter8i(uint8_t* u, uint8_t* v, int stride,
 //------------------------------------------------------------------------------

 VP8DecIdct2 VP8Transform;
+VP8DecIdct VP8TransformAC3;
 VP8DecIdct VP8TransformUV;
 VP8DecIdct VP8TransformDC;
 VP8DecIdct VP8TransformDCUV;
@ -699,6 +725,7 @@ void VP8DspInit(void) {
  VP8TransformUV = TransformUV;
  VP8TransformDC = TransformDC;
  VP8TransformDCUV = TransformDCUV;
+  VP8TransformAC3 = TransformAC3;

  VP8VFilter16 = VFilter16;
  VP8HFilter16 = HFilter16;
@ -727,6 +754,3 @@ void VP8DspInit(void) {
  }
 }

-#if defined(__cplusplus) || defined(c_plusplus)
-}    // extern "C"
-#endif
--- a/src/dsp/dec_neon.c
+++ b/src/dsp/dec_neon.c
@ -1,8 +1,10 @@
 // Copyright 2012 Google Inc. All Rights Reserved.
 //
-// This code is licensed under the same terms as WebM:
-//  Software License Agreement:  http://www.webmproject.org/license/software/
-//  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
 // -----------------------------------------------------------------------------
 //
 // ARM NEON version of dsp functions and loop filtering.
@ -16,11 +18,7 @@

 #include "../dec/vp8i.h"

-#if defined(__cplusplus) || defined(c_plusplus)
-extern "C" {
-#endif
-
-#define QRegs "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",                  \
+#define QRegs "q0", "q1", "q2", "q3",                                          \
              "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"

 #define FLIP_SIGN_BIT2(a, b, s)                                                \
@ -99,9 +97,9 @@ static void SimpleVFilter16NEON(uint8_t* p, int stride, int thresh) {
    "vld1.u8    {q1}, [%[p]], %[stride]        \n"  // p1
    "vld1.u8    {q2}, [%[p]], %[stride]        \n"  // p0
    "vld1.u8    {q3}, [%[p]], %[stride]        \n"  // q0
-    "vld1.u8    {q4}, [%[p]]                   \n"  // q1
+    "vld1.u8    {q12}, [%[p]]                  \n"  // q1

-    DO_FILTER2(q1, q2, q3, q4, %[thresh])
+    DO_FILTER2(q1, q2, q3, q12, %[thresh])

    "sub        %[p], %[p], %[stride], lsl #1  \n"  // p -= 2 * stride

@ -120,18 +118,18 @@ static void SimpleHFilter16NEON(uint8_t* p, int stride, int thresh) {
    "add        r5, r4, %[stride]              \n"  // base2 = base1 + stride

    LOAD8x4(d2, d3, d4, d5, [r4], [r5], r6)
-    LOAD8x4(d6, d7, d8, d9, [r4], [r5], r6)
-    "vswp       d3, d6                         \n"  // p1:q1 p0:q3
-    "vswp       d5, d8                         \n"  // q0:q2 q1:q4
-    "vswp       q2, q3                         \n"  // p1:q1 p0:q2 q0:q3 q1:q4
+    LOAD8x4(d24, d25, d26, d27, [r4], [r5], r6)
+    "vswp       d3, d24                        \n"  // p1:q1 p0:q3
+    "vswp       d5, d26                        \n"  // q0:q2 q1:q4
+    "vswp       q2, q12                        \n"  // p1:q1 p0:q2 q0:q3 q1:q4

-    DO_FILTER2(q1, q2, q3, q4, %[thresh])
+    DO_FILTER2(q1, q2, q12, q13, %[thresh])

    "sub        %[p], %[p], #1                 \n"  // p - 1

-    "vswp        d5, d6                        \n"
+    "vswp        d5, d24                       \n"
    STORE8x2(d4, d5, [%[p]], %[stride])
-    STORE8x2(d6, d7, [%[p]], %[stride])
+    STORE8x2(d24, d25, [%[p]], %[stride])

    : [p] "+r"(p)
    : [stride] "r"(stride), [thresh] "r"(thresh)
@ -155,7 +153,10 @@ static void SimpleHFilter16iNEON(uint8_t* p, int stride, int thresh) {
  }
 }

-static void TransformOneNEON(const int16_t *in, uint8_t *dst) {
+//-----------------------------------------------------------------------------
+// Inverse transforms (Paragraph 14.4)
+
+static void TransformOne(const int16_t* in, uint8_t* dst) {
  const int kBPS = BPS;
  const int16_t constants[] = {20091, 17734, 0, 0};
  /* kC1, kC2. Padded because vld1.16 loads 8 bytes
@ -304,26 +305,129 @@ static void TransformOneNEON(const int16_t *in, uint8_t *dst) {
  );
 }

-static void TransformTwoNEON(const int16_t* in, uint8_t* dst, int do_two) {
-  TransformOneNEON(in, dst);
+// Runs TransformOne on (in, dst). When 'do_two' is non-zero, also runs it on
+// the next 16 coefficients (in + 16), writing four bytes further (dst + 4).
+static void TransformTwo(const int16_t* in, uint8_t* dst, int do_two) {
+  TransformOne(in, dst);
  if (do_two) {
-    TransformOneNEON(in + 16, dst + 4);
+    TransformOne(in + 16, dst + 4);
  }
 }

+// DC-only inverse transform: adds the rounded DC term (in[0] + 4) >> 3 to
+// every pixel of the 4x4 block at 'dst' (row stride BPS) and stores the
+// result back with unsigned saturation. Only in[0] is read.
+static void TransformDC(const int16_t* in, uint8_t* dst) {
+  const int DC = (in[0] + 4) >> 3;
+  const int kBPS = BPS;
+  __asm__ volatile (
+    // broadcast DC to the eight 16-bit lanes of q1
+    "vdup.16         q1, %[DC]        \n"
+
+    // load four rows of four pixels: rows 0 and 2 into d0, rows 1 and 3 into d1
+    "vld1.32         d0[0], [%[dst]], %[kBPS]    \n"
+    "vld1.32         d1[0], [%[dst]], %[kBPS]    \n"
+    "vld1.32         d0[1], [%[dst]], %[kBPS]    \n"
+    "vld1.32         d1[1], [%[dst]], %[kBPS]    \n"
+
+    // rewind dst by the four rows just read (4 * kBPS)
+    "sub         %[dst], %[dst], %[kBPS], lsl #2 \n"
+
+    // add DC and convert to s16.
+    "vaddw.u8        q2, q1, d0                  \n"
+    "vaddw.u8        q3, q1, d1                  \n"
+    // convert back to u8 with saturation
+    "vqmovun.s16     d0,  q2                     \n"
+    "vqmovun.s16     d1,  q3                     \n"
+
+    // write the four rows back in the same order they were loaded
+    "vst1.32         d0[0], [%[dst]], %[kBPS]    \n"
+    "vst1.32         d1[0], [%[dst]], %[kBPS]    \n"
+    "vst1.32         d0[1], [%[dst]], %[kBPS]    \n"
+    "vst1.32         d1[1], [%[dst]]             \n"
+    // 'in' is only read by the C code above and never appears in the asm
+    // template, so it is not listed as an operand.
+    : [dst] "+r"(dst)  /* modified registers */
+    : [kBPS] "r"(kBPS),   /* constants */
+      [DC] "r"(DC)
+    : "memory", "q0", "q1", "q2", "q3"  /* clobbered */
+  );
+}
+
+// Inverse Walsh-Hadamard transform. Reads the 16 coefficients in[0..15] and
+// writes 16 results, one int16_t every kStep bytes starting at 'out'
+// (i.e. out[0], out[16], out[32], ...).
+static void TransformWHT(const int16_t* in, int16_t* out) {
+  const int kStep = 32;  // Post-increment applied to 'out' after each store,
+                         // in bytes: 32 bytes = 16 int16_t entries.
+  __asm__ volatile (
+    // part 1
+    // load data into q0, q1
+    "vld1.16         {q0, q1}, [%[in]]           \n"
+
+    "vaddl.s16       q2, d0, d3                  \n"  // a0 = in[0] + in[12]
+    "vaddl.s16       q3, d1, d2                  \n"  // a1 = in[4] + in[8]
+    "vsubl.s16       q10, d1, d2                 \n"  // a2 = in[4] - in[8]
+    "vsubl.s16       q11, d0, d3                 \n"  // a3 = in[0] - in[12]
+
+    "vadd.s32        q0, q2, q3                  \n"  // tmp[0] = a0 + a1
+    "vsub.s32        q2, q2, q3                  \n"  // tmp[8] = a0 - a1
+    "vadd.s32        q1, q11, q10                \n"  // tmp[4] = a3 + a2
+    "vsub.s32        q3, q11, q10                \n"  // tmp[12] = a3 - a2
+
+    // Transpose
+    // q0 = tmp[0, 4, 8, 12], q1 = tmp[2, 6, 10, 14]
+    // q2 = tmp[1, 5, 9, 13], q3 = tmp[3, 7, 11, 15]
+    "vswp            d1, d4                      \n"  // vtrn.64 q0, q2
+    "vswp            d3, d6                      \n"  // vtrn.64 q1, q3
+    "vtrn.32         q0, q1                      \n"
+    "vtrn.32         q2, q3                      \n"
+
+    // second pass: the rounding constant 3 is folded into the first term
+    // before the butterflies, ahead of the final >> 3
+    "vmov.s32        q10, #3                     \n"  // dc = 3
+    "vadd.s32        q0, q0, q10                 \n"  // dc = tmp[0] + 3
+    "vadd.s32        q12, q0, q3                 \n"  // a0 = dc + tmp[3]
+    "vadd.s32        q13, q1, q2                 \n"  // a1 = tmp[1] + tmp[2]
+    "vsub.s32        q8, q1, q2                  \n"  // a2 = tmp[1] - tmp[2]
+    "vsub.s32        q9, q0, q3                  \n"  // a3 = dc - tmp[3]
+
+    // combine, shift right by 3 and narrow each result to 16 bits (d0..d3)
+    "vadd.s32        q0, q12, q13                \n"
+    "vshrn.s32       d0, q0, #3                  \n"  // (a0 + a1) >> 3
+    "vadd.s32        q1, q9, q8                  \n"
+    "vshrn.s32       d1, q1, #3                  \n"  // (a3 + a2) >> 3
+    "vsub.s32        q2, q12, q13                \n"
+    "vshrn.s32       d2, q2, #3                  \n"  // (a0 - a1) >> 3
+    "vsub.s32        q3, q9, q8                  \n"
+    "vshrn.s32       d3, q3, #3                  \n"  // (a3 - a2) >> 3
+
+    // set the results to output
+    // (lane k of d0, d1, d2, d3 in turn, for k = 0..3; 'out' advances by
+    // kStep bytes after every store)
+    "vst1.16         d0[0], [%[out]], %[kStep]   \n"
+    "vst1.16         d1[0], [%[out]], %[kStep]   \n"
+    "vst1.16         d2[0], [%[out]], %[kStep]   \n"
+    "vst1.16         d3[0], [%[out]], %[kStep]   \n"
+    "vst1.16         d0[1], [%[out]], %[kStep]   \n"
+    "vst1.16         d1[1], [%[out]], %[kStep]   \n"
+    "vst1.16         d2[1], [%[out]], %[kStep]   \n"
+    "vst1.16         d3[1], [%[out]], %[kStep]   \n"
+    "vst1.16         d0[2], [%[out]], %[kStep]   \n"
+    "vst1.16         d1[2], [%[out]], %[kStep]   \n"
+    "vst1.16         d2[2], [%[out]], %[kStep]   \n"
+    "vst1.16         d3[2], [%[out]], %[kStep]   \n"
+    "vst1.16         d0[3], [%[out]], %[kStep]   \n"
+    "vst1.16         d1[3], [%[out]], %[kStep]   \n"
+    "vst1.16         d2[3], [%[out]], %[kStep]   \n"
+    "vst1.16         d3[3], [%[out]], %[kStep]   \n"
+
+    : [out] "+r"(out)  // modified registers
+    : [in] "r"(in), [kStep] "r"(kStep)  // constants
+    : "memory", "q0", "q1", "q2", "q3",
+      "q8", "q9", "q10", "q11", "q12", "q13"  // clobbered
+  );
+}
+
+#endif   // WEBP_USE_NEON
+
+//------------------------------------------------------------------------------
+// Entry point
+
 extern void VP8DspInitNEON(void);

+// Points the decoder's transform and simple-loop-filter function pointers at
+// the NEON versions above. Without WEBP_USE_NEON the body is empty and the
+// pointers are left untouched.
 void VP8DspInitNEON(void) {
-  VP8Transform = TransformTwoNEON;
+#if defined(WEBP_USE_NEON)
+  VP8Transform = TransformTwo;
+  VP8TransformAC3 = TransformOne;  // no special code here
+  VP8TransformDC = TransformDC;
+  VP8TransformWHT = TransformWHT;

  VP8SimpleVFilter16 = SimpleVFilter16NEON;
  VP8SimpleHFilter16 = SimpleHFilter16NEON;
  VP8SimpleVFilter16i = SimpleVFilter16iNEON;
  VP8SimpleHFilter16i = SimpleHFilter16iNEON;
+#endif   // WEBP_USE_NEON
 }

-#if defined(__cplusplus) || defined(c_plusplus)
-}    // extern "C"
-#endif
-
-#endif   // WEBP_USE_NEON
--- a/src/dsp/dec_sse2.c
+++ b/src/dsp/dec_sse2.c
@ -1,8 +1,10 @@
 // Copyright 2011 Google Inc. All Rights Reserved.
 //
-// This code is licensed under the same terms as WebM:
-//  Software License Agreement:  http://www.webmproject.org/license/software/
-//  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
 // -----------------------------------------------------------------------------
 //
 // SSE2 version of some decoding functions (idct, loop filtering).
@ -14,13 +16,13 @@

 #if defined(WEBP_USE_SSE2)

+// The 3-coeff sparse transform in SSE2 is not really faster than the plain-C
+// one it seems => disable it by default. Uncomment the following to enable:
+// #define USE_TRANSFORM_AC3
+
 #include <emmintrin.h>
 #include "../dec/vp8i.h"

-#if defined(__cplusplus) || defined(c_plusplus)
-extern "C" {
-#endif
-
 //------------------------------------------------------------------------------
 // Transforms (Paragraph 14.4)

@ -194,21 +196,21 @@ static void TransformSSE2(const int16_t* in, uint8_t* dst, int do_two) {

  // Add inverse transform to 'dst' and store.
  {
-    const __m128i zero = _mm_set1_epi16(0);
+    const __m128i zero = _mm_setzero_si128();
    // Load the reference(s).
    __m128i dst0, dst1, dst2, dst3;
    if (do_two) {
      // Load eight bytes/pixels per line.
-      dst0 = _mm_loadl_epi64((__m128i*)&dst[0 * BPS]);
-      dst1 = _mm_loadl_epi64((__m128i*)&dst[1 * BPS]);
-      dst2 = _mm_loadl_epi64((__m128i*)&dst[2 * BPS]);
-      dst3 = _mm_loadl_epi64((__m128i*)&dst[3 * BPS]);
+      dst0 = _mm_loadl_epi64((__m128i*)(dst + 0 * BPS));
+      dst1 = _mm_loadl_epi64((__m128i*)(dst + 1 * BPS));
+      dst2 = _mm_loadl_epi64((__m128i*)(dst + 2 * BPS));
+      dst3 = _mm_loadl_epi64((__m128i*)(dst + 3 * BPS));
    } else {
      // Load four bytes/pixels per line.
-      dst0 = _mm_cvtsi32_si128(*(int*)&dst[0 * BPS]);
-      dst1 = _mm_cvtsi32_si128(*(int*)&dst[1 * BPS]);
-      dst2 = _mm_cvtsi32_si128(*(int*)&dst[2 * BPS]);
-      dst3 = _mm_cvtsi32_si128(*(int*)&dst[3 * BPS]);
+      dst0 = _mm_cvtsi32_si128(*(int*)(dst + 0 * BPS));
+      dst1 = _mm_cvtsi32_si128(*(int*)(dst + 1 * BPS));
+      dst2 = _mm_cvtsi32_si128(*(int*)(dst + 2 * BPS));
+      dst3 = _mm_cvtsi32_si128(*(int*)(dst + 3 * BPS));
    }
    // Convert to 16b.
    dst0 = _mm_unpacklo_epi8(dst0, zero);
@ -228,20 +230,66 @@ static void TransformSSE2(const int16_t* in, uint8_t* dst, int do_two) {
    // Store the results.
    if (do_two) {
      // Store eight bytes/pixels per line.
-      _mm_storel_epi64((__m128i*)&dst[0 * BPS], dst0);
-      _mm_storel_epi64((__m128i*)&dst[1 * BPS], dst1);
-      _mm_storel_epi64((__m128i*)&dst[2 * BPS], dst2);
-      _mm_storel_epi64((__m128i*)&dst[3 * BPS], dst3);
+      _mm_storel_epi64((__m128i*)(dst + 0 * BPS), dst0);
+      _mm_storel_epi64((__m128i*)(dst + 1 * BPS), dst1);
+      _mm_storel_epi64((__m128i*)(dst + 2 * BPS), dst2);
+      _mm_storel_epi64((__m128i*)(dst + 3 * BPS), dst3);
    } else {
      // Store four bytes/pixels per line.
-      *((int32_t *)&dst[0 * BPS]) = _mm_cvtsi128_si32(dst0);
-      *((int32_t *)&dst[1 * BPS]) = _mm_cvtsi128_si32(dst1);
-      *((int32_t *)&dst[2 * BPS]) = _mm_cvtsi128_si32(dst2);
-      *((int32_t *)&dst[3 * BPS]) = _mm_cvtsi128_si32(dst3);
+      *(int*)(dst + 0 * BPS) = _mm_cvtsi128_si32(dst0);
+      *(int*)(dst + 1 * BPS) = _mm_cvtsi128_si32(dst1);
+      *(int*)(dst + 2 * BPS) = _mm_cvtsi128_si32(dst2);
+      *(int*)(dst + 3 * BPS) = _mm_cvtsi128_si32(dst3);
    }
  }
 }

+#if defined(USE_TRANSFORM_AC3)
+// 16.16 fixed-point multiply: keeps the integer part of a * b / 65536.
+#define MUL(a, b) (((a) * (b)) >> 16)
+// Sparse inverse transform that reads only in[0], in[1] and in[4], and adds
+// the result to the 4x4 block of pixels at 'dst' (row stride BPS).
+// Only compiled when USE_TRANSFORM_AC3 is defined.
+static void TransformAC3SSE2(const int16_t* in, uint8_t* dst) {
+  static const int kC1 = 20091 + (1 << 16);
+  static const int kC2 = 35468;
+  // DC term plus the rounding constant for the final >> 3, in every lane.
+  const __m128i A = _mm_set1_epi16(in[0] + 4);
+  // in[4] contributes one value per row (broadcast across the row).
+  const __m128i c4 = _mm_set1_epi16(MUL(in[4], kC2));
+  const __m128i d4 = _mm_set1_epi16(MUL(in[4], kC1));
+  // in[1] contributes one value per column.
+  const int c1 = MUL(in[1], kC2);
+  const int d1 = MUL(in[1], kC1);
+  // Lanes 0..3 hold {d1, c1, -c1, -d1}; the upper four lanes are zero.
+  const __m128i CD = _mm_set_epi16(0, 0, 0, 0, -d1, -c1, c1, d1);
+  const __m128i B = _mm_adds_epi16(A, CD);
+  // Rows 0..3: B + d4, B + c4, B - c4, B - d4.
+  const __m128i m0 = _mm_adds_epi16(B, d4);
+  const __m128i m1 = _mm_adds_epi16(B, c4);
+  const __m128i m2 = _mm_subs_epi16(B, c4);
+  const __m128i m3 = _mm_subs_epi16(B, d4);
+  const __m128i zero = _mm_setzero_si128();
+  // Load the source pixels.
+  // (four bytes per row, read through an int* cast)
+  __m128i dst0 = _mm_cvtsi32_si128(*(int*)(dst + 0 * BPS));
+  __m128i dst1 = _mm_cvtsi32_si128(*(int*)(dst + 1 * BPS));
+  __m128i dst2 = _mm_cvtsi32_si128(*(int*)(dst + 2 * BPS));
+  __m128i dst3 = _mm_cvtsi32_si128(*(int*)(dst + 3 * BPS));
+  // Convert to 16b.
+  dst0 = _mm_unpacklo_epi8(dst0, zero);
+  dst1 = _mm_unpacklo_epi8(dst1, zero);
+  dst2 = _mm_unpacklo_epi8(dst2, zero);
+  dst3 = _mm_unpacklo_epi8(dst3, zero);
+  // Add the inverse transform.
+  // (each row term is arithmetically shifted right by 3 first)
+  dst0 = _mm_adds_epi16(dst0, _mm_srai_epi16(m0, 3));
+  dst1 = _mm_adds_epi16(dst1, _mm_srai_epi16(m1, 3));
+  dst2 = _mm_adds_epi16(dst2, _mm_srai_epi16(m2, 3));
+  dst3 = _mm_adds_epi16(dst3, _mm_srai_epi16(m3, 3));
+  // Unsigned saturate to 8b.
+  dst0 = _mm_packus_epi16(dst0, dst0);
+  dst1 = _mm_packus_epi16(dst1, dst1);
+  dst2 = _mm_packus_epi16(dst2, dst2);
+  dst3 = _mm_packus_epi16(dst3, dst3);
+  // Store the results.
+  // (only the low 32 bits, i.e. four pixels, are written per row)
+  *(int*)(dst + 0 * BPS) = _mm_cvtsi128_si32(dst0);
+  *(int*)(dst + 1 * BPS) = _mm_cvtsi128_si32(dst1);
+  *(int*)(dst + 2 * BPS) = _mm_cvtsi128_si32(dst2);
+  *(int*)(dst + 3 * BPS) = _mm_cvtsi128_si32(dst3);
+}
+#undef MUL
+#endif   // USE_TRANSFORM_AC3
+
 //------------------------------------------------------------------------------
 // Loop Filter (Paragraph 15)

@ -278,14 +326,14 @@ static void TransformSSE2(const int16_t* in, uint8_t* dst, int do_two) {

+// Sets each byte of 'not_hev' to 0xff where abs(p1 - p0) <= hev_thresh and
+// abs(q1 - q0) <= hev_thresh, and to 0x00 elsewhere.
 #define GET_NOTHEV(p1, p0, q0, q1, hev_thresh, not_hev) {                      \
  const __m128i zero = _mm_setzero_si128();                                    \
-  const __m128i t1 = MM_ABS(p1, p0);                                           \
-  const __m128i t2 = MM_ABS(q1, q0);                                           \
+  const __m128i t_1 = MM_ABS(p1, p0);                                          \
+  const __m128i t_2 = MM_ABS(q1, q0);                                          \
                                                                               \
  const __m128i h = _mm_set1_epi8(hev_thresh);                                 \
-  const __m128i t3 = _mm_subs_epu8(t1, h);  /* abs(p1 - p0) - hev_tresh */     \
-  const __m128i t4 = _mm_subs_epu8(t2, h);  /* abs(q1 - q0) - hev_tresh */     \
+  const __m128i t_3 = _mm_subs_epu8(t_1, h);  /* abs(p1 - p0) - hev_thresh */  \
+  const __m128i t_4 = _mm_subs_epu8(t_2, h);  /* abs(q1 - q0) - hev_thresh */  \
                                                                               \
-  not_hev = _mm_or_si128(t3, t4);                                              \
+  not_hev = _mm_or_si128(t_3, t_4);                                            \
  not_hev = _mm_cmpeq_epi8(not_hev, zero); /* i.e. t_1 <= h && t_2 <= h */     \
 }

@ -314,13 +362,13 @@ static void TransformSSE2(const int16_t* in, uint8_t* dst, int do_two) {

 // Updates values of 2 pixels at MB edge during complex filtering.
 // Update operations:
-// q = q - a and p = p + a; where a = [(a_hi >> 7), (a_lo >> 7)]
+// q = q - delta and p = p + delta; where delta = [(a_hi >> 7), (a_lo >> 7)]
 #define UPDATE_2PIXELS(pi, qi, a_lo, a_hi) {                                   \
  const __m128i a_lo7 = _mm_srai_epi16(a_lo, 7);                               \
  const __m128i a_hi7 = _mm_srai_epi16(a_hi, 7);                               \
-  const __m128i a = _mm_packs_epi16(a_lo7, a_hi7);                             \
-  pi = _mm_adds_epi8(pi, a);                                                   \
-  qi = _mm_subs_epi8(qi, a);                                                   \
+  const __m128i delta = _mm_packs_epi16(a_lo7, a_hi7);                         \
+  pi = _mm_adds_epi8(pi, delta);                                               \
+  qi = _mm_subs_epi8(qi, delta);                                               \
 }

 static void NeedsFilter(const __m128i* p1, const __m128i* p0, const __m128i* q0,
@ -876,10 +924,19 @@ static void HFilter8iSSE2(uint8_t* u, uint8_t* v, int stride,
  Store16x4(u, v, stride, &p1, &p0, &q0, &q1);
 }

+#endif   // WEBP_USE_SSE2
+
+//------------------------------------------------------------------------------
+// Entry point
+
 extern void VP8DspInitSSE2(void);

 void VP8DspInitSSE2(void) {
+#if defined(WEBP_USE_SSE2)
  VP8Transform = TransformSSE2;
+#if defined(USE_TRANSFORM_AC3)
+  VP8TransformAC3 = TransformAC3SSE2;
+#endif

  VP8VFilter16 = VFilter16SSE2;
  VP8HFilter16 = HFilter16SSE2;
@ -894,10 +951,6 @@ void VP8DspInitSSE2(void) {
  VP8SimpleHFilter16 = SimpleHFilter16SSE2;
  VP8SimpleVFilter16i = SimpleVFilter16iSSE2;
  VP8SimpleHFilter16i = SimpleHFilter16iSSE2;
+#endif   // WEBP_USE_SSE2
 }

-#if defined(__cplusplus) || defined(c_plusplus)
-}    // extern "C"
-#endif
-
-#endif   // WEBP_USE_SSE2
--- a/src/dsp/dsp.h
+++ b/src/dsp/dsp.h
@ -1,8 +1,10 @@
 // Copyright 2011 Google Inc. All Rights Reserved.
 //
-// This code is licensed under the same terms as WebM:
-//  Software License Agreement:  http://www.webmproject.org/license/software/
-//  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
 // -----------------------------------------------------------------------------
 //
 //   Speed-critical functions.
@ -14,14 +16,15 @@

 #include "../webp/types.h"

-#if defined(__cplusplus) || defined(c_plusplus)
+#ifdef __cplusplus
 extern "C" {
 #endif

 //------------------------------------------------------------------------------
 // CPU detection

-#if defined(_MSC_VER) && (defined(_M_X64) || defined(_M_IX86))
+#if defined(_MSC_VER) && _MSC_VER > 1310 && \
+    (defined(_M_X64) || defined(_M_IX86))
 #define WEBP_MSC_SSE2  // Visual C++ SSE2 targets
 #endif

@ -49,8 +52,6 @@ extern VP8CPUInfo VP8GetCPUInfo;
 //------------------------------------------------------------------------------
 // Encoding

-int VP8GetAlpha(const int histo[]);
-
 // Transforms
 // VP8Idct: Does one of two inverse transforms. If do_two is set, the transforms
 //          will be done for (ref, in, dst) and (ref + 4, in + 16, dst + 4).
@ -85,10 +86,16 @@ typedef int (*VP8QuantizeBlock)(int16_t in[16], int16_t out[16],
                                int n, const struct VP8Matrix* const mtx);
 extern VP8QuantizeBlock VP8EncQuantizeBlock;

-// Compute susceptibility based on DCT-coeff histograms:
-// the higher, the "easier" the macroblock is to compress.
-typedef int (*VP8CHisto)(const uint8_t* ref, const uint8_t* pred,
-                         int start_block, int end_block);
+// specific to 2nd transform:
+typedef int (*VP8QuantizeBlockWHT)(int16_t in[16], int16_t out[16],
+                                   const struct VP8Matrix* const mtx);
+extern VP8QuantizeBlockWHT VP8EncQuantizeBlockWHT;
+
+// Collect histogram for susceptibility calculation and accumulate in histo[].
+struct VP8Histogram;
+typedef void (*VP8CHisto)(const uint8_t* ref, const uint8_t* pred,
+                          int start_block, int end_block,
+                          struct VP8Histogram* const histo);
 extern const int VP8DspScan[16 + 4 + 4];
 extern VP8CHisto VP8CollectHistogram;

@ -101,10 +108,11 @@ typedef void (*VP8DecIdct)(const int16_t* coeffs, uint8_t* dst);
 // when doing two transforms, coeffs is actually int16_t[2][16].
 typedef void (*VP8DecIdct2)(const int16_t* coeffs, uint8_t* dst, int do_two);
 extern VP8DecIdct2 VP8Transform;
+extern VP8DecIdct VP8TransformAC3;
 extern VP8DecIdct VP8TransformUV;
 extern VP8DecIdct VP8TransformDC;
 extern VP8DecIdct VP8TransformDCUV;
-extern void (*VP8TransformWHT)(const int16_t* in, int16_t* out);
+extern VP8WHT VP8TransformWHT;

 // *dst is the destination block, with stride BPS. Boundary samples are
 // assumed accessible when needed.
@ -145,6 +153,8 @@ void VP8DspInit(void);

 #define FANCY_UPSAMPLING   // undefined to remove fancy upsampling support

+// Convert a pair of y/u/v lines together to the output rgb/a colorspace.
+// bottom_y can be NULL if only one line of output is needed (at top/bottom).
 typedef void (*WebPUpsampleLinePairFunc)(
    const uint8_t* top_y, const uint8_t* bottom_y,
    const uint8_t* top_u, const uint8_t* top_v,
@ -159,6 +169,9 @@ extern WebPUpsampleLinePairFunc WebPUpsamplers[/* MODE_LAST */];
 // Initializes SSE2 version of the fancy upsamplers.
 void WebPInitUpsamplersSSE2(void);

+// NEON version
+void WebPInitUpsamplersNEON(void);
+
 #endif    // FANCY_UPSAMPLING

 // Point-sampling methods.
@ -200,10 +213,11 @@ extern void (*WebPApplyAlphaMultiply4444)(
 void WebPInitPremultiply(void);

 void WebPInitPremultiplySSE2(void);   // should not be called directly.
+void WebPInitPremultiplyNEON(void);

 //------------------------------------------------------------------------------

-#if defined(__cplusplus) || defined(c_plusplus)
+#ifdef __cplusplus
 }    // extern "C"
 #endif

--- a/src/dsp/enc.c
+++ b/src/dsp/enc.c
@ -1,47 +1,34 @@
 // Copyright 2011 Google Inc. All Rights Reserved.
 //
-// This code is licensed under the same terms as WebM:
-//  Software License Agreement:  http://www.webmproject.org/license/software/
-//  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
 // -----------------------------------------------------------------------------
 //
 // Speed-critical encoding functions.
 //
 // Author: Skal (pascal.massimino@gmail.com)

+#include <assert.h>
 #include <stdlib.h>  // for abs()
+
 #include "./dsp.h"
 #include "../enc/vp8enci.h"

-#if defined(__cplusplus) || defined(c_plusplus)
-extern "C" {
-#endif
+// Clamps 'v' to the [0, 255] range of an 8-bit pixel.
+static WEBP_INLINE uint8_t clip_8b(int v) {
+  if ((v & ~0xff) == 0) return (uint8_t)v;  // no bit set outside the low byte
+  return (v < 0) ? 0 : 255;
+}
+
+// Returns 'v' limited from above by 'max' (no lower bound is applied).
+static WEBP_INLINE int clip_max(int v, int max) {
+  if (v > max) return max;
+  return v;
+}

 //------------------------------------------------------------------------------
 // Compute susceptibility based on DCT-coeff histograms:
 // the higher, the "easier" the macroblock is to compress.

-static int ClipAlpha(int alpha) {
-  return alpha < 0 ? 0 : alpha > 255 ? 255 : alpha;
-}
-
-int VP8GetAlpha(const int histo[MAX_COEFF_THRESH + 1]) {
-  int num = 0, den = 0, val = 0;
-  int k;
-  int alpha;
-  // note: changing this loop to avoid the numerous "k + 1" slows things down.
-  for (k = 0; k < MAX_COEFF_THRESH; ++k) {
-    if (histo[k + 1]) {
-      val += histo[k + 1];
-      num += val * (k + 1);
-      den += (k + 1) * (k + 1);
-    }
-  }
-  // we scale the value to a usable [0..255] range
-  alpha = den ? 10 * num / den - 5 : 0;
-  return ClipAlpha(alpha);
-}
-
 const int VP8DspScan[16 + 4 + 4] = {
  // Luma
  0 +  0 * BPS,  4 +  0 * BPS, 8 +  0 * BPS, 12 +  0 * BPS,
@ -53,27 +40,23 @@ const int VP8DspScan[16 + 4 + 4] = {
  8 + 0 * BPS,  12 + 0 * BPS, 8 + 4 * BPS, 12 + 4 * BPS     // V
 };

-static int CollectHistogram(const uint8_t* ref, const uint8_t* pred,
-                            int start_block, int end_block) {
-  int histo[MAX_COEFF_THRESH + 1] = { 0 };
-  int16_t out[16];
-  int j, k;
+// For each block j in [start_block, end_block), forward-transforms the block
+// at offset VP8DspScan[j] of 'ref' against 'pred' and accumulates the
+// magnitudes of the 16 coefficients into histo->distribution[].
+// Counts are added to 'histo'; it is not reset here.
+static void CollectHistogram(const uint8_t* ref, const uint8_t* pred,
+                             int start_block, int end_block,
+                             VP8Histogram* const histo) {
+  int j;
  for (j = start_block; j < end_block; ++j) {
+    int k;
+    int16_t out[16];
+
    VP8FTransform(ref + VP8DspScan[j], pred + VP8DspScan[j], out);

-    // Convert coefficients to bin (within out[]).
+    // Convert coefficients to bin.
+    // bin = min(abs(coeff) >> 3, MAX_COEFF_THRESH)
    for (k = 0; k < 16; ++k) {
-      const int v = abs(out[k]) >> 2;
-      out[k] = (v > MAX_COEFF_THRESH) ? MAX_COEFF_THRESH : v;
-    }
-
-    // Use bin to update histogram.
-    for (k = 0; k < 16; ++k) {
-      histo[out[k]]++;
+      const int v = abs(out[k]) >> 3;  // TODO(skal): add rounding?
+      const int clipped_value = clip_max(v, MAX_COEFF_THRESH);
+      histo->distribution[clipped_value]++;
    }
  }
-
-  return VP8GetAlpha(histo);
 }

 //------------------------------------------------------------------------------
@ -89,15 +72,12 @@ static void InitTables(void) {
  if (!tables_ok) {
    int i;
    for (i = -255; i <= 255 + 255; ++i) {
-      clip1[255 + i] = (i < 0) ? 0 : (i > 255) ? 255 : i;
+      clip1[255 + i] = clip_8b(i);
    }
    tables_ok = 1;
  }
 }

-static WEBP_INLINE uint8_t clip_8b(int v) {
-  return (!(v & ~0xff)) ? v : v < 0 ? 0 : 255;
-}

 //------------------------------------------------------------------------------
 // Transforms (Paragraph 14.4)
@ -154,25 +134,25 @@ static void FTransform(const uint8_t* src, const uint8_t* ref, int16_t* out) {
  int i;
  int tmp[16];
  for (i = 0; i < 4; ++i, src += BPS, ref += BPS) {
-    const int d0 = src[0] - ref[0];
+    const int d0 = src[0] - ref[0];   // 9bit dynamic range ([-255,255])
    const int d1 = src[1] - ref[1];
    const int d2 = src[2] - ref[2];
    const int d3 = src[3] - ref[3];
-    const int a0 = (d0 + d3) << 3;
-    const int a1 = (d1 + d2) << 3;
-    const int a2 = (d1 - d2) << 3;
-    const int a3 = (d0 - d3) << 3;
-    tmp[0 + i * 4] = (a0 + a1);
-    tmp[1 + i * 4] = (a2 * 2217 + a3 * 5352 + 14500) >> 12;
-    tmp[2 + i * 4] = (a0 - a1);
-    tmp[3 + i * 4] = (a3 * 2217 - a2 * 5352 +  7500) >> 12;
+    const int a0 = (d0 + d3);         // 10b                      [-510,510]
+    const int a1 = (d1 + d2);
+    const int a2 = (d1 - d2);
+    const int a3 = (d0 - d3);
+    tmp[0 + i * 4] = (a0 + a1) * 8;   // 14b                      [-8160,8160]
+    tmp[1 + i * 4] = (a2 * 2217 + a3 * 5352 + 1812) >> 9;      // [-7536,7542]
+    tmp[2 + i * 4] = (a0 - a1) * 8;
+    tmp[3 + i * 4] = (a3 * 2217 - a2 * 5352 +  937) >> 9;
  }
  for (i = 0; i < 4; ++i) {
-    const int a0 = (tmp[0 + i] + tmp[12 + i]);
+    const int a0 = (tmp[0 + i] + tmp[12 + i]);  // 15b
    const int a1 = (tmp[4 + i] + tmp[ 8 + i]);
    const int a2 = (tmp[4 + i] - tmp[ 8 + i]);
    const int a3 = (tmp[0 + i] - tmp[12 + i]);
-    out[0 + i] = (a0 + a1 + 7) >> 4;
+    out[0 + i] = (a0 + a1 + 7) >> 4;            // 12b
    out[4 + i] = ((a2 * 2217 + a3 * 5352 + 12000) >> 16) + (a3 != 0);
    out[8 + i] = (a0 - a1 + 7) >> 4;
    out[12+ i] = ((a3 * 2217 - a2 * 5352 + 51000) >> 16);
@ -207,31 +187,32 @@ static void ITransformWHT(const int16_t* in, int16_t* out) {
 }

+// Forward Walsh-Hadamard transform of 16 values read from in[] at a stride
+// of 16 int16_t (in[0], in[16], ..., in[240]); writes out[0..15].
 static void FTransformWHT(const int16_t* in, int16_t* out) {
-  int tmp[16];
+  // input is 12b signed
+  int32_t tmp[16];
  int i;
+  // first pass: butterflies over each group of four inputs (in advances by 64)
  for (i = 0; i < 4; ++i, in += 64) {
-    const int a0 = (in[0 * 16] + in[2 * 16]) << 2;
-    const int a1 = (in[1 * 16] + in[3 * 16]) << 2;
-    const int a2 = (in[1 * 16] - in[3 * 16]) << 2;
-    const int a3 = (in[0 * 16] - in[2 * 16]) << 2;
-    tmp[0 + i * 4] = (a0 + a1) + (a0 != 0);
+    const int a0 = (in[0 * 16] + in[2 * 16]);  // 13b
+    const int a1 = (in[1 * 16] + in[3 * 16]);
+    const int a2 = (in[1 * 16] - in[3 * 16]);
+    const int a3 = (in[0 * 16] - in[2 * 16]);
+    tmp[0 + i * 4] = a0 + a1;   // 14b
    tmp[1 + i * 4] = a3 + a2;
    tmp[2 + i * 4] = a3 - a2;
    tmp[3 + i * 4] = a0 - a1;
  }
+  // second pass: same butterflies across the four groups, then halve
  for (i = 0; i < 4; ++i) {
-    const int a0 = (tmp[0 + i] + tmp[8 + i]);
+    const int a0 = (tmp[0 + i] + tmp[8 + i]);  // 15b
    const int a1 = (tmp[4 + i] + tmp[12+ i]);
    const int a2 = (tmp[4 + i] - tmp[12+ i]);
    const int a3 = (tmp[0 + i] - tmp[8 + i]);
-    const int b0 = a0 + a1;
+    const int b0 = a0 + a1;    // 16b
    const int b1 = a3 + a2;
    const int b2 = a3 - a2;
    const int b3 = a0 - a1;
-    out[ 0 + i] = (b0 + (b0 > 0) + 3) >> 3;
-    out[ 4 + i] = (b1 + (b1 > 0) + 3) >> 3;
-    out[ 8 + i] = (b2 + (b2 > 0) + 3) >> 3;
-    out[12 + i] = (b3 + (b3 > 0) + 3) >> 3;
+    out[ 0 + i] = b0 >> 1;     // 15b
+    out[ 4 + i] = b1 >> 1;
+    out[ 8 + i] = b2 >> 1;
+    out[12 + i] = b3 >> 1;
  }
 }

@ -589,30 +570,30 @@ static int TTransform(const uint8_t* in, const uint16_t* w) {
  int i;
  // horizontal pass
  for (i = 0; i < 4; ++i, in += BPS) {
-    const int a0 = (in[0] + in[2]) << 2;
-    const int a1 = (in[1] + in[3]) << 2;
-    const int a2 = (in[1] - in[3]) << 2;
-    const int a3 = (in[0] - in[2]) << 2;
-    tmp[0 + i * 4] = a0 + a1 + (a0 != 0);
+    const int a0 = in[0] + in[2];
+    const int a1 = in[1] + in[3];
+    const int a2 = in[1] - in[3];
+    const int a3 = in[0] - in[2];
+    tmp[0 + i * 4] = a0 + a1;
    tmp[1 + i * 4] = a3 + a2;
    tmp[2 + i * 4] = a3 - a2;
    tmp[3 + i * 4] = a0 - a1;
  }
  // vertical pass
  for (i = 0; i < 4; ++i, ++w) {
-    const int a0 = (tmp[0 + i] + tmp[8 + i]);
-    const int a1 = (tmp[4 + i] + tmp[12+ i]);
-    const int a2 = (tmp[4 + i] - tmp[12+ i]);
-    const int a3 = (tmp[0 + i] - tmp[8 + i]);
+    const int a0 = tmp[0 + i] + tmp[8 + i];
+    const int a1 = tmp[4 + i] + tmp[12+ i];
+    const int a2 = tmp[4 + i] - tmp[12+ i];
+    const int a3 = tmp[0 + i] - tmp[8 + i];
    const int b0 = a0 + a1;
    const int b1 = a3 + a2;
    const int b2 = a3 - a2;
    const int b3 = a0 - a1;
-    // abs((b + (b<0) + 3) >> 3) = (abs(b) + 3) >> 3
-    sum += w[ 0] * ((abs(b0) + 3) >> 3);
-    sum += w[ 4] * ((abs(b1) + 3) >> 3);
-    sum += w[ 8] * ((abs(b2) + 3) >> 3);
-    sum += w[12] * ((abs(b3) + 3) >> 3);
+
+    sum += w[ 0] * abs(b0);
+    sum += w[ 4] * abs(b1);
+    sum += w[ 8] * abs(b2);
+    sum += w[12] * abs(b3);
  }
  return sum;
 }
@ -621,7 +602,7 @@ static int Disto4x4(const uint8_t* const a, const uint8_t* const b,
                    const uint16_t* const w) {
  const int sum1 = TTransform(a, w);
  const int sum2 = TTransform(b, w);
-  return (abs(sum2 - sum1) + 8) >> 4;
+  return abs(sum2 - sum1) >> 5;
 }

 static int Disto16x16(const uint8_t* const a, const uint8_t* const b,
@ -651,13 +632,38 @@ static int QuantizeBlock(int16_t in[16], int16_t out[16],
  for (; n < 16; ++n) {
    const int j = kZigzag[n];
    const int sign = (in[j] < 0);
-    int coeff = (sign ? -in[j] : in[j]) + mtx->sharpen_[j];
-    if (coeff > 2047) coeff = 2047;
+    const int coeff = (sign ? -in[j] : in[j]) + mtx->sharpen_[j];
    if (coeff > mtx->zthresh_[j]) {
      const int Q = mtx->q_[j];
      const int iQ = mtx->iq_[j];
      const int B = mtx->bias_[j];
      out[n] = QUANTDIV(coeff, iQ, B);
+      if (out[n] > MAX_LEVEL) out[n] = MAX_LEVEL;
+      if (sign) out[n] = -out[n];
+      in[j] = out[n] * Q;
+      if (out[n]) last = n;
+    } else {
+      out[n] = 0;
+      in[j] = 0;
+    }
+  }
+  return (last >= 0);
+}
+
+static int QuantizeBlockWHT(int16_t in[16], int16_t out[16],
+                            const VP8Matrix* const mtx) {
+  int n, last = -1;
+  for (n = 0; n < 16; ++n) {
+    const int j = kZigzag[n];
+    const int sign = (in[j] < 0);
+    const int coeff = sign ? -in[j] : in[j];
+    assert(mtx->sharpen_[j] == 0);
+    if (coeff > mtx->zthresh_[j]) {
+      const int Q = mtx->q_[j];
+      const int iQ = mtx->iq_[j];
+      const int B = mtx->bias_[j];
+      out[n] = QUANTDIV(coeff, iQ, B);
+      if (out[n] > MAX_LEVEL) out[n] = MAX_LEVEL;
      if (sign) out[n] = -out[n];
      in[j] = out[n] * Q;
      if (out[n]) last = n;
@ -703,9 +709,11 @@ VP8Metric VP8SSE4x4;
 VP8WMetric VP8TDisto4x4;
 VP8WMetric VP8TDisto16x16;
 VP8QuantizeBlock VP8EncQuantizeBlock;
+VP8QuantizeBlockWHT VP8EncQuantizeBlockWHT;
 VP8BlockCopy VP8Copy4x4;

 extern void VP8EncDspInitSSE2(void);
+extern void VP8EncDspInitNEON(void);

 void VP8EncDspInit(void) {
  InitTables();
@ -726,6 +734,7 @@ void VP8EncDspInit(void) {
  VP8TDisto4x4 = Disto4x4;
  VP8TDisto16x16 = Disto16x16;
  VP8EncQuantizeBlock = QuantizeBlock;
+  VP8EncQuantizeBlockWHT = QuantizeBlockWHT;
  VP8Copy4x4 = Copy4x4;

  // If defined, use CPUInfo() to overwrite some pointers with faster versions.
@ -734,10 +743,11 @@ void VP8EncDspInit(void) {
    if (VP8GetCPUInfo(kSSE2)) {
      VP8EncDspInitSSE2();
    }
+#elif defined(WEBP_USE_NEON)
+    if (VP8GetCPUInfo(kNEON)) {
+      VP8EncDspInitNEON();
+    }
 #endif
  }
 }

-#if defined(__cplusplus) || defined(c_plusplus)
-}    // extern "C"
-#endif
--- a/src/dsp/enc_neon.c
+++ b/src/dsp/enc_neon.c
@ -0,0 +1,632 @@
+// Copyright 2012 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// ARM NEON version of speed-critical encoding functions.
+//
+// adapted from libvpx (http://www.webmproject.org/code/)
+
+#include "./dsp.h"
+
+#if defined(WEBP_USE_NEON)
+
+#include "../enc/vp8enci.h"
+
+//------------------------------------------------------------------------------
+// Transforms (Paragraph 14.4)
+
+// Inverse transform.
+// This code is pretty much the same as TransformOneNEON in the decoder, except
+// that the 4x4 prediction is read from 'ref' while the result is stored to
+// 'dst'. See the comments there for algorithmic explanations.
+//   ref: four rows of 4 prediction bytes, BPS bytes apart.
+//   in:  the 16 coefficients to inverse-transform.
+//   dst: receives four rows of 4 bytes (prediction + residual, saturated to
+//        8 bits), BPS bytes apart.
+static void ITransformOne(const uint8_t* ref,
+                          const int16_t* in, uint8_t* dst) {
+  const int kBPS = BPS;
+  const int16_t kC1C2[] = { 20091, 17734, 0, 0 };  // kC1 / (kC2 >> 1) / 0 / 0
+
+  __asm__ volatile (
+    "vld1.16         {q1, q2}, [%[in]]           \n"
+    "vld1.16         {d0}, [%[kC1C2]]            \n"
+
+    // d2: in[0]
+    // d3: in[8]
+    // d4: in[4]
+    // d5: in[12]
+    "vswp            d3, d4                      \n"
+
+    // q8 = {in[4], in[12]} * kC1 * 2 >> 16
+    // q9 = {in[4], in[12]} * kC2 >> 16
+    "vqdmulh.s16     q8, q2, d0[0]               \n"
+    "vqdmulh.s16     q9, q2, d0[1]               \n"
+
+    // d22 = a = in[0] + in[8]
+    // d23 = b = in[0] - in[8]
+    "vqadd.s16       d22, d2, d3                 \n"
+    "vqsub.s16       d23, d2, d3                 \n"
+
+    //  q8 = in[4]/[12] * kC1 >> 16
+    "vshr.s16        q8, q8, #1                  \n"
+
+    // Add {in[4], in[12]} back after the multiplication.
+    "vqadd.s16       q8, q2, q8                  \n"
+
+    // d20 = c = in[4]*kC2 - in[12]*kC1
+    // d21 = d = in[4]*kC1 + in[12]*kC2
+    "vqsub.s16       d20, d18, d17               \n"
+    "vqadd.s16       d21, d19, d16               \n"
+
+    // d2 = tmp[0] = a + d
+    // d3 = tmp[1] = b + c
+    // d4 = tmp[2] = b - c
+    // d5 = tmp[3] = a - d
+    "vqadd.s16       d2, d22, d21                \n"
+    "vqadd.s16       d3, d23, d20                \n"
+    "vqsub.s16       d4, d23, d20                \n"
+    "vqsub.s16       d5, d22, d21                \n"
+
+    // Transpose the intermediate 4x4 block before the second pass.
+    "vzip.16         q1, q2                      \n"
+    "vzip.16         q1, q2                      \n"
+
+    "vswp            d3, d4                      \n"
+
+    // q8 = {tmp[4], tmp[12]} * kC1 * 2 >> 16
+    // q9 = {tmp[4], tmp[12]} * kC2 >> 16
+    "vqdmulh.s16     q8, q2, d0[0]               \n"
+    "vqdmulh.s16     q9, q2, d0[1]               \n"
+
+    // d22 = a = tmp[0] + tmp[8]
+    // d23 = b = tmp[0] - tmp[8]
+    "vqadd.s16       d22, d2, d3                 \n"
+    "vqsub.s16       d23, d2, d3                 \n"
+
+    //  q8 = tmp[4]/[12] * kC1 >> 16, then add {tmp[4], tmp[12]} back.
+    "vshr.s16        q8, q8, #1                  \n"
+    "vqadd.s16       q8, q2, q8                  \n"
+
+    // d20 = c = tmp[4]*kC2 - tmp[12]*kC1
+    // d21 = d = tmp[4]*kC1 + tmp[12]*kC2
+    "vqsub.s16       d20, d18, d17               \n"
+    "vqadd.s16       d21, d19, d16               \n"
+
+    // d2 = a + d
+    // d3 = b + c
+    // d4 = b - c
+    // d5 = a - d
+    "vqadd.s16       d2, d22, d21                \n"
+    "vqadd.s16       d3, d23, d20                \n"
+    "vqsub.s16       d4, d23, d20                \n"
+    "vqsub.s16       d5, d22, d21                \n"
+
+    // Load the four rows of prediction into d6/d7 (i.e. q3).
+    "vld1.32         d6[0], [%[ref]], %[kBPS]    \n"
+    "vld1.32         d6[1], [%[ref]], %[kBPS]    \n"
+    "vld1.32         d7[0], [%[ref]], %[kBPS]    \n"
+    "vld1.32         d7[1], [%[ref]], %[kBPS]    \n"
+
+    // Rewind 'ref' by the four post-increments above.
+    "sub         %[ref], %[ref], %[kBPS], lsl #2 \n"
+
+    // (val) + 4 >> 3
+    "vrshr.s16       d2, d2, #3                  \n"
+    "vrshr.s16       d3, d3, #3                  \n"
+    "vrshr.s16       d4, d4, #3                  \n"
+    "vrshr.s16       d5, d5, #3                  \n"
+
+    "vzip.16         q1, q2                      \n"
+    "vzip.16         q1, q2                      \n"
+
+    // Must accumulate before saturating
+    "vmovl.u8        q8, d6                      \n"
+    "vmovl.u8        q9, d7                      \n"
+
+    "vqadd.s16       q1, q1, q8                  \n"
+    "vqadd.s16       q2, q2, q9                  \n"
+
+    "vqmovun.s16     d0, q1                      \n"
+    "vqmovun.s16     d1, q2                      \n"
+
+    "vst1.32         d0[0], [%[dst]], %[kBPS]    \n"
+    "vst1.32         d0[1], [%[dst]], %[kBPS]    \n"
+    "vst1.32         d1[0], [%[dst]], %[kBPS]    \n"
+    "vst1.32         d1[1], [%[dst]]             \n"
+
+    // 'ref' is written by the post-incremented loads, so it must be declared
+    // as a read-write operand and not as a plain input.
+    : [in] "+r"(in), [dst] "+r"(dst),
+      [ref] "+r"(ref)                              // modified registers
+    : [kBPS] "r"(kBPS), [kC1C2] "r"(kC1C2)         // constants
+    // q3 (= d6/d7) holds the prediction rows and must be listed too.
+    : "memory", "q0", "q1", "q2", "q3",
+      "q8", "q9", "q10", "q11"                     // clobbered
+  );
+}
+
+// Inverse-transforms one 4x4 block, or two horizontally adjacent ones when
+// 'do_two' is non-zero. The second block uses 'ref' and 'dst' shifted by
+// 4 bytes and the next 16 coefficients of 'in'.
+static void ITransform(const uint8_t* ref,
+                       const int16_t* in, uint8_t* dst, int do_two) {
+  const int num_blocks = do_two ? 2 : 1;
+  int n;
+  for (n = 0; n < num_blocks; ++n) {
+    ITransformOne(ref + 4 * n, in + 16 * n, dst + 4 * n);
+  }
+}
+
+// Inverse WHT of the 16 coefficients stored contiguously in 'in'.
+// The 16 results are not stored contiguously: each one is written 16 int16_t
+// after the previous one (out[0], out[16], out[32], ...).
+// Same code as dec_neon.c
+static void ITransformWHT(const int16_t* in, int16_t* out) {
+  const int kStep = 32;  // Byte stride between two stores (16 int16_t): each
+                         // vst1.16 below post-increments 'out' by kStep.
+  __asm__ volatile (
+    // part 1
+    // load data into q0, q1
+    "vld1.16         {q0, q1}, [%[in]]           \n"
+
+    // Widen to 32 bits while computing the first butterflies.
+    "vaddl.s16       q2, d0, d3                  \n" // a0 = in[0] + in[12]
+    "vaddl.s16       q3, d1, d2                  \n" // a1 = in[4] + in[8]
+    "vsubl.s16       q4, d1, d2                  \n" // a2 = in[4] - in[8]
+    "vsubl.s16       q5, d0, d3                  \n" // a3 = in[0] - in[12]
+
+    "vadd.s32        q0, q2, q3                  \n" // tmp[0] = a0 + a1
+    "vsub.s32        q2, q2, q3                  \n" // tmp[8] = a0 - a1
+    "vadd.s32        q1, q5, q4                  \n" // tmp[4] = a3 + a2
+    "vsub.s32        q3, q5, q4                  \n" // tmp[12] = a3 - a2
+
+    // Transpose
+    // q0 = tmp[0, 4, 8, 12], q1 = tmp[2, 6, 10, 14]
+    // q2 = tmp[1, 5, 9, 13], q3 = tmp[3, 7, 11, 15]
+    "vswp            d1, d4                      \n" // vtrn.64 q0, q2
+    "vswp            d3, d6                      \n" // vtrn.64 q1, q3
+    "vtrn.32         q0, q1                      \n"
+    "vtrn.32         q2, q3                      \n"
+
+    // part 2
+    // The rounding constant 3 is folded into the 'dc' term before the
+    // final '>> 3'.
+    "vmov.s32        q4, #3                      \n" // dc = 3
+    "vadd.s32        q0, q0, q4                  \n" // dc = tmp[0] + 3
+    "vadd.s32        q6, q0, q3                  \n" // a0 = dc + tmp[3]
+    "vadd.s32        q7, q1, q2                  \n" // a1 = tmp[1] + tmp[2]
+    "vsub.s32        q8, q1, q2                  \n" // a2 = tmp[1] - tmp[2]
+    "vsub.s32        q9, q0, q3                  \n" // a3 = dc - tmp[3]
+
+    // Shift and narrow the 32-bit results back to 16 bits.
+    "vadd.s32        q0, q6, q7                  \n"
+    "vshrn.s32       d0, q0, #3                  \n" // (a0 + a1) >> 3
+    "vadd.s32        q1, q9, q8                  \n"
+    "vshrn.s32       d1, q1, #3                  \n" // (a3 + a2) >> 3
+    "vsub.s32        q2, q6, q7                  \n"
+    "vshrn.s32       d2, q2, #3                  \n" // (a0 - a1) >> 3
+    "vsub.s32        q3, q9, q8                  \n"
+    "vshrn.s32       d3, q3, #3                  \n" // (a3 - a2) >> 3
+
+    // set the results to output
+    // (one 16-bit lane per store, 'out' advancing by kStep bytes each time)
+    "vst1.16         d0[0], [%[out]], %[kStep]      \n"
+    "vst1.16         d1[0], [%[out]], %[kStep]      \n"
+    "vst1.16         d2[0], [%[out]], %[kStep]      \n"
+    "vst1.16         d3[0], [%[out]], %[kStep]      \n"
+    "vst1.16         d0[1], [%[out]], %[kStep]      \n"
+    "vst1.16         d1[1], [%[out]], %[kStep]      \n"
+    "vst1.16         d2[1], [%[out]], %[kStep]      \n"
+    "vst1.16         d3[1], [%[out]], %[kStep]      \n"
+    "vst1.16         d0[2], [%[out]], %[kStep]      \n"
+    "vst1.16         d1[2], [%[out]], %[kStep]      \n"
+    "vst1.16         d2[2], [%[out]], %[kStep]      \n"
+    "vst1.16         d3[2], [%[out]], %[kStep]      \n"
+    "vst1.16         d0[3], [%[out]], %[kStep]      \n"
+    "vst1.16         d1[3], [%[out]], %[kStep]      \n"
+    "vst1.16         d2[3], [%[out]], %[kStep]      \n"
+    "vst1.16         d3[3], [%[out]], %[kStep]      \n"
+
+    : [out] "+r"(out)  // modified registers
+    : [in] "r"(in), [kStep] "r"(kStep)  // constants
+    : "memory", "q0", "q1", "q2", "q3", "q4",
+      "q5", "q6", "q7", "q8", "q9" // clobbered
+  );
+}
+
+// Forward transform.
+
+// adapted from vp8/encoder/arm/neon/shortfdct_neon.asm
+
+// 16-bit multipliers used by FTransform(): four lanes of 5352 followed by
+// four lanes of 2217, so that a single load fills d16 and d17.
+static const int16_t kCoeff16[] = {
+  5352,  5352,  5352, 5352, 2217,  2217,  2217, 2217
+};
+// 32-bit rounding constants used by FTransform(), four lanes each:
+// 1812 and 937 are added before the '>> 9' of the first pass,
+// 12000 and 51000 before the '>> 16' of the second pass.
+static const int32_t kCoeff32[] = {
+   1812,  1812,  1812,  1812,
+    937,   937,   937,   937,
+  12000, 12000, 12000, 12000,
+  51000, 51000, 51000, 51000
+};
+
+// Forward transform of the 4x4 difference block 'src - ref'.
+//   src, ref: four rows each, BPS bytes apart. Note that 8 bytes are loaded
+//             per row although only the first 4 are used.
+//   out:      receives the 16 resulting coefficients, stored contiguously.
+static void FTransform(const uint8_t* src, const uint8_t* ref,
+                       int16_t* out) {
+  const int kBPS = BPS;
+  const uint8_t* src_ptr = src;
+  const uint8_t* ref_ptr = ref;
+  const int16_t* coeff16 = kCoeff16;
+  const int32_t* coeff32 = kCoeff32;
+
+  __asm__ volatile (
+    // load the four rows of src into d8, d10, d9, d11 (q4, q5)
+    "vld1.8 {d8},  [%[src_ptr]], %[kBPS]      \n"
+    "vld1.8 {d10}, [%[src_ptr]], %[kBPS]      \n"
+    "vld1.8 {d9},  [%[src_ptr]], %[kBPS]      \n"
+    "vld1.8 {d11}, [%[src_ptr]]               \n"
+
+    // load the four rows of ref into d12, d14, d13, d15 (q6, q7)
+    "vld1.8 {d12}, [%[ref_ptr]], %[kBPS]      \n"
+    "vld1.8 {d14}, [%[ref_ptr]], %[kBPS]      \n"
+    "vld1.8 {d13}, [%[ref_ptr]], %[kBPS]      \n"
+    "vld1.8 {d15}, [%[ref_ptr]]               \n"
+
+    // Pack the first 4 bytes of each row into q4 (src) and q6 (ref)
+    "vtrn.32     q4, q5                       \n"
+    "vtrn.32     q6, q7                       \n"
+
+    // d[0-3] = src - ref
+    "vsubl.u8    q0, d8, d12                  \n"
+    "vsubl.u8    q1, d9, d13                  \n"
+
+    // load coeff16 into q8(d16=5352, d17=2217)
+    "vld1.16     {q8}, [%[coeff16]]           \n"
+
+    // load the first half of coeff32 into q9 = 1812, q10 = 937
+    "vld1.32     {q9, q10}, [%[coeff32]]!     \n"
+
+    // load the second half of coeff32 into q11=12000, q12=51000
+    "vld1.32     {q11,q12}, [%[coeff32]]      \n"
+
+    // part 1
+    // Transpose. Register dN is the same as dN in C
+    "vtrn.32         d0, d2                   \n"
+    "vtrn.32         d1, d3                   \n"
+    "vtrn.16         d0, d1                   \n"
+    "vtrn.16         d2, d3                   \n"
+
+    "vadd.s16        d4, d0, d3               \n" // a0 = d0 + d3
+    "vadd.s16        d5, d1, d2               \n" // a1 = d1 + d2
+    "vsub.s16        d6, d1, d2               \n" // a2 = d1 - d2
+    "vsub.s16        d7, d0, d3               \n" // a3 = d0 - d3
+
+    "vadd.s16        d0, d4, d5               \n" // a0 + a1
+    "vshl.s16        d0, d0, #3               \n" // temp[0+i*4] = (a0+a1) << 3
+    "vsub.s16        d2, d4, d5               \n" // a0 - a1
+    "vshl.s16        d2, d2, #3               \n" // temp[2+i*4] = (a0-a1) << 3
+
+    // q9 and q10 already hold the rounding constants: accumulate onto them.
+    "vmlal.s16       q9, d7, d16              \n" // a3*5352 + 1812
+    "vmlal.s16       q10, d7, d17             \n" // a3*2217 + 937
+    "vmlal.s16       q9, d6, d17              \n" // a2*2217 + a3*5352 + 1812
+    "vmlsl.s16       q10, d6, d16             \n" // a3*2217 + 937 - a2*5352
+
+    // temp[1+i*4] = (a2*2217 + a3*5352 + 1812) >> 9
+    // temp[3+i*4] = (a3*2217 + 937 - a2*5352) >> 9
+    "vshrn.s32       d1, q9, #9               \n"
+    "vshrn.s32       d3, q10, #9              \n"
+
+    // part 2
+    // transpose d0=ip[0], d1=ip[4], d2=ip[8], d3=ip[12]
+    "vtrn.32         d0, d2                   \n"
+    "vtrn.32         d1, d3                   \n"
+    "vtrn.16         d0, d1                   \n"
+    "vtrn.16         d2, d3                   \n"
+
+    // d26 = rounding constant for the final '>> 4'
+    "vmov.s16        d26, #7                  \n"
+
+    "vadd.s16        d4, d0, d3               \n" // a1 = ip[0] + ip[12]
+    "vadd.s16        d5, d1, d2               \n" // b1 = ip[4] + ip[8]
+    "vsub.s16        d6, d1, d2               \n" // c1 = ip[4] - ip[8]
+    "vadd.s16        d4, d4, d26              \n" // a1 + 7
+    "vsub.s16        d7, d0, d3               \n" // d1 = ip[0] - ip[12]
+
+    "vadd.s16        d0, d4, d5               \n" // op[0] = a1 + b1 + 7
+    "vsub.s16        d2, d4, d5               \n" // op[8] = a1 - b1 + 7
+
+    "vmlal.s16       q11, d7, d16             \n" // d1*5352 + 12000
+    "vmlal.s16       q12, d7, d17             \n" // d1*2217 + 51000
+
+    // d4 = per-lane mask: all ones where d1 == 0
+    "vceq.s16        d4, d7, #0               \n"
+
+    "vshr.s16        d0, d0, #4               \n"
+    "vshr.s16        d2, d2, #4               \n"
+
+    "vmlal.s16       q11, d6, d17             \n" // c1*2217 + d1*5352 + 12000
+    "vmlsl.s16       q12, d6, d16             \n" // d1*2217 - c1*5352 + 51000
+
+    "vmvn            d4, d4                   \n" // !(d1 == 0)
+    // op[4] = (c1*2217 + d1*5352 + 12000)>>16
+    "vshrn.s32       d1, q11, #16             \n"
+    // op[4] += (d1!=0)
+    // (the mask lanes are -1 where d1 != 0, hence the subtraction)
+    "vsub.s16        d1, d1, d4               \n"
+    // op[12]= (d1*2217 - c1*5352 + 51000)>>16
+    "vshrn.s32       d3, q12, #16             \n"
+
+    // set result to out array
+    "vst1.16         {q0, q1}, [%[out]]   \n"
+    : [src_ptr] "+r"(src_ptr), [ref_ptr] "+r"(ref_ptr),
+      [coeff32] "+r"(coeff32)          // modified registers
+    : [kBPS] "r"(kBPS), [coeff16] "r"(coeff16),
+      [out] "r"(out)                   // constants
+    : "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9",
+      "q10", "q11", "q12", "q13"       // clobbered
+  );
+}
+
+// Forward WHT.
+// Gathers 16 input values located 16 int16_t apart (in[0], in[16], ...,
+// in[240]), applies the two butterfly passes in 32-bit precision and stores
+// the 16 results, each shifted right by 1, contiguously to 'out'.
+static void FTransformWHT(const int16_t* in, int16_t* out) {
+  const int kStep = 32;  // byte stride between two loaded inputs (16 int16_t)
+  __asm__ volatile (
+    // d0 = in[0 * 16] , d1 = in[1 * 16]
+    // d2 = in[2 * 16] , d3 = in[3 * 16]
+    // (one 16-bit lane per load, 'in' advancing by kStep bytes each time)
+    "vld1.16         d0[0], [%[in]], %[kStep]   \n"
+    "vld1.16         d1[0], [%[in]], %[kStep]   \n"
+    "vld1.16         d2[0], [%[in]], %[kStep]   \n"
+    "vld1.16         d3[0], [%[in]], %[kStep]   \n"
+    "vld1.16         d0[1], [%[in]], %[kStep]   \n"
+    "vld1.16         d1[1], [%[in]], %[kStep]   \n"
+    "vld1.16         d2[1], [%[in]], %[kStep]   \n"
+    "vld1.16         d3[1], [%[in]], %[kStep]   \n"
+    "vld1.16         d0[2], [%[in]], %[kStep]   \n"
+    "vld1.16         d1[2], [%[in]], %[kStep]   \n"
+    "vld1.16         d2[2], [%[in]], %[kStep]   \n"
+    "vld1.16         d3[2], [%[in]], %[kStep]   \n"
+    "vld1.16         d0[3], [%[in]], %[kStep]   \n"
+    "vld1.16         d1[3], [%[in]], %[kStep]   \n"
+    "vld1.16         d2[3], [%[in]], %[kStep]   \n"
+    "vld1.16         d3[3], [%[in]], %[kStep]   \n"
+
+    // First pass, widening to 32 bits.
+    "vaddl.s16       q2, d0, d2                 \n" // a0=(in[0*16]+in[2*16])
+    "vaddl.s16       q3, d1, d3                 \n" // a1=(in[1*16]+in[3*16])
+    "vsubl.s16       q4, d1, d3                 \n" // a2=(in[1*16]-in[3*16])
+    "vsubl.s16       q5, d0, d2                 \n" // a3=(in[0*16]-in[2*16])
+
+    "vqadd.s32       q6, q2, q3                 \n" // a0 + a1
+    "vqadd.s32       q7, q5, q4                 \n" // a3 + a2
+    "vqsub.s32       q8, q5, q4                 \n" // a3 - a2
+    "vqsub.s32       q9, q2, q3                 \n" // a0 - a1
+
+    // Transpose
+    // q6 = tmp[0, 1,  2,  3] ; q7 = tmp[ 4,  5,  6,  7]
+    // q8 = tmp[8, 9, 10, 11] ; q9 = tmp[12, 13, 14, 15]
+    "vswp            d13, d16                   \n" // vtrn.64 q6, q8
+    "vswp            d15, d18                   \n" // vtrn.64 q7, q9
+    "vtrn.32         q6, q7                     \n"
+    "vtrn.32         q8, q9                     \n"
+
+    // Second pass.
+    "vqadd.s32       q0, q6, q8                 \n" // a0 = tmp[0] + tmp[8]
+    "vqadd.s32       q1, q7, q9                 \n" // a1 = tmp[4] + tmp[12]
+    "vqsub.s32       q2, q7, q9                 \n" // a2 = tmp[4] - tmp[12]
+    "vqsub.s32       q3, q6, q8                 \n" // a3 = tmp[0] - tmp[8]
+
+    "vqadd.s32       q4, q0, q1                 \n" // b0 = a0 + a1
+    "vqadd.s32       q5, q3, q2                 \n" // b1 = a3 + a2
+    "vqsub.s32       q6, q3, q2                 \n" // b2 = a3 - a2
+    "vqsub.s32       q7, q0, q1                 \n" // b3 = a0 - a1
+
+    // Shift and narrow back to 16 bits.
+    "vshrn.s32       d18, q4, #1                \n" // b0 >> 1
+    "vshrn.s32       d19, q5, #1                \n" // b1 >> 1
+    "vshrn.s32       d20, q6, #1                \n" // b2 >> 1
+    "vshrn.s32       d21, q7, #1                \n" // b3 >> 1
+
+    "vst1.16         {q9, q10}, [%[out]]        \n"
+
+    : [in] "+r"(in)
+    : [kStep] "r"(kStep), [out] "r"(out)
+    : "memory", "q0", "q1", "q2", "q3", "q4", "q5",
+      "q6", "q7", "q8", "q9", "q10"       // clobbered
+  ) ;
+}
+
+//------------------------------------------------------------------------------
+// Texture distortion
+//
+// We try to match the spectral content (weighted) between source and
+// reconstructed samples.
+
+// Hadamard transform
+// For each of the two 4x4 blocks 'a' and 'b' (rows BPS bytes apart), computes
+// the sum of the absolute values of the transformed coefficients, weighted by
+// the 16 weights in 'w'. Returns abs(sum_a - sum_b) >> 5.
+// Both blocks are transformed side by side within the assembly below; no C
+// helper function is called.
+static int Disto4x4(const uint8_t* const a, const uint8_t* const b,
+                    const uint16_t* const w) {
+  const int kBPS = BPS;
+  // Local copies, since the assembly takes the pointers as read-write
+  // operands while the parameters are const.
+  const uint8_t* A = a;
+  const uint8_t* B = b;
+  const uint16_t* W = w;
+  int sum;
+  __asm__ volatile (
+    // Load the four rows of 'a' into d0/d2 and of 'b' into d1/d3.
+    "vld1.32         d0[0], [%[a]], %[kBPS]   \n"
+    "vld1.32         d0[1], [%[a]], %[kBPS]   \n"
+    "vld1.32         d2[0], [%[a]], %[kBPS]   \n"
+    "vld1.32         d2[1], [%[a]]            \n"
+
+    "vld1.32         d1[0], [%[b]], %[kBPS]   \n"
+    "vld1.32         d1[1], [%[b]], %[kBPS]   \n"
+    "vld1.32         d3[0], [%[b]], %[kBPS]   \n"
+    "vld1.32         d3[1], [%[b]]            \n"
+
+    // a d0/d2, b d1/d3
+    // d0/d1: 01 01 01 01
+    // d2/d3: 23 23 23 23
+    // But: it goes 01 45 23 67
+    // Notice the middle values are transposed
+    "vtrn.16         q0, q1                   \n"
+
+    // {a0, a1} = {in[0] + in[2], in[1] + in[3]}
+    "vaddl.u8        q2, d0, d2               \n"
+    "vaddl.u8        q10, d1, d3              \n"
+    // {a3, a2} = {in[0] - in[2], in[1] - in[3]}
+    "vsubl.u8        q3, d0, d2               \n"
+    "vsubl.u8        q11, d1, d3              \n"
+
+    // tmp[0] = a0 + a1
+    "vpaddl.s16      q0, q2                   \n"
+    "vpaddl.s16      q8, q10                  \n"
+
+    // tmp[1] = a3 + a2
+    "vpaddl.s16      q1, q3                   \n"
+    "vpaddl.s16      q9, q11                  \n"
+
+    // No pair subtract
+    // q2 = {a0, a3}
+    // q3 = {a1, a2}
+    "vtrn.16         q2, q3                   \n"
+    "vtrn.16         q10, q11                 \n"
+
+    // {tmp[3], tmp[2]} = {a0 - a1, a3 - a2}
+    "vsubl.s16       q12, d4, d6              \n"
+    "vsubl.s16       q13, d5, d7              \n"
+    "vsubl.s16       q14, d20, d22            \n"
+    "vsubl.s16       q15, d21, d23            \n"
+
+    // separate tmp[3] and tmp[2]
+    // q12 = tmp[3]
+    // q13 = tmp[2]
+    "vtrn.32         q12, q13                 \n"
+    "vtrn.32         q14, q15                 \n"
+
+    // Transpose tmp for a
+    "vswp            d1, d26                  \n" // vtrn.64
+    "vswp            d3, d24                  \n" // vtrn.64
+    "vtrn.32         q0, q1                   \n"
+    "vtrn.32         q13, q12                 \n"
+
+    // Transpose tmp for b
+    "vswp            d17, d30                 \n" // vtrn.64
+    "vswp            d19, d28                 \n" // vtrn.64
+    "vtrn.32         q8, q9                   \n"
+    "vtrn.32         q15, q14                 \n"
+
+    // The first Q register is a, the second b.
+    // q0/8 tmp[0-3]
+    // q13/15 tmp[4-7]
+    // q1/9 tmp[8-11]
+    // q12/14 tmp[12-15]
+
+    // These are still in 01 45 23 67 order. We fix it easily in the addition
+    // case but the subtraction propagates them.
+    "vswp            d3, d27                  \n"
+    "vswp            d19, d31                 \n"
+
+    // a0 = tmp[0] + tmp[8]
+    "vadd.s32        q2, q0, q1               \n"
+    "vadd.s32        q3, q8, q9               \n"
+
+    // a1 = tmp[4] + tmp[12]
+    "vadd.s32        q10, q13, q12            \n"
+    "vadd.s32        q11, q15, q14            \n"
+
+    // a2 = tmp[4] - tmp[12]
+    "vsub.s32        q13, q13, q12            \n"
+    "vsub.s32        q15, q15, q14            \n"
+
+    // a3 = tmp[0] - tmp[8]
+    "vsub.s32        q0, q0, q1               \n"
+    "vsub.s32        q8, q8, q9               \n"
+
+    // b0 = a0 + a1
+    "vadd.s32        q1, q2, q10              \n"
+    "vadd.s32        q9, q3, q11              \n"
+
+    // b1 = a3 + a2
+    "vadd.s32        q12, q0, q13             \n"
+    "vadd.s32        q14, q8, q15             \n"
+
+    // b2 = a3 - a2
+    "vsub.s32        q0, q0, q13              \n"
+    "vsub.s32        q8, q8, q15              \n"
+
+    // b3 = a0 - a1
+    "vsub.s32        q2, q2, q10              \n"
+    "vsub.s32        q3, q3, q11              \n"
+
+    // Load the 16 weights w[0..15] into q10, q11.
+    "vld1.64         {q10, q11}, [%[w]]       \n"
+
+    // abs(b0)
+    "vabs.s32        q1, q1                   \n"
+    "vabs.s32        q9, q9                   \n"
+    // abs(b1)
+    "vabs.s32        q12, q12                 \n"
+    "vabs.s32        q14, q14                 \n"
+    // abs(b2)
+    "vabs.s32        q0, q0                   \n"
+    "vabs.s32        q8, q8                   \n"
+    // abs(b3)
+    "vabs.s32        q2, q2                   \n"
+    "vabs.s32        q3, q3                   \n"
+
+    // expand w before using.
+    "vmovl.u16       q13, d20                 \n"
+    "vmovl.u16       q15, d21                 \n"
+
+    // w[0] * abs(b0)
+    "vmul.u32        q1, q1, q13              \n"
+    "vmul.u32        q9, q9, q13              \n"
+
+    // w[4] * abs(b1)
+    "vmla.u32        q1, q12, q15             \n"
+    "vmla.u32        q9, q14, q15             \n"
+
+    // expand w before using.
+    "vmovl.u16       q13, d22                 \n"
+    "vmovl.u16       q15, d23                 \n"
+
+    // w[8] * abs(b2)
+    "vmla.u32        q1, q0, q13              \n"
+    "vmla.u32        q9, q8, q13              \n"
+
+    // w[12] * abs(b3)
+    "vmla.u32        q1, q2, q15              \n"
+    "vmla.u32        q9, q3, q15              \n"
+
+    // Sum the arrays
+    "vpaddl.u32      q1, q1                   \n"
+    "vpaddl.u32      q9, q9                   \n"
+    "vadd.u64        d2, d3                   \n"
+    "vadd.u64        d18, d19                 \n"
+
+    // Hadamard transform needs 4 bits of extra precision (2 bits in each
+    // direction) for dynamic range. Weights w[] are 16bits at max, so the
+    // maximum precision for coeff is 8bit of input + 4bits of Hadamard
+    // transform + 16bits for w[] + 2 bits of abs() summation.
+    //
+    // This uses a maximum of 31 bits (signed). Discarding the top 32 bits is
+    // A-OK.
+
+    // difference of the two sums (d2: block a, d18: block b); the sign is
+    // irrelevant since the absolute value is taken next.
+    "vsub.u32        d0, d2, d18              \n"
+    // abs(sum2 - sum1)
+    "vabs.s32        d0, d0                   \n"
+    // abs(sum2 - sum1) >> 5
+    "vshr.u32        d0, #5                   \n"
+
+    // It would be better to move the value straight into r0 but I'm not
+    // entirely sure how this works with inline assembly.
+    "vmov.32         %[sum], d0[0]            \n"
+
+    : [sum] "=r"(sum), [a] "+r"(A), [b] "+r"(B), [w] "+r"(W)
+    : [kBPS] "r"(kBPS)
+    : "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9",
+      "q10", "q11", "q12", "q13", "q14", "q15"  // clobbered
+  ) ;
+
+  return sum;
+}
+
+// Texture distortion of a 16x16 area: accumulates Disto4x4() over the sixteen
+// 4x4 sub-blocks of 'a' and 'b', all using the same weights 'w'.
+static int Disto16x16(const uint8_t* const a, const uint8_t* const b,
+                      const uint16_t* const w) {
+  int total = 0;
+  int row, col;
+  for (row = 0; row < 4; ++row) {
+    // Offset of the first sample of this row of 4x4 sub-blocks.
+    const int row_offset = row * 4 * BPS;
+    for (col = 0; col < 4; ++col) {
+      const int offset = row_offset + 4 * col;
+      total += Disto4x4(a + offset, b + offset, w);
+    }
+  }
+  return total;
+}
+
+#endif   // WEBP_USE_NEON
+
+//------------------------------------------------------------------------------
+// Entry point
+
+extern void VP8EncDspInitNEON(void);
+
+// Installs the NEON implementations defined in this file into the encoder's
+// DSP function pointers. Does nothing when WEBP_USE_NEON is not defined.
+void VP8EncDspInitNEON(void) {
+#if defined(WEBP_USE_NEON)
+  // Inverse transforms.
+  VP8ITransform = ITransform;
+  VP8ITransformWHT = ITransformWHT;
+
+  // Forward transforms.
+  VP8FTransform = FTransform;
+  VP8FTransformWHT = FTransformWHT;
+
+  // Texture distortion metrics.
+  VP8TDisto16x16 = Disto16x16;
+  VP8TDisto4x4 = Disto4x4;
+#endif   // WEBP_USE_NEON
+}
+
--- a/src/dsp/enc_sse2.c
+++ b/src/dsp/enc_sse2.c
@ -1,8 +1,10 @@
 // Copyright 2011 Google Inc. All Rights Reserved.
 //
-// This code is licensed under the same terms as WebM:
-//  Software License Agreement:  http://www.webmproject.org/license/software/
-//  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
 // -----------------------------------------------------------------------------
 //
 // SSE2 version of speed-critical encoding functions.
@ -17,21 +19,48 @@

 #include "../enc/vp8enci.h"

-#if defined(__cplusplus) || defined(c_plusplus)
-extern "C" {
+//------------------------------------------------------------------------------
+// Quite useful function for debugging. Left here for convenience.
+
+#if 0
+#include <stdio.h>
+// Debugging helper: prints the content of the 128-bit register 'r' to stdout
+// as hexadecimal lanes, prefixed by 'name'. 'size' is the lane width in bits
+// (8, 16 or 32); any other value prints the register as two 64-bit lanes.
+static void PrintReg(const __m128i r, const char* const name, int size) {
+  int n;
+  // Union used to read back the individual lanes of the register.
+  union {
+    __m128i r;
+    uint8_t i8[16];
+    uint16_t i16[8];
+    uint32_t i32[4];
+    uint64_t i64[2];
+  } tmp;
+  tmp.r = r;
+  printf("%s\t: ", name);
+  if (size == 8) {
+    for (n = 0; n < 16; ++n) printf("%.2x ", tmp.i8[n]);
+  } else if (size == 16) {
+    for (n = 0; n < 8; ++n) printf("%.4x ", tmp.i16[n]);
+  } else if (size == 32) {
+    for (n = 0; n < 4; ++n) printf("%.8x ", tmp.i32[n]);
+  } else {
+    // uint64_t is not 'unsigned long' on every target (e.g. 32-bit or LLP64
+    // ones): cast explicitly so that the argument matches the conversion.
+    for (n = 0; n < 2; ++n) {
+      printf("%.16llx ", (unsigned long long)tmp.i64[n]);
+    }
+  }
+  printf("\n");
+}
 #endif

 //------------------------------------------------------------------------------
 // Compute susceptibility based on DCT-coeff histograms:
 // the higher, the "easier" the macroblock is to compress.

-static int CollectHistogramSSE2(const uint8_t* ref, const uint8_t* pred,
-                                int start_block, int end_block) {
-  int histo[MAX_COEFF_THRESH + 1] = { 0 };
-  int16_t out[16];
-  int j, k;
+static void CollectHistogramSSE2(const uint8_t* ref, const uint8_t* pred,
+                                 int start_block, int end_block,
+                                 VP8Histogram* const histo) {
  const __m128i max_coeff_thresh = _mm_set1_epi16(MAX_COEFF_THRESH);
+  int j;
  for (j = start_block; j < end_block; ++j) {
+    int16_t out[16];
+    int k;
+
    VP8FTransform(ref + VP8DspScan[j], pred + VP8DspScan[j], out);

    // Convert coefficients to bin (within out[]).
@ -47,9 +76,9 @@ static int CollectHistogramSSE2(const uint8_t* ref, const uint8_t* pred,
      const __m128i xor1 = _mm_xor_si128(out1, sign1);
      const __m128i abs0 = _mm_sub_epi16(xor0, sign0);
      const __m128i abs1 = _mm_sub_epi16(xor1, sign1);
-      // v = abs(out) >> 2
-      const __m128i v0 = _mm_srai_epi16(abs0, 2);
-      const __m128i v1 = _mm_srai_epi16(abs1, 2);
+      // v = abs(out) >> 3
+      const __m128i v0 = _mm_srai_epi16(abs0, 3);
+      const __m128i v1 = _mm_srai_epi16(abs1, 3);
      // bin = min(v, MAX_COEFF_THRESH)
      const __m128i bin0 = _mm_min_epi16(v0, max_coeff_thresh);
      const __m128i bin1 = _mm_min_epi16(v1, max_coeff_thresh);
@ -58,13 +87,11 @@ static int CollectHistogramSSE2(const uint8_t* ref, const uint8_t* pred,
      _mm_storeu_si128((__m128i*)&out[8], bin1);
    }

-    // Use bin to update histogram.
+    // Convert coefficients to bin.
    for (k = 0; k < 16; ++k) {
-      histo[out[k]]++;
+      histo->distribution[out[k]]++;
    }
  }
-
-  return VP8GetAlpha(histo);
 }

 //------------------------------------------------------------------------------
@ -243,7 +270,7 @@ static void ITransformSSE2(const uint8_t* ref, const int16_t* in, uint8_t* dst,

  // Add inverse transform to 'ref' and store.
  {
-    const __m128i zero = _mm_set1_epi16(0);
+    const __m128i zero = _mm_setzero_si128();
    // Load the reference(s).
    __m128i ref0, ref1, ref2, ref3;
    if (do_two) {
@ -295,17 +322,23 @@ static void FTransformSSE2(const uint8_t* src, const uint8_t* ref,
                           int16_t* out) {
  const __m128i zero = _mm_setzero_si128();
  const __m128i seven = _mm_set1_epi16(7);
-  const __m128i k7500 = _mm_set1_epi32(7500);
-  const __m128i k14500 = _mm_set1_epi32(14500);
+  const __m128i k937 = _mm_set1_epi32(937);
+  const __m128i k1812 = _mm_set1_epi32(1812);
  const __m128i k51000 = _mm_set1_epi32(51000);
  const __m128i k12000_plus_one = _mm_set1_epi32(12000 + (1 << 16));
  const __m128i k5352_2217 = _mm_set_epi16(5352,  2217, 5352,  2217,
                                           5352,  2217, 5352,  2217);
  const __m128i k2217_5352 = _mm_set_epi16(2217, -5352, 2217, -5352,
                                           2217, -5352, 2217, -5352);
-
+  const __m128i k88p = _mm_set_epi16(8, 8, 8, 8, 8, 8, 8, 8);
+  const __m128i k88m = _mm_set_epi16(-8, 8, -8, 8, -8, 8, -8, 8);
+  const __m128i k5352_2217p = _mm_set_epi16(2217, 5352, 2217, 5352,
+                                            2217, 5352, 2217, 5352);
+  const __m128i k5352_2217m = _mm_set_epi16(-5352, 2217, -5352, 2217,
+                                            -5352, 2217, -5352, 2217);
  __m128i v01, v32;

+
  // Difference between src and ref and initial transpose.
  {
    // Load src and convert to 16b.
@ -326,73 +359,52 @@ static void FTransformSSE2(const uint8_t* src, const uint8_t* ref,
    const __m128i ref_1 = _mm_unpacklo_epi8(ref1, zero);
    const __m128i ref_2 = _mm_unpacklo_epi8(ref2, zero);
    const __m128i ref_3 = _mm_unpacklo_epi8(ref3, zero);
-    // Compute difference.
+    // Compute difference. -> 00 01 02 03 00 00 00 00
    const __m128i diff0 = _mm_sub_epi16(src_0, ref_0);
    const __m128i diff1 = _mm_sub_epi16(src_1, ref_1);
    const __m128i diff2 = _mm_sub_epi16(src_2, ref_2);
    const __m128i diff3 = _mm_sub_epi16(src_3, ref_3);

-    // Transpose.
+
+    // Unpack and shuffle
    // 00 01 02 03   0 0 0 0
    // 10 11 12 13   0 0 0 0
    // 20 21 22 23   0 0 0 0
    // 30 31 32 33   0 0 0 0
-    const __m128i transpose0_0 = _mm_unpacklo_epi16(diff0, diff1);
-    const __m128i transpose0_1 = _mm_unpacklo_epi16(diff2, diff3);
-    // 00 10 01 11   02 12 03 13
-    // 20 30 21 31   22 32 23 33
-    const __m128i v23 = _mm_unpackhi_epi32(transpose0_0, transpose0_1);
-    v01 = _mm_unpacklo_epi32(transpose0_0, transpose0_1);
-    v32 = _mm_shuffle_epi32(v23, _MM_SHUFFLE(1, 0, 3, 2));
-    // a02 a12 a22 a32   a03 a13 a23 a33
-    // a00 a10 a20 a30   a01 a11 a21 a31
-    // a03 a13 a23 a33   a02 a12 a22 a32
-  }
+    const __m128i shuf01 = _mm_unpacklo_epi32(diff0, diff1);
+    const __m128i shuf23 = _mm_unpacklo_epi32(diff2, diff3);
+    // 00 01 10 11 02 03 12 13
+    // 20 21 30 31 22 23 32 33
+    const __m128i shuf01_p =
+        _mm_shufflehi_epi16(shuf01, _MM_SHUFFLE(2, 3, 0, 1));
+    const __m128i shuf23_p =
+        _mm_shufflehi_epi16(shuf23, _MM_SHUFFLE(2, 3, 0, 1));
+    // 00 01 10 11 03 02 13 12
+    // 20 21 30 31 23 22 33 32
+    const __m128i s01 = _mm_unpacklo_epi64(shuf01_p, shuf23_p);
+    const __m128i s32 = _mm_unpackhi_epi64(shuf01_p, shuf23_p);
+    // 00 01 10 11 20 21 30 31
+    // 03 02 13 12 23 22 33 32
+    const __m128i a01 = _mm_add_epi16(s01, s32);
+    const __m128i a32 = _mm_sub_epi16(s01, s32);
+    // [d0 + d3 | d1 + d2 | ...] = [a0 a1 | a0' a1' | ... ]
+    // [d0 - d3 | d1 - d2 | ...] = [a3 a2 | a3' a2' | ... ]

-  // First pass and subsequent transpose.
-  {
-    // Same operations are done on the (0,3) and (1,2) pairs.
-    // b0 = (a0 + a3) << 3
-    // b1 = (a1 + a2) << 3
-    // b3 = (a0 - a3) << 3
-    // b2 = (a1 - a2) << 3
-    const __m128i a01 = _mm_add_epi16(v01, v32);
-    const __m128i a32 = _mm_sub_epi16(v01, v32);
-    const __m128i b01 = _mm_slli_epi16(a01, 3);
-    const __m128i b32 = _mm_slli_epi16(a32, 3);
-    const __m128i b11 = _mm_unpackhi_epi64(b01, b01);
-    const __m128i b22 = _mm_unpackhi_epi64(b32, b32);
-
-    // e0 = b0 + b1
-    // e2 = b0 - b1
-    const __m128i e0 = _mm_add_epi16(b01, b11);
-    const __m128i e2 = _mm_sub_epi16(b01, b11);
-    const __m128i e02 = _mm_unpacklo_epi64(e0, e2);
-
-    // e1 = (b3 * 5352 + b2 * 2217 + 14500) >> 12
-    // e3 = (b3 * 2217 - b2 * 5352 +  7500) >> 12
-    const __m128i b23 = _mm_unpacklo_epi16(b22, b32);
-    const __m128i c1 = _mm_madd_epi16(b23, k5352_2217);
-    const __m128i c3 = _mm_madd_epi16(b23, k2217_5352);
-    const __m128i d1 = _mm_add_epi32(c1, k14500);
-    const __m128i d3 = _mm_add_epi32(c3, k7500);
-    const __m128i e1 = _mm_srai_epi32(d1, 12);
-    const __m128i e3 = _mm_srai_epi32(d3, 12);
-    const __m128i e13 = _mm_packs_epi32(e1, e3);
-
-    // Transpose.
-    // 00 01 02 03  20 21 22 23
-    // 10 11 12 13  30 31 32 33
-    const __m128i transpose0_0 = _mm_unpacklo_epi16(e02, e13);
-    const __m128i transpose0_1 = _mm_unpackhi_epi16(e02, e13);
-    // 00 10 01 11   02 12 03 13
-    // 20 30 21 31   22 32 23 33
-    const __m128i v23 = _mm_unpackhi_epi32(transpose0_0, transpose0_1);
-    v01 = _mm_unpacklo_epi32(transpose0_0, transpose0_1);
-    v32 = _mm_shuffle_epi32(v23, _MM_SHUFFLE(1, 0, 3, 2));
-    // 02 12 22 32   03 13 23 33
-    // 00 10 20 30   01 11 21 31
-    // 03 13 23 33   02 12 22 32
+    const __m128i tmp0 = _mm_madd_epi16(a01, k88p);  // [ (a0 + a1) << 3, ... ]
+    const __m128i tmp2 = _mm_madd_epi16(a01, k88m);  // [ (a0 - a1) << 3, ... ]
+    const __m128i tmp1_1 = _mm_madd_epi16(a32, k5352_2217p);
+    const __m128i tmp3_1 = _mm_madd_epi16(a32, k5352_2217m);
+    const __m128i tmp1_2 = _mm_add_epi32(tmp1_1, k1812);
+    const __m128i tmp3_2 = _mm_add_epi32(tmp3_1, k937);
+    const __m128i tmp1   = _mm_srai_epi32(tmp1_2, 9);
+    const __m128i tmp3   = _mm_srai_epi32(tmp3_2, 9);
+    const __m128i s03 = _mm_packs_epi32(tmp0, tmp2);
+    const __m128i s12 = _mm_packs_epi32(tmp1, tmp3);
+    const __m128i s_lo = _mm_unpacklo_epi16(s03, s12);   // 0 1 0 1 0 1...
+    const __m128i s_hi = _mm_unpackhi_epi16(s03, s12);   // 2 3 2 3 2 3
+    const __m128i v23 = _mm_unpackhi_epi32(s_lo, s_hi);
+    v01 = _mm_unpacklo_epi32(s_lo, s_hi);
+    v32 = _mm_shuffle_epi32(v23, _MM_SHUFFLE(1, 0, 3, 2));  // 3 2 3 2 3 2..
  }

  // Second pass
@ -406,13 +418,12 @@ static void FTransformSSE2(const uint8_t* src, const uint8_t* ref,
    const __m128i a32 = _mm_sub_epi16(v01, v32);
    const __m128i a11 = _mm_unpackhi_epi64(a01, a01);
    const __m128i a22 = _mm_unpackhi_epi64(a32, a32);
+    const __m128i a01_plus_7 = _mm_add_epi16(a01, seven);

    // d0 = (a0 + a1 + 7) >> 4;
    // d2 = (a0 - a1 + 7) >> 4;
-    const __m128i b0 = _mm_add_epi16(a01, a11);
-    const __m128i b2 = _mm_sub_epi16(a01, a11);
-    const __m128i c0 = _mm_add_epi16(b0, seven);
-    const __m128i c2 = _mm_add_epi16(b2, seven);
+    const __m128i c0 = _mm_add_epi16(a01_plus_7, a11);
+    const __m128i c2 = _mm_sub_epi16(a01_plus_7, a11);
    const __m128i d0 = _mm_srai_epi16(c0, 4);
    const __m128i d2 = _mm_srai_epi16(c2, 4);

@ -430,6 +441,7 @@ static void FTransformSSE2(const uint8_t* src, const uint8_t* ref,
    // f1 = f1 + (a3 != 0);
    // The compare will return (0xffff, 0) for (==0, !=0). To turn that into the
    // desired (0, 1), we add one earlier through k12000_plus_one.
+    // -> f1 = f1 + 1 - (a3 == 0)
    const __m128i g1 = _mm_add_epi16(f1, _mm_cmpeq_epi16(a32, zero));

    _mm_storel_epi64((__m128i*)&out[ 0], d0);
@ -439,13 +451,137 @@ static void FTransformSSE2(const uint8_t* src, const uint8_t* ref,
  }
 }

+static void FTransformWHTSSE2(const int16_t* in, int16_t* out) {
+  int32_t tmp[16];
+  int i;
+  for (i = 0; i < 4; ++i, in += 64) {
+    const int a0 = (in[0 * 16] + in[2 * 16]);
+    const int a1 = (in[1 * 16] + in[3 * 16]);
+    const int a2 = (in[1 * 16] - in[3 * 16]);
+    const int a3 = (in[0 * 16] - in[2 * 16]);
+    tmp[0 + i * 4] = a0 + a1;
+    tmp[1 + i * 4] = a3 + a2;
+    tmp[2 + i * 4] = a3 - a2;
+    tmp[3 + i * 4] = a0 - a1;
+  }
+  {
+    const __m128i src0 = _mm_loadu_si128((__m128i*)&tmp[0]);
+    const __m128i src1 = _mm_loadu_si128((__m128i*)&tmp[4]);
+    const __m128i src2 = _mm_loadu_si128((__m128i*)&tmp[8]);
+    const __m128i src3 = _mm_loadu_si128((__m128i*)&tmp[12]);
+    const __m128i a0 = _mm_add_epi32(src0, src2);
+    const __m128i a1 = _mm_add_epi32(src1, src3);
+    const __m128i a2 = _mm_sub_epi32(src1, src3);
+    const __m128i a3 = _mm_sub_epi32(src0, src2);
+    const __m128i b0 = _mm_srai_epi32(_mm_add_epi32(a0, a1), 1);
+    const __m128i b1 = _mm_srai_epi32(_mm_add_epi32(a3, a2), 1);
+    const __m128i b2 = _mm_srai_epi32(_mm_sub_epi32(a3, a2), 1);
+    const __m128i b3 = _mm_srai_epi32(_mm_sub_epi32(a0, a1), 1);
+    const __m128i out0 = _mm_packs_epi32(b0, b1);
+    const __m128i out1 = _mm_packs_epi32(b2, b3);
+    _mm_storeu_si128((__m128i*)&out[0], out0);
+    _mm_storeu_si128((__m128i*)&out[8], out1);
+  }
+}
+
 //------------------------------------------------------------------------------
 // Metric

-static int SSE4x4SSE2(const uint8_t* a, const uint8_t* b) {
-  const __m128i zero = _mm_set1_epi16(0);
+static int SSE_Nx4SSE2(const uint8_t* a, const uint8_t* b,
+                       int num_quads, int do_16) {
+  const __m128i zero = _mm_setzero_si128();
+  __m128i sum1 = zero;
+  __m128i sum2 = zero;

-  // Load values.
+  while (num_quads-- > 0) {
+    // Note: for the !do_16 case, we read 16 pixels instead of 8 but that's ok,
+    // thanks to buffer over-allocation to that effect.
+    const __m128i a0 = _mm_loadu_si128((__m128i*)&a[BPS * 0]);
+    const __m128i a1 = _mm_loadu_si128((__m128i*)&a[BPS * 1]);
+    const __m128i a2 = _mm_loadu_si128((__m128i*)&a[BPS * 2]);
+    const __m128i a3 = _mm_loadu_si128((__m128i*)&a[BPS * 3]);
+    const __m128i b0 = _mm_loadu_si128((__m128i*)&b[BPS * 0]);
+    const __m128i b1 = _mm_loadu_si128((__m128i*)&b[BPS * 1]);
+    const __m128i b2 = _mm_loadu_si128((__m128i*)&b[BPS * 2]);
+    const __m128i b3 = _mm_loadu_si128((__m128i*)&b[BPS * 3]);
+
+    // compute clip0(a-b) and clip0(b-a)
+    const __m128i a0p = _mm_subs_epu8(a0, b0);
+    const __m128i a0m = _mm_subs_epu8(b0, a0);
+    const __m128i a1p = _mm_subs_epu8(a1, b1);
+    const __m128i a1m = _mm_subs_epu8(b1, a1);
+    const __m128i a2p = _mm_subs_epu8(a2, b2);
+    const __m128i a2m = _mm_subs_epu8(b2, a2);
+    const __m128i a3p = _mm_subs_epu8(a3, b3);
+    const __m128i a3m = _mm_subs_epu8(b3, a3);
+
+    // compute |a-b| with 8b arithmetic as clip0(a-b) | clip0(b-a)
+    const __m128i diff0 = _mm_or_si128(a0p, a0m);
+    const __m128i diff1 = _mm_or_si128(a1p, a1m);
+    const __m128i diff2 = _mm_or_si128(a2p, a2m);
+    const __m128i diff3 = _mm_or_si128(a3p, a3m);
+
+    // unpack (only four operations, instead of eight)
+    const __m128i low0 = _mm_unpacklo_epi8(diff0, zero);
+    const __m128i low1 = _mm_unpacklo_epi8(diff1, zero);
+    const __m128i low2 = _mm_unpacklo_epi8(diff2, zero);
+    const __m128i low3 = _mm_unpacklo_epi8(diff3, zero);
+
+    // multiply with self
+    const __m128i low_madd0 = _mm_madd_epi16(low0, low0);
+    const __m128i low_madd1 = _mm_madd_epi16(low1, low1);
+    const __m128i low_madd2 = _mm_madd_epi16(low2, low2);
+    const __m128i low_madd3 = _mm_madd_epi16(low3, low3);
+
+    // collect in a cascading way
+    const __m128i low_sum0 = _mm_add_epi32(low_madd0, low_madd1);
+    const __m128i low_sum1 = _mm_add_epi32(low_madd2, low_madd3);
+    sum1 = _mm_add_epi32(sum1, low_sum0);
+    sum2 = _mm_add_epi32(sum2, low_sum1);
+
+    if (do_16) {  // if necessary, process the higher 8 bytes similarly
+      const __m128i hi0 = _mm_unpackhi_epi8(diff0, zero);
+      const __m128i hi1 = _mm_unpackhi_epi8(diff1, zero);
+      const __m128i hi2 = _mm_unpackhi_epi8(diff2, zero);
+      const __m128i hi3 = _mm_unpackhi_epi8(diff3, zero);
+
+      const __m128i hi_madd0 = _mm_madd_epi16(hi0, hi0);
+      const __m128i hi_madd1 = _mm_madd_epi16(hi1, hi1);
+      const __m128i hi_madd2 = _mm_madd_epi16(hi2, hi2);
+      const __m128i hi_madd3 = _mm_madd_epi16(hi3, hi3);
+      const __m128i hi_sum0 = _mm_add_epi32(hi_madd0, hi_madd1);
+      const __m128i hi_sum1 = _mm_add_epi32(hi_madd2, hi_madd3);
+      sum1 = _mm_add_epi32(sum1, hi_sum0);
+      sum2 = _mm_add_epi32(sum2, hi_sum1);
+    }
+    a += 4 * BPS;
+    b += 4 * BPS;
+  }
+  {
+    int32_t tmp[4];
+    const __m128i sum = _mm_add_epi32(sum1, sum2);
+    _mm_storeu_si128((__m128i*)tmp, sum);
+    return (tmp[3] + tmp[2] + tmp[1] + tmp[0]);
+  }
+}
+
+static int SSE16x16SSE2(const uint8_t* a, const uint8_t* b) {
+  return SSE_Nx4SSE2(a, b, 4, 1);
+}
+
+static int SSE16x8SSE2(const uint8_t* a, const uint8_t* b) {
+  return SSE_Nx4SSE2(a, b, 2, 1);
+}
+
+static int SSE8x8SSE2(const uint8_t* a, const uint8_t* b) {
+  return SSE_Nx4SSE2(a, b, 2, 0);
+}
+
+static int SSE4x4SSE2(const uint8_t* a, const uint8_t* b) {
+  const __m128i zero = _mm_setzero_si128();
+
+  // Load values. Note that we read 8 pixels instead of 4,
+  // but the a/b buffers are over-allocated to that effect.
  const __m128i a0 = _mm_loadl_epi64((__m128i*)&a[BPS * 0]);
  const __m128i a1 = _mm_loadl_epi64((__m128i*)&a[BPS * 1]);
  const __m128i a2 = _mm_loadl_epi64((__m128i*)&a[BPS * 2]);
@ -483,6 +619,7 @@ static int SSE4x4SSE2(const uint8_t* a, const uint8_t* b) {
  const __m128i sum0 = _mm_add_epi32(madd0, madd1);
  const __m128i sum1 = _mm_add_epi32(madd2, madd3);
  const __m128i sum2 = _mm_add_epi32(sum0, sum1);
+
  int32_t tmp[4];
  _mm_storeu_si128((__m128i*)tmp, sum2);
  return (tmp[3] + tmp[2] + tmp[1] + tmp[0]);
@ -502,10 +639,8 @@ static int TTransformSSE2(const uint8_t* inA, const uint8_t* inB,
  int32_t sum[4];
  __m128i tmp_0, tmp_1, tmp_2, tmp_3;
  const __m128i zero = _mm_setzero_si128();
-  const __m128i one = _mm_set1_epi16(1);
-  const __m128i three = _mm_set1_epi16(3);

-  // Load, combine and tranpose inputs.
+  // Load, combine and transpose inputs.
  {
    const __m128i inA_0 = _mm_loadl_epi64((__m128i*)&inA[BPS * 0]);
    const __m128i inA_1 = _mm_loadl_epi64((__m128i*)&inA[BPS * 1]);
@ -550,17 +685,14 @@ static int TTransformSSE2(const uint8_t* inA, const uint8_t* inB,
  // Horizontal pass and subsequent transpose.
  {
    // Calculate a and b (two 4x4 at once).
-    const __m128i a0 = _mm_slli_epi16(_mm_add_epi16(tmp_0, tmp_2), 2);
-    const __m128i a1 = _mm_slli_epi16(_mm_add_epi16(tmp_1, tmp_3), 2);
-    const __m128i a2 = _mm_slli_epi16(_mm_sub_epi16(tmp_1, tmp_3), 2);
-    const __m128i a3 = _mm_slli_epi16(_mm_sub_epi16(tmp_0, tmp_2), 2);
-    // b0_extra = (a0 != 0);
-    const __m128i b0_extra = _mm_andnot_si128(_mm_cmpeq_epi16 (a0, zero), one);
-    const __m128i b0_base = _mm_add_epi16(a0, a1);
+    const __m128i a0 = _mm_add_epi16(tmp_0, tmp_2);
+    const __m128i a1 = _mm_add_epi16(tmp_1, tmp_3);
+    const __m128i a2 = _mm_sub_epi16(tmp_1, tmp_3);
+    const __m128i a3 = _mm_sub_epi16(tmp_0, tmp_2);
+    const __m128i b0 = _mm_add_epi16(a0, a1);
    const __m128i b1 = _mm_add_epi16(a3, a2);
    const __m128i b2 = _mm_sub_epi16(a3, a2);
    const __m128i b3 = _mm_sub_epi16(a0, a1);
-    const __m128i b0 = _mm_add_epi16(b0_base, b0_extra);
    // a00 a01 a02 a03   b00 b01 b02 b03
    // a10 a11 a12 a13   b10 b11 b12 b13
    // a20 a21 a22 a23   b20 b21 b22 b23
@ -635,19 +767,6 @@ static int TTransformSSE2(const uint8_t* inA, const uint8_t* inB,
      B_b2 = _mm_sub_epi16(B_b2, sign_B_b2);
    }

-    // b = abs(b) + 3
-    A_b0 = _mm_add_epi16(A_b0, three);
-    A_b2 = _mm_add_epi16(A_b2, three);
-    B_b0 = _mm_add_epi16(B_b0, three);
-    B_b2 = _mm_add_epi16(B_b2, three);
-
-    // abs((b + (b<0) + 3) >> 3) = (abs(b) + 3) >> 3
-    // b = (abs(b) + 3) >> 3
-    A_b0 = _mm_srai_epi16(A_b0, 3);
-    A_b2 = _mm_srai_epi16(A_b2, 3);
-    B_b0 = _mm_srai_epi16(B_b0, 3);
-    B_b2 = _mm_srai_epi16(B_b2, 3);
-
    // weighted sums
    A_b0 = _mm_madd_epi16(A_b0, w_0);
    A_b2 = _mm_madd_epi16(A_b2, w_8);
@ -666,7 +785,7 @@ static int TTransformSSE2(const uint8_t* inA, const uint8_t* inB,
 static int Disto4x4SSE2(const uint8_t* const a, const uint8_t* const b,
                        const uint16_t* const w) {
  const int diff_sum = TTransformSSE2(a, b, w);
-  return (abs(diff_sum) + 8) >> 4;
+  return abs(diff_sum) >> 5;
 }

 static int Disto16x16SSE2(const uint8_t* const a, const uint8_t* const b,
@ -681,7 +800,6 @@ static int Disto16x16SSE2(const uint8_t* const a, const uint8_t* const b,
  return D;
 }

-
 //------------------------------------------------------------------------------
 // Quantization
 //
@ -689,9 +807,8 @@ static int Disto16x16SSE2(const uint8_t* const a, const uint8_t* const b,
 // Simple quantization
 static int QuantizeBlockSSE2(int16_t in[16], int16_t out[16],
                             int n, const VP8Matrix* const mtx) {
-  const __m128i max_coeff_2047 = _mm_set1_epi16(2047);
-  const __m128i zero = _mm_set1_epi16(0);
-  __m128i sign0, sign8;
+  const __m128i max_coeff_2047 = _mm_set1_epi16(MAX_LEVEL);
+  const __m128i zero = _mm_setzero_si128();
  __m128i coeff0, coeff8;
  __m128i out0, out8;
  __m128i packed_out;
@ -709,12 +826,10 @@ static int QuantizeBlockSSE2(int16_t in[16], int16_t out[16],
  const __m128i bias8 = _mm_loadu_si128((__m128i*)&mtx->bias_[8]);
  const __m128i q0 = _mm_loadu_si128((__m128i*)&mtx->q_[0]);
  const __m128i q8 = _mm_loadu_si128((__m128i*)&mtx->q_[8]);
-  const __m128i zthresh0 = _mm_loadu_si128((__m128i*)&mtx->zthresh_[0]);
-  const __m128i zthresh8 = _mm_loadu_si128((__m128i*)&mtx->zthresh_[8]);

  // sign(in) = in >> 15  (0x0000 if positive, 0xffff if negative)
-  sign0 = _mm_srai_epi16(in0, 15);
-  sign8 = _mm_srai_epi16(in8, 15);
+  const __m128i sign0 = _mm_srai_epi16(in0, 15);
+  const __m128i sign8 = _mm_srai_epi16(in8, 15);

  // coeff = abs(in) = (in ^ sign) - sign
  coeff0 = _mm_xor_si128(in0, sign0);
@ -726,10 +841,6 @@ static int QuantizeBlockSSE2(int16_t in[16], int16_t out[16],
  coeff0 = _mm_add_epi16(coeff0, sharpen0);
  coeff8 = _mm_add_epi16(coeff8, sharpen8);

-  // if (coeff > 2047) coeff = 2047
-  coeff0 = _mm_min_epi16(coeff0, max_coeff_2047);
-  coeff8 = _mm_min_epi16(coeff8, max_coeff_2047);
-
  // out = (coeff * iQ + B) >> QFIX;
  {
    // doing calculations with 32b precision (QFIX=17)
@ -757,9 +868,14 @@ static int QuantizeBlockSSE2(int16_t in[16], int16_t out[16],
    out_04 = _mm_srai_epi32(out_04, QFIX);
    out_08 = _mm_srai_epi32(out_08, QFIX);
    out_12 = _mm_srai_epi32(out_12, QFIX);
+
    // pack result as 16b
    out0 = _mm_packs_epi32(out_00, out_04);
    out8 = _mm_packs_epi32(out_08, out_12);
+
+    // clamp the quantized level: if (out > MAX_LEVEL) out = MAX_LEVEL
+    out0 = _mm_min_epi16(out0, max_coeff_2047);
+    out8 = _mm_min_epi16(out8, max_coeff_2047);
  }

  // get sign back (if (sign[j]) out_n = -out_n)
@ -772,17 +888,8 @@ static int QuantizeBlockSSE2(int16_t in[16], int16_t out[16],
  in0 = _mm_mullo_epi16(out0, q0);
  in8 = _mm_mullo_epi16(out8, q8);

-  // if (coeff <= mtx->zthresh_) {in=0; out=0;}
-  {
-    __m128i cmp0 = _mm_cmpgt_epi16(coeff0, zthresh0);
-    __m128i cmp8 = _mm_cmpgt_epi16(coeff8, zthresh8);
-    in0 = _mm_and_si128(in0, cmp0);
-    in8 = _mm_and_si128(in8, cmp8);
  _mm_storeu_si128((__m128i*)&in[0], in0);
  _mm_storeu_si128((__m128i*)&in[8], in8);
-    out0 = _mm_and_si128(out0, cmp0);
-    out8 = _mm_and_si128(out8, cmp8);
-  }

  // zigzag the output before storing it.
  //
@ -819,19 +926,32 @@ static int QuantizeBlockSSE2(int16_t in[16], int16_t out[16],
  }
 }

+static int QuantizeBlockWHTSSE2(int16_t in[16], int16_t out[16],
+                                const VP8Matrix* const mtx) {
+  return QuantizeBlockSSE2(in, out, 0, mtx);
+}
+
+#endif   // WEBP_USE_SSE2
+
+//------------------------------------------------------------------------------
+// Entry point
+
 extern void VP8EncDspInitSSE2(void);
+
 void VP8EncDspInitSSE2(void) {
+#if defined(WEBP_USE_SSE2)
  VP8CollectHistogram = CollectHistogramSSE2;
  VP8EncQuantizeBlock = QuantizeBlockSSE2;
+  VP8EncQuantizeBlockWHT = QuantizeBlockWHTSSE2;
  VP8ITransform = ITransformSSE2;
  VP8FTransform = FTransformSSE2;
+  VP8FTransformWHT = FTransformWHTSSE2;
+  VP8SSE16x16 = SSE16x16SSE2;
+  VP8SSE16x8 = SSE16x8SSE2;
+  VP8SSE8x8 = SSE8x8SSE2;
  VP8SSE4x4 = SSE4x4SSE2;
  VP8TDisto4x4 = Disto4x4SSE2;
  VP8TDisto16x16 = Disto16x16SSE2;
+#endif   // WEBP_USE_SSE2
 }

-#if defined(__cplusplus) || defined(c_plusplus)
-}    // extern "C"
-#endif
-
-#endif   // WEBP_USE_SSE2
--- a/src/dsp/lossless.c
+++ b/src/dsp/lossless.c
--- a/src/dsp/lossless.h
+++ b/src/dsp/lossless.h
@ -1,8 +1,10 @@
 // Copyright 2012 Google Inc. All Rights Reserved.
 //
-// This code is licensed under the same terms as WebM:
-//  Software License Agreement:  http://www.webmproject.org/license/software/
-//  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
 // -----------------------------------------------------------------------------
 //
 // Image transforms and color space conversion methods for lossless decoder.
@ -16,10 +18,30 @@
 #include "../webp/types.h"
 #include "../webp/decode.h"

-#if defined(__cplusplus) || defined(c_plusplus)
+#ifdef __cplusplus
 extern "C" {
 #endif

+//------------------------------------------------------------------------------
+// Function-pointer types and the global function pointers using them.
+
+typedef uint32_t (*VP8LPredClampedAddSubFunc)(uint32_t c0, uint32_t c1,
+                                              uint32_t c2);
+typedef uint32_t (*VP8LPredSelectFunc)(uint32_t c0, uint32_t c1, uint32_t c2);
+typedef void (*VP8LSubtractGreenFromBlueAndRedFunc)(uint32_t* argb_data,
+                                                    int num_pixs);
+typedef void (*VP8LAddGreenToBlueAndRedFunc)(uint32_t* data_start,
+                                             const uint32_t* data_end);
+
+extern VP8LPredClampedAddSubFunc VP8LClampedAddSubtractFull;
+extern VP8LPredClampedAddSubFunc VP8LClampedAddSubtractHalf;
+extern VP8LPredSelectFunc VP8LSelect;
+extern VP8LSubtractGreenFromBlueAndRedFunc VP8LSubtractGreenFromBlueAndRed;
+extern VP8LAddGreenToBlueAndRedFunc VP8LAddGreenToBlueAndRed;
+
+// Must be called before calling any of the above methods.
+void VP8LDspInit(void);
+
 //------------------------------------------------------------------------------
 // Image transforms.

@ -33,8 +55,12 @@ void VP8LInverseTransform(const struct VP8LTransform* const transform,
                          int row_start, int row_end,
                          const uint32_t* const in, uint32_t* const out);

-// Subtracts green from blue and red channels.
-void VP8LSubtractGreenFromBlueAndRed(uint32_t* argb_data, int num_pixs);
+// Similar to the static method ColorIndexInverseTransform() that is part of
+// lossless.c, but used only for alpha decoding. It takes uint8_t (rather than
+// uint32_t) arguments for 'src' and 'dst'.
+void VP8LColorIndexInverseTransformAlpha(
+    const struct VP8LTransform* const transform, int y_start, int y_end,
+    const uint8_t* src, uint8_t* dst);

 void VP8LResidualImage(int width, int height, int bits,
                       uint32_t* const argb, uint32_t* const argb_scratch,
@ -59,8 +85,119 @@ static WEBP_INLINE uint32_t VP8LSubSampleSize(uint32_t size,
  return (size + (1 << sampling_bits) - 1) >> sampling_bits;
 }

-// Faster logarithm for small integers, with the property of log(0) == 0.
-float VP8LFastLog(int v);
+// Faster logarithm for integers. Small values use a look-up table.
+#define LOG_LOOKUP_IDX_MAX 256
+extern const float kLog2Table[LOG_LOOKUP_IDX_MAX];
+extern const float kSLog2Table[LOG_LOOKUP_IDX_MAX];
+float VP8LFastLog2Slow(int v);
+float VP8LFastSLog2Slow(int v);
+static WEBP_INLINE float VP8LFastLog2(int v) {
+  return (v < LOG_LOOKUP_IDX_MAX) ? kLog2Table[v] : VP8LFastLog2Slow(v);
+}
+// Fast calculation of v * log2(v) for integer input.
+static WEBP_INLINE float VP8LFastSLog2(int v) {
+  return (v < LOG_LOOKUP_IDX_MAX) ? kSLog2Table[v] : VP8LFastSLog2Slow(v);
+}
+
+// -----------------------------------------------------------------------------
+// PrefixEncode()
+
+// use GNU builtins where available.
+#if defined(__GNUC__) && \
+    ((__GNUC__ == 3 && __GNUC_MINOR__ >= 4) || __GNUC__ >= 4)
+static WEBP_INLINE int BitsLog2Floor(uint32_t n) {
+  return 31 ^ __builtin_clz(n);
+}
+#elif defined(_MSC_VER) && _MSC_VER > 1310 && \
+      (defined(_M_X64) || defined(_M_IX86))
+#include <intrin.h>
+#pragma intrinsic(_BitScanReverse)
+
+static WEBP_INLINE int BitsLog2Floor(uint32_t n) {
+  unsigned long first_set_bit;
+  _BitScanReverse(&first_set_bit, n);
+  return first_set_bit;
+}
+#else
+// Returns (int)floor(log2(n)). n must be > 0.
+static WEBP_INLINE int BitsLog2Floor(uint32_t n) {
+  int log = 0;
+  uint32_t value = n;
+  int i;
+
+  for (i = 4; i >= 0; --i) {
+    const int shift = (1 << i);
+    const uint32_t x = value >> shift;
+    if (x != 0) {
+      value = x;
+      log += shift;
+    }
+  }
+  return log;
+}
+#endif
+
+static WEBP_INLINE int VP8LBitsLog2Ceiling(uint32_t n) {
+  const int log_floor = BitsLog2Floor(n);
+  if (n == (n & ~(n - 1)))  // zero or a power of two.
+    return log_floor;
+  else
+    return log_floor + 1;
+}
+
+// Splitting of distance and length codes into prefixes and
+// extra bits. The prefixes are encoded with an entropy code
+// while the extra bits are stored just as normal bits.
+static WEBP_INLINE void VP8LPrefixEncodeBitsNoLUT(int distance, int* const code,
+                                                  int* const extra_bits) {
+  const int highest_bit = BitsLog2Floor(--distance);
+  const int second_highest_bit = (distance >> (highest_bit - 1)) & 1;
+  *extra_bits = highest_bit - 1;
+  *code = 2 * highest_bit + second_highest_bit;
+}
+
+static WEBP_INLINE void VP8LPrefixEncodeNoLUT(int distance, int* const code,
+                                              int* const extra_bits,
+                                              int* const extra_bits_value) {
+  const int highest_bit = BitsLog2Floor(--distance);
+  const int second_highest_bit = (distance >> (highest_bit - 1)) & 1;
+  *extra_bits = highest_bit - 1;
+  *extra_bits_value = distance & ((1 << *extra_bits) - 1);
+  *code = 2 * highest_bit + second_highest_bit;
+}
+
+#define PREFIX_LOOKUP_IDX_MAX   512
+typedef struct {
+  int8_t code_;
+  int8_t extra_bits_;
+} VP8LPrefixCode;
+
+// These tables are derived using VP8LPrefixEncodeNoLUT.
+extern const VP8LPrefixCode kPrefixEncodeCode[PREFIX_LOOKUP_IDX_MAX];
+extern const uint8_t kPrefixEncodeExtraBitsValue[PREFIX_LOOKUP_IDX_MAX];
+static WEBP_INLINE void VP8LPrefixEncodeBits(int distance, int* const code,
+                                             int* const extra_bits) {
+  if (distance < PREFIX_LOOKUP_IDX_MAX) {
+    const VP8LPrefixCode prefix_code = kPrefixEncodeCode[distance];
+    *code = prefix_code.code_;
+    *extra_bits = prefix_code.extra_bits_;
+  } else {
+    VP8LPrefixEncodeBitsNoLUT(distance, code, extra_bits);
+  }
+}
+
+static WEBP_INLINE void VP8LPrefixEncode(int distance, int* const code,
+                                         int* const extra_bits,
+                                         int* const extra_bits_value) {
+  if (distance < PREFIX_LOOKUP_IDX_MAX) {
+    const VP8LPrefixCode prefix_code = kPrefixEncodeCode[distance];
+    *code = prefix_code.code_;
+    *extra_bits = prefix_code.extra_bits_;
+    *extra_bits_value = kPrefixEncodeExtraBitsValue[distance];
+  } else {
+    VP8LPrefixEncodeNoLUT(distance, code, extra_bits, extra_bits_value);
+  }
+}

 // In-place difference of each component with mod 256.
 static WEBP_INLINE uint32_t VP8LSubPixels(uint32_t a, uint32_t b) {
@ -71,9 +208,12 @@ static WEBP_INLINE uint32_t VP8LSubPixels(uint32_t a, uint32_t b) {
  return (alpha_and_green & 0xff00ff00u) | (red_and_blue & 0x00ff00ffu);
 }

+void VP8LBundleColorMap(const uint8_t* const row, int width,
+                        int xbits, uint32_t* const dst);
+
 //------------------------------------------------------------------------------

-#if defined(__cplusplus) || defined(c_plusplus)
+#ifdef __cplusplus
 }    // extern "C"
 #endif

--- a/src/dsp/upsampling.c
+++ b/src/dsp/upsampling.c
@ -1,8 +1,10 @@
 // Copyright 2011 Google Inc. All Rights Reserved.
 //
-// This code is licensed under the same terms as WebM:
-//  Software License Agreement:  http://www.webmproject.org/license/software/
-//  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
 // -----------------------------------------------------------------------------
 //
 // YUV to RGB upsampling functions.
@ -12,9 +14,7 @@
 #include "./dsp.h"
 #include "./yuv.h"

-#if defined(__cplusplus) || defined(c_plusplus)
-extern "C" {
-#endif
+#include <assert.h>

 //------------------------------------------------------------------------------
 // Fancy upsampler
@ -43,11 +43,12 @@ static void FUNC_NAME(const uint8_t* top_y, const uint8_t* bottom_y,           \
  const int last_pixel_pair = (len - 1) >> 1;                                  \
  uint32_t tl_uv = LOAD_UV(top_u[0], top_v[0]);   /* top-left sample */        \
  uint32_t l_uv  = LOAD_UV(cur_u[0], cur_v[0]);   /* left-sample */            \
-  if (top_y) {                                                                 \
+  assert(top_y != NULL);                                                       \
+  {                                                                            \
    const uint32_t uv0 = (3 * tl_uv + l_uv + 0x00020002u) >> 2;                \
    FUNC(top_y[0], uv0 & 0xff, (uv0 >> 16), top_dst);                          \
  }                                                                            \
-  if (bottom_y) {                                                              \
+  if (bottom_y != NULL) {                                                      \
    const uint32_t uv0 = (3 * l_uv + tl_uv + 0x00020002u) >> 2;                \
    FUNC(bottom_y[0], uv0 & 0xff, (uv0 >> 16), bottom_dst);                    \
  }                                                                            \
@ -58,7 +59,7 @@ static void FUNC_NAME(const uint8_t* top_y, const uint8_t* bottom_y,           \
    const uint32_t avg = tl_uv + t_uv + l_uv + uv + 0x00080008u;               \
    const uint32_t diag_12 = (avg + 2 * (t_uv + l_uv)) >> 3;                   \
    const uint32_t diag_03 = (avg + 2 * (tl_uv + uv)) >> 3;                    \
-    if (top_y) {                                                               \
+    {                                                                          \
      const uint32_t uv0 = (diag_12 + tl_uv) >> 1;                             \
      const uint32_t uv1 = (diag_03 + t_uv) >> 1;                              \
      FUNC(top_y[2 * x - 1], uv0 & 0xff, (uv0 >> 16),                          \
@ -66,7 +67,7 @@ static void FUNC_NAME(const uint8_t* top_y, const uint8_t* bottom_y,           \
      FUNC(top_y[2 * x - 0], uv1 & 0xff, (uv1 >> 16),                          \
           top_dst + (2 * x - 0) * XSTEP);                                     \
    }                                                                          \
-    if (bottom_y) {                                                            \
+    if (bottom_y != NULL) {                                                    \
      const uint32_t uv0 = (diag_03 + l_uv) >> 1;                              \
      const uint32_t uv1 = (diag_12 + uv) >> 1;                                \
      FUNC(bottom_y[2 * x - 1], uv0 & 0xff, (uv0 >> 16),                       \
@ -78,12 +79,12 @@ static void FUNC_NAME(const uint8_t* top_y, const uint8_t* bottom_y,           \
    l_uv = uv;                                                                 \
  }                                                                            \
  if (!(len & 1)) {                                                            \
-    if (top_y) {                                                               \
+    {                                                                          \
      const uint32_t uv0 = (3 * tl_uv + l_uv + 0x00020002u) >> 2;              \
      FUNC(top_y[len - 1], uv0 & 0xff, (uv0 >> 16),                            \
           top_dst + (len - 1) * XSTEP);                                       \
    }                                                                          \
-    if (bottom_y) {                                                            \
+    if (bottom_y != NULL) {                                                    \
      const uint32_t uv0 = (3 * l_uv + tl_uv + 0x00020002u) >> 2;              \
      FUNC(bottom_y[len - 1], uv0 & 0xff, (uv0 >> 16),                         \
           bottom_dst + (len - 1) * XSTEP);                                    \
@ -166,7 +167,8 @@ static void FUNC_NAME(const uint8_t* top_y, const uint8_t* bot_y,              \
                      uint8_t* top_dst, uint8_t* bot_dst, int len) {           \
  const int half_len = len >> 1;                                               \
  int x;                                                                       \
-  if (top_dst != NULL) {                                                       \
+  assert(top_dst != NULL);                                                     \
+  {                                                                            \
    for (x = 0; x < half_len; ++x) {                                           \
      FUNC(top_y[2 * x + 0], top_u[x], top_v[x], top_dst + 8 * x + 0);         \
      FUNC(top_y[2 * x + 1], top_u[x], top_v[x], top_dst + 8 * x + 4);         \
@ -271,8 +273,7 @@ static void ApplyAlphaMultiply(uint8_t* rgba, int alpha_first,

 // rgbA4444

-#define MULTIPLIER(a)  ((a) * 0x11)
-#define PREMULTIPLY(x, m) (((x) * (m)) >> 12)
+#define MULTIPLIER(a)  ((a) * 0x1111)    // 0x1111 ~= (1 << 16) / 15

 static WEBP_INLINE uint8_t dither_hi(uint8_t x) {
  return (x & 0xf0) | (x >> 4);
@ -282,24 +283,27 @@ static WEBP_INLINE uint8_t dither_lo(uint8_t x) {
  return (x & 0x0f) | (x << 4);
 }

+static WEBP_INLINE uint8_t multiply(uint8_t x, uint32_t m) {
+  return (x * m) >> 16;  // m is a 16-bit fixed-point factor, see MULTIPLIER()
+}
+
 static void ApplyAlphaMultiply4444(uint8_t* rgba4444,
                                   int w, int h, int stride) {
  while (h-- > 0) {
    int i;
    for (i = 0; i < w; ++i) {
-      const uint8_t a = dither_lo(rgba4444[2 * i + 1]);
+      const uint8_t a = (rgba4444[2 * i + 1] & 0x0f);
      const uint32_t mult = MULTIPLIER(a);
-      const uint8_t r = PREMULTIPLY(dither_hi(rgba4444[2 * i + 0]), mult);
-      const uint8_t g = PREMULTIPLY(dither_lo(rgba4444[2 * i + 0]), mult);
-      const uint8_t b = PREMULTIPLY(dither_hi(rgba4444[2 * i + 1]), mult);
-      rgba4444[2 * i + 0] = (r & 0xf0) | (g & 0x0f);
+      const uint8_t r = multiply(dither_hi(rgba4444[2 * i + 0]), mult);
+      const uint8_t g = multiply(dither_lo(rgba4444[2 * i + 0]), mult);
+      const uint8_t b = multiply(dither_hi(rgba4444[2 * i + 1]), mult);
+      rgba4444[2 * i + 0] = (r & 0xf0) | ((g >> 4) & 0x0f);
      rgba4444[2 * i + 1] = (b & 0xf0) | a;
    }
    rgba4444 += stride;
  }
 }
 #undef MULTIPLIER
-#undef PREMULTIPLY

 void (*WebPApplyAlphaMultiply)(uint8_t*, int, int, int, int)
    = ApplyAlphaMultiply;
@ -325,6 +329,11 @@ void WebPInitUpsamplers(void) {
    if (VP8GetCPUInfo(kSSE2)) {
      WebPInitUpsamplersSSE2();
    }
+#endif
+#if defined(WEBP_USE_NEON)
+    if (VP8GetCPUInfo(kNEON)) {
+      WebPInitUpsamplersNEON();
+    }
 #endif
  }
 #endif  // FANCY_UPSAMPLING
@ -345,11 +354,13 @@ void WebPInitPremultiply(void) {
    if (VP8GetCPUInfo(kSSE2)) {
      WebPInitPremultiplySSE2();
    }
+#endif
+#if defined(WEBP_USE_NEON)
+    if (VP8GetCPUInfo(kNEON)) {
+      WebPInitPremultiplyNEON();
+    }
 #endif
  }
 #endif  // FANCY_UPSAMPLING
 }

-#if defined(__cplusplus) || defined(c_plusplus)
-}    // extern "C"
-#endif
--- a/src/dsp/upsampling_neon.c
+++ b/src/dsp/upsampling_neon.c
@ -0,0 +1,265 @@
+// Copyright 2011 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// NEON version of YUV to RGB upsampling functions.
+//
+// Author: mans@mansr.com (Mans Rullgard)
+// Based on SSE code by: somnath@google.com (Somnath Banerjee)
+
+#include "./dsp.h"
+
+#if defined(WEBP_USE_NEON)
+
+#include <assert.h>
+#include <arm_neon.h>
+#include <string.h>
+#include "./yuv.h"
+
+#ifdef FANCY_UPSAMPLING
+
+//-----------------------------------------------------------------------------
+// U/V upsampling
+
+// Loads 9 samples each from r1 and r2; stores 16 per row at out and out + 32.
+#define UPSAMPLE_16PIXELS(r1, r2, out) {                                \
+  uint8x8_t a = vld1_u8(r1);      /* r1[0..7] */                        \
+  uint8x8_t b = vld1_u8(r1 + 1);  /* r1[1..8] */                        \
+  uint8x8_t c = vld1_u8(r2);      /* r2[0..7] */                        \
+  uint8x8_t d = vld1_u8(r2 + 1);  /* r2[1..8] */                        \
+                                                                        \
+  uint16x8_t al = vshll_n_u8(a, 1);  /* 2a, widened to 16 bits */       \
+  uint16x8_t bl = vshll_n_u8(b, 1);  /* 2b */                           \
+  uint16x8_t cl = vshll_n_u8(c, 1);  /* 2c */                           \
+  uint16x8_t dl = vshll_n_u8(d, 1);  /* 2d */                           \
+                                                                        \
+  uint8x8_t diag1, diag2;                                               \
+  uint16x8_t sl;                                                        \
+                                                                        \
+  /* a + b + c + d */                                                   \
+  sl = vaddl_u8(a,  b);                                                 \
+  sl = vaddw_u8(sl, c);                                                 \
+  sl = vaddw_u8(sl, d);                                                 \
+                                                                        \
+  al = vaddq_u16(sl, al); /* 3a +  b +  c +  d */                       \
+  bl = vaddq_u16(sl, bl); /*  a + 3b +  c +  d */                       \
+                                                                        \
+  al = vaddq_u16(al, dl); /* 3a +  b +  c + 3d */                       \
+  bl = vaddq_u16(bl, cl); /*  a + 3b + 3c +  d */                       \
+                                                                        \
+  diag2 = vshrn_n_u16(al, 3);  /* (3a + b + c + 3d) >> 3 */             \
+  diag1 = vshrn_n_u16(bl, 3);  /* (a + 3b + 3c + d) >> 3 */             \
+                                                                        \
+  a = vrhadd_u8(a, diag1);  /* rounded average, (x + y + 1) >> 1 */     \
+  b = vrhadd_u8(b, diag2);                                              \
+  c = vrhadd_u8(c, diag2);                                              \
+  d = vrhadd_u8(d, diag1);                                              \
+                                                                        \
+  {                                                                     \
+    const uint8x8x2_t a_b = {{ a, b }};                                 \
+    const uint8x8x2_t c_d = {{ c, d }};                                 \
+    vst2_u8(out,      a_b);  /* a0 b0 a1 b1 ... a7 b7 */                \
+    vst2_u8(out + 32, c_d);  /* c0 d0 c1 d1 ... c7 d7 */                \
+  }                                                                     \
+}
+
+// Out-of-line UPSAMPLE_16PIXELS, to reduce code size on non-critical paths.
+static void Upsample16Pixels(const uint8_t *r1, const uint8_t *r2,
+                             uint8_t *out) {
+  UPSAMPLE_16PIXELS(r1, r2, out);
+}
+
+#define UPSAMPLE_LAST_BLOCK(tb, bb, num_pixels, out) {                  \
+  uint8_t r1[9], r2[9];  /* expects 1 <= num_pixels <= 9 */             \
+  memcpy(r1, (tb), (num_pixels));                                       \
+  memcpy(r2, (bb), (num_pixels));                                       \
+  /* pad both rows to 9 bytes by replicating the last copied byte */    \
+  memset(r1 + (num_pixels), r1[(num_pixels) - 1], 9 - (num_pixels));    \
+  memset(r2 + (num_pixels), r2[(num_pixels) - 1], 9 - (num_pixels));    \
+  Upsample16Pixels(r1, r2, out);                                        \
+}
+
+//-----------------------------------------------------------------------------
+// YUV->RGB conversion
+
+static const int16_t kCoeffs[4] = { kYScale, kVToR, kUToG, kVToG };
+
+#define v255 vmov_n_u8(255)  // all-255 lanes: 4th channel of STORE_Rgba/Bgra
+
+#define STORE_Rgb(out, r, g, b) do {                                    \
+  const uint8x8x3_t r_g_b = {{ r, g, b }};                              \
+  vst3_u8(out, r_g_b);  /* 8 pixels, interleaved r,g,b */               \
+} while (0)
+
+#define STORE_Bgr(out, r, g, b) do {                                    \
+  const uint8x8x3_t b_g_r = {{ b, g, r }};                              \
+  vst3_u8(out, b_g_r);  /* 8 pixels, interleaved b,g,r */               \
+} while (0)
+
+#define STORE_Rgba(out, r, g, b) do {                                   \
+  const uint8x8x4_t r_g_b_v255 = {{ r, g, b, v255 }};                   \
+  vst4_u8(out, r_g_b_v255);  /* 8 pixels, interleaved r,g,b,255 */      \
+} while (0)
+
+#define STORE_Bgra(out, r, g, b) do {                                   \
+  const uint8x8x4_t b_g_r_v255 = {{ b, g, r, v255 }};                   \
+  vst4_u8(out, b_g_r_v255);  /* 8 pixels, interleaved b,g,r,255 */      \
+} while (0)
+
+#define CONVERT8(FMT, XSTEP, N, src_y, src_uv, out, cur_x) {            \
+  int i;                                                                \
+  for (i = 0; i < N; i += 8) {                                          \
+    const int off = ((cur_x) + i) * XSTEP;  /* byte offset in out */    \
+    uint8x8_t y  = vld1_u8((src_y) + (cur_x)  + i);                     \
+    uint8x8_t u  = vld1_u8((src_uv) + i);                               \
+    uint8x8_t v  = vld1_u8((src_uv) + i + 16);  /* v is 16 past u */    \
+    const int16x8_t yy = vreinterpretq_s16_u16(vsubl_u8(y, u16));       \
+    const int16x8_t uu = vreinterpretq_s16_u16(vsubl_u8(u, u128));      \
+    const int16x8_t vv = vreinterpretq_s16_u16(vsubl_u8(v, u128));      \
+    int32x4_t yl = vmull_lane_s16(vget_low_s16(yy),  cf16, 0);          \
+    int32x4_t yh = vmull_lane_s16(vget_high_s16(yy), cf16, 0);          \
+    const int32x4_t rl = vmlal_lane_s16(yl, vget_low_s16(vv),  cf16, 1);\
+    const int32x4_t rh = vmlal_lane_s16(yh, vget_high_s16(vv), cf16, 1);\
+    int32x4_t gl = vmlsl_lane_s16(yl, vget_low_s16(uu),  cf16, 2);      \
+    int32x4_t gh = vmlsl_lane_s16(yh, vget_high_s16(uu), cf16, 2);      \
+    const int32x4_t bl = vmovl_s16(vget_low_s16(uu));  /* widen u */    \
+    const int32x4_t bh = vmovl_s16(vget_high_s16(uu));                  \
+    gl = vmlsl_lane_s16(gl, vget_low_s16(vv),  cf16, 3);                \
+    gh = vmlsl_lane_s16(gh, vget_high_s16(vv), cf16, 3);                \
+    yl = vmlaq_lane_s32(yl, bl, cf32, 0);  /* yl/yh now hold blue */    \
+    yh = vmlaq_lane_s32(yh, bh, cf32, 0);                               \
+    /* vrshrn_n_s32() already incorporates the rounding constant */     \
+    y = vqmovun_s16(vcombine_s16(vrshrn_n_s32(rl, YUV_FIX2),  /* r */   \
+                                 vrshrn_n_s32(rh, YUV_FIX2)));          \
+    u = vqmovun_s16(vcombine_s16(vrshrn_n_s32(gl, YUV_FIX2),  /* g */   \
+                                 vrshrn_n_s32(gh, YUV_FIX2)));          \
+    v = vqmovun_s16(vcombine_s16(vrshrn_n_s32(yl, YUV_FIX2),  /* b */   \
+                                 vrshrn_n_s32(yh, YUV_FIX2)));          \
+    STORE_ ## FMT(out + off, y, u, v);                                  \
+  }                                                                     \
+}
+
+#define CONVERT1(FUNC, XSTEP, N, src_y, src_uv, rgb, cur_x) {           \
+  int i;  /* scalar path: one pixel per iteration */                    \
+  for (i = 0; i < N; i++) {                                             \
+    const int off = ((cur_x) + i) * XSTEP;                              \
+    const int y = src_y[(cur_x) + i];                                   \
+    const int u = (src_uv)[i];                                          \
+    const int v = (src_uv)[i + 16];  /* v is 16 past u */               \
+    FUNC(y, u, v, rgb + off);                                           \
+  }                                                                     \
+}
+
+#define CONVERT2RGB_8(FMT, XSTEP, top_y, bottom_y, uv,                  \
+                      top_dst, bottom_dst, cur_x, len) {                \
+  CONVERT8(FMT, XSTEP, len, top_y, uv, top_dst, cur_x)                  \
+  if (bottom_y != NULL) {  /* bottom row is optional */                 \
+    CONVERT8(FMT, XSTEP, len, bottom_y, (uv) + 32, bottom_dst, cur_x)   \
+  }                                                                     \
+}
+
+#define CONVERT2RGB_1(FUNC, XSTEP, top_y, bottom_y, uv,                 \
+                      top_dst, bottom_dst, cur_x, len) {                \
+  CONVERT1(FUNC, XSTEP, len, top_y, uv, top_dst, cur_x);                \
+  if (bottom_y != NULL) {  /* bottom row is optional */                 \
+    CONVERT1(FUNC, XSTEP, len, bottom_y, (uv) + 32, bottom_dst, cur_x); \
+  }                                                                     \
+}
+
+#define NEON_UPSAMPLE_FUNC(FUNC_NAME, FMT, XSTEP)                       \
+static void FUNC_NAME(const uint8_t *top_y, const uint8_t *bottom_y,    \
+                      const uint8_t *top_u, const uint8_t *top_v,       \
+                      const uint8_t *cur_u, const uint8_t *cur_v,       \
+                      uint8_t *top_dst, uint8_t *bottom_dst, int len) { \
+  int block;                                                            \
+  /* 16-byte aligned cache: top u,v at +0,+16; bottom u,v at +32,+48 */ \
+  uint8_t uv_buf[2 * 32 + 15];                                          \
+  uint8_t *const r_uv = (uint8_t*)((uintptr_t)(uv_buf + 15) & ~15);     \
+  const int uv_len = (len + 1) >> 1;                                    \
+  /* each full block reads 9 u/v samples, hence (uv_len - 1) >> 3 */    \
+  const int num_blocks = (uv_len - 1) >> 3;                             \
+  const int leftover = uv_len - num_blocks * 8;  /* 1..8 if len > 0 */  \
+  const int last_pos = 1 + 16 * num_blocks;  /* first tail pixel */     \
+                                                                        \
+  const int u_diag = ((top_u[0] + cur_u[0]) >> 1) + 1;                  \
+  const int v_diag = ((top_v[0] + cur_v[0]) >> 1) + 1;                  \
+                                                                        \
+  const int16x4_t cf16 = vld1_s16(kCoeffs);                             \
+  const int32x2_t cf32 = vmov_n_s32(kUToB);                             \
+  const uint8x8_t u16  = vmov_n_u8(16);                                 \
+  const uint8x8_t u128 = vmov_n_u8(128);                                \
+                                                                        \
+  /* Treat the first pixel in the regular (scalar) way */               \
+  assert(top_y != NULL);                                                \
+  {                                                                     \
+    const int u0 = (top_u[0] + u_diag) >> 1;                            \
+    const int v0 = (top_v[0] + v_diag) >> 1;                            \
+    VP8YuvTo ## FMT(top_y[0], u0, v0, top_dst);                         \
+  }                                                                     \
+  if (bottom_y != NULL) {                                               \
+    const int u0 = (cur_u[0] + u_diag) >> 1;                            \
+    const int v0 = (cur_v[0] + v_diag) >> 1;                            \
+    VP8YuvTo ## FMT(bottom_y[0], u0, v0, bottom_dst);                   \
+  }                                                                     \
+                                                                        \
+  for (block = 0; block < num_blocks; ++block) {                        \
+    UPSAMPLE_16PIXELS(top_u, cur_u, r_uv);                              \
+    UPSAMPLE_16PIXELS(top_v, cur_v, r_uv + 16);                         \
+    CONVERT2RGB_8(FMT, XSTEP, top_y, bottom_y, r_uv,                    \
+                  top_dst, bottom_dst, 16 * block + 1, 16);             \
+    top_u += 8;                                                         \
+    cur_u += 8;                                                         \
+    top_v += 8;                                                         \
+    cur_v += 8;                                                         \
+  }                                                                     \
+                                                                        \
+  UPSAMPLE_LAST_BLOCK(top_u, cur_u, leftover, r_uv);                    \
+  UPSAMPLE_LAST_BLOCK(top_v, cur_v, leftover, r_uv + 16);               \
+  CONVERT2RGB_1(VP8YuvTo ## FMT, XSTEP, top_y, bottom_y, r_uv,          \
+                top_dst, bottom_dst, last_pos, len - last_pos);         \
+}
+
+// NEON fancy upsamplers: (function name, output format, bytes per pixel).
+NEON_UPSAMPLE_FUNC(UpsampleRgbLinePairNEON,  Rgb,  3)
+NEON_UPSAMPLE_FUNC(UpsampleBgrLinePairNEON,  Bgr,  3)
+NEON_UPSAMPLE_FUNC(UpsampleRgbaLinePairNEON, Rgba, 4)
+NEON_UPSAMPLE_FUNC(UpsampleBgraLinePairNEON, Bgra, 4)
+
+#endif  // FANCY_UPSAMPLING
+
+#endif   // WEBP_USE_NEON
+
+//------------------------------------------------------------------------------
+
+#ifdef FANCY_UPSAMPLING
+
+extern WebPUpsampleLinePairFunc WebPUpsamplers[/* MODE_LAST */];
+
+void WebPInitUpsamplersNEON(void) {  // no-op unless WEBP_USE_NEON is defined
+#if defined(WEBP_USE_NEON)
+  WebPUpsamplers[MODE_RGB]  = UpsampleRgbLinePairNEON;
+  WebPUpsamplers[MODE_RGBA] = UpsampleRgbaLinePairNEON;
+  WebPUpsamplers[MODE_BGR]  = UpsampleBgrLinePairNEON;
+  WebPUpsamplers[MODE_BGRA] = UpsampleBgraLinePairNEON;
+#endif   // WEBP_USE_NEON
+}
+
+void WebPInitPremultiplyNEON(void) {  // no-op unless WEBP_USE_NEON is defined
+#if defined(WEBP_USE_NEON)
+  WebPUpsamplers[MODE_rgbA] = UpsampleRgbaLinePairNEON;
+  WebPUpsamplers[MODE_bgrA] = UpsampleBgraLinePairNEON;
+#endif   // WEBP_USE_NEON
+}
+
+#else
+
+// Stub for builds without FANCY_UPSAMPLING, so the object file is not empty.
+void WebPInitPremultiplyNEON(void) {}
+
+#endif  // FANCY_UPSAMPLING
+
--- a/src/dsp/upsampling_sse2.c
+++ b/src/dsp/upsampling_sse2.c
@ -1,8 +1,10 @@
 // Copyright 2011 Google Inc. All Rights Reserved.
 //
-// This code is licensed under the same terms as WebM:
-//  Software License Agreement:  http://www.webmproject.org/license/software/
-//  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
 // -----------------------------------------------------------------------------
 //
 // SSE2 version of YUV to RGB upsampling functions.
@ -18,10 +20,6 @@
 #include <string.h>
 #include "./yuv.h"

-#if defined(__cplusplus) || defined(c_plusplus)
-extern "C" {
-#endif
-
 #ifdef FANCY_UPSAMPLING

 // We compute (9*a + 3*b + 3*c + d + 8) / 16 as follows
@ -49,14 +47,14 @@ extern "C" {
  (out) = _mm_sub_epi8(tmp0, tmp4);    /* (k + in + 1) / 2 - lsb_correction */ \
 } while (0)

-// pack and store two alterning pixel rows
+// pack and store two alternating pixel rows
 #define PACK_AND_STORE(a, b, da, db, out) do {                                 \
-  const __m128i ta = _mm_avg_epu8(a, da);  /* (9a + 3b + 3c +  d + 8) / 16 */  \
-  const __m128i tb = _mm_avg_epu8(b, db);  /* (3a + 9b +  c + 3d + 8) / 16 */  \
-  const __m128i t1 = _mm_unpacklo_epi8(ta, tb);                                \
-  const __m128i t2 = _mm_unpackhi_epi8(ta, tb);                                \
-  _mm_store_si128(((__m128i*)(out)) + 0, t1);                                  \
-  _mm_store_si128(((__m128i*)(out)) + 1, t2);                                  \
+  const __m128i t_a = _mm_avg_epu8(a, da);  /* (9a + 3b + 3c +  d + 8) / 16 */ \
+  const __m128i t_b = _mm_avg_epu8(b, db);  /* (3a + 9b +  c + 3d + 8) / 16 */ \
+  const __m128i t_1 = _mm_unpacklo_epi8(t_a, t_b);                             \
+  const __m128i t_2 = _mm_unpackhi_epi8(t_a, t_b);                             \
+  _mm_store_si128(((__m128i*)(out)) + 0, t_1);                                 \
+  _mm_store_si128(((__m128i*)(out)) + 1, t_2);                                 \
 } while (0)

 // Loads 17 pixels each from rows r1 and r2 and generates 32 pixels.
@ -85,8 +83,8 @@ extern "C" {
  GET_M(ad, s, diag2);                  /* diag2 = (3a + b + c + 3d) / 8 */    \
                                                                               \
  /* pack the alternate pixels */                                              \
-  PACK_AND_STORE(a, b, diag1, diag2, &(out)[0 * 32]);                          \
-  PACK_AND_STORE(c, d, diag2, diag1, &(out)[2 * 32]);                          \
+  PACK_AND_STORE(a, b, diag1, diag2, out +      0);  /* store top */           \
+  PACK_AND_STORE(c, d, diag2, diag1, out + 2 * 32);  /* store bottom */        \
 }

 // Turn the macro into a function for reducing code-size when non-critical
@ -106,69 +104,68 @@ static void Upsample32Pixels(const uint8_t r1[], const uint8_t r2[],
  Upsample32Pixels(r1, r2, out);                                               \
 }

-#define CONVERT2RGB(FUNC, XSTEP, top_y, bottom_y, uv,                          \
+#define CONVERT2RGB(FUNC, XSTEP, top_y, bottom_y,                              \
                    top_dst, bottom_dst, cur_x, num_pixels) {                  \
  int n;                                                                       \
-  if (top_y) {                                                                 \
  for (n = 0; n < (num_pixels); ++n) {                                         \
-      FUNC(top_y[(cur_x) + n], (uv)[n], (uv)[32 + n],                          \
+    FUNC(top_y[(cur_x) + n], r_u[n], r_v[n],                                   \
         top_dst + ((cur_x) + n) * XSTEP);                                     \
  }                                                                            \
-  }                                                                            \
-  if (bottom_y) {                                                              \
+  if (bottom_y != NULL) {                                                      \
    for (n = 0; n < (num_pixels); ++n) {                                       \
-      FUNC(bottom_y[(cur_x) + n], (uv)[64 + n], (uv)[64 + 32 + n],             \
+      FUNC(bottom_y[(cur_x) + n], r_u[64 + n], r_v[64 + n],                    \
           bottom_dst + ((cur_x) + n) * XSTEP);                                \
    }                                                                          \
  }                                                                            \
 }

+#define CONVERT2RGB_32(FUNC, XSTEP, top_y, bottom_y,                           \
+                       top_dst, bottom_dst, cur_x) do {                        \
+  FUNC##32(top_y + (cur_x), r_u, r_v, top_dst + (cur_x) * XSTEP);              \
+  if (bottom_y != NULL) {  /* bottom row is optional; its u/v sit at +64 */    \
+    FUNC##32(bottom_y + (cur_x), r_u + 64, r_v + 64,                           \
+             bottom_dst + (cur_x) * XSTEP);                                    \
+  }                                                                            \
+} while (0)
+
 #define SSE2_UPSAMPLE_FUNC(FUNC_NAME, FUNC, XSTEP)                             \
 static void FUNC_NAME(const uint8_t* top_y, const uint8_t* bottom_y,           \
                      const uint8_t* top_u, const uint8_t* top_v,              \
                      const uint8_t* cur_u, const uint8_t* cur_v,              \
                      uint8_t* top_dst, uint8_t* bottom_dst, int len) {        \
-  int b;                                                                       \
-  /* 16 byte aligned array to cache reconstructed u and v */                   \
+  int uv_pos, pos;                                                             \
+  /* 16byte-aligned array to cache reconstructed u and v */                    \
  uint8_t uv_buf[4 * 32 + 15];                                                 \
-  uint8_t* const r_uv = (uint8_t*)((uintptr_t)(uv_buf + 15) & ~15);            \
-  const int uv_len = (len + 1) >> 1;                                           \
-  /* 17 pixels must be read-able for each block */                             \
-  const int num_blocks = (uv_len - 1) >> 4;                                    \
-  const int leftover = uv_len - num_blocks * 16;                               \
-  const int last_pos = 1 + 32 * num_blocks;                                    \
+  uint8_t* const r_u = (uint8_t*)((uintptr_t)(uv_buf + 15) & ~15);             \
+  uint8_t* const r_v = r_u + 32;                                               \
                                                                               \
+  assert(top_y != NULL);                                                       \
+  {   /* Treat the first pixel in regular way */                               \
    const int u_diag = ((top_u[0] + cur_u[0]) >> 1) + 1;                       \
    const int v_diag = ((top_v[0] + cur_v[0]) >> 1) + 1;                       \
-                                                                               \
-  assert(len > 0);                                                             \
-  /* Treat the first pixel in regular way */                                   \
-  if (top_y) {                                                                 \
-    const int u0 = (top_u[0] + u_diag) >> 1;                                   \
-    const int v0 = (top_v[0] + v_diag) >> 1;                                   \
-    FUNC(top_y[0], u0, v0, top_dst);                                           \
+    const int u0_t = (top_u[0] + u_diag) >> 1;                                 \
+    const int v0_t = (top_v[0] + v_diag) >> 1;                                 \
+    FUNC(top_y[0], u0_t, v0_t, top_dst);                                       \
+    if (bottom_y != NULL) {                                                    \
+      const int u0_b = (cur_u[0] + u_diag) >> 1;                               \
+      const int v0_b = (cur_v[0] + v_diag) >> 1;                               \
+      FUNC(bottom_y[0], u0_b, v0_b, bottom_dst);                               \
    }                                                                          \
-  if (bottom_y) {                                                              \
-    const int u0 = (cur_u[0] + u_diag) >> 1;                                   \
-    const int v0 = (cur_v[0] + v_diag) >> 1;                                   \
-    FUNC(bottom_y[0], u0, v0, bottom_dst);                                     \
  }                                                                            \
-                                                                               \
-  for (b = 0; b < num_blocks; ++b) {                                           \
-    UPSAMPLE_32PIXELS(top_u, cur_u, r_uv + 0 * 32);                            \
-    UPSAMPLE_32PIXELS(top_v, cur_v, r_uv + 1 * 32);                            \
-    CONVERT2RGB(FUNC, XSTEP, top_y, bottom_y, r_uv, top_dst, bottom_dst,       \
-                32 * b + 1, 32)                                                \
-    top_u += 16;                                                               \
-    cur_u += 16;                                                               \
-    top_v += 16;                                                               \
-    cur_v += 16;                                                               \
+  /* For UPSAMPLE_32PIXELS, 17 u/v values must be readable for each block */   \
+  for (pos = 1, uv_pos = 0; pos + 32 + 1 <= len; pos += 32, uv_pos += 16) {    \
+    UPSAMPLE_32PIXELS(top_u + uv_pos, cur_u + uv_pos, r_u);                    \
+    UPSAMPLE_32PIXELS(top_v + uv_pos, cur_v + uv_pos, r_v);                    \
+    CONVERT2RGB_32(FUNC, XSTEP, top_y, bottom_y, top_dst, bottom_dst, pos);    \
+  }                                                                            \
+  if (len > 1) {                                                               \
+    const int left_over = ((len + 1) >> 1) - (pos >> 1);                       \
+    assert(left_over > 0);                                                     \
+    UPSAMPLE_LAST_BLOCK(top_u + uv_pos, cur_u + uv_pos, left_over, r_u);       \
+    UPSAMPLE_LAST_BLOCK(top_v + uv_pos, cur_v + uv_pos, left_over, r_v);       \
+    CONVERT2RGB(FUNC, XSTEP, top_y, bottom_y, top_dst, bottom_dst,             \
+                pos, len - pos);                                               \
  }                                                                            \
-                                                                               \
-  UPSAMPLE_LAST_BLOCK(top_u, cur_u, leftover, r_uv + 0 * 32);                  \
-  UPSAMPLE_LAST_BLOCK(top_v, cur_v, leftover, r_uv + 1 * 32);                  \
-  CONVERT2RGB(FUNC, XSTEP, top_y, bottom_y, r_uv, top_dst, bottom_dst,         \
-              last_pos, len - last_pos);                                       \
 }

 // SSE2 variants of the fancy upsampler.
@ -182,28 +179,40 @@ SSE2_UPSAMPLE_FUNC(UpsampleBgraLinePairSSE2, VP8YuvToBgra, 4)
 #undef UPSAMPLE_32PIXELS
 #undef UPSAMPLE_LAST_BLOCK
 #undef CONVERT2RGB
+#undef CONVERT2RGB_32
 #undef SSE2_UPSAMPLE_FUNC

+#endif  // FANCY_UPSAMPLING
+
+#endif   // WEBP_USE_SSE2
+
 //------------------------------------------------------------------------------

+#ifdef FANCY_UPSAMPLING
+
 extern WebPUpsampleLinePairFunc WebPUpsamplers[/* MODE_LAST */];

 void WebPInitUpsamplersSSE2(void) {
+#if defined(WEBP_USE_SSE2)
+  VP8YUVInitSSE2();
  WebPUpsamplers[MODE_RGB]  = UpsampleRgbLinePairSSE2;
  WebPUpsamplers[MODE_RGBA] = UpsampleRgbaLinePairSSE2;
  WebPUpsamplers[MODE_BGR]  = UpsampleBgrLinePairSSE2;
  WebPUpsamplers[MODE_BGRA] = UpsampleBgraLinePairSSE2;
+#endif   // WEBP_USE_SSE2
 }

 void WebPInitPremultiplySSE2(void) {
+#if defined(WEBP_USE_SSE2)
  WebPUpsamplers[MODE_rgbA] = UpsampleRgbaLinePairSSE2;
  WebPUpsamplers[MODE_bgrA] = UpsampleBgraLinePairSSE2;
+#endif   // WEBP_USE_SSE2
 }

+#else
+
+// this empty function is to avoid an empty .o
+void WebPInitPremultiplySSE2(void) {}
+
 #endif  // FANCY_UPSAMPLING

-#if defined(__cplusplus) || defined(c_plusplus)
-}    // extern "C"
-#endif
-
-#endif   // WEBP_USE_SSE2
--- a/src/dsp/yuv.c
+++ b/src/dsp/yuv.c
@ -1,8 +1,10 @@
 // Copyright 2010 Google Inc. All Rights Reserved.
 //
-// This code is licensed under the same terms as WebM:
-//  Software License Agreement:  http://www.webmproject.org/license/software/
-//  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
 // -----------------------------------------------------------------------------
 //
 // YUV->RGB conversion function
@ -11,16 +13,8 @@

 #include "./yuv.h"

-#if defined(__cplusplus) || defined(c_plusplus)
-extern "C" {
-#endif

-enum { YUV_HALF = 1 << (YUV_FIX - 1) };
-
-int16_t VP8kVToR[256], VP8kUToB[256];
-int32_t VP8kVToG[256], VP8kUToG[256];
-uint8_t VP8kClip[YUV_RANGE_MAX - YUV_RANGE_MIN];
-uint8_t VP8kClip4Bits[YUV_RANGE_MAX - YUV_RANGE_MIN];
+#if defined(WEBP_YUV_USE_TABLE)

 static int done = 0;

@ -28,11 +22,17 @@ static WEBP_INLINE uint8_t clip(int v, int max_value) {
  return v < 0 ? 0 : v > max_value ? max_value : v;
 }

+int16_t VP8kVToR[256], VP8kUToB[256];
+int32_t VP8kVToG[256], VP8kUToG[256];
+uint8_t VP8kClip[YUV_RANGE_MAX - YUV_RANGE_MIN];
+uint8_t VP8kClip4Bits[YUV_RANGE_MAX - YUV_RANGE_MIN];
+
 void VP8YUVInit(void) {
  int i;
  if (done) {
    return;
  }
+#ifndef USE_YUVj
  for (i = 0; i < 256; ++i) {
    VP8kVToR[i] = (89858 * (i - 128) + YUV_HALF) >> YUV_FIX;
    VP8kUToG[i] = -22014 * (i - 128) + YUV_HALF;
@ -44,9 +44,164 @@ void VP8YUVInit(void) {
    VP8kClip[i - YUV_RANGE_MIN] = clip(k, 255);
    VP8kClip4Bits[i - YUV_RANGE_MIN] = clip((k + 8) >> 4, 15);
  }
+#else
+  for (i = 0; i < 256; ++i) {
+    VP8kVToR[i] = (91881 * (i - 128) + YUV_HALF) >> YUV_FIX;
+    VP8kUToG[i] = -22554 * (i - 128) + YUV_HALF;
+    VP8kVToG[i] = -46802 * (i - 128);
+    VP8kUToB[i] = (116130 * (i - 128) + YUV_HALF) >> YUV_FIX;
+  }
+  for (i = YUV_RANGE_MIN; i < YUV_RANGE_MAX; ++i) {
+    const int k = i;
+    VP8kClip[i - YUV_RANGE_MIN] = clip(k, 255);
+    VP8kClip4Bits[i - YUV_RANGE_MIN] = clip((k + 8) >> 4, 15);
+  }
+#endif
+
  done = 1;
 }

-#if defined(__cplusplus) || defined(c_plusplus)
-}    // extern "C"
-#endif
+#else
+
+void VP8YUVInit(void) {}   // no tables to fill without WEBP_YUV_USE_TABLE
+
+#endif  // WEBP_YUV_USE_TABLE
+
+//-----------------------------------------------------------------------------
+// SSE2 extras
+
+#if defined(WEBP_USE_SSE2)
+
+#ifdef FANCY_UPSAMPLING
+
+#include <emmintrin.h>
+#include <string.h>   // for memcpy
+
+typedef union {   // handy struct for converting SSE2 registers
+  int32_t i32[4];   // the 16 bytes seen as four 32-bit lanes
+  uint8_t u8[16];   // the 16 bytes seen individually
+  __m128i m;        // the 16 bytes seen as one SSE2 register
+} VP8kCstSSE2;
+
+static int done_sse2 = 0;
+static VP8kCstSSE2 VP8kUtoRGBA[256], VP8kVtoRGBA[256], VP8kYtoRGBA[256];
+
+void VP8YUVInitSSE2(void) {   // one-time fill of the per-value Y/U/V tables
+  if (!done_sse2) {   // later calls return without touching the tables
+    int i;
+    for (i = 0; i < 256; ++i) {
+      VP8kYtoRGBA[i].i32[0] =   // same luma term in lanes 0..2 (R, G, B)
+        VP8kYtoRGBA[i].i32[1] =
+        VP8kYtoRGBA[i].i32[2] = (i - 16) * kYScale + YUV_HALF2;
+      VP8kYtoRGBA[i].i32[3] = 0xff << YUV_FIX2;   // lane 3: 0xff after >> YUV_FIX2
+
+      VP8kUtoRGBA[i].i32[0] = 0;   // U does not contribute to lane 0 (R)
+      VP8kUtoRGBA[i].i32[1] = -kUToG * (i - 128);
+      VP8kUtoRGBA[i].i32[2] =  kUToB * (i - 128);
+      VP8kUtoRGBA[i].i32[3] = 0;
+
+      VP8kVtoRGBA[i].i32[0] =  kVToR * (i - 128);
+      VP8kVtoRGBA[i].i32[1] = -kVToG * (i - 128);
+      VP8kVtoRGBA[i].i32[2] = 0;   // V does not contribute to lane 2 (B)
+      VP8kVtoRGBA[i].i32[3] = 0;
+    }
+    done_sse2 = 1;
+  }
+}
+
+static WEBP_INLINE __m128i VP8GetRGBA32b(int y, int u, int v) {
+  const __m128i u_part = _mm_loadu_si128(&VP8kUtoRGBA[u].m);   // tables have 256 entries
+  const __m128i v_part = _mm_loadu_si128(&VP8kVtoRGBA[v].m);
+  const __m128i y_part = _mm_loadu_si128(&VP8kYtoRGBA[y].m);
+  const __m128i uv_part = _mm_add_epi32(u_part, v_part);
+  const __m128i rgba1 = _mm_add_epi32(y_part, uv_part);   // still YUV_FIX2 fixed-point
+  const __m128i rgba2 = _mm_srai_epi32(rgba1, YUV_FIX2);  // arithmetic shift keeps the sign
+  return rgba2;   // four 32-bit lanes (R, G, B, A), not yet clamped to [0, 255]
+}
+
+static WEBP_INLINE void VP8YuvToRgbSSE2(uint8_t y, uint8_t u, uint8_t v,
+                                        uint8_t* const rgb) {
+  const __m128i tmp0 = VP8GetRGBA32b(y, u, v);
+  const __m128i tmp1 = _mm_packs_epi32(tmp0, tmp0);    // 32b -> 16b, signed saturation
+  const __m128i tmp2 = _mm_packus_epi16(tmp1, tmp1);   // 16b -> 8b, clamped to [0, 255]
+  // Note: we store 8 bytes (rgb[0..7]) at a time, not 3 bytes! -> memory stomp
+  _mm_storel_epi64((__m128i*)rgb, tmp2);
+}
+
+static WEBP_INLINE void VP8YuvToBgrSSE2(uint8_t y, uint8_t u, uint8_t v,
+                                        uint8_t* const bgr) {
+  const __m128i tmp0 = VP8GetRGBA32b(y, u, v);
+  const __m128i tmp1 = _mm_shuffle_epi32(tmp0, _MM_SHUFFLE(3, 0, 1, 2));   // swap lanes 0 and 2
+  const __m128i tmp2 = _mm_packs_epi32(tmp1, tmp1);    // 32b -> 16b, signed saturation
+  const __m128i tmp3 = _mm_packus_epi16(tmp2, tmp2);   // 16b -> 8b, clamped to [0, 255]
+  // Note: we store 8 bytes (bgr[0..7]) at a time, not 3 bytes! -> memory stomp
+  _mm_storel_epi64((__m128i*)bgr, tmp3);
+}
+
+void VP8YuvToRgba32(const uint8_t* y, const uint8_t* u, const uint8_t* v,
+                    uint8_t* dst) {
+  int n;   // pixel index: 32 pixels are read, 32 * 4 bytes are written
+  for (n = 0; n < 32; n += 4) {   // four pixels per iteration
+    const __m128i tmp0_1 = VP8GetRGBA32b(y[n + 0], u[n + 0], v[n + 0]);
+    const __m128i tmp0_2 = VP8GetRGBA32b(y[n + 1], u[n + 1], v[n + 1]);
+    const __m128i tmp0_3 = VP8GetRGBA32b(y[n + 2], u[n + 2], v[n + 2]);
+    const __m128i tmp0_4 = VP8GetRGBA32b(y[n + 3], u[n + 3], v[n + 3]);
+    const __m128i tmp1_1 = _mm_packs_epi32(tmp0_1, tmp0_2);   // pixels 0,1 as 16b
+    const __m128i tmp1_2 = _mm_packs_epi32(tmp0_3, tmp0_4);   // pixels 2,3 as 16b
+    const __m128i tmp2 = _mm_packus_epi16(tmp1_1, tmp1_2);    // 16 bytes, clamped
+    _mm_storeu_si128((__m128i*)dst, tmp2);   // unaligned store: dst alignment is free
+    dst += 4 * 4;   // 4 pixels x 4 bytes
+  }
+}
+
+void VP8YuvToBgra32(const uint8_t* y, const uint8_t* u, const uint8_t* v,
+                    uint8_t* dst) {
+  int n;   // pixel index: 32 pixels are read, 32 * 4 bytes are written
+  for (n = 0; n < 32; n += 2) {   // two pixels per iteration
+    const __m128i tmp0_1 = VP8GetRGBA32b(y[n + 0], u[n + 0], v[n + 0]);
+    const __m128i tmp0_2 = VP8GetRGBA32b(y[n + 1], u[n + 1], v[n + 1]);
+    const __m128i tmp1_1 = _mm_shuffle_epi32(tmp0_1, _MM_SHUFFLE(3, 0, 1, 2));   // swap lanes 0 and 2
+    const __m128i tmp1_2 = _mm_shuffle_epi32(tmp0_2, _MM_SHUFFLE(3, 0, 1, 2));
+    const __m128i tmp2_1 = _mm_packs_epi32(tmp1_1, tmp1_2);   // both pixels as 16b
+    const __m128i tmp3 = _mm_packus_epi16(tmp2_1, tmp2_1);    // clamped bytes
+    _mm_storel_epi64((__m128i*)dst, tmp3);   // low 8 bytes = the two pixels
+    dst += 4 * 2;   // 2 pixels x 4 bytes
+  }
+}
+
+void VP8YuvToRgb32(const uint8_t* y, const uint8_t* u, const uint8_t* v,
+                   uint8_t* dst) {
+  int n;
+  uint8_t tmp0[2 * 3 + 5 + 15];   // 2 pixels + 5 bytes of store overshoot + align slack
+  uint8_t* const tmp = (uint8_t*)((uintptr_t)(tmp0 + 15) & ~15);  // align
+  for (n = 0; n < 30; ++n) {   // we directly stomp the *dst memory (8-byte stores)
+    VP8YuvToRgbSSE2(y[n], u[n], v[n], dst + n * 3);
+  }
+  // Last two pixels are special: we write in a tmp buffer before sending
+  // to dst.
+  VP8YuvToRgbSSE2(y[n + 0], u[n + 0], v[n + 0], tmp + 0);
+  VP8YuvToRgbSSE2(y[n + 1], u[n + 1], v[n + 1], tmp + 3);
+  memcpy(dst + n * 3, tmp, 2 * 3);   // only the 6 useful bytes reach dst
+}
+
+void VP8YuvToBgr32(const uint8_t* y, const uint8_t* u, const uint8_t* v,
+                   uint8_t* dst) {
+  int n;
+  uint8_t tmp0[2 * 3 + 5 + 15];   // 2 pixels + 5 bytes of store overshoot + align slack
+  uint8_t* const tmp = (uint8_t*)((uintptr_t)(tmp0 + 15) & ~15);  // align
+  for (n = 0; n < 30; ++n) {   // 8-byte stores: each one spills into the next pixels
+    VP8YuvToBgrSSE2(y[n], u[n], v[n], dst + n * 3);
+  }
+  VP8YuvToBgrSSE2(y[n + 0], u[n + 0], v[n + 0], tmp + 0);   // last two pixels go via tmp
+  VP8YuvToBgrSSE2(y[n + 1], u[n + 1], v[n + 1], tmp + 3);
+  memcpy(dst + n * 3, tmp, 2 * 3);   // only the 6 useful bytes reach dst
+}
+
+#else
+
+void VP8YUVInitSSE2(void) {}   // no SSE2 tables are built without FANCY_UPSAMPLING
+
+#endif  // FANCY_UPSAMPLING
+
+#endif  // WEBP_USE_SSE2
+
--- a/src/dsp/yuv.h
+++ b/src/dsp/yuv.h
@ -1,33 +1,165 @@
 // Copyright 2010 Google Inc. All Rights Reserved.
 //
-// This code is licensed under the same terms as WebM:
-//  Software License Agreement:  http://www.webmproject.org/license/software/
-//  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
 // -----------------------------------------------------------------------------
 //
-// inline YUV->RGB conversion function
+// inline YUV<->RGB conversion functions
+//
+// The exact naming is Y'CbCr, following the ITU-R BT.601 standard.
+// More information at: http://en.wikipedia.org/wiki/YCbCr
+// Y = 0.2569 * R + 0.5044 * G + 0.0979 * B + 16
+// U = -0.1483 * R - 0.2911 * G + 0.4394 * B + 128
+// V = 0.4394 * R - 0.3679 * G - 0.0715 * B + 128
+// We use 16bit fixed point operations for RGB->YUV conversion (YUV_FIX).
+//
+// For the Y'CbCr to RGB conversion, the BT.601 specification reads:
+//   R = 1.164 * (Y-16) + 1.596 * (V-128)
+//   G = 1.164 * (Y-16) - 0.813 * (V-128) - 0.391 * (U-128)
+//   B = 1.164 * (Y-16)                   + 2.018 * (U-128)
+// where Y is in the [16,235] range, and U/V in the [16,240] range.
+// In the table-lookup version (WEBP_YUV_USE_TABLE), the common factor
+// "1.164 * (Y-16)" can be handled as an offset in the VP8kClip[] table.
+// So in this case the formulae should read:
+//   R = 1.164 * [Y + 1.371 * (V-128)                  ] - 18.624
+//   G = 1.164 * [Y - 0.698 * (V-128) - 0.336 * (U-128)] - 18.624
+//   B = 1.164 * [Y                   + 1.733 * (U-128)] - 18.624
+// once factorized.
+// For YUV->RGB conversion, only 14bit fixed precision is used (YUV_FIX2).
+// That's the maximum possible for a convenient ARM implementation.
 //
 // Author: Skal (pascal.massimino@gmail.com)

 #ifndef WEBP_DSP_YUV_H_
 #define WEBP_DSP_YUV_H_

+#include "./dsp.h"
 #include "../dec/decode_vp8.h"

-#if defined(__cplusplus) || defined(c_plusplus)
+// Define the following to use the LUT-based code:
+// #define WEBP_YUV_USE_TABLE
+
+#if defined(WEBP_EXPERIMENTAL_FEATURES)
+// Do NOT activate this feature for real compression. This is only experimental!
+// This flag is for comparison purpose against JPEG's "YUVj" natural colorspace.
+// This colorspace is close to Rec.601's Y'CbCr model with the notable
+// difference of allowing larger range for luma/chroma.
+// See http://en.wikipedia.org/wiki/YCbCr#JPEG_conversion paragraph, and its
+// difference with http://en.wikipedia.org/wiki/YCbCr#ITU-R_BT.601_conversion
+// #define USE_YUVj
+#endif
+
+//------------------------------------------------------------------------------
+// YUV -> RGB conversion
+
+#ifdef __cplusplus
 extern "C" {
 #endif

-enum { YUV_FIX = 16,                // fixed-point precision
+enum {
+  YUV_FIX = 16,                    // fixed-point precision for RGB->YUV
+  YUV_HALF = 1 << (YUV_FIX - 1),
+  YUV_MASK = (256 << YUV_FIX) - 1,
  YUV_RANGE_MIN = -227,            // min value of r/g/b output
-       YUV_RANGE_MAX = 256 + 226    // max value of r/g/b output
+  YUV_RANGE_MAX = 256 + 226,       // max value of r/g/b output
+
+  YUV_FIX2 = 14,                   // fixed-point precision for YUV->RGB
+  YUV_HALF2 = 1 << (YUV_FIX2 - 1),
+  YUV_MASK2 = (256 << YUV_FIX2) - 1
 };
+
+// These constants are 14b fixed-point version of ITU-R BT.601 constants.
+#define kYScale 19077    // 1.164 = 255 / 219
+#define kVToR   26149    // 1.596 = 255 / 112 * 0.701
+#define kUToG   6419     // 0.391 = 255 / 112 * 0.886 * 0.114 / 0.587
+#define kVToG   13320    // 0.813 = 255 / 112 * 0.701 * 0.299 / 0.587
+#define kUToB   33050    // 2.018 = 255 / 112 * 0.886
+#define kRCst (-kYScale * 16 - kVToR * 128 + YUV_HALF2)
+#define kGCst (-kYScale * 16 + kUToG * 128 + kVToG * 128 + YUV_HALF2)
+#define kBCst (-kYScale * 16 - kUToB * 128 + YUV_HALF2)
+
+//------------------------------------------------------------------------------
+
+#if !defined(WEBP_YUV_USE_TABLE)
+
+// slower on x86 by ~7-8%, but bit-exact with the SSE2 version
+
+static WEBP_INLINE int VP8Clip8(int v) {   // YUV_FIX2 fixed-point -> [0, 255]
+  return ((v & ~YUV_MASK2) == 0) ? (v >> YUV_FIX2) : (v < 0) ? 0 : 255;
+}
+
+static WEBP_INLINE int VP8YUVToR(int y, int v) {   // red sample, in [0, 255]
+  return VP8Clip8(kYScale * y + kVToR * v + kRCst);   // kRCst: offsets + rounding
+}
+
+static WEBP_INLINE int VP8YUVToG(int y, int u, int v) {   // green sample, in [0, 255]
+  return VP8Clip8(kYScale * y - kUToG * u - kVToG * v + kGCst);   // kGCst: offsets + rounding
+}
+
+static WEBP_INLINE int VP8YUVToB(int y, int u) {   // blue sample, in [0, 255]
+  return VP8Clip8(kYScale * y + kUToB * u + kBCst);   // kBCst: offsets + rounding
+}
+
+static WEBP_INLINE void VP8YuvToRgb(int y, int u, int v,
+                                    uint8_t* const rgb) {
+  rgb[0] = VP8YUVToR(y, v);   // three bytes are written, in R, G, B order
+  rgb[1] = VP8YUVToG(y, u, v);
+  rgb[2] = VP8YUVToB(y, u);
+}
+
+static WEBP_INLINE void VP8YuvToBgr(int y, int u, int v,
+                                    uint8_t* const bgr) {
+  bgr[0] = VP8YUVToB(y, u);   // three bytes are written, in B, G, R order
+  bgr[1] = VP8YUVToG(y, u, v);
+  bgr[2] = VP8YUVToR(y, v);
+}
+
+static WEBP_INLINE void VP8YuvToRgb565(int y, int u, int v,
+                                       uint8_t* const rgb) {
+  const int r = VP8YUVToR(y, v);      // 5 usable bits
+  const int g = VP8YUVToG(y, u, v);   // 6 usable bits
+  const int b = VP8YUVToB(y, u);      // 5 usable bits
+  const int rg = (r & 0xf8) | (g >> 5);          // rrrrrggg: top 3 bits of g
+  const int gb = ((g << 3) & 0xe0) | (b >> 3);   // gggbbbbb: next 3 bits of g
+#ifdef WEBP_SWAP_16BIT_CSP   // the two output bytes are swapped
+  rgb[0] = gb;
+  rgb[1] = rg;
+#else
+  rgb[0] = rg;
+  rgb[1] = gb;
+#endif
+}
+
+static WEBP_INLINE void VP8YuvToRgba4444(int y, int u, int v,
+                                         uint8_t* const argb) {
+  const int r = VP8YUVToR(y, v);        // 4 usable bits
+  const int g = VP8YUVToG(y, u, v);     // 4 usable bits
+  const int b = VP8YUVToB(y, u);        // 4 usable bits
+  const int rg = (r & 0xf0) | (g >> 4);   // rrrrgggg
+  const int ba = (b & 0xf0) | 0x0f;     // overwrite the lower 4 bits (alpha = 0xf)
+#ifdef WEBP_SWAP_16BIT_CSP   // the two output bytes are swapped
+  argb[0] = ba;
+  argb[1] = rg;
+#else
+  argb[0] = rg;
+  argb[1] = ba;
+#endif
+}
+
+#else
+
+// Table-based version, not totally equivalent to the SSE2 version.
+// Rounding diff is only +/-1 though.
+
 extern int16_t VP8kVToR[256], VP8kUToB[256];
 extern int32_t VP8kVToG[256], VP8kUToG[256];
 extern uint8_t VP8kClip[YUV_RANGE_MAX - YUV_RANGE_MIN];
 extern uint8_t VP8kClip4Bits[YUV_RANGE_MAX - YUV_RANGE_MIN];

-static WEBP_INLINE void VP8YuvToRgb(uint8_t y, uint8_t u, uint8_t v,
+static WEBP_INLINE void VP8YuvToRgb(int y, int u, int v,
                                    uint8_t* const rgb) {
  const int r_off = VP8kVToR[v];
  const int g_off = (VP8kVToG[v] + VP8kUToG[u]) >> YUV_FIX;
@ -37,35 +169,7 @@ static WEBP_INLINE void VP8YuvToRgb(uint8_t y, uint8_t u, uint8_t v,
  rgb[2] = VP8kClip[y + b_off - YUV_RANGE_MIN];
 }

-static WEBP_INLINE void VP8YuvToRgb565(uint8_t y, uint8_t u, uint8_t v,
-                                       uint8_t* const rgb) {
-  const int r_off = VP8kVToR[v];
-  const int g_off = (VP8kVToG[v] + VP8kUToG[u]) >> YUV_FIX;
-  const int b_off = VP8kUToB[u];
-  rgb[0] = ((VP8kClip[y + r_off - YUV_RANGE_MIN] & 0xf8) |
-            (VP8kClip[y + g_off - YUV_RANGE_MIN] >> 5));
-  rgb[1] = (((VP8kClip[y + g_off - YUV_RANGE_MIN] << 3) & 0xe0) |
-            (VP8kClip[y + b_off - YUV_RANGE_MIN] >> 3));
-}
-
-static WEBP_INLINE void VP8YuvToArgb(uint8_t y, uint8_t u, uint8_t v,
-                                     uint8_t* const argb) {
-  argb[0] = 0xff;
-  VP8YuvToRgb(y, u, v, argb + 1);
-}
-
-static WEBP_INLINE void VP8YuvToRgba4444(uint8_t y, uint8_t u, uint8_t v,
-                                         uint8_t* const argb) {
-  const int r_off = VP8kVToR[v];
-  const int g_off = (VP8kVToG[v] + VP8kUToG[u]) >> YUV_FIX;
-  const int b_off = VP8kUToB[u];
-  // Don't update alpha (last 4 bits of argb[1])
-  argb[0] = ((VP8kClip4Bits[y + r_off - YUV_RANGE_MIN] << 4) |
-             VP8kClip4Bits[y + g_off - YUV_RANGE_MIN]);
-  argb[1] = 0x0f | (VP8kClip4Bits[y + b_off - YUV_RANGE_MIN] << 4);
-}
-
-static WEBP_INLINE void VP8YuvToBgr(uint8_t y, uint8_t u, uint8_t v,
+static WEBP_INLINE void VP8YuvToBgr(int y, int u, int v,
                                    uint8_t* const bgr) {
  const int r_off = VP8kVToR[v];
  const int g_off = (VP8kVToG[v] + VP8kUToG[u]) >> YUV_FIX;
@ -75,6 +179,52 @@ static WEBP_INLINE void VP8YuvToBgr(uint8_t y, uint8_t u, uint8_t v,
  bgr[2] = VP8kClip[y + r_off - YUV_RANGE_MIN];
 }

+static WEBP_INLINE void VP8YuvToRgb565(int y, int u, int v,
+                                       uint8_t* const rgb) {
+  const int r_off = VP8kVToR[v];   // u and v index 256-entry tables
+  const int g_off = (VP8kVToG[v] + VP8kUToG[u]) >> YUV_FIX;
+  const int b_off = VP8kUToB[u];
+  const int rg = ((VP8kClip[y + r_off - YUV_RANGE_MIN] & 0xf8) |
+                  (VP8kClip[y + g_off - YUV_RANGE_MIN] >> 5));   // rrrrrggg
+  const int gb = (((VP8kClip[y + g_off - YUV_RANGE_MIN] << 3) & 0xe0) |
+                   (VP8kClip[y + b_off - YUV_RANGE_MIN] >> 3));   // gggbbbbb
+#ifdef WEBP_SWAP_16BIT_CSP   // the two output bytes are swapped
+  rgb[0] = gb;
+  rgb[1] = rg;
+#else
+  rgb[0] = rg;
+  rgb[1] = gb;
+#endif
+}
+
+static WEBP_INLINE void VP8YuvToRgba4444(int y, int u, int v,
+                                         uint8_t* const argb) {
+  const int r_off = VP8kVToR[v];   // u and v index 256-entry tables
+  const int g_off = (VP8kVToG[v] + VP8kUToG[u]) >> YUV_FIX;
+  const int b_off = VP8kUToB[u];
+  const int rg = ((VP8kClip4Bits[y + r_off - YUV_RANGE_MIN] << 4) |
+                   VP8kClip4Bits[y + g_off - YUV_RANGE_MIN]);   // rrrrgggg
+  const int ba = (VP8kClip4Bits[y + b_off - YUV_RANGE_MIN] << 4) | 0x0f;   // bbbb1111
+#ifdef WEBP_SWAP_16BIT_CSP   // the two output bytes are swapped
+  argb[0] = ba;
+  argb[1] = rg;
+#else
+  argb[0] = rg;
+  argb[1] = ba;
+#endif
+}
+
+#endif  // WEBP_YUV_USE_TABLE
+
+//-----------------------------------------------------------------------------
+// Alpha handling variants
+
+static WEBP_INLINE void VP8YuvToArgb(uint8_t y, uint8_t u, uint8_t v,
+                                     uint8_t* const argb) {
+  argb[0] = 0xff;   // alpha byte comes first and is always 0xff
+  VP8YuvToRgb(y, u, v, argb + 1);   // colour bytes land in argb[1..3]
+}
+
 static WEBP_INLINE void VP8YuvToBgra(uint8_t y, uint8_t u, uint8_t v,
                                     uint8_t* const bgra) {
  VP8YuvToBgr(y, u, v, bgra);
@ -87,15 +237,80 @@ static WEBP_INLINE void VP8YuvToRgba(uint8_t y, uint8_t u, uint8_t v,
  rgba[3] = 0xff;
 }

-static WEBP_INLINE uint32_t VP8Clip4Bits(uint8_t c) {
-  const uint32_t v = (c + 8) >> 4;
-  return (v > 15) ? 15 : v;
-}
-
 // Must be called before everything, to initialize the tables.
 void VP8YUVInit(void);

-#if defined(__cplusplus) || defined(c_plusplus)
+//-----------------------------------------------------------------------------
+// SSE2 extra functions (mostly for upsampling_sse2.c)
+
+#if defined(WEBP_USE_SSE2)
+
+#if defined(FANCY_UPSAMPLING)
+// Process 32 pixels and store the result (24b or 32b per pixel) in *dst.
+void VP8YuvToRgba32(const uint8_t* y, const uint8_t* u, const uint8_t* v,
+                    uint8_t* dst);
+void VP8YuvToRgb32(const uint8_t* y, const uint8_t* u, const uint8_t* v,
+                   uint8_t* dst);
+void VP8YuvToBgra32(const uint8_t* y, const uint8_t* u, const uint8_t* v,
+                    uint8_t* dst);
+void VP8YuvToBgr32(const uint8_t* y, const uint8_t* u, const uint8_t* v,
+                   uint8_t* dst);
+#endif  // FANCY_UPSAMPLING
+
+// Must be called to initialize tables before using the functions.
+void VP8YUVInitSSE2(void);
+
+#endif    // WEBP_USE_SSE2
+
+//------------------------------------------------------------------------------
+// RGB -> YUV conversion
+
+// Stub functions that can be called with various rounding values:
+static WEBP_INLINE int VP8ClipUV(int uv, int rounding) {   // NOTE(review): the 2 extra bits suggest a 4-sample sum - confirm
+  uv = (uv + rounding + (128 << (YUV_FIX + 2))) >> (YUV_FIX + 2);   // add the 128 bias, then descale
+  return ((uv & ~0xff) == 0) ? uv : (uv < 0) ? 0 : 255;   // clamp to [0, 255]
+}
+
+#ifndef USE_YUVj
+
+static WEBP_INLINE int VP8RGBToY(int r, int g, int b, int rounding) {
+  const int luma = 16839 * r + 33059 * g + 6420 * b;   // YUV_FIX fixed-point weights
+  return (luma + rounding + (16 << YUV_FIX)) >> YUV_FIX;  // no need to clip
+}
+
+static WEBP_INLINE int VP8RGBToU(int r, int g, int b, int rounding) {
+  const int u = -9719 * r - 19081 * g + 28800 * b;   // weights sum to zero
+  return VP8ClipUV(u, rounding);   // adds the 128 bias, descales and clamps
+}
+
+static WEBP_INLINE int VP8RGBToV(int r, int g, int b, int rounding) {
+  const int v = +28800 * r - 24116 * g - 4684 * b;   // weights sum to zero
+  return VP8ClipUV(v, rounding);   // adds the 128 bias, descales and clamps
+}
+
+#else
+
+// This is the JPEG-YUV colorspace, only for comparison!
+// These are also 16bit precision coefficients from Rec.601, but with full
+// [0..255] output range.
+static WEBP_INLINE int VP8RGBToY(int r, int g, int b, int rounding) {
+  const int luma = 19595 * r + 38470 * g + 7471 * b;   // weights sum to 1 << YUV_FIX
+  return (luma + rounding) >> YUV_FIX;  // no need to clip
+}
+
+// U for the JPEG-style (USE_YUVj) colorspace. Named exactly like the default
+// variant so that callers of VP8RGBToU() build with either setting.
+// 'rounding' is forwarded to VP8ClipUV(), which descales and clamps.
+static WEBP_INLINE int VP8RGBToU(int r, int g, int b, int rounding) {
+  const int u = -11058 * r - 21710 * g + 32768 * b;
+  return VP8ClipUV(u, rounding);
+}
+
+// V for the JPEG-style (USE_YUVj) colorspace; same contract as VP8RGBToU().
+static WEBP_INLINE int VP8RGBToV(int r, int g, int b, int rounding) {
+  const int v = 32768 * r - 27439 * g - 5329 * b;
+  return VP8ClipUV(v, rounding);
+}
+
+#endif    // USE_YUVj
+
+#ifdef __cplusplus
 }    // extern "C"
 #endif

--- a/src/enc/Makefile.am
+++ b/src/enc/Makefile.am
@ -16,6 +16,7 @@ libwebpencode_la_SOURCES += layer.c
 libwebpencode_la_SOURCES += picture.c
 libwebpencode_la_SOURCES += quant.c
 libwebpencode_la_SOURCES += syntax.c
+libwebpencode_la_SOURCES += token.c
 libwebpencode_la_SOURCES += tree.c
 libwebpencode_la_SOURCES += vp8enci.h
 libwebpencode_la_SOURCES += vp8l.c
--- a/src/enc/alpha.c
+++ b/src/enc/alpha.c
@ -1,8 +1,10 @@
 // Copyright 2011 Google Inc. All Rights Reserved.
 //
-// This code is licensed under the same terms as WebM:
-//  Software License Agreement:  http://www.webmproject.org/license/software/
-//  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
 // -----------------------------------------------------------------------------
 //
 // Alpha-plane compression.
@ -17,24 +19,16 @@
 #include "../utils/quant_levels.h"
 #include "../webp/format_constants.h"

-#if defined(__cplusplus) || defined(c_plusplus)
-extern "C" {
-#endif
-
 // -----------------------------------------------------------------------------
-// int EncodeAlpha(const uint8_t* data, int width, int height, int stride,
-//                 int quality, int method, int filter, int effort_level,
-//                 uint8_t** output, size_t* output_size)
-//
-// Encodes the given alpha data 'data' of size 'stride'x'height' via specified
-// compression method 'method'. The pre-processing (Quantization) is
-// performed if 'quality' is less than 100. For such cases, the encoding is
-// lossy. Valid ranges for 'quality' is [0, 100] and 'method' is [0, 1]:
+// Encodes the given alpha data via specified compression method 'method'.
+// The pre-processing (quantization) is performed if 'quality' is less than 100.
+// For such cases, the encoding is lossy. The valid range is [0, 100] for
+// 'quality' and [0, 1] for 'method':
 //   'method = 0' - No compression;
 //   'method = 1' - Use lossless coder on the alpha plane only
 // 'filter' values [0, 4] correspond to prediction modes none, horizontal,
 // vertical & gradient filters. The prediction mode 4 will try all the
-// prediction modes (0 to 3) and pick the best prediction mode.
+// prediction modes 0 to 3 and pick the best one.
 // 'effort_level': specifies how much effort must be spent to try and reduce
 //  the compressed output size. In range 0 (quick) to 6 (slow).
 //
@ -50,10 +44,10 @@ extern "C" {

 #include "../enc/vp8li.h"

-static int EncodeLossless(const uint8_t* data, int width, int height,
+static int EncodeLossless(const uint8_t* const data, int width, int height,
                          int effort_level,  // in [0..6] range
-                          VP8BitWriter* const bw) {
-
+                          VP8BitWriter* const bw,
+                          WebPAuxStats* const stats) {
  int ok = 0;
  WebPConfig config;
  WebPPicture picture;
@ -63,6 +57,7 @@ static int EncodeLossless(const uint8_t* data, int width, int height,
  picture.width = width;
  picture.height = height;
  picture.use_argb = 1;
+  picture.stats = stats;
  if (!WebPPictureAlloc(&picture)) return 0;

  // Transfer the alpha values to the green channel.
@ -72,7 +67,7 @@ static int EncodeLossless(const uint8_t* data, int width, int height,
    const uint8_t* src = data;
    for (j = 0; j < picture.height; ++j) {
      for (i = 0; i < picture.width; ++i) {
-        dst[i] = (src[i] << 8) | 0xff000000u;
+        dst[i] = src[i] << 8;  // we leave A/R/B channels zero'd.
      }
      src += width;
      dst += picture.argb_stride;
@ -82,18 +77,19 @@ static int EncodeLossless(const uint8_t* data, int width, int height,
  WebPConfigInit(&config);
  config.lossless = 1;
  config.method = effort_level;  // impact is very small
-  // Set moderate default quality setting for alpha. Higher qualities (80 and
-  // above) could be very slow.
-  config.quality = 10.f + 15.f * effort_level;
-  if (config.quality > 100.f) config.quality = 100.f;
+  // Set a low default quality for encoding alpha. Ensure that Alpha quality at
+  // lower methods (3 and below) is less than the threshold for triggering
+  // costly 'BackwardReferencesTraceBackwards'.
+  config.quality = 8.f * effort_level;
+  assert(config.quality >= 0 && config.quality <= 100.f);

-  VP8LBitWriterInit(&tmp_bw, (width * height) >> 3);
-  ok = (VP8LEncodeStream(&config, &picture, &tmp_bw) == VP8_ENC_OK);
+  ok = VP8LBitWriterInit(&tmp_bw, (width * height) >> 3);
+  ok = ok && (VP8LEncodeStream(&config, &picture, &tmp_bw) == VP8_ENC_OK);
  WebPPictureFree(&picture);
  if (ok) {
-    const uint8_t* const data = VP8LBitWriterFinish(&tmp_bw);
-    const size_t data_size = VP8LBitWriterNumBytes(&tmp_bw);
-    VP8BitWriterAppend(bw, data, data_size);
+    const uint8_t* const buffer = VP8LBitWriterFinish(&tmp_bw);
+    const size_t buffer_size = VP8LBitWriterNumBytes(&tmp_bw);
+    VP8BitWriterAppend(bw, buffer, buffer_size);
  }
  VP8LBitWriterDestroy(&tmp_bw);
  return ok && !bw->error_;
@ -101,10 +97,19 @@ static int EncodeLossless(const uint8_t* data, int width, int height,

 // -----------------------------------------------------------------------------

-static int EncodeAlphaInternal(const uint8_t* data, int width, int height,
+// Small struct to hold the result of a filter mode compression attempt.
+typedef struct {
+  size_t score;         // size reported by the bit writer; smaller is better
+  VP8BitWriter bw;      // writer holding the encoded alpha data
+  WebPAuxStats stats;   // stats object handed to the lossless encoder
+} FilterTrial;
+
+// This function always returns an initialized 'bw' object, even upon error.
+static int EncodeAlphaInternal(const uint8_t* const data, int width, int height,
                               int method, int filter, int reduce_levels,
                               int effort_level,  // in [0..6] range
-                               uint8_t* tmp_alpha, VP8BitWriter* const bw) {
+                               uint8_t* const tmp_alpha,
+                               FilterTrial* result) {
  int ok = 0;
  const uint8_t* alpha_src;
  WebPFilterFunc filter_func;
@ -112,6 +117,7 @@ static int EncodeAlphaInternal(const uint8_t* data, int width, int height,
  size_t expected_size;
  const size_t data_size = width * height;

+  assert((uint64_t)data_size == (uint64_t)width * height);  // as per spec
  assert(filter >= 0 && filter < WEBP_FILTER_LAST);
  assert(method >= ALPHA_NO_COMPRESSION);
  assert(method <= ALPHA_LOSSLESS_COMPRESSION);
@ -124,24 +130,26 @@ static int EncodeAlphaInternal(const uint8_t* data, int width, int height,
  header = method | (filter << 2);
  if (reduce_levels) header |= ALPHA_PREPROCESSED_LEVELS << 4;

-  VP8BitWriterInit(bw, expected_size);
-  VP8BitWriterAppend(bw, &header, ALPHA_HEADER_LEN);
+  VP8BitWriterInit(&result->bw, expected_size);
+  VP8BitWriterAppend(&result->bw, &header, ALPHA_HEADER_LEN);

  filter_func = WebPFilters[filter];
-  if (filter_func) {
-    filter_func(data, width, height, 1, width, tmp_alpha);
+  if (filter_func != NULL) {
+    filter_func(data, width, height, width, tmp_alpha);
    alpha_src = tmp_alpha;
  }  else {
    alpha_src = data;
  }

  if (method == ALPHA_NO_COMPRESSION) {
-    ok = VP8BitWriterAppend(bw, alpha_src, width * height);
-    ok = ok && !bw->error_;
+    ok = VP8BitWriterAppend(&result->bw, alpha_src, width * height);
+    ok = ok && !result->bw.error_;
  } else {
-    ok = EncodeLossless(alpha_src, width, height, effort_level, bw);
-    VP8BitWriterFinish(bw);
+    ok = EncodeLossless(alpha_src, width, height, effort_level,
+                        &result->bw, &result->stats);
+    VP8BitWriterFinish(&result->bw);
  }
+  result->score = VP8BitWriterSize(&result->bw);
  return ok;
 }

@ -157,19 +165,124 @@ static void CopyPlane(const uint8_t* src, int src_stride,
  }
 }

-static int EncodeAlpha(const uint8_t* data, int width, int height, int stride,
+static int GetNumColors(const uint8_t* data, int width, int height,
+                        int stride) {
+  int j;
+  int colors = 0;
+  uint8_t color[256] = { 0 };   // color[v] is 1 once byte value v was seen
+
+  for (j = 0; j < height; ++j) {   // mark every value present in the plane
+    int i;
+    const uint8_t* const p = data + j * stride;   // start of row j
+    for (i = 0; i < width; ++i) {
+      color[p[i]] = 1;
+    }
+  }
+  for (j = 0; j < 256; ++j) {   // count the marked values
+    if (color[j] > 0) ++colors;
+  }
+  return colors;   // number of distinct byte values, in [0, 256]
+}
+
+#define FILTER_TRY_NONE (1 << WEBP_FILTER_NONE)
+#define FILTER_TRY_ALL ((1 << WEBP_FILTER_LAST) - 1)
+
+// Given the input 'filter' option, return an OR'd bit-set of filters to try.
+static uint32_t GetFilterMap(const uint8_t* alpha, int width, int height,
+                             int filter, int effort_level) {
+  uint32_t bit_map = 0U;   // bit 'f' set means: try filter 'f'
+  if (filter == WEBP_FILTER_FAST) {
+    // Quick estimate of the best candidate.
+    int try_filter_none = (effort_level > 3);   // efforts above 3 always add NONE
+    const int kMinColorsForFilterNone = 16;
+    const int kMaxColorsForFilterNone = 192;
+    const int num_colors = GetNumColors(alpha, width, height, width);   // stride is 'width'
+    // For low number of colors, NONE yields better compression.
+    filter = (num_colors <= kMinColorsForFilterNone) ? WEBP_FILTER_NONE :
+             EstimateBestFilter(alpha, width, height, width);
+    bit_map |= 1 << filter;
+    // For large number of colors, try FILTER_NONE in addition to the best
+    // filter as well.
+    if (try_filter_none || num_colors > kMaxColorsForFilterNone) {
+      bit_map |= FILTER_TRY_NONE;
+    }
+  } else if (filter == WEBP_FILTER_NONE) {
+    bit_map = FILTER_TRY_NONE;
+  } else {  // WEBP_FILTER_BEST -> try all
+    bit_map = FILTER_TRY_ALL;   // NOTE(review): any other 'filter' value also lands here
+  }
+  return bit_map;
+}
+
+static void InitFilterTrial(FilterTrial* const score) {
+  score->score = (size_t)~0U;   // large sentinel (UINT_MAX): smaller trials win
+  VP8BitWriterInit(&score->bw, 0);   // 'stats' is left untouched here
+}
+
+static int ApplyFiltersAndEncode(const uint8_t* alpha, int width, int height,
+                                 size_t data_size, int method, int filter,
+                                 int reduce_levels, int effort_level,
+                                 uint8_t** const output,
+                                 size_t* const output_size,
+                                 WebPAuxStats* const stats) {
+  int ok = 1;
+  FilterTrial best;   // smallest successful trial so far
+  uint32_t try_map =
+      GetFilterMap(alpha, width, height, filter, effort_level);
+  InitFilterTrial(&best);
+  if (try_map != FILTER_TRY_NONE) {
+    uint8_t* filtered_alpha =  (uint8_t*)malloc(data_size);   // scratch plane for the filters
+    if (filtered_alpha == NULL) return 0;
+
+    for (filter = WEBP_FILTER_NONE; ok && try_map; ++filter, try_map >>= 1) {   // lowest bit first
+      if (try_map & 1) {
+        FilterTrial trial;
+        ok = EncodeAlphaInternal(alpha, width, height, method, filter,
+                                 reduce_levels, effort_level, filtered_alpha,
+                                 &trial);
+        if (ok && trial.score < best.score) {   // strictly smaller output wins
+          VP8BitWriterWipeOut(&best.bw);
+          best = trial;   // plain struct copy, writer included
+        } else {
+          VP8BitWriterWipeOut(&trial.bw);
+        }
+      }
+    }
+    free(filtered_alpha);
+  } else {
+    ok = EncodeAlphaInternal(alpha, width, height, method, WEBP_FILTER_NONE,
+                             reduce_levels, effort_level, NULL, &best);   // no scratch plane needed
+  }
+  if (ok) {
+    if (stats != NULL) *stats = best.stats;   // NOTE(review): looks unset for ALPHA_NO_COMPRESSION - confirm
+    *output_size = VP8BitWriterSize(&best.bw);
+    *output = VP8BitWriterBuf(&best.bw);   // presumably the caller owns this buffer now - confirm
+  } else {
+    VP8BitWriterWipeOut(&best.bw);
+  }
+  return ok;
+}
+
+static int EncodeAlpha(VP8Encoder* const enc,
                       int quality, int method, int filter,
                       int effort_level,
-                       uint8_t** output, size_t* output_size) {
+                       uint8_t** const output, size_t* const output_size) {
+  const WebPPicture* const pic = enc->pic_;
+  const int width = pic->width;
+  const int height = pic->height;
+
  uint8_t* quant_alpha = NULL;
  const size_t data_size = width * height;
+  uint64_t sse = 0;
  int ok = 1;
  const int reduce_levels = (quality < 100);

  // quick sanity checks
-  assert(data != NULL && output != NULL && output_size != NULL);
+  assert((uint64_t)data_size == (uint64_t)width * height);  // as per spec
+  assert(enc != NULL && pic != NULL && pic->a != NULL);
+  assert(output != NULL && output_size != NULL);
  assert(width > 0 && height > 0);
-  assert(stride >= width);
+  assert(pic->a_stride >= width);
  assert(filter >= WEBP_FILTER_NONE && filter <= WEBP_FILTER_FAST);

  if (quality < 0 || quality > 100) {
@ -180,13 +293,18 @@ static int EncodeAlpha(const uint8_t* data, int width, int height, int stride,
    return 0;
  }

+  if (method == ALPHA_NO_COMPRESSION) {
+    // Don't filter, as filtering will have no impact on compressed size.
+    filter = WEBP_FILTER_NONE;
+  }
+
  quant_alpha = (uint8_t*)malloc(data_size);
  if (quant_alpha == NULL) {
    return 0;
  }

  // Extract alpha data (width x height) from raw_data (stride x height).
-  CopyPlane(data, stride, quant_alpha, width, width, height);
+  CopyPlane(pic->a, pic->a_stride, quant_alpha, width, width, height);

  if (reduce_levels) {  // No Quantization required for 'quality = 100'.
    // 16 alpha levels gives quite a low MSE w.r.t original alpha plane hence
@ -194,122 +312,99 @@ static int EncodeAlpha(const uint8_t* data, int width, int height, int stride,
    // and Quality:]70, 100] -> Levels:]16, 256].
    const int alpha_levels = (quality <= 70) ? (2 + quality / 5)
                                             : (16 + (quality - 70) * 8);
-    ok = QuantizeLevels(quant_alpha, width, height, alpha_levels, NULL);
+    ok = QuantizeLevels(quant_alpha, width, height, alpha_levels, &sse);
  }

  if (ok) {
-    VP8BitWriter bw;
-    size_t best_score;
-    int test_filter;
-    uint8_t* filtered_alpha = NULL;
-
-    // We always test WEBP_FILTER_NONE first.
-    ok = EncodeAlphaInternal(quant_alpha, width, height,
-                             method, WEBP_FILTER_NONE, reduce_levels,
-                             effort_level, NULL, &bw);
-    if (!ok) {
-      VP8BitWriterWipeOut(&bw);
-      goto End;
+    ok = ApplyFiltersAndEncode(quant_alpha, width, height, data_size, method,
+                               filter, reduce_levels, effort_level, output,
+                               output_size, pic->stats);
+    if (pic->stats != NULL) {  // need stats?
+      pic->stats->coded_size += (int)(*output_size);
+      enc->sse_[3] = sse;
    }
-    best_score = VP8BitWriterSize(&bw);
-
-    if (filter == WEBP_FILTER_FAST) {  // Quick estimate of a second candidate?
-      filter = EstimateBestFilter(quant_alpha, width, height, width);
-    }
-    // Stop?
-    if (filter == WEBP_FILTER_NONE) {
-      goto Ok;
  }

-    filtered_alpha = (uint8_t*)malloc(data_size);
-    ok = (filtered_alpha != NULL);
-    if (!ok) {
-      goto End;
-    }
-
-    // Try the other mode(s).
-    for (test_filter = WEBP_FILTER_HORIZONTAL;
-         ok && (test_filter <= WEBP_FILTER_GRADIENT);
-         ++test_filter) {
-      VP8BitWriter tmp_bw;
-      if (filter != WEBP_FILTER_BEST && test_filter != filter) {
-        continue;
-      }
-
-      ok = EncodeAlphaInternal(quant_alpha, width, height,
-                               method, test_filter, reduce_levels,
-                               effort_level, filtered_alpha, &tmp_bw);
-      if (ok) {
-        const size_t score = VP8BitWriterSize(&tmp_bw);
-        if (score < best_score) {
-          // swap bitwriter objects.
-          VP8BitWriter tmp = tmp_bw;
-          tmp_bw = bw;
-          bw = tmp;
-          best_score = score;
-        }
-      } else {
-        VP8BitWriterWipeOut(&bw);
-      }
-      VP8BitWriterWipeOut(&tmp_bw);
-    }
- Ok:
-    if (ok) {
-      *output_size = VP8BitWriterSize(&bw);
-      *output = VP8BitWriterBuf(&bw);
-    }
-    free(filtered_alpha);
-  }
- End:
  free(quant_alpha);
  return ok;
 }

-
 //------------------------------------------------------------------------------
 // Main calls

-void VP8EncInitAlpha(VP8Encoder* enc) {
-  enc->has_alpha_ = WebPPictureHasTransparency(enc->pic_);
-  enc->alpha_data_ = NULL;
-  enc->alpha_data_size_ = 0;
-}
-
-int VP8EncFinishAlpha(VP8Encoder* enc) {
-  if (enc->has_alpha_) {
+static int CompressAlphaJob(VP8Encoder* const enc, void* dummy) {  // 1: ok, 0: error
  const WebPConfig* config = enc->config_;
-    const WebPPicture* pic = enc->pic_;
-    uint8_t* tmp_data = NULL;
-    size_t tmp_size = 0;
+  uint8_t* alpha_data = NULL;  // receives the buffer produced by EncodeAlpha()
+  size_t alpha_size = 0;
  const int effort_level = config->method;  // maps to [0..6]
  const WEBP_FILTER_TYPE filter =
      (config->alpha_filtering == 0) ? WEBP_FILTER_NONE :
      (config->alpha_filtering == 1) ? WEBP_FILTER_FAST :
                                       WEBP_FILTER_BEST;
+  if (!EncodeAlpha(enc, config->alpha_quality, config->alpha_compression,
+                   filter, effort_level, &alpha_data, &alpha_size)) {
+    return 0;  // EncodeAlpha() reported failure
+  }
+  if (alpha_size != (uint32_t)alpha_size) {  // Sanity check: must fit in 32 bits.
+    free(alpha_data);  // release the encoded buffer before bailing out
+    return 0;
+  }
+  enc->alpha_data_size_ = (uint32_t)alpha_size;
+  enc->alpha_data_ = alpha_data;  // released by VP8EncDeleteAlpha()
+  (void)dummy;  // second hook argument is unused
+  return 1;
+}

-    assert(pic->a);
-    if (!EncodeAlpha(pic->a, pic->width, pic->height, pic->a_stride,
-                     config->alpha_quality, config->alpha_compression,
-                     filter, effort_level, &tmp_data, &tmp_size)) {
+void VP8EncInitAlpha(VP8Encoder* const enc) {  // resets alpha state, preps worker
+  enc->has_alpha_ = WebPPictureHasTransparency(enc->pic_);
+  enc->alpha_data_ = NULL;
+  enc->alpha_data_size_ = 0;
+  if (enc->thread_level_ > 0) {  // threading requested: set up the alpha worker
+    WebPWorker* const worker = &enc->alpha_worker_;
+    WebPWorkerInit(worker);
+    worker->data1 = enc;  // presumably handed to the hook as its 1st argument
+    worker->data2 = NULL;  // CompressAlphaJob() ignores its 2nd argument
+    worker->hook = (WebPWorkerHook)CompressAlphaJob;
+  }
+}
+
+int VP8EncStartAlpha(VP8Encoder* const enc) {  // starts alpha compression; 0 on error
+  if (enc->has_alpha_) {
+    if (enc->thread_level_ > 0) {
+      WebPWorker* const worker = &enc->alpha_worker_;
+      if (!WebPWorkerReset(worker)) {    // Makes sure worker is good to go.
        return 0;
      }
-    if (tmp_size != (uint32_t)tmp_size) {  // Sanity check.
-      free(tmp_data);
-      return 0;
+      WebPWorkerLaunch(worker);  // synced later in VP8EncFinishAlpha()
+      return 1;  // job outcome is reported by that later WebPWorkerSync()
+    } else {
+      return CompressAlphaJob(enc, NULL);   // just do the job right away
+    }
+  }
+  return 1;  // no alpha plane: nothing to compress
+}
+
+int VP8EncFinishAlpha(VP8Encoder* const enc) {  // syncs the alpha job, reports progress
+  if (enc->has_alpha_) {
+    if (enc->thread_level_ > 0) {  // otherwise the job already ran in VP8EncStartAlpha()
+      WebPWorker* const worker = &enc->alpha_worker_;
+      if (!WebPWorkerSync(worker)) return 0;  // error
    }
-    enc->alpha_data_size_ = (uint32_t)tmp_size;
-    enc->alpha_data_ = tmp_data;
  }
  return WebPReportProgress(enc->pic_, enc->percent_ + 20, &enc->percent_);
 }

-void VP8EncDeleteAlpha(VP8Encoder* enc) {
+int VP8EncDeleteAlpha(VP8Encoder* const enc) {  // frees alpha data; 0 if the worker failed
+  int ok = 1;  // stays 1 when no worker is in use
+  if (enc->thread_level_ > 0) {
+    WebPWorker* const worker = &enc->alpha_worker_;
+    ok = WebPWorkerSync(worker);  // finish anything left in flight
+    WebPWorkerEnd(worker);  // still need to end the worker, even if !ok
+  }
  free(enc->alpha_data_);
  enc->alpha_data_ = NULL;
  enc->alpha_data_size_ = 0;
  enc->has_alpha_ = 0;
+  return ok;  // WebPWorkerSync() status, or 1 without threading
 }

-#if defined(__cplusplus) || defined(c_plusplus)
-}    // extern "C"
-#endif
--- a/src/enc/analysis.c
+++ b/src/enc/analysis.c
@ -1,8 +1,10 @@
 // Copyright 2011 Google Inc. All Rights Reserved.
 //
-// This code is licensed under the same terms as WebM:
-//  Software License Agreement:  http://www.webmproject.org/license/software/
-//  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
 // -----------------------------------------------------------------------------
 //
 // Macroblock analysis
@ -15,17 +17,10 @@

 #include "./vp8enci.h"
 #include "./cost.h"
-
-#if defined(__cplusplus) || defined(c_plusplus)
-extern "C" {
-#endif
+#include "../utils/utils.h"

 #define MAX_ITERS_K_MEANS  6

-static int ClipAlpha(int alpha) {
-  return alpha < 0 ? 0 : alpha > 255 ? 255 : alpha;
-}
-
 //------------------------------------------------------------------------------
 // Smooth the segment map by replacing isolated block by the majority of its
 // neighbours.
@ -35,7 +30,8 @@ static void SmoothSegmentMap(VP8Encoder* const enc) {
  const int w = enc->mb_w_;
  const int h = enc->mb_h_;
  const int majority_cnt_3_x_3_grid = 5;
-  uint8_t* const tmp = (uint8_t*)malloc(w * h * sizeof(uint8_t));
+  uint8_t* const tmp = (uint8_t*)WebPSafeMalloc((uint64_t)w * h, sizeof(*tmp));
+  assert((uint64_t)(w * h) == (uint64_t)w * h);   // no overflow, as per spec

  if (tmp == NULL) return;
  for (y = 1; y < h - 1; ++y) {
@ -55,6 +51,7 @@ static void SmoothSegmentMap(VP8Encoder* const enc) {
      for (n = 0; n < NUM_MB_SEGMENTS; ++n) {
        if (cnt[n] >= majority_cnt_3_x_3_grid) {
          majority_seg = n;
+          break;
        }
      }
      tmp[x + y * w] = majority_seg;
@ -70,50 +67,10 @@ static void SmoothSegmentMap(VP8Encoder* const enc) {
 }

 //------------------------------------------------------------------------------
-// Finalize Segment probability based on the coding tree
-
-static int GetProba(int a, int b) {
-  int proba;
-  const int total = a + b;
-  if (total == 0) return 255;  // that's the default probability.
-  proba = (255 * a + total / 2) / total;
-  return proba;
-}
-
-static void SetSegmentProbas(VP8Encoder* const enc) {
-  int p[NUM_MB_SEGMENTS] = { 0 };
-  int n;
-
-  for (n = 0; n < enc->mb_w_ * enc->mb_h_; ++n) {
-    const VP8MBInfo* const mb = &enc->mb_info_[n];
-    p[mb->segment_]++;
-  }
-  if (enc->pic_->stats) {
-    for (n = 0; n < NUM_MB_SEGMENTS; ++n) {
-      enc->pic_->stats->segment_size[n] = p[n];
-    }
-  }
-  if (enc->segment_hdr_.num_segments_ > 1) {
-    uint8_t* const probas = enc->proba_.segments_;
-    probas[0] = GetProba(p[0] + p[1], p[2] + p[3]);
-    probas[1] = GetProba(p[0], p[1]);
-    probas[2] = GetProba(p[2], p[3]);
-
-    enc->segment_hdr_.update_map_ =
-        (probas[0] != 255) || (probas[1] != 255) || (probas[2] != 255);
-    enc->segment_hdr_.size_ =
-      p[0] * (VP8BitCost(0, probas[0]) + VP8BitCost(0, probas[1])) +
-      p[1] * (VP8BitCost(0, probas[0]) + VP8BitCost(1, probas[1])) +
-      p[2] * (VP8BitCost(1, probas[0]) + VP8BitCost(0, probas[2])) +
-      p[3] * (VP8BitCost(1, probas[0]) + VP8BitCost(1, probas[2]));
-  } else {
-    enc->segment_hdr_.update_map_ = 0;
-    enc->segment_hdr_.size_ = 0;
-  }
-}
+// set segment susceptibility alpha_ / beta_

 static WEBP_INLINE int clip(int v, int m, int M) {
-  return v < m ? m : v > M ? M : v;
+  return (v < m) ? m : (v > M) ? M : v;  // clamps v to the range [m, M]
 }

 static void SetSegmentAlphas(VP8Encoder* const enc,
@ -139,29 +96,73 @@ static void SetSegmentAlphas(VP8Encoder* const enc,
  }
 }

+//------------------------------------------------------------------------------
+// Compute susceptibility based on DCT-coeff histograms:
+// the higher, the "easier" the macroblock is to compress.
+
+#define MAX_ALPHA 255                // 8b of precision for susceptibilities.
+#define ALPHA_SCALE (2 * MAX_ALPHA)  // scaling factor for alpha.
+#define DEFAULT_ALPHA (-1)  // initial "best" value, below anything GetAlpha() returns.
+#define IS_BETTER_ALPHA(alpha, best_alpha) ((alpha) > (best_alpha))  // larger wins.
+
+static int FinalAlphaValue(int alpha) {  // maps a raw alpha into [0, MAX_ALPHA]
+  alpha = MAX_ALPHA - alpha;  // flip: a larger input yields a smaller output
+  return clip(alpha, 0, MAX_ALPHA);
+}
+
+static int GetAlpha(const VP8Histogram* const histo) {  // histogram -> alpha estimate
+  int max_value = 0, last_non_zero = 1;  // tallest bin; last non-empty bin (1 if none)
+  int k;
+  int alpha;
+  for (k = 0; k <= MAX_COEFF_THRESH; ++k) {  // bin MAX_COEFF_THRESH is included
+    const int value = histo->distribution[k];
+    if (value > 0) {
+      if (value > max_value) max_value = value;
+      last_non_zero = k;
+    }
+  }
+  // 'alpha' will later be clipped to [0..MAX_ALPHA] range, clamping outer
+  // values which happen to be mostly noise. This leaves the maximum precision
+  // for handling the useful small values which contribute most.
+  alpha = (max_value > 1) ? ALPHA_SCALE * last_non_zero / max_value : 0;
+  return alpha;  // 0 unless some bin holds more than one sample
+}
+
+static void MergeHistograms(const VP8Histogram* const in,
+                            VP8Histogram* const out) {  // out += in, bin by bin
+  int i;
+  for (i = 0; i <= MAX_COEFF_THRESH; ++i) {
+    out->distribution[i] += in->distribution[i];
+  }
+}
+
 //------------------------------------------------------------------------------
 // Simplified k-Means, to assign Nb segments based on alpha-histogram

-static void AssignSegments(VP8Encoder* const enc, const int alphas[256]) {
+static void AssignSegments(VP8Encoder* const enc,
+                           const int alphas[MAX_ALPHA + 1]) {
  const int nb = enc->segment_hdr_.num_segments_;
  int centers[NUM_MB_SEGMENTS];
-  int weighted_average;
-  int map[256];
+  int weighted_average = 0;
+  int map[MAX_ALPHA + 1];
  int a, n, k;
-  int min_a = 0, max_a = 255, range_a;
+  int min_a = 0, max_a = MAX_ALPHA, range_a;
  // 'int' type is ok for histo, and won't overflow
  int accum[NUM_MB_SEGMENTS], dist_accum[NUM_MB_SEGMENTS];

+  assert(nb >= 1);
+
  // bracket the input
-  for (n = 0; n < 256 && alphas[n] == 0; ++n) {}
+  for (n = 0; n <= MAX_ALPHA && alphas[n] == 0; ++n) {}
  min_a = n;
-  for (n = 255; n > min_a && alphas[n] == 0; --n) {}
+  for (n = MAX_ALPHA; n > min_a && alphas[n] == 0; --n) {}
  max_a = n;
  range_a = max_a - min_a;

  // Spread initial centers evenly
-  for (n = 1, k = 0; n < 2 * nb; n += 2) {
-    centers[k++] = min_a + (n * range_a) / (2 * nb);
+  for (k = 0, n = 1; k < nb; ++k, n += 2) {
+    assert(n < 2 * nb);
+    centers[k] = min_a + (n * range_a) / (2 * nb);
  }

  for (k = 0; k < MAX_ITERS_K_MEANS; ++k) {     // few iters are enough
@ -176,7 +177,7 @@ static void AssignSegments(VP8Encoder* const enc, const int alphas[256]) {
    n = 0;    // track the nearest center for current 'a'
    for (a = min_a; a <= max_a; ++a) {
      if (alphas[a]) {
-        while (n < nb - 1 && abs(a - centers[n + 1]) < abs(a - centers[n])) {
+        while (n + 1 < nb && abs(a - centers[n + 1]) < abs(a - centers[n])) {
          n++;
        }
        map[a] = n;
@ -208,7 +209,7 @@ static void AssignSegments(VP8Encoder* const enc, const int alphas[256]) {
    VP8MBInfo* const mb = &enc->mb_info_[n];
    const int alpha = mb->alpha_;
    mb->segment_ = map[alpha];
-    mb->alpha_ = centers[map[alpha]];     // just for the record.
+    mb->alpha_ = centers[map[alpha]];  // for the record.
  }

  if (nb > 1) {
@ -216,7 +217,6 @@ static void AssignSegments(VP8Encoder* const enc, const int alphas[256]) {
    if (smooth) SmoothSegmentMap(enc);
  }

-  SetSegmentProbas(enc);                             // Assign final proba
  SetSegmentAlphas(enc, centers, weighted_average);  // pick some alphas.
 }

@ -225,24 +225,32 @@ static void AssignSegments(VP8Encoder* const enc, const int alphas[256]) {
 // susceptibility and set best modes for this macroblock.
 // Segment assignment is done later.

-// Number of modes to inspect for alpha_ evaluation. For high-quality settings,
-// we don't need to test all the possible modes during the analysis phase.
+// Number of modes to inspect for alpha_ evaluation. For high-quality settings
+// (method >= FAST_ANALYSIS_METHOD) we don't need to test all the possible modes
+// during the analysis phase.
+#define FAST_ANALYSIS_METHOD 4  // method at or above which we do partial analysis
 #define MAX_INTRA16_MODE 2
 #define MAX_INTRA4_MODE  2
 #define MAX_UV_MODE      2

 static int MBAnalyzeBestIntra16Mode(VP8EncIterator* const it) {
-  const int max_mode = (it->enc_->method_ >= 3) ? MAX_INTRA16_MODE : 4;
+  const int max_mode =
+      (it->enc_->method_ >= FAST_ANALYSIS_METHOD) ? MAX_INTRA16_MODE
+                                                  : NUM_PRED_MODES;
  int mode;
-  int best_alpha = -1;
+  int best_alpha = DEFAULT_ALPHA;
  int best_mode = 0;

  VP8MakeLuma16Preds(it);
  for (mode = 0; mode < max_mode; ++mode) {
-    const int alpha = VP8CollectHistogram(it->yuv_in_ + Y_OFF,
+    VP8Histogram histo = { { 0 } };
+    int alpha;
+
+    VP8CollectHistogram(it->yuv_in_ + Y_OFF,
                        it->yuv_p_ + VP8I16ModeOffsets[mode],
-                                          0, 16);
-    if (alpha > best_alpha) {
+                        0, 16, &histo);
+    alpha = GetAlpha(&histo);
+    if (IS_BETTER_ALPHA(alpha, best_alpha)) {
      best_alpha = alpha;
      best_mode = mode;
    }
@ -254,46 +262,63 @@ static int MBAnalyzeBestIntra16Mode(VP8EncIterator* const it) {
 static int MBAnalyzeBestIntra4Mode(VP8EncIterator* const it,
                                   int best_alpha) {
  uint8_t modes[16];
-  const int max_mode = (it->enc_->method_ >= 3) ? MAX_INTRA4_MODE : NUM_BMODES;
-  int i4_alpha = 0;
+  const int max_mode =
+      (it->enc_->method_ >= FAST_ANALYSIS_METHOD) ? MAX_INTRA4_MODE
+                                                  : NUM_BMODES;
+  int i4_alpha;
+  VP8Histogram total_histo = { { 0 } };  // sum of the best histogram of each pass
+  int cur_histo = 0;  // histos[] slot written by the next trial
+
  VP8IteratorStartI4(it);
  do {
    int mode;
-    int best_mode_alpha = -1;
+    int best_mode_alpha = DEFAULT_ALPHA;
+    VP8Histogram histos[2];  // two slots: scratch for the trial + best so far
    const uint8_t* const src = it->yuv_in_ + Y_OFF + VP8Scan[it->i4_];

    VP8MakeIntra4Preds(it);
    for (mode = 0; mode < max_mode; ++mode) {
-      const int alpha = VP8CollectHistogram(src,
-                                            it->yuv_p_ + VP8I4ModeOffsets[mode],
-                                            0, 1);
-      if (alpha > best_mode_alpha) {
+      int alpha;
+
+      memset(&histos[cur_histo], 0, sizeof(histos[cur_histo]));  // fresh scratch slot
+      VP8CollectHistogram(src, it->yuv_p_ + VP8I4ModeOffsets[mode],
+                          0, 1, &histos[cur_histo]);
+      alpha = GetAlpha(&histos[cur_histo]);
+      if (IS_BETTER_ALPHA(alpha, best_mode_alpha)) {
        best_mode_alpha = alpha;
        modes[it->i4_] = mode;
+        cur_histo ^= 1;   // keep track of best histo so far.
      }
    }
-    i4_alpha += best_mode_alpha;
+    // accumulate best histogram (the slot that is not the current scratch one)
+    MergeHistograms(&histos[cur_histo ^ 1], &total_histo);
    // Note: we reuse the original samples for predictors
  } while (VP8IteratorRotateI4(it, it->yuv_in_ + Y_OFF));

-  if (i4_alpha > best_alpha) {
+  i4_alpha = GetAlpha(&total_histo);  // single alpha from the merged histogram
+  if (IS_BETTER_ALPHA(i4_alpha, best_alpha)) {
    VP8SetIntra4Mode(it, modes);
-    best_alpha = ClipAlpha(i4_alpha);
+    best_alpha = i4_alpha;
  }
  return best_alpha;
 }

 static int MBAnalyzeBestUVMode(VP8EncIterator* const it) {
-  int best_alpha = -1;
+  int best_alpha = DEFAULT_ALPHA;
  int best_mode = 0;
-  const int max_mode = (it->enc_->method_ >= 3) ? MAX_UV_MODE : 4;
+  const int max_mode =
+      (it->enc_->method_ >= FAST_ANALYSIS_METHOD) ? MAX_UV_MODE
+                                                  : NUM_PRED_MODES;
  int mode;
  VP8MakeChroma8Preds(it);
  for (mode = 0; mode < max_mode; ++mode) {
-    const int alpha = VP8CollectHistogram(it->yuv_in_ + U_OFF,
+    VP8Histogram histo = { { 0 } };
+    int alpha;
+    VP8CollectHistogram(it->yuv_in_ + U_OFF,
                        it->yuv_p_ + VP8UVModeOffsets[mode],
-                                          16, 16 + 4 + 4);
-    if (alpha > best_alpha) {
+                        16, 16 + 4 + 4, &histo);
+    alpha = GetAlpha(&histo);
+    if (IS_BETTER_ALPHA(alpha, best_alpha)) {
      best_alpha = alpha;
      best_mode = mode;
    }
@ -303,7 +328,8 @@ static int MBAnalyzeBestUVMode(VP8EncIterator* const it) {
 }

 static void MBAnalyze(VP8EncIterator* const it,
-                      int alphas[256], int* const uv_alpha) {
+                      int alphas[MAX_ALPHA + 1],
+                      int* const alpha, int* const uv_alpha) {
  const VP8Encoder* const enc = it->enc_;
  int best_alpha, best_uv_alpha;

@ -312,7 +338,7 @@ static void MBAnalyze(VP8EncIterator* const it,
  VP8SetSegment(it, 0);      // default segment, spec-wise.

  best_alpha = MBAnalyzeBestIntra16Mode(it);
-  if (enc->method_ != 3) {
+  if (enc->method_ >= 5) {
    // We go and make a fast decision for intra4/intra16.
    // It's usually not a good and definitive pick, but helps seeding the stats
    // about level bit-cost.
@ -322,10 +348,22 @@ static void MBAnalyze(VP8EncIterator* const it,
  best_uv_alpha = MBAnalyzeBestUVMode(it);

  // Final susceptibility mix
-  best_alpha = (best_alpha + best_uv_alpha + 1) / 2;
+  best_alpha = (3 * best_alpha + best_uv_alpha + 2) >> 2;
+  best_alpha = FinalAlphaValue(best_alpha);
  alphas[best_alpha]++;
+  it->mb_->alpha_ = best_alpha;   // for later remapping.
+
+  // Accumulate for later complexity analysis.
+  *alpha += best_alpha;   // mixed susceptibility (not just luma)
  *uv_alpha += best_uv_alpha;
-  it->mb_->alpha_ = best_alpha;   // Informative only.
+}
+
+static void DefaultMBInfo(VP8MBInfo* const mb) {  // resets one macroblock's info
+  mb->type_ = 1;     // I16x16
+  mb->uv_mode_ = 0;
+  mb->skip_ = 0;     // not skipped
+  mb->segment_ = 0;  // default segment
+  mb->alpha_ = 0;    // default susceptibility
 }

 //------------------------------------------------------------------------------
@ -338,25 +376,122 @@ static void MBAnalyze(VP8EncIterator* const it,
 // and decide intra4/intra16, but that's usually almost always a bad choice at
 // this stage.

-int VP8EncAnalyze(VP8Encoder* const enc) {
-  int ok = 1;
-  int alphas[256] = { 0 };
-  VP8EncIterator it;
-
-  VP8IteratorInit(enc, &it);
+static void ResetAllMBInfo(VP8Encoder* const enc) {  // defaults, used when analysis is skipped
+  int n;
+  for (n = 0; n < enc->mb_w_ * enc->mb_h_; ++n) {
+    DefaultMBInfo(&enc->mb_info_[n]);
+  }
+  // Default susceptibilities.
+  enc->dqm_[0].alpha_ = 0;
+  enc->dqm_[0].beta_ = 0;
+  // Note: we can't compute this alpha_ / uv_alpha_ -> set to default value.
+  enc->alpha_ = 0;
  enc->uv_alpha_ = 0;
-  do {
-    VP8IteratorImport(&it);
-    MBAnalyze(&it, alphas, &enc->uv_alpha_);
-    ok = VP8IteratorProgress(&it, 20);
-    // Let's pretend we have perfect lossless reconstruction.
-  } while (ok && VP8IteratorNext(&it, it.yuv_in_));
-  enc->uv_alpha_ /= enc->mb_w_ * enc->mb_h_;
-  if (ok) AssignSegments(enc, alphas);
+  WebPReportProgress(enc->pic_, enc->percent_ + 20, &enc->percent_);  // NOTE(review): result ignored here, unlike VP8EncFinishAlpha() - confirm intended
+}

+// struct used to collect job result
+typedef struct {
+  WebPWorker worker;  // hooked to DoSegmentsJob() by InitSegmentJob()
+  int alphas[MAX_ALPHA + 1];  // histogram of the per-macroblock alpha values
+  int alpha, uv_alpha;  // running sums over the macroblocks analyzed by this job
+  VP8EncIterator it;  // iterator over this job's macroblocks
+  int delta_progress;  // increment handed to VP8IteratorProgress()
+} SegmentJob;
+
+// main work call
+static int DoSegmentsJob(SegmentJob* const job, VP8EncIterator* const it) {  // worker hook
+  int ok = 1;
+  if (!VP8IteratorIsDone(it)) {  // nothing to do for an already-finished iterator
+    uint8_t tmp[32 + ALIGN_CST];
+    uint8_t* const scratch = (uint8_t*)DO_ALIGN(tmp);  // aligned pointer into tmp[]
+    do {
+      // Let's pretend we have perfect lossless reconstruction.
+      VP8IteratorImport(it, scratch);
+      MBAnalyze(it, job->alphas, &job->alpha, &job->uv_alpha);
+      ok = VP8IteratorProgress(it, job->delta_progress);
+    } while (ok && VP8IteratorNext(it));  // stops on !ok or when Next() returns 0
+  }
  return ok;
 }

-#if defined(__cplusplus) || defined(c_plusplus)
-}    // extern "C"
+static void MergeJobs(const SegmentJob* const src, SegmentJob* const dst) {  // dst += src
+  int i;
+  for (i = 0; i <= MAX_ALPHA; ++i) dst->alphas[i] += src->alphas[i];
+  dst->alpha += src->alpha;
+  dst->uv_alpha += src->uv_alpha;
+}
+
+// Prepares 'job' to analyze the macroblock rows [start_row, end_row).
+static void InitSegmentJob(VP8Encoder* const enc, SegmentJob* const job,
+                           int start_row, int end_row) {
+  WebPWorkerInit(&job->worker);
+  job->worker.data1 = job;  // presumably DoSegmentsJob()'s 1st argument
+  job->worker.data2 = &job->it;  // presumably DoSegmentsJob()'s 2nd argument
+  job->worker.hook = (WebPWorkerHook)DoSegmentsJob;
+  VP8IteratorInit(enc, &job->it);
+  VP8IteratorSetRow(&job->it, start_row);
+  VP8IteratorSetCountDown(&job->it, (end_row - start_row) * enc->mb_w_);  // macroblock count
+  memset(job->alphas, 0, sizeof(job->alphas));
+  job->alpha = 0;
+  job->uv_alpha = 0;
+  // only one of the two jobs can record the progress, since we don't
+  // expect the user's hook to be multi-thread safe
+  job->delta_progress = (start_row == 0) ? 20 : 0;  // the job starting at row 0 reports
+}
+
+// Main entry point: runs the analysis pass, or resets everything to defaults.
+int VP8EncAnalyze(VP8Encoder* const enc) {  // returns 1 on success, 0 otherwise
+  int ok = 1;
+  const int do_segments =
+      enc->config_->emulate_jpeg_size ||   // We need the complexity evaluation.
+      (enc->segment_hdr_.num_segments_ > 1) ||
+      (enc->method_ == 0);  // for method 0, we need preds_[] to be filled.
+  if (do_segments) {
+    const int last_row = enc->mb_h_;
+    // We give a little more than half of the work to the main thread.
+    const int split_row = (9 * last_row + 15) >> 4;  // == ceil(9 * last_row / 16)
+    const int total_mb = last_row * enc->mb_w_;
+#ifdef WEBP_USE_THREAD
+    const int kMinSplitRow = 2;  // minimal rows needed for mt to be worth it
+    const int do_mt = (enc->thread_level_ > 0) && (split_row >= kMinSplitRow);
+#else
+    const int do_mt = 0;  // built without WEBP_USE_THREAD
 #endif
+    SegmentJob main_job;
+    if (do_mt) {
+      SegmentJob side_job;
+      // Note the use of '&' instead of '&&' because we must call the functions
+      // no matter what.
+      InitSegmentJob(enc, &main_job, 0, split_row);
+      InitSegmentJob(enc, &side_job, split_row, last_row);
+      // we don't need to call Reset() on main_job.worker, since we're calling
+      // WebPWorkerExecute() on it
+      ok &= WebPWorkerReset(&side_job.worker);
+      // launch the two jobs in parallel
+      if (ok) {
+        WebPWorkerLaunch(&side_job.worker);
+        WebPWorkerExecute(&main_job.worker);  // NOTE(review): presumably runs synchronously in this thread - confirm
+        ok &= WebPWorkerSync(&side_job.worker);
+        ok &= WebPWorkerSync(&main_job.worker);
+      }
+      WebPWorkerEnd(&side_job.worker);  // ended on every path, including !ok
+      if (ok) MergeJobs(&side_job, &main_job);  // merge results together
+    } else {
+      // Even for single-thread case, we use the generic Worker tools.
+      InitSegmentJob(enc, &main_job, 0, last_row);
+      WebPWorkerExecute(&main_job.worker);
+      ok &= WebPWorkerSync(&main_job.worker);
+    }
+    WebPWorkerEnd(&main_job.worker);  // ended on every path, including !ok
+    if (ok) {
+      enc->alpha_ = main_job.alpha / total_mb;  // average over all macroblocks
+      enc->uv_alpha_ = main_job.uv_alpha / total_mb;
+      AssignSegments(enc, main_job.alphas);
+    }
+  } else {   // Use only one default segment.
+    ResetAllMBInfo(enc);
+  }
+  return ok;
+}
+
--- a/src/enc/backward_references.c
+++ b/src/enc/backward_references.c
@ -1,8 +1,10 @@
 // Copyright 2012 Google Inc. All Rights Reserved.
 //
-// This code is licensed under the same terms as WebM:
-//  Software License Agreement:  http://www.webmproject.org/license/software/
-//  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
 // -----------------------------------------------------------------------------
 //
 // Author: Jyrki Alakuijala (jyrki@google.com)
@ -14,7 +16,9 @@

 #include "./backward_references.h"
 #include "./histogram.h"
+#include "../dsp/lossless.h"
 #include "../utils/color_cache.h"
+#include "../utils/utils.h"

 #define VALUES_IN_BYTE 256

@ -93,7 +97,8 @@ int VP8LBackwardRefsAlloc(VP8LBackwardRefs* const refs, int max_size) {
  assert(refs != NULL);
  refs->size = 0;
  refs->max_size = 0;
-  refs->refs = (PixOrCopy*)malloc(max_size * sizeof(*refs->refs));
+  refs->refs = (PixOrCopy*)WebPSafeMalloc((uint64_t)max_size,
+                                          sizeof(*refs->refs));
  if (refs->refs == NULL) return 0;
  refs->max_size = max_size;
  return 1;
@ -110,7 +115,7 @@ static WEBP_INLINE uint64_t GetPixPairHash64(const uint32_t* const argb) {

 static int HashChainInit(HashChain* const p, int size) {
  int i;
-  p->chain_ = (int*)malloc(size * sizeof(*p->chain_));
+  p->chain_ = (int*)WebPSafeMalloc((uint64_t)size, sizeof(*p->chain_));
  if (p->chain_ == NULL) {
    return 0;
  }
@ -138,74 +143,95 @@ static void HashChainInsert(HashChain* const p,
  p->hash_to_first_index_[hash_code] = pos;
 }

+static void GetParamsForHashChainFindCopy(int quality, int xsize,
+                                          int cache_bits, int* window_size,
+                                          int* iter_pos, int* iter_limit) {  // fills the 3 outputs
+  const int iter_mult = (quality < 27) ? 1 : 1 + ((quality - 27) >> 4);  // stays 1 up to quality 42
+  const int iter_neg = -iter_mult * (quality >> 1);  // <= 0 for quality >= 0
+  // Limit the backward-ref window size for lower qualities.
+  const int max_window_size = (quality > 50) ? WINDOW_SIZE
+                            : (quality > 25) ? (xsize << 8)
+                            : (xsize << 4);  // i.e. 256 * xsize, resp. 16 * xsize
+  assert(xsize > 0);
+  *window_size = (max_window_size > WINDOW_SIZE) ? WINDOW_SIZE
+               : max_window_size;  // never larger than WINDOW_SIZE
+  *iter_pos = 8 + (quality >> 3);  // chain steps taken before early exit is considered
+  // For lower entropy images, the rigorous search loop in HashChainFindCopy
+  // can be relaxed.
+  *iter_limit = (cache_bits > 0) ? iter_neg : iter_neg / 2;  // halved when cache_bits <= 0
+}
+
 static int HashChainFindCopy(const HashChain* const p,
-                             int quality, int index, int xsize,
-                             const uint32_t* const argb, int maxlen,
+                             int base_position, int xsize_signed,
+                             const uint32_t* const argb, int max_len,
+                             int window_size, int iter_pos, int iter_limit,
                             int* const distance_ptr,
                             int* const length_ptr) {
-  const uint64_t hash_code = GetPixPairHash64(&argb[index]);
-  int prev_length = 0;
-  int64_t best_val = 0;
-  int best_length = 0;
-  int best_distance = 0;
-  const uint32_t* const argb_start = argb + index;
-  const int iter_min_mult = (quality < 50) ? 2 : (quality < 75) ? 4 : 8;
-  const int iter_min = -quality * iter_min_mult;
-  int iter_cnt = 10 + (quality >> 1);
-  const int min_pos = (index > WINDOW_SIZE) ? index - WINDOW_SIZE : 0;
+  const uint32_t* const argb_start = argb + base_position;
+  uint64_t best_val = 0;  // score of the best candidate so far
+  uint32_t best_length = 1;  // 1, not 0, so 'best_length - 1' below is a valid offset
+  uint32_t best_distance = 0;
+  const uint32_t xsize = (uint32_t)xsize_signed;  // NOTE(review): the assert below now tests this unsigned copy
+  const int min_pos =
+      (base_position > window_size) ? base_position - window_size : 0;  // oldest position in the window
  int pos;
-
  assert(xsize > 0);
-  for (pos = p->hash_to_first_index_[hash_code];
+  if (max_len > MAX_LENGTH) {  // clamp here instead of in each caller
+    max_len = MAX_LENGTH;
+  }
+  for (pos = p->hash_to_first_index_[GetPixPairHash64(argb_start)];
       pos >= min_pos;
       pos = p->chain_[pos]) {
-    int64_t val;
-    int curr_length;
-    if (iter_cnt < 0) {
-      if (iter_cnt < iter_min || best_val >= 0xff0000) {
+    uint64_t val;
+    uint32_t curr_length;
+    uint32_t distance;
+    const uint64_t* const ptr1 =
+        (const uint64_t*)(argb + pos + best_length - 1);  // NOTE(review): 8-byte view of uint32_t pixels (alignment/aliasing) - confirm
+    const uint64_t* const ptr2 =
+        (const uint64_t*)(argb_start + best_length - 1);  // same two-pixel view, at the position being matched
+
+    if (iter_pos < 0) {  // the unconditional step budget is used up
+      if (iter_pos < iter_limit || best_val >= 0xff0000) {
        break;
      }
    }
-    --iter_cnt;
-    if (best_length != 0 &&
-        argb[pos + best_length - 1] != argb_start[best_length - 1]) {
-      continue;
-    }
-    curr_length = FindMatchLength(argb + pos, argb_start, maxlen);
-    if (curr_length < prev_length) {
-      continue;
-    }
-    val = 65536 * curr_length;
+    --iter_pos;
+
+    // Before 'expensive' linear match, check if the two arrays match at the
+    // current best length index and also for the succeeding elements.
+    if (*ptr1 != *ptr2) continue;
+
+    curr_length = FindMatchLength(argb + pos, argb_start, max_len);
+    if (curr_length < best_length) continue;  // shorter than the current best
+
+    distance = (uint32_t)(base_position - pos);
+    val = curr_length << 16;  // the match length dominates the score
    // Favoring 2d locality here gives savings for certain images.
-    if (index - pos < 9 * xsize) {
-      const int y = (index - pos) / xsize;
-      int x = (index - pos) % xsize;
-      if (x > xsize / 2) {
+    if (distance < 9 * xsize) {
+      const uint32_t y = distance / xsize;
+      uint32_t x = distance % xsize;
+      if (x > (xsize >> 1)) {
        x = xsize - x;
      }
-      if (x <= 7 && x >= -8) {
+      if (x <= 7) {  // x is unsigned: no lower-bound test needed
+        val += 9 * 9 + 9 * 9;  // locality bonus, reduced just below by y*y + x*x
        val -= y * y + x * x;
-      } else {
-        val -= 9 * 9 + 9 * 9;
      }
-    } else {
-      val -= 9 * 9 + 9 * 9;
    }
    if (best_val < val) {
-      prev_length = curr_length;
      best_val = val;
      best_length = curr_length;
-      best_distance = index - pos;
-      if (curr_length >= MAX_LENGTH) {
+      best_distance = distance;
+      if (curr_length >= (uint32_t)max_len) {  // cannot get longer than max_len
        break;
      }
-      if ((best_distance == 1 || best_distance == xsize) &&
+      if ((best_distance == 1 || distance == xsize) &&
          best_length >= 128) {
        break;
      }
    }
  }
-  *distance_ptr = best_distance;
+  *distance_ptr = (int)best_distance;  // back to the caller's signed type
  *length_ptr = best_length;
  return (best_length >= MIN_LENGTH);
 }
@ -254,24 +280,29 @@ static int BackwardReferencesHashChain(int xsize, int ysize,
  const int pix_count = xsize * ysize;
  HashChain* const hash_chain = (HashChain*)malloc(sizeof(*hash_chain));
  VP8LColorCache hashers;
+  int window_size = WINDOW_SIZE;
+  int iter_pos = 1;
+  int iter_limit = -1;

  if (hash_chain == NULL) return 0;
+  if (use_color_cache) {
    cc_init = VP8LColorCacheInit(&hashers, cache_bits);
    if (!cc_init) goto Error;
+  }

  if (!HashChainInit(hash_chain, pix_count)) goto Error;

  refs->size = 0;
+  GetParamsForHashChainFindCopy(quality, xsize, cache_bits,
+                                &window_size, &iter_pos, &iter_limit);
  for (i = 0; i < pix_count; ) {
    // Alternative#1: Code the pixels starting at 'i' using backward reference.
    int offset = 0;
    int len = 0;
    if (i < pix_count - 1) {  // FindCopy(i,..) reads pixels at [i] and [i + 1].
-      int maxlen = pix_count - i;
-      if (maxlen > MAX_LENGTH) {
-        maxlen = MAX_LENGTH;
-      }
-      HashChainFindCopy(hash_chain, quality, i, xsize, argb, maxlen,
+      int max_len = pix_count - i;
+      HashChainFindCopy(hash_chain, i, xsize, argb, max_len,
+                        window_size, iter_pos, iter_limit,
                        &offset, &len);
    }
    if (len >= MIN_LENGTH) {
@ -282,22 +313,21 @@ static int BackwardReferencesHashChain(int xsize, int ysize,
      int k;
      HashChainInsert(hash_chain, &argb[i], i);
      if (i < pix_count - 2) {  // FindCopy(i+1,..) reads [i + 1] and [i + 2].
-        int maxlen = pix_count - (i + 1);
-        if (maxlen > MAX_LENGTH) {
-          maxlen = MAX_LENGTH;
-        }
-        HashChainFindCopy(hash_chain, quality,
-                          i + 1, xsize, argb, maxlen, &offset2, &len2);
+        int max_len = pix_count - (i + 1);
+        HashChainFindCopy(hash_chain, i + 1, xsize, argb, max_len,
+                          window_size, iter_pos, iter_limit,
+                          &offset2, &len2);
        if (len2 > len + 1) {
+          const uint32_t pixel = argb[i];
          // Alternative#2 is a better match. So push pixel at 'i' as literal.
-          if (use_color_cache && VP8LColorCacheContains(&hashers, argb[i])) {
-            const int ix = VP8LColorCacheGetIndex(&hashers, argb[i]);
+          if (use_color_cache && VP8LColorCacheContains(&hashers, pixel)) {
+            const int ix = VP8LColorCacheGetIndex(&hashers, pixel);
            refs->refs[refs->size] = PixOrCopyCreateCacheIdx(ix);
          } else {
-            refs->refs[refs->size] = PixOrCopyCreateLiteral(argb[i]);
+            if (use_color_cache) VP8LColorCacheInsert(&hashers, pixel);
+            refs->refs[refs->size] = PixOrCopyCreateLiteral(pixel);
          }
          ++refs->size;
-          VP8LColorCacheInsert(&hashers, argb[i]);
          i++;  // Backward reference to be done for next pixel.
          len = len2;
          offset = offset2;
@ -307,24 +337,30 @@ static int BackwardReferencesHashChain(int xsize, int ysize,
        len = MAX_LENGTH - 1;
      }
      refs->refs[refs->size++] = PixOrCopyCreateCopy(offset, len);
+      if (use_color_cache) {
        for (k = 0; k < len; ++k) {
          VP8LColorCacheInsert(&hashers, argb[i + k]);
-        if (k != 0 && i + k + 1 < pix_count) {
+        }
+      }
      // Add to the hash_chain (but cannot add the last pixel).
+      {
+        const int last = (len < pix_count - 1 - i) ? len : pix_count - 1 - i;
+        for (k = 1; k < last; ++k) {
          HashChainInsert(hash_chain, &argb[i + k], i + k);
        }
      }
      i += len;
    } else {
-      if (use_color_cache && VP8LColorCacheContains(&hashers, argb[i])) {
+      const uint32_t pixel = argb[i];
+      if (use_color_cache && VP8LColorCacheContains(&hashers, pixel)) {
        // push pixel as a PixOrCopyCreateCacheIdx pixel
-        int ix = VP8LColorCacheGetIndex(&hashers, argb[i]);
+        const int ix = VP8LColorCacheGetIndex(&hashers, pixel);
        refs->refs[refs->size] = PixOrCopyCreateCacheIdx(ix);
      } else {
-        refs->refs[refs->size] = PixOrCopyCreateLiteral(argb[i]);
+        if (use_color_cache) VP8LColorCacheInsert(&hashers, pixel);
+        refs->refs[refs->size] = PixOrCopyCreateLiteral(pixel);
      }
      ++refs->size;
-      VP8LColorCacheInsert(&hashers, argb[i]);
      if (i + 1 < pix_count) {
        HashChainInsert(hash_chain, &argb[i], i);
      }
@ -346,46 +382,65 @@ typedef struct {
  double literal_[PIX_OR_COPY_CODES_MAX];
  double blue_[VALUES_IN_BYTE];
  double distance_[NUM_DISTANCE_CODES];
-  int cache_bits_;
 } CostModel;

 static int BackwardReferencesTraceBackwards(
    int xsize, int ysize, int recursive_cost_model,
-    const uint32_t* const argb, int cache_bits, VP8LBackwardRefs* const refs);
+    const uint32_t* const argb, int quality, int cache_bits,
+    VP8LBackwardRefs* const refs);

-static int CostModelBuild(CostModel* const p, int xsize, int ysize,
+static void ConvertPopulationCountTableToBitEstimates(
+    int num_symbols, const int population_counts[], double output[]) {
+  int sum = 0;
+  int nonzeros = 0;
+  int i;
+  for (i = 0; i < num_symbols; ++i) {
+    sum += population_counts[i];
+    if (population_counts[i] > 0) {
+      ++nonzeros;
+    }
+  }
+  if (nonzeros <= 1) {
+    memset(output, 0, num_symbols * sizeof(*output));
+  } else {
+    const double logsum = VP8LFastLog2(sum);
+    for (i = 0; i < num_symbols; ++i) {
+      output[i] = logsum - VP8LFastLog2(population_counts[i]);
+    }
+  }
+}
+
+static int CostModelBuild(CostModel* const m, int xsize, int ysize,
                          int recursion_level, const uint32_t* const argb,
-                          int cache_bits) {
+                          int quality, int cache_bits) {
  int ok = 0;
  VP8LHistogram histo;
  VP8LBackwardRefs refs;

  if (!VP8LBackwardRefsAlloc(&refs, xsize * ysize)) goto Error;

-  p->cache_bits_ = cache_bits;
  if (recursion_level > 0) {
    if (!BackwardReferencesTraceBackwards(xsize, ysize, recursion_level - 1,
-                                          argb, cache_bits, &refs)) {
+                                          argb, quality, cache_bits, &refs)) {
      goto Error;
    }
  } else {
-    const int quality = 100;
    if (!BackwardReferencesHashChain(xsize, ysize, argb, cache_bits, quality,
                                     &refs)) {
      goto Error;
    }
  }
  VP8LHistogramCreate(&histo, &refs, cache_bits);
-  VP8LConvertPopulationCountTableToBitEstimates(
-      VP8LHistogramNumCodes(&histo), histo.literal_, p->literal_);
-  VP8LConvertPopulationCountTableToBitEstimates(
-      VALUES_IN_BYTE, histo.red_, p->red_);
-  VP8LConvertPopulationCountTableToBitEstimates(
-      VALUES_IN_BYTE, histo.blue_, p->blue_);
-  VP8LConvertPopulationCountTableToBitEstimates(
-      VALUES_IN_BYTE, histo.alpha_, p->alpha_);
-  VP8LConvertPopulationCountTableToBitEstimates(
-      NUM_DISTANCE_CODES, histo.distance_, p->distance_);
+  ConvertPopulationCountTableToBitEstimates(
+      VP8LHistogramNumCodes(&histo), histo.literal_, m->literal_);
+  ConvertPopulationCountTableToBitEstimates(
+      VALUES_IN_BYTE, histo.red_, m->red_);
+  ConvertPopulationCountTableToBitEstimates(
+      VALUES_IN_BYTE, histo.blue_, m->blue_);
+  ConvertPopulationCountTableToBitEstimates(
+      VALUES_IN_BYTE, histo.alpha_, m->alpha_);
+  ConvertPopulationCountTableToBitEstimates(
+      NUM_DISTANCE_CODES, histo.distance_, m->distance_);
  ok = 1;

 Error:
@ -393,63 +448,73 @@ static int CostModelBuild(CostModel* const p, int xsize, int ysize,
  return ok;
 }

-static WEBP_INLINE double GetLiteralCost(const CostModel* const p, uint32_t v) {
-  return p->alpha_[v >> 24] +
-      p->red_[(v >> 16) & 0xff] +
-      p->literal_[(v >> 8) & 0xff] +
-      p->blue_[v & 0xff];
+static WEBP_INLINE double GetLiteralCost(const CostModel* const m, uint32_t v) {
+  return m->alpha_[v >> 24] +
+         m->red_[(v >> 16) & 0xff] +
+         m->literal_[(v >> 8) & 0xff] +
+         m->blue_[v & 0xff];
 }

-static WEBP_INLINE double GetCacheCost(const CostModel* const p, uint32_t idx) {
+static WEBP_INLINE double GetCacheCost(const CostModel* const m, uint32_t idx) {
  const int literal_idx = VALUES_IN_BYTE + NUM_LENGTH_CODES + idx;
-  return p->literal_[literal_idx];
+  return m->literal_[literal_idx];
 }

-static WEBP_INLINE double GetLengthCost(const CostModel* const p,
+static WEBP_INLINE double GetLengthCost(const CostModel* const m,
                                        uint32_t length) {
-  int code, extra_bits_count, extra_bits_value;
-  PrefixEncode(length, &code, &extra_bits_count, &extra_bits_value);
-  return p->literal_[VALUES_IN_BYTE + code] + extra_bits_count;
+  int code, extra_bits;
+  VP8LPrefixEncodeBits(length, &code, &extra_bits);
+  return m->literal_[VALUES_IN_BYTE + code] + extra_bits;
 }

-static WEBP_INLINE double GetDistanceCost(const CostModel* const p,
+static WEBP_INLINE double GetDistanceCost(const CostModel* const m,
                                          uint32_t distance) {
-  int code, extra_bits_count, extra_bits_value;
-  PrefixEncode(distance, &code, &extra_bits_count, &extra_bits_value);
-  return p->distance_[code] + extra_bits_count;
+  int code, extra_bits;
+  VP8LPrefixEncodeBits(distance, &code, &extra_bits);
+  return m->distance_[code] + extra_bits;
 }

 static int BackwardReferencesHashChainDistanceOnly(
    int xsize, int ysize, int recursive_cost_model, const uint32_t* const argb,
-    int cache_bits, uint32_t* const dist_array) {
+    int quality, int cache_bits, uint32_t* const dist_array) {
  int i;
  int ok = 0;
  int cc_init = 0;
-  const int quality = 100;
  const int pix_count = xsize * ysize;
  const int use_color_cache = (cache_bits > 0);
-  double* const cost = (double*)malloc(pix_count * sizeof(*cost));
+  float* const cost =
+      (float*)WebPSafeMalloc((uint64_t)pix_count, sizeof(*cost));
  CostModel* cost_model = (CostModel*)malloc(sizeof(*cost_model));
  HashChain* hash_chain = (HashChain*)malloc(sizeof(*hash_chain));
  VP8LColorCache hashers;
  const double mul0 = (recursive_cost_model != 0) ? 1.0 : 0.68;
  const double mul1 = (recursive_cost_model != 0) ? 1.0 : 0.82;
+  const int min_distance_code = 2;  // TODO(vikasa): tune as function of quality
+  int window_size = WINDOW_SIZE;
+  int iter_pos = 1;
+  int iter_limit = -1;

  if (cost == NULL || cost_model == NULL || hash_chain == NULL) goto Error;

+  if (!HashChainInit(hash_chain, pix_count)) goto Error;
+
+  if (use_color_cache) {
    cc_init = VP8LColorCacheInit(&hashers, cache_bits);
-  if (!cc_init || !HashChainInit(hash_chain, pix_count)) goto Error;
+    if (!cc_init) goto Error;
+  }

  if (!CostModelBuild(cost_model, xsize, ysize, recursive_cost_model, argb,
-                      cache_bits)) {
+                      quality, cache_bits)) {
    goto Error;
  }

-  for (i = 0; i < pix_count; ++i) cost[i] = 1e100;
+  for (i = 0; i < pix_count; ++i) cost[i] = 1e38f;

  // We loop one pixel at a time, but store all currently best points to
  // non-processed locations from this point.
  dist_array[0] = 0;
+  GetParamsForHashChainFindCopy(quality, xsize, cache_bits,
+                                &window_size, &iter_pos, &iter_limit);
  for (i = 0; i < pix_count; ++i) {
    double prev_cost = 0.0;
    int shortmax;
@ -460,11 +525,9 @@ static int BackwardReferencesHashChainDistanceOnly(
      int offset = 0;
      int len = 0;
      if (i < pix_count - 1) {  // FindCopy reads pixels at [i] and [i + 1].
-        int maxlen = shortmax ? 2 : MAX_LENGTH;
-        if (maxlen > pix_count - i) {
-          maxlen = pix_count - i;
-        }
-        HashChainFindCopy(hash_chain, quality, i, xsize, argb, maxlen,
+        int max_len = shortmax ? 2 : pix_count - i;
+        HashChainFindCopy(hash_chain, i, xsize, argb, max_len,
+                          window_size, iter_pos, iter_limit,
                          &offset, &len);
      }
      if (len >= MIN_LENGTH) {
@ -473,27 +536,32 @@ static int BackwardReferencesHashChainDistanceOnly(
            prev_cost + GetDistanceCost(cost_model, code);
        int k;
        for (k = 1; k < len; ++k) {
-          const double cost_val =
-              distance_cost + GetLengthCost(cost_model, k);
+          const double cost_val = distance_cost + GetLengthCost(cost_model, k);
          if (cost[i + k] > cost_val) {
-            cost[i + k] = cost_val;
+            cost[i + k] = (float)cost_val;
            dist_array[i + k] = k + 1;
          }
        }
        // This if is for speedup only. It roughly doubles the speed, and
        // makes compression worse by .1 %.
-        if (len >= 128 && code < 2) {
+        if (len >= 128 && code <= min_distance_code) {
          // Long copy for short distances, let's skip the middle
          // lookups for better copies.
          // 1) insert the hashes.
+          if (use_color_cache) {
            for (k = 0; k < len; ++k) {
              VP8LColorCacheInsert(&hashers, argb[i + k]);
-            if (i + k + 1 < pix_count) {
-              // Add to the hash_chain (but cannot add the last pixel).
-              HashChainInsert(hash_chain, &argb[i + k], i + k);
            }
          }
-          // 2) jump.
+          // 2) Add to the hash_chain (but cannot add the last pixel)
+          {
+            const int last = (len + i < pix_count - 1) ? len + i
+                                                       : pix_count - 1;
+            for (k = i; k < last; ++k) {
+              HashChainInsert(hash_chain, &argb[k], k);
+            }
+          }
+          // 3) jump.
          i += len - 1;  // for loop does ++i, thus -1 here.
          goto next_symbol;
        }
@ -509,13 +577,13 @@ static int BackwardReferencesHashChainDistanceOnly(
        const int ix = VP8LColorCacheGetIndex(&hashers, argb[i]);
        cost_val += GetCacheCost(cost_model, ix) * mul0;
      } else {
+        if (use_color_cache) VP8LColorCacheInsert(&hashers, argb[i]);
        cost_val += GetLiteralCost(cost_model, argb[i]) * mul1;
      }
      if (cost[i] > cost_val) {
-        cost[i] = cost_val;
+        cost[i] = (float)cost_val;
        dist_array[i] = 1;  // only one is inserted.
      }
-      VP8LColorCacheInsert(&hashers, argb[i]);
    }
 next_symbol: ;
  }
@ -530,39 +598,30 @@ Error:
  return ok;
 }

-static int TraceBackwards(const uint32_t* const dist_array,
+// We pack the path at the end of *dist_array and return
+// a pointer to this part of the array. Example:
+// dist_array = [1x2xx3x2] => packed [1x2x1232], chosen_path = [1232]
+static void TraceBackwards(uint32_t* const dist_array,
                           int dist_array_size,
                           uint32_t** const chosen_path,
                           int* const chosen_path_size) {
-  int i;
-  // Count how many.
-  int count = 0;
-  for (i = dist_array_size - 1; i >= 0; ) {
-    int k = dist_array[i];
-    assert(k >= 1);
-    ++count;
-    i -= k;
+  uint32_t* path = dist_array + dist_array_size;
+  uint32_t* cur = dist_array + dist_array_size - 1;
+  while (cur >= dist_array) {
+    const int k = *cur;
+    --path;
+    *path = k;
+    cur -= k;
  }
-  // Allocate.
-  *chosen_path_size = count;
-  *chosen_path = (uint32_t*)malloc(count * sizeof(*chosen_path));
-  if (*chosen_path == NULL) return 0;
-
-  // Write in reverse order.
-  for (i = dist_array_size - 1; i >= 0; ) {
-    int k = dist_array[i];
-    assert(k >= 1);
-    (*chosen_path)[--count] = k;
-    i -= k;
-  }
-  return 1;
+  *chosen_path = path;
+  *chosen_path_size = (int)(dist_array + dist_array_size - path);
 }

 static int BackwardReferencesHashChainFollowChosenPath(
-    int xsize, int ysize, const uint32_t* const argb, int cache_bits,
+    int xsize, int ysize, const uint32_t* const argb,
+    int quality, int cache_bits,
    const uint32_t* const chosen_path, int chosen_path_size,
    VP8LBackwardRefs* const refs) {
-  const int quality = 100;
  const int pix_count = xsize * ysize;
  const int use_color_cache = (cache_bits > 0);
  int size = 0;
@ -571,29 +630,41 @@ static int BackwardReferencesHashChainFollowChosenPath(
  int ix;
  int ok = 0;
  int cc_init = 0;
+  int window_size = WINDOW_SIZE;
+  int iter_pos = 1;
+  int iter_limit = -1;
  HashChain* hash_chain = (HashChain*)malloc(sizeof(*hash_chain));
  VP8LColorCache hashers;

-  if (hash_chain == NULL ||
-      !(cc_init = VP8LColorCacheInit(&hashers, cache_bits)) ||
-      !HashChainInit(hash_chain, pix_count)) {
+  if (hash_chain == NULL || !HashChainInit(hash_chain, pix_count)) {
    goto Error;
  }
+  if (use_color_cache) {
+    cc_init = VP8LColorCacheInit(&hashers, cache_bits);
+    if (!cc_init) goto Error;
+  }

  refs->size = 0;
+  GetParamsForHashChainFindCopy(quality, xsize, cache_bits,
+                                &window_size, &iter_pos, &iter_limit);
  for (ix = 0; ix < chosen_path_size; ++ix, ++size) {
    int offset = 0;
    int len = 0;
-    int maxlen = chosen_path[ix];
-    if (maxlen != 1) {
-      HashChainFindCopy(hash_chain, quality,
-                        i, xsize, argb, maxlen, &offset, &len);
-      assert(len == maxlen);
+    int max_len = chosen_path[ix];
+    if (max_len != 1) {
+      HashChainFindCopy(hash_chain, i, xsize, argb, max_len,
+                        window_size, iter_pos, iter_limit,
+                        &offset, &len);
+      assert(len == max_len);
      refs->refs[size] = PixOrCopyCreateCopy(offset, len);
+      if (use_color_cache) {
        for (k = 0; k < len; ++k) {
          VP8LColorCacheInsert(&hashers, argb[i + k]);
-        if (i + k + 1 < pix_count) {
-          // Add to the hash_chain (but cannot add the last pixel).
+        }
+      }
+      {
+        const int last = (len < pix_count - 1 - i) ? len : pix_count - 1 - i;
+        for (k = 0; k < last; ++k) {
          HashChainInsert(hash_chain, &argb[i + k], i + k);
        }
      }
@ -604,9 +675,9 @@ static int BackwardReferencesHashChainFollowChosenPath(
        const int idx = VP8LColorCacheGetIndex(&hashers, argb[i]);
        refs->refs[size] = PixOrCopyCreateCacheIdx(idx);
      } else {
+        if (use_color_cache) VP8LColorCacheInsert(&hashers, argb[i]);
        refs->refs[size] = PixOrCopyCreateLiteral(argb[i]);
      }
-      VP8LColorCacheInsert(&hashers, argb[i]);
      if (i + 1 < pix_count) {
        HashChainInsert(hash_chain, &argb[i], i);
      }
@ -626,34 +697,30 @@ Error:
 static int BackwardReferencesTraceBackwards(int xsize, int ysize,
                                            int recursive_cost_model,
                                            const uint32_t* const argb,
-                                            int cache_bits,
+                                            int quality, int cache_bits,
                                            VP8LBackwardRefs* const refs) {
  int ok = 0;
  const int dist_array_size = xsize * ysize;
  uint32_t* chosen_path = NULL;
  int chosen_path_size = 0;
  uint32_t* dist_array =
-      (uint32_t*)malloc(dist_array_size * sizeof(*dist_array));
+      (uint32_t*)WebPSafeMalloc((uint64_t)dist_array_size, sizeof(*dist_array));

  if (dist_array == NULL) goto Error;

  if (!BackwardReferencesHashChainDistanceOnly(
-      xsize, ysize, recursive_cost_model, argb, cache_bits, dist_array)) {
+      xsize, ysize, recursive_cost_model, argb, quality, cache_bits,
+      dist_array)) {
    goto Error;
  }
-  if (!TraceBackwards(dist_array, dist_array_size,
-                      &chosen_path, &chosen_path_size)) {
-    goto Error;
-  }
-  free(dist_array);   // no need to retain this memory any longer
-  dist_array = NULL;
+  TraceBackwards(dist_array, dist_array_size, &chosen_path, &chosen_path_size);
  if (!BackwardReferencesHashChainFollowChosenPath(
-      xsize, ysize, argb, cache_bits, chosen_path, chosen_path_size, refs)) {
+      xsize, ysize, argb, quality, cache_bits, chosen_path, chosen_path_size,
+      refs)) {
    goto Error;
  }
  ok = 1;
 Error:
-  free(chosen_path);
  free(dist_array);
  return ok;
 }
@ -713,18 +780,20 @@ int VP8LGetBackwardReferences(int width, int height,

  // Choose appropriate backward reference.
  if (lz77_is_useful) {
-    // TraceBackwards is costly. Run it for higher qualities.
-    const int try_lz77_trace_backwards = (quality >= 75);
+    // TraceBackwards is costly. Don't execute it at lower quality.
+    const int try_lz77_trace_backwards = (quality >= 25);
    *best = refs_lz77;   // default guess: lz77 is better
    VP8LClearBackwardRefs(&refs_rle);
    if (try_lz77_trace_backwards) {
-      const int recursion_level = (num_pix < 320 * 200) ? 1 : 0;
+      // Use a recursive cost model only for small images with a color cache.
+      const int recursion_level =
+          (num_pix < 320 * 200) && (cache_bits > 0) ? 1 : 0;
      VP8LBackwardRefs refs_trace;
      if (!VP8LBackwardRefsAlloc(&refs_trace, num_pix)) {
        goto End;
      }
-      if (BackwardReferencesTraceBackwards(
-          width, height, recursion_level, argb, cache_bits, &refs_trace)) {
+      if (BackwardReferencesTraceBackwards(width, height, recursion_level, argb,
+                                           quality, cache_bits, &refs_trace)) {
        VP8LClearBackwardRefs(&refs_lz77);
        *best = refs_trace;
      }
@ -755,13 +824,18 @@ static int ComputeCacheHistogram(const uint32_t* const argb,
  int i;
  uint32_t k;
  VP8LColorCache hashers;
+  const int use_color_cache = (cache_bits > 0);
+  int cc_init = 0;

-  if (!VP8LColorCacheInit(&hashers, cache_bits)) return 0;
+  if (use_color_cache) {
+    cc_init = VP8LColorCacheInit(&hashers, cache_bits);
+    if (!cc_init) return 0;
+  }

  for (i = 0; i < refs->size; ++i) {
    const PixOrCopy* const v = &refs->refs[i];
    if (PixOrCopyIsLiteral(v)) {
-      if (cache_bits != 0 &&
+      if (use_color_cache &&
          VP8LColorCacheContains(&hashers, argb[pixel_index])) {
        // push pixel as a cache index
        const int ix = VP8LColorCacheGetIndex(&hashers, argb[pixel_index]);
@ -773,15 +847,17 @@ static int ComputeCacheHistogram(const uint32_t* const argb,
    } else {
      VP8LHistogramAddSinglePixOrCopy(histo, v);
    }
+    if (use_color_cache) {
      for (k = 0; k < PixOrCopyLength(v); ++k) {
-      VP8LColorCacheInsert(&hashers, argb[pixel_index]);
-      ++pixel_index;
+        VP8LColorCacheInsert(&hashers, argb[pixel_index + k]);
      }
    }
+    pixel_index += PixOrCopyLength(v);
+  }
  assert(pixel_index == xsize * ysize);
  (void)xsize;  // xsize is not used in non-debug compilations otherwise.
  (void)ysize;  // ysize is not used in non-debug compilations otherwise.
-  VP8LColorCacheClear(&hashers);
+  if (cc_init) VP8LColorCacheClear(&hashers);
  return 1;
 }

--- a/src/enc/backward_references.h
+++ b/src/enc/backward_references.h
@ -1,8 +1,10 @@
 // Copyright 2012 Google Inc. All Rights Reserved.
 //
-// This code is licensed under the same terms as WebM:
-//  Software License Agreement:  http://www.webmproject.org/license/software/
-//  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
 // -----------------------------------------------------------------------------
 //
 // Author: Jyrki Alakuijala (jyrki@google.com)
@ -16,7 +18,7 @@
 #include "../webp/types.h"
 #include "../webp/format_constants.h"

-#if defined(__cplusplus) || defined(c_plusplus)
+#ifdef __cplusplus
 extern "C" {
 #endif

@ -28,68 +30,6 @@ extern "C" {
 #define PIX_OR_COPY_CODES_MAX \
    (NUM_LITERAL_CODES + NUM_LENGTH_CODES + (1 << MAX_COLOR_CACHE_BITS))

-// -----------------------------------------------------------------------------
-// PrefixEncode()
-
-// use GNU builtins where available.
-#if defined(__GNUC__) && \
-    ((__GNUC__ == 3 && __GNUC_MINOR__ >= 4) || __GNUC__ >= 4)
-static WEBP_INLINE int BitsLog2Floor(uint32_t n) {
-  return n == 0 ? -1 : 31 ^ __builtin_clz(n);
-}
-#elif defined(_MSC_VER) && (defined(_M_X64) || defined(_M_IX86))
-#include <intrin.h>
-#pragma intrinsic(_BitScanReverse)
-
-static WEBP_INLINE int BitsLog2Floor(uint32_t n) {
-  unsigned long first_set_bit;
-  return _BitScanReverse(&first_set_bit, n) ? first_set_bit : -1;
-}
-#else
-static WEBP_INLINE int BitsLog2Floor(uint32_t n) {
-  int log = 0;
-  uint32_t value = n;
-  int i;
-
-  if (value == 0) return -1;
-  for (i = 4; i >= 0; --i) {
-    const int shift = (1 << i);
-    const uint32_t x = value >> shift;
-    if (x != 0) {
-      value = x;
-      log += shift;
-    }
-  }
-  return log;
-}
-#endif
-
-static WEBP_INLINE int VP8LBitsLog2Ceiling(uint32_t n) {
-  const int floor = BitsLog2Floor(n);
-  if (n == (n & ~(n - 1)))  // zero or a power of two.
-    return floor;
-  else
-    return floor + 1;
-}
-
-// Splitting of distance and length codes into prefixes and
-// extra bits. The prefixes are encoded with an entropy code
-// while the extra bits are stored just as normal bits.
-static WEBP_INLINE void PrefixEncode(int distance, int* const code,
-                                     int* const extra_bits_count,
-                                     int* const extra_bits_value) {
-  // Collect the two most significant bits where the highest bit is 1.
-  const int highest_bit = BitsLog2Floor(--distance);
-  // & 0x3f is to make behavior well defined when highest_bit
-  // does not exist or is the least significant bit.
-  const int second_highest_bit =
-      (distance >> ((highest_bit - 1) & 0x3f)) & 1;
-  *extra_bits_count = (highest_bit > 0) ? (highest_bit - 1) : 0;
-  *extra_bits_value = distance & ((1 << *extra_bits_count) - 1);
-  *code = (highest_bit > 0) ? (2 * highest_bit + second_highest_bit)
-                            : (highest_bit == 0) ? 1 : 0;
-}
-
 // -----------------------------------------------------------------------------
 // PixOrCopy

@ -205,7 +145,7 @@ int VP8LCalculateEstimateForCacheSize(const uint32_t* const argb,
                                      int xsize, int ysize,
                                      int* const best_cache_bits);

-#if defined(__cplusplus) || defined(c_plusplus)
+#ifdef __cplusplus
 }
 #endif

--- a/src/enc/config.c
+++ b/src/enc/config.c
@ -1,8 +1,10 @@
 // Copyright 2011 Google Inc. All Rights Reserved.
 //
-// This code is licensed under the same terms as WebM:
-//  Software License Agreement:  http://www.webmproject.org/license/software/
-//  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
 // -----------------------------------------------------------------------------
 //
 // Coding tools configuration
@ -11,10 +13,6 @@

 #include "../webp/encode.h"

-#if defined(__cplusplus) || defined(c_plusplus)
-extern "C" {
-#endif
-
 //------------------------------------------------------------------------------
 // WebPConfig
 //------------------------------------------------------------------------------
@ -31,9 +29,9 @@ int WebPConfigInitInternal(WebPConfig* config,
  config->target_PSNR = 0.;
  config->method = 4;
  config->sns_strength = 50;
-  config->filter_strength = 20;   // default: light filtering
+  config->filter_strength = 60;   // mid-filtering
  config->filter_sharpness = 0;
-  config->filter_type = 0;        // default: simple
+  config->filter_type = 1;        // default: strong (so U/V is filtered too)
  config->partitions = 0;
  config->segments = 4;
  config->pass = 1;
@ -46,6 +44,9 @@ int WebPConfigInitInternal(WebPConfig* config,
  config->alpha_quality = 100;
  config->lossless = 0;
  config->image_hint = WEBP_HINT_DEFAULT;
+  config->emulate_jpeg_size = 0;
+  config->thread_level = 0;
+  config->low_memory = 0;

  // TODO(skal): tune.
  switch (preset) {
@ -53,11 +54,13 @@ int WebPConfigInitInternal(WebPConfig* config,
      config->sns_strength = 80;
      config->filter_sharpness = 4;
      config->filter_strength = 35;
+      config->preprocessing &= ~2;   // no dithering
      break;
    case WEBP_PRESET_PHOTO:
      config->sns_strength = 80;
      config->filter_sharpness = 3;
      config->filter_strength = 30;
+      config->preprocessing |= 2;
      break;
    case WEBP_PRESET_DRAWING:
      config->sns_strength = 25;
@ -67,10 +70,12 @@ int WebPConfigInitInternal(WebPConfig* config,
    case WEBP_PRESET_ICON:
      config->sns_strength = 0;
      config->filter_strength = 0;   // disable filtering to retain sharpness
+      config->preprocessing &= ~2;   // no dithering
      break;
    case WEBP_PRESET_TEXT:
      config->sns_strength = 0;
      config->filter_strength = 0;   // disable filtering to retain sharpness
+      config->preprocessing &= ~2;   // no dithering
      config->segments = 2;
      break;
    case WEBP_PRESET_DEFAULT:
@ -106,7 +111,7 @@ int WebPValidateConfig(const WebPConfig* config) {
    return 0;
  if (config->show_compressed < 0 || config->show_compressed > 1)
    return 0;
-  if (config->preprocessing < 0 || config->preprocessing > 1)
+  if (config->preprocessing < 0 || config->preprocessing > 3)
    return 0;
  if (config->partitions < 0 || config->partitions > 3)
    return 0;
@ -120,13 +125,16 @@ int WebPValidateConfig(const WebPConfig* config) {
    return 0;
  if (config->lossless < 0 || config->lossless > 1)
    return 0;
-  if (config->image_hint > WEBP_HINT_PHOTO)
+  if (config->image_hint >= WEBP_HINT_LAST)
+    return 0;
+  if (config->emulate_jpeg_size < 0 || config->emulate_jpeg_size > 1)
+    return 0;
+  if (config->thread_level < 0 || config->thread_level > 1)
+    return 0;
+  if (config->low_memory < 0 || config->low_memory > 1)
    return 0;
  return 1;
 }

 //------------------------------------------------------------------------------

-#if defined(__cplusplus) || defined(c_plusplus)
-}    // extern "C"
-#endif
--- a/src/enc/cost.c
+++ b/src/enc/cost.c
@ -1,8 +1,10 @@
 // Copyright 2011 Google Inc. All Rights Reserved.
 //
-// This code is licensed under the same terms as WebM:
-//  Software License Agreement:  http://www.webmproject.org/license/software/
-//  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
 // -----------------------------------------------------------------------------
 //
 // Cost tables for level and modes
@ -11,10 +13,6 @@

 #include "./cost.h"

-#if defined(__cplusplus) || defined(c_plusplus)
-extern "C" {
-#endif
-
 //------------------------------------------------------------------------------
 // Boolean-cost cost table

@ -75,7 +73,7 @@ const uint16_t VP8LevelCodes[MAX_VARIABLE_LEVEL][2] = {

 // fixed costs for coding levels, deduced from the coding tree.
 // This is only the part that doesn't depend on the probability state.
-const uint16_t VP8LevelFixedCosts[2048] = {
+const uint16_t VP8LevelFixedCosts[MAX_LEVEL + 1] = {
     0,  256,  256,  256,  256,  432,  618,  630,
   731,  640,  640,  828,  901,  948, 1021, 1101,
  1174, 1221, 1294, 1042, 1085, 1115, 1158, 1202,
@ -385,110 +383,107 @@ const uint16_t VP8FixedCostsUV[4] = { 302, 984, 439, 642 };
 // note: these values include the fixed VP8BitCost(1, 145) mode selection cost.
 const uint16_t VP8FixedCostsI16[4] = { 663, 919, 872, 919 };
 const uint16_t VP8FixedCostsI4[NUM_BMODES][NUM_BMODES][NUM_BMODES] = {
-  { {  251, 1362, 1934, 2085, 2314, 2230, 1839, 1988, 2437, 2348 },
-    {  403,  680, 1507, 1519, 2060, 2005, 1992, 1914, 1924, 1733 },
-    {  353, 1121,  973, 1895, 2060, 1787, 1671, 1516, 2012, 1868 },
-    {  770,  852, 1581,  632, 1393, 1780, 1823, 1936, 1074, 1218 },
-    {  510, 1270, 1467, 1319,  847, 1279, 1792, 2094, 1080, 1353 },
-    {  488, 1322,  918, 1573, 1300,  883, 1814, 1752, 1756, 1502 },
-    {  425,  992, 1820, 1514, 1843, 2440,  937, 1771, 1924, 1129 },
-    {  363, 1248, 1257, 1970, 2194, 2385, 1569,  953, 1951, 1601 },
-    {  723, 1257, 1631,  964,  963, 1508, 1697, 1824,  671, 1418 },
-    {  635, 1038, 1573,  930, 1673, 1413, 1410, 1687, 1410,  749 } },
-  { {  451,  613, 1345, 1702, 1870, 1716, 1728, 1766, 2190, 2310 },
-    {  678,  453, 1171, 1443, 1925, 1831, 2045, 1781, 1887, 1602 },
-    {  711,  666,  674, 1718, 1910, 1493, 1775, 1193, 2325, 2325 },
-    {  883,  854, 1583,  542, 1800, 1878, 1664, 2149, 1207, 1087 },
-    {  669,  994, 1248, 1122,  949, 1179, 1376, 1729, 1070, 1244 },
-    {  715, 1026,  715, 1350, 1430,  930, 1717, 1296, 1479, 1479 },
-    {  544,  841, 1656, 1450, 2094, 3883, 1010, 1759, 2076,  809 },
-    {  610,  855,  957, 1553, 2067, 1561, 1704,  824, 2066, 1226 },
-    {  833,  960, 1416,  819, 1277, 1619, 1501, 1617,  757, 1182 },
-    {  711,  964, 1252,  879, 1441, 1828, 1508, 1636, 1594,  734 } },
-  { {  605,  764,  734, 1713, 1747, 1192, 1819, 1353, 1877, 2392 },
-    {  866,  641,  586, 1622, 2072, 1431, 1888, 1346, 2189, 1764 },
-    {  901,  851,  456, 2165, 2281, 1405, 1739, 1193, 2183, 2443 },
-    {  770, 1045,  952, 1078, 1342, 1191, 1436, 1063, 1303,  995 },
-    {  901, 1086,  727, 1170,  884, 1105, 1267, 1401, 1739, 1337 },
-    {  951, 1162,  595, 1488, 1388,  703, 1790, 1366, 2057, 1724 },
-    {  534,  986, 1273, 1987, 3273, 1485, 1024, 1399, 1583,  866 },
-    {  699, 1182,  695, 1978, 1726, 1986, 1326,  714, 1750, 1672 },
-    {  951, 1217, 1209,  920, 1062, 1441, 1548,  999,  952,  932 },
-    {  733, 1284,  784, 1256, 1557, 1098, 1257, 1357, 1414,  908 } },
-  { {  316, 1075, 1653, 1220, 2145, 2051, 1730, 2131, 1884, 1790 },
-    {  745,  516, 1404,  894, 1599, 2375, 2013, 2105, 1475, 1381 },
-    {  516,  729, 1088, 1319, 1637, 3426, 1636, 1275, 1531, 1453 },
-    {  894,  943, 2138,  468, 1704, 2259, 2069, 1763, 1266, 1158 },
-    {  605, 1025, 1235,  871, 1170, 1767, 1493, 1500, 1104, 1258 },
-    {  739,  826, 1207, 1151, 1412,  846, 1305, 2726, 1014, 1569 },
-    {  558,  825, 1820, 1398, 3344, 1556, 1218, 1550, 1228,  878 },
-    {  429,  951, 1089, 1816, 3861, 3861, 1556,  969, 1568, 1828 },
-    {  883,  961, 1752,  769, 1468, 1810, 2081, 2346,  613, 1298 },
-    {  803,  895, 1372,  641, 1303, 1708, 1686, 1700, 1306, 1033 } },
-  { {  439, 1267, 1270, 1579,  963, 1193, 1723, 1729, 1198, 1993 },
-    {  705,  725, 1029, 1153, 1176, 1103, 1821, 1567, 1259, 1574 },
-    {  723,  859,  802, 1253,  972, 1202, 1407, 1665, 1520, 1674 },
-    {  894,  960, 1254,  887, 1052, 1607, 1344, 1349,  865, 1150 },
-    {  833, 1312, 1337, 1205,  572, 1288, 1414, 1529, 1088, 1430 },
-    {  842, 1279, 1068, 1861,  862,  688, 1861, 1630, 1039, 1381 },
-    {  766,  938, 1279, 1546, 3338, 1550, 1031, 1542, 1288,  640 },
-    {  715, 1090,  835, 1609, 1100, 1100, 1603, 1019, 1102, 1617 },
-    {  894, 1813, 1500, 1188,  789, 1194, 1491, 1919,  617, 1333 },
-    {  610, 1076, 1644, 1281, 1283,  975, 1179, 1688, 1434,  889 } },
-  { {  544,  971, 1146, 1849, 1221,  740, 1857, 1621, 1683, 2430 },
-    {  723,  705,  961, 1371, 1426,  821, 2081, 2079, 1839, 1380 },
-    {  783,  857,  703, 2145, 1419,  814, 1791, 1310, 1609, 2206 },
-    {  997, 1000, 1153,  792, 1229, 1162, 1810, 1418,  942,  979 },
-    {  901, 1226,  883, 1289,  793,  715, 1904, 1649, 1319, 3108 },
-    {  979, 1478,  782, 2216, 1454,  455, 3092, 1591, 1997, 1664 },
-    {  663, 1110, 1504, 1114, 1522, 3311,  676, 1522, 1530, 1024 },
-    {  605, 1138, 1153, 1314, 1569, 1315, 1157,  804, 1574, 1320 },
-    {  770, 1216, 1218, 1227,  869, 1384, 1232, 1375,  834, 1239 },
-    {  775, 1007,  843, 1216, 1225, 1074, 2527, 1479, 1149,  975 } },
-  { {  477,  817, 1309, 1439, 1708, 1454, 1159, 1241, 1945, 1672 },
-    {  577,  796, 1112, 1271, 1618, 1458, 1087, 1345, 1831, 1265 },
-    {  663,  776,  753, 1940, 1690, 1690, 1227, 1097, 3149, 1361 },
-    {  766, 1299, 1744, 1161, 1565, 1106, 1045, 1230, 1232,  707 },
-    {  915, 1026, 1404, 1182, 1184,  851, 1428, 2425, 1043,  789 },
-    {  883, 1456,  790, 1082, 1086,  985, 1083, 1484, 1238, 1160 },
-    {  507, 1345, 2261, 1995, 1847, 3636,  653, 1761, 2287,  933 },
-    {  553, 1193, 1470, 2057, 2059, 2059,  833,  779, 2058, 1263 },
-    {  766, 1275, 1515, 1039,  957, 1554, 1286, 1540, 1289,  705 },
-    {  499, 1378, 1496, 1385, 1850, 1850, 1044, 2465, 1515,  720 } },
-  { {  553,  930,  978, 2077, 1968, 1481, 1457,  761, 1957, 2362 },
-    {  694,  864,  905, 1720, 1670, 1621, 1429,  718, 2125, 1477 },
-    {  699,  968,  658, 3190, 2024, 1479, 1865,  750, 2060, 2320 },
-    {  733, 1308, 1296, 1062, 1576, 1322, 1062, 1112, 1172,  816 },
-    {  920,  927, 1052,  939,  947, 1156, 1152, 1073, 3056, 1268 },
-    {  723, 1534,  711, 1547, 1294,  892, 1553,  928, 1815, 1561 },
-    {  663, 1366, 1583, 2111, 1712, 3501,  522, 1155, 2130, 1133 },
-    {  614, 1731, 1188, 2343, 1944, 3733, 1287,  487, 3546, 1758 },
-    {  770, 1585, 1312,  826,  884, 2673, 1185, 1006, 1195, 1195 },
-    {  758, 1333, 1273, 1023, 1621, 1162, 1351,  833, 1479,  862 } },
-  { {  376, 1193, 1446, 1149, 1545, 1577, 1870, 1789, 1175, 1823 },
-    {  803,  633, 1136, 1058, 1350, 1323, 1598, 2247, 1072, 1252 },
-    {  614, 1048,  943,  981, 1152, 1869, 1461, 1020, 1618, 1618 },
-    { 1107, 1085, 1282,  592, 1779, 1933, 1648, 2403,  691, 1246 },
-    {  851, 1309, 1223, 1243,  895, 1593, 1792, 2317,  627, 1076 },
-    {  770, 1216, 1030, 1125,  921,  981, 1629, 1131, 1049, 1646 },
-    {  626, 1469, 1456, 1081, 1489, 3278,  981, 1232, 1498,  733 },
-    {  617, 1201,  812, 1220, 1476, 1476, 1478,  970, 1228, 1488 },
-    { 1179, 1393, 1540,  999, 1243, 1503, 1916, 1925,  414, 1614 },
-    {  943, 1088, 1490,  682, 1112, 1372, 1756, 1505,  966,  966 } },
-  { {  322, 1142, 1589, 1396, 2144, 1859, 1359, 1925, 2084, 1518 },
-    {  617,  625, 1241, 1234, 2121, 1615, 1524, 1858, 1720, 1004 },
-    {  553,  851,  786, 1299, 1452, 1560, 1372, 1561, 1967, 1713 },
-    {  770,  977, 1396,  568, 1893, 1639, 1540, 2108, 1430, 1013 },
-    {  684, 1120, 1375,  982,  930, 2719, 1638, 1643,  933,  993 },
-    {  553, 1103,  996, 1356, 1361, 1005, 1507, 1761, 1184, 1268 },
-    {  419, 1247, 1537, 1554, 1817, 3606, 1026, 1666, 1829,  923 },
-    {  439, 1139, 1101, 1257, 3710, 1922, 1205, 1040, 1931, 1529 },
-    {  979,  935, 1269,  847, 1202, 1286, 1530, 1535,  827, 1036 },
-    {  516, 1378, 1569, 1110, 1798, 1798, 1198, 2199, 1543,  712 } },
+  { {   40, 1151, 1723, 1874, 2103, 2019, 1628, 1777, 2226, 2137 },
+    {  192,  469, 1296, 1308, 1849, 1794, 1781, 1703, 1713, 1522 },
+    {  142,  910,  762, 1684, 1849, 1576, 1460, 1305, 1801, 1657 },
+    {  559,  641, 1370,  421, 1182, 1569, 1612, 1725,  863, 1007 },
+    {  299, 1059, 1256, 1108,  636, 1068, 1581, 1883,  869, 1142 },
+    {  277, 1111,  707, 1362, 1089,  672, 1603, 1541, 1545, 1291 },
+    {  214,  781, 1609, 1303, 1632, 2229,  726, 1560, 1713,  918 },
+    {  152, 1037, 1046, 1759, 1983, 2174, 1358,  742, 1740, 1390 },
+    {  512, 1046, 1420,  753,  752, 1297, 1486, 1613,  460, 1207 },
+    {  424,  827, 1362,  719, 1462, 1202, 1199, 1476, 1199,  538 } },
+  { {  240,  402, 1134, 1491, 1659, 1505, 1517, 1555, 1979, 2099 },
+    {  467,  242,  960, 1232, 1714, 1620, 1834, 1570, 1676, 1391 },
+    {  500,  455,  463, 1507, 1699, 1282, 1564,  982, 2114, 2114 },
+    {  672,  643, 1372,  331, 1589, 1667, 1453, 1938,  996,  876 },
+    {  458,  783, 1037,  911,  738,  968, 1165, 1518,  859, 1033 },
+    {  504,  815,  504, 1139, 1219,  719, 1506, 1085, 1268, 1268 },
+    {  333,  630, 1445, 1239, 1883, 3672,  799, 1548, 1865,  598 },
+    {  399,  644,  746, 1342, 1856, 1350, 1493,  613, 1855, 1015 },
+    {  622,  749, 1205,  608, 1066, 1408, 1290, 1406,  546,  971 },
+    {  500,  753, 1041,  668, 1230, 1617, 1297, 1425, 1383,  523 } },
+  { {  394,  553,  523, 1502, 1536,  981, 1608, 1142, 1666, 2181 },
+    {  655,  430,  375, 1411, 1861, 1220, 1677, 1135, 1978, 1553 },
+    {  690,  640,  245, 1954, 2070, 1194, 1528,  982, 1972, 2232 },
+    {  559,  834,  741,  867, 1131,  980, 1225,  852, 1092,  784 },
+    {  690,  875,  516,  959,  673,  894, 1056, 1190, 1528, 1126 },
+    {  740,  951,  384, 1277, 1177,  492, 1579, 1155, 1846, 1513 },
+    {  323,  775, 1062, 1776, 3062, 1274,  813, 1188, 1372,  655 },
+    {  488,  971,  484, 1767, 1515, 1775, 1115,  503, 1539, 1461 },
+    {  740, 1006,  998,  709,  851, 1230, 1337,  788,  741,  721 },
+    {  522, 1073,  573, 1045, 1346,  887, 1046, 1146, 1203,  697 } },
+  { {  105,  864, 1442, 1009, 1934, 1840, 1519, 1920, 1673, 1579 },
+    {  534,  305, 1193,  683, 1388, 2164, 1802, 1894, 1264, 1170 },
+    {  305,  518,  877, 1108, 1426, 3215, 1425, 1064, 1320, 1242 },
+    {  683,  732, 1927,  257, 1493, 2048, 1858, 1552, 1055,  947 },
+    {  394,  814, 1024,  660,  959, 1556, 1282, 1289,  893, 1047 },
+    {  528,  615,  996,  940, 1201,  635, 1094, 2515,  803, 1358 },
+    {  347,  614, 1609, 1187, 3133, 1345, 1007, 1339, 1017,  667 },
+    {  218,  740,  878, 1605, 3650, 3650, 1345,  758, 1357, 1617 },
+    {  672,  750, 1541,  558, 1257, 1599, 1870, 2135,  402, 1087 },
+    {  592,  684, 1161,  430, 1092, 1497, 1475, 1489, 1095,  822 } },
+  { {  228, 1056, 1059, 1368,  752,  982, 1512, 1518,  987, 1782 },
+    {  494,  514,  818,  942,  965,  892, 1610, 1356, 1048, 1363 },
+    {  512,  648,  591, 1042,  761,  991, 1196, 1454, 1309, 1463 },
+    {  683,  749, 1043,  676,  841, 1396, 1133, 1138,  654,  939 },
+    {  622, 1101, 1126,  994,  361, 1077, 1203, 1318,  877, 1219 },
+    {  631, 1068,  857, 1650,  651,  477, 1650, 1419,  828, 1170 },
+    {  555,  727, 1068, 1335, 3127, 1339,  820, 1331, 1077,  429 },
+    {  504,  879,  624, 1398,  889,  889, 1392,  808,  891, 1406 },
+    {  683, 1602, 1289,  977,  578,  983, 1280, 1708,  406, 1122 },
+    {  399,  865, 1433, 1070, 1072,  764,  968, 1477, 1223,  678 } },
+  { {  333,  760,  935, 1638, 1010,  529, 1646, 1410, 1472, 2219 },
+    {  512,  494,  750, 1160, 1215,  610, 1870, 1868, 1628, 1169 },
+    {  572,  646,  492, 1934, 1208,  603, 1580, 1099, 1398, 1995 },
+    {  786,  789,  942,  581, 1018,  951, 1599, 1207,  731,  768 },
+    {  690, 1015,  672, 1078,  582,  504, 1693, 1438, 1108, 2897 },
+    {  768, 1267,  571, 2005, 1243,  244, 2881, 1380, 1786, 1453 },
+    {  452,  899, 1293,  903, 1311, 3100,  465, 1311, 1319,  813 },
+    {  394,  927,  942, 1103, 1358, 1104,  946,  593, 1363, 1109 },
+    {  559, 1005, 1007, 1016,  658, 1173, 1021, 1164,  623, 1028 },
+    {  564,  796,  632, 1005, 1014,  863, 2316, 1268,  938,  764 } },
+  { {  266,  606, 1098, 1228, 1497, 1243,  948, 1030, 1734, 1461 },
+    {  366,  585,  901, 1060, 1407, 1247,  876, 1134, 1620, 1054 },
+    {  452,  565,  542, 1729, 1479, 1479, 1016,  886, 2938, 1150 },
+    {  555, 1088, 1533,  950, 1354,  895,  834, 1019, 1021,  496 },
+    {  704,  815, 1193,  971,  973,  640, 1217, 2214,  832,  578 },
+    {  672, 1245,  579,  871,  875,  774,  872, 1273, 1027,  949 },
+    {  296, 1134, 2050, 1784, 1636, 3425,  442, 1550, 2076,  722 },
+    {  342,  982, 1259, 1846, 1848, 1848,  622,  568, 1847, 1052 },
+    {  555, 1064, 1304,  828,  746, 1343, 1075, 1329, 1078,  494 },
+    {  288, 1167, 1285, 1174, 1639, 1639,  833, 2254, 1304,  509 } },
+  { {  342,  719,  767, 1866, 1757, 1270, 1246,  550, 1746, 2151 },
+    {  483,  653,  694, 1509, 1459, 1410, 1218,  507, 1914, 1266 },
+    {  488,  757,  447, 2979, 1813, 1268, 1654,  539, 1849, 2109 },
+    {  522, 1097, 1085,  851, 1365, 1111,  851,  901,  961,  605 },
+    {  709,  716,  841,  728,  736,  945,  941,  862, 2845, 1057 },
+    {  512, 1323,  500, 1336, 1083,  681, 1342,  717, 1604, 1350 },
+    {  452, 1155, 1372, 1900, 1501, 3290,  311,  944, 1919,  922 },
+    {  403, 1520,  977, 2132, 1733, 3522, 1076,  276, 3335, 1547 },
+    {  559, 1374, 1101,  615,  673, 2462,  974,  795,  984,  984 },
+    {  547, 1122, 1062,  812, 1410,  951, 1140,  622, 1268,  651 } },
+  { {  165,  982, 1235,  938, 1334, 1366, 1659, 1578,  964, 1612 },
+    {  592,  422,  925,  847, 1139, 1112, 1387, 2036,  861, 1041 },
+    {  403,  837,  732,  770,  941, 1658, 1250,  809, 1407, 1407 },
+    {  896,  874, 1071,  381, 1568, 1722, 1437, 2192,  480, 1035 },
+    {  640, 1098, 1012, 1032,  684, 1382, 1581, 2106,  416,  865 },
+    {  559, 1005,  819,  914,  710,  770, 1418,  920,  838, 1435 },
+    {  415, 1258, 1245,  870, 1278, 3067,  770, 1021, 1287,  522 },
+    {  406,  990,  601, 1009, 1265, 1265, 1267,  759, 1017, 1277 },
+    {  968, 1182, 1329,  788, 1032, 1292, 1705, 1714,  203, 1403 },
+    {  732,  877, 1279,  471,  901, 1161, 1545, 1294,  755,  755 } },
+  { {  111,  931, 1378, 1185, 1933, 1648, 1148, 1714, 1873, 1307 },
+    {  406,  414, 1030, 1023, 1910, 1404, 1313, 1647, 1509,  793 },
+    {  342,  640,  575, 1088, 1241, 1349, 1161, 1350, 1756, 1502 },
+    {  559,  766, 1185,  357, 1682, 1428, 1329, 1897, 1219,  802 },
+    {  473,  909, 1164,  771,  719, 2508, 1427, 1432,  722,  782 },
+    {  342,  892,  785, 1145, 1150,  794, 1296, 1550,  973, 1057 },
+    {  208, 1036, 1326, 1343, 1606, 3395,  815, 1455, 1618,  712 },
+    {  228,  928,  890, 1046, 3499, 1711,  994,  829, 1720, 1318 },
+    {  768,  724, 1058,  636,  991, 1075, 1319, 1324,  616,  825 },
+    {  305, 1167, 1358,  899, 1587, 1587,  987, 1988, 1332,  501 } }
 };

 //------------------------------------------------------------------------------

-#if defined(__cplusplus) || defined(c_plusplus)
-}    // extern "C"
-#endif
--- a/src/enc/cost.h
+++ b/src/enc/cost.h
@ -1,8 +1,10 @@
 // Copyright 2011 Google Inc. All Rights Reserved.
 //
-// This code is licensed under the same terms as WebM:
-//  Software License Agreement:  http://www.webmproject.org/license/software/
-//  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
 // -----------------------------------------------------------------------------
 //
 // Cost tables for level and modes.
@ -14,11 +16,12 @@

 #include "./vp8enci.h"

-#if defined(__cplusplus) || defined(c_plusplus)
+#ifdef __cplusplus
 extern "C" {
 #endif

-extern const uint16_t VP8LevelFixedCosts[2048];   // approximate cost per level
+// approximate cost per level:
+extern const uint16_t VP8LevelFixedCosts[MAX_LEVEL + 1];
 extern const uint16_t VP8EntropyCost[256];        // 8bit fixed-point log(p)

 // Cost of coding one event with probability 'proba'.
@ -41,7 +44,7 @@ extern const uint16_t VP8FixedCostsI4[NUM_BMODES][NUM_BMODES][NUM_BMODES];

 //------------------------------------------------------------------------------

-#if defined(__cplusplus) || defined(c_plusplus)
+#ifdef __cplusplus
 }    // extern "C"
 #endif

--- a/src/enc/filter.c
+++ b/src/enc/filter.c
@ -1,20 +1,67 @@
 // Copyright 2011 Google Inc. All Rights Reserved.
 //
-// This code is licensed under the same terms as WebM:
-//  Software License Agreement:  http://www.webmproject.org/license/software/
-//  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
 // -----------------------------------------------------------------------------
 //
 // Selecting filter level
 //
 // Author: somnath@google.com (Somnath Banerjee)

+#include <assert.h>
 #include "./vp8enci.h"

-#if defined(__cplusplus) || defined(c_plusplus)
-extern "C" {
-#endif
+// This table gives, for a given sharpness, the filtering strength to be
+// used (at least) in order to filter a given edge step delta.
+// This is constructed by brute force inspection: for all delta, we iterate
+// over all possible filtering strength / thresh until needs_filter() returns
+// true.
+#define MAX_DELTA_SIZE 64  // number of delta entries per sharpness row
+static const uint8_t kLevelsFromDelta[8][MAX_DELTA_SIZE] = {  // [sharpness][delta]
+  { 0,   1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15,
+    16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
+    32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
+    48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63 },
+  { 0,  1,  2,  3,  5,  6,  7,  8,  9, 11, 12, 13, 14, 15, 17, 18,
+    20, 21, 23, 24, 26, 27, 29, 30, 32, 33, 35, 36, 38, 39, 41, 42,
+    44, 45, 47, 48, 50, 51, 53, 54, 56, 57, 59, 60, 62, 63, 63, 63,
+    63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63 },
+  {  0,  1,  2,  3,  5,  6,  7,  8,  9, 11, 12, 13, 14, 16, 17, 19,
+    20, 22, 23, 25, 26, 28, 29, 31, 32, 34, 35, 37, 38, 40, 41, 43,
+    44, 46, 47, 49, 50, 52, 53, 55, 56, 58, 59, 61, 62, 63, 63, 63,
+    63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63 },
+  {  0,  1,  2,  3,  5,  6,  7,  8,  9, 11, 12, 13, 15, 16, 18, 19,
+    21, 22, 24, 25, 27, 28, 30, 31, 33, 34, 36, 37, 39, 40, 42, 43,
+    45, 46, 48, 49, 51, 52, 54, 55, 57, 58, 60, 61, 63, 63, 63, 63,
+    63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63 },
+  {  0,  1,  2,  3,  5,  6,  7,  8,  9, 11, 12, 14, 15, 17, 18, 20,
+    21, 23, 24, 26, 27, 29, 30, 32, 33, 35, 36, 38, 39, 41, 42, 44,
+    45, 47, 48, 50, 51, 53, 54, 56, 57, 59, 60, 62, 63, 63, 63, 63,
+    63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63 },
+  {  0,  1,  2,  4,  5,  7,  8,  9, 11, 12, 13, 15, 16, 17, 19, 20,
+    22, 23, 25, 26, 28, 29, 31, 32, 34, 35, 37, 38, 40, 41, 43, 44,
+    46, 47, 49, 50, 52, 53, 55, 56, 58, 59, 61, 62, 63, 63, 63, 63,
+    63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63 },
+  {  0,  1,  2,  4,  5,  7,  8,  9, 11, 12, 13, 15, 16, 18, 19, 21,
+    22, 24, 25, 27, 28, 30, 31, 33, 34, 36, 37, 39, 40, 42, 43, 45,
+    46, 48, 49, 51, 52, 54, 55, 57, 58, 60, 61, 63, 63, 63, 63, 63,
+    63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63 },
+  {  0,  1,  2,  4,  5,  7,  8,  9, 11, 12, 14, 15, 17, 18, 20, 21,
+    23, 24, 26, 27, 29, 30, 32, 33, 35, 36, 38, 39, 41, 42, 44, 45,
+    47, 48, 50, 51, 53, 54, 56, 57, 59, 60, 62, 63, 63, 63, 63, 63,
+    63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63 }
+};

+int VP8FilterStrengthFromDelta(int sharpness, int delta) {  // table lookup
+  const int pos = (delta < MAX_DELTA_SIZE) ? delta : MAX_DELTA_SIZE - 1;  // cap
+  assert(sharpness >= 0 && sharpness <= 7);  // kLevelsFromDelta has 8 rows
+  return kLevelsFromDelta[sharpness][pos];  // NOTE(review): delta < 0 unchecked
+}
+
+// -----------------------------------------------------------------------------
 // NOTE: clip1, tables and InitTables are repeated entries of dsp.c
 static uint8_t abs0[255 + 255 + 1];     // abs(i)
 static uint8_t abs1[255 + 255 + 1];     // abs(i)>>1
@ -338,9 +385,8 @@ static double GetMBSSIM(const uint8_t* yuv1, const uint8_t* yuv2) {
 // loop filter strength

 void VP8InitFilter(VP8EncIterator* const it) {
+  if (it->lf_stats_ != NULL) {
    int s, i;
-  if (!it->lf_stats_) return;
-
    InitTables();
    for (s = 0; s < NUM_MB_SEGMENTS; s++) {
      for (i = 0; i < MAX_LF_LEVELS; i++) {
@ -348,18 +394,20 @@ void VP8InitFilter(VP8EncIterator* const it) {
      }
    }
  }
+}

 void VP8StoreFilterStats(VP8EncIterator* const it) {
  int d;
+  VP8Encoder* const enc = it->enc_;
  const int s = it->mb_->segment_;
-  const int level0 = it->enc_->dqm_[s].fstrength_;  // TODO: ref_lf_delta[]
+  const int level0 = enc->dqm_[s].fstrength_;  // TODO: ref_lf_delta[]

  // explore +/-quant range of values around level0
-  const int delta_min = -it->enc_->dqm_[s].quant_;
-  const int delta_max = it->enc_->dqm_[s].quant_;
+  const int delta_min = -enc->dqm_[s].quant_;
+  const int delta_max = enc->dqm_[s].quant_;
  const int step_size = (delta_max - delta_min >= 4) ? 4 : 1;

-  if (!it->lf_stats_) return;
+  if (it->lf_stats_ == NULL) return;

  // NOTE: Currently we are applying filter only across the sublock edges
  // There are two reasons for that.
@ -383,12 +431,9 @@ void VP8StoreFilterStats(VP8EncIterator* const it) {
 }

 void VP8AdjustFilterStrength(VP8EncIterator* const it) {
-  int s;
  VP8Encoder* const enc = it->enc_;
-
-  if (!it->lf_stats_) {
-    return;
-  }
+  if (it->lf_stats_ != NULL) {
+    int s;
    for (s = 0; s < NUM_MB_SEGMENTS; s++) {
      int i, best_level = 0;
      // Improvement over filter level 0 should be at least 1e-5 (relatively)
@ -402,8 +447,25 @@ void VP8AdjustFilterStrength(VP8EncIterator* const it) {
      }
      enc->dqm_[s].fstrength_ = best_level;
    }
+  } else if (enc->config_->filter_strength > 0) {
+    int max_level = 0;
+    int s;
+    for (s = 0; s < NUM_MB_SEGMENTS; s++) {
+      VP8SegmentInfo* const dqm = &enc->dqm_[s];
+      // this '>> 3' accounts for some inverse WHT scaling
+      const int delta = (dqm->max_edge_ * dqm->y2_.q_[1]) >> 3;
+      const int level =
+          VP8FilterStrengthFromDelta(enc->filter_hdr_.sharpness_, delta);
+      if (level > dqm->fstrength_) {
+        dqm->fstrength_ = level;
+      }
+      if (max_level < dqm->fstrength_) {
+        max_level = dqm->fstrength_;
+      }
+    }
+    enc->filter_hdr_.level_ = max_level;
+  }
 }

-#if defined(__cplusplus) || defined(c_plusplus)
-}    // extern "C"
-#endif
+// -----------------------------------------------------------------------------
+
--- a/src/enc/frame.c
+++ b/src/enc/frame.c
@ -1,8 +1,10 @@
 // Copyright 2011 Google Inc. All Rights Reserved.
 //
-// This code is licensed under the same terms as WebM:
-//  Software License Agreement:  http://www.webmproject.org/license/software/
-//  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
 // -----------------------------------------------------------------------------
 //
 //   frame coding and analysis
@ -16,10 +18,7 @@

 #include "./vp8enci.h"
 #include "./cost.h"
-
-#if defined(__cplusplus) || defined(c_plusplus)
-extern "C" {
-#endif
+#include "../webp/format_constants.h"  // RIFF constants

 #define SEGMENT_VISU 0
 #define DEBUG_SEARCH 0    // useful to track search convergence
@ -37,6 +36,63 @@ typedef struct {
  CostArray*  cost;
 } VP8Residual;

+//------------------------------------------------------------------------------
+// multi-pass convergence
+
+#define HEADER_SIZE_ESTIMATE (RIFF_HEADER_SIZE + CHUNK_HEADER_SIZE +  \
+                              VP8_FRAME_HEADER_SIZE)
+#define DQ_LIMIT 0.4  // convergence is considered reached if dq < DQ_LIMIT
+// we allow 2k of extra head-room in PARTITION0 limit.
+#define PARTITION0_SIZE_LIMIT ((VP8_MAX_PARTITION0_SIZE - 2048ULL) << 11)
+
+typedef struct {  // struct for organizing convergence in either size or PSNR
+  int is_first;               // 1 after InitPassStats(), 0 once ComputeNextQ() ran
+  float dq;                   // last quality step (clamped); initialized to 10
+  float q, last_q;            // current / previous quality; both start at config quality
+  double value, last_value;   // PSNR or size (current / previous measurement)
+  double target;              // target size if do_size_search, else target PSNR (default 40)
+  int do_size_search;         // non-zero iff the config's target_size != 0
+} PassStats;
+
+static int InitPassStats(const VP8Encoder* const enc, PassStats* const s) {
+  const uint64_t target_size = (uint64_t)enc->config_->target_size;
+  const int do_size_search = (target_size != 0);  // size target has priority
+  const float target_PSNR = enc->config_->target_PSNR;
+
+  s->is_first = 1;   // first ComputeNextQ() call takes the fixed initial step
+  s->dq = 10.f;      // magnitude of that initial step
+  s->q = s->last_q = enc->config_->quality;  // search starts at config quality
+  s->target = do_size_search ? (double)target_size
+            : (target_PSNR > 0.) ? target_PSNR
+            : 40.;   // default PSNR target, just in case neither one is set
+  s->value = s->last_value = 0.;  // no measurement yet
+  s->do_size_search = do_size_search;
+  return do_size_search;  // non-zero when converging on size rather than PSNR
+}
+
+static float Clamp(float v, float min, float max) {  // limits v to [min, max]
+  return (v < min) ? min : (v > max) ? max : v;  // lower bound is tested first
+}
+
+static float ComputeNextQ(PassStats* const s) {  // updates *s, returns new s->q
+  float dq;
+  if (s->is_first) {  // no previous pass: step by +/- s->dq toward the target
+    dq = (s->value > s->target) ? -s->dq : s->dq;  // above target -> lower q
+    s->is_first = 0;
+  } else if (s->value != s->last_value) {  // line through the last two passes
+    const double slope = (s->target - s->value) / (s->last_value - s->value);
+    dq = (float)(slope * (s->last_q - s->q));  // step where that line hits target
+  } else {  // value did not change: skip the division (denominator would be 0)
+    dq = 0.;  // we're done?!
+  }
+  // Limit the step to [-30, 30] to avoid large swings.
+  s->dq = Clamp(dq, -30.f, 30.f);
+  s->last_q = s->q;          // remember this pass for the next interpolation
+  s->last_value = s->value;
+  s->q = Clamp(s->q + s->dq, 0.f, 100.f);  // q is kept within [0, 100]
+  return s->q;
+}
+
 //------------------------------------------------------------------------------
 // Tables for level coding

@ -45,10 +101,10 @@ const uint8_t VP8EncBands[16 + 1] = {
  0  // sentinel
 };

-static const uint8_t kCat3[] = { 173, 148, 140 };
-static const uint8_t kCat4[] = { 176, 155, 140, 135 };
-static const uint8_t kCat5[] = { 180, 157, 141, 134, 130 };
-static const uint8_t kCat6[] =
+const uint8_t VP8Cat3[] = { 173, 148, 140 };
+const uint8_t VP8Cat4[] = { 176, 155, 140, 135 };
+const uint8_t VP8Cat5[] = { 180, 157, 141, 134, 130 };
+const uint8_t VP8Cat6[] =
    { 254, 254, 243, 230, 196, 177, 153, 140, 133, 130, 129 };

 //------------------------------------------------------------------------------
@ -113,14 +169,15 @@ static int Record(int bit, proba_t* const stats) {
 // Note: no need to record the fixed probas.
 static int RecordCoeffs(int ctx, const VP8Residual* const res) {
  int n = res->first;
-  proba_t* s = res->stats[VP8EncBands[n]][ctx];
+  // should be stats[VP8EncBands[n]], but it's equivalent for n=0 or 1
+  proba_t* s = res->stats[n][ctx];
  if (res->last  < 0) {
    Record(0, s + 0);
    return 0;
  }
  while (n <= res->last) {
    int v;
-    Record(1, s + 0);
+    Record(1, s + 0);  // order of record doesn't matter
    while ((v = res->coeffs[n++]) == 0) {
      Record(0, s + 1);
      s = res->stats[VP8EncBands[n]][0];
@ -174,8 +231,7 @@ static int BranchCost(int nb, int total, int proba) {
  return nb * VP8BitCost(1, proba) + (total - nb) * VP8BitCost(0, proba);
 }

-static int FinalizeTokenProbas(VP8Encoder* const enc) {
-  VP8Proba* const proba = &enc->proba_;
+static int FinalizeTokenProbas(VP8Proba* const proba) {
  int has_changed = 0;
  int size = 0;
  int t, b, c, p;
@ -211,6 +267,47 @@ static int FinalizeTokenProbas(VP8Encoder* const enc) {
  return size;
 }

+//------------------------------------------------------------------------------
+// Finalize Segment probability based on the coding tree
+
+static int GetProba(int a, int b) {  // 255 * a / (a + b), rounded to nearest
+  const int total = a + b;
+  return (total == 0) ? 255     // no samples: that's the default probability.
+                      : (255 * a + total / 2) / total;  // rounded proba
+}
+
+static void SetSegmentProbas(VP8Encoder* const enc) {  // sets proba_.segments_ and segment_hdr_
+  int p[NUM_MB_SEGMENTS] = { 0 };  // p[s]: number of mb_info_ entries with segment_ == s
+  int n;
+
+  for (n = 0; n < enc->mb_w_ * enc->mb_h_; ++n) {  // histogram of segment ids
+    const VP8MBInfo* const mb = &enc->mb_info_[n];
+    p[mb->segment_]++;
+  }
+  if (enc->pic_->stats != NULL) {  // optional stats: export the raw counts
+    for (n = 0; n < NUM_MB_SEGMENTS; ++n) {
+      enc->pic_->stats->segment_size[n] = p[n];
+    }
+  }
+  if (enc->segment_hdr_.num_segments_ > 1) {  // NOTE(review): uses p[0..3] directly
+    uint8_t* const probas = enc->proba_.segments_;
+    probas[0] = GetProba(p[0] + p[1], p[2] + p[3]);  // segments {0,1} vs {2,3}
+    probas[1] = GetProba(p[0], p[1]);                // segment 0 vs 1
+    probas[2] = GetProba(p[2], p[3]);                // segment 2 vs 3
+
+    enc->segment_hdr_.update_map_ =  // set iff some proba is not the default 255
+        (probas[0] != 255) || (probas[1] != 255) || (probas[2] != 255);
+    enc->segment_hdr_.size_ =  // sum over segments: count * cost of its two bits
+        p[0] * (VP8BitCost(0, probas[0]) + VP8BitCost(0, probas[1])) +
+        p[1] * (VP8BitCost(0, probas[0]) + VP8BitCost(1, probas[1])) +
+        p[2] * (VP8BitCost(1, probas[0]) + VP8BitCost(0, probas[2])) +
+        p[3] * (VP8BitCost(1, probas[0]) + VP8BitCost(1, probas[2]));
+  } else {  // single segment: no map update and no size contribution
+    enc->segment_hdr_.update_map_ = 0;
+    enc->segment_hdr_.size_ = 0;
+  }
+}
+
 //------------------------------------------------------------------------------
 // helper functions for residuals struct VP8Residual.

@ -239,39 +336,38 @@ static void SetResidualCoeffs(const int16_t* const coeffs,
 //------------------------------------------------------------------------------
 // Mode costs

-static int GetResidualCost(int ctx, const VP8Residual* const res) {
+static int GetResidualCost(int ctx0, const VP8Residual* const res) {
  int n = res->first;
-  int p0 = res->prob[VP8EncBands[n]][ctx][0];
-  const uint16_t* t = res->cost[VP8EncBands[n]][ctx];
+  // should be prob[VP8EncBands[n]], but it's equivalent for n=0 or 1
+  int p0 = res->prob[n][ctx0][0];
+  const uint16_t* t = res->cost[n][ctx0];
  int cost;

  if (res->last < 0) {
    return VP8BitCost(0, p0);
  }
-  cost = 0;
-  while (n <= res->last) {
-    const int v = res->coeffs[n];
+  cost = VP8BitCost(1, p0);
+  for (; n < res->last; ++n) {
+    const int v = abs(res->coeffs[n]);
    const int b = VP8EncBands[n + 1];
-    ++n;
-    if (v == 0) {
-      // short-case for VP8LevelCost(t, 0) (note: VP8LevelFixedCosts[0] == 0):
-      cost += t[0];
-      t = res->cost[b][0];
-      continue;
+    const int ctx = (v >= 2) ? 2 : v;
+    cost += VP8LevelCost(t, v);
+    t = res->cost[b][ctx];
+    // the masking trick is faster than "if (v) cost += ..." with clang
+    cost += (v ? ~0U : 0) & VP8BitCost(1, res->prob[b][ctx][0]);
  }
-    cost += VP8BitCost(1, p0);
-    if (2u >= (unsigned int)(v + 1)) {   // v = -1 or 1
-      // short-case for "VP8LevelCost(t, 1)" (256 is VP8LevelFixedCosts[1]):
-      cost += 256 + t[1];
-      p0 = res->prob[b][1][0];
-      t = res->cost[b][1];
-    } else {
-      cost += VP8LevelCost(t, abs(v));
-      p0 = res->prob[b][2][0];
-      t = res->cost[b][2];
+  // Last coefficient is always non-zero
+  {
+    const int v = abs(res->coeffs[n]);
+    assert(v != 0);
+    cost += VP8LevelCost(t, v);
+    if (n < 15) {
+      const int b = VP8EncBands[n + 1];
+      const int ctx = (v == 1) ? 1 : 2;
+      const int last_p0 = res->prob[b][ctx][0];
+      cost += VP8BitCost(0, last_p0);
    }
  }
-  if (n < 16) cost += VP8BitCost(0, p0);
  return cost;
 }

@ -342,7 +438,8 @@ int VP8GetCostUV(VP8EncIterator* const it, const VP8ModeScore* const rd) {

 static int PutCoeffs(VP8BitWriter* const bw, int ctx, const VP8Residual* res) {
  int n = res->first;
-  const uint8_t* p = res->prob[VP8EncBands[n]][ctx];
+  // should be prob[VP8EncBands[n]], but it's equivalent for n=0 or 1
+  const uint8_t* p = res->prob[n][ctx];
  if (!VP8PutBit(bw, res->last >= 0, p[0])) {
    return 0;
  }
@ -371,30 +468,30 @@ static int PutCoeffs(VP8BitWriter* const bw, int ctx, const VP8Residual* res) {
      } else {
        int mask;
        const uint8_t* tab;
-        if (v < 3 + (8 << 1)) {          // kCat3  (3b)
+        if (v < 3 + (8 << 1)) {          // VP8Cat3  (3b)
          VP8PutBit(bw, 0, p[8]);
          VP8PutBit(bw, 0, p[9]);
          v -= 3 + (8 << 0);
          mask = 1 << 2;
-          tab = kCat3;
-        } else if (v < 3 + (8 << 2)) {   // kCat4  (4b)
+          tab = VP8Cat3;
+        } else if (v < 3 + (8 << 2)) {   // VP8Cat4  (4b)
          VP8PutBit(bw, 0, p[8]);
          VP8PutBit(bw, 1, p[9]);
          v -= 3 + (8 << 1);
          mask = 1 << 3;
-          tab = kCat4;
-        } else if (v < 3 + (8 << 3)) {   // kCat5  (5b)
+          tab = VP8Cat4;
+        } else if (v < 3 + (8 << 3)) {   // VP8Cat5  (5b)
          VP8PutBit(bw, 1, p[8]);
          VP8PutBit(bw, 0, p[10]);
          v -= 3 + (8 << 2);
          mask = 1 << 4;
-          tab = kCat5;
-        } else {                         // kCat6 (11b)
+          tab = VP8Cat5;
+        } else {                         // VP8Cat6 (11b)
          VP8PutBit(bw, 1, p[8]);
          VP8PutBit(bw, 1, p[10]);
          v -= 3 + (8 << 3);
          mask = 1 << 10;
-          tab = kCat6;
+          tab = VP8Cat6;
        }
        while (mask) {
          VP8PutBit(bw, !!(v & mask), *tab++);
@ -411,8 +508,7 @@ static int PutCoeffs(VP8BitWriter* const bw, int ctx, const VP8Residual* res) {
  return 1;
 }

-static void CodeResiduals(VP8BitWriter* const bw,
-                          VP8EncIterator* const it,
+static void CodeResiduals(VP8BitWriter* const bw, VP8EncIterator* const it,
                          const VP8ModeScore* const rd) {
  int x, y, ch;
  VP8Residual res;
@ -512,146 +608,23 @@ static void RecordResiduals(VP8EncIterator* const it,
 //------------------------------------------------------------------------------
 // Token buffer

-#ifdef USE_TOKEN_BUFFER
+#if !defined(DISABLE_TOKEN_BUFFER)

-void VP8TBufferInit(VP8TBuffer* const b) {
-  b->rows_ = NULL;
-  b->tokens_ = NULL;
-  b->last_ = &b->rows_;
-  b->left_ = 0;
-  b->error_ = 0;
-}
-
-int VP8TBufferNewPage(VP8TBuffer* const b) {
-  VP8Tokens* const page = b->error_ ? NULL : (VP8Tokens*)malloc(sizeof(*page));
-  if (page == NULL) {
-    b->error_ = 1;
-    return 0;
-  }
-  *b->last_ = page;
-  b->last_ = &page->next_;
-  b->left_ = MAX_NUM_TOKEN;
-  b->tokens_ = page->tokens_;
-  return 1;
-}
-
-void VP8TBufferClear(VP8TBuffer* const b) {
-  if (b != NULL) {
-    const VP8Tokens* p = b->rows_;
-    while (p != NULL) {
-      const VP8Tokens* const next = p->next_;
-      free((void*)p);
-      p = next;
-    }
-    VP8TBufferInit(b);
-  }
-}
-
-int VP8EmitTokens(const VP8TBuffer* const b, VP8BitWriter* const bw,
-                  const uint8_t* const probas) {
-  VP8Tokens* p = b->rows_;
-  if (b->error_) return 0;
-  while (p != NULL) {
-    const int N = (p->next_ == NULL) ? b->left_ : 0;
-    int n = MAX_NUM_TOKEN;
-    while (n-- > N) {
-      VP8PutBit(bw, (p->tokens_[n] >> 15) & 1, probas[p->tokens_[n] & 0x7fff]);
-    }
-    p = p->next_;
-  }
-  return 1;
-}
-
-#define TOKEN_ID(b, ctx, p) ((p) + NUM_PROBAS * ((ctx) + (b) * NUM_CTX))
-
-static int RecordCoeffTokens(int ctx, const VP8Residual* const res,
-                             VP8TBuffer* tokens) {
-  int n = res->first;
-  int b = VP8EncBands[n];
-  if (!VP8AddToken(tokens, res->last >= 0, TOKEN_ID(b, ctx, 0))) {
-    return 0;
-  }
-
-  while (n < 16) {
-    const int c = res->coeffs[n++];
-    const int sign = c < 0;
-    int v = sign ? -c : c;
-    const int base_id = TOKEN_ID(b, ctx, 0);
-    if (!VP8AddToken(tokens, v != 0, base_id + 1)) {
-      b = VP8EncBands[n];
-      ctx = 0;
-      continue;
-    }
-    if (!VP8AddToken(tokens, v > 1, base_id + 2)) {
-      b = VP8EncBands[n];
-      ctx = 1;
-    } else {
-      if (!VP8AddToken(tokens, v > 4, base_id + 3)) {
-        if (VP8AddToken(tokens, v != 2, base_id + 4))
-          VP8AddToken(tokens, v == 4, base_id + 5);
-      } else if (!VP8AddToken(tokens, v > 10, base_id + 6)) {
-        if (!VP8AddToken(tokens, v > 6, base_id + 7)) {
-//          VP8AddToken(tokens, v == 6, 159);
-        } else {
-//          VP8AddToken(tokens, v >= 9, 165);
-//          VP8AddToken(tokens, !(v & 1), 145);
-        }
-      } else {
-        int mask;
-        const uint8_t* tab;
-        if (v < 3 + (8 << 1)) {          // kCat3  (3b)
-          VP8AddToken(tokens, 0, base_id + 8);
-          VP8AddToken(tokens, 0, base_id + 9);
-          v -= 3 + (8 << 0);
-          mask = 1 << 2;
-          tab = kCat3;
-        } else if (v < 3 + (8 << 2)) {   // kCat4  (4b)
-          VP8AddToken(tokens, 0, base_id + 8);
-          VP8AddToken(tokens, 1, base_id + 9);
-          v -= 3 + (8 << 1);
-          mask = 1 << 3;
-          tab = kCat4;
-        } else if (v < 3 + (8 << 3)) {   // kCat5  (5b)
-          VP8AddToken(tokens, 1, base_id + 8);
-          VP8AddToken(tokens, 0, base_id + 10);
-          v -= 3 + (8 << 2);
-          mask = 1 << 4;
-          tab = kCat5;
-        } else {                         // kCat6 (11b)
-          VP8AddToken(tokens, 1, base_id + 8);
-          VP8AddToken(tokens, 1, base_id + 10);
-          v -= 3 + (8 << 3);
-          mask = 1 << 10;
-          tab = kCat6;
-        }
-        while (mask) {
-          // VP8AddToken(tokens, !!(v & mask), *tab++);
-          mask >>= 1;
-        }
-      }
-      ctx = 2;
-    }
-    b = VP8EncBands[n];
-    // VP8PutBitUniform(bw, sign);
-    if (n == 16 || !VP8AddToken(tokens, n <= res->last, TOKEN_ID(b, ctx, 0))) {
-      return 1;   // EOB
-    }
-  }
-  return 1;
-}
-
-static void RecordTokens(VP8EncIterator* const it,
-                         const VP8ModeScore* const rd, VP8TBuffer tokens[2]) {
+static void RecordTokens(VP8EncIterator* const it, const VP8ModeScore* const rd,
+                         VP8TBuffer* const tokens) {
  int x, y, ch;
  VP8Residual res;
  VP8Encoder* const enc = it->enc_;

  VP8IteratorNzToBytes(it);
  if (it->mb_->type_ == 1) {   // i16x16
+    const int ctx = it->top_nz_[8] + it->left_nz_[8];
    InitResidual(0, 1, enc, &res);
    SetResidualCoeffs(rd->y_dc_levels, &res);
-// TODO(skal): FIX ->    it->top_nz_[8] = it->left_nz_[8] =
-      RecordCoeffTokens(it->top_nz_[8] + it->left_nz_[8], &res, &tokens[0]);
+    it->top_nz_[8] = it->left_nz_[8] =
+        VP8RecordCoeffTokens(ctx, 1,
+                             res.first, res.last, res.coeffs, tokens);
+    RecordCoeffs(ctx, &res);
    InitResidual(1, 0, enc, &res);
  } else {
    InitResidual(0, 3, enc, &res);
@ -663,7 +636,9 @@ static void RecordTokens(VP8EncIterator* const it,
      const int ctx = it->top_nz_[x] + it->left_nz_[y];
      SetResidualCoeffs(rd->y_ac_levels[x + y * 4], &res);
      it->top_nz_[x] = it->left_nz_[y] =
-          RecordCoeffTokens(ctx, &res, &tokens[0]);
+          VP8RecordCoeffTokens(ctx, res.coeff_type,
+                               res.first, res.last, res.coeffs, tokens);
+      RecordCoeffs(ctx, &res);
    }
  }

@ -675,13 +650,16 @@ static void RecordTokens(VP8EncIterator* const it,
        const int ctx = it->top_nz_[4 + ch + x] + it->left_nz_[4 + ch + y];
        SetResidualCoeffs(rd->uv_levels[ch * 2 + x + y * 2], &res);
        it->top_nz_[4 + ch + x] = it->left_nz_[4 + ch + y] =
-            RecordCoeffTokens(ctx, &res, &tokens[1]);
+            VP8RecordCoeffTokens(ctx, 2,
+                                 res.first, res.last, res.coeffs, tokens);
+        RecordCoeffs(ctx, &res);
      }
    }
  }
+  VP8IteratorBytesToNz(it);
 }

-#endif    // USE_TOKEN_BUFFER
+#endif    // !DISABLE_TOKEN_BUFFER

 //------------------------------------------------------------------------------
 // ExtraInfo map / Debug function
@ -697,7 +675,10 @@ static void SetBlock(uint8_t* p, int value, int size) {
 #endif

 static void ResetSSE(VP8Encoder* const enc) {
-  memset(enc->sse_, 0, sizeof(enc->sse_));
+  // Only the first three accumulators are zeroed here (rather than clearing
+  // the whole array), so the fourth entry keeps whatever value it holds.
+  enc->sse_[0] = 0;
+  enc->sse_[1] = 0;
+  enc->sse_[2] = 0;
+  // Note: enc->sse_[3] is managed by alpha.c
  enc->sse_count_ = 0;
 }

@ -736,6 +717,7 @@ static void StoreSideInfo(const VP8EncIterator* const it) {
        const int b = (int)((it->luma_bits_ + it->uv_bits_ + 7) >> 3);
        *info = (b > 255) ? 255 : b; break;
      }
+      case 7: *info = mb->alpha_; break;
      default: *info = 0; break;
    };
  }
@ -746,9 +728,173 @@ static void StoreSideInfo(const VP8EncIterator* const it) {
 #endif
 }

+// Converts an accumulated squared error 'mse', measured over 'size' samples,
+// into a PSNR value relative to a peak of 255. Returns 99 when either
+// argument is zero.
+static double GetPSNR(uint64_t mse, uint64_t size) {
+  if (mse == 0 || size == 0) {
+    return 99;
+  }
+  return 10. * log10(255. * 255. * size / mse);
+}
+
+//------------------------------------------------------------------------------
+//  StatLoop(): only collect statistics (number of skips, token usage, ...).
+//  This is used for deciding optimal probabilities. It also modifies the
+//  quantizer value if some target (size, PSNR) was specified.
+
+// Prepares the encoder for a pass at quality 'q': the value is clamped to
+// [0, 100], applied to the segments, and the statistics and SSE accumulators
+// are reset.
+static void SetLoopParams(VP8Encoder* const enc, float q) {
+  // Keep the quality parameter within its valid bounds.
+  const float quality = Clamp(q, 0.f, 100.f);
+
+  VP8SetSegmentParams(enc, quality);  // segment quantizations and filters
+  SetSegmentProbas(enc);              // segment probabilities
+
+  ResetStats(enc);
+  ResetSSE(enc);
+}
+
+// Runs one statistics pass over at most 'nb_mbs' macroblocks at quality s->q.
+// On completion, s->value holds either a size estimate (when
+// s->do_size_search is set) or the PSNR of the pass. Returns the accumulated
+// info.H values plus the segment header size, or 0 if the progress report
+// asked to stop.
+static uint64_t OneStatPass(VP8Encoder* const enc, VP8RDLevel rd_opt,
+                            int nb_mbs, int percent_delta,
+                            PassStats* const s) {
+  const uint64_t pixel_count = nb_mbs * 384;
+  uint64_t total_size = 0;
+  uint64_t part0_size = 0;
+  uint64_t total_distortion = 0;
+  VP8EncIterator it;
+
+  VP8IteratorInit(enc, &it);
+  SetLoopParams(enc, s->q);
+  do {
+    VP8ModeScore info;
+    VP8IteratorImport(&it, NULL);
+    if (VP8Decimate(&it, &info, rd_opt)) {
+      // Only count the skip; carry on as if skip_proba were not in use.
+      ++enc->proba_.nb_skip_;
+    }
+    RecordResiduals(&it, &info);
+    total_size += info.R + info.H;
+    part0_size += info.H;
+    total_distortion += info.D;
+    if (percent_delta != 0 && !VP8IteratorProgress(&it, percent_delta)) {
+      return 0;
+    }
+    VP8IteratorSaveBoundary(&it);
+  } while (VP8IteratorNext(&it) && --nb_mbs > 0);
+
+  part0_size += enc->segment_hdr_.size_;
+  if (!s->do_size_search) {
+    s->value = GetPSNR(total_distortion, pixel_count);
+    return part0_size;
+  }
+  total_size += FinalizeSkipProba(enc);
+  total_size += FinalizeTokenProbas(&enc->proba_);
+  total_size = ((total_size + part0_size + 1024) >> 11)
+             + HEADER_SIZE_ESTIMATE;
+  s->value = (double)total_size;
+  return part0_size;
+}
+
+// Statistics-collection driver: runs up to enc->config_->pass calls to
+// OneStatPass(), optionally adjusting the quality through ComputeNextQ() when
+// enc->do_search_ is set, then finalizes the probabilities and level costs.
+// Returns 0 as soon as a pass returns 0; otherwise returns the result of
+// WebPReportProgress().
+static int StatLoop(VP8Encoder* const enc) {
+  const int method = enc->method_;
+  const int do_search = enc->do_search_;
+  const int fast_probe = ((method == 0 || method == 3) && !do_search);
+  int num_pass_left = enc->config_->pass;
+  const int task_percent = 20;
+  // Share of 'task_percent' given to each pass, rounded to nearest.
+  // NOTE(review): divides by config_->pass; presumably validated to be >= 1
+  // by the caller -- confirm.
+  const int percent_per_pass =
+      (task_percent + num_pass_left / 2) / num_pass_left;
+  const int final_percent = enc->percent_ + task_percent;
+  const VP8RDLevel rd_opt =
+      (method >= 3 || do_search) ? RD_OPT_BASIC : RD_OPT_NONE;
+  int nb_mbs = enc->mb_w_ * enc->mb_h_;
+  PassStats stats;
+
+  InitPassStats(enc, &stats);
+  ResetTokenStats(enc);
+
+  // Fast mode: quick analysis pass over few mbs. Better than nothing.
+  if (fast_probe) {
+    if (method == 3) {  // we need more stats for method 3 to be reliable.
+      nb_mbs = (nb_mbs > 200) ? nb_mbs >> 1 : 100;
+    } else {
+      nb_mbs = (nb_mbs > 200) ? nb_mbs >> 2 : 50;
+    }
+  }
+
+  while (num_pass_left-- > 0) {
+    // Evaluated before the pass runs, i.e. from the state left by the
+    // previous iteration.
+    const int is_last_pass = (fabs(stats.dq) <= DQ_LIMIT) ||
+                             (num_pass_left == 0) ||
+                             (enc->max_i4_header_bits_ == 0);
+    const uint64_t size_p0 =
+        OneStatPass(enc, rd_opt, nb_mbs, percent_per_pass, &stats);
+    // OneStatPass() returns 0 when the progress report asked to stop.
+    if (size_p0 == 0) return 0;
+#if (DEBUG_SEARCH > 0)
+    printf("#%d value:%.1lf -> %.1lf   q:%.2f -> %.2f\n",
+           num_pass_left, stats.last_value, stats.value, stats.last_q, stats.q);
+#endif
+    // The retry is only attempted while the header-bit limit can still be
+    // lowered; the consumed pass is given back before starting over.
+    if (enc->max_i4_header_bits_ > 0 && size_p0 > PARTITION0_SIZE_LIMIT) {
+      ++num_pass_left;
+      enc->max_i4_header_bits_ >>= 1;  // strengthen header bit limitation...
+      continue;                        // ...and start over
+    }
+    if (is_last_pass) {
+      break;
+    }
+    // If no target size: just do several passes without changing 'q'
+    if (do_search) {
+      ComputeNextQ(&stats);
+      if (fabs(stats.dq) <= DQ_LIMIT) break;
+    }
+  }
+  if (!do_search || !stats.do_size_search) {
+    // Need to finalize probas now, since it wasn't done during the search.
+    FinalizeSkipProba(enc);
+    FinalizeTokenProbas(&enc->proba_);
+  }
+  VP8CalculateLevelCosts(&enc->proba_);  // finalize costs
+  return WebPReportProgress(enc->pic_, final_percent, &enc->percent_);
+}
+
 //------------------------------------------------------------------------------
 // Main loops
 //
+
+static const int kAverageBytesPerMB[8] = { 50, 24, 16, 9, 7, 5, 3, 2 };
+
+// Initializes one bit-writer per partition, sized from the picture's
+// macroblock count and a per-macroblock estimate selected by
+// enc->base_quant_. If any initialization fails, all bit-writers are freed.
+// Returns the status of the last VP8BitWriterInit() call (1 if there was
+// nothing to initialize).
+static int PreLoopInitialize(VP8Encoder* const enc) {
+  const int bytes_per_mb = kAverageBytesPerMB[enc->base_quant_ >> 4];
+  const int expected_size =
+      enc->mb_w_ * enc->mb_h_ * bytes_per_mb / enc->num_parts_;
+  int ok = 1;
+  int p = 0;
+  // Set up the bit-writers, stopping at the first failure.
+  while (ok && p < enc->num_parts_) {
+    ok = VP8BitWriterInit(enc->parts_ + p, expected_size);
+    ++p;
+  }
+  if (!ok) {
+    VP8EncFreeBitWriters(enc);  // malloc error occurred
+  }
+  return ok;
+}
+
+// Common epilogue of the coding loops. When 'ok' is set, every partition's
+// bit-writer is finished and its error flag folded into 'ok'. On success the
+// per-segment residual byte counters are stored (if stats were requested) and
+// the filter strength is adjusted; on failure the bit-writers are freed.
+// Returns the resulting 'ok'.
+static int PostLoopFinalize(VP8EncIterator* const it, int ok) {
+  VP8Encoder* const enc = it->enc_;
+
+  if (ok) {      // Close all partitions, picking up late errors.
+    int p = 0;
+    while (p < enc->num_parts_) {
+      VP8BitWriterFinish(&enc->parts_[p]);
+      ok &= !enc->parts_[p].error_;
+      ++p;
+    }
+  }
+
+  if (!ok) {
+    // Something went wrong -> release the bit-writers' memory.
+    VP8EncFreeBitWriters(enc);
+    return ok;
+  }
+
+  if (enc->pic_->stats != NULL) {  // convert bit counters to bytes...
+    int i, s;
+    for (i = 0; i <= 2; ++i) {
+      for (s = 0; s < NUM_MB_SEGMENTS; ++s) {
+        enc->residual_bytes_[i][s] = (int)((it->bit_count_[s][i] + 7) >> 3);
+      }
+    }
+  }
+  VP8AdjustFilterStrength(it);     // ...and store filter stats.
+  return ok;
+}
+
+//------------------------------------------------------------------------------
 //  VP8EncLoop(): does the final bitstream coding.

 static void ResetAfterSkip(VP8EncIterator* const it) {
@ -761,28 +907,20 @@ static void ResetAfterSkip(VP8EncIterator* const it) {
 }

 int VP8EncLoop(VP8Encoder* const enc) {
-  int i, s, p;
-  int ok = 1;
  VP8EncIterator it;
-  VP8ModeScore info;
-  const int dont_use_skip = !enc->proba_.use_skip_proba_;
-  const int rd_opt = enc->rd_opt_level_;
-  const int kAverageBytesPerMB = 5;     // TODO: have a kTable[quality/10]
-  const int bytes_per_parts =
-    enc->mb_w_ * enc->mb_h_ * kAverageBytesPerMB / enc->num_parts_;
+  int ok = PreLoopInitialize(enc);
+  if (!ok) return 0;

-  // Initialize the bit-writers
-  for (p = 0; p < enc->num_parts_; ++p) {
-    VP8BitWriterInit(enc->parts_ + p, bytes_per_parts);
-  }
-
-  ResetStats(enc);
-  ResetSSE(enc);
+  StatLoop(enc);  // stats-collection loop

  VP8IteratorInit(enc, &it);
  VP8InitFilter(&it);
  do {
-    VP8IteratorImport(&it);
+    VP8ModeScore info;
+    const int dont_use_skip = !enc->proba_.use_skip_proba_;
+    const VP8RDLevel rd_opt = enc->rd_opt_level_;
+
+    VP8IteratorImport(&it, NULL);
    // Warning! order is important: first call VP8Decimate() and
    // *then* decide how to code the skip decision if there's one.
    if (!VP8Decimate(&it, &info, rd_opt) || dont_use_skip) {
@ -799,141 +937,132 @@ int VP8EncLoop(VP8Encoder* const enc) {
    VP8StoreFilterStats(&it);
    VP8IteratorExport(&it);
    ok = VP8IteratorProgress(&it, 20);
-  } while (ok && VP8IteratorNext(&it, it.yuv_out_));
+    VP8IteratorSaveBoundary(&it);
+  } while (ok && VP8IteratorNext(&it));

-  if (ok) {      // Finalize the partitions, check for extra errors.
-    for (p = 0; p < enc->num_parts_; ++p) {
-      VP8BitWriterFinish(enc->parts_ + p);
-      ok &= !enc->parts_[p].error_;
-    }
-  }
-
-  if (ok) {      // All good. Finish up.
-    if (enc->pic_->stats) {           // finalize byte counters...
-      for (i = 0; i <= 2; ++i) {
-        for (s = 0; s < NUM_MB_SEGMENTS; ++s) {
-          enc->residual_bytes_[i][s] = (int)((it.bit_count_[s][i] + 7) >> 3);
-        }
-      }
-    }
-    VP8AdjustFilterStrength(&it);     // ...and store filter stats.
-  } else {
-    // Something bad happened -> need to do some memory cleanup.
-    VP8EncFreeBitWriters(enc);
-  }
-
-  return ok;
+  return PostLoopFinalize(&it, ok);
 }

 //------------------------------------------------------------------------------
-//  VP8StatLoop(): only collect statistics (number of skips, token usage, ...)
-//                 This is used for deciding optimal probabilities. It also
-//                 modifies the quantizer value if some target (size, PNSR)
-//                 was specified.
+// Single pass using Token Buffer.

-#define kHeaderSizeEstimate (15 + 20 + 10)      // TODO: fix better
+#if !defined(DISABLE_TOKEN_BUFFER)

-static int OneStatPass(VP8Encoder* const enc, float q, int rd_opt, int nb_mbs,
-                       float* const PSNR, int percent_delta) {
+#define MIN_COUNT 96  // minimum number of macroblocks before updating stats
+
+// Encodes the picture by recording tokens into enc->tokens_ and emitting them
+// into partition 0 once the passes are done. Runs up to enc->config_->pass
+// passes; when enc->do_search_ is set, the quality is adjusted between passes
+// through ComputeNextQ(). Side info, filter stats, export and progress
+// reporting only happen on the pass identified as the last one.
+// Returns 0 if the bit-writers could not be initialized; otherwise returns
+// the result of PostLoopFinalize().
+int VP8EncTokenLoop(VP8Encoder* const enc) {
+  // Roughly refresh the proba eight times per pass
+  int max_count = (enc->mb_w_ * enc->mb_h_) >> 3;
+  int num_pass_left = enc->config_->pass;
+  const int do_search = enc->do_search_;
  VP8EncIterator it;
-  uint64_t size = 0;
+  VP8Proba* const proba = &enc->proba_;
+  const VP8RDLevel rd_opt = enc->rd_opt_level_;
+  const uint64_t pixel_count = enc->mb_w_ * enc->mb_h_ * 384;
+  PassStats stats;
+  int ok;
+
+  InitPassStats(enc, &stats);
+  ok = PreLoopInitialize(enc);
+  if (!ok) return 0;
+
+  if (max_count < MIN_COUNT) max_count = MIN_COUNT;
+
+  assert(enc->num_parts_ == 1);
+  assert(enc->use_tokens_);
+  assert(proba->use_skip_proba_ == 0);
+  assert(rd_opt >= RD_OPT_BASIC);   // otherwise, token-buffer won't be useful
+  assert(num_pass_left > 0);
+
+  while (ok && num_pass_left-- > 0) {
+    const int is_last_pass = (fabs(stats.dq) <= DQ_LIMIT) ||
+                             (num_pass_left == 0) ||
+                             (enc->max_i4_header_bits_ == 0);
+    uint64_t size_p0 = 0;
    uint64_t distortion = 0;
-  const uint64_t pixel_count = nb_mbs * 384;
-
-  // Make sure the quality parameter is inside valid bounds
-  if (q < 0.) {
-    q = 0;
-  } else if (q > 100.) {
-    q = 100;
-  }
-
-  VP8SetSegmentParams(enc, q);      // setup segment quantizations and filters
-
-  ResetStats(enc);
-  ResetTokenStats(enc);
-
+    int cnt = max_count;
    VP8IteratorInit(enc, &it);
+    SetLoopParams(enc, stats.q);
+    if (is_last_pass) {
+      ResetTokenStats(enc);
+      VP8InitFilter(&it);  // don't collect stats until last pass (too costly)
+    }
+    VP8TBufferClear(&enc->tokens_);
    do {
      VP8ModeScore info;
-    VP8IteratorImport(&it);
-    if (VP8Decimate(&it, &info, rd_opt)) {
-      // Just record the number of skips and act like skip_proba is not used.
-      enc->proba_.nb_skip_++;
+      VP8IteratorImport(&it, NULL);
+      if (--cnt < 0) {
+        FinalizeTokenProbas(proba);
+        VP8CalculateLevelCosts(proba);  // refresh cost tables for rd-opt
+        cnt = max_count;
      }
-    RecordResiduals(&it, &info);
-    size += info.R;
+      VP8Decimate(&it, &info, rd_opt);
+      RecordTokens(&it, &info, &enc->tokens_);
+      size_p0 += info.H;
      distortion += info.D;
-    if (percent_delta && !VP8IteratorProgress(&it, percent_delta))
-      return 0;
-  } while (VP8IteratorNext(&it, it.yuv_out_) && --nb_mbs > 0);
-  size += FinalizeSkipProba(enc);
-  size += FinalizeTokenProbas(enc);
-  size += enc->segment_hdr_.size_;
-  size = ((size + 1024) >> 11) + kHeaderSizeEstimate;
-
-  if (PSNR) {
-    *PSNR = (float)(10.* log10(255. * 255. * pixel_count / distortion));
+#ifdef WEBP_EXPERIMENTAL_FEATURES
+      if (enc->use_layer_) {
+        VP8EncCodeLayerBlock(&it);
      }
-  return (int)size;
-}
-
-// successive refinement increments.
-static const int dqs[] = { 20, 15, 10, 8, 6, 4, 2, 1, 0 };
-
-int VP8StatLoop(VP8Encoder* const enc) {
-  const int do_search =
-    (enc->config_->target_size > 0 || enc->config_->target_PSNR > 0);
-  const int fast_probe = (enc->method_ < 2 && !do_search);
-  float q = enc->config_->quality;
-  const int max_passes = enc->config_->pass;
-  const int task_percent = 20;
-  const int percent_per_pass = (task_percent + max_passes / 2) / max_passes;
-  const int final_percent = enc->percent_ + task_percent;
-  int pass;
-  int nb_mbs;
-
-  // Fast mode: quick analysis pass over few mbs. Better than nothing.
-  nb_mbs = enc->mb_w_ * enc->mb_h_;
-  if (fast_probe && nb_mbs > 100) nb_mbs = 100;
-
-  // No target size: just do several pass without changing 'q'
-  if (!do_search) {
-    for (pass = 0; pass < max_passes; ++pass) {
-      const int rd_opt = (enc->method_ > 2);
-      if (!OneStatPass(enc, q, rd_opt, nb_mbs, NULL, percent_per_pass)) {
-        return 0;
-      }
-    }
-  } else {
-    // binary search for a size close to target
-    for (pass = 0; pass < max_passes && (dqs[pass] > 0); ++pass) {
-      const int rd_opt = 1;
-      float PSNR;
-      int criterion;
-      const int size = OneStatPass(enc, q, rd_opt, nb_mbs, &PSNR,
-                                   percent_per_pass);
-#if DEBUG_SEARCH
-      printf("#%d size=%d PSNR=%.2f q=%.2f\n", pass, size, PSNR, q);
 #endif
-      if (!size) return 0;
-      if (enc->config_->target_PSNR > 0) {
-        criterion = (PSNR < enc->config_->target_PSNR);
-      } else {
-        criterion = (size < enc->config_->target_size);
+      if (is_last_pass) {
+        StoreSideInfo(&it);
+        VP8StoreFilterStats(&it);
+        VP8IteratorExport(&it);
+        ok = VP8IteratorProgress(&it, 20);
      }
-      // dichotomize
-      if (criterion) {
-        q += dqs[pass];
-      } else {
-        q -= dqs[pass];
+      VP8IteratorSaveBoundary(&it);
+    } while (ok && VP8IteratorNext(&it));
+    if (!ok) break;
+
+    size_p0 += enc->segment_hdr_.size_;
+    if (stats.do_size_search) {
+      uint64_t size = FinalizeTokenProbas(&enc->proba_);
+      size += VP8EstimateTokenSize(&enc->tokens_,
+                                   (const uint8_t*)proba->coeffs_);
+      size = (size + size_p0 + 1024) >> 11;  // -> size in bytes
+      size += HEADER_SIZE_ESTIMATE;
+      stats.value = (double)size;
+    } else {  // compute and store PSNR
+      stats.value = GetPSNR(distortion, pixel_count);
+    }
+
+#if (DEBUG_SEARCH > 0)
+    printf("#%2d metric:%.1lf -> %.1lf   last_q=%.2lf q=%.2lf dq=%.2lf\n",
+           num_pass_left, stats.last_value, stats.value,
+           stats.last_q, stats.q, stats.dq);
+#endif
+    // Only retry while the header-bit limit can still be lowered. Without
+    // this guard (which StatLoop() also uses), a limit that has already
+    // reached 0 would make the loop give the pass back and restart forever.
+    if (enc->max_i4_header_bits_ > 0 && size_p0 > PARTITION0_SIZE_LIMIT) {
+      ++num_pass_left;
+      enc->max_i4_header_bits_ >>= 1;  // strengthen header bit limitation...
+      continue;                        // ...and start over
+    }
+    if (is_last_pass) {
+      break;   // done
+    }
+    if (do_search) {
+      ComputeNextQ(&stats);  // Adjust q
    }
  }
+  if (ok) {
+    if (!stats.do_size_search) {
+      FinalizeTokenProbas(&enc->proba_);
    }
-  return WebPReportProgress(enc->pic_, final_percent, &enc->percent_);
+    ok = VP8EmitTokens(&enc->tokens_, enc->parts_ + 0,
+                       (const uint8_t*)proba->coeffs_, 1);
  }
+  ok = ok && WebPReportProgress(enc->pic_, enc->percent_ + 20, &enc->percent_);
+  return PostLoopFinalize(&it, ok);
+}
+
+#else
+
+// Placeholder compiled when DISABLE_TOKEN_BUFFER is defined: ignores 'enc'
+// and always returns 0.
+int VP8EncTokenLoop(VP8Encoder* const enc) {
+  (void)enc;
+  return 0;   // we shouldn't be here.
+}
+
+#endif    // DISABLE_TOKEN_BUFFER

 //------------------------------------------------------------------------------

-#if defined(__cplusplus) || defined(c_plusplus)
-}    // extern "C"
-#endif
--- a/src/enc/histogram.c
+++ b/src/enc/histogram.c
@ -1,8 +1,10 @@
 // Copyright 2012 Google Inc. All Rights Reserved.
 //
-// This code is licensed under the same terms as WebM:
-//  Software License Agreement:  http://www.webmproject.org/license/software/
-//  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
 // -----------------------------------------------------------------------------
 //
 // Author: Jyrki Alakuijala (jyrki@google.com)
@ -17,17 +19,7 @@
 #include "./backward_references.h"
 #include "./histogram.h"
 #include "../dsp/lossless.h"
-
-#if defined(_MSC_VER) && !defined(NOT_HAVE_LOG2)
-# define NOT_HAVE_LOG2 1
-#endif
-
-#ifdef NOT_HAVE_LOG2
-static WEBP_INLINE double log2(double d) {
-  const double kLog2Reciprocal = 1.442695040888963;
-  return log(d) * kLog2Reciprocal;
-}
-#endif
+#include "../utils/utils.h"

 static void HistogramClear(VP8LHistogram* const p) {
  memset(p->literal_, 0, sizeof(p->literal_));
@ -65,10 +57,10 @@ VP8LHistogramSet* VP8LAllocateHistogramSet(int size, int cache_bits) {
  int i;
  VP8LHistogramSet* set;
  VP8LHistogram* bulk;
-  const size_t total_size = sizeof(*set)
-                          + size * sizeof(*set->histograms)
-                          + size * sizeof(**set->histograms);
-  uint8_t* memory = (uint8_t*)malloc(total_size);
+  const uint64_t total_size = sizeof(*set)
+                            + (uint64_t)size * sizeof(*set->histograms)
+                            + (uint64_t)size * sizeof(**set->histograms);
+  uint8_t* memory = (uint8_t*)WebPSafeMalloc(total_size, sizeof(*memory));
  if (memory == NULL) return NULL;

  set = (VP8LHistogramSet*)memory;
@ -87,33 +79,6 @@ VP8LHistogramSet* VP8LAllocateHistogramSet(int size, int cache_bits) {

 // -----------------------------------------------------------------------------

-void VP8LConvertPopulationCountTableToBitEstimates(
-    int num_symbols, const int population_counts[], double output[]) {
-  int sum = 0;
-  int nonzeros = 0;
-  int i;
-  for (i = 0; i < num_symbols; ++i) {
-    sum += population_counts[i];
-    if (population_counts[i] > 0) {
-      ++nonzeros;
-    }
-  }
-  if (nonzeros <= 1) {
-    memset(output, 0, num_symbols * sizeof(*output));
-    return;
-  }
-  {
-    const double log2sum = log2(sum);
-    for (i = 0; i < num_symbols; ++i) {
-      if (population_counts[i] == 0) {
-        output[i] = log2sum;
-      } else {
-        output[i] = log2sum - log2(population_counts[i]);
-      }
-    }
-  }
-}
-
 void VP8LHistogramAddSinglePixOrCopy(VP8LHistogram* const histo,
                                     const PixOrCopy* const v) {
  if (PixOrCopyIsLiteral(v)) {
@ -125,20 +90,16 @@ void VP8LHistogramAddSinglePixOrCopy(VP8LHistogram* const histo,
    int literal_ix = 256 + NUM_LENGTH_CODES + PixOrCopyCacheIdx(v);
    ++histo->literal_[literal_ix];
  } else {
-    int code, extra_bits_count, extra_bits_value;
-    PrefixEncode(PixOrCopyLength(v),
-                 &code, &extra_bits_count, &extra_bits_value);
+    int code, extra_bits;
+    VP8LPrefixEncodeBits(PixOrCopyLength(v), &code, &extra_bits);
    ++histo->literal_[256 + code];
-    PrefixEncode(PixOrCopyDistance(v),
-                 &code, &extra_bits_count, &extra_bits_value);
+    VP8LPrefixEncodeBits(PixOrCopyDistance(v), &code, &extra_bits);
    ++histo->distance_[code];
  }
 }

-
-
 static double BitsEntropy(const int* const array, int n) {
-  double retval = 0;
+  double retval = 0.;
  int sum = 0;
  int nonzeros = 0;
  int max_val = 0;
@ -148,15 +109,14 @@ static double BitsEntropy(const int* const array, int n) {
    if (array[i] != 0) {
      sum += array[i];
      ++nonzeros;
-      retval += array[i] * VP8LFastLog(array[i]);
+      retval -= VP8LFastSLog2(array[i]);
      if (max_val < array[i]) {
        max_val = array[i];
      }
    }
  }
-  retval -= sum * VP8LFastLog(sum);
-  retval *= -1.4426950408889634;  // 1.0 / -Log(2);
-  mix = 0.627;
+  retval += VP8LFastSLog2(sum);
+
  if (nonzeros < 5) {
    if (nonzeros <= 1) {
      return 0;
@ -176,35 +136,16 @@ static double BitsEntropy(const int* const array, int n) {
    } else {
      mix = 0.7;  // nonzeros == 4.
    }
+  } else {
+    mix = 0.627;
  }
+
  {
    double min_limit = 2 * sum - max_val;
    min_limit = mix * min_limit + (1.0 - mix) * retval;
-    if (retval < min_limit) {
-      return min_limit;
+    return (retval < min_limit) ? min_limit : retval;
  }
 }
-  return retval;
-}
-
-double VP8LHistogramEstimateBitsBulk(const VP8LHistogram* const p) {
-  double retval = BitsEntropy(&p->literal_[0], VP8LHistogramNumCodes(p))
-                + BitsEntropy(&p->red_[0], 256)
-                + BitsEntropy(&p->blue_[0], 256)
-                + BitsEntropy(&p->alpha_[0], 256)
-                + BitsEntropy(&p->distance_[0], NUM_DISTANCE_CODES);
-  // Compute the extra bits cost.
-  int i;
-  for (i = 2; i < NUM_LENGTH_CODES - 2; ++i) {
-    retval +=
-        (i >> 1) * p->literal_[256 + i + 2];
-  }
-  for (i = 2; i < NUM_DISTANCE_CODES - 2; ++i) {
-    retval += (i >> 1) * p->distance_[i + 2];
-  }
-  return retval;
-}
-

 // Returns the cost to encode the rle-encoded entropy code.
 // The constants in this function are experimental.
@ -245,19 +186,150 @@ static double HuffmanCost(const int* const population, int length) {
  return retval;
 }

-// Estimates the Huffman dictionary + other block overhead size.
-static double HistogramEstimateBitsHeader(const VP8LHistogram* const p) {
-  return HuffmanCost(&p->alpha_[0], 256) +
-         HuffmanCost(&p->red_[0], 256) +
-         HuffmanCost(&p->literal_[0], VP8LHistogramNumCodes(p)) +
-         HuffmanCost(&p->blue_[0], 256) +
-         HuffmanCost(&p->distance_[0], NUM_DISTANCE_CODES);
+static double PopulationCost(const int* const population, int length) {
+  return BitsEntropy(population, length) + HuffmanCost(population, length);
 }

-double VP8LHistogramEstimateBits(const VP8LHistogram* const p) {
-  return HistogramEstimateBitsHeader(p) + VP8LHistogramEstimateBitsBulk(p);
+static double ExtraCost(const int* const population, int length) {
+  int i;
+  double cost = 0.;
+  for (i = 2; i < length - 2; ++i) cost += (i >> 1) * population[i + 2];
+  return cost;
 }

+// Estimates the Entropy + Huffman + other block overhead size cost.
+double VP8LHistogramEstimateBits(const VP8LHistogram* const p) {
+  return PopulationCost(p->literal_, VP8LHistogramNumCodes(p))
+       + PopulationCost(p->red_, 256)
+       + PopulationCost(p->blue_, 256)
+       + PopulationCost(p->alpha_, 256)
+       + PopulationCost(p->distance_, NUM_DISTANCE_CODES)
+       + ExtraCost(p->literal_ + 256, NUM_LENGTH_CODES)
+       + ExtraCost(p->distance_, NUM_DISTANCE_CODES);
+}
+
+double VP8LHistogramEstimateBitsBulk(const VP8LHistogram* const p) {
+  return BitsEntropy(p->literal_, VP8LHistogramNumCodes(p))
+       + BitsEntropy(p->red_, 256)
+       + BitsEntropy(p->blue_, 256)
+       + BitsEntropy(p->alpha_, 256)
+       + BitsEntropy(p->distance_, NUM_DISTANCE_CODES)
+       + ExtraCost(p->literal_ + 256, NUM_LENGTH_CODES)
+       + ExtraCost(p->distance_, NUM_DISTANCE_CODES);
+}
+
+// -----------------------------------------------------------------------------
+// Various histogram combine/cost-eval functions
+
+// Adds 'in' histogram to 'out'
+static void HistogramAdd(const VP8LHistogram* const in,
+                         VP8LHistogram* const out) {
+  int i;
+  for (i = 0; i < PIX_OR_COPY_CODES_MAX; ++i) {
+    out->literal_[i] += in->literal_[i];
+  }
+  for (i = 0; i < NUM_DISTANCE_CODES; ++i) {
+    out->distance_[i] += in->distance_[i];
+  }
+  for (i = 0; i < 256; ++i) {
+    out->red_[i] += in->red_[i];
+    out->blue_[i] += in->blue_[i];
+    out->alpha_[i] += in->alpha_[i];
+  }
+}
+
+// Performs out = a + b, computing the cost C(a+b) - C(a) - C(b) while comparing
+// to the threshold value 'cost_threshold'. On completion, the score returned is
+//  Score = C(a+b) - C(a) - C(b), where C(a) + C(b) is known and fixed.
+// Since the best score so far is passed as 'cost_threshold', the partial cost
+// is compared against 'cost_threshold + C(a) + C(b)' to bail out early; in that
+// case the partial cost is returned as-is and 'out' is only partially filled.
+static double HistogramAddEval(const VP8LHistogram* const a,
+                               const VP8LHistogram* const b,
+                               VP8LHistogram* const out,
+                               double cost_threshold) {
+  double cost = 0;
+  const double sum_cost = a->bit_cost_ + b->bit_cost_;
+  int i;
+
+  cost_threshold += sum_cost;
+
+  // palette_code_bits_ is part of the cost evaluation for literal_.
+  // TODO(skal): remove/simplify this palette_code_bits_?
+  out->palette_code_bits_ =
+      (a->palette_code_bits_ > b->palette_code_bits_) ? a->palette_code_bits_ :
+                                                        b->palette_code_bits_;
+  for (i = 0; i < PIX_OR_COPY_CODES_MAX; ++i) {
+    out->literal_[i] = a->literal_[i] + b->literal_[i];
+  }
+  cost += PopulationCost(out->literal_, VP8LHistogramNumCodes(out));
+  cost += ExtraCost(out->literal_ + 256, NUM_LENGTH_CODES);
+  if (cost > cost_threshold) return cost;
+
+  for (i = 0; i < 256; ++i) out->red_[i] = a->red_[i] + b->red_[i];
+  cost += PopulationCost(out->red_, 256);
+  if (cost > cost_threshold) return cost;
+
+  for (i = 0; i < 256; ++i) out->blue_[i] = a->blue_[i] + b->blue_[i];
+  cost += PopulationCost(out->blue_, 256);
+  if (cost > cost_threshold) return cost;
+
+  for (i = 0; i < NUM_DISTANCE_CODES; ++i) {
+    out->distance_[i] = a->distance_[i] + b->distance_[i];
+  }
+  cost += PopulationCost(out->distance_, NUM_DISTANCE_CODES);
+  cost += ExtraCost(out->distance_, NUM_DISTANCE_CODES);
+  if (cost > cost_threshold) return cost;
+
+  for (i = 0; i < 256; ++i) out->alpha_[i] = a->alpha_[i] + b->alpha_[i];
+  cost += PopulationCost(out->alpha_, 256);
+
+  out->bit_cost_ = cost;
+  return cost - sum_cost;
+}
+
+// Same as HistogramAddEval(), except that the resulting histogram
+// is not stored. Only the cost C(a+b) - C(a) is evaluated. We omit
+// the term C(b) which is constant over all the evaluations.
+static double HistogramAddThresh(const VP8LHistogram* const a,
+                                 const VP8LHistogram* const b,
+                                 double cost_threshold) {
+  int tmp[PIX_OR_COPY_CODES_MAX];  // <= max storage we'll need
+  int i;
+  double cost = -a->bit_cost_;
+
+  for (i = 0; i < PIX_OR_COPY_CODES_MAX; ++i) {
+    tmp[i] = a->literal_[i] + b->literal_[i];
+  }
+  // note that the tests are ordered so that the usually largest
+  // cost shares come first.
+  cost += PopulationCost(tmp, VP8LHistogramNumCodes(a));
+  cost += ExtraCost(tmp + 256, NUM_LENGTH_CODES);
+  if (cost > cost_threshold) return cost;
+
+  for (i = 0; i < 256; ++i) tmp[i] = a->red_[i] + b->red_[i];
+  cost += PopulationCost(tmp, 256);
+  if (cost > cost_threshold) return cost;
+
+  for (i = 0; i < 256; ++i) tmp[i] = a->blue_[i] + b->blue_[i];
+  cost += PopulationCost(tmp, 256);
+  if (cost > cost_threshold) return cost;
+
+  for (i = 0; i < NUM_DISTANCE_CODES; ++i) {
+    tmp[i] = a->distance_[i] + b->distance_[i];
+  }
+  cost += PopulationCost(tmp, NUM_DISTANCE_CODES);
+  cost += ExtraCost(tmp, NUM_DISTANCE_CODES);
+  if (cost > cost_threshold) return cost;
+
+  for (i = 0; i < 256; ++i) tmp[i] = a->alpha_[i] + b->alpha_[i];
+  cost += PopulationCost(tmp, 256);
+
+  return cost;
+}
+
+// -----------------------------------------------------------------------------
+
 static void HistogramBuildImage(int xsize, int histo_bits,
                                const VP8LBackwardRefs* const backward_refs,
                                VP8LHistogramSet* const image) {
@ -287,14 +359,15 @@ static uint32_t MyRand(uint32_t *seed) {
 }

 static int HistogramCombine(const VP8LHistogramSet* const in,
-                            VP8LHistogramSet* const out, int num_pairs) {
+                            VP8LHistogramSet* const out, int iter_mult,
+                            int num_pairs, int num_tries_no_success) {
  int ok = 0;
  int i, iter;
  uint32_t seed = 0;
  int tries_with_no_success = 0;
-  const int min_cluster_size = 2;
  int out_size = in->size;
-  const int outer_iters = in->size * 3;
+  const int outer_iters = in->size * iter_mult;
+  const int min_cluster_size = 2;
  VP8LHistogram* const histos = (VP8LHistogram*)malloc(2 * sizeof(*histos));
  VP8LHistogram* cur_combo = histos + 0;    // trial merged histogram
  VP8LHistogram* best_combo = histos + 1;   // best merged histogram so far
@ -309,29 +382,26 @@ static int HistogramCombine(const VP8LHistogramSet* const in,

  // Collapse similar histograms in 'out'.
  for (iter = 0; iter < outer_iters && out_size >= min_cluster_size; ++iter) {
-    // We pick the best pair to be combined out of 'inner_iters' pairs.
    double best_cost_diff = 0.;
-    int best_idx1 = 0, best_idx2 = 1;
+    int best_idx1 = -1, best_idx2 = 1;
    int j;
+    const int num_tries = (num_pairs < out_size) ? num_pairs : out_size;
    seed += iter;
-    for (j = 0; j < num_pairs; ++j) {
+    for (j = 0; j < num_tries; ++j) {
      double curr_cost_diff;
      // Choose two histograms at random and try to combine them.
      const uint32_t idx1 = MyRand(&seed) % out_size;
-      const uint32_t tmp = ((j & 7) + 1) % (out_size - 1);
+      const uint32_t tmp = (j & 7) + 1;
      const uint32_t diff = (tmp < 3) ? tmp : MyRand(&seed) % (out_size - 1);
      const uint32_t idx2 = (idx1 + diff + 1) % out_size;
      if (idx1 == idx2) {
        continue;
      }
-      *cur_combo = *out->histograms[idx1];
-      VP8LHistogramAdd(cur_combo, out->histograms[idx2]);
-      cur_combo->bit_cost_ = VP8LHistogramEstimateBits(cur_combo);
      // Calculate cost reduction on combining.
-      curr_cost_diff = cur_combo->bit_cost_
-                     - out->histograms[idx1]->bit_cost_
-                     - out->histograms[idx2]->bit_cost_;
-      if (best_cost_diff > curr_cost_diff) {    // found a better pair?
+      curr_cost_diff = HistogramAddEval(out->histograms[idx1],
+                                        out->histograms[idx2],
+                                        cur_combo, best_cost_diff);
+      if (curr_cost_diff < best_cost_diff) {    // found a better pair?
        {     // swap cur/best combo histograms
          VP8LHistogram* const tmp_histo = cur_combo;
          cur_combo = best_combo;
@ -343,7 +413,7 @@ static int HistogramCombine(const VP8LHistogramSet* const in,
      }
    }

-    if (best_cost_diff < 0.0) {
+    if (best_idx1 >= 0) {
      *out->histograms[best_idx1] = *best_combo;
      // swap best_idx2 slot with last one (which is now unused)
      --out_size;
@ -353,7 +423,7 @@ static int HistogramCombine(const VP8LHistogramSet* const in,
      }
      tries_with_no_success = 0;
    }
-    if (++tries_with_no_success >= 50) {
+    if (++tries_with_no_success >= num_tries_no_success) {
      break;
    }
  }
@ -368,20 +438,11 @@ static int HistogramCombine(const VP8LHistogramSet* const in,
 // -----------------------------------------------------------------------------
 // Histogram refinement

-// What is the bit cost of moving square_histogram from
-// cur_symbol to candidate_symbol.
-// TODO(skal): we don't really need to copy the histogram and Add(). Instead
-// we just need VP8LDualHistogramEstimateBits(A, B) estimation function.
+// What is the bit cost of moving square_histogram from cur_symbol to candidate.
 static double HistogramDistance(const VP8LHistogram* const square_histogram,
-                                const VP8LHistogram* const candidate) {
-  const double previous_bit_cost = candidate->bit_cost_;
-  double new_bit_cost;
-  VP8LHistogram modified_histo;
-  modified_histo = *candidate;
-  VP8LHistogramAdd(&modified_histo, square_histogram);
-  new_bit_cost = VP8LHistogramEstimateBits(&modified_histo);
-
-  return new_bit_cost - previous_bit_cost;
+                                const VP8LHistogram* const candidate,
+                                double cost_threshold) {
+  return HistogramAddThresh(candidate, square_histogram, cost_threshold);
 }

 // Find the best 'out' histogram for each of the 'in' histograms.
@ -392,11 +453,12 @@ static void HistogramRemap(const VP8LHistogramSet* const in,
  int i;
  for (i = 0; i < in->size; ++i) {
    int best_out = 0;
-    double best_bits = HistogramDistance(in->histograms[i], out->histograms[0]);
+    double best_bits =
+        HistogramDistance(in->histograms[i], out->histograms[0], 1.e38);
    int k;
    for (k = 1; k < out->size; ++k) {
      const double cur_bits =
-          HistogramDistance(in->histograms[i], out->histograms[k]);
+          HistogramDistance(in->histograms[i], out->histograms[k], best_bits);
      if (cur_bits < best_bits) {
        best_bits = cur_bits;
        best_out = k;
@ -410,7 +472,7 @@ static void HistogramRemap(const VP8LHistogramSet* const in,
    HistogramClear(out->histograms[i]);
  }
  for (i = 0; i < in->size; ++i) {
-    VP8LHistogramAdd(out->histograms[symbols[i]], in->histograms[i]);
+    HistogramAdd(in->histograms[i], out->histograms[symbols[i]]);
  }
 }

@ -422,8 +484,13 @@ int VP8LGetHistoImageSymbols(int xsize, int ysize,
  int ok = 0;
  const int histo_xsize = histo_bits ? VP8LSubSampleSize(xsize, histo_bits) : 1;
  const int histo_ysize = histo_bits ? VP8LSubSampleSize(ysize, histo_bits) : 1;
-  const int num_histo_pairs = 10 + quality / 2;  // For HistogramCombine().
  const int histo_image_raw_size = histo_xsize * histo_ysize;
+
+  // Heuristic params for HistogramCombine().
+  const int num_tries_no_success = 8 + (quality >> 1);
+  const int iter_mult = (quality < 27) ? 1 : 1 + ((quality - 27) >> 4);
+  const int num_pairs = (quality < 25) ? 10 : (5 * quality) >> 3;
+
  VP8LHistogramSet* const image_out =
      VP8LAllocateHistogramSet(histo_image_raw_size, cache_bits);
  if (image_out == NULL) return 0;
@ -431,7 +498,8 @@ int VP8LGetHistoImageSymbols(int xsize, int ysize,
  // Build histogram image.
  HistogramBuildImage(xsize, histo_bits, refs, image_out);
  // Collapse similar histograms.
-  if (!HistogramCombine(image_out, image_in, num_histo_pairs)) {
+  if (!HistogramCombine(image_out, image_in, iter_mult, num_pairs,
+                        num_tries_no_success)) {
    goto Error;
  }
  // Find the optimal map from original histograms to the final ones.
--- a/src/enc/histogram.h
+++ b/src/enc/histogram.h
@ -1,8 +1,10 @@
 // Copyright 2012 Google Inc. All Rights Reserved.
 //
-// This code is licensed under the same terms as WebM:
-//  Software License Agreement:  http://www.webmproject.org/license/software/
-//  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
 // -----------------------------------------------------------------------------
 //
 // Author: Jyrki Alakuijala (jyrki@google.com)
@ -22,7 +24,7 @@
 #include "../webp/format_constants.h"
 #include "../webp/types.h"

-#if defined(__cplusplus) || defined(c_plusplus)
+#ifdef __cplusplus
 extern "C" {
 #endif

@ -80,30 +82,11 @@ double VP8LHistogramEstimateBits(const VP8LHistogram* const p);
 // represent the entropy code itself.
 double VP8LHistogramEstimateBitsBulk(const VP8LHistogram* const p);

-static WEBP_INLINE void VP8LHistogramAdd(VP8LHistogram* const p,
-                                         const VP8LHistogram* const a) {
-  int i;
-  for (i = 0; i < PIX_OR_COPY_CODES_MAX; ++i) {
-    p->literal_[i] += a->literal_[i];
-  }
-  for (i = 0; i < NUM_DISTANCE_CODES; ++i) {
-    p->distance_[i] += a->distance_[i];
-  }
-  for (i = 0; i < 256; ++i) {
-    p->red_[i] += a->red_[i];
-    p->blue_[i] += a->blue_[i];
-    p->alpha_[i] += a->alpha_[i];
-  }
-}
-
 static WEBP_INLINE int VP8LHistogramNumCodes(const VP8LHistogram* const p) {
  return 256 + NUM_LENGTH_CODES +
      ((p->palette_code_bits_ > 0) ? (1 << p->palette_code_bits_) : 0);
 }

-void VP8LConvertPopulationCountTableToBitEstimates(
-    int num_symbols, const int population_counts[], double output[]);
-
 // Builds the histogram image.
 int VP8LGetHistoImageSymbols(int xsize, int ysize,
                             const VP8LBackwardRefs* const refs,
@ -111,7 +94,7 @@ int VP8LGetHistoImageSymbols(int xsize, int ysize,
                             VP8LHistogramSet* const image_in,
                             uint16_t* const histogram_symbols);

-#if defined(__cplusplus) || defined(c_plusplus)
+#ifdef __cplusplus
 }
 #endif

--- a/src/enc/iterator.c
+++ b/src/enc/iterator.c
@ -1,8 +1,10 @@
 // Copyright 2011 Google Inc. All Rights Reserved.
 //
-// This code is licensed under the same terms as WebM:
-//  Software License Agreement:  http://www.webmproject.org/license/software/
-//  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
 // -----------------------------------------------------------------------------
 //
 // VP8Iterator: block iterator
@ -13,21 +15,16 @@

 #include "./vp8enci.h"

-#if defined(__cplusplus) || defined(c_plusplus)
-extern "C" {
-#endif
-
 //------------------------------------------------------------------------------
 // VP8Iterator
 //------------------------------------------------------------------------------

 static void InitLeft(VP8EncIterator* const it) {
-  const VP8Encoder* const enc = it->enc_;
-  enc->y_left_[-1] = enc->u_left_[-1] = enc->v_left_[-1] =
+  it->y_left_[-1] = it->u_left_[-1] = it->v_left_[-1] =
      (it->y_ > 0) ? 129 : 127;
-  memset(enc->y_left_, 129, 16);
-  memset(enc->u_left_, 129, 8);
-  memset(enc->v_left_, 129, 8);
+  memset(it->y_left_, 129, 16);
+  memset(it->u_left_, 129, 8);
+  memset(it->v_left_, 129, 8);
  it->left_nz_[8] = 0;
 }

@ -38,43 +35,60 @@ static void InitTop(VP8EncIterator* const it) {
  memset(enc->nz_, 0, enc->mb_w_ * sizeof(*enc->nz_));
 }

-void VP8IteratorReset(VP8EncIterator* const it) {
+void VP8IteratorSetRow(VP8EncIterator* const it, int y) {
  VP8Encoder* const enc = it->enc_;
  it->x_ = 0;
-  it->y_ = 0;
-  it->y_offset_ = 0;
-  it->uv_offset_ = 0;
-  it->mb_ = enc->mb_info_;
-  it->preds_ = enc->preds_;
+  it->y_ = y;
+  it->bw_ = &enc->parts_[y & (enc->num_parts_ - 1)];
+  it->preds_ = enc->preds_ + y * 4 * enc->preds_w_;
  it->nz_ = enc->nz_;
-  it->bw_ = &enc->parts_[0];
-  it->done_ = enc->mb_w_* enc->mb_h_;
+  it->mb_ = enc->mb_info_ + y * enc->mb_w_;
+  it->y_top_ = enc->y_top_;
+  it->uv_top_ = enc->uv_top_;
+  InitLeft(it);
+}
+
+void VP8IteratorReset(VP8EncIterator* const it) {
+  VP8Encoder* const enc = it->enc_;
+  VP8IteratorSetRow(it, 0);
+  VP8IteratorSetCountDown(it, enc->mb_w_ * enc->mb_h_);  // default
  InitTop(it);
  InitLeft(it);
  memset(it->bit_count_, 0, sizeof(it->bit_count_));
  it->do_trellis_ = 0;
 }

+void VP8IteratorSetCountDown(VP8EncIterator* const it, int count_down) {
+  it->count_down_ = it->count_down0_ = count_down;
+}
+
+int VP8IteratorIsDone(const VP8EncIterator* const it) {
+  return (it->count_down_ <= 0);
+}
+
 void VP8IteratorInit(VP8Encoder* const enc, VP8EncIterator* const it) {
  it->enc_ = enc;
  it->y_stride_  = enc->pic_->y_stride;
  it->uv_stride_ = enc->pic_->uv_stride;
-  // TODO(later): for multithreading, these should be owned by 'it'.
-  it->yuv_in_   = enc->yuv_in_;
-  it->yuv_out_  = enc->yuv_out_;
-  it->yuv_out2_ = enc->yuv_out2_;
-  it->yuv_p_    = enc->yuv_p_;
+  it->yuv_in_   = (uint8_t*)DO_ALIGN(it->yuv_mem_);
+  it->yuv_out_  = it->yuv_in_ + YUV_SIZE;
+  it->yuv_out2_ = it->yuv_out_ + YUV_SIZE;
+  it->yuv_p_    = it->yuv_out2_ + YUV_SIZE;
  it->lf_stats_ = enc->lf_stats_;
  it->percent0_ = enc->percent_;
+  it->y_left_ = (uint8_t*)DO_ALIGN(it->yuv_left_mem_ + 1);
+  it->u_left_ = it->y_left_ + 16 + 16;
+  it->v_left_ = it->u_left_ + 16;
  VP8IteratorReset(it);
 }

 int VP8IteratorProgress(const VP8EncIterator* const it, int delta) {
  VP8Encoder* const enc = it->enc_;
-  if (delta && enc->pic_->progress_hook) {
-    const int percent = (enc->mb_h_ <= 1)
+  if (delta && enc->pic_->progress_hook != NULL) {
+    const int done = it->count_down0_ - it->count_down_;
+    const int percent = (it->count_down0_ <= 0)
                      ? it->percent0_
-                      : it->percent0_ + delta * it->y_ / (enc->mb_h_ - 1);
+                      : it->percent0_ + delta * done / it->count_down0_;
    return WebPReportProgress(enc->pic_, percent, &enc->percent_);
  }
  return 1;
@ -84,6 +98,8 @@ int VP8IteratorProgress(const VP8EncIterator* const it, int delta) {
 // Import the source samples into the cache. Takes care of replicating
 // boundary pixels if necessary.

+static WEBP_INLINE int MinSize(int a, int b) { return (a < b) ? a : b; }
+
 static void ImportBlock(const uint8_t* src, int src_stride,
                        uint8_t* dst, int w, int h, int size) {
  int i;
@ -101,30 +117,55 @@ static void ImportBlock(const uint8_t* src, int src_stride,
  }
 }

-void VP8IteratorImport(const VP8EncIterator* const it) {
+static void ImportLine(const uint8_t* src, int src_stride,
+                       uint8_t* dst, int len, int total_len) {
+  int i;
+  for (i = 0; i < len; ++i, src += src_stride) dst[i] = *src;
+  for (; i < total_len; ++i) dst[i] = dst[len - 1];
+}
+
+void VP8IteratorImport(VP8EncIterator* const it, uint8_t* tmp_32) {
  const VP8Encoder* const enc = it->enc_;
  const int x = it->x_, y = it->y_;
  const WebPPicture* const pic = enc->pic_;
  const uint8_t* const ysrc = pic->y + (y * pic->y_stride  + x) * 16;
  const uint8_t* const usrc = pic->u + (y * pic->uv_stride + x) * 8;
  const uint8_t* const vsrc = pic->v + (y * pic->uv_stride + x) * 8;
-  uint8_t* const ydst = it->yuv_in_ + Y_OFF;
-  uint8_t* const udst = it->yuv_in_ + U_OFF;
-  uint8_t* const vdst = it->yuv_in_ + V_OFF;
-  int w = (pic->width - x * 16);
-  int h = (pic->height - y * 16);
-
-  if (w > 16) w = 16;
-  if (h > 16) h = 16;
-
-  // Luma plane
-  ImportBlock(ysrc, pic->y_stride, ydst, w, h, 16);
-
-  {   // U/V planes
+  const int w = MinSize(pic->width - x * 16, 16);
+  const int h = MinSize(pic->height - y * 16, 16);
  const int uv_w = (w + 1) >> 1;
  const int uv_h = (h + 1) >> 1;
-    ImportBlock(usrc, pic->uv_stride, udst, uv_w, uv_h, 8);
-    ImportBlock(vsrc, pic->uv_stride, vdst, uv_w, uv_h, 8);
+
+  ImportBlock(ysrc, pic->y_stride,  it->yuv_in_ + Y_OFF, w, h, 16);
+  ImportBlock(usrc, pic->uv_stride, it->yuv_in_ + U_OFF, uv_w, uv_h, 8);
+  ImportBlock(vsrc, pic->uv_stride, it->yuv_in_ + V_OFF, uv_w, uv_h, 8);
+
+  if (tmp_32 == NULL) return;
+
+  // Import source (uncompressed) samples into boundary.
+  if (x == 0) {
+    InitLeft(it);
+  } else {
+    if (y == 0) {
+      it->y_left_[-1] = it->u_left_[-1] = it->v_left_[-1] = 127;
+    } else {
+      it->y_left_[-1] = ysrc[- 1 - pic->y_stride];
+      it->u_left_[-1] = usrc[- 1 - pic->uv_stride];
+      it->v_left_[-1] = vsrc[- 1 - pic->uv_stride];
+    }
+    ImportLine(ysrc - 1, pic->y_stride,  it->y_left_, h,   16);
+    ImportLine(usrc - 1, pic->uv_stride, it->u_left_, uv_h, 8);
+    ImportLine(vsrc - 1, pic->uv_stride, it->v_left_, uv_h, 8);
+  }
+
+  it->y_top_  = tmp_32 + 0;
+  it->uv_top_ = tmp_32 + 16;
+  if (y == 0) {
+    memset(tmp_32, 127, 32 * sizeof(*tmp_32));
+  } else {
+    ImportLine(ysrc - pic->y_stride,  1, tmp_32,          w,   16);
+    ImportLine(usrc - pic->uv_stride, 1, tmp_32 + 16,     uv_w, 8);
+    ImportLine(vsrc - pic->uv_stride, 1, tmp_32 + 16 + 8, uv_w, 8);
  }
 }

@ -240,48 +281,44 @@ void VP8IteratorBytesToNz(VP8EncIterator* const it) {
 #undef BIT

 //------------------------------------------------------------------------------
-// Advance to the next position, doing the bookeeping.
+// Advance to the next position, doing the bookkeeping.

-int VP8IteratorNext(VP8EncIterator* const it,
-                    const uint8_t* const block_to_save) {
+void VP8IteratorSaveBoundary(VP8EncIterator* const it) {
  VP8Encoder* const enc = it->enc_;
-  if (block_to_save) {
  const int x = it->x_, y = it->y_;
-    const uint8_t* const ysrc = block_to_save + Y_OFF;
-    const uint8_t* const usrc = block_to_save + U_OFF;
+  const uint8_t* const ysrc = it->yuv_out_ + Y_OFF;
+  const uint8_t* const uvsrc = it->yuv_out_ + U_OFF;
  if (x < enc->mb_w_ - 1) {   // left
    int i;
    for (i = 0; i < 16; ++i) {
-        enc->y_left_[i] = ysrc[15 + i * BPS];
+      it->y_left_[i] = ysrc[15 + i * BPS];
    }
    for (i = 0; i < 8; ++i) {
-        enc->u_left_[i] = usrc[7 + i * BPS];
-        enc->v_left_[i] = usrc[15 + i * BPS];
+      it->u_left_[i] = uvsrc[7 + i * BPS];
+      it->v_left_[i] = uvsrc[15 + i * BPS];
    }
    // top-left (before 'top'!)
-      enc->y_left_[-1] = enc->y_top_[x * 16 + 15];
-      enc->u_left_[-1] = enc->uv_top_[x * 16 + 0 + 7];
-      enc->v_left_[-1] = enc->uv_top_[x * 16 + 8 + 7];
+    it->y_left_[-1] = it->y_top_[15];
+    it->u_left_[-1] = it->uv_top_[0 + 7];
+    it->v_left_[-1] = it->uv_top_[8 + 7];
  }
  if (y < enc->mb_h_ - 1) {  // top
-      memcpy(enc->y_top_ + x * 16, ysrc + 15 * BPS, 16);
-      memcpy(enc->uv_top_ + x * 16, usrc + 7 * BPS, 8 + 8);
+    memcpy(it->y_top_, ysrc + 15 * BPS, 16);
+    memcpy(it->uv_top_, uvsrc + 7 * BPS, 8 + 8);
  }
 }

-  it->mb_++;
+int VP8IteratorNext(VP8EncIterator* const it) {
  it->preds_ += 4;
-  it->nz_++;
-  it->x_++;
-  if (it->x_ == enc->mb_w_) {
-    it->x_ = 0;
-    it->y_++;
-    it->bw_ = &enc->parts_[it->y_ & (enc->num_parts_ - 1)];
-    it->preds_ = enc->preds_ + it->y_ * 4 * enc->preds_w_;
-    it->nz_ = enc->nz_;
-    InitLeft(it);
+  it->mb_ += 1;
+  it->nz_ += 1;
+  it->y_top_ += 16;
+  it->uv_top_ += 16;
+  it->x_ += 1;
+  if (it->x_ == it->enc_->mb_w_) {
+    VP8IteratorSetRow(it, ++it->y_);
  }
-  return (0 < --it->done_);
+  return (0 < --it->count_down_);
 }

 //------------------------------------------------------------------------------
@ -368,15 +405,15 @@ void VP8IteratorStartI4(VP8EncIterator* const it) {

  // Import the boundary samples
  for (i = 0; i < 17; ++i) {    // left
-    it->i4_boundary_[i] = enc->y_left_[15 - i];
+    it->i4_boundary_[i] = it->y_left_[15 - i];
  }
  for (i = 0; i < 16; ++i) {    // top
-    it->i4_boundary_[17 + i] = enc->y_top_[it->x_ * 16 + i];
+    it->i4_boundary_[17 + i] = it->y_top_[i];
  }
  // top-right samples have a special case on the far right of the picture
  if (it->x_ < enc->mb_w_ - 1) {
    for (i = 16; i < 16 + 4; ++i) {
-      it->i4_boundary_[17 + i] = enc->y_top_[it->x_ * 16 + i];
+      it->i4_boundary_[17 + i] = it->y_top_[i];
    }
  } else {    // else, replicate the last valid pixel four times
    for (i = 16; i < 16 + 4; ++i) {
@ -417,6 +454,3 @@ int VP8IteratorRotateI4(VP8EncIterator* const it,

 //------------------------------------------------------------------------------

-#if defined(__cplusplus) || defined(c_plusplus)
-}    // extern "C"
-#endif
--- a/src/enc/layer.c
+++ b/src/enc/layer.c
@ -1,8 +1,10 @@
 // Copyright 2011 Google Inc. All Rights Reserved.
 //
-// This code is licensed under the same terms as WebM:
-//  Software License Agreement:  http://www.webmproject.org/license/software/
-//  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
 // -----------------------------------------------------------------------------
 //
 // Enhancement layer (for YUV444/422)
@ -13,10 +15,6 @@

 #include "./vp8enci.h"

-#if defined(__cplusplus) || defined(c_plusplus)
-extern "C" {
-#endif
-
 //------------------------------------------------------------------------------

 void VP8EncInitLayer(VP8Encoder* const enc) {
@ -44,6 +42,3 @@ void VP8EncDeleteLayer(VP8Encoder* enc) {
  free(enc->layer_data_);
 }

-#if defined(__cplusplus) || defined(c_plusplus)
-}    // extern "C"
-#endif
--- a/src/enc/picture.c
+++ b/src/enc/picture.c
@ -1,8 +1,10 @@
 // Copyright 2011 Google Inc. All Rights Reserved.
 //
-// This code is licensed under the same terms as WebM:
-//  Software License Agreement:  http://www.webmproject.org/license/software/
-//  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
 // -----------------------------------------------------------------------------
 //
 // WebPPicture utils: colorspace conversion, crop, ...
@ -14,12 +16,15 @@
 #include <math.h>

 #include "./vp8enci.h"
+#include "../utils/alpha_processing.h"
+#include "../utils/random.h"
 #include "../utils/rescaler.h"
+#include "../utils/utils.h"
 #include "../dsp/dsp.h"
+#include "../dsp/yuv.h"

-#if defined(__cplusplus) || defined(c_plusplus)
-extern "C" {
-#endif
+// Comment out to disable gamma-compression during RGB->U/V averaging
+#define USE_GAMMA_COMPRESSION

 #define HALVE(x) (((x) + 1) >> 1)
 #define IS_YUV_CSP(csp, YUV_CSP) (((csp) & WEBP_CSP_UV_MASK) == (YUV_CSP))
@ -30,6 +35,10 @@ static const union {
 } test_endian = { 0xff000000u };
 #define ALPHA_IS_LAST (test_endian.bytes[3] == 0xff)

+static WEBP_INLINE uint32_t MakeARGB32(int r, int g, int b) {
+  return (0xff000000u | (r << 16) | (g << 8) | b);
+}
+
 //------------------------------------------------------------------------------
 // WebPPicture
 //------------------------------------------------------------------------------
@ -81,14 +90,12 @@ int WebPPictureAlloc(WebPPicture* picture) {

      // Security and validation checks
      if (width <= 0 || height <= 0 ||         // luma/alpha param error
-          uv_width < 0 || uv_height < 0 ||     // u/v param error
-          y_size >= (1ULL << 40) ||            // reasonable global size
-          (size_t)total_size != total_size) {  // overflow on 32bit
+          uv_width < 0 || uv_height < 0) {     // u/v param error
        return 0;
      }
      // Clear previous buffer and allocate a new one.
      WebPPictureFree(picture);   // erase previous buffer
-      mem = (uint8_t*)malloc((size_t)total_size);
+      mem = (uint8_t*)WebPSafeMalloc(total_size, sizeof(*mem));
      if (mem == NULL) return 0;

      // From now on, we're in the clear, we can no longer fail...
@ -116,18 +123,16 @@ int WebPPictureAlloc(WebPPicture* picture) {
        picture->v0 = mem;
        mem += uv0_size;
      }
+      (void)mem;  // makes the static analyzer happy
    } else {
      void* memory;
      const uint64_t argb_size = (uint64_t)width * height;
-      const uint64_t total_size = argb_size * sizeof(*picture->argb);
-      if (width <= 0 || height <= 0 ||
-          argb_size >= (1ULL << 40) ||
-          (size_t)total_size != total_size) {
+      if (width <= 0 || height <= 0) {
        return 0;
      }
      // Clear previous buffer and allocate a new one.
      WebPPictureFree(picture);   // erase previous buffer
-      memory = malloc((size_t)total_size);
+      memory = WebPSafeMalloc(argb_size, sizeof(*picture->argb));
      if (memory == NULL) return 0;

      // TODO(skal): align plane to cache line?
@ -293,8 +298,11 @@ int WebPPictureView(const WebPPicture* src,
    dst->y = src->y + top * src->y_stride + left;
    dst->u = src->u + (top >> 1) * src->uv_stride + (left >> 1);
    dst->v = src->v + (top >> 1) * src->uv_stride + (left >> 1);
+    dst->y_stride = src->y_stride;
+    dst->uv_stride = src->uv_stride;
    if (src->a != NULL) {
      dst->a = src->a + top * src->a_stride + left;
+      dst->a_stride = src->a_stride;
    }
 #ifdef WEBP_EXPERIMENTAL_FEATURES
    if (src->u0 != NULL) {
@ -302,10 +310,12 @@ int WebPPictureView(const WebPPicture* src,
          IS_YUV_CSP(dst->colorspace, WEBP_YUV422) ? (left >> 1) : left;
      dst->u0 = src->u0 + top * src->uv0_stride + left_pos;
      dst->v0 = src->v0 + top * src->uv0_stride + left_pos;
+      dst->uv0_stride = src->uv0_stride;
    }
 #endif
  } else {
    dst->argb = src->argb + top * src->argb_stride + left;
+    dst->argb_stride = src->argb_stride;
  }
  return 1;
 }
@ -391,6 +401,28 @@ static void RescalePlane(const uint8_t* src,
  }
 }

+// Runs WebPMultARGBRow() over every row of the ARGB plane of 'pic',
+// forwarding 'inverse' unchanged to each call.
+static void AlphaMultiplyARGB(WebPPicture* const pic, int inverse) {
+  uint32_t* row = pic->argb;
+  int rows_left = pic->height;
+  while (rows_left-- > 0) {
+    WebPMultARGBRow(row, pic->width, inverse);
+    row += pic->argb_stride;
+  }
+}
+
+// Runs WebPMultRow() on each luma row of 'pic' paired with the alpha row at
+// the same height, forwarding 'inverse'. No-op when 'pic' has no alpha plane.
+static void AlphaMultiplyY(WebPPicture* const pic, int inverse) {
+  const uint8_t* alpha_row = pic->a;
+  uint8_t* luma_row;
+  int j;
+  if (alpha_row == NULL) return;
+  luma_row = pic->y;
+  for (j = 0; j < pic->height; ++j) {
+    WebPMultRow(luma_row, alpha_row, pic->width, inverse);
+    luma_row += pic->y_stride;
+    alpha_row += pic->a_stride;
+  }
+}
+
 int WebPPictureRescale(WebPPicture* pic, int width, int height) {
  WebPPicture tmp;
  int prev_width, prev_height;
@ -416,14 +448,24 @@ int WebPPictureRescale(WebPPicture* pic, int width, int height) {
  if (!WebPPictureAlloc(&tmp)) return 0;

  if (!pic->use_argb) {
-    work = (int32_t*)malloc(2 * width * sizeof(*work));
+    work = (int32_t*)WebPSafeMalloc(2ULL * width, sizeof(*work));
    if (work == NULL) {
      WebPPictureFree(&tmp);
      return 0;
    }
+    // If present, we need to rescale alpha first (for AlphaMultiplyY).
+    if (pic->a != NULL) {
+      RescalePlane(pic->a, prev_width, prev_height, pic->a_stride,
+                   tmp.a, width, height, tmp.a_stride, work, 1);
+    }

+    // We take transparency into account on the luma plane only. That's not
+    // totally exact blending, but still is a good approximation.
+    AlphaMultiplyY(pic, 0);
    RescalePlane(pic->y, prev_width, prev_height, pic->y_stride,
                 tmp.y, width, height, tmp.y_stride, work, 1);
+    AlphaMultiplyY(&tmp, 1);
+
    RescalePlane(pic->u,
                 HALVE(prev_width), HALVE(prev_height), pic->uv_stride,
                 tmp.u,
@ -433,10 +475,6 @@ int WebPPictureRescale(WebPPicture* pic, int width, int height) {
                 tmp.v,
                 HALVE(width), HALVE(height), tmp.uv_stride, work, 1);

-    if (tmp.a != NULL) {
-      RescalePlane(pic->a, prev_width, prev_height, pic->a_stride,
-                   tmp.a, width, height, tmp.a_stride, work, 1);
-    }
 #ifdef WEBP_EXPERIMENTAL_FEATURES
    if (tmp.u0 != NULL) {
      const int s = IS_YUV_CSP(tmp.colorspace, WEBP_YUV422) ? 2 : 1;
@ -449,18 +487,21 @@ int WebPPictureRescale(WebPPicture* pic, int width, int height) {
    }
 #endif
  } else {
-    work = (int32_t*)malloc(2 * width * 4 * sizeof(*work));
+    work = (int32_t*)WebPSafeMalloc(2ULL * width * 4, sizeof(*work));
    if (work == NULL) {
      WebPPictureFree(&tmp);
      return 0;
    }
-
+    // In order to correctly interpolate colors, we need to apply the alpha
+    // weighting first (black-matting), scale the RGB values, and remove
+    // the premultiplication afterward (while preserving the alpha channel).
+    AlphaMultiplyARGB(pic, 0);
    RescalePlane((const uint8_t*)pic->argb, prev_width, prev_height,
                 pic->argb_stride * 4,
                 (uint8_t*)tmp.argb, width, height,
                 tmp.argb_stride * 4,
                 work, 4);
-
+    AlphaMultiplyARGB(&tmp, 1);
  }
  WebPPictureFree(pic);
  free(work);
@ -480,17 +521,17 @@ void WebPMemoryWriterInit(WebPMemoryWriter* writer) {
 int WebPMemoryWrite(const uint8_t* data, size_t data_size,
                    const WebPPicture* picture) {
  WebPMemoryWriter* const w = (WebPMemoryWriter*)picture->custom_ptr;
-  size_t next_size;
+  uint64_t next_size;
  if (w == NULL) {
    return 1;
  }
-  next_size = w->size + data_size;
+  next_size = (uint64_t)w->size + data_size;
  if (next_size > w->max_size) {
    uint8_t* new_mem;
-    size_t next_max_size = w->max_size * 2;
+    uint64_t next_max_size = 2ULL * w->max_size;
    if (next_max_size < next_size) next_max_size = next_size;
-    if (next_max_size < 8192) next_max_size = 8192;
-    new_mem = (uint8_t*)malloc(next_max_size);
+    if (next_max_size < 8192ULL) next_max_size = 8192ULL;
+    new_mem = (uint8_t*)WebPSafeMalloc(next_max_size, 1);
    if (new_mem == NULL) {
      return 0;
    }
@ -499,7 +540,8 @@ int WebPMemoryWrite(const uint8_t* data, size_t data_size,
    }
    free(w->mem);
    w->mem = new_mem;
-    w->max_size = next_max_size;
+    // down-cast is ok, thanks to WebPSafeMalloc
+    w->max_size = (size_t)next_max_size;
  }
  if (data_size > 0) {
    memcpy(w->mem + w->size, data, data_size);
@ -547,48 +589,102 @@ int WebPPictureHasTransparency(const WebPPicture* picture) {

 //------------------------------------------------------------------------------
 // RGB -> YUV conversion
-// The exact naming is Y'CbCr, following the ITU-R BT.601 standard.
-// More information at: http://en.wikipedia.org/wiki/YCbCr
-// Y = 0.2569 * R + 0.5044 * G + 0.0979 * B + 16
-// U = -0.1483 * R - 0.2911 * G + 0.4394 * B + 128
-// V = 0.4394 * R - 0.3679 * G - 0.0715 * B + 128
-// We use 16bit fixed point operations.

-enum { YUV_FRAC = 16 };
-
-static WEBP_INLINE int clip_uv(int v) {
-   v = (v + (257 << (YUV_FRAC + 2 - 1))) >> (YUV_FRAC + 2);
-   return ((v & ~0xff) == 0) ? v : (v < 0) ? 0 : 255;
+static int RGBToY(int r, int g, int b, VP8Random* const rg) {
+  return VP8RGBToY(r, g, b, VP8RandomBits(rg, YUV_FIX));
 }

-static WEBP_INLINE int rgb_to_y(int r, int g, int b) {
-  const int kRound = (1 << (YUV_FRAC - 1)) + (16 << YUV_FRAC);
-  const int luma = 16839 * r + 33059 * g + 6420 * b;
-  return (luma + kRound) >> YUV_FRAC;  // no need to clip
+static int RGBToU(int r, int g, int b, VP8Random* const rg) {
+  return VP8RGBToU(r, g, b, VP8RandomBits(rg, YUV_FIX + 2));
 }

-static WEBP_INLINE int rgb_to_u(int r, int g, int b) {
-  return clip_uv(-9719 * r - 19081 * g + 28800 * b);
+static int RGBToV(int r, int g, int b, VP8Random* const rg) {
+  return VP8RGBToV(r, g, b, VP8RandomBits(rg, YUV_FIX + 2));
 }

-static WEBP_INLINE int rgb_to_v(int r, int g, int b) {
-  return clip_uv(+28800 * r - 24116 * g - 4684 * b);
+//------------------------------------------------------------------------------
+
+#if defined(USE_GAMMA_COMPRESSION)
+
+// gamma-compensates loss of resolution during chroma subsampling
+#define kGamma 0.80
+#define kGammaFix 12     // fixed-point precision for linear values
+#define kGammaScale ((1 << kGammaFix) - 1)
+#define kGammaTabFix 7   // fixed-point fractional bits precision
+#define kGammaTabScale (1 << kGammaTabFix)
+#define kGammaTabRounder (kGammaTabScale >> 1)
+#define kGammaTabSize (1 << (kGammaFix - kGammaTabFix))
+
+static int kLinearToGammaTab[kGammaTabSize + 1];
+static uint16_t kGammaToLinearTab[256];
+static int kGammaTablesOk = 0;
+
+// Fills kGammaToLinearTab[] and kLinearToGammaTab[] the first time it is
+// called; once kGammaTablesOk is set, later calls return immediately.
+static void InitGammaTables(void) {
+  double inv_scale;
+  int n;
+  if (kGammaTablesOk) return;
+  inv_scale = 1. / kGammaScale;
+  // 8-bit sample -> pow(sample / 255, kGamma), scaled by kGammaScale.
+  for (n = 0; n < 256; ++n) {
+    kGammaToLinearTab[n] =
+        (uint16_t)(pow(n / 255., kGamma) * kGammaScale + .5);
+  }
+  // One entry per kGammaTabScale linear units; the extra last entry keeps
+  // index kGammaTabSize valid.
+  for (n = 0; n <= kGammaTabSize; ++n) {
+    const double pos = inv_scale * (n << kGammaTabFix);
+    kLinearToGammaTab[n] = (int)(pow(pos, 1. / kGamma) * 255. + .5);
+  }
+  kGammaTablesOk = 1;
 }

-// TODO: we can do better than simply 2x2 averaging on U/V samples.
-#define SUM4(ptr) ((ptr)[0] + (ptr)[step] + \
-                   (ptr)[rgb_stride] + (ptr)[rgb_stride + step])
-#define SUM2H(ptr) (2 * (ptr)[0] + 2 * (ptr)[step])
-#define SUM2V(ptr) (2 * (ptr)[0] + 2 * (ptr)[rgb_stride])
-#define SUM1(ptr)  (4 * (ptr)[0])
+// Looks up the kGammaToLinearTab[] entry for the 8-bit sample 'v'.
+static WEBP_INLINE uint32_t GammaToLinear(uint8_t v) {
+  const uint32_t linear = kGammaToLinearTab[v];
+  return linear;
+}
+
+// Converts the linear sum 'base_value', scaled up by 2^shift, into a U/V
+// input value suitable for RGBToU/V calls. The value is obtained by linear
+// interpolation between two adjacent kLinearToGammaTab[] entries; because the
+// interpolation weights add up to (kGammaTabScale << 2) while the final shift
+// is only kGammaTabFix, the result is four times the table-domain value.
+static WEBP_INLINE int LinearToGamma(uint32_t base_value, int shift) {
+  const int span = kGammaTabScale << 2;             // weight total
+  const int scaled = base_value << shift;           // final uplifted value
+  const int index = scaled >> (kGammaTabFix + 2);   // integer part
+  const int frac = scaled & (span - 1);             // fractional part
+  const int lo = kLinearToGammaTab[index];
+  const int hi = kLinearToGammaTab[index + 1];
+  const int blended = hi * frac + lo * (span - frac);
+  return (blended + kGammaTabRounder) >> kGammaTabFix;
+}
+
+#else
+
+// Gamma compression disabled: there are no tables to build and samples are
+// used as-is.
+static void InitGammaTables(void) {}
+static WEBP_INLINE uint32_t GammaToLinear(uint8_t v) { return v; }
+// Scales the sample sum 'base_value' by 2^shift. With the shifts used by
+// SUM4 (0), SUM2H/SUM2V (1) and SUM1 (2), every variant yields a value
+// equivalent to a sum over four samples.
+static WEBP_INLINE int LinearToGamma(uint32_t base_value, int shift) {
+  return (int)(base_value << shift);
+}
+
+#endif    // USE_GAMMA_COMPRESSION
+
+//------------------------------------------------------------------------------
+
+// Sample sums routed through GammaToLinear() / LinearToGamma(). The shift
+// argument (0, 1 or 2) compensates for the number of samples added, so each
+// variant is scaled like a four-sample sum.
+// SUM4 and SUM2H read 'step', SUM4 and SUM2V read 'rgb_stride'; both names
+// must be in scope where the macro is expanded.
+#define SUM4(ptr) LinearToGamma(                         \
+    GammaToLinear((ptr)[0]) +                            \
+    GammaToLinear((ptr)[step]) +                         \
+    GammaToLinear((ptr)[rgb_stride]) +                   \
+    GammaToLinear((ptr)[rgb_stride + step]), 0)
+
+#define SUM2H(ptr) \
+    LinearToGamma(GammaToLinear((ptr)[0]) + GammaToLinear((ptr)[step]), 1)
+#define SUM2V(ptr) \
+    LinearToGamma(GammaToLinear((ptr)[0]) + GammaToLinear((ptr)[rgb_stride]), 1)
+#define SUM1(ptr)  \
+    LinearToGamma(GammaToLinear((ptr)[0]), 2)
+
 #define RGB_TO_UV(x, y, SUM) {                           \
  const int src = (2 * (step * (x) + (y) * rgb_stride)); \
  const int dst = (x) + (y) * picture->uv_stride;        \
  const int r = SUM(r_ptr + src);                        \
  const int g = SUM(g_ptr + src);                        \
  const int b = SUM(b_ptr + src);                        \
-  picture->u[dst] = rgb_to_u(r, g, b);                   \
-  picture->v[dst] = rgb_to_v(r, g, b);                   \
+  picture->u[dst] = RGBToU(r, g, b, &rg);                \
+  picture->v[dst] = RGBToV(r, g, b, &rg);                \
 }

 #define RGB_TO_UV0(x_in, x_out, y, SUM) {                \
@ -597,8 +693,8 @@ static WEBP_INLINE int rgb_to_v(int r, int g, int b) {
  const int r = SUM(r_ptr + src);                        \
  const int g = SUM(g_ptr + src);                        \
  const int b = SUM(b_ptr + src);                        \
-  picture->u0[dst] = rgb_to_u(r, g, b);                  \
-  picture->v0[dst] = rgb_to_v(r, g, b);                  \
+  picture->u0[dst] = RGBToU(r, g, b, &rg);               \
+  picture->v0[dst] = RGBToV(r, g, b, &rg);               \
 }

 static void MakeGray(WebPPicture* const picture) {
@ -617,12 +713,14 @@ static int ImportYUVAFromRGBA(const uint8_t* const r_ptr,
                              const uint8_t* const a_ptr,
                              int step,         // bytes per pixel
                              int rgb_stride,   // bytes per scanline
+                              float dithering,
                              WebPPicture* const picture) {
  const WebPEncCSP uv_csp = picture->colorspace & WEBP_CSP_UV_MASK;
  int x, y;
  const int width = picture->width;
  const int height = picture->height;
  const int has_alpha = CheckNonOpaque(a_ptr, width, height, step, rgb_stride);
+  VP8Random rg;

  picture->colorspace = uv_csp;
  picture->use_argb = 0;
@ -631,12 +729,15 @@ static int ImportYUVAFromRGBA(const uint8_t* const r_ptr,
  }
  if (!WebPPictureAlloc(picture)) return 0;

+  VP8InitRandom(&rg, dithering);
+  InitGammaTables();
+
  // Import luma plane
  for (y = 0; y < height; ++y) {
    for (x = 0; x < width; ++x) {
      const int offset = step * x + y * rgb_stride;
      picture->y[x + y * picture->y_stride] =
-          rgb_to_y(r_ptr[offset], g_ptr[offset], b_ptr[offset]);
+          RGBToY(r_ptr[offset], g_ptr[offset], b_ptr[offset], &rg);
    }
  }

@ -646,7 +747,7 @@ static int ImportYUVAFromRGBA(const uint8_t* const r_ptr,
      for (x = 0; x < (width >> 1); ++x) {
        RGB_TO_UV(x, y, SUM4);
      }
-      if (picture->width & 1) {
+      if (width & 1) {
        RGB_TO_UV(x, y, SUM2V);
      }
    }
@ -684,6 +785,7 @@ static int ImportYUVAFromRGBA(const uint8_t* const r_ptr,

  if (has_alpha) {
    assert(step >= 4);
+    assert(picture->a != NULL);
    for (y = 0; y < height; ++y) {
      for (x = 0; x < width; ++x) {
        picture->a[x + y * picture->a_stride] =
@ -706,7 +808,7 @@ static int Import(WebPPicture* const picture,

  if (!picture->use_argb) {
    return ImportYUVAFromRGBA(r_ptr, g_ptr, b_ptr, a_ptr, step, rgb_stride,
-                              picture);
+                              0.f /* no dithering */, picture);
  }
  if (import_alpha) {
    picture->colorspace |= WEBP_CSP_ALPHA_BIT;
@ -721,10 +823,7 @@ static int Import(WebPPicture* const picture,
      for (x = 0; x < width; ++x) {
        const int offset = step * x + y * rgb_stride;
        const uint32_t argb =
-            0xff000000u |
-            (r_ptr[offset] << 16) |
-            (g_ptr[offset] <<  8) |
-            (b_ptr[offset]);
+            MakeARGB32(r_ptr[offset], g_ptr[offset], b_ptr[offset]);
        picture->argb[x + y * picture->argb_stride] = argb;
      }
    }
@ -734,7 +833,7 @@ static int Import(WebPPicture* const picture,
    for (y = 0; y < height; ++y) {
      for (x = 0; x < width; ++x) {
        const int offset = step * x + y * rgb_stride;
-        const uint32_t argb = (a_ptr[offset] << 24) |
+        const uint32_t argb = ((uint32_t)a_ptr[offset] << 24) |
                              (r_ptr[offset] << 16) |
                              (g_ptr[offset] <<  8) |
                              (b_ptr[offset]);
@ -785,8 +884,7 @@ int WebPPictureImportBGRX(WebPPicture* picture,

 int WebPPictureYUVAToARGB(WebPPicture* picture) {
  if (picture == NULL) return 0;
-  if (picture->memory_ == NULL || picture->y == NULL ||
-      picture->u == NULL || picture->v == NULL) {
+  if (picture->y == NULL || picture->u == NULL || picture->v == NULL) {
    return WebPEncodingSetError(picture, VP8_ENC_ERROR_NULL_PARAMETER);
  }
  if ((picture->colorspace & WEBP_CSP_ALPHA_BIT) && picture->a == NULL) {
@ -809,7 +907,7 @@ int WebPPictureYUVAToARGB(WebPPicture* picture) {
    WebPUpsampleLinePairFunc upsample = WebPGetLinePairConverter(ALPHA_IS_LAST);

    // First row, with replicated top samples.
-    upsample(NULL, cur_y, cur_u, cur_v, cur_u, cur_v, NULL, dst, width);
+    upsample(cur_y, NULL, cur_u, cur_v, cur_u, cur_v, dst, NULL, width);
    cur_y += picture->y_stride;
    dst += argb_stride;
    // Center rows.
@ -830,11 +928,11 @@ int WebPPictureYUVAToARGB(WebPPicture* picture) {
    // Insert alpha values if needed, in replacement for the default 0xff ones.
    if (picture->colorspace & WEBP_CSP_ALPHA_BIT) {
      for (y = 0; y < height; ++y) {
-        uint32_t* const dst = picture->argb + y * picture->argb_stride;
+        uint32_t* const argb_dst = picture->argb + y * picture->argb_stride;
        const uint8_t* const src = picture->a + y * picture->a_stride;
        int x;
        for (x = 0; x < width; ++x) {
-          dst[x] = (dst[x] & 0x00ffffffu) | (src[x] << 24);
+          argb_dst[x] = (argb_dst[x] & 0x00ffffffu) | ((uint32_t)src[x] << 24);
        }
      }
    }
@ -842,7 +940,8 @@ int WebPPictureYUVAToARGB(WebPPicture* picture) {
  return 1;
 }

-int WebPPictureARGBToYUVA(WebPPicture* picture, WebPEncCSP colorspace) {
+int WebPPictureARGBToYUVADithered(WebPPicture* picture, WebPEncCSP colorspace,
+                                  float dithering) {
  if (picture == NULL) return 0;
  if (picture->argb == NULL) {
    return WebPEncodingSetError(picture, VP8_ENC_ERROR_NULL_PARAMETER);
@ -858,7 +957,8 @@ int WebPPictureARGBToYUVA(WebPPicture* picture, WebPEncCSP colorspace) {
    PictureResetARGB(&tmp);  // reset ARGB buffer so that it's not free()'d.
    tmp.use_argb = 0;
    tmp.colorspace = colorspace & WEBP_CSP_UV_MASK;
-    if (!ImportYUVAFromRGBA(r, g, b, a, 4, 4 * picture->argb_stride, &tmp)) {
+    if (!ImportYUVAFromRGBA(r, g, b, a, 4, 4 * picture->argb_stride, dithering,
+                            &tmp)) {
      return WebPEncodingSetError(picture, VP8_ENC_ERROR_OUT_OF_MEMORY);
    }
    // Copy back the YUV specs into 'picture'.
@ -870,6 +970,10 @@ int WebPPictureARGBToYUVA(WebPPicture* picture, WebPEncCSP colorspace) {
  return 1;
 }

+// Forwards to WebPPictureARGBToYUVADithered() with a dithering value of 0
+// and returns its result.
+int WebPPictureARGBToYUVA(WebPPicture* picture, WebPEncCSP colorspace) {
+  const float no_dithering = 0.f;
+  return WebPPictureARGBToYUVADithered(picture, colorspace, no_dithering);
+}
+
 //------------------------------------------------------------------------------
 // Helper: clean up fully transparent area to help compressibility.

@ -935,54 +1039,207 @@ void WebPCleanupTransparentArea(WebPPicture* pic) {
 #undef SIZE
 #undef SIZE2

+//------------------------------------------------------------------------------
+// Blend color and remove transparency info
+
+#define BLEND(V0, V1, ALPHA) \
+    ((((V0) * (255 - (ALPHA)) + (V1) * (ALPHA)) * 0x101) >> 16)
+#define BLEND_10BIT(V0, V1, ALPHA) \
+    ((((V0) * (1020 - (ALPHA)) + (V1) * (ALPHA)) * 0x101) >> 18)
+
+// Blends 'pic' in place against the color 'background_rgb' (red in bits
+// 16..23, green in bits 8..15, blue in bits 0..7). A NULL 'pic' is ignored.
+//  - YUV(A) picture: returns without changes unless the colorspace has
+//    WEBP_CSP_ALPHA_BIT set and an alpha plane is present. Otherwise the luma
+//    and chroma samples are blended and each alpha row is set to 0xff.
+//  - ARGB picture: every pixel whose alpha is not 0xff is replaced by
+//    MakeARGB32() of the blended color, or by the background when alpha is 0.
+void WebPBlendAlpha(WebPPicture* pic, uint32_t background_rgb) {
+  const int red = (background_rgb >> 16) & 0xff;
+  const int green = (background_rgb >> 8) & 0xff;
+  const int blue = (background_rgb >> 0) & 0xff;
+  VP8Random rg;
+  int x, y;
+  if (pic == NULL) return;
+  // 0.f is the value this file passes elsewhere to mean "no dithering".
+  VP8InitRandom(&rg, 0.f);
+  if (!pic->use_argb) {
+    const int uv_width = (pic->width >> 1);  // omit last pixel during u/v loop
+    // Background color converted once to Y/U/V.
+    const int Y0 = RGBToY(red, green, blue, &rg);
+    // VP8RGBToU/V expects the u/v values summed over four pixels
+    const int U0 = RGBToU(4 * red, 4 * green, 4 * blue, &rg);
+    const int V0 = RGBToV(4 * red, 4 * green, 4 * blue, &rg);
+    const int has_alpha = pic->colorspace & WEBP_CSP_ALPHA_BIT;
+    if (!has_alpha || pic->a == NULL) return;    // nothing to do
+    for (y = 0; y < pic->height; ++y) {
+      // Luma blending
+      uint8_t* const y_ptr = pic->y + y * pic->y_stride;
+      uint8_t* const a_ptr = pic->a + y * pic->a_stride;
+      for (x = 0; x < pic->width; ++x) {
+        const int alpha = a_ptr[x];
+        if (alpha < 0xff) {   // fully opaque samples are left untouched
+          y_ptr[x] = BLEND(Y0, y_ptr[x], a_ptr[x]);
+        }
+      }
+      // Chroma blending every even line
+      if ((y & 1) == 0) {
+        uint8_t* const u = pic->u + (y >> 1) * pic->uv_stride;
+        uint8_t* const v = pic->v + (y >> 1) * pic->uv_stride;
+        // Second alpha row of the 2x2 block; on the last row of an odd-height
+        // picture there is no next row, so the current one is used twice.
+        uint8_t* const a_ptr2 =
+            (y + 1 == pic->height) ? a_ptr : a_ptr + pic->a_stride;
+        for (x = 0; x < uv_width; ++x) {
+          // Average four alpha values into a single blending weight.
+          // The sum lies in [0, 1020], the range BLEND_10BIT works with.
+          // TODO(skal): might lead to visible contouring. Can we do better?
+          const int alpha =
+              a_ptr[2 * x + 0] + a_ptr[2 * x + 1] +
+              a_ptr2[2 * x + 0] + a_ptr2[2 * x + 1];
+          u[x] = BLEND_10BIT(U0, u[x], alpha);
+          v[x] = BLEND_10BIT(V0, v[x], alpha);
+        }
+        // Here x == uv_width. Only one alpha column is left, so the two
+        // available values are doubled to stay on the 0..1020 scale.
+        if (pic->width & 1) {   // rightmost pixel
+          const int alpha = 2 * (a_ptr[2 * x + 0] + a_ptr2[2 * x + 0]);
+          u[x] = BLEND_10BIT(U0, u[x], alpha);
+          v[x] = BLEND_10BIT(V0, v[x], alpha);
+        }
+      }
+      // Overwritten last: this row's alpha has been read by its own luma pass
+      // and by the chroma pass that covers it.
+      memset(a_ptr, 0xff, pic->width);
+    }
+  } else {
+    uint32_t* argb = pic->argb;
+    const uint32_t background = MakeARGB32(red, green, blue);
+    for (y = 0; y < pic->height; ++y) {
+      for (x = 0; x < pic->width; ++x) {
+        const int alpha = (argb[x] >> 24) & 0xff;
+        if (alpha != 0xff) {
+          if (alpha > 0) {
+            // Partially transparent: blend each channel with the background.
+            int r = (argb[x] >> 16) & 0xff;
+            int g = (argb[x] >>  8) & 0xff;
+            int b = (argb[x] >>  0) & 0xff;
+            r = BLEND(red, r, alpha);
+            g = BLEND(green, g, alpha);
+            b = BLEND(blue, b, alpha);
+            argb[x] = MakeARGB32(r, g, b);
+          } else {
+            // Fully transparent: the pixel becomes the background color.
+            argb[x] = background;
+          }
+        }
+      }
+      argb += pic->argb_stride;
+    }
+  }
+}
+
+#undef BLEND
+#undef BLEND_10BIT
+
+//------------------------------------------------------------------------------
+// local-min distortion
+//
+// For every pixel in the *reference* picture, we search for the local best
+// match in the compressed image. This is not a symmetrical measure.
+
+// search radius. Shouldn't be too large.
+#define RADIUS 2
+
+// For each of the w x h samples of 'ref', finds the smallest squared
+// difference against the 'src' samples lying within RADIUS rows and columns
+// of the same position (window clipped to the plane), and returns the sum of
+// those minima. Each minimum is capped at 255 * 255.
+static float AccumulateLSIM(const uint8_t* src, int src_stride,
+                            const uint8_t* ref, int ref_stride,
+                            int w, int h) {
+  double sum = 0.;
+  int row;
+  for (row = 0; row < h; ++row) {
+    const uint8_t* const ref_row = ref + row * ref_stride;
+    const int top = (row < RADIUS) ? 0 : row - RADIUS;
+    int bottom = row + RADIUS + 1;
+    int col;
+    if (bottom > h) bottom = h;
+    for (col = 0; col < w; ++col) {
+      const double target = (double)ref_row[col];
+      const int left = (col < RADIUS) ? 0 : col - RADIUS;
+      int right = col + RADIUS + 1;
+      double smallest = 255. * 255.;
+      int u, v;
+      if (right > w) right = w;
+      for (v = top; v < bottom; ++v) {
+        const uint8_t* const line = src + v * src_stride;
+        for (u = left; u < right; ++u) {
+          const double delta = line[u] - target;
+          const double squared = delta * delta;
+          if (squared < smallest) smallest = squared;
+        }
+      }
+      sum += smallest;
+    }
+  }
+  return (float)sum;
+}
+#undef RADIUS

 //------------------------------------------------------------------------------
 // Distortion

 // Max value returned in case of exact similarity.
 static const double kMinDistortion_dB = 99.;
+// Returns -4.3429448 * log(v / 255^2) for a strictly positive 'v', and
+// kMinDistortion_dB otherwise.
+static float GetPSNR(const double v) {
+  if (v > 0.) {
+    return (float)(-4.3429448 * log(v / (255 * 255.)));
+  }
+  return (float)kMinDistortion_dB;
+}

-int WebPPictureDistortion(const WebPPicture* pic1, const WebPPicture* pic2,
+int WebPPictureDistortion(const WebPPicture* src, const WebPPicture* ref,
                          int type, float result[5]) {
-  int c;
  DistoStats stats[5];
  int has_alpha;
+  int uv_w, uv_h;

-  if (pic1 == NULL || pic2 == NULL ||
-      pic1->width != pic2->width || pic1->height != pic2->height ||
-      pic1->y == NULL || pic2->y == NULL ||
-      pic1->u == NULL || pic2->u == NULL ||
-      pic1->v == NULL || pic2->v == NULL ||
+  if (src == NULL || ref == NULL ||
+      src->width != ref->width || src->height != ref->height ||
+      src->y == NULL || ref->y == NULL ||
+      src->u == NULL || ref->u == NULL ||
+      src->v == NULL || ref->v == NULL ||
      result == NULL) {
    return 0;
  }
  // TODO(skal): provide distortion for ARGB too.
-  if (pic1->use_argb == 1 || pic1->use_argb != pic2->use_argb) {
+  if (src->use_argb == 1 || src->use_argb != ref->use_argb) {
    return 0;
  }

-  has_alpha = !!(pic1->colorspace & WEBP_CSP_ALPHA_BIT);
-  if (has_alpha != !!(pic2->colorspace & WEBP_CSP_ALPHA_BIT) ||
-      (has_alpha && (pic1->a == NULL || pic2->a == NULL))) {
+  has_alpha = !!(src->colorspace & WEBP_CSP_ALPHA_BIT);
+  if (has_alpha != !!(ref->colorspace & WEBP_CSP_ALPHA_BIT) ||
+      (has_alpha && (src->a == NULL || ref->a == NULL))) {
    return 0;
  }

  memset(stats, 0, sizeof(stats));
-  VP8SSIMAccumulatePlane(pic1->y, pic1->y_stride,
-                         pic2->y, pic2->y_stride,
-                         pic1->width, pic1->height, &stats[0]);
-  VP8SSIMAccumulatePlane(pic1->u, pic1->uv_stride,
-                         pic2->u, pic2->uv_stride,
-                         (pic1->width + 1) >> 1, (pic1->height + 1) >> 1,
-                         &stats[1]);
-  VP8SSIMAccumulatePlane(pic1->v, pic1->uv_stride,
-                         pic2->v, pic2->uv_stride,
-                         (pic1->width + 1) >> 1, (pic1->height + 1) >> 1,
-                         &stats[2]);
+
+  uv_w = HALVE(src->width);
+  uv_h = HALVE(src->height);
+  if (type >= 2) {
+    float sse[4];
+    sse[0] = AccumulateLSIM(src->y, src->y_stride,
+                            ref->y, ref->y_stride, src->width, src->height);
+    sse[1] = AccumulateLSIM(src->u, src->uv_stride,
+                            ref->u, ref->uv_stride, uv_w, uv_h);
+    sse[2] = AccumulateLSIM(src->v, src->uv_stride,
+                            ref->v, ref->uv_stride, uv_w, uv_h);
+    sse[3] = has_alpha ? AccumulateLSIM(src->a, src->a_stride,
+                                        ref->a, ref->a_stride,
+                                        src->width, src->height)
+                       : 0.f;
+    result[0] = GetPSNR(sse[0] / (src->width * src->height));
+    result[1] = GetPSNR(sse[1] / (uv_w * uv_h));
+    result[2] = GetPSNR(sse[2] / (uv_w * uv_h));
+    result[3] = GetPSNR(sse[3] / (src->width * src->height));
+    {
+      double total_sse = sse[0] + sse[1] + sse[2];
+      int total_pixels = src->width * src->height + 2 * uv_w * uv_h;
      if (has_alpha) {
-    VP8SSIMAccumulatePlane(pic1->a, pic1->a_stride,
-                           pic2->a, pic2->a_stride,
-                           pic1->width, pic1->height, &stats[3]);
+        total_pixels += src->width * src->height;
+        total_sse += sse[3];
+      }
+      result[4] = GetPSNR(total_sse / total_pixels);
+    }
+  } else {
+    int c;
+    VP8SSIMAccumulatePlane(src->y, src->y_stride,
+                           ref->y, ref->y_stride,
+                           src->width, src->height, &stats[0]);
+    VP8SSIMAccumulatePlane(src->u, src->uv_stride,
+                           ref->u, ref->uv_stride,
+                           uv_w, uv_h, &stats[1]);
+    VP8SSIMAccumulatePlane(src->v, src->uv_stride,
+                           ref->v, ref->uv_stride,
+                           uv_w, uv_h, &stats[2]);
+    if (has_alpha) {
+      VP8SSIMAccumulatePlane(src->a, src->a_stride,
+                             ref->a, ref->a_stride,
+                             src->width, src->height, &stats[3]);
    }
    for (c = 0; c <= 4; ++c) {
      if (type == 1) {
@ -991,12 +1248,12 @@ int WebPPictureDistortion(const WebPPicture* pic1, const WebPPicture* pic2,
                                     : kMinDistortion_dB);
      } else {
        const double v = VP8SSIMGetSquaredError(&stats[c]);
-      result[c] = (float)((v > 0.) ? -4.3429448 * log(v / (255 * 255.))
-                                   : kMinDistortion_dB);
+        result[c] = GetPSNR(v);
      }
      // Accumulate forward
      if (c < 4) VP8SSIMAddStats(&stats[c], &stats[4]);
    }
+  }
  return 1;
 }

@ -1043,10 +1300,10 @@ size_t NAME(const uint8_t* in, int w, int h, int bps, float q,          \
  return Encode(in, w, h, bps, IMPORTER, q, 0, out);                    \
 }

-ENCODE_FUNC(WebPEncodeRGB, WebPPictureImportRGB);
-ENCODE_FUNC(WebPEncodeBGR, WebPPictureImportBGR);
-ENCODE_FUNC(WebPEncodeRGBA, WebPPictureImportRGBA);
-ENCODE_FUNC(WebPEncodeBGRA, WebPPictureImportBGRA);
+ENCODE_FUNC(WebPEncodeRGB, WebPPictureImportRGB)
+ENCODE_FUNC(WebPEncodeBGR, WebPPictureImportBGR)
+ENCODE_FUNC(WebPEncodeRGBA, WebPPictureImportRGBA)
+ENCODE_FUNC(WebPEncodeBGRA, WebPPictureImportBGRA)

 #undef ENCODE_FUNC

@ -1056,15 +1313,12 @@ size_t NAME(const uint8_t* in, int w, int h, int bps, uint8_t** out) {       \
  return Encode(in, w, h, bps, IMPORTER, LOSSLESS_DEFAULT_QUALITY, 1, out);  \
 }

-LOSSLESS_ENCODE_FUNC(WebPEncodeLosslessRGB, WebPPictureImportRGB);
-LOSSLESS_ENCODE_FUNC(WebPEncodeLosslessBGR, WebPPictureImportBGR);
-LOSSLESS_ENCODE_FUNC(WebPEncodeLosslessRGBA, WebPPictureImportRGBA);
-LOSSLESS_ENCODE_FUNC(WebPEncodeLosslessBGRA, WebPPictureImportBGRA);
+LOSSLESS_ENCODE_FUNC(WebPEncodeLosslessRGB, WebPPictureImportRGB)
+LOSSLESS_ENCODE_FUNC(WebPEncodeLosslessBGR, WebPPictureImportBGR)
+LOSSLESS_ENCODE_FUNC(WebPEncodeLosslessRGBA, WebPPictureImportRGBA)
+LOSSLESS_ENCODE_FUNC(WebPEncodeLosslessBGRA, WebPPictureImportBGRA)

 #undef LOSSLESS_ENCODE_FUNC

 //------------------------------------------------------------------------------

-#if defined(__cplusplus) || defined(c_plusplus)
-}    // extern "C"
-#endif
--- a/src/enc/quant.c
+++ b/src/enc/quant.c
@ -1,8 +1,10 @@
 // Copyright 2011 Google Inc. All Rights Reserved.
 //
-// This code is licensed under the same terms as WebM:
-//  Software License Agreement:  http://www.webmproject.org/license/software/
-//  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
 // -----------------------------------------------------------------------------
 //
 //   Quantization
@ -11,6 +13,7 @@

 #include <assert.h>
 #include <math.h>
+#include <stdlib.h>  // for abs()

 #include "./vp8enci.h"
 #include "./cost.h"
@ -22,16 +25,78 @@

 #define MID_ALPHA 64      // neutral value for susceptibility
 #define MIN_ALPHA 30      // lowest usable value for susceptibility
-#define MAX_ALPHA 100     // higher meaninful value for susceptibility
+#define MAX_ALPHA 100     // higher meaningful value for susceptibility

 #define SNS_TO_DQ 0.9     // Scaling constant between the sns value and the QP
                          // power-law modulation. Must be strictly less than 1.

+#define I4_PENALTY 4000   // Rate-penalty for quick i4/i16 decision
+
+// number of non-zero coeffs below which we consider the block very flat
+// (and apply a penalty to complex predictions)
+#define FLATNESS_LIMIT_I16 10      // I16 mode
+#define FLATNESS_LIMIT_I4  3       // I4 mode
+#define FLATNESS_LIMIT_UV  2       // UV mode
+#define FLATNESS_PENALTY   140     // roughly ~1bit per block
+
 #define MULT_8B(a, b) (((a) * (b) + 128) >> 8)

-#if defined(__cplusplus) || defined(c_plusplus)
-extern "C" {
-#endif
+// #define DEBUG_BLOCK
+
+//------------------------------------------------------------------------------
+
+#if defined(DEBUG_BLOCK)
+
+#include <stdio.h>
+#include <stdlib.h>
+
+// Debug helper, compiled only when DEBUG_BLOCK is defined. Prints to stdout:
+//  - 24 rows of 16 samples from it->yuv_in_ and it->yuv_out_ side by side,
+//    together with their absolute differences,
+//  - the D / SD / R / H / nz / score fields of 'rd',
+//  - the prediction mode(s) and the level arrays stored in 'rd'.
+static void PrintBlockInfo(const VP8EncIterator* const it,
+                           const VP8ModeScore* const rd) {
+  int i, j;
+  const int is_i16 = (it->mb_->type_ == 1);
+  printf("SOURCE / OUTPUT / ABS DELTA\n");
+  for (j = 0; j < 24; ++j) {
+    if (j == 16) printf("\n");   // newline before the U/V block
+    for (i = 0; i < 16; ++i) printf("%3d ", it->yuv_in_[i + j * BPS]);
+    printf("     ");
+    for (i = 0; i < 16; ++i) printf("%3d ", it->yuv_out_[i + j * BPS]);
+    printf("     ");
+    for (i = 0; i < 16; ++i) {
+      printf("%1d ", abs(it->yuv_out_[i + j * BPS] - it->yuv_in_[i + j * BPS]));
+    }
+    printf("\n");
+  }
+  printf("\nD:%d SD:%d R:%d H:%d nz:0x%x score:%d\n",
+    (int)rd->D, (int)rd->SD, (int)rd->R, (int)rd->H, (int)rd->nz,
+    (int)rd->score);
+  if (is_i16) {
+    // 16x16 mode: a single mode plus the separate DC levels.
+    printf("Mode: %d\n", rd->mode_i16);
+    printf("y_dc_levels:");
+    for (i = 0; i < 16; ++i) printf("%3d ", rd->y_dc_levels[i]);
+    printf("\n");
+  } else {
+    // Otherwise: one mode per sub-block, 16 in total.
+    printf("Modes[16]: ");
+    for (i = 0; i < 16; ++i) printf("%d ", rd->modes_i4[i]);
+    printf("\n");
+  }
+  printf("y_ac_levels:\n");
+  for (j = 0; j < 16; ++j) {
+    // In 16x16 mode index 0 is skipped, presumably because the DC levels
+    // were already printed from y_dc_levels above.
+    for (i = is_i16 ? 1 : 0; i < 16; ++i) {
+      printf("%4d ", rd->y_ac_levels[j][i]);
+    }
+    printf("\n");
+  }
+  printf("\n");
+  printf("uv_levels (mode=%d):\n", rd->mode_uv);
+  for (j = 0; j < 8; ++j) {
+    for (i = 0; i < 16; ++i) {
+      printf("%4d ", rd->uv_levels[j][i]);
+    }
+    printf("\n");
+  }
+}
+
+#endif   // DEBUG_BLOCK

 //------------------------------------------------------------------------------

@ -100,31 +165,13 @@ static const uint16_t kAcTable2[128] = {
  385, 393, 401, 409, 416, 424, 432, 440
 };

-static const uint16_t kCoeffThresh[16] = {
-  0,  10, 20, 30,
-  10, 20, 30, 30,
-  20, 30, 30, 30,
-  30, 30, 30, 30
+static const uint8_t kBiasMatrices[3][2] = {  // [luma-ac,luma-dc,chroma][dc,ac]
+  { 96, 110 }, { 96, 108 }, { 110, 115 }
 };

-// TODO(skal): tune more. Coeff thresholding?
-static const uint8_t kBiasMatrices[3][16] = {  // [3] = [luma-ac,luma-dc,chroma]
-  { 96, 96, 96, 96,
-    96, 96, 96, 96,
-    96, 96, 96, 96,
-    96, 96, 96, 96 },
-  { 96, 96, 96, 96,
-    96, 96, 96, 96,
-    96, 96, 96, 96,
-    96, 96, 96, 96 },
-  { 96, 96, 96, 96,
-    96, 96, 96, 96,
-    96, 96, 96, 96,
-    96, 96, 96, 96 }
-};
-
-// Sharpening by (slightly) raising the hi-frequency coeffs (only for trellis).
+// Sharpening by (slightly) raising the hi-frequency coeffs.
 // Hack-ish but helpful for mid-bitrate range. Use with care.
+#define SHARPEN_BITS 11  // number of descaling bits for sharpening bias
 static const uint8_t kFreqSharpening[16] = {
  0,  30, 60, 90,
  30, 60, 90, 90,
@ -137,20 +184,30 @@ static const uint8_t kFreqSharpening[16] = {

 // Returns the average quantizer
 static int ExpandMatrix(VP8Matrix* const m, int type) {
-  int i;
-  int sum = 0;
+  int i, sum;
+  for (i = 0; i < 2; ++i) {
+    const int is_ac_coeff = (i > 0);
+    const int bias = kBiasMatrices[type][is_ac_coeff];
+    m->iq_[i] = (1 << QFIX) / m->q_[i];
+    m->bias_[i] = BIAS(bias);
+    // zthresh_ is the exact value such that QUANTDIV(coeff, iQ, B) is:
+    //   * zero if coeff <= zthresh
+    //   * non-zero if coeff > zthresh
+    m->zthresh_[i] = ((1 << QFIX) - 1 - m->bias_[i]) / m->iq_[i];
+  }
  for (i = 2; i < 16; ++i) {
    m->q_[i] = m->q_[1];
+    m->iq_[i] = m->iq_[1];
+    m->bias_[i] = m->bias_[1];
+    m->zthresh_[i] = m->zthresh_[1];
  }
-  for (i = 0; i < 16; ++i) {
-    const int j = kZigzag[i];
-    const int bias = kBiasMatrices[type][j];
-    m->iq_[j] = (1 << QFIX) / m->q_[j];
-    m->bias_[j] = BIAS(bias);
-    // TODO(skal): tune kCoeffThresh[]
-    m->zthresh_[j] = ((256 /*+ kCoeffThresh[j]*/ - bias) * m->q_[j] + 127) >> 8;
-    m->sharpen_[j] = (kFreqSharpening[j] * m->q_[j]) >> 11;
-    sum += m->q_[j];
+  for (sum = 0, i = 0; i < 16; ++i) {
+    if (type == 0) {  // we only use sharpening for AC luma coeffs
+      m->sharpen_[i] = (kFreqSharpening[i] * m->q_[i]) >> SHARPEN_BITS;
+    } else {
+      m->sharpen_[i] = 0;
+    }
+    sum += m->q_[i];
  }
  return (sum + 8) >> 4;
 }
@ -178,8 +235,6 @@ static void SetupMatrices(VP8Encoder* enc) {
    q16 = ExpandMatrix(&m->y2_, 1);
    quv = ExpandMatrix(&m->uv_, 2);

-    // TODO: Switch to kLambda*[] tables?
-    {
    m->lambda_i4_          = (3 * q4 * q4) >> 7;
    m->lambda_i16_         = (3 * q16 * q16);
    m->lambda_uv_          = (3 * quv * quv) >> 6;
@ -188,7 +243,9 @@ static void SetupMatrices(VP8Encoder* enc) {
    m->lambda_trellis_i16_ = (q16 * q16) >> 2;
    m->lambda_trellis_uv_  = (quv *quv) << 1;
    m->tlambda_            = (tlambda_scale * q4) >> 5;
-    }
+
+    m->min_disto_ = 10 * m->y1_.q_[0];   // quantization-aware min disto
+    m->max_edge_  = 0;
  }
 }

@ -197,16 +254,21 @@ static void SetupMatrices(VP8Encoder* enc) {

 // Very small filter-strength values have close to no visual effect. So we can
 // save a little decoding-CPU by turning filtering off for these.
-#define FSTRENGTH_CUTOFF 3
+#define FSTRENGTH_CUTOFF 2

 static void SetupFilterStrength(VP8Encoder* const enc) {
  int i;
-  const int level0 = enc->config_->filter_strength;
+  // level0 is in [0..500]. Using '-f 50' as filter_strength is mid-filtering.
+  const int level0 = 5 * enc->config_->filter_strength;
  for (i = 0; i < NUM_MB_SEGMENTS; ++i) {
-    // Segments with lower quantizer will be less filtered. TODO: tune (wrt SNS)
-    const int level = level0 * 256 * enc->dqm_[i].quant_ / 128;
-    const int f = level / (256 + enc->dqm_[i].beta_);
-    enc->dqm_[i].fstrength_ = (f < FSTRENGTH_CUTOFF) ? 0 : (f > 63) ? 63 : f;
+    VP8SegmentInfo* const m = &enc->dqm_[i];
+    // We focus on the quantization of AC coeffs.
+    const int qstep = kAcTable[clip(m->quant_, 0, 127)] >> 2;
+    const int base_strength =
+        VP8FilterStrengthFromDelta(enc->filter_hdr_.sharpness_, qstep);
+    // Segments with lower complexity ('beta') will be less filtered.
+    const int f = base_strength * level0 / (256 + m->beta_);
+    m->fstrength_ = (f < FSTRENGTH_CUTOFF) ? 0 : (f > 63) ? 63 : f;
  }
  // We record the initial strength (mainly for the case of 1-segment only).
  enc->filter_hdr_.level_ = enc->dqm_[0].fstrength_;
@ -224,28 +286,90 @@ static void SetupFilterStrength(VP8Encoder* const enc) {
 // We want to emulate jpeg-like behaviour where the expected "good" quality
 // is around q=75. Internally, our "good" middle is around c=50. So we
 // map accordingly using linear piece-wise function
-static double QualityToCompression(double q) {
-  const double c = q / 100.;
-  return (c < 0.75) ? c * (2. / 3.) : 2. * c - 1.;
+static double QualityToCompression(double c) {
+  const double linear_c = (c < 0.75) ? c * (2. / 3.) : 2. * c - 1.;
+  // The file size roughly scales as pow(quantizer, 3.). Actually, the
+  // exponent is somewhere between 2.8 and 3.2, but we're mostly interested
+  // in the mid-quant range. So we scale the compressibility inversely to
+  // this power-law: quant ~= compression ^ 1/3. This law holds well for
+  // low quant. Finer modeling for high-quant would make use of kAcTable[]
+  // more explicitly.
+  const double v = pow(linear_c, 1 / 3.);
+  return v;
+}
+
+static double QualityToJPEGCompression(double c, double alpha) {
+  // Maps the quality setting 'c' and the complexity 'alpha' to a compression
+  // factor: returns pow(c, expn), where the exponent 'expn' depends on 'alpha'
+  // and was empirically matched to the compression curve of libjpeg6b, so
+  // that the output size is roughly that of a JPEG with same quality factor.
+  const double amin = 0.30;     // 'alpha' below this uses exp_max
+  const double amax = 0.85;     // 'alpha' above this uses exp_min
+  const double exp_min = 0.4;
+  const double exp_max = 0.9;
+  const double slope = (exp_min - exp_max) / (amax - amin);   // negative
+  // Linearly interpolate 'expn' from exp_max (at amin) down to exp_min
+  // (at amax); outside the [amin, amax] range the exponent is clamped.
+  const double expn = (alpha > amax) ? exp_min
+                    : (alpha < amin) ? exp_max
+                    : exp_max + slope * (alpha - amin);
+  const double v = pow(c, expn);
+  return v;
+}
+
+static int SegmentsAreEquivalent(const VP8SegmentInfo* const S1,
+                                 const VP8SegmentInfo* const S2) {
+  return (S1->quant_ == S2->quant_) && (S1->fstrength_ == S2->fstrength_);  // 1 iff both fields match
+}
+
+static void SimplifySegments(VP8Encoder* const enc) {  // merge duplicate segments
+  int map[NUM_MB_SEGMENTS] = { 0, 1, 2, 3 };   // old segment id -> new id
+  const int num_segments = enc->segment_hdr_.num_segments_;
+  int num_final_segments = 1;   // segment #0 is always kept
+  int s1, s2;
+  for (s1 = 1; s1 < num_segments; ++s1) {    // find similar segments
+    const VP8SegmentInfo* const S1 = &enc->dqm_[s1];
+    int found = 0;
+    // check if an equivalent segment is already among the kept ones
+    for (s2 = 0; s2 < num_final_segments; ++s2) {
+      const VP8SegmentInfo* const S2 = &enc->dqm_[s2];
+      if (SegmentsAreEquivalent(S1, S2)) {
+        found = 1;
+        break;
+      }
+    }
+    map[s1] = s2;   // index of the match, or num_final_segments if none
+    if (!found) {
+      if (num_final_segments != s1) {   // compact: move the new segment down
+        enc->dqm_[num_final_segments] = enc->dqm_[s1];
+      }
+      ++num_final_segments;
+    }
+  }
+  if (num_final_segments < num_segments) {  // Remap the macroblocks' segment ids
+    int i = enc->mb_w_ * enc->mb_h_;
+    while (i-- > 0) enc->mb_info_[i].segment_ = map[enc->mb_info_[i].segment_];
+    enc->segment_hdr_.num_segments_ = num_final_segments;
+    // Replicate the last kept segment info into the now-unused trailing slots
+    for (i = num_final_segments; i < num_segments; ++i) {
+      enc->dqm_[i] = enc->dqm_[num_final_segments - 1];
+    }
+  }
+}

 void VP8SetSegmentParams(VP8Encoder* const enc, float quality) {
  int i;
  int dq_uv_ac, dq_uv_dc;
-  const int num_segments = enc->config_->segments;
+  const int num_segments = enc->segment_hdr_.num_segments_;
  const double amp = SNS_TO_DQ * enc->config_->sns_strength / 100. / 128.;
-  const double c_base = QualityToCompression(quality);
+  const double Q = quality / 100.;
+  const double c_base = enc->config_->emulate_jpeg_size ?
+      QualityToJPEGCompression(Q, enc->alpha_ / 255.) :
+      QualityToCompression(Q);
  for (i = 0; i < num_segments; ++i) {
-    // The file size roughly scales as pow(quantizer, 3.). Actually, the
-    // exponent is somewhere between 2.8 and 3.2, but we're mostly interested
-    // in the mid-quant range. So we scale the compressibility inversely to
-    // this power-law: quant ~= compression ^ 1/3. This law holds well for
-    // low quant. Finer modelling for high-quant would make use of kAcTable[]
-    // more explicitely.
-    // Additionally, we modulate the base exponent 1/3 to accommodate for the
-    // quantization susceptibility and allow denser segments to be quantized
-    // more.
-    const double expn = (1. - amp * enc->dqm_[i].alpha_) / 3.;
+    // We modulate the base coefficient to accommodate for the quantization
+    // susceptibility and allow denser segments to be quantized more.
+    const double expn = 1. - amp * enc->dqm_[i].alpha_;
    const double c = pow(c_base, expn);
    const int q = (int)(127. * (1. - c));
    assert(expn > 0.);
@ -281,9 +405,11 @@ void VP8SetSegmentParams(VP8Encoder* const enc, float quality) {
  enc->dq_uv_dc_ = dq_uv_dc;
  enc->dq_uv_ac_ = dq_uv_ac;

-  SetupMatrices(enc);
-
  SetupFilterStrength(enc);   // initialize segments' filtering, eventually
+
+  if (num_segments > 1) SimplifySegments(enc);
+
+  SetupMatrices(enc);         // finalize quantization matrices
 }

 //------------------------------------------------------------------------------
@ -299,16 +425,14 @@ const int VP8I4ModeOffsets[NUM_BMODES] = {
 };

 void VP8MakeLuma16Preds(const VP8EncIterator* const it) {
-  const VP8Encoder* const enc = it->enc_;
-  const uint8_t* const left = it->x_ ? enc->y_left_ : NULL;
-  const uint8_t* const top = it->y_ ? enc->y_top_ + it->x_ * 16 : NULL;
+  const uint8_t* const left = it->x_ ? it->y_left_ : NULL;
+  const uint8_t* const top = it->y_ ? it->y_top_ : NULL;
  VP8EncPredLuma16(it->yuv_p_, left, top);
 }

 void VP8MakeChroma8Preds(const VP8EncIterator* const it) {
-  const VP8Encoder* const enc = it->enc_;
-  const uint8_t* const left = it->x_ ? enc->u_left_ : NULL;
-  const uint8_t* const top = it->y_ ? enc->uv_top_ + it->x_ * 16 : NULL;
+  const uint8_t* const left = it->x_ ? it->u_left_ : NULL;
+  const uint8_t* const top = it->y_ ? it->uv_top_ : NULL;
  VP8EncPredChroma8(it->yuv_p_, left, top);
 }

@ -364,6 +488,7 @@ static void InitScore(VP8ModeScore* const rd) {
  rd->D  = 0;
  rd->SD = 0;
  rd->R  = 0;
+  rd->H  = 0;
  rd->nz = 0;
  rd->score = MAX_COST;
 }
@ -372,6 +497,7 @@ static void CopyScore(VP8ModeScore* const dst, const VP8ModeScore* const src) {
  dst->D  = src->D;
  dst->SD = src->SD;
  dst->R  = src->R;
+  dst->H  = src->H;
  dst->nz = src->nz;      // note that nz is not accumulated, but just copied.
  dst->score = src->score;
 }
@ -380,6 +506,7 @@ static void AddScore(VP8ModeScore* const dst, const VP8ModeScore* const src) {
  dst->D  += src->D;
  dst->SD += src->SD;
  dst->R  += src->R;
+  dst->H  += src->H;
  dst->nz |= src->nz;     // here, new nz bits are accumulated.
  dst->score += src->score;
 }
@ -408,7 +535,7 @@ typedef struct {

 static WEBP_INLINE void SetRDScore(int lambda, VP8ModeScore* const rd) {
  // TODO: incorporate the "* 256" in the tables?
-  rd->score = rd->R * lambda + 256 * (rd->D + rd->SD);
+  rd->score = (rd->R + rd->H) * lambda + 256 * (rd->D + rd->SD);
 }

 static WEBP_INLINE score_t RDScoreTrellis(int lambda, score_t rate,
@ -471,11 +598,10 @@ static int TrellisQuantizeBlock(const VP8EncIterator* const it,
    // note: it's important to take sign of the _original_ coeff,
    // so we don't have to consider level < 0 afterward.
    const int sign = (in[j] < 0);
-    int coeff0 = (sign ? -in[j] : in[j]) + mtx->sharpen_[j];
-    int level0;
-    if (coeff0 > 2047) coeff0 = 2047;
+    const int coeff0 = (sign ? -in[j] : in[j]) + mtx->sharpen_[j];
+    int level0 = QUANTDIV(coeff0, iQ, B);
+    if (level0 > MAX_LEVEL) level0 = MAX_LEVEL;

-    level0 = QUANTDIV(coeff0, iQ, B);
    // test all alternate level values around level0.
    for (m = -MIN_DELTA; m <= MAX_DELTA; ++m) {
      Node* const cur = &NODE(n, m);
@ -487,7 +613,7 @@ static int TrellisQuantizeBlock(const VP8EncIterator* const it,
      cur->sign = sign;
      cur->level = level;
      cur->ctx = (level == 0) ? 0 : (level == 1) ? 1 : 2;
-      if (level >= 2048 || level < 0) {   // node is dead?
+      if (level > MAX_LEVEL || level < 0) {   // node is dead?
        cur->cost = MAX_COST;
        continue;
      }
@ -580,10 +706,10 @@ static int ReconstructIntra16(VP8EncIterator* const it,
                              VP8ModeScore* const rd,
                              uint8_t* const yuv_out,
                              int mode) {
-  const VP8Encoder* const enc = it->enc_;
+  VP8Encoder* const enc = it->enc_;
  const uint8_t* const ref = it->yuv_p_ + VP8I16ModeOffsets[mode];
  const uint8_t* const src = it->yuv_in_ + Y_OFF;
-  const VP8SegmentInfo* const dqm = &enc->dqm_[it->mb_->segment_];
+  VP8SegmentInfo* const dqm = &enc->dqm_[it->mb_->segment_];
  int nz = 0;
  int n;
  int16_t tmp[16][16], dc_tmp[16];
@ -592,7 +718,7 @@ static int ReconstructIntra16(VP8EncIterator* const it,
    VP8FTransform(src + VP8Scan[n], ref + VP8Scan[n], tmp[n]);
  }
  VP8FTransformWHT(tmp[0], dc_tmp);
-  nz |= VP8EncQuantizeBlock(dc_tmp, rd->y_dc_levels, 0, &dqm->y2_) << 24;
+  nz |= VP8EncQuantizeBlockWHT(dc_tmp, rd->y_dc_levels, &dqm->y2_) << 24;

  if (DO_TRELLIS_I16 && it->do_trellis_) {
    int x, y;
@ -687,7 +813,18 @@ static int ReconstructUV(VP8EncIterator* const it, VP8ModeScore* const rd,

 //------------------------------------------------------------------------------
 // RD-opt decision. Reconstruct each modes, evalue distortion and bit-cost.
-// Pick the mode is lower RD-cost = Rate + lamba * Distortion.
+// Pick the mode with the lowest RD-cost = lambda * Rate + Distortion.
+
+static void StoreMaxDelta(VP8SegmentInfo* const dqm, const int16_t DCs[16]) {
+  // We look at three low-frequency AC coefficients (indices 1, 4 and 5) and
+  // record the largest magnitude seen so far in the segment's 'max_edge_'.
+  const int v0 = abs(DCs[1]);
+  const int v1 = abs(DCs[4]);
+  const int v2 = abs(DCs[5]);
+  int max_v = (v1 > v0) ? v1 : v0;   // larger of the first two magnitudes
+  max_v = (v2 > max_v) ? v2 : max_v;
+  if (max_v > dqm->max_edge_) dqm->max_edge_ = max_v;
+}

 static void SwapPtr(uint8_t** a, uint8_t** b) {
  uint8_t* const tmp = *a;
@ -699,9 +836,23 @@ static void SwapOut(VP8EncIterator* const it) {
  SwapPtr(&it->yuv_out_, &it->yuv_out2_);
 }

+static score_t IsFlat(const int16_t* levels, int num_blocks, score_t thresh) {
+  score_t score = 0;              // number of non-zero AC levels seen so far
+  while (num_blocks-- > 0) {      // TODO(skal): refine positional scoring?
+    int i;
+    for (i = 1; i < 16; ++i) {    // omit DC, we're only interested in AC
+      score += (levels[i] != 0);
+      if (score > thresh) return 0;   // too many non-zero levels: not flat
+    }
+    levels += 16;                 // advance to the next block of 16 levels
+  }
+  return 1;                       // at most 'thresh' non-zero AC levels
+}
+
 static void PickBestIntra16(VP8EncIterator* const it, VP8ModeScore* const rd) {
-  const VP8Encoder* const enc = it->enc_;
-  const VP8SegmentInfo* const dqm = &enc->dqm_[it->mb_->segment_];
+  const int kNumBlocks = 16;
+  VP8Encoder* const enc = it->enc_;
+  VP8SegmentInfo* const dqm = &enc->dqm_[it->mb_->segment_];
  const int lambda = dqm->lambda_i16_;
  const int tlambda = dqm->tlambda_;
  const uint8_t* const src = it->yuv_in_ + Y_OFF;
@ -709,7 +860,7 @@ static void PickBestIntra16(VP8EncIterator* const it, VP8ModeScore* const rd) {
  int mode;

  rd->mode_i16 = -1;
-  for (mode = 0; mode < 4; ++mode) {
+  for (mode = 0; mode < NUM_PRED_MODES; ++mode) {
    uint8_t* const tmp_dst = it->yuv_out2_ + Y_OFF;  // scratch buffer
    int nz;

@ -720,8 +871,13 @@ static void PickBestIntra16(VP8EncIterator* const it, VP8ModeScore* const rd) {
    rd16.D = VP8SSE16x16(src, tmp_dst);
    rd16.SD = tlambda ? MULT_8B(tlambda, VP8TDisto16x16(src, tmp_dst, kWeightY))
            : 0;
+    rd16.H = VP8FixedCostsI16[mode];
    rd16.R = VP8GetCostLuma16(it, &rd16);
-    rd16.R += VP8FixedCostsI16[mode];
+    if (mode > 0 &&
+        IsFlat(rd16.y_ac_levels[0], kNumBlocks, FLATNESS_LIMIT_I16)) {
+      // penalty to avoid flat area to be mispredicted by complex mode
+      rd16.R += FLATNESS_PENALTY * kNumBlocks;
+    }

    // Since we always examine Intra16 first, we can overwrite *rd directly.
    SetRDScore(lambda, &rd16);
@ -736,6 +892,13 @@ static void PickBestIntra16(VP8EncIterator* const it, VP8ModeScore* const rd) {
  }
  SetRDScore(dqm->lambda_mode_, rd);   // finalize score for mode decision.
  VP8SetIntra16Mode(it, rd->mode_i16);
+
+  // we have a blocky macroblock (only DCs are non-zero) with fairly high
+  // distortion, record max delta so we can later adjust the minimal filtering
+  // strength needed to smooth these blocks out.
+  if ((rd->nz & 0xffff) == 0 && rd->D > dqm->min_disto_) {
+    StoreMaxDelta(dqm, rd->y_dc_levels);
+  }
 }

 //------------------------------------------------------------------------------
@ -765,9 +928,11 @@ static int PickBestIntra4(VP8EncIterator* const it, VP8ModeScore* const rd) {
  }

  InitScore(&rd_best);
-  rd_best.score = 211;  // '211' is the value of VP8BitCost(0, 145)
+  rd_best.H = 211;  // '211' is the value of VP8BitCost(0, 145)
+  SetRDScore(dqm->lambda_mode_, &rd_best);
  VP8IteratorStartI4(it);
  do {
+    const int kNumBlocks = 1;
    VP8ModeScore rd_i4;
    int mode;
    int best_mode = -1;
@ -791,8 +956,11 @@ static int PickBestIntra4(VP8EncIterator* const it, VP8ModeScore* const rd) {
      rd_tmp.SD =
          tlambda ? MULT_8B(tlambda, VP8TDisto4x4(src, tmp_dst, kWeightY))
                  : 0;
+      rd_tmp.H = mode_costs[mode];
      rd_tmp.R = VP8GetCostLuma4(it, tmp_levels);
-      rd_tmp.R += mode_costs[mode];
+      if (mode > 0 && IsFlat(tmp_levels, kNumBlocks, FLATNESS_LIMIT_I4)) {
+        rd_tmp.R += FLATNESS_PENALTY * kNumBlocks;
+      }

      SetRDScore(lambda, &rd_tmp);
      if (best_mode < 0 || rd_tmp.score < rd_i4.score) {
@ -804,14 +972,17 @@ static int PickBestIntra4(VP8EncIterator* const it, VP8ModeScore* const rd) {
    }
    SetRDScore(dqm->lambda_mode_, &rd_i4);
    AddScore(&rd_best, &rd_i4);
-    total_header_bits += mode_costs[best_mode];
-    if (rd_best.score >= rd->score ||
-        total_header_bits > enc->max_i4_header_bits_) {
+    if (rd_best.score >= rd->score) {
+      return 0;
+    }
+    total_header_bits += (int)rd_i4.H;   // <- equal to mode_costs[best_mode];
+    if (total_header_bits > enc->max_i4_header_bits_) {
      return 0;
    }
    // Copy selected samples if not in the right place already.
-    if (best_block != best_blocks + VP8Scan[it->i4_])
+    if (best_block != best_blocks + VP8Scan[it->i4_]) {
      VP8Copy4x4(best_block, best_blocks + VP8Scan[it->i4_]);
+    }
    rd->modes_i4[it->i4_] = best_mode;
    it->top_nz_[it->i4_ & 3] = it->left_nz_[it->i4_ >> 2] = (rd_i4.nz ? 1 : 0);
  } while (VP8IteratorRotateI4(it, best_blocks));
@ -827,6 +998,7 @@ static int PickBestIntra4(VP8EncIterator* const it, VP8ModeScore* const rd) {
 //------------------------------------------------------------------------------

 static void PickBestUV(VP8EncIterator* const it, VP8ModeScore* const rd) {
+  const int kNumBlocks = 8;
  const VP8Encoder* const enc = it->enc_;
  const VP8SegmentInfo* const dqm = &enc->dqm_[it->mb_->segment_];
  const int lambda = dqm->lambda_uv_;
@ -838,7 +1010,7 @@ static void PickBestUV(VP8EncIterator* const it, VP8ModeScore* const rd) {

  rd->mode_uv = -1;
  InitScore(&rd_best);
-  for (mode = 0; mode < 4; ++mode) {
+  for (mode = 0; mode < NUM_PRED_MODES; ++mode) {
    VP8ModeScore rd_uv;

    // Reconstruct
@ -847,8 +1019,11 @@ static void PickBestUV(VP8EncIterator* const it, VP8ModeScore* const rd) {
    // Compute RD-score
    rd_uv.D  = VP8SSE16x8(src, tmp_dst);
    rd_uv.SD = 0;    // TODO: should we call TDisto? it tends to flatten areas.
+    rd_uv.H  = VP8FixedCostsUV[mode];
    rd_uv.R  = VP8GetCostUV(it, &rd_uv);
-    rd_uv.R += VP8FixedCostsUV[mode];
+    if (mode > 0 && IsFlat(rd_uv.uv_levels[0], kNumBlocks, FLATNESS_LIMIT_UV)) {
+      rd_uv.R += FLATNESS_PENALTY * kNumBlocks;
+    }

    SetRDScore(lambda, &rd_uv);
    if (mode == 0 || rd_uv.score < rd_best.score) {
@ -867,10 +1042,10 @@ static void PickBestUV(VP8EncIterator* const it, VP8ModeScore* const rd) {

 static void SimpleQuantize(VP8EncIterator* const it, VP8ModeScore* const rd) {
  const VP8Encoder* const enc = it->enc_;
-  const int i16 = (it->mb_->type_ == 1);
+  const int is_i16 = (it->mb_->type_ == 1);
  int nz = 0;

-  if (i16) {
+  if (is_i16) {
    nz = ReconstructIntra16(it, rd, it->yuv_out_ + Y_OFF, it->preds_[0]);
  } else {
    VP8IteratorStartI4(it);
@ -889,11 +1064,66 @@ static void SimpleQuantize(VP8EncIterator* const it, VP8ModeScore* const rd) {
  rd->nz = nz;
 }

+// Refine intra16/intra4 sub-modes based on distortion (SSE) only, not rate.
+static void DistoRefine(VP8EncIterator* const it, int try_both_i4_i16) {
+  const int is_i16 = (it->mb_->type_ == 1);
+  score_t best_score = MAX_COST;   // stays at MAX_COST if i16 is not searched
+
+  if (try_both_i4_i16 || is_i16) {   // intra16: keep the mode with lowest SSE
+    int mode;
+    int best_mode = -1;
+    for (mode = 0; mode < NUM_PRED_MODES; ++mode) {
+      const uint8_t* const ref = it->yuv_p_ + VP8I16ModeOffsets[mode];
+      const uint8_t* const src = it->yuv_in_ + Y_OFF;
+      const score_t score = VP8SSE16x16(src, ref);
+      if (score < best_score) {
+        best_mode = mode;
+        best_score = score;
+      }
+    }
+    VP8SetIntra16Mode(it, best_mode);
+  }
+  if (try_both_i4_i16 || !is_i16) {   // intra4: best sub-mode for each block
+    uint8_t modes_i4[16];
+    // We don't evaluate the rate here, but just account for it through a
+    // constant penalty (i4 mode usually needs more bits compared to i16).
+    score_t score_i4 = (score_t)I4_PENALTY;
+
+    VP8IteratorStartI4(it);
+    do {
+      int mode;
+      int best_sub_mode = -1;
+      score_t best_sub_score = MAX_COST;
+      const uint8_t* const src = it->yuv_in_ + Y_OFF + VP8Scan[it->i4_];
+
+      // TODO(skal): we don't really need the prediction pixels here,
+      // but just the distortion against 'src'.
+      VP8MakeIntra4Preds(it);
+      for (mode = 0; mode < NUM_BMODES; ++mode) {
+        const uint8_t* const ref = it->yuv_p_ + VP8I4ModeOffsets[mode];
+        const score_t score = VP8SSE4x4(src, ref);
+        if (score < best_sub_score) {
+          best_sub_mode = mode;
+          best_sub_score = score;
+        }
+      }
+      modes_i4[it->i4_] = best_sub_mode;
+      score_i4 += best_sub_score;
+      if (score_i4 >= best_score) break;   // already no better: stop early
+    } while (VP8IteratorRotateI4(it, it->yuv_in_ + Y_OFF));
+    if (score_i4 < best_score) {   // only then are all of modes_i4[] used
+      VP8SetIntra4Mode(it, modes_i4);
+    }
+  }
+}
+
 //------------------------------------------------------------------------------
 // Entry point

-int VP8Decimate(VP8EncIterator* const it, VP8ModeScore* const rd, int rd_opt) {
+int VP8Decimate(VP8EncIterator* const it, VP8ModeScore* const rd,
+                VP8RDLevel rd_opt) {
  int is_skipped;
+  const int method = it->enc_->method_;

  InitScore(rd);

@ -902,22 +1132,21 @@ int VP8Decimate(VP8EncIterator* const it, VP8ModeScore* const rd, int rd_opt) {
  VP8MakeLuma16Preds(it);
  VP8MakeChroma8Preds(it);

-  // for rd_opt = 2, we perform trellis-quant on the final decision only.
-  // for rd_opt > 2, we use it for every scoring (=much slower).
-  if (rd_opt > 0) {
-    it->do_trellis_ = (rd_opt > 2);
+  if (rd_opt > RD_OPT_NONE) {
+    it->do_trellis_ = (rd_opt >= RD_OPT_TRELLIS_ALL);
    PickBestIntra16(it, rd);
-    if (it->enc_->method_ >= 2) {
+    if (method >= 2) {
      PickBestIntra4(it, rd);
    }
    PickBestUV(it, rd);
-    if (rd_opt == 2) {
+    if (rd_opt == RD_OPT_TRELLIS) {   // finish off with trellis-optim now
      it->do_trellis_ = 1;
      SimpleQuantize(it, rd);
    }
  } else {
-    // TODO: for method_ == 2, pick the best intra4/intra16 based on SSE
-    it->do_trellis_ = (it->enc_->method_ == 2);
+    // For method == 2, pick the best intra4/intra16 based on SSE (~tad slower).
+    // For method <= 1, we refine intra4 or intra16 (but don't re-examine mode).
+    DistoRefine(it, (method >= 2));
    SimpleQuantize(it, rd);
  }
  is_skipped = (rd->nz == 0);
@ -925,6 +1154,3 @@ int VP8Decimate(VP8EncIterator* const it, VP8ModeScore* const rd, int rd_opt) {
  return is_skipped;
 }

-#if defined(__cplusplus) || defined(c_plusplus)
-}    // extern "C"
-#endif
--- a/src/enc/syntax.c
+++ b/src/enc/syntax.c
@ -1,8 +1,10 @@
 // Copyright 2011 Google Inc. All Rights Reserved.
 //
-// This code is licensed under the same terms as WebM:
-//  Software License Agreement:  http://www.webmproject.org/license/software/
-//  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
 // -----------------------------------------------------------------------------
 //
 // Header syntax writing
@ -11,35 +13,20 @@

 #include <assert.h>

-#include "../webp/format_constants.h"
+#include "../utils/utils.h"
+#include "../webp/format_constants.h"  // RIFF constants
+#include "../webp/mux_types.h"         // ALPHA_FLAG
 #include "./vp8enci.h"

-#if defined(__cplusplus) || defined(c_plusplus)
-extern "C" {
-#endif
-
 //------------------------------------------------------------------------------
 // Helper functions

-// TODO(later): Move to webp/format_constants.h?
-static void PutLE24(uint8_t* const data, uint32_t val) {
-  data[0] = (val >>  0) & 0xff;
-  data[1] = (val >>  8) & 0xff;
-  data[2] = (val >> 16) & 0xff;
-}
-
-static void PutLE32(uint8_t* const data, uint32_t val) {
-  PutLE24(data, val);
-  data[3] = (val >> 24) & 0xff;
-}
-
 static int IsVP8XNeeded(const VP8Encoder* const enc) {
  return !!enc->has_alpha_;  // Currently the only case when VP8X is needed.
                             // This could change in the future.
 }

 static int PutPaddingByte(const WebPPicture* const pic) {
-
  const uint8_t pad_byte[1] = { 0 };
  return !!pic->writer(pad_byte, 1, pic);
 }
@ -73,7 +60,7 @@ static WebPEncodingError PutVP8XHeader(const VP8Encoder* const enc) {
  assert(pic->width <= MAX_CANVAS_SIZE && pic->height <= MAX_CANVAS_SIZE);

  if (enc->has_alpha_) {
-    flags |= ALPHA_FLAG_BIT;
+    flags |= ALPHA_FLAG;
  }

  PutLE32(vp8x + TAG_SIZE,              VP8X_CHUNK_SIZE);
@ -327,7 +314,9 @@ static size_t GeneratePartition0(VP8Encoder* const enc) {

  PutSegmentHeader(bw, enc);
  PutFilterHeader(bw, &enc->filter_hdr_);
-  VP8PutValue(bw, enc->config_->partitions, 2);
+  VP8PutValue(bw, enc->num_parts_ == 8 ? 3 :
+                  enc->num_parts_ == 4 ? 2 :
+                  enc->num_parts_ == 2 ? 1 : 0, 2);
  PutQuant(bw, enc);
  VP8PutBitUniform(bw, 0);   // no proba update
  VP8WriteProbas(bw, &enc->proba_);
@ -432,6 +421,3 @@ int VP8EncWrite(VP8Encoder* const enc) {

 //------------------------------------------------------------------------------

-#if defined(__cplusplus) || defined(c_plusplus)
-}    // extern "C"
-#endif
--- a/src/enc/token.c
+++ b/src/enc/token.c
@ -0,0 +1,273 @@
+// Copyright 2011 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// Paginated token buffer
+//
+// A 'token' is a bit value associated with a probability, which is either
+// fixed or determined later, after statistics have been collected.
+// For a dynamic probability, we just record the slot id (idx) of the
+// probability value in the final array (uint8_t* probas in VP8EmitTokens).
+//
+// Author: Skal (pascal.massimino@gmail.com)
+
+#include <assert.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "./cost.h"
+#include "./vp8enci.h"
+
+#if !defined(DISABLE_TOKEN_BUFFER)
+
+// we use pages to reduce the number of memcpy()
+#define MAX_NUM_TOKEN 8192          // max number of token per page
+#define FIXED_PROBA_BIT (1u << 14)
+
+struct VP8Tokens {
+  uint16_t tokens_[MAX_NUM_TOKEN];  // bit #15: the bit value
+                                    // bit #14: set if constant proba, else idx
+                                    // bits 0..13: slot idx, or constant proba
+  VP8Tokens* next_;                 // next page, NULL for the last one
+};
+
+//------------------------------------------------------------------------------
+
+void VP8TBufferInit(VP8TBuffer* const b) {   // sets the empty state, frees nothing
+  b->tokens_ = NULL;
+  b->pages_ = NULL;
+  b->last_page_ = &b->pages_;   // where the next allocated page gets linked
+  b->left_ = 0;                 // no free slot: first token allocates a page
+  b->error_ = 0;
+}
+
+void VP8TBufferClear(VP8TBuffer* const b) {   // frees all pages, then resets
+  if (b != NULL) {
+    const VP8Tokens* p = b->pages_;
+    while (p != NULL) {
+      const VP8Tokens* const next = p->next_;   // read before freeing 'p'
+      free((void*)p);
+      p = next;
+    }
+    VP8TBufferInit(b);
+  }
+}
+
+static int TBufferNewPage(VP8TBuffer* const b) {   // returns 1 on success, else 0
+  VP8Tokens* const page = b->error_ ? NULL : (VP8Tokens*)malloc(sizeof(*page));
+  if (page == NULL) {   // allocation failed, or an earlier error is pending
+    b->error_ = 1;
+    return 0;
+  }
+  *b->last_page_ = page;          // append to the list of pages
+  b->last_page_ = &page->next_;
+  b->left_ = MAX_NUM_TOKEN;       // the whole page is free
+  b->tokens_ = page->tokens_;
+  page->next_ = NULL;
+  return 1;
+}
+
+//------------------------------------------------------------------------------
+
+#define TOKEN_ID(t, b, ctx, p) \
+    ((p) + NUM_PROBAS * ((ctx) + NUM_CTX * ((b) + NUM_BANDS * (t))))
+
+static WEBP_INLINE int AddToken(VP8TBuffer* const b,
+                                int bit, uint32_t proba_idx) {
+  assert(proba_idx < FIXED_PROBA_BIT);   // idx must fit in bits 0..13
+  assert(bit == 0 || bit == 1);
+  if (b->left_ > 0 || TBufferNewPage(b)) {
+    const int slot = --b->left_;         // slots are filled from the top down
+    b->tokens_[slot] = (bit << 15) | proba_idx;
+  }
+  return bit;   // returned even when no page could be allocated for the token
+}
+
+static WEBP_INLINE void AddConstantToken(VP8TBuffer* const b,
+                                         int bit, int proba) {
+  assert(proba < 256);
+  assert(bit == 0 || bit == 1);
+  if (b->left_ > 0 || TBufferNewPage(b)) {   // else: token is silently dropped
+    const int slot = --b->left_;             // slots are filled from the top down
+    b->tokens_[slot] = (bit << 15) | FIXED_PROBA_BIT | proba;   // proba is stored inline
+  }
+}
+
+int VP8RecordCoeffTokens(int ctx, int coeff_type, int first, int last,
+                         const int16_t* const coeffs,
+                         VP8TBuffer* const tokens) {
+  int n = first;
+  uint32_t base_id = TOKEN_ID(coeff_type, n, ctx, 0);
+  if (!AddToken(tokens, last >= 0, base_id + 0)) {   // last < 0: nothing to code
+    return 0;
+  }
+
+  while (n < 16) {
+    const int c = coeffs[n++];
+    const int sign = c < 0;
+    int v = sign ? -c : c;   // magnitude; the sign is recorded separately below
+    if (!AddToken(tokens, v != 0, base_id + 1)) {   // zero coefficient
+      ctx = 0;
+      base_id = TOKEN_ID(coeff_type, VP8EncBands[n], ctx, 0);
+      continue;
+    }
+    if (!AddToken(tokens, v > 1, base_id + 2)) {   // v == 1
+      ctx = 1;
+    } else {
+      if (!AddToken(tokens, v > 4, base_id + 3)) {   // v in [2..4]
+        if (AddToken(tokens, v != 2, base_id + 4))
+          AddToken(tokens, v == 4, base_id + 5);
+      } else if (!AddToken(tokens, v > 10, base_id + 6)) {   // v in [5..10]
+        if (!AddToken(tokens, v > 6, base_id + 7)) {   // v is 5 or 6
+          AddConstantToken(tokens, v == 6, 159);
+        } else {                                       // v in [7..10]
+          AddConstantToken(tokens, v >= 9, 165);
+          AddConstantToken(tokens, !(v & 1), 145);
+        }
+      } else {   // v > 10: two selector tokens, then extra bits from 'tab'
+        int mask;
+        const uint8_t* tab;
+        if (v < 3 + (8 << 1)) {          // VP8Cat3  (3b)
+          AddToken(tokens, 0, base_id + 8);
+          AddToken(tokens, 0, base_id + 9);
+          v -= 3 + (8 << 0);
+          mask = 1 << 2;
+          tab = VP8Cat3;
+        } else if (v < 3 + (8 << 2)) {   // VP8Cat4  (4b)
+          AddToken(tokens, 0, base_id + 8);
+          AddToken(tokens, 1, base_id + 9);
+          v -= 3 + (8 << 1);
+          mask = 1 << 3;
+          tab = VP8Cat4;
+        } else if (v < 3 + (8 << 3)) {   // VP8Cat5  (5b)
+          AddToken(tokens, 1, base_id + 8);
+          AddToken(tokens, 0, base_id + 10);
+          v -= 3 + (8 << 2);
+          mask = 1 << 4;
+          tab = VP8Cat5;
+        } else {                         // VP8Cat6 (11b)
+          AddToken(tokens, 1, base_id + 8);
+          AddToken(tokens, 1, base_id + 10);
+          v -= 3 + (8 << 3);
+          mask = 1 << 10;
+          tab = VP8Cat6;
+        }
+        while (mask) {   // remaining bits of 'v', most significant bit first
+          AddConstantToken(tokens, !!(v & mask), *tab++);
+          mask >>= 1;
+        }
+      }
+      ctx = 2;
+    }
+    AddConstantToken(tokens, sign, 128);
+    base_id = TOKEN_ID(coeff_type, VP8EncBands[n], ctx, 0);
+    if (n == 16 || !AddToken(tokens, n <= last, base_id + 0)) {
+      return 1;   // EOB
+    }
+  }
+  return 1;
+}
+
+#undef TOKEN_ID
+
+//------------------------------------------------------------------------------
+// This function works, but isn't currently used. Saved for later.
+
+#if 0
+
+static void Record(int bit, proba_t* const stats) {
+  proba_t p = *stats;
+  if (p >= 0xffff0000u) {               // the upper 16-bit count is about to overflow.
+    p = ((p + 1u) >> 1) & 0x7fff7fffu;  // -> divide the stats by 2.
+  }
+  // record bit count (lower 16 bits) and increment total count (upper 16 bits).
+  p += 0x00010000u + bit;
+  *stats = p;
+}
+
+void VP8TokenToStats(const VP8TBuffer* const b, proba_t* const stats) {
+  const VP8Tokens* p = b->pages_;
+  while (p != NULL) {
+    const int N = (p->next_ == NULL) ? b->left_ : 0;   // last page may be partial
+    int n = MAX_NUM_TOKEN;
+    while (n-- > N) {
+      const uint16_t token = p->tokens_[n];
+      if (!(token & FIXED_PROBA_BIT)) {   // constant-proba tokens carry no slot
+        Record((token >> 15) & 1, stats + (token & 0x3fffu));
+      }
+    }
+    p = p->next_;
+  }
+}
+
+#endif   // 0
+
+//------------------------------------------------------------------------------
+// Final coding pass, with known probabilities
+
+int VP8EmitTokens(VP8TBuffer* const b, VP8BitWriter* const bw,
+                  const uint8_t* const probas, int final_pass) {
+  const VP8Tokens* p = b->pages_;
+  // Returns 0 if recording failed earlier, 1 once all tokens are written.
+  if (b->error_) return 0;
+  while (p != NULL) {
+    const VP8Tokens* const next = p->next_;
+    const int N = (next == NULL) ? b->left_ : 0;   // last page may be partial
+    int n = MAX_NUM_TOKEN;
+    while (n-- > N) {   // slots were filled from the top down: same order here
+      const uint16_t token = p->tokens_[n];
+      const int bit = (token >> 15) & 1;
+      if (token & FIXED_PROBA_BIT) {
+        VP8PutBit(bw, bit, token & 0xffu);  // constant proba
+      } else {
+        VP8PutBit(bw, bit, probas[token & 0x3fffu]);
+      }
+    }
+    if (final_pass) free((void*)p);   // 'next' was read above, before the free
+    p = next;
+  }
+  if (final_pass) VP8TBufferInit(b);   // drop every pointer to the freed pages
+  return 1;
+}
+
+// Size estimation: sums VP8BitCost() over all recorded tokens (0 on error).
+size_t VP8EstimateTokenSize(VP8TBuffer* const b, const uint8_t* const probas) {
+  size_t size = 0;
+  const VP8Tokens* p = b->pages_;
+  if (b->error_) return 0;
+  while (p != NULL) {
+    const VP8Tokens* const next = p->next_;
+    const int N = (next == NULL) ? b->left_ : 0;   // last page may be partial
+    int n = MAX_NUM_TOKEN;
+    while (n-- > N) {
+      const uint16_t token = p->tokens_[n];
+      const int bit = (token >> 15) & 1;   // 0 or 1, as in VP8EmitTokens()
+      if (token & FIXED_PROBA_BIT) {
+        size += VP8BitCost(bit, token & 0xffu);   // constant proba
+      } else {
+        size += VP8BitCost(bit, probas[token & 0x3fffu]);
+      }
+    }
+    p = next;
+  }
+  return size;
+}
+
+//------------------------------------------------------------------------------
+
+#else     // DISABLE_TOKEN_BUFFER
+
+void VP8TBufferInit(VP8TBuffer* const b) {
+  (void)b;   // no-op stub used when DISABLE_TOKEN_BUFFER is defined
+}
+void VP8TBufferClear(VP8TBuffer* const b) {
+  (void)b;   // no-op stub used when DISABLE_TOKEN_BUFFER is defined
+}
+
+#endif    // !DISABLE_TOKEN_BUFFER
+
--- a/src/enc/tree.c
+++ b/src/enc/tree.c
@ -1,27 +1,24 @@
 // Copyright 2011 Google Inc. All Rights Reserved.
 //
-// This code is licensed under the same terms as WebM:
-//  Software License Agreement:  http://www.webmproject.org/license/software/
-//  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
 // -----------------------------------------------------------------------------
 //
-// Token probabilities
+// Coding of token probabilities, intra modes and segments.
 //
 // Author: Skal (pascal.massimino@gmail.com)

 #include "./vp8enci.h"

-#if defined(__cplusplus) || defined(c_plusplus)
-extern "C" {
-#endif
-
 //------------------------------------------------------------------------------
 // Default probabilities

 // Paragraph 13.5
 const uint8_t
  VP8CoeffsProba0[NUM_TYPES][NUM_BANDS][NUM_CTX][NUM_PROBAS] = {
-  // genereated using vp8_default_coef_probs() in entropy.c:129
  { { { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }
@ -318,7 +315,7 @@ void VP8CodeIntraModes(VP8Encoder* const enc) {
  VP8EncIterator it;
  VP8IteratorInit(enc, &it);
  do {
-    const VP8MBInfo* mb = it.mb_;
+    const VP8MBInfo* const mb = it.mb_;
    const uint8_t* preds = it.preds_;
    if (enc->segment_hdr_.update_map_) {
      PutSegment(bw, mb->segment_, enc->proba_.segments_);
@ -343,7 +340,7 @@ void VP8CodeIntraModes(VP8Encoder* const enc) {
      }
    }
    PutUVMode(bw, mb->uv_mode_);
-  } while (VP8IteratorNext(&it, 0));
+  } while (VP8IteratorNext(&it));
 }

 //------------------------------------------------------------------------------
@ -505,6 +502,3 @@ void VP8WriteProbas(VP8BitWriter* const bw, const VP8Proba* const probas) {
  }
 }

-#if defined(__cplusplus) || defined(c_plusplus)
-}    // extern "C"
-#endif
--- a/src/enc/vp8enci.h
+++ b/src/enc/vp8enci.h
@ -1,8 +1,10 @@
 // Copyright 2011 Google Inc. All Rights Reserved.
 //
-// This code is licensed under the same terms as WebM:
-//  Software License Agreement:  http://www.webmproject.org/license/software/
-//  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
 // -----------------------------------------------------------------------------
 //
 //   WebP encoder: internal header.
@ -16,8 +18,9 @@
 #include "../webp/encode.h"
 #include "../dsp/dsp.h"
 #include "../utils/bit_writer.h"
+#include "../utils/thread.h"

-#if defined(__cplusplus) || defined(c_plusplus)
+#ifdef __cplusplus
 extern "C" {
 #endif

@ -26,11 +29,8 @@ extern "C" {

 // version numbers
 #define ENC_MAJ_VERSION 0
-#define ENC_MIN_VERSION 1
-#define ENC_REV_VERSION 99
-
-// size of histogram used by CollectHistogram.
-#define MAX_COEFF_THRESH   64
+#define ENC_MIN_VERSION 4
+#define ENC_REV_VERSION 0

 // intra prediction modes
 enum { B_DC_PRED = 0,   // 4x4 modes
@ -47,7 +47,8 @@ enum { B_DC_PRED = 0,   // 4x4 modes

       // Luma16 or UV modes
       DC_PRED = B_DC_PRED, V_PRED = B_VE_PRED,
-       H_PRED = B_HE_PRED, TM_PRED = B_TM_PRED
+       H_PRED = B_HE_PRED, TM_PRED = B_TM_PRED,
+       NUM_PRED_MODES = 4
     };

 enum { NUM_MB_SEGMENTS = 4,
@ -57,15 +58,23 @@ enum { NUM_MB_SEGMENTS = 4,
       NUM_CTX = 3,
       NUM_PROBAS = 11,
       MAX_LF_LEVELS = 64,       // Maximum loop filter level
-       MAX_VARIABLE_LEVEL = 67  // last (inclusive) level with variable cost
+       MAX_VARIABLE_LEVEL = 67,  // last (inclusive) level with variable cost
+       MAX_LEVEL = 2047          // max level (note: max codable is 2047 + 67)
     };

+typedef enum {   // Rate-distortion optimization levels
+  RD_OPT_NONE        = 0,  // no rd-opt
+  RD_OPT_BASIC       = 1,  // basic scoring (no trellis)
+  RD_OPT_TRELLIS     = 2,  // perform trellis-quant on the final decision only
+  RD_OPT_TRELLIS_ALL = 3   // trellis-quant for every scoring (much slower)
+} VP8RDLevel;
+
 // YUV-cache parameters. Cache is 16-pixels wide.
 // The original or reconstructed samples can be accessed using VP8Scan[]
 // The predicted blocks can be accessed using offsets to yuv_p_ and
 // the arrays VP8*ModeOffsets[];
 //         +----+      YUV Samples area. See VP8Scan[] for accessing the blocks.
-//  Y_OFF  |YYYY| <- original samples  (enc->yuv_in_)
+//  Y_OFF  |YYYY| <- original samples  ('yuv_in_')
 //         |YYYY|
 //         |YYYY|
 //         |YYYY|
@ -160,7 +169,17 @@ typedef int64_t score_t;     // type used for scores, rate, distortion
 static WEBP_INLINE int QUANTDIV(int n, int iQ, int B) {
  return (n * iQ + B) >> QFIX;
 }
-extern const uint8_t VP8Zigzag[16];
+
+// size of histogram used by CollectHistogram.
+#define MAX_COEFF_THRESH   31
+typedef struct VP8Histogram VP8Histogram;
+struct VP8Histogram {
+  // TODO(skal): we only need to store the max_value and last_non_zero actually.
+  int distribution[MAX_COEFF_THRESH + 1];
+};
+
+// Uncomment the following to remove token-buffer code:
+// #define DISABLE_TOKEN_BUFFER

 //------------------------------------------------------------------------------
 // Headers
@ -229,16 +248,19 @@ typedef struct {
  int beta_;       // filter-susceptibility, range [0,255].
  int quant_;      // final segment quantizer.
  int fstrength_;  // final in-loop filtering strength
+  int max_edge_;   // max edge delta (for filtering strength)
+  int min_disto_;  // minimum distortion required to trigger filtering record
  // reactivities
  int lambda_i16_, lambda_i4_, lambda_uv_;
  int lambda_mode_, lambda_trellis_, tlambda_;
  int lambda_trellis_i16_, lambda_trellis_i4_, lambda_trellis_uv_;
 } VP8SegmentInfo;

-// Handy transcient struct to accumulate score and info during RD-optimization
+// Handy transient struct to accumulate score and info during RD-optimization
 // and mode evaluation.
 typedef struct {
-  score_t D, SD, R, score;    // Distortion, spectral distortion, rate, score.
+  score_t D, SD;              // Distortion, spectral distortion
+  score_t H, R, score;        // header bits, rate, score.
  int16_t y_dc_levels[16];    // Quantized levels for luma-DC, luma-AC, chroma.
  int16_t y_ac_levels[16][16];
  int16_t uv_levels[4 + 4][16];
@ -252,12 +274,11 @@ typedef struct {
 // right neighbouring data (samples, predictions, contexts, ...)
 typedef struct {
  int x_, y_;                      // current macroblock
-  int y_offset_, uv_offset_;       // offset to the luma / chroma planes
  int y_stride_, uv_stride_;       // respective strides
-  uint8_t*      yuv_in_;           // borrowed from enc_ (for now)
-  uint8_t*      yuv_out_;          // ''
-  uint8_t*      yuv_out2_;         // ''
-  uint8_t*      yuv_p_;            // ''
+  uint8_t*      yuv_in_;           // input samples
+  uint8_t*      yuv_out_;          // output samples
+  uint8_t*      yuv_out2_;         // secondary buffer swapped with yuv_out_.
+  uint8_t*      yuv_p_;            // scratch buffer for prediction
  VP8Encoder*   enc_;              // back-pointer
  VP8MBInfo*    mb_;               // current macroblock
  VP8BitWriter* bw_;               // current bit-writer
@ -273,24 +294,43 @@ typedef struct {
  uint64_t      uv_bits_;          // macroblock bit-cost for chroma
  LFStats*      lf_stats_;         // filter stats (borrowed from enc_)
  int           do_trellis_;       // if true, perform extra level optimisation
-  int           done_;             // true when scan is finished
+  int           count_down_;       // number of mb still to be processed
+  int           count_down0_;      // starting counter value (for progress)
  int           percent0_;         // saved initial progress percent
+
+  uint8_t* y_left_;    // left luma samples (addressable from index -1 to 15).
+  uint8_t* u_left_;    // left u samples (addressable from index -1 to 7)
+  uint8_t* v_left_;    // left v samples (addressable from index -1 to 7)
+
+  uint8_t* y_top_;     // top luma samples at position 'x_'
+  uint8_t* uv_top_;    // top u/v samples at position 'x_', packed as 16 bytes
+
+  // memory for storing y/u/v_left_ and yuv_in_/out_*
+  uint8_t yuv_left_mem_[17 + 16 + 16 + 8 + ALIGN_CST];     // memory for *_left_
+  uint8_t yuv_mem_[3 * YUV_SIZE + PRED_SIZE + ALIGN_CST];  // memory for yuv_*
 } VP8EncIterator;

  // in iterator.c
-// must be called first.
+// must be called first
 void VP8IteratorInit(VP8Encoder* const enc, VP8EncIterator* const it);
-// restart a scan.
+// restart a scan
 void VP8IteratorReset(VP8EncIterator* const it);
-// import samples from source
-void VP8IteratorImport(const VP8EncIterator* const it);
+// reset iterator position to row 'y'
+void VP8IteratorSetRow(VP8EncIterator* const it, int y);
+// set count down (=number of iterations to go)
+void VP8IteratorSetCountDown(VP8EncIterator* const it, int count_down);
+// return true if iteration is finished
+int VP8IteratorIsDone(const VP8EncIterator* const it);
+// Import uncompressed samples from source.
+// If tmp_32 is not NULL, import boundary samples too.
+// tmp_32 is a 32-bytes scratch buffer that must be aligned in memory.
+void VP8IteratorImport(VP8EncIterator* const it, uint8_t* tmp_32);
 // export decimated samples
 void VP8IteratorExport(const VP8EncIterator* const it);
-// go to next macroblock. Returns !done_. If *block_to_save is non-null, will
-// save the boundary values to top_/left_ arrays. block_to_save can be
-// it->yuv_out_ or it->yuv_in_.
-int VP8IteratorNext(VP8EncIterator* const it,
-                    const uint8_t* const block_to_save);
+// go to next macroblock. Returns false once the scan is finished.
+int VP8IteratorNext(VP8EncIterator* const it);
+// save the yuv_out_ boundary values to top_/left_ arrays for next iterations.
+void VP8IteratorSaveBoundary(VP8EncIterator* const it);
 // Report progression based on macroblock rows. Return 0 for user-abort request.
 int VP8IteratorProgress(const VP8EncIterator* const it,
                        int final_delta_percent);
@ -314,44 +354,40 @@ void VP8SetSegment(const VP8EncIterator* const it, int segment);
 //------------------------------------------------------------------------------
 // Paginated token buffer

-// WIP: #define USE_TOKEN_BUFFER
-
-#ifdef USE_TOKEN_BUFFER
-
-#define MAX_NUM_TOKEN 2048
-
-typedef struct VP8Tokens VP8Tokens;
-struct VP8Tokens {
-  uint16_t tokens_[MAX_NUM_TOKEN];  // bit#15: bit, bits 0..14: slot
-  int left_;
-  VP8Tokens* next_;
-};
+typedef struct VP8Tokens VP8Tokens;  // struct details in token.c

 typedef struct {
-  VP8Tokens* rows_;
-  uint16_t* tokens_;    // set to (*last_)->tokens_
-  VP8Tokens** last_;
-  int left_;
+#if !defined(DISABLE_TOKEN_BUFFER)
+  VP8Tokens* pages_;        // first page
+  VP8Tokens** last_page_;   // last page
+  uint16_t* tokens_;        // set to (*last_page_)->tokens_
+  int left_;          // how many free tokens left before the page is full.
+#endif
  int error_;         // true in case of malloc error
 } VP8TBuffer;

 void VP8TBufferInit(VP8TBuffer* const b);    // initialize an empty buffer
-int VP8TBufferNewPage(VP8TBuffer* const b);  // allocate a new page
-void VP8TBufferClear(VP8TBuffer* const b);   // de-allocate memory
+void VP8TBufferClear(VP8TBuffer* const b);   // de-allocate pages memory

-int VP8EmitTokens(const VP8TBuffer* const b, VP8BitWriter* const bw,
-                  const uint8_t* const probas);
+#if !defined(DISABLE_TOKEN_BUFFER)

-static WEBP_INLINE int VP8AddToken(VP8TBuffer* const b,
-                                   int bit, int proba_idx) {
-  if (b->left_ > 0 || VP8TBufferNewPage(b)) {
-    const int slot = --b->left_;
-    b->tokens_[slot] = (bit << 15) | proba_idx;
-  }
-  return bit;
-}
+// Finalizes bitstream when probabilities are known.
+// Deletes the allocated token memory if final_pass is true.
+int VP8EmitTokens(VP8TBuffer* const b, VP8BitWriter* const bw,
+                  const uint8_t* const probas, int final_pass);

-#endif  // USE_TOKEN_BUFFER
+// record the coding of coefficients without knowing the probabilities yet
+int VP8RecordCoeffTokens(int ctx, int coeff_type, int first, int last,
+                         const int16_t* const coeffs,
+                         VP8TBuffer* const tokens);
+
+// Estimate the final coded size given a set of 'probas'.
+size_t VP8EstimateTokenSize(VP8TBuffer* const b, const uint8_t* const probas);
+
+// unused for now
+void VP8TokenToStats(const VP8TBuffer* const b, proba_t* const stats);
+
+#endif  // !DISABLE_TOKEN_BUFFER

 //------------------------------------------------------------------------------
 // VP8Encoder
@ -376,6 +412,7 @@ struct VP8Encoder {
  // per-partition boolean decoders.
  VP8BitWriter bw_;                         // part0
  VP8BitWriter parts_[MAX_NUM_PARTITIONS];  // token partitions
+  VP8TBuffer tokens_;                       // token buffer

  int percent_;                             // for progress

@ -383,6 +420,7 @@ struct VP8Encoder {
  int has_alpha_;
  uint8_t* alpha_data_;       // non-NULL if transparency is present
  uint32_t alpha_data_size_;
+  WebPWorker alpha_worker_;

  // enhancement layer
  int use_layer_;
@ -394,6 +432,7 @@ struct VP8Encoder {
  VP8SegmentInfo dqm_[NUM_MB_SEGMENTS];
  int base_quant_;                 // nominal quantizer value. Only used
                                   // for relative coding of segments' quant.
+  int alpha_;                      // global susceptibility (<=> complexity)
  int uv_alpha_;                   // U/V quantization susceptibility
  // global offset of quantizers, shared by all segments
  int dq_y1_dc_;
@ -402,7 +441,7 @@ struct VP8Encoder {

  // probabilities and statistics
  VP8Proba proba_;
-  uint64_t sse_[3];        // sum of Y/U/V squared errors for all macroblocks
+  uint64_t sse_[4];        // sum of Y/U/V/A squared errors for all macroblocks
  uint64_t sse_count_;     // pixel count for the sse_[] stats
  int      coded_size_;
  int      residual_bytes_[3][4];
@ -410,24 +449,19 @@ struct VP8Encoder {

  // quality/speed settings
  int method_;               // 0=fastest, 6=best/slowest.
-  int rd_opt_level_;        // Deduced from method_.
+  VP8RDLevel rd_opt_level_;  // Deduced from method_.
  int max_i4_header_bits_;   // partition #0 safeness factor
+  int thread_level_;         // derived from config->thread_level
+  int do_search_;            // derived from config->target_XXX
+  int use_tokens_;           // if true, use token buffer

  // Memory
  VP8MBInfo* mb_info_;   // contextual macroblock infos (mb_w_ + 1)
  uint8_t*   preds_;     // predictions modes: (4*mb_w+1) * (4*mb_h+1)
  uint32_t*  nz_;        // non-zero bit context: mb_w+1
-  uint8_t*   yuv_in_;    // input samples
-  uint8_t*   yuv_out_;   // output samples
-  uint8_t*   yuv_out2_;  // secondary scratch out-buffer. swapped with yuv_out_.
-  uint8_t*   yuv_p_;     // scratch buffer for prediction
  uint8_t   *y_top_;     // top luma samples.
  uint8_t   *uv_top_;    // top u/v samples.
-                         // U and V are packed into 16 pixels (8 U + 8 V)
-  uint8_t   *y_left_;    // left luma samples (adressable from index -1 to 15).
-  uint8_t   *u_left_;    // left u samples (adressable from index -1 to 7)
-  uint8_t   *v_left_;    // left v samples (adressable from index -1 to 7)
-
+                         // U and V are packed into 16 bytes (8 U + 8 V)
  LFStats   *lf_stats_;  // autofilter stats (if NULL, autofilter is off)
 };

@ -455,6 +489,11 @@ void VP8EncFreeBitWriters(VP8Encoder* const enc);

  // in frame.c
 extern const uint8_t VP8EncBands[16 + 1];
+extern const uint8_t VP8Cat3[];
+extern const uint8_t VP8Cat4[];
+extern const uint8_t VP8Cat5[];
+extern const uint8_t VP8Cat6[];
+
 // Form all the four Intra16x16 predictions in the yuv_p_ cache
 void VP8MakeLuma16Preds(const VP8EncIterator* const it);
 // Form all the four Chroma8x8 predictions in the yuv_p_ cache
@ -466,9 +505,9 @@ void VP8MakeIntra4Preds(const VP8EncIterator* const it);
 int VP8GetCostLuma16(VP8EncIterator* const it, const VP8ModeScore* const rd);
 int VP8GetCostLuma4(VP8EncIterator* const it, const int16_t levels[16]);
 int VP8GetCostUV(VP8EncIterator* const it, const VP8ModeScore* const rd);
-// Main stat / coding passes
+// Main coding calls
 int VP8EncLoop(VP8Encoder* const enc);
-int VP8StatLoop(VP8Encoder* const enc);
+int VP8EncTokenLoop(VP8Encoder* const enc);

  // in webpenc.c
 // Assign an error code to a picture. Return false for convenience.
@ -485,12 +524,14 @@ int VP8EncAnalyze(VP8Encoder* const enc);
 // Sets up segment's quantization values, base_quant_ and filter strengths.
 void VP8SetSegmentParams(VP8Encoder* const enc, float quality);
 // Pick best modes and fills the levels. Returns true if skipped.
-int VP8Decimate(VP8EncIterator* const it, VP8ModeScore* const rd, int rd_opt);
+int VP8Decimate(VP8EncIterator* const it, VP8ModeScore* const rd,
+                VP8RDLevel rd_opt);

  // in alpha.c
-void VP8EncInitAlpha(VP8Encoder* enc);           // initialize alpha compression
-int VP8EncFinishAlpha(VP8Encoder* enc);          // finalize compressed data
-void VP8EncDeleteAlpha(VP8Encoder* enc);         // delete compressed data
+void VP8EncInitAlpha(VP8Encoder* const enc);    // initialize alpha compression
+int VP8EncStartAlpha(VP8Encoder* const enc);    // start alpha coding process
+int VP8EncFinishAlpha(VP8Encoder* const enc);   // finalize compressed data
+int VP8EncDeleteAlpha(VP8Encoder* const enc);   // delete compressed data

  // in layer.c
 void VP8EncInitLayer(VP8Encoder* const enc);     // init everything
@ -516,9 +557,13 @@ void VP8InitFilter(VP8EncIterator* const it);
 void VP8StoreFilterStats(VP8EncIterator* const it);
 void VP8AdjustFilterStrength(VP8EncIterator* const it);

+// returns the approximate filtering strength needed to smooth an edge
+// step of 'delta', given a sharpness parameter 'sharpness'.
+int VP8FilterStrengthFromDelta(int sharpness, int delta);
+
 //------------------------------------------------------------------------------

-#if defined(__cplusplus) || defined(c_plusplus)
+#ifdef __cplusplus
 }    // extern "C"
 #endif

--- a/src/enc/vp8l.c
+++ b/src/enc/vp8l.c
@ -1,8 +1,10 @@
 // Copyright 2012 Google Inc. All Rights Reserved.
 //
-// This code is licensed under the same terms as WebM:
-//  Software License Agreement:  http://www.webmproject.org/license/software/
-//  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
 // -----------------------------------------------------------------------------
 //
 // main entry for the lossless encoder.
@ -20,14 +22,12 @@
 #include "../dsp/lossless.h"
 #include "../utils/bit_writer.h"
 #include "../utils/huffman_encode.h"
+#include "../utils/utils.h"
 #include "../webp/format_constants.h"

-#if defined(__cplusplus) || defined(c_plusplus)
-extern "C" {
-#endif
-
 #define PALETTE_KEY_RIGHT_SHIFT   22  // Key for 1K buffer.
 #define MAX_HUFF_IMAGE_SIZE       (16 * 1024 * 1024)
+#define MAX_COLORS_FOR_GRAPH      64

 // -----------------------------------------------------------------------------
 // Palette
@ -35,7 +35,8 @@ extern "C" {
 static int CompareColors(const void* p1, const void* p2) {
  const uint32_t a = *(const uint32_t*)p1;
  const uint32_t b = *(const uint32_t*)p2;
-  return (a < b) ? -1 : (a > b) ? 1 : 0;
+  assert(a != b);
+  return (a < b) ? -1 : 1;
 }

 // If number of colors in the image is less than or equal to MAX_PALETTE_SIZE,
@ -83,7 +84,7 @@ static int AnalyzeAndCreatePalette(const WebPPicture* const pic,
    argb += pic->argb_stride;
  }

-  // TODO(skal): could we reuse in_use[] to speed up ApplyPalette()?
+  // TODO(skal): could we reuse in_use[] to speed up EncodePalette()?
  num_colors = 0;
  for (i = 0; i < (int)(sizeof(in_use) / sizeof(in_use[0])); ++i) {
    if (in_use[i]) {
@ -97,23 +98,24 @@ static int AnalyzeAndCreatePalette(const WebPPicture* const pic,
  return 1;
 }

-static int AnalyzeEntropy(const WebPPicture* const pic,
+static int AnalyzeEntropy(const uint32_t* argb,
+                          int width, int height, int argb_stride,
                          double* const nonpredicted_bits,
                          double* const predicted_bits) {
  int x, y;
-  const uint32_t* argb = pic->argb;
  const uint32_t* last_line = NULL;
  uint32_t last_pix = argb[0];    // so we're sure that pix_diff == 0

  VP8LHistogram* nonpredicted = NULL;
-  VP8LHistogram* predicted = (VP8LHistogram*)malloc(2 * sizeof(*predicted));
+  VP8LHistogram* predicted =
+      (VP8LHistogram*)malloc(2 * sizeof(*predicted));
  if (predicted == NULL) return 0;
  nonpredicted = predicted + 1;

  VP8LHistogramInit(predicted, 0);
  VP8LHistogramInit(nonpredicted, 0);
-  for (y = 0; y < pic->height; ++y) {
-    for (x = 0; x < pic->width; ++x) {
+  for (y = 0; y < height; ++y) {
+    for (x = 0; x < width; ++x) {
      const uint32_t pix = argb[x];
      const uint32_t pix_diff = VP8LSubPixels(pix, last_pix);
      if (pix_diff == 0) continue;
@ -129,7 +131,7 @@ static int AnalyzeEntropy(const WebPPicture* const pic,
      }
    }
    last_line = argb;
-    argb += pic->argb_stride;
+    argb += argb_stride;
  }
  *nonpredicted_bits = VP8LHistogramEstimateBitsBulk(nonpredicted);
  *predicted_bits = VP8LHistogramEstimateBitsBulk(predicted);
@ -143,32 +145,39 @@ static int VP8LEncAnalyze(VP8LEncoder* const enc, WebPImageHint image_hint) {

  enc->use_palette_ =
      AnalyzeAndCreatePalette(pic, enc->palette_, &enc->palette_size_);
-  if (!enc->use_palette_) {
-    if (image_hint == WEBP_HINT_DEFAULT) {
-      double non_pred_entropy, pred_entropy;
-      if (!AnalyzeEntropy(pic, &non_pred_entropy, &pred_entropy)) {
-        return 0;
+
+  if (image_hint == WEBP_HINT_GRAPH) {
+    if (enc->use_palette_ && enc->palette_size_ < MAX_COLORS_FOR_GRAPH) {
+      enc->use_palette_ = 0;
+    }
  }

+  if (!enc->use_palette_) {
+    if (image_hint == WEBP_HINT_PHOTO) {
+      enc->use_predict_ = 1;
+      enc->use_cross_color_ = 1;
+    } else {
+      double non_pred_entropy, pred_entropy;
+      if (!AnalyzeEntropy(pic->argb, pic->width, pic->height, pic->argb_stride,
+                          &non_pred_entropy, &pred_entropy)) {
+        return 0;
+      }
      if (pred_entropy < 0.95 * non_pred_entropy) {
        enc->use_predict_ = 1;
        enc->use_cross_color_ = 1;
      }
-    } else if (image_hint == WEBP_HINT_PHOTO) {
-      enc->use_predict_ = 1;
-      enc->use_cross_color_ = 1;
    }
  }
-  return 1;
-}

+  return 1;
+}

 static int GetHuffBitLengthsAndCodes(
    const VP8LHistogramSet* const histogram_image,
    HuffmanTreeCode* const huffman_codes) {
  int i, k;
  int ok = 1;
-  int total_length_size = 0;
+  uint64_t total_length_size = 0;
  uint8_t* mem_buf = NULL;
  const int histogram_image_size = histogram_image->size;

@ -189,9 +198,8 @@ static int GetHuffBitLengthsAndCodes(
  {
    uint16_t* codes;
    uint8_t* lengths;
-    const size_t total_buf_size = total_length_size * sizeof(*lengths)
-                                + total_length_size * sizeof(*codes);
-    mem_buf = (uint8_t*)calloc(total_buf_size, 1);
+    mem_buf = (uint8_t*)WebPSafeCalloc(total_length_size,
+                                       sizeof(*lengths) + sizeof(*codes));
    if (mem_buf == NULL) {
      ok = 0;
      goto End;
@ -208,7 +216,7 @@ static int GetHuffBitLengthsAndCodes(
  }

  // Create Huffman trees.
-  for (i = 0; i < histogram_image_size; ++i) {
+  for (i = 0; ok && (i < histogram_image_size); ++i) {
    HuffmanTreeCode* const codes = &huffman_codes[5 * i];
    VP8LHistogram* const histo = histogram_image->histograms[i];
    ok = ok && VP8LCreateHuffmanTree(histo->literal_, 15, codes + 0);
@ -219,7 +227,11 @@ static int GetHuffBitLengthsAndCodes(
  }

 End:
-  if (!ok) free(mem_buf);
+  if (!ok) {
+    free(mem_buf);
+    // If one VP8LCreateHuffmanTree() above fails, we need to clean up behind.
+    memset(huffman_codes, 0, 5 * histogram_image_size * sizeof(*huffman_codes));
+  }
  return ok;
 }

@ -293,7 +305,7 @@ static int StoreFullHuffmanCode(VP8LBitWriter* const bw,
  int num_tokens;
  HuffmanTreeCode huffman_code;
  HuffmanTreeToken* const tokens =
-      (HuffmanTreeToken*)malloc(max_tokens * sizeof(*tokens));
+      (HuffmanTreeToken*)WebPSafeMalloc((uint64_t)max_tokens, sizeof(*tokens));
  if (tokens == NULL) return 0;

  huffman_code.num_symbols = CODE_LENGTH_CODES;
@ -394,9 +406,10 @@ static int StoreHuffmanCode(VP8LBitWriter* const bw,
 }

 static void WriteHuffmanCode(VP8LBitWriter* const bw,
-                             const HuffmanTreeCode* const code, int index) {
-  const int depth = code->code_lengths[index];
-  const int symbol = code->codes[index];
+                             const HuffmanTreeCode* const code,
+                             int code_index) {
+  const int depth = code->code_lengths[code_index];
+  const int symbol = code->codes[code_index];
  VP8LWriteBits(bw, depth, symbol);
 }

@ -431,12 +444,12 @@ static void StoreImageToBitMask(
      int bits, n_bits;
      int code, distance;

-      PrefixEncode(v->len, &code, &n_bits, &bits);
+      VP8LPrefixEncode(v->len, &code, &n_bits, &bits);
      WriteHuffmanCode(bw, codes, 256 + code);
      VP8LWriteBits(bw, n_bits, bits);

      distance = PixOrCopyDistance(v);
-      PrefixEncode(distance, &code, &n_bits, &bits);
+      VP8LPrefixEncode(distance, &code, &n_bits, &bits);
      WriteHuffmanCode(bw, codes + 4, code);
      VP8LWriteBits(bw, n_bits, bits);
    }
@ -500,24 +513,29 @@ static int EncodeImageInternal(VP8LBitWriter* const bw,
                               const uint32_t* const argb,
                               int width, int height, int quality,
                               int cache_bits, int histogram_bits) {
-  int i;
  int ok = 0;
  const int use_2d_locality = 1;
  const int use_color_cache = (cache_bits > 0);
-  const int histogram_image_xysize =
+  const uint32_t histogram_image_xysize =
      VP8LSubSampleSize(width, histogram_bits) *
      VP8LSubSampleSize(height, histogram_bits);
  VP8LHistogramSet* histogram_image =
      VP8LAllocateHistogramSet(histogram_image_xysize, 0);
  int histogram_image_size = 0;
-  int bit_array_size = 0;
+  size_t bit_array_size = 0;
  HuffmanTreeCode* huffman_codes = NULL;
  VP8LBackwardRefs refs;
  uint16_t* const histogram_symbols =
-      (uint16_t*)malloc(histogram_image_xysize * sizeof(*histogram_symbols));
+      (uint16_t*)WebPSafeMalloc((uint64_t)histogram_image_xysize,
+                                sizeof(*histogram_symbols));
  assert(histogram_bits >= MIN_HUFFMAN_BITS);
  assert(histogram_bits <= MAX_HUFFMAN_BITS);
-  if (histogram_image == NULL || histogram_symbols == NULL) goto Error;
+
+  if (histogram_image == NULL || histogram_symbols == NULL) {
+    free(histogram_image);
+    free(histogram_symbols);
+    return 0;
+  }

  // Calculate backward references from ARGB image.
  if (!VP8LGetBackwardReferences(width, height, argb, quality, cache_bits,
@ -534,12 +552,15 @@ static int EncodeImageInternal(VP8LBitWriter* const bw,
  // Create Huffman bit lengths and codes for each histogram image.
  histogram_image_size = histogram_image->size;
  bit_array_size = 5 * histogram_image_size;
-  huffman_codes = (HuffmanTreeCode*)calloc(bit_array_size,
+  huffman_codes = (HuffmanTreeCode*)WebPSafeCalloc(bit_array_size,
                                                   sizeof(*huffman_codes));
  if (huffman_codes == NULL ||
      !GetHuffBitLengthsAndCodes(histogram_image, huffman_codes)) {
    goto Error;
  }
+  // Free combined histograms.
+  free(histogram_image);
+  histogram_image = NULL;

  // Color Cache parameters.
  VP8LWriteBits(bw, 1, use_color_cache);
@ -553,14 +574,16 @@ static int EncodeImageInternal(VP8LBitWriter* const bw,
    VP8LWriteBits(bw, 1, write_histogram_image);
    if (write_histogram_image) {
      uint32_t* const histogram_argb =
-          (uint32_t*)malloc(histogram_image_xysize * sizeof(*histogram_argb));
+          (uint32_t*)WebPSafeMalloc((uint64_t)histogram_image_xysize,
+                                    sizeof(*histogram_argb));
      int max_index = 0;
+      uint32_t i;
      if (histogram_argb == NULL) goto Error;
      for (i = 0; i < histogram_image_xysize; ++i) {
-        const int index = histogram_symbols[i] & 0xffff;
-        histogram_argb[i] = 0xff000000 | (index << 8);
-        if (index >= max_index) {
-          max_index = index + 1;
+        const int symbol_index = histogram_symbols[i] & 0xffff;
+        histogram_argb[i] = 0xff000000 | (symbol_index << 8);
+        if (symbol_index >= max_index) {
+          max_index = symbol_index + 1;
        }
      }
      histogram_image_size = max_index;
@ -576,17 +599,14 @@ static int EncodeImageInternal(VP8LBitWriter* const bw,
  }

  // Store Huffman codes.
+  {
+    int i;
    for (i = 0; i < 5 * histogram_image_size; ++i) {
      HuffmanTreeCode* const codes = &huffman_codes[i];
-    if (!StoreHuffmanCode(bw, codes)) {
-      goto Error;
-    }
+      if (!StoreHuffmanCode(bw, codes)) goto Error;
      ClearHuffmanTreeIfOnlyOneSymbol(codes);
    }
-
-  // Free combined histograms.
-  free(histogram_image);
-  histogram_image = NULL;
+  }

  // Store actual literals.
  StoreImageToBitMask(bw, width, histogram_bits, &refs,
@ -594,7 +614,7 @@ static int EncodeImageInternal(VP8LBitWriter* const bw,
  ok = 1;

 Error:
-  if (!ok) free(histogram_image);
+  free(histogram_image);

  VP8LClearBackwardRefs(&refs);
  if (huffman_codes != NULL) {
@ -610,7 +630,7 @@ static int EncodeImageInternal(VP8LBitWriter* const bw,

 // Check if it would be a good idea to subtract green from red and blue. We
 // only impact entropy in red/blue components, don't bother to look at others.
-static int EvalAndApplySubtractGreen(const VP8LEncoder* const enc,
+static int EvalAndApplySubtractGreen(VP8LEncoder* const enc,
                                     int width, int height,
                                     VP8LBitWriter* const bw) {
  if (!enc->use_palette_) {
@ -639,7 +659,8 @@ static int EvalAndApplySubtractGreen(const VP8LEncoder* const enc,
    free(histo);

    // Check if subtracting green yields low entropy.
-    if (bit_cost_after < bit_cost_before) {
+    enc->use_subtract_green_ = (bit_cost_after < bit_cost_before);
+    if (enc->use_subtract_green_) {
      VP8LWriteBits(bw, 1, TRANSFORM_PRESENT);
      VP8LWriteBits(bw, 2, SUBTRACT_GREEN);
      VP8LSubtractGreenFromBlueAndRed(enc->argb_, width * height);
@ -674,7 +695,7 @@ static int ApplyCrossColorFilter(const VP8LEncoder* const enc,
  const int ccolor_transform_bits = enc->transform_bits_;
  const int transform_width = VP8LSubSampleSize(width, ccolor_transform_bits);
  const int transform_height = VP8LSubSampleSize(height, ccolor_transform_bits);
-  const int step = (quality == 0) ? 32 : 8;
+  const int step = (quality < 25) ? 32 : (quality > 50) ? 8 : 16;

  VP8LColorSpaceTransform(width, height, ccolor_transform_bits, step,
                          enc->argb_, enc->transform_data_);
@ -691,13 +712,6 @@ static int ApplyCrossColorFilter(const VP8LEncoder* const enc,

 // -----------------------------------------------------------------------------

-static void PutLE32(uint8_t* const data, uint32_t val) {
-  data[0] = (val >>  0) & 0xff;
-  data[1] = (val >>  8) & 0xff;
-  data[2] = (val >> 16) & 0xff;
-  data[3] = (val >> 24) & 0xff;
-}
-
 static WebPEncodingError WriteRiffHeader(const WebPPicture* const pic,
                                         size_t riff_size, size_t vp8l_size) {
  uint8_t riff[RIFF_HEADER_SIZE + CHUNK_HEADER_SIZE + VP8L_SIGNATURE_SIZE] = {
@ -754,7 +768,7 @@ static WebPEncodingError WriteImage(const WebPPicture* const pic,
      goto Error;
    }
  }
-  *coded_size = vp8l_size;
+  *coded_size = CHUNK_HEADER_SIZE + riff_size;
  return VP8_ENC_OK;

 Error:
@ -769,14 +783,14 @@ static WebPEncodingError AllocateTransformBuffer(VP8LEncoder* const enc,
                                                 int width, int height) {
  WebPEncodingError err = VP8_ENC_OK;
  const int tile_size = 1 << enc->transform_bits_;
-  const size_t image_size = width * height;
-  const size_t argb_scratch_size = tile_size * width + width;
-  const size_t transform_data_size =
-      VP8LSubSampleSize(width, enc->transform_bits_) *
-      VP8LSubSampleSize(height, enc->transform_bits_);
-  const size_t total_size =
+  const uint64_t image_size = width * height;
+  const uint64_t argb_scratch_size = tile_size * width + width;
+  const uint64_t transform_data_size =
+      (uint64_t)VP8LSubSampleSize(width, enc->transform_bits_) *
+      (uint64_t)VP8LSubSampleSize(height, enc->transform_bits_);
+  const uint64_t total_size =
      image_size + argb_scratch_size + transform_data_size;
-  uint32_t* mem = (uint32_t*)malloc(total_size * sizeof(*mem));
+  uint32_t* mem = (uint32_t*)WebPSafeMalloc(total_size, sizeof(*mem));
  if (mem == NULL) {
    err = VP8_ENC_ERROR_OUT_OF_MEMORY;
    goto Error;
@ -792,61 +806,94 @@ static WebPEncodingError AllocateTransformBuffer(VP8LEncoder* const enc,
  return err;
 }

-// Bundles multiple (2, 4 or 8) pixels into a single pixel.
-// Returns the new xsize.
-static void BundleColorMap(const WebPPicture* const pic,
-                           int xbits, uint32_t* bundled_argb, int xs) {
-  int y;
-  const int bit_depth = 1 << (3 - xbits);
-  uint32_t code = 0;
-  const uint32_t* argb = pic->argb;
-  const int width = pic->width;
-  const int height = pic->height;
+static void ApplyPalette(uint32_t* src, uint32_t* dst,
+                         uint32_t src_stride, uint32_t dst_stride,
+                         const uint32_t* palette, int palette_size,
+                         int width, int height, int xbits, uint8_t* row) {
+  int i, x, y;
+  int use_LUT = 1;
+  for (i = 0; i < palette_size; ++i) {
+    if ((palette[i] & 0xffff00ffu) != 0) {
+      use_LUT = 0;
+      break;
+    }
+  }

+  if (use_LUT) {
+    uint8_t inv_palette[MAX_PALETTE_SIZE] = { 0 };
+    for (i = 0; i < palette_size; ++i) {
+      const int color = (palette[i] >> 8) & 0xff;
+      inv_palette[color] = i;
+    }
    for (y = 0; y < height; ++y) {
-    int x;
      for (x = 0; x < width; ++x) {
-      const int mask = (1 << xbits) - 1;
-      const int xsub = x & mask;
-      if (xsub == 0) {
-        code = 0;
+        const int color = (src[x] >> 8) & 0xff;
+        row[x] = inv_palette[color];
      }
-      // TODO(vikasa): simplify the bundling logic.
-      code |= (argb[x] & 0xff00) << (bit_depth * xsub);
-      bundled_argb[y * xs + (x >> xbits)] = 0xff000000 | code;
+      VP8LBundleColorMap(row, width, xbits, dst);
+      src += src_stride;
+      dst += dst_stride;
+    }
+  } else {
+    // Use 1 pixel cache for ARGB pixels.
+    uint32_t last_pix = palette[0];
+    int last_idx = 0;
+    for (y = 0; y < height; ++y) {
+      for (x = 0; x < width; ++x) {
+        const uint32_t pix = src[x];
+        if (pix != last_pix) {
+          for (i = 0; i < palette_size; ++i) {
+            if (pix == palette[i]) {
+              last_idx = i;
+              last_pix = pix;
+              break;
+            }
+          }
+        }
+        row[x] = last_idx;
+      }
+      VP8LBundleColorMap(row, width, xbits, dst);
+      src += src_stride;
+      dst += dst_stride;
    }
-    argb += pic->argb_stride;
  }
 }

 // Note: Expects "enc->palette_" to be set properly.
 // Also, "enc->palette_" will be modified after this call and should not be used
 // later.
-static WebPEncodingError ApplyPalette(VP8LBitWriter* const bw,
+static WebPEncodingError EncodePalette(VP8LBitWriter* const bw,
                                       VP8LEncoder* const enc, int quality) {
  WebPEncodingError err = VP8_ENC_OK;
-  int i, x, y;
+  int i;
  const WebPPicture* const pic = enc->pic_;
-  uint32_t* argb = pic->argb;
+  uint32_t* src = pic->argb;
+  uint32_t* dst;
  const int width = pic->width;
  const int height = pic->height;
  uint32_t* const palette = enc->palette_;
  const int palette_size = enc->palette_size_;
+  uint8_t* row = NULL;
+  int xbits;

  // Replace each input pixel by corresponding palette index.
-  for (y = 0; y < height; ++y) {
-    for (x = 0; x < width; ++x) {
-      const uint32_t pix = argb[x];
-      for (i = 0; i < palette_size; ++i) {
-        if (pix == palette[i]) {
-          argb[x] = 0xff000000u | (i << 8);
-          break;
-        }
-      }
-    }
-    argb += pic->argb_stride;
+  // This is done line by line.
+  if (palette_size <= 4) {
+    xbits = (palette_size <= 2) ? 3 : 2;
+  } else {
+    xbits = (palette_size <= 16) ? 1 : 0;
  }

+  err = AllocateTransformBuffer(enc, VP8LSubSampleSize(width, xbits), height);
+  if (err != VP8_ENC_OK) goto Error;
+  dst = enc->argb_;
+
+  row = (uint8_t*)WebPSafeMalloc((uint64_t)width, sizeof(*row));
+  if (row == NULL) return VP8_ENC_ERROR_OUT_OF_MEMORY;
+
+  ApplyPalette(src, dst, pic->argb_stride, enc->current_width_,
+               palette, palette_size, width, height, xbits, row);
+
  // Save palette to bitstream.
  VP8LWriteBits(bw, 1, TRANSFORM_PRESENT);
  VP8LWriteBits(bw, 2, COLOR_INDEXING_TRANSFORM);
@ -860,34 +907,19 @@ static WebPEncodingError ApplyPalette(VP8LBitWriter* const bw,
    goto Error;
  }

-  if (palette_size <= 16) {
-    // Image can be packed (multiple pixels per uint32_t).
-    int xbits = 1;
-    if (palette_size <= 2) {
-      xbits = 3;
-    } else if (palette_size <= 4) {
-      xbits = 2;
-    }
-    err = AllocateTransformBuffer(enc, VP8LSubSampleSize(width, xbits), height);
-    if (err != VP8_ENC_OK) goto Error;
-    BundleColorMap(pic, xbits, enc->argb_, enc->current_width_);
-  }
-
 Error:
+  free(row);
  return err;
 }

 // -----------------------------------------------------------------------------

-static int GetHistoBits(const WebPConfig* const config,
-                        const WebPPicture* const pic) {
-  const int width = pic->width;
-  const int height = pic->height;
-  const size_t hist_size = sizeof(VP8LHistogram);
+static int GetHistoBits(int method, int use_palette, int width, int height) {
+  const uint64_t hist_size = sizeof(VP8LHistogram);
  // Make tile size a function of encoding method (Range: 0 to 6).
-  int histo_bits = 7 - config->method;
+  int histo_bits = (use_palette ? 9 : 7) - method;
  while (1) {
-    const size_t huff_image_size = VP8LSubSampleSize(width, histo_bits) *
+    const uint64_t huff_image_size = VP8LSubSampleSize(width, histo_bits) *
                                     VP8LSubSampleSize(height, histo_bits) *
                                     hist_size;
    if (huff_image_size <= MAX_HUFF_IMAGE_SIZE) break;
@ -897,13 +929,14 @@ static int GetHistoBits(const WebPConfig* const config,
         (histo_bits > MAX_HUFFMAN_BITS) ? MAX_HUFFMAN_BITS : histo_bits;
 }

-static void InitEncParams(VP8LEncoder* const enc) {
+static void FinishEncParams(VP8LEncoder* const enc) {
  const WebPConfig* const config = enc->config_;
-  const WebPPicture* const picture = enc->pic_;
+  const WebPPicture* const pic = enc->pic_;
  const int method = config->method;
  const float quality = config->quality;
+  const int use_palette = enc->use_palette_;
  enc->transform_bits_ = (method < 4) ? 5 : (method > 4) ? 3 : 4;
-  enc->histo_bits_ = GetHistoBits(config, picture);
+  enc->histo_bits_ = GetHistoBits(method, use_palette, pic->width, pic->height);
  enc->cache_bits_ = (quality <= 25.f) ? 0 : 7;
 }

@ -919,6 +952,9 @@ static VP8LEncoder* VP8LEncoderNew(const WebPConfig* const config,
  }
  enc->config_ = config;
  enc->pic_ = picture;
+
+  VP8LDspInit();
+
  return enc;
 }

@ -938,14 +974,13 @@ WebPEncodingError VP8LEncodeStream(const WebPConfig* const config,
  const int width = picture->width;
  const int height = picture->height;
  VP8LEncoder* const enc = VP8LEncoderNew(config, picture);
+  const size_t byte_position = VP8LBitWriterNumBytes(bw);

  if (enc == NULL) {
    err = VP8_ENC_ERROR_OUT_OF_MEMORY;
    goto Error;
  }

-  InitEncParams(enc);
-
  // ---------------------------------------------------------------------------
  // Analyze image (entropy, num_palettes etc)

@ -954,9 +989,12 @@ WebPEncodingError VP8LEncodeStream(const WebPConfig* const config,
    goto Error;
  }

+  FinishEncParams(enc);
+
  if (enc->use_palette_) {
-    err = ApplyPalette(bw, enc, quality);
+    err = EncodePalette(bw, enc, quality);
    if (err != VP8_ENC_OK) goto Error;
+    // Color cache is disabled for palette.
    enc->cache_bits_ = 0;
  }

@ -1017,6 +1055,20 @@ WebPEncodingError VP8LEncodeStream(const WebPConfig* const config,
    goto Error;
  }

+  if (picture->stats != NULL) {
+    WebPAuxStats* const stats = picture->stats;
+    stats->lossless_features = 0;
+    if (enc->use_predict_) stats->lossless_features |= 1;
+    if (enc->use_cross_color_) stats->lossless_features |= 2;
+    if (enc->use_subtract_green_) stats->lossless_features |= 4;
+    if (enc->use_palette_) stats->lossless_features |= 8;
+    stats->histogram_bits = enc->histo_bits_;
+    stats->transform_bits = enc->transform_bits_;
+    stats->cache_bits = enc->cache_bits_;
+    stats->palette_size = enc->palette_size_;
+    stats->lossless_size = (int)(VP8LBitWriterNumBytes(bw) - byte_position);
+  }
+
 Error:
  VP8LEncoderDelete(enc);
  return err;
@ -1035,19 +1087,34 @@ int VP8LEncodeImage(const WebPConfig* const config,

  if (config == NULL || picture->argb == NULL) {
    err = VP8_ENC_ERROR_NULL_PARAMETER;
-    goto Error;
+    WebPEncodingSetError(picture, err);
+    return 0;
  }

  width = picture->width;
  height = picture->height;
+  if (!VP8LBitWriterInit(&bw, (width * height) >> 1)) {
+    err = VP8_ENC_ERROR_OUT_OF_MEMORY;
+    goto Error;
+  }
+
  if (!WebPReportProgress(picture, 1, &percent)) {
 UserAbort:
    err = VP8_ENC_ERROR_USER_ABORT;
    goto Error;
  }
+  // Reset stats (for pure lossless coding)
+  if (picture->stats != NULL) {
+    WebPAuxStats* const stats = picture->stats;
+    memset(stats, 0, sizeof(*stats));
+    stats->PSNR[0] = 99.f;
+    stats->PSNR[1] = 99.f;
+    stats->PSNR[2] = 99.f;
+    stats->PSNR[3] = 99.f;
+    stats->PSNR[4] = 99.f;
+  }

  // Write image size.
-  VP8LBitWriterInit(&bw, (width * height) >> 1);
  if (!WriteImageSize(picture, &bw)) {
    err = VP8_ENC_ERROR_OUT_OF_MEMORY;
    goto Error;
@ -1075,15 +1142,10 @@ int VP8LEncodeImage(const WebPConfig* const config,

  if (!WebPReportProgress(picture, 100, &percent)) goto UserAbort;

-  // Collect some stats if needed.
+  // Save size.
  if (picture->stats != NULL) {
-    WebPAuxStats* const stats = picture->stats;
-    memset(stats, 0, sizeof(*stats));
-    stats->PSNR[0] = 99.;
-    stats->PSNR[1] = 99.;
-    stats->PSNR[2] = 99.;
-    stats->PSNR[3] = 99.;
-    stats->coded_size = (int)coded_size;
+    picture->stats->coded_size += (int)coded_size;
+    picture->stats->lossless_size = (int)coded_size;
  }

  if (picture->extra_info != NULL) {
@ -1104,6 +1166,3 @@ int VP8LEncodeImage(const WebPConfig* const config,

 //------------------------------------------------------------------------------

-#if defined(__cplusplus) || defined(c_plusplus)
-}    // extern "C"
-#endif
--- a/src/enc/vp8li.h
+++ b/src/enc/vp8li.h
@ -1,8 +1,10 @@
 // Copyright 2012 Google Inc. All Rights Reserved.
 //
-// This code is licensed under the same terms as WebM:
-//  Software License Agreement:  http://www.webmproject.org/license/software/
-//  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
 // -----------------------------------------------------------------------------
 //
 // Lossless encoder: internal header.
@ -17,7 +19,7 @@
 #include "../webp/encode.h"
 #include "../webp/format_constants.h"

-#if defined(__cplusplus) || defined(c_plusplus)
+#ifdef __cplusplus
 extern "C" {
 #endif

@ -38,6 +40,7 @@ typedef struct {

  // Encoding parameters derived from image characteristics.
  int use_cross_color_;
+  int use_subtract_green_;
  int use_predict_;
  int use_palette_;
  int palette_size_;
@ -60,7 +63,7 @@ WebPEncodingError VP8LEncodeStream(const WebPConfig* const config,

 //------------------------------------------------------------------------------

-#if defined(__cplusplus) || defined(c_plusplus)
+#ifdef __cplusplus
 }    // extern "C"
 #endif

--- a/src/enc/webpenc.c
+++ b/src/enc/webpenc.c
@ -1,8 +1,10 @@
 // Copyright 2011 Google Inc. All Rights Reserved.
 //
-// This code is licensed under the same terms as WebM:
-//  Software License Agreement:  http://www.webmproject.org/license/software/
-//  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
 // -----------------------------------------------------------------------------
 //
 // WebP encoder: main entry point
@ -16,13 +18,10 @@

 #include "./vp8enci.h"
 #include "./vp8li.h"
+#include "../utils/utils.h"

 // #define PRINT_MEMORY_INFO

-#if defined(__cplusplus) || defined(c_plusplus)
-extern "C" {
-#endif
-
 #ifdef PRINT_MEMORY_INFO
 #include <stdio.h>
 #endif
@ -92,34 +91,53 @@ static void ResetBoundaryPredictions(VP8Encoder* const enc) {
  enc->nz_[-1] = 0;   // constant
 }

-// Map configured quality level to coding tools used.
-//-------------+---+---+---+---+---+---+
-//   Quality   | 0 | 1 | 2 | 3 | 4 | 5 +
-//-------------+---+---+---+---+---+---+
-// dynamic prob| ~ | x | x | x | x | x |
-//-------------+---+---+---+---+---+---+
-// rd-opt modes|   |   | x | x | x | x |
-//-------------+---+---+---+---+---+---+
-// fast i4/i16 | x | x |   |   |   |   |
-//-------------+---+---+---+---+---+---+
-// rd-opt i4/16|   |   | x | x | x | x |
-//-------------+---+---+---+---+---+---+
-// Trellis     |   | x |   |   | x | x |
-//-------------+---+---+---+---+---+---+
-// full-SNS    |   |   |   |   |   | x |
-//-------------+---+---+---+---+---+---+
+// Mapping from config->method_ to coding tools used.
+//-------------------+---+---+---+---+---+---+---+
+//   Method          | 0 | 1 | 2 | 3 |(4)| 5 | 6 |
+//-------------------+---+---+---+---+---+---+---+
+// fast probe        | x |   |   | x |   |   |   |
+//-------------------+---+---+---+---+---+---+---+
+// dynamic proba     | ~ | x | x | x | x | x | x |
+//-------------------+---+---+---+---+---+---+---+
+// fast mode analysis|   |   |   |   | x | x | x |
+//-------------------+---+---+---+---+---+---+---+
+// basic rd-opt      |   |   |   | x | x | x | x |
+//-------------------+---+---+---+---+---+---+---+
+// disto-score i4/16 |   |   | x |   |   |   |   |
+//-------------------+---+---+---+---+---+---+---+
+// rd-opt i4/16      |   |   | ~ | x | x | x | x |
+//-------------------+---+---+---+---+---+---+---+
+// token buffer (opt)|   |   |   | x | x | x | x |
+//-------------------+---+---+---+---+---+---+---+
+// Trellis           |   |   |   |   |   | x |Ful|
+//-------------------+---+---+---+---+---+---+---+
+// full-SNS          |   |   |   |   | x | x | x |
+//-------------------+---+---+---+---+---+---+---+

 static void MapConfigToTools(VP8Encoder* const enc) {
-  const int method = enc->config_->method;
-  const int limit = 100 - enc->config_->partition_limit;
+  const WebPConfig* const config = enc->config_;
+  const int method = config->method;
+  const int limit = 100 - config->partition_limit;
  enc->method_ = method;
-  enc->rd_opt_level_ = (method >= 6) ? 3
-                     : (method >= 5) ? 2
-                     : (method >= 3) ? 1
-                     : 0;
+  enc->rd_opt_level_ = (method >= 6) ? RD_OPT_TRELLIS_ALL
+                     : (method >= 5) ? RD_OPT_TRELLIS
+                     : (method >= 3) ? RD_OPT_BASIC
+                     : RD_OPT_NONE;
  enc->max_i4_header_bits_ =
      256 * 16 * 16 *                 // upper bound: up to 16bit per 4x4 block
      (limit * limit) / (100 * 100);  // ... modulated with a quadratic curve.
+
+  enc->thread_level_ = config->thread_level;
+
+  enc->do_search_ = (config->target_size > 0 || config->target_PSNR > 0);
+  if (!config->low_memory) {
+#if !defined(DISABLE_TOKEN_BUFFER)
+    enc->use_tokens_ = (enc->rd_opt_level_ >= RD_OPT_BASIC);  // need rd stats
+#endif
+    if (enc->use_tokens_) {
+      enc->num_parts_ = 1;   // doesn't work with multi-partition
+    }
+  }
 }

 // Memory scaling with dimensions:
@ -135,7 +153,7 @@ static void MapConfigToTools(VP8Encoder* const enc) {
 //             non-zero: 196
 //             lf-stats: 2048
 //                total: 68635
-// Transcient object sizes:
+// Transient object sizes:
 //       VP8EncIterator: 352
 //         VP8ModeScore: 912
 //       VP8SegmentInfo: 532
@ -153,19 +171,16 @@ static VP8Encoder* InitVP8Encoder(const WebPConfig* const config,
  const int preds_h = 4 * mb_h + 1;
  const size_t preds_size = preds_w * preds_h * sizeof(uint8_t);
  const int top_stride = mb_w * 16;
-  const size_t nz_size = (mb_w + 1) * sizeof(uint32_t);
-  const size_t cache_size = (3 * YUV_SIZE + PRED_SIZE) * sizeof(uint8_t);
+  const size_t nz_size = (mb_w + 1) * sizeof(uint32_t) + ALIGN_CST;
  const size_t info_size = mb_w * mb_h * sizeof(VP8MBInfo);
-  const size_t samples_size = (2 * top_stride +         // top-luma/u/v
-                               16 + 16 + 16 + 8 + 1 +   // left y/u/v
-                               2 * ALIGN_CST)           // align all
-                               * sizeof(uint8_t);
+  const size_t samples_size = 2 * top_stride * sizeof(uint8_t)  // top-luma/u/v
+                            + ALIGN_CST;                        // align all
  const size_t lf_stats_size =
      config->autofilter ? sizeof(LFStats) + ALIGN_CST : 0;
  VP8Encoder* enc;
  uint8_t* mem;
-  size_t size = sizeof(VP8Encoder) + ALIGN_CST  // main struct
-              + cache_size                      // working caches
+  const uint64_t size = (uint64_t)sizeof(VP8Encoder)   // main struct
+                      + ALIGN_CST                      // cache alignment
                      + info_size                      // modes info
                      + preds_size                     // prediction modes
                      + samples_size                   // top/left samples
@ -176,16 +191,15 @@ static VP8Encoder* InitVP8Encoder(const WebPConfig* const config,
  printf("===================================\n");
  printf("Memory used:\n"
         "             encoder: %ld\n"
-         "         block cache: %ld\n"
         "                info: %ld\n"
         "               preds: %ld\n"
         "         top samples: %ld\n"
         "            non-zero: %ld\n"
         "            lf-stats: %ld\n"
         "               total: %ld\n",
-         sizeof(VP8Encoder) + ALIGN_CST, cache_size, info_size,
+         sizeof(VP8Encoder) + ALIGN_CST, info_size,
         preds_size, samples_size, nz_size, lf_stats_size, size);
-  printf("Transcient object sizes:\n"
+  printf("Transient object sizes:\n"
         "      VP8EncIterator: %ld\n"
         "        VP8ModeScore: %ld\n"
         "      VP8SegmentInfo: %ld\n"
@ -198,7 +212,7 @@ static VP8Encoder* InitVP8Encoder(const WebPConfig* const config,
         mb_w * mb_h * 384 * sizeof(uint8_t));
  printf("===================================\n");
 #endif
-  mem = (uint8_t*)malloc(size);
+  mem = (uint8_t*)WebPSafeMalloc(size, sizeof(*mem));
  if (mem == NULL) {
    WebPEncodingSetError(picture, VP8_ENC_ERROR_OUT_OF_MEMORY);
    return NULL;
@ -210,19 +224,11 @@ static VP8Encoder* InitVP8Encoder(const WebPConfig* const config,
  enc->mb_w_ = mb_w;
  enc->mb_h_ = mb_h;
  enc->preds_w_ = preds_w;
-  enc->yuv_in_ = (uint8_t*)mem;
-  mem += YUV_SIZE;
-  enc->yuv_out_ = (uint8_t*)mem;
-  mem += YUV_SIZE;
-  enc->yuv_out2_ = (uint8_t*)mem;
-  mem += YUV_SIZE;
-  enc->yuv_p_ = (uint8_t*)mem;
-  mem += PRED_SIZE;
  enc->mb_info_ = (VP8MBInfo*)mem;
  mem += info_size;
  enc->preds_ = ((uint8_t*)mem) + 1 + enc->preds_w_;
  mem += preds_w * preds_h * sizeof(uint8_t);
-  enc->nz_ = 1 + (uint32_t*)mem;
+  enc->nz_ = 1 + (uint32_t*)DO_ALIGN(mem);
  mem += nz_size;
  enc->lf_stats_ = lf_stats_size ? (LFStats*)DO_ALIGN(mem) : NULL;
  mem += lf_stats_size;
@ -232,13 +238,7 @@ static VP8Encoder* InitVP8Encoder(const WebPConfig* const config,
  enc->y_top_ = (uint8_t*)mem;
  enc->uv_top_ = enc->y_top_ + top_stride;
  mem += 2 * top_stride;
-  mem = (uint8_t*)DO_ALIGN(mem + 1);
-  enc->y_left_ = (uint8_t*)mem;
-  mem += 16 + 16;
-  enc->u_left_ = (uint8_t*)mem;
-  mem += 16;
-  enc->v_left_ = (uint8_t*)mem;
-  mem += 8;
+  assert(mem <= (uint8_t*)enc + size);

  enc->config_ = config;
  enc->profile_ = use_filter ? ((config->filter_type == 1) ? 0 : 1) : 2;
@ -257,23 +257,27 @@ static VP8Encoder* InitVP8Encoder(const WebPConfig* const config,
  VP8EncInitLayer(enc);
 #endif

+  VP8TBufferInit(&enc->tokens_);
  return enc;
 }

-static void DeleteVP8Encoder(VP8Encoder* enc) {
+static int DeleteVP8Encoder(VP8Encoder* enc) {
+  int ok = 1;
  if (enc != NULL) {
-    VP8EncDeleteAlpha(enc);
+    ok = VP8EncDeleteAlpha(enc);
 #ifdef WEBP_EXPERIMENTAL_FEATURES
    VP8EncDeleteLayer(enc);
 #endif
+    VP8TBufferClear(&enc->tokens_);
    free(enc);
  }
+  return ok;
 }

 //------------------------------------------------------------------------------

 static double GetPSNR(uint64_t err, uint64_t size) {
-  return err ? 10. * log10(255. * 255. * size / err) : 99.;
+  return (err > 0 && size > 0) ? 10. * log10(255. * 255. * size / err) : 99.;
 }

 static void FinalizePSNR(const VP8Encoder* const enc) {
@ -284,6 +288,7 @@ static void FinalizePSNR(const VP8Encoder* const enc) {
  stats->PSNR[1] = (float)GetPSNR(sse[1], size / 4);
  stats->PSNR[2] = (float)GetPSNR(sse[2], size / 4);
  stats->PSNR[3] = (float)GetPSNR(sse[0] + sse[1] + sse[2], size * 3 / 2);
+  stats->PSNR[4] = (float)GetPSNR(sse[3], size);
 }

 static void StoreStats(VP8Encoder* const enc) {
@ -329,7 +334,7 @@ int WebPReportProgress(const WebPPicture* const pic,
 //------------------------------------------------------------------------------

 int WebPEncode(const WebPConfig* config, WebPPicture* pic) {
-  int ok;
+  int ok = 0;

  if (pic == NULL)
    return 0;
@ -343,35 +348,53 @@ int WebPEncode(const WebPConfig* config, WebPPicture* pic) {
  if (pic->width > WEBP_MAX_DIMENSION || pic->height > WEBP_MAX_DIMENSION)
    return WebPEncodingSetError(pic, VP8_ENC_ERROR_BAD_DIMENSION);

+  if (pic->stats != NULL) memset(pic->stats, 0, sizeof(*pic->stats));
+
  if (!config->lossless) {
    VP8Encoder* enc = NULL;
    if (pic->y == NULL || pic->u == NULL || pic->v == NULL) {
-      if (pic->argb != NULL) {
-        if (!WebPPictureARGBToYUVA(pic, WEBP_YUV420)) return 0;
-      } else {
-        return WebPEncodingSetError(pic, VP8_ENC_ERROR_NULL_PARAMETER);
+      // Make sure we have YUVA samples.
+      float dithering = 0.f;
+      if (config->preprocessing & 2) {
+        const float x = config->quality / 100.f;
+        const float x2 = x * x;
+        // slowly decreasing from max dithering at low quality (q->0)
+        // to 0.5 dithering amplitude at high quality (q->100)
+        dithering = 1.0f + (0.5f - 1.0f) * x2 * x2;
+      }
+      if (!WebPPictureARGBToYUVADithered(pic, WEBP_YUV420, dithering)) {
+        return 0;
      }
    }

    enc = InitVP8Encoder(config, pic);
    if (enc == NULL) return 0;  // pic->error is already set.
    // Note: each of the tasks below account for 20% in the progress report.
-    ok = VP8EncAnalyze(enc)
-      && VP8StatLoop(enc)
-      && VP8EncLoop(enc)
-      && VP8EncFinishAlpha(enc)
+    ok = VP8EncAnalyze(enc);
+
+    // Analysis is done, proceed to actual coding.
+    ok = ok && VP8EncStartAlpha(enc);   // possibly done in parallel
+    if (!enc->use_tokens_) {
+      ok = ok && VP8EncLoop(enc);
+    } else {
+      ok = ok && VP8EncTokenLoop(enc);
+    }
+    ok = ok && VP8EncFinishAlpha(enc);
 #ifdef WEBP_EXPERIMENTAL_FEATURES
-      && VP8EncFinishLayer(enc)
+    ok = ok && VP8EncFinishLayer(enc);
 #endif
-      && VP8EncWrite(enc);
+
+    ok = ok && VP8EncWrite(enc);
    StoreStats(enc);
    if (!ok) {
      VP8EncFreeBitWriters(enc);
    }
-    DeleteVP8Encoder(enc);
+    ok &= DeleteVP8Encoder(enc);  // must always be called, even if !ok
  } else {
-    if (pic->argb == NULL)
-      return WebPEncodingSetError(pic, VP8_ENC_ERROR_NULL_PARAMETER);
+    // Make sure we have ARGB samples.
+    if (pic->argb == NULL && !WebPPictureYUVAToARGB(pic)) {
+      return 0;
+    }

    ok = VP8LEncodeImage(config, pic);  // Sets pic->error in case of problem.
  }
@ -379,6 +402,3 @@ int WebPEncode(const WebPConfig* config, WebPPicture* pic) {
  return ok;
 }

-#if defined(__cplusplus) || defined(c_plusplus)
-}    // extern "C"
-#endif
--- a/Show More
+++ b/Show More