update ChangeLog

Change-Id: Idea3464bbcb28896179c99488e7b96a4341b508a
Regression fix for alpha channels using color cache:
2025-07-15 21:39:59 +02:00 · 2013-06-17 16:53:22 -07:00 · 2013-06-17 16:29:09 -07:00 · 2013-06-17 15:43:33 -07:00 · 2013-06-17 15:43:33 -07:00 · 2013-06-12 23:32:59 -07:00
150 changed files with 35423 additions and 6014 deletions
--- a/.gitattributes
+++ b/.gitattributes
@ -1,3 +1,4 @@
 .gitattributes export-ignore
 .gitignore export-ignore
 .mailmap export-ignore
+*.pdf -text -diff
--- a/.gitignore
+++ b/.gitignore
@ -1,8 +1,9 @@
 *.l[ao]
-*.o
+*.[ao]
 .deps
 .libs
 /aclocal.m4
+/ar-lib
 /autom4te.cache
 /compile
 /config.*
@ -16,7 +17,12 @@
 /stamp-h1
 Makefile
 Makefile.in
-examples/[cd]webp
+examples/[cdv]webp
+examples/gif2webp
+examples/webpmux
 /output
+/doc/output
 *.idb
 *.pdb
+/iosbuild
+/WebP.framework
--- a/.mailmap
+++ b/.mailmap
@ -1,2 +1,6 @@
+<johann.koenig@duck.com> <johannkoenig@google.com>
 Mikołaj Zalewski <mikolajz@google.com>
 Pascal Massimino <pascal.massimino@gmail.com>
+Vikas Arora <vikasa@google.com>
+<vikasa@google.com> <vikasa@gmail.com>
+<vikasa@google.com> <vikaas.arora@gmail.com>
--- a/8
+++ b/8
@ -1,9 +1,17 @@
 Contributors:
+- Christian Duvivier (cduvivier at google dot com)
 - James Zern (jzern at google dot com)
 - Jan Engelhardt (jengelh at medozas dot de)
+- Johann (johann dot koenig at duck dot com)
+- Jyrki Alakuijala (jyrki at google dot com)
+- Lou Quillio (louquillio at google dot com)
+- Mans Rullgard (mans at mansr dot com)
+- Martin Olsson (mnemo at minimum dot se)
 - Mikołaj Zalewski (mikolajz at google dot com)
+- Noel Chromium (noel at chromium dot org)
 - Pascal Massimino (pascal dot massimino at gmail dot com)
 - Pierre Joye (pierre dot php at gmail dot com)
+- Scott LaVarnway (slavarnway at google dot com)
 - Somnath Banerjee (somnath dot banerjee at gmail dot com)
 - Urvang Joshi (urvang at google dot com)
 - Vikas Arora (vikasa at google dot com)
--- a/Android.mk
+++ b/Android.mk
@ -2,46 +2,74 @@ LOCAL_PATH:= $(call my-dir)

 include $(CLEAR_VARS)
 LOCAL_SRC_FILES := \
-	src/dec/alpha.c \
-	src/dec/dsp.c \
-	src/dec/frame.c \
-	src/dec/idec.c \
-	src/dec/layer.c \
-	src/dec/quant.c \
-	src/dec/tree.c \
-	src/dec/vp8.c \
-	src/dec/webp.c \
-	src/dec/io.c \
-	src/dec/buffer.c \
-	src/dsp/yuv.c \
-	src/dsp/upsampling.c \
-	src/dsp/cpu.c \
-	src/dsp/dec.c \
-	src/dsp/dec_neon.c \
-	src/dsp/enc.c \
-	src/enc/alpha.c \
-	src/enc/analysis.c \
-	src/enc/config.c \
-	src/enc/dsp.c \
-	src/enc/filter.c \
-	src/enc/frame.c \
-	src/enc/iterator.c \
-	src/enc/layer.c \
-	src/enc/picture.c \
-	src/enc/quant.c \
-	src/enc/syntax.c \
-	src/enc/tree.c \
-	src/enc/webpenc.c
-	src/utils/bit_reader.c \
-	src/utils/bit_writer.c \
-	src/utils/thread.c \
+    src/dec/alpha.c \
+    src/dec/buffer.c \
+    src/dec/frame.c \
+    src/dec/idec.c \
+    src/dec/io.c \
+    src/dec/layer.c \
+    src/dec/quant.c \
+    src/dec/tree.c \
+    src/dec/vp8.c \
+    src/dec/vp8l.c \
+    src/dec/webp.c \
+    src/dsp/cpu.c \
+    src/dsp/dec.c \
+    src/dsp/dec_sse2.c \
+    src/dsp/enc.c \
+    src/dsp/enc_sse2.c \
+    src/dsp/lossless.c \
+    src/dsp/upsampling.c \
+    src/dsp/upsampling_sse2.c \
+    src/dsp/yuv.c \
+    src/enc/alpha.c \
+    src/enc/analysis.c \
+    src/enc/backward_references.c \
+    src/enc/config.c \
+    src/enc/cost.c \
+    src/enc/filter.c \
+    src/enc/frame.c \
+    src/enc/histogram.c \
+    src/enc/iterator.c \
+    src/enc/layer.c \
+    src/enc/picture.c \
+    src/enc/quant.c \
+    src/enc/syntax.c \
+    src/enc/token.c \
+    src/enc/tree.c \
+    src/enc/vp8l.c \
+    src/enc/webpenc.c \
+    src/utils/bit_reader.c \
+    src/utils/bit_writer.c \
+    src/utils/color_cache.c \
+    src/utils/filters.c \
+    src/utils/huffman.c \
+    src/utils/huffman_encode.c \
+    src/utils/quant_levels.c \
+    src/utils/quant_levels_dec.c \
+    src/utils/rescaler.c \
+    src/utils/thread.c \
+    src/utils/utils.c \

-LOCAL_CFLAGS := -Wall -DANDROID -DHAVE_MALLOC_H -DHAVE_PTHREAD -DWEBP_USE_THREAD \
+LOCAL_CFLAGS := -Wall -DANDROID -DHAVE_MALLOC_H -DHAVE_PTHREAD \
+                -DWEBP_USE_THREAD \
                -finline-functions -frename-registers -ffast-math \
                -s -fomit-frame-pointer -Isrc/webp

 LOCAL_C_INCLUDES += $(LOCAL_PATH)/src

+ifeq ($(TARGET_ARCH_ABI),armeabi-v7a)
+  # Setting LOCAL_ARM_NEON will enable -mfpu=neon which may cause illegal
+  # instructions to be generated for armv7a code. Instead target the neon code
+  # specifically.
+  LOCAL_SRC_FILES += src/dsp/dec_neon.c.neon
+  LOCAL_SRC_FILES += src/dsp/upsampling_neon.c.neon
+  LOCAL_SRC_FILES += src/dsp/enc_neon.c.neon
+endif
+LOCAL_STATIC_LIBRARIES := cpufeatures
+
 LOCAL_MODULE:= webp

 include $(BUILD_STATIC_LIBRARY)
+
+$(call import-module,android/cpufeatures)
--- a/1135
+++ b/1135
--- a/Makefile.vc
+++ b/Makefile.vc
@ -1,18 +1,10 @@
 #
 # Stem for static libs and DLLs
 #
-LIB_NAME       = libwebp_a
-LIB_NAME_DEBUG = libwebp_a_debug
-
-#
-# Stem for DLL import libs
-#
-IMPLIB_NAME       = libwebp_dll
-IMPLIB_NAME_DEBUG = libwebp_dll_debug
-
-!IFNDEF DEP_PATH
-DEPS_PATH   = ../../deps
-!ENDIF
+LIBWEBPDECODER_BASENAME = libwebpdecoder
+LIBWEBP_BASENAME = libwebp
+LIBWEBPMUX_BASENAME = libwebpmux
+LIBWEBPDEMUX_BASENAME = libwebpdemux

 !IFNDEF ARCH
 !IF ! [ cl 2>&1 | find "x86" > NUL ]
@ -32,17 +24,18 @@ PLATFORM_LDFLAGS = /SAFESEH
 #############################################################
 ## Nothing more to do below this line!

-MT         = mt.exe
-CCNODBG    = cl.exe /nologo /O2 /DNDEBUG
-CCDEBUG    = cl.exe /nologo /Od /Gm /Zi /D_DEBUG /RTC1
-CFLAGS     = /Isrc /nologo /W3 /EHsc /FD /c /GS
+NOLOGO     = /nologo
+CCNODBG    = cl.exe $(NOLOGO) /O2 /DNDEBUG
+CCDEBUG    = cl.exe $(NOLOGO) /Od /Gm /Zi /D_DEBUG /RTC1
+CFLAGS     = /Isrc $(NOLOGO) /W3 /EHsc /c /GS
 CFLAGS     = $(CFLAGS) /DWIN32 /D_CRT_SECURE_NO_WARNINGS /DWIN32_LEAN_AND_MEAN
 CFLAGS     = $(CFLAGS) /DHAVE_WINCODEC_H /DWEBP_USE_THREAD
-LDFLAGS    = /LARGEADDRESSAWARE /MANIFEST /NXCOMPAT /DYNAMICBASE $(PLATFORM_LDFLAGS)
-LNKDLL     = link.exe /DLL
-LNKLIB     = link.exe /lib
-LNKEXE     = link.exe
-LFLAGS     = /nologo /machine:$(ARCH)
+LDFLAGS    = /LARGEADDRESSAWARE /MANIFEST /NXCOMPAT /DYNAMICBASE
+LDFLAGS    = $(LDFLAGS) $(PLATFORM_LDFLAGS)
+LNKDLL     = link.exe /DLL $(NOLOGO)
+LNKEXE     = link.exe $(NOLOGO)
+LNKLIB     = lib.exe $(NOLOGO)
+MT         = mt.exe $(NOLOGO)

 CFGSET     = FALSE
 !IF "$(OBJDIR)" == ""
@ -65,36 +58,58 @@ DIROBJ = $(DIRBASE)\obj
 DIRLIB = $(DIRBASE)\lib
 DIRINC = $(DIRBASE)\include
 DIRBIN = $(DIRBASE)\bin
+LIBWEBP_PDBNAME = $(DIROBJ)\$(LIBWEBP_BASENAME).pdb
+OUTPUT_DIRS = $(DIRBIN) $(DIRINC) $(DIRLIB) \
+              $(DIROBJ)\dec \
+              $(DIROBJ)\demux \
+              $(DIROBJ)\dsp \
+              $(DIROBJ)\enc \
+              $(DIROBJ)\examples \
+              $(DIROBJ)\mux \
+              $(DIROBJ)\utils \

 # Target configuration
 !IF "$(CFG)" == "release-static"
-TARGET         = $(LIB_NAME).lib
 CC             = $(CCNODBG)
 STATICLIBBUILD = TRUE
 !ELSE IF "$(CFG)" == "debug-static"
-TARGET         = $(LIB_NAME_DEBUG).lib
 CC             = $(CCDEBUG)
+RTLIB          = $(RTLIBD)
 STATICLIBBUILD = TRUE
+LIBWEBPDECODER_BASENAME = $(LIBWEBPDECODER_BASENAME)_debug
+LIBWEBP_BASENAME = $(LIBWEBP_BASENAME)_debug
+LIBWEBPMUX_BASENAME = $(LIBWEBPMUX_BASENAME)_debug
+LIBWEBPDEMUX_BASENAME = $(LIBWEBPDEMUX_BASENAME)_debug
 !ELSE IF "$(CFG)" == "release-dynamic"
-TARGETDLL = $(LIB_NAME).dll
-TARGET    = $(IMPLIB_NAME).lib
 CC        = $(CCNODBG)
 DLLBUILD  = TRUE
 !ELSE IF "$(CFG)" == "debug-dynamic"
-TARGETDLL = $(LIB_NAME_DEBUG).dll
-TARGET    = $(IMPLIB_NAME_DEBUG).lib
 CC        = $(CCDEBUG)
+RTLIB     = $(RTLIBD)
 DLLBUILD  = TRUE
+LIBWEBPDECODER_BASENAME = $(LIBWEBPDECODER_BASENAME)_debug
+LIBWEBP_BASENAME = $(LIBWEBP_BASENAME)_debug
+LIBWEBPMUX_BASENAME = $(LIBWEBPMUX_BASENAME)_debug
+LIBWEBPDEMUX_BASENAME = $(LIBWEBPDEMUX_BASENAME)_debug
 !ENDIF

 !IF "$(STATICLIBBUILD)" == "TRUE"
 CC     = $(CC) $(RTLIB)
-LNK    = $(LNKLIB) /out:$(DIRLIB)\$(TARGET)
 CFGSET = TRUE
+LIBWEBPDECODER = $(DIRLIB)\$(LIBWEBPDECODER_BASENAME).lib
+LIBWEBP = $(DIRLIB)\$(LIBWEBP_BASENAME).lib
+LIBWEBPMUX = $(DIRLIB)\$(LIBWEBPMUX_BASENAME).lib
+LIBWEBPDEMUX = $(DIRLIB)\$(LIBWEBPDEMUX_BASENAME).lib
 !ELSE IF "$(DLLBUILD)" == "TRUE"
+DLLC   = webp_dll.c
 DLLINC = webp_dll.h
+DLL_OBJS = $(DIROBJ)\$(DLLC:.c=.obj)
 CC     = $(CC) /I$(DIROBJ) /FI$(DLLINC) $(RTLIB) /DWEBP_DLL
-LNK    = $(LNKDLL) /out:$(DIRBIN)\$(TARGETDLL) /implib:$(DIRLIB)\$(TARGET)
+LIBWEBPDECODER = $(DIRLIB)\$(LIBWEBPDECODER_BASENAME)_dll.lib
+LIBWEBP = $(DIRLIB)\$(LIBWEBP_BASENAME)_dll.lib
+LIBWEBPMUX = $(DIRLIB)\$(LIBWEBPMUX_BASENAME)_dll.lib
+LIBWEBPDEMUX = $(DIRLIB)\$(LIBWEBPDEMUX_BASENAME)_dll.lib
+LIBWEBP_PDBNAME = $(DIROBJ)\$(LIBWEBP_BASENAME)_dll.pdb
 CFGSET = TRUE
 !ENDIF

@ -102,21 +117,25 @@ CFGSET = TRUE
 # Usage
 #
 !IF "$(CFGSET)" == "FALSE"
-!MESSAGE Usage: nmake /f Makefile.vc [CFG=<config>] [OBJDIR=<path>] [RTLIBCFG=<rtlib>] [<target>]
+!MESSAGE Usage: nmake /f Makefile.vc [CFG=<config>]
+!MESSAGE .          [OBJDIR=<path>] [RTLIBCFG=<rtlib>] [<target>]
+!MESSAGE
 !MESSAGE where <config> is one of:
 !MESSAGE -  release-static                - release static library
 !MESSAGE -  debug-static                  - debug static library
 !MESSAGE -  release-dynamic               - release dynamic link library (DLL)
 !MESSAGE -  debug-dynamic                 - debug dynamic link library (DLL)
+!MESSAGE
 !MESSAGE <target> may be:
 !MESSAGE -  clean                         - perform a clean for CFG
 !MESSAGE -  experimental                  - build CFG with experimental
-!MESSAGE .                                  features enabled. Requires zlib.
+!MESSAGE .                                  features enabled.
+!MESSAGE - (empty)                        - build libwebp-based targets for CFG
+!MESSAGE - all                            - build (de)mux-based targets for CFG
 !MESSAGE
-!MESSAGE <rtlibcfg> controls the runtime library linkage - can be 'static' or 'dynamic'.
-!MESSAGE <target> can be left blank in which case all is assumed
-!MESSAGE <path> is the path where you like to build (obj, bins, etc.)
-!MESSAGE   default to ..\obj\
+!MESSAGE RTLIBCFG controls the runtime library linkage - 'static' or 'dynamic'.
+!MESSAGE OBJDIR is the path where you like to build (obj, bins, etc.),
+!MESSAGE   defaults to ..\obj

 !IF "$(CFG)" != ""
 !MESSAGE
@ -131,94 +150,144 @@ CFGSET = TRUE
 # A config was provided, so the library can be built.
 #

-X_OBJS= \
-	$(DIROBJ)\dec\frame.obj \
-	$(DIROBJ)\dec\quant.obj \
-	$(DIROBJ)\dec\tree.obj \
-	$(DIROBJ)\dec\vp8.obj \
-	$(DIROBJ)\dec\webp.obj \
-	$(DIROBJ)\dec\io.obj \
-	$(DIROBJ)\dec\buffer.obj \
-	$(DIROBJ)\dec\idec.obj \
-	$(DIROBJ)\dec\alpha.obj \
-	$(DIROBJ)\dec\layer.obj \
-	$(DIROBJ)\enc\analysis.obj \
-	$(DIROBJ)\enc\config.obj \
-	$(DIROBJ)\enc\cost.obj \
-	$(DIROBJ)\enc\frame.obj \
-	$(DIROBJ)\enc\filter.obj \
-	$(DIROBJ)\enc\iterator.obj \
-	$(DIROBJ)\enc\picture.obj \
-	$(DIROBJ)\enc\quant.obj \
-	$(DIROBJ)\enc\syntax.obj \
-	$(DIROBJ)\enc\tree.obj \
-	$(DIROBJ)\enc\webpenc.obj \
-	$(DIROBJ)\enc\alpha.obj \
-	$(DIROBJ)\enc\layer.obj \
-	$(DIROBJ)\dsp\enc.obj \
-	$(DIROBJ)\dsp\enc_sse2.obj \
-	$(DIROBJ)\dsp\upsampling.obj \
-	$(DIROBJ)\dsp\upsampling_sse2.obj \
-	$(DIROBJ)\dsp\dec.obj \
-	$(DIROBJ)\dsp\dec_sse2.obj \
-	$(DIROBJ)\dsp\cpu.obj \
-	$(DIROBJ)\dsp\yuv.obj \
-	$(DIROBJ)\utils\bit_reader.obj \
-	$(DIROBJ)\utils\bit_writer.obj \
-	$(DIROBJ)\utils\thread.obj \
-	$(RESOURCE)
+DEC_OBJS = \
+    $(DIROBJ)\dec\alpha.obj \
+    $(DIROBJ)\dec\buffer.obj \
+    $(DIROBJ)\dec\frame.obj \
+    $(DIROBJ)\dec\idec.obj \
+    $(DIROBJ)\dec\io.obj \
+    $(DIROBJ)\dec\layer.obj \
+    $(DIROBJ)\dec\quant.obj \
+    $(DIROBJ)\dec\tree.obj \
+    $(DIROBJ)\dec\vp8.obj \
+    $(DIROBJ)\dec\vp8l.obj \
+    $(DIROBJ)\dec\webp.obj \

-EXAMPLES_OBJS = \
-	$(DIROBJ)\examples\cwebp.obj \
-	$(DIROBJ)\examples\dwebp.obj
+DEMUX_OBJS = \
+    $(DIROBJ)\demux\demux.obj \

-all: $(DIRLIB)\$(TARGET) $(DIRBIN)\dwebp.exe $(DIRBIN)\cwebp.exe
+DSP_DEC_OBJS = \
+    $(DIROBJ)\dsp\cpu.obj \
+    $(DIROBJ)\dsp\dec.obj \
+    $(DIROBJ)\dsp\dec_neon.obj \
+    $(DIROBJ)\dsp\dec_sse2.obj \
+    $(DIROBJ)\dsp\lossless.obj \
+    $(DIROBJ)\dsp\upsampling.obj \
+    $(DIROBJ)\dsp\upsampling_neon.obj \
+    $(DIROBJ)\dsp\upsampling_sse2.obj \
+    $(DIROBJ)\dsp\yuv.obj \
+
+DSP_ENC_OBJS = \
+    $(DIROBJ)\dsp\enc.obj \
+    $(DIROBJ)\dsp\enc_neon.obj \
+    $(DIROBJ)\dsp\enc_sse2.obj \
+
+EX_FORMAT_DEC_OBJS = \
+    $(DIROBJ)\examples\jpegdec.obj \
+    $(DIROBJ)\examples\metadata.obj \
+    $(DIROBJ)\examples\pngdec.obj \
+    $(DIROBJ)\examples\tiffdec.obj \
+    $(DIROBJ)\examples\wicdec.obj \
+
+EX_UTIL_OBJS = \
+    $(DIROBJ)\examples\example_util.obj \
+
+ENC_OBJS = \
+    $(DIROBJ)\enc\alpha.obj \
+    $(DIROBJ)\enc\analysis.obj \
+    $(DIROBJ)\enc\backward_references.obj \
+    $(DIROBJ)\enc\config.obj \
+    $(DIROBJ)\enc\cost.obj \
+    $(DIROBJ)\enc\filter.obj \
+    $(DIROBJ)\enc\frame.obj \
+    $(DIROBJ)\enc\histogram.obj \
+    $(DIROBJ)\enc\iterator.obj \
+    $(DIROBJ)\enc\layer.obj \
+    $(DIROBJ)\enc\picture.obj \
+    $(DIROBJ)\enc\quant.obj \
+    $(DIROBJ)\enc\syntax.obj \
+    $(DIROBJ)\enc\token.obj \
+    $(DIROBJ)\enc\tree.obj \
+    $(DIROBJ)\enc\vp8l.obj \
+    $(DIROBJ)\enc\webpenc.obj \
+
+MUX_OBJS = \
+    $(DIROBJ)\mux\muxedit.obj \
+    $(DIROBJ)\mux\muxinternal.obj \
+    $(DIROBJ)\mux\muxread.obj \
+
+UTILS_DEC_OBJS = \
+    $(DIROBJ)\utils\bit_reader.obj \
+    $(DIROBJ)\utils\color_cache.obj \
+    $(DIROBJ)\utils\filters.obj \
+    $(DIROBJ)\utils\huffman.obj \
+    $(DIROBJ)\utils\quant_levels_dec.obj \
+    $(DIROBJ)\utils\rescaler.obj \
+    $(DIROBJ)\utils\thread.obj \
+    $(DIROBJ)\utils\utils.obj \
+
+UTILS_ENC_OBJS = \
+    $(DIROBJ)\utils\bit_writer.obj \
+    $(DIROBJ)\utils\huffman_encode.obj \
+    $(DIROBJ)\utils\quant_levels.obj \
+
+LIBWEBPDECODER_OBJS = $(DEC_OBJS) $(DSP_DEC_OBJS) $(UTILS_DEC_OBJS)
+LIBWEBP_OBJS = $(LIBWEBPDECODER_OBJS) $(ENC_OBJS) $(DSP_ENC_OBJS) \
+               $(UTILS_ENC_OBJS) $(DLL_OBJS)
+LIBWEBPMUX_OBJS = $(MUX_OBJS) $(LIBWEBPMUX_OBJS)
+LIBWEBPDEMUX_OBJS = $(DEMUX_OBJS) $(LIBWEBPDEMUX_OBJS)
+
+OUT_LIBS = $(LIBWEBPDECODER) $(LIBWEBP)
+OUT_EXAMPLES = $(DIRBIN)\cwebp.exe $(DIRBIN)\dwebp.exe
+EXTRA_EXAMPLES = $(DIRBIN)\vwebp.exe $(DIRBIN)\webpmux.exe
+
+ex: $(OUT_LIBS) $(OUT_EXAMPLES)
+all: ex $(EXTRA_EXAMPLES)
+$(DIRBIN)\cwebp.exe: $(DIROBJ)\examples\cwebp.obj $(EX_FORMAT_DEC_OBJS)
+$(DIRBIN)\dwebp.exe: $(DIROBJ)\examples\dwebp.obj
+$(DIRBIN)\vwebp.exe: $(DIROBJ)\examples\vwebp.obj
+$(DIRBIN)\vwebp.exe: $(EX_UTIL_OBJS) $(LIBWEBPDEMUX) $(LIBWEBP)
+$(DIRBIN)\webpmux.exe: $(DIROBJ)\examples\webpmux.obj $(LIBWEBPMUX)
+$(DIRBIN)\webpmux.exe: $(EX_UTIL_OBJS) $(LIBWEBP)
+$(OUT_EXAMPLES): $(EX_UTIL_OBJS) $(LIBWEBP)
+$(EX_UTIL_OBJS) $(EX_FORMAT_DEC_OBJS): $(OUTPUT_DIRS)

-# Additional include and library paths (for zlib) can be passed via the CL and
-# LINK environment variables respectively:
-# > set CL=/I\zlib\include
-# > set LINK=\zlib\zlib.lib
-# > nmake /f Makefile.vc CFG=release-static experimental
 experimental:
 	$(MAKE) /f Makefile.vc \
-	    CFG=$(CFG) CFLAGS="$(CFLAGS) /DWEBP_EXPERIMENTAL_FEATURES" /$(MAKEFLAGS)
+	    CFG=$(CFG) \
+	    CFLAGS="$(CFLAGS) /DWEBP_EXPERIMENTAL_FEATURES" /$(MAKEFLAGS)

-$(DIRLIB)\$(TARGET): $(X_OBJS)
-	$(LNK) $(LFLAGS) $(X_OBJS)
+$(LIBWEBPDECODER): $(LIBWEBPDECODER_OBJS)
+$(LIBWEBP): $(LIBWEBP_OBJS)
+$(LIBWEBPMUX): $(LIBWEBPMUX_OBJS)
+$(LIBWEBPDEMUX): $(LIBWEBPDEMUX_OBJS)
+
+$(LIBWEBP_OBJS) $(LIBWEBPMUX_OBJS) $(LIBWEBPDEMUX_OBJS): $(OUTPUT_DIRS)
+
+!IF "$(DLLBUILD)" == "TRUE"
+$(LIBWEBP_OBJS) $(LIBWEBPMUX_OBJS) $(LIBWEBPDEMUX_OBJS): \
+    $(DIROBJ)\$(DLLINC) $(DIROBJ)\$(DLLC)
+
+{$(DIROBJ)}.c{$(DIROBJ)}.obj:
+	$(CC) $(CFLAGS) /Fd$(LIBWEBP_PDBNAME) /Fo$@  $<
+
+$(LIBWEBPMUX): $(LIBWEBP)
+$(LIBWEBPDEMUX): $(LIBWEBP)
+
+$(LIBWEBPDECODER) $(LIBWEBP) $(LIBWEBPMUX) $(LIBWEBPDEMUX):
+	$(LNKDLL) /out:$(DIRBIN)\$(@B:_dll=.dll) /implib:$@ $(LFLAGS) $**
 	-xcopy $(DIROBJ)\*.pdb $(DIRLIB) /y

-$(X_OBJS): $(DIROBJ)\enc $(DIROBJ)\dec $(DIROBJ)\dsp $(DIROBJ)\utils $(DIRLIB) $(DIRINC) $(DIRBIN)
-!IF "$(DLLBUILD)" == "TRUE"
-$(X_OBJS): $(DIROBJ)\$(DLLINC)
 clean::
-	@-erase /s $(DIROBJ)\$(DLLINC) 2> NUL
+	@-erase /s $(DIROBJ)\$(DLLC) $(DIROBJ)\$(DLLINC) 2> NUL
+!ELSE
+$(LIBWEBPDECODER) $(LIBWEBP) $(LIBWEBPMUX) $(LIBWEBPDEMUX):
+	$(LNKLIB) /out:$@ $**
+	-xcopy $(DIROBJ)\*.pdb $(DIRLIB) /y
 !ENDIF

-$(EXAMPLES_OBJS): $(DIROBJ)\examples $(DIRLIB)\$(TARGET)
-
-$(DIROBJ)\enc:
-	@if not exist "$(DIROBJ)\enc" mkdir $(DIROBJ)\enc
-
-$(DIROBJ)\examples:
-	@if not exist "$(DIROBJ)\examples" mkdir $(DIROBJ)\examples
-
-$(DIROBJ)\dec:
-	@if not exist "$(DIROBJ)\dec" mkdir $(DIROBJ)\dec
-
-$(DIROBJ)\dsp:
-	@if not exist "$(DIROBJ)\dsp" mkdir $(DIROBJ)\dsp
-
-$(DIROBJ)\utils:
-	@if not exist "$(DIROBJ)\utils" mkdir $(DIROBJ)\utils
-
-$(DIRLIB):
-	@if not exist "$(DIRLIB)" mkdir $(DIRLIB)
-
-$(DIRINC):
-	@if not exist "$(DIRINC)" mkdir $(DIRINC)
-
-$(DIRBIN):
-	@if not exist "$(DIRBIN)" mkdir $(DIRBIN)
+$(OUTPUT_DIRS):
+	@if not exist "$(@)" mkdir "$(@)"

 # generate a helper include to define WEBP_EXTERN suitable for the DLL build
 $(DIROBJ)\$(DLLINC):
@ -227,20 +296,35 @@ $(DIROBJ)\$(DLLINC):
 	@echo #define WEBP_EXTERN(type) __declspec(dllexport) type >> $@
 	@echo #endif  /* WEBP_DLL_H_ */ >> $@

+# expose a WebPFree() function for use in managed code
+$(DIROBJ)\$(DLLC): $(DIROBJ)\$(DLLINC)
+	@echo #include ^<stdlib.h^> > $@
+	@echo #include "webp_dll.h" >> $@
+	@echo // This function should be used in place of free() for memory >> $@
+	@echo // returned by the WebP API. >> $@
+	@echo WEBP_EXTERN(void) WebPFree(void* ptr) { >> $@
+	@echo   free(ptr); >> $@
+	@echo } >> $@
+
 .SUFFIXES: .c .obj .res .exe
-{examples}.c{$(DIROBJ)\examples}.obj:
-	$(CC) $(CFLAGS) /Fo"$@"  $<
-{src\dec}.c{$(DIROBJ)\dec}.obj:
-	$(CC) $(CFLAGS) /Fo"$@"  $<
-{src\enc}.c{$(DIROBJ)\enc}.obj:
-	$(CC) $(CFLAGS) /Fo"$@"  $<
-{src\dsp}.c{$(DIROBJ)\dsp}.obj:
-	$(CC) $(CFLAGS) /Fo"$@"  $<
-{src\utils}.c{$(DIROBJ)\utils}.obj:
-	$(CC) $(CFLAGS) /Fo"$@"  $<
+{examples}.c{$(DIROBJ)\examples}.obj::
+	$(CC) $(CFLAGS) /Fd$(DIROBJ)\examples\ /Fo$(DIROBJ)\examples\  $<
+{src\dec}.c{$(DIROBJ)\dec}.obj::
+	$(CC) $(CFLAGS) /Fd$(LIBWEBP_PDBNAME) /Fo$(DIROBJ)\dec\ $<
+{src\demux}.c{$(DIROBJ)\demux}.obj::
+	$(CC) $(CFLAGS) /Fd$(LIBWEBP_PDBNAME) /Fo$(DIROBJ)\demux\ $<
+{src\dsp}.c{$(DIROBJ)\dsp}.obj::
+	$(CC) $(CFLAGS) /Fd$(LIBWEBP_PDBNAME) /Fo$(DIROBJ)\dsp\ $<
+{src\enc}.c{$(DIROBJ)\enc}.obj::
+	$(CC) $(CFLAGS) /Fd$(LIBWEBP_PDBNAME) /Fo$(DIROBJ)\enc\ $<
+{src\mux}.c{$(DIROBJ)\mux}.obj::
+	$(CC) $(CFLAGS) /Fd$(LIBWEBP_PDBNAME) /Fo$(DIROBJ)\mux\ $<
+{src\utils}.c{$(DIROBJ)\utils}.obj::
+	$(CC) $(CFLAGS) /Fd$(LIBWEBP_PDBNAME) /Fo$(DIROBJ)\utils\ $<

 {$(DIROBJ)\examples}.obj{$(DIRBIN)}.exe:
-	$(LNKEXE) $(LDFLAGS) /OUT:"$@" $<  ole32.lib windowscodecs.lib shlwapi.lib $(DIRLIB)\$(TARGET)
+	$(LNKEXE) $(LDFLAGS) /OUT:$@ $** \
+	    ole32.lib windowscodecs.lib shlwapi.lib
 	$(MT) -manifest $@.manifest -outputresource:$@;1
 	del $@.manifest

--- a/47
+++ b/47
@ -1,3 +1,50 @@
+- 6/13/13: version 0.3.1
+  This is a binary compatible release.
+  * Add incremental decoding support for images containing ALPH and ICCP chunks.
+  * Python bindings via swig for the simple encode/decode interfaces similar to
+    Java.
+
+- 3/20/13: version 0.3.0
+  This is a binary compatible release.
+  * WebPINewRGB/WebPINewYUVA accept being passed a NULL output buffer
+    and will perform auto-allocation.
+  * default filter option is now '-strong -f 60'
+  * encoding speed-up for lossy methods 3 to 6
+  * alpha encoding can be done in parallel to lossy using 'cwebp -mt ...'
+  * color profile, metadata (XMP/EXIF) and animation support finalized in the
+    container.
+  * various NEON assembly additions
+  Tool updates / additions:
+    * gif2webp added
+    * vwebp given color profile & animation support
+    * cwebp can preserve color profile / metadata with '-metadata'
+
+- 10/30/12: version 0.2.1
+  * Various security related fixes
+  * cwebp.exe: fix import errors on Windows XP
+  * enable DLL builds for mingw targets
+
+- 8/3/12: version 0.2.0
+  * Add support for ARGB -> YUVA conversion for lossless decoder
+    New functions: WebPINewYUVA, WebPIDecGetYUVA
+  * Add stats for lossless and alpha encoding
+  * Security related hardening: allocation and size checks
+  * Add PAM output support to dwebp
+
+- 7/19/12: version 0.1.99
+  * This is a pre-release of 0.2.0, not an rc to allow for further
+    incompatible changes based on user feedback.
+  * Alpha channel encode/decode support.
+  * Lossless encoder/decoder.
+  * Add TIFF input support to cwebp.
+  Incompatible changes:
+    * The encode ABI has been modified to support alpha encoding.
+    * Deprecated function WebPINew() has been removed.
+    * Decode function signatures have changed to consistently use size_t over
+      int/uint32_t.
+    * decode_vp8.h is no longer installed system-wide.
+    * cwebp will encode the alpha channel if present.
+
 - 9/19/11: version 0.1.3
  * Advanced decoding APIs.
  * On-the-fly cropping and rescaling of images.
--- a/203
+++ b/203
@ -4,7 +4,7 @@
          \__\__/\____/\_____/__/ ____  ___
                / _/ /    \    \ /  _ \/ _/
               /  \_/   / /   \ \   __/  \__
-               \____/____/\_____/_____/____/v0.1.3
+               \____/____/\_____/_____/____/v0.3.1

 Description:
 ============
@ -13,7 +13,7 @@ WebP codec: library to encode and decode images in WebP format. This package
 contains the library that can be used in other programs to add WebP support,
 as well as the command line tools 'cwebp' and 'dwebp'.

-See http://code.google.com/speed/webp
+See http://developers.google.com/speed/webp

 Latest sources are available from http://www.webmproject.org/code/

@ -49,11 +49,13 @@ will build the binaries examples/cwebp and examples/dwebp, along
 with the static library src/libwebp.a. No system-wide installation
 is supplied, as this is a simple alternative to the full installation
 system based on the autoconf tools (see below).
-Please refer the makefile.unix for additional details and customizations.
+Please refer to makefile.unix for additional details and customizations.

 Using autoconf tools:
 ---------------------
-./autogen.sh
+When building from git sources, you will need to run autogen.sh to generate the
+configure script.
+
 ./configure
 make
 make install
@ -61,7 +63,6 @@ make install
 should be all you need to have the following files

 /usr/local/include/webp/decode.h
-/usr/local/include/webp/decode_vp8.h
 /usr/local/include/webp/encode.h
 /usr/local/include/webp/types.h
 /usr/local/lib/libwebp.*
@ -70,32 +71,40 @@ should be all you need to have the following files

 installed.

-Note: The encoding and decoding libraries are compiled separately
-(as src/dec/libwebpdecode.* and src/dec/libwebpencode.*). They
-can be installed independently using a minor modification in the
-corresponding Makefile.am configure files (see comments there).
+Note: A decode-only library, libwebpdecoder, is available using the
+'--enable-libwebpdecoder' flag. The encode library is built separately and can
+be installed independently using a minor modification in the corresponding
+Makefile.am configure files (see comments there). See './configure --help' for
+more options.

 SWIG bindings:
 --------------

-To generate language bindings from swig/libwebp.i swig-1.3
-(http://www.swig.org) is required. 2.0 may work, but has not been tested.
+To generate language bindings from swig/libwebp.i at least swig-1.3
+(http://www.swig.org) is required.

 Currently the following functions are mapped:
 Decode:
  WebPGetDecoderVersion
  WebPGetInfo
-  WebPDecodeRGB
  WebPDecodeRGBA
  WebPDecodeARGB
-  WebPDecodeBGR
  WebPDecodeBGRA
+  WebPDecodeBGR
+  WebPDecodeRGB
+
 Encode:
  WebPGetEncoderVersion
-  WebPEncodeRGB
  WebPEncodeRGBA
-  WebPEncodeBGR
  WebPEncodeBGRA
+  WebPEncodeRGB
+  WebPEncodeBGR
+  WebPEncodeLosslessRGBA
+  WebPEncodeLosslessBGRA
+  WebPEncodeLosslessRGB
+  WebPEncodeLosslessBGR
+
+See swig/README for more detailed build instructions.

 Java bindings:

@ -103,6 +112,12 @@ To build the swig-generated JNI wrapper code at least JDK-1.5 (or equivalent)
 is necessary for enum support. The output is intended to be a shared object /
 DLL that can be loaded via System.loadLibrary("webp_jni").

+Python bindings:
+
+To build the swig-generated Python extension code at least Python 2.6 is
+required. Python < 2.6 may build with some minor changes to libwebp.i or the
+generated code, but is untested.
+
 Encoding tool:
 ==============

@ -111,9 +126,13 @@ decoding (dwebp) images.

 The easiest use should look like:
  cwebp input.png -q 80 -o output.webp
-which will convert the input PNG or JPEG file to a WebP file using a
-quality factor of 80 on a 0->100 scale (0 being the lowest quality,
-100 being the best. Default value is 75).
+which will convert the input file to a WebP file using a quality factor of 80
+on a 0->100 scale (0 being the lowest quality, 100 being the best. Default
+value is 75).
+You might want to try the -lossless flag too, which will compress the source
+(in RGBA format) without any loss. The -q quality parameter will in this case
+control the amount of processing time spent trying to make the output file as
+small as possible.

 A longer list of options is available using the -longhelp command line flag:

@ -121,45 +140,63 @@ A longer list of options is available using the -longhelp command line flag:
 Usage:
 cwebp [-preset <...>] [options] in_file [-o out_file]

-If input size (-s) for an image is not specified, it is assumed to be a either
-  PNG or JPEG format file.
+If input size (-s) for an image is not specified, it is assumed to be a PNG,
+JPEG or TIFF file.
 options:
  -h / -help  ............ short help
  -H / -longhelp  ........ long help
  -q <float> ............. quality factor (0:small..100:big)
+  -alpha_q <int> ......... Transparency-compression quality (0..100).
  -preset <string> ....... Preset setting, one of:
                            default, photo, picture,
                            drawing, icon, text
     -preset must come first, as it overwrites other parameters.
  -m <int> ............... compression method (0=fast, 6=slowest)
  -segments <int> ........ number of segments to use (1..4)
+  -size <int> ............ Target size (in bytes)
+  -psnr <float> .......... Target PSNR (in dB. typically: 42)

  -s <int> <int> ......... Input size (width x height) for YUV
  -sns <int> ............. Spatial Noise Shaping (0:off, 100:max)
  -f <int> ............... filter strength (0=off..100)
  -sharpness <int> ....... filter sharpness (0:most .. 7:least sharp)
-  -strong ................ use strong filter instead of simple.
+  -strong ................ use strong filter instead of simple (default).
+  -nostrong .............. use simple filter instead of strong.
  -partition_limit <int> . limit quality to fit the 512k limit on
                           the first partition (0=no degradation ... 100=full)
-  -alpha_comp <int> ...... set the transparency-compression
-  -noalpha ............... discard any transparency information.
  -pass <int> ............ analysis pass number (1..10)
-  -partitions <int> ...... number of partitions to use (0..3)
  -crop <x> <y> <w> <h> .. crop picture with the given rectangle
  -resize <w> <h> ........ resize picture (after any cropping)
+  -mt .................... use multi-threading if available
+  -low_memory ............ reduce memory usage (slower encoding)
  -map <int> ............. print map of extra info.
+  -print_psnr ............ prints averaged PSNR distortion.
+  -print_ssim ............ prints averaged SSIM distortion.
+  -print_lsim ............ prints local-similarity distortion.
  -d <file.pgm> .......... dump the compressed output (PGM file).
+  -alpha_method <int> .... Transparency-compression method (0..1)
+  -alpha_filter <string> . predictive filtering for alpha plane.
+                           One of: none, fast (default) or best.
+  -alpha_cleanup ......... Clean RGB values in transparent area.
+  -noalpha ............... discard any transparency information.
+  -lossless .............. Encode image losslessly.
+  -hint <string> ......... Specify image characteristics hint.
+                           One of: photo, picture or graph
+
+  -metadata <string> ..... comma separated list of metadata to
+                           copy from the input to the output if present.
+                           Valid values: all, none (default), exif, icc, xmp

  -short ................. condense printed message
  -quiet ................. don't print anything.
  -version ............... print version number and exit.
  -noasm ................. disable all assembly optimizations.
  -v ..................... verbose, e.g. print encoding/decoding times
+  -progress .............. report encoding progress

 Experimental Options:
-  -size <int> ............ Target size (in bytes)
-  -psnr <float> .......... Target PSNR (in dB. typically: 42)
-  -af <int> .............. adjust filter strength (0=off, 1=on)
+  -jpeg_like ............. Roughly match expected JPEG size.
+  -af .................... auto-adjust filter strength.
  -pre <int> ............. pre-processing filter


@ -183,10 +220,11 @@ Namely:
     but with better quality.
     Typical value is around '75'.
  * 'f' option directly links to the filtering strength used by the codec's
-     in-loop processing. The higher, the smoother will highly-compressed area
-     look. This is particularly useful when aiming at very small files.
-     Typical values are around 20-30. Note that using the option -strong will
-     change the type of filtering. Use "-f 0" to turn filtering off.
+     in-loop processing. The higher the value, the smoother the
+     highly-compressed area will look. This is particularly useful when aiming
+     at very small files. Typical values are around 20-30. Note that using the
+     option -strong/-nostrong will change the type of filtering. Use "-f 0" to
+     turn filtering off.
  * 'm' controls the trade-off between encoding speed and quality. Default is 4.
     You can try -m 5 or -m 6 to explore more (time-consuming) encoding
     possibilities. A lower value will result in faster encoding at the expense
@ -211,9 +249,12 @@ Usage: dwebp in_file [options] [-o out_file]

 Decodes the WebP image file to PNG format [Default]
 Use following options to convert into alternate image formats:
-  -ppm ......... save the raw RGB samples as color PPM
+  -pam ......... save the raw RGBA samples as a color PAM
+  -ppm ......... save the raw RGB samples as a color PPM
  -pgm ......... save the raw YUV samples as a grayscale PGM
                 file with IMC4 layout.
+  -yuv ......... save the raw YUV samples in flat layout.
+
 Other options are:
  -version  .... print version number and exit.
  -nofancy ..... don't use the fancy YUV420 upscaler.
@ -221,10 +262,62 @@ Use following options to convert into alternate image formats:
  -mt .......... use multi-threading
  -crop <x> <y> <w> <h> ... crop output with the given rectangle
  -scale <w> <h> .......... scale the output (*after* any cropping)
+  -alpha ....... only save the alpha plane.
  -h     ....... this help message.
  -v     ....... verbose (e.g. print encoding/decoding times)
  -noasm ....... disable all assembly optimizations.

+Visualization tool:
+===================
+
+There's a little self-serve visualization tool called 'vwebp' under the
+examples/ directory. It uses OpenGL to open a simple drawing window and show
+a decoded WebP file. It's not yet integrated in the automake build system, but
+you can try to manually compile it using the recommendations below.
+
+Usage: vwebp in_file [options]
+
+Decodes the WebP image file and visualize it using OpenGL
+Options are:
+  -version  .... print version number and exit.
+  -noicc ....... don't use the icc profile if present.
+  -nofancy ..... don't use the fancy YUV420 upscaler.
+  -nofilter .... disable in-loop filtering.
+  -mt .......... use multi-threading.
+  -info ........ print info.
+  -h     ....... this help message.
+
+Keyboard shortcuts:
+  'c' ................ toggle use of color profile.
+  'i' ................ overlay file information.
+  'q' / 'Q' / ESC .... quit.
+
+Building:
+---------
+
+Prerequisites:
+1) OpenGL & OpenGL Utility Toolkit (GLUT)
+  Linux:
+    $ sudo apt-get install freeglut3-dev mesa-common-dev
+  Mac + XCode:
+    - These libraries should be available in the OpenGL / GLUT frameworks.
+  Windows:
+    http://freeglut.sourceforge.net/index.php#download
+
+2) (Optional) qcms (Quick Color Management System)
+  i. Download qcms from Mozilla / Chromium:
+    http://hg.mozilla.org/mozilla-central/file/0e7639e3bdfb/gfx/qcms
+    http://src.chromium.org/viewvc/chrome/trunk/src/third_party/qcms
+  ii. Build and archive the source files as libqcms.a / qcms.lib
+  iii. Update makefile.unix / Makefile.vc
+    a) Define WEBP_HAVE_QCMS
+    b) Update include / library paths to reference the qcms directory.
+
+Build using makefile.unix / Makefile.vc:
+$ make -f makefile.unix examples/vwebp
+> nmake /f Makefile.vc CFG=release-static \
+    ../obj/x64/release-static/bin/vwebp.exe
+
 Encoding API:
 =============

@ -242,6 +335,20 @@ size_t WebPEncodeBGRA(const uint8_t* bgra, int width, int height, int stride,
 They will convert raw RGB samples to a WebP data. The only control supplied
 is the quality factor.

+There are some variants for using the lossless format:
+
+size_t WebPEncodeLosslessRGB(const uint8_t* rgb, int width, int height,
+                             int stride, uint8_t** output);
+size_t WebPEncodeLosslessBGR(const uint8_t* bgr, int width, int height,
+                             int stride, uint8_t** output);
+size_t WebPEncodeLosslessRGBA(const uint8_t* rgba, int width, int height,
+                              int stride, uint8_t** output);
+size_t WebPEncodeLosslessBGRA(const uint8_t* bgra, int width, int height,
+                              int stride, uint8_t** output);
+
+Of course in this case, no quality factor is needed since the compression
+occurs without loss of the input values, at the expense of larger output sizes.
+
 Advanced encoding API:
 ----------------------

@ -306,21 +413,19 @@ Decoding API:
 This is mainly just one function to call:

 #include "webp/decode.h"
-uint8_t* WebPDecodeRGB(const uint8_t* data, uint32_t data_size,
-                       int *width, int *height);
+uint8_t* WebPDecodeRGB(const uint8_t* data, size_t data_size,
+                       int* width, int* height);

 Please have a look at the file src/webp/decode.h for the details.
 There are variants for decoding in BGR/RGBA/ARGB/BGRA order, along with
 decoding to raw Y'CbCr samples. One can also decode the image directly into a
 pre-allocated buffer.

-To detect a WebP file and gather picture's dimensions, the function:
-  int WebPGetInfo(const uint8_t* data, uint32_t data_size,
-                  int *width, int *height);
+To detect a WebP file and gather the picture's dimensions, the function:
+  int WebPGetInfo(const uint8_t* data, size_t data_size,
+                  int* width, int* height);
 is supplied. No decoding is involved when using it.

-A lower-level API is available from the header file <webp/decode_vp8.h>
-
 Incremental decoding API:
 =========================

@ -330,7 +435,11 @@ is stored into an instance of the WebPIDecoder object. This object can be
 created with the purpose of decoding either RGB or Y'CbCr samples.
 For instance:

-  WebPIDecoder* idec = WebPINew(MODE_BGR);
+  WebPDecBuffer buffer;
+  WebPInitDecBuffer(&buffer);
+  buffer.colorspace = MODE_BGR;
+  ...
+  WebPIDecoder* idec = WebPINewDecoder(&buffer);

 As data is made progressively available, this incremental-decoder object
 can be used to decode the picture further. There are two (mutually exclusive)
@ -344,23 +453,23 @@ or by just mentioning the new size of the transmitted data:

  WebPIUpdate(idec, buffer, size_of_transmitted_buffer);

-Note that 'buffer' can be modified between each calls to WebPIUpdate, in
+Note that 'buffer' can be modified between each call to WebPIUpdate, in
 particular when the buffer is resized to accommodate larger data.

 These functions will return the decoding status: either VP8_STATUS_SUSPENDED if
-decoding is not finished yet, or VP8_STATUS_OK when decoding is done.
-Any other status is an error condition.
+decoding is not finished yet or VP8_STATUS_OK when decoding is done. Any other
+status is an error condition.

-The idec object must always be released (even upon an error condition)
-by calling: WebPDelete(idec).
+The 'idec' object must always be released (even upon an error condition) by
+calling: WebPDelete(idec).

 To retrieve partially decoded picture samples, one must use the corresponding
-method: WebPIDecGetRGB or WebPIDecGetYUV.
+method: WebPIDecGetRGB or WebPIDecGetYUVA.
 It will return the last displayable pixel row.

 Lastly, note that decoding can also be performed into a pre-allocated pixel
 buffer. This buffer must be passed when creating a WebPIDecoder, calling
-WebPINewRGB() or WebPINewYUV().
+WebPINewRGB() or WebPINewYUVA().

 Please have a look at the src/webp/decode.h header for further details.

--- a/README.mux
+++ b/README.mux
@ -0,0 +1,185 @@
+          __   __  ____  ____  ____  __ __  _     __ __
+         /  \\/  \/  _ \/  _ \/  _ \/  \  \/ \___/_ / _\
+         \       /   __/  _  \   __/      /  /  (_/  /__
+          \__\__/\_____/_____/__/  \__//_/\_____/__/___/v0.1.1
+
+
+Description:
+============
+
+WebPMux: set of two libraries 'Mux' and 'Demux' for creation, extraction and
+manipulation of an extended format WebP file, which can have features like
+color profile, metadata and animation. Reference command-line tools 'webpmux'
+and 'vwebp' as well as the WebP container specification
+'doc/webp-container-spec.txt' are also provided in this package.
+
+WebP Mux tool:
+==============
+
+The examples/ directory contains a tool (webpmux) for manipulating WebP
+files. The webpmux tool can be used to create an extended format WebP file and
+also to extract or strip relevant data from such a file.
+
+A list of options is available using the -help command line flag:
+
+> webpmux -help
+Usage: webpmux -get GET_OPTIONS INPUT -o OUTPUT
+       webpmux -set SET_OPTIONS INPUT -o OUTPUT
+       webpmux -strip STRIP_OPTIONS INPUT -o OUTPUT
+       webpmux -frame FRAME_OPTIONS [-frame...] [-loop LOOP_COUNT]
+               [-bgcolor BACKGROUND_COLOR] -o OUTPUT
+       webpmux -info INPUT
+       webpmux [-h|-help]
+       webpmux -version
+
+GET_OPTIONS:
+ Extract relevant data.
+   icc       Get ICC profile.
+   exif      Get EXIF metadata.
+   xmp       Get XMP metadata.
+   frame n   Get nth frame.
+
+SET_OPTIONS:
+ Set color profile/metadata.
+   icc  file.icc     Set ICC profile.
+   exif file.exif    Set EXIF metadata.
+   xmp  file.xmp     Set XMP metadata.
+   where:    'file.icc' contains the ICC profile to be set,
+             'file.exif' contains the EXIF metadata to be set
+             'file.xmp' contains the XMP metadata to be set
+
+STRIP_OPTIONS:
+ Strip color profile/metadata.
+   icc       Strip ICC profile.
+   exif      Strip EXIF metadata.
+   xmp       Strip XMP metadata.
+
+FRAME_OPTIONS(i):
+ Create animation.
+   file_i +di+xi+yi+mi
+   where:    'file_i' is the i'th animation frame (WebP format),
+             'di' is the pause duration before next frame.
+             'xi','yi' specify the image offset for this frame.
+             'mi' is the dispose method for this frame (0 or 1).
+
+LOOP_COUNT:
+ Number of times to repeat the animation.
+ Valid range is 0 to 65535 [Default: 0 (infinite)].
+
+BACKGROUND_COLOR:
+ Background color of the canvas.
+  A,R,G,B
+  where:    'A', 'R', 'G' and 'B' are integers in the range 0 to 255 specifying
+            the Alpha, Red, Green and Blue component values respectively
+            [Default: 255,255,255,255].
+
+INPUT & OUTPUT are in WebP format.
+
+Note: The nature of EXIF, XMP and ICC data is not checked and is assumed to be
+valid.
+
+Visualization tool:
+===================
+
+The examples/ directory also contains a tool (vwebp) for viewing WebP files.
+It decodes the image and visualizes it using OpenGL. See the libwebp README
+for details on building and running this program.
+
+Mux API:
+========
+The Mux API contains methods for adding data to and reading data from WebP
+files. This API currently supports XMP/EXIF metadata, ICC profile and animation.
+Other features may be added in subsequent releases.
+
+Example#1 (pseudo code): Creating a WebPMux object with image data, color
+profile and XMP metadata.
+
+  int copy_data = 0;
+  WebPMux* mux = WebPMuxNew();
+  // ... (Prepare image data).
+  WebPMuxSetImage(mux, &image, copy_data);
+  // ... (Prepare ICC profile data).
+  WebPMuxSetChunk(mux, "ICCP", &icc_profile, copy_data);
+  // ... (Prepare XMP metadata).
+  WebPMuxSetChunk(mux, "XMP ", &xmp, copy_data);
+  // Get data from mux in WebP RIFF format.
+  WebPMuxAssemble(mux, &output_data);
+  WebPMuxDelete(mux);
+  // ... (Consume output_data; e.g. write output_data.bytes to file).
+  WebPDataClear(&output_data);
+
+
+Example#2 (pseudo code): Get image and color profile data from a WebP file.
+
+  int copy_data = 0;
+  // ... (Read data from file).
+  WebPMux* mux = WebPMuxCreate(&data, copy_data);
+  WebPMuxGetFrame(mux, 1, &image);
+  // ... (Consume image; e.g. call WebPDecode() to decode the data).
+  WebPMuxGetChunk(mux, "ICCP", &icc_profile);
+  // ... (Consume icc_profile).
+  WebPMuxDelete(mux);
+  free(data);
+
+
+For a detailed Mux API reference, please refer to the header file
+(src/webp/mux.h).
+
+Demux API:
+==========
+The Demux API enables extraction of images and extended format data from
+WebP files. This API currently supports reading of XMP/EXIF metadata, ICC
+profile and animated images. Other features may be added in subsequent
+releases.
+
+Code Example: Demuxing WebP data to extract all the frames, ICC profile
+and EXIF/XMP metadata.
+
+  WebPDemuxer* demux = WebPDemux(&webp_data);
+  uint32_t width = WebPDemuxGetI(demux, WEBP_FF_CANVAS_WIDTH);
+  uint32_t height = WebPDemuxGetI(demux, WEBP_FF_CANVAS_HEIGHT);
+  // ... (Get information about the features present in the WebP file).
+  uint32_t flags = WebPDemuxGetI(demux, WEBP_FF_FORMAT_FLAGS);
+
+  // ... (Iterate over all frames).
+  WebPIterator iter;
+  if (WebPDemuxGetFrame(demux, 1, &iter)) {
+    do {
+      // ... (Consume 'iter'; e.g. Decode 'iter.fragment' with WebPDecode(),
+      // ... and get other frame properties like width, height, offsets etc.
+      // ... see 'struct WebPIterator' below for more info).
+    } while (WebPDemuxNextFrame(&iter));
+    WebPDemuxReleaseIterator(&iter);
+  }
+
+  // ... (Extract metadata).
+  WebPChunkIterator chunk_iter;
+  if (flags & ICCP_FLAG) WebPDemuxGetChunk(demux, "ICCP", 1, &chunk_iter);
+  // ... (Consume the ICC profile in 'chunk_iter.chunk').
+  WebPDemuxReleaseChunkIterator(&chunk_iter);
+  if (flags & EXIF_FLAG) WebPDemuxGetChunk(demux, "EXIF", 1, &chunk_iter);
+  // ... (Consume the EXIF metadata in 'chunk_iter.chunk').
+  WebPDemuxReleaseChunkIterator(&chunk_iter);
+  if (flags & XMP_FLAG) WebPDemuxGetChunk(demux, "XMP ", 1, &chunk_iter);
+  // ... (Consume the XMP metadata in 'chunk_iter.chunk').
+  WebPDemuxReleaseChunkIterator(&chunk_iter);
+  WebPDemuxDelete(demux);
+
+
+For a detailed Demux API reference, please refer to the header file
+(src/webp/demux.h).
+
+
+Bugs:
+=====
+
+Please report all bugs to our issue tracker:
+    http://code.google.com/p/webp/issues
+Patches welcome! See this page to get started:
+    http://www.webmproject.org/code/contribute/submitting-patches/
+
+Discuss:
+========
+
+Email: webp-discuss@webmproject.org
+Web: http://groups.google.com/a/webmproject.org/group/webp-discuss
--- a/configure.ac
+++ b/configure.ac
@ -1,97 +1,314 @@
-AC_INIT([libwebp], [0.1.3],
+AC_INIT([libwebp], [0.3.1],
        [http://code.google.com/p/webp/issues],,
-        [http://code.google.com/speed/webp])
+        [http://developers.google.com/speed/webp])
 AC_CANONICAL_TARGET
 AM_INIT_AUTOMAKE([-Wall foreign subdir-objects])
+
+dnl === automake >= 1.12 requires this for 'unusual archivers' support.
+dnl === it must occur before LT_INIT (AC_PROG_LIBTOOL).
+m4_ifdef([AM_PROG_AR], [AM_PROG_AR])
+
 AC_PROG_LIBTOOL
 AM_PROG_CC_C_O

-AC_ARG_WITH([pkgconfigdir], AS_HELP_STRING([--with-pkgconfigdir=PATH],
-	[Path to the pkgconfig directory [[LIBDIR/pkgconfig]]]),
-	[pkgconfigdir="$withval"], [pkgconfigdir='${libdir}/pkgconfig'])
+dnl === Enable less verbose output when building.
+m4_ifdef([AM_SILENT_RULES], [AM_SILENT_RULES([yes])])
+
+dnl === SET_IF_UNSET(shell_var, value)
+dnl ===   Set the shell variable 'shell_var' to 'value' if it is unset.
+AC_DEFUN([SET_IF_UNSET], [test "${$1+set}" = "set" || $1=$2])
+
+AC_ARG_ENABLE([everything],
+              AS_HELP_STRING([--enable-everything],
+                             [Enable all optional targets. These can still be
+                              disabled with --disable-target]),
+              [SET_IF_UNSET([enable_libwebpdecoder], [$enableval])
+               SET_IF_UNSET([enable_libwebpdemux], [$enableval])
+               SET_IF_UNSET([enable_libwebpmux], [$enableval])])
+
+AC_ARG_WITH([pkgconfigdir], AS_HELP_STRING([--with-pkgconfigdir=DIR],
+            [Path to the pkgconfig directory @<:@LIBDIR/pkgconfig@:>@]),
+            [pkgconfigdir="$withval"], [pkgconfigdir='${libdir}/pkgconfig'])
 AC_SUBST([pkgconfigdir])

-dnl === Check libz is present
+dnl === TEST_AND_ADD_CFLAGS(flag)
+dnl ===   Checks whether $CC supports 'flag' and adds it to AM_CFLAGS on success.
+AC_DEFUN([TEST_AND_ADD_CFLAGS],
+         [SAVED_CFLAGS="$CFLAGS"
+          CFLAGS="-Werror $1"
+          AC_MSG_CHECKING([whether $CC supports $1])
+          dnl Note AC_LANG_PROGRAM([]) uses an old-style main definition.
+          AC_COMPILE_IFELSE([AC_LANG_SOURCE([int main(void) { return 0; }])],
+                            [AC_MSG_RESULT([yes])]
+                            dnl Simply append the variable avoiding a
+                            dnl compatibility ifdef for AS_VAR_APPEND as this
+                            dnl variable shouldn't grow all that large.
+                            [AM_CFLAGS="$AM_CFLAGS $1"],
+                            [AC_MSG_RESULT([no])])
+          CFLAGS="$SAVED_CFLAGS"])
+TEST_AND_ADD_CFLAGS([-Wall])
+TEST_AND_ADD_CFLAGS([-Wdeclaration-after-statement])
+TEST_AND_ADD_CFLAGS([-Wextra])
+TEST_AND_ADD_CFLAGS([-Wmissing-declarations])
+TEST_AND_ADD_CFLAGS([-Wmissing-prototypes])
+TEST_AND_ADD_CFLAGS([-Wold-style-definition])
+TEST_AND_ADD_CFLAGS([-Wshadow])
+TEST_AND_ADD_CFLAGS([-Wunused-but-set-variable])
+TEST_AND_ADD_CFLAGS([-Wunused])
+TEST_AND_ADD_CFLAGS([-Wvla])
+AC_SUBST([AM_CFLAGS])

-if test "$enable_experimental" = "yes"; then
-  AC_CHECK_HEADER(zlib.h,
-    AC_CHECK_LIB(z, gzsetparams,,AC_MSG_ERROR(zlib library not found)),
-    AC_MSG_ERROR(zlib not available - no zlib.h)
-  )
+dnl === CLEAR_LIBVARS([var_pfx])
+dnl ===   Clears <var_pfx>_{INCLUDES,LIBS}.
+AC_DEFUN([CLEAR_LIBVARS], [$1_INCLUDES=""; $1_LIBS=""])
+
+dnl === WITHLIB_OPTION([opt_pfx], [outvar_pfx])
+dnl ===   Defines --with-<opt_pfx>{include,lib}dir options which set
+dnl ===   the variables <outvar_pfx>_{INCLUDES,LIBS}.
+AC_DEFUN([WITHLIB_OPTION],
+  [AC_ARG_WITH([$1includedir],
+               AS_HELP_STRING([--with-$1includedir=DIR],
+                              [use $2 includes from DIR]),
+               $2_INCLUDES="-I$withval")
+   AC_ARG_WITH([$1libdir],
+               AS_HELP_STRING([--with-$1libdir=DIR],
+                              [use $2 libraries from DIR]),
+               [$2_LIBS="-L$withval"])])
+
+dnl === LIBCHECK_PROLOGUE([var_pfx])
+dnl ===   Caches the current values of CPPFLAGS/LIBS in SAVED_* then
+dnl ===   prepends the current values with <var_pfx>_{INCLUDES,LIBS}.
+AC_DEFUN([LIBCHECK_PROLOGUE],
+         [SAVED_CPPFLAGS=$CPPFLAGS
+          SAVED_LIBS=$LIBS
+          CPPFLAGS="$$1_INCLUDES $CPPFLAGS"
+          LIBS="$$1_LIBS $LIBS"])
+
+dnl === LIBCHECK_EPILOGUE([var_pfx])
+dnl ===   Restores the values of CPPFLAGS/LIBS from SAVED_* and exports
+dnl ===   <var_pfx>_{INCLUDES,LIBS} with AC_SUBST.
+AC_DEFUN([LIBCHECK_EPILOGUE],
+         [AC_SUBST($1_LIBS)
+          AC_SUBST($1_INCLUDES)
+          CPPFLAGS=$SAVED_CPPFLAGS
+          LIBS=$SAVED_LIBS])
+
+dnl === Check for pthread support
+AC_ARG_ENABLE([threading],
+              AS_HELP_STRING([--disable-threading],
+                             [Disable detection of thread support]),,
+              [enable_threading=yes])
+if test "$enable_threading" = "yes"; then
+  AC_MSG_NOTICE([checking for threading support...])
+  AX_PTHREAD([AC_DEFINE([WEBP_USE_THREAD], [1],
+                        [Undefine this to disable thread support.])
+              LIBS="$PTHREAD_LIBS $LIBS"
+              CFLAGS="$CFLAGS $PTHREAD_CFLAGS"
+              CC="$PTHREAD_CC"
+             ],
+             [enable_threading=no])
 fi
+AC_MSG_NOTICE([checking if threading is enabled... ${enable_threading-no}])
+
+dnl === check for OpenGL/GLUT support ===
+CLEAR_LIBVARS([GL])
+WITHLIB_OPTION([gl], [GL])
+
+LIBCHECK_PROLOGUE([GL])
+
+glut_cflags="none"
+glut_ldflags="none"
+case $host_os in
+  darwin*)
+    # Special case for OSX builds. Append these to give the user a chance to
+    # override with --with-gl*
+    glut_cflags="$glut_cflags|-framework GLUT -framework OpenGL"
+    glut_ldflags="$glut_ldflags|-framework GLUT -framework OpenGL"
+    ;;
+esac
+
+GLUT_SAVED_CPPFLAGS="$CPPFLAGS"
+SAVED_IFS="$IFS"
+IFS="|"
+for flag in $glut_cflags; do
+  # restore IFS immediately as the autoconf macros may need the default.
+  IFS="$SAVED_IFS"
+  unset ac_cv_header_GL_glut_h
+  unset ac_cv_header_OpenGL_glut_h
+
+  case $flag in
+    none) ;;
+    *) CPPFLAGS="$flag $CPPFLAGS";;
+  esac
+  AC_CHECK_HEADERS([GL/glut.h GLUT/glut.h OpenGL/glut.h],
+                   [glut_headers=yes;
+                    test "$flag" = "none" || GL_INCLUDES="$CPPFLAGS";
+                    break])
+  CPPFLAGS="$GLUT_SAVED_CPPFLAGS"
+  test "$glut_headers" = "yes" && break
+done
+IFS="$SAVED_IFS"
+
+if test "$glut_headers" = "yes"; then
+  AC_LANG_PUSH([C])
+  GLUT_SAVED_LDFLAGS="$LDFLAGS"
+  SAVED_IFS="$IFS"
+  IFS="|"
+  for flag in $glut_ldflags; do
+    # restore IFS immediately as the autoconf macros may need the default.
+    IFS="$SAVED_IFS"
+    unset ac_cv_search_glBegin
+
+    case $flag in
+      none) ;;
+      *) LDFLAGS="$flag $LDFLAGS";;
+    esac
+
+    # find libGL
+    GL_SAVED_LIBS="$LIBS"
+    AC_SEARCH_LIBS([glBegin], [GL OpenGL])
+    LIBS="$GL_SAVED_LIBS"
+
+    # A direct link to libGL may not be necessary on e.g., linux.
+    GLUT_SAVED_LIBS="$LIBS"
+    for lib in "" "-lglut" "-lglut $ac_cv_search_glBegin"; do
+      LIBS="$lib"
+      AC_LINK_IFELSE(
+        [AC_LANG_PROGRAM([
+           #ifdef __cplusplus
+           # define EXTERN_C extern "C"
+           #else
+           # define EXTERN_C
+           #endif
+           EXTERN_C char glOrtho();
+           EXTERN_C char glutMainLoop();
+          ],[
+           glOrtho();
+           glutMainLoop();
+          ])
+        ],
+        [glut_support=yes], []
+      )
+      if test "$glut_support" = "yes"; then
+        GL_LIBS="$LDFLAGS $lib"
+        break
+      fi
+    done
+    LIBS="$GLUT_SAVED_LIBS"
+    LDFLAGS="$GLUT_SAVED_LDFLAGS"
+    test "$glut_support" = "yes" && break
+  done
+  IFS="$SAVED_IFS"
+  AC_LANG_POP
+fi
+
+LIBCHECK_EPILOGUE([GL])
+
+if test "$glut_support" = "yes" -a "$enable_libwebpdemux" = "yes"; then
+  build_vwebp=yes
+fi
+AM_CONDITIONAL([BUILD_VWEBP], [test "$build_vwebp" = "yes"])

 dnl === check for PNG support ===

-PNG_INCLUDES=""
-PNG_LIBS=""
-AC_PATH_PROG(LIBPNG_CONFIG, libpng-config)
+CLEAR_LIBVARS([PNG])
+AC_PATH_PROGS(LIBPNG_CONFIG,
+              [libpng-config libpng15-config libpng14-config libpng12-config])
 if test -n "$LIBPNG_CONFIG"; then
  PNG_INCLUDES=`$LIBPNG_CONFIG --cflags`
  PNG_PREFIX=`$LIBPNG_CONFIG --prefix`
  if test "${PNG_PREFIX}/lib" != "/usr/lib" ; then
    PNG_LIBS="-L${PNG_PREFIX}/lib"
  fi
+  PNG_LIBS="$PNG_LIBS `$LIBPNG_CONFIG --libs`"
 fi

-AC_ARG_WITH(pngincludedir,
-            [--with-pngincludedir=DIR use PNG includes from DIR],
-            PNG_INCLUDES="-I$withval")
-AC_ARG_WITH(pnglibdir,
-            [--with-pnglibdir=DIR    use PNG libraries from DIR],
-            [PNG_LIBS="-L$withval"])
-
-SAVED_CPPFLAGS=$CPPFLAGS
-SAVED_LIBS=$LIBS
-CPPFLAGS="$PNG_INCLUDES $CPPFLAGS"
-LIBS="$PNG_LIBS $LIBS"
+WITHLIB_OPTION([png], [PNG])

+LIBCHECK_PROLOGUE([PNG])
 AC_CHECK_HEADER(png.h,
-  AC_CHECK_LIB(png, main,
-               [PNG_LIBS="$PNG_LIBS -lpng"
-                PNG_INCLUDES="$PNG_INCLUDES -DWEBP_HAVE_PNG"
-                AC_DEFINE(WEBP_HAVE_PNG, [1], [Set to 1 if PNG library is installed])
-               ],
-               AC_MSG_WARN(Optional png library not found),
-               [$MATH_LIBS]),
-  AC_MSG_WARN(png library not available - no png.h)
+  AC_SEARCH_LIBS(png_get_libpng_ver, [png],
+                 [test "$ac_cv_search_png_get_libpng_ver" = "none required" \
+                    || PNG_LIBS="$PNG_LIBS $ac_cv_search_png_get_libpng_ver"
+                  PNG_INCLUDES="$PNG_INCLUDES -DWEBP_HAVE_PNG"
+                  AC_DEFINE(WEBP_HAVE_PNG, [1],
+                            [Set to 1 if PNG library is installed])
+                  png_support=yes
+                 ],
+                 [AC_MSG_WARN(Optional png library not found)
+                  PNG_LIBS=""
+                  PNG_INCLUDES=""
+                 ],
+                 [$MATH_LIBS]),
+  [AC_MSG_WARN(png library not available - no png.h)
+   PNG_LIBS=""
+   PNG_INCLUDES=""
+  ],
 )
-AC_SUBST(PNG_LIBS)
-AC_SUBST(PNG_INCLUDES)
-
-CPPFLAGS=$SAVED_CPPFLAGS
-LIBS=$SAVED_LIBS
+LIBCHECK_EPILOGUE([PNG])

 dnl === check for JPEG support ===

-JPEG_INCLUDES=""
-JPEG_LIBS=""
-AC_ARG_WITH(jpegincludedir,
-            [--with-jpegincludedir=DIR use JPEG includes from DIR],
-            JPEG_INCLUDES="-I$withval")
-AC_ARG_WITH(jpeglibdir,
-            [--with-jpeglibdir=DIR    use JPEG libraries from DIR],
-            [JPEG_LIBS="-L$withval"])
-
-SAVED_CPPFLAGS=$CPPFLAGS
-SAVED_LIBS=$LIBS
-CPPFLAGS="$JPEG_INCLUDES $CPPFLAGS"
-LIBS="$JPEG_LIBS $LIBS"
+CLEAR_LIBVARS([JPEG])
+WITHLIB_OPTION([jpeg], [JPEG])

+LIBCHECK_PROLOGUE([JPEG])
 AC_CHECK_HEADER(jpeglib.h,
  AC_CHECK_LIB(jpeg, jpeg_set_defaults,
               [JPEG_LIBS="$JPEG_LIBS -ljpeg"
                JPEG_INCLUDES="$JPEG_INCLUDES -DWEBP_HAVE_JPEG"
-                AC_DEFINE(WEBP_HAVE_JPEG, [1], [Set to 1 if JPEG library is installed])
+                AC_DEFINE(WEBP_HAVE_JPEG, [1],
+                          [Set to 1 if JPEG library is installed])
+                jpeg_support=yes
               ],
               AC_MSG_WARN(Optional jpeg library not found),
               [$MATH_LIBS]),
  AC_MSG_WARN(jpeg library not available - no jpeglib.h)
 )
-AC_SUBST(JPEG_LIBS)
-AC_SUBST(JPEG_INCLUDES)
+LIBCHECK_EPILOGUE([JPEG])

-CPPFLAGS=$SAVED_CPPFLAGS
-LIBS=$SAVED_LIBS
+dnl === check for TIFF support ===
+
+CLEAR_LIBVARS([TIFF])
+WITHLIB_OPTION([tiff], [TIFF])
+
+LIBCHECK_PROLOGUE([TIFF])
+AC_CHECK_HEADER(tiffio.h,
+  AC_CHECK_LIB(tiff, TIFFGetVersion,
+               [TIFF_LIBS="$TIFF_LIBS -ltiff"
+                TIFF_INCLUDES="$TIFF_INCLUDES -DWEBP_HAVE_TIFF"
+                AC_DEFINE(WEBP_HAVE_TIFF, [1],
+                          [Set to 1 if TIFF library is installed])
+                tiff_support=yes
+               ],
+               AC_MSG_WARN(Optional tiff library not found),
+               [$MATH_LIBS]),
+  AC_MSG_WARN(tiff library not available - no tiffio.h)
+)
+LIBCHECK_EPILOGUE([TIFF])
+
+dnl === check for GIF support ===
+
+CLEAR_LIBVARS([GIF])
+WITHLIB_OPTION([gif], [GIF])
+
+LIBCHECK_PROLOGUE([GIF])
+AC_CHECK_HEADER(gif_lib.h,
+  AC_CHECK_LIB([gif], [DGifOpenFileHandle],
+               [GIF_LIBS="$GIF_LIBS -lgif"
+                gif_support=yes
+               ],
+               AC_MSG_WARN(Optional gif library not found),
+               [$MATH_LIBS]),
+  AC_MSG_WARN(gif library not available - no gif_lib.h)
+)
+LIBCHECK_EPILOGUE([GIF])
+
+if test "$gif_support" = "yes" -a \
+        "$enable_libwebpmux" = "yes"; then
+  build_gif2webp=yes
+fi
+AM_CONDITIONAL([BUILD_GIF2WEBP], [test "${build_gif2webp}" = "yes"])

 dnl === check for WIC support ===

@ -136,18 +353,56 @@ if test "$target_os" = "mingw32"; then
  fi
 fi

-dnl === If --enable-experimental is defined, add the flag WEBP_EXPERIMENTAL_FEATURES
+dnl === If --enable-swap-16bit-csp is defined, add -DWEBP_SWAP_16BIT_CSP
+
+USE_SWAP_16BIT_CSP=""
+AC_MSG_CHECKING(if --enable-swap-16bit-csp option is specified)
+AC_ARG_ENABLE([swap-16bit-csp],
+              AS_HELP_STRING([--enable-swap-16bit-csp],
+                             [Enable byte swap for 16 bit colorspaces]))
+if test "$enable_swap_16bit_csp" = "yes"; then
+  USE_SWAP_16BIT_CSP="-DWEBP_SWAP_16BIT_CSP"
+fi
+AC_MSG_RESULT(${enable_swap_16bit_csp-no})
+AC_SUBST(USE_SWAP_16BIT_CSP)
+
+dnl === If --enable-experimental is defined, add -DWEBP_EXPERIMENTAL_FEATURES

 USE_EXPERIMENTAL_CODE=""
 AC_MSG_CHECKING(if --enable-experimental option is specified)
-AC_ARG_ENABLE(experimental, [  --enable-experimental         Activate experimental features])
+AC_ARG_ENABLE([experimental], AS_HELP_STRING([--enable-experimental],
+                                             [Activate experimental features]))
 if test "$enable_experimental" = "yes"; then
-        AC_DEFINE(EXPERIMENTAL,,[Enable experimental code])
-        USE_EXPERIMENTAL_CODE="-DWEBP_EXPERIMENTAL_FEATURES"
+  AC_DEFINE(WEBP_EXPERIMENTAL_FEATURES, [1], [Enable experimental code])
+  USE_EXPERIMENTAL_CODE="-DWEBP_EXPERIMENTAL_FEATURES"
 fi
 AC_MSG_RESULT(${enable_experimental-no})
 AC_SUBST(USE_EXPERIMENTAL_CODE)

+dnl === Check whether libwebpmux should be built
+AC_MSG_CHECKING(whether libwebpmux is to be built)
+AC_ARG_ENABLE([libwebpmux],
+              AS_HELP_STRING([--enable-libwebpmux],
+                             [Build libwebpmux @<:@default=no@:>@]))
+AC_MSG_RESULT(${enable_libwebpmux-no})
+AM_CONDITIONAL([WANT_MUX], [test "$enable_libwebpmux" = "yes"])
+
+dnl === Check whether libwebpdemux should be built
+AC_MSG_CHECKING(whether libwebpdemux is to be built)
+AC_ARG_ENABLE([libwebpdemux],
+              AS_HELP_STRING([--enable-libwebpdemux],
+                             [Build libwebpdemux @<:@default=no@:>@]))
+AC_MSG_RESULT(${enable_libwebpdemux-no})
+AM_CONDITIONAL([WANT_DEMUX], [test "$enable_libwebpdemux" = "yes"])
+
+dnl === Check whether decoder library should be built.
+AC_MSG_CHECKING(whether decoder library is to be built)
+AC_ARG_ENABLE([libwebpdecoder],
+              AS_HELP_STRING([--enable-libwebpdecoder],
+                             [Build libwebpdecoder @<:@default=no@:>@]))
+AC_MSG_RESULT(${enable_libwebpdecoder-no})
+AM_CONDITIONAL([BUILD_LIBWEBPDECODER], [test "$enable_libwebpdecoder" = "yes"])
+
 dnl =========================

 AC_CONFIG_MACRO_DIR([m4])
@ -155,8 +410,40 @@ AC_CONFIG_HEADERS([config.h])
 AC_CONFIG_FILES([Makefile src/Makefile man/Makefile \
                 examples/Makefile src/dec/Makefile \
                 src/enc/Makefile src/dsp/Makefile \
+                 src/demux/Makefile src/mux/Makefile \
                 src/utils/Makefile \
-                 src/libwebp.pc])
+                 src/libwebp.pc src/libwebpdecoder.pc \
+                 src/demux/libwebpdemux.pc src/mux/libwebpmux.pc])


 AC_OUTPUT
+
+AC_MSG_NOTICE([
+WebP Configuration Summary
+--------------------------
+
+Shared libraries: ${enable_shared}
+Static libraries: ${enable_static}
+Threaded decode: ${enable_threading-no}
+libwebp: yes
+libwebpdecoder: ${enable_libwebpdecoder-no}
+libwebpdemux: ${enable_libwebpdemux-no}
+libwebpmux: ${enable_libwebpmux-no}
+
+Tools:
+cwebp : yes
+  Input format support
+  ====================
+  JPEG : ${jpeg_support-no}
+  PNG  : ${png_support-no}
+  TIFF : ${tiff_support-no}
+  WIC  : ${wic_support-no}
+dwebp : yes
+  Output format support
+  =====================
+  PNG  : ${png_support-no}
+  WIC  : ${wic_support-no}
+gif2webp : ${build_gif2webp-no}
+webpmux  : ${enable_libwebpmux-no}
+vwebp    : ${build_vwebp-no}
+])
--- a/doc/README
+++ b/doc/README
@ -0,0 +1,29 @@
+
+Generate libwebp Container Spec Docs from Text Source
+=====================================================
+
+HTML generation requires kramdown [1], easily installed as a
+rubygem [2].  Rubygems installation should satisfy dependencies
+automatically.
+
+[1]: http://kramdown.rubyforge.org/
+[2]: http://rubygems.org/
+
+HTML generation can then be done from the project root:
+
+$ kramdown doc/webp-container-spec.txt --template doc/template.html > \
+  doc/output/webp-container-spec.html
+
+kramdown can optionally syntax highlight code blocks, using CodeRay [3],
+a dependency of kramdown that rubygems will install automatically.  The
+following will apply inline CSS styling; an external stylesheet is not
+needed.
+
+$ kramdown doc/webp-lossless-bitstream-spec.txt --template \
+  doc/template.html --coderay-css style --coderay-line-numbers ' ' \
+  --coderay-default-lang c > \
+  doc/output/webp-lossless-bitstream-spec.html
+
+Optimally, use kramdown 0.13.7 or newer if syntax highlighting desired.
+
+[3]: http://coderay.rubychan.de/
--- a/doc/TODO
+++ b/doc/TODO
@ -0,0 +1,13 @@
+<louquillio@google.com>, 20111004
+
+* Determine that normative RFC 2119 terms (MUST, SHOULD, MAY, etc.) are
+  truly intended in all cases where capitalized.
+
+* Several passages could be made clearer.
+
+  * Overall edit for scope.  Portions are phrased as an introduction to
+    the 0.1.3 RIFF container additions, rather than a holistic guide to
+    WebP.
+
+  * To wit, suggest s/[spec|specification]/guide/g .  "Spec" can imply a
+    standards track; in any case it's too formal for a work in progress.
--- a/doc/template.html
+++ b/doc/template.html
@ -0,0 +1,94 @@
+<!DOCTYPE html>
+<html lang="en">
+<head>
+  <meta charset="utf-8">
+  <title>WebP Container Specification</title>
+  <meta name="generator" content="kramdown <%= ::Kramdown::VERSION %>" />
+  <style type="text/css">
+  body {
+    color: #000;
+    background-color: #fff;
+    margin: 10%;
+    font-family: "Liberation Sans", "DejaVu Sans", "Bitstream Vera Sans", Arial, sans-serif;
+    line-height: 1.4;
+  }
+  h2 {
+    border-bottom: 1px solid #ccc;
+    padding-bottom: 0;
+  }
+  table {
+    border-collapse: collapse;
+  }
+  th, td {
+    border: 1px solid #999;
+    padding: .5em .7em;;
+  }
+  th {
+    color: #fff;
+    background-color: #000;
+  }
+  td {
+  }
+  hr {
+  }
+  code {
+    color: #000;
+    background-color: #f7f7f7;
+    padding: 0 3px;
+    font-family: "Liberation Mono", "DejaVu Sans Mono", "Bitstream Vera Sans Mono", Consolata, monospace;
+  }
+  pre {
+    background-color: #f7f7f7;
+    padding: 1em;
+    border: 1px solid #ccc;
+    width: 42em;
+    overflow: auto;
+    font-family: "Liberation Mono", "DejaVu Sans Mono", "Bitstream Vera Sans Mono", Consolata, monospace;
+  }
+  pre code {
+    background-color: #f7f7f7;
+    padding: 0; /* Only want padding on inline code, not blocks */
+  }
+  pre.terminal {
+    color: #fff;
+    background-color: #000;
+    border: 1px solid #ccc;
+    max-height: 30em;
+  }
+  pre.terminal code {
+    color: #fff;
+    background-color: #000;
+    font-size: smaller;
+  }
+  #markdown-toc ul {
+    list-style-type: disc;
+  }
+  ul#markdown-toc {
+    margin-top: -1em;
+    visibility: hidden;
+    -webkit-padding-start: 0;
+  }
+  ul#markdown-toc ul {
+    visibility: visible;
+  }
+  ul#markdown-toc ul ul{
+    visibility: visible;
+  }
+  ul#markdown-toc + hr {
+    margin-bottom: 4em;
+  }
+  ol ol { /* Format nested ordered lists */
+    list-style-type: lower-alpha;
+  }
+  dt {
+    font-style: italic;
+    font-weight: bold;
+  }
+  .caption {
+  }
+  </style>
+</head>
+<body>
+<%= @body %>
+</body>
+</html>
--- a/doc/webp-container-spec.txt
+++ b/doc/webp-container-spec.txt
@ -0,0 +1,895 @@
+<!--
+
+Although you may be viewing an alternate representation, this document
+is sourced in Markdown, a light-duty markup scheme, and is optimized for
+the [kramdown](http://kramdown.rubyforge.org/) transformer.
+
+See the accompanying README. External link targets are referenced at the
+end of this file.
+
+-->
+
+
+WebP Container Specification
+============================
+
+* TOC placeholder
+{:toc}
+
+
+Introduction
+------------
+
+WebP is an image format that uses either (i) the VP8 key frame encoding
+to compress image data in a lossy way, or (ii) the WebP lossless encoding
+(and possibly other encodings in the future). These encoding schemes should
+make it more efficient than currently used formats. It is optimized for fast
+image transfer over the network (e.g., for websites). The WebP format has
+feature parity (color profile, metadata, animation etc) with other formats as
+well. This document describes the structure of a WebP file.
+
+The WebP container (i.e., RIFF container for WebP) allows feature support over
+and above the basic use case of WebP (i.e., a file containing a single image
+encoded as a VP8 key frame). The WebP container provides additional support
+for:
+
+  * **Lossless compression.** An image can be losslessly compressed, using the
+    WebP Lossless Format.
+
+  * **Metadata.** An image may have metadata stored in EXIF or XMP formats.
+
+  * **Transparency.** An image may have transparency, i.e., an alpha channel.
+
+  * **Color Profile.** An image may have an embedded ICC profile as described
+    by the [International Color Consortium][iccspec].
+
+  * **Animation.** An image may have multiple frames with pauses between them,
+    making it an animation.
+
+  * **Image Fragmentation.** A single bitstream in WebP has an inherent
+    limitation for width or height of 2^14 pixels, and, when using VP8, a 512
+    KiB limit on the size of the first compressed partition. To support larger
+    images, the format supports images that are composed of multiple fragments,
+    each encoded as a separate bitstream. All fragments logically form a single
+    image: they have common metadata, color profile, etc. Image fragmentation
+    may also improve efficiency for larger images, e.g., grass can be encoded
+    differently than sky.
+
+The key words "MUST", "MUST NOT", "REQUIRED", "SHALL", "SHALL NOT",
+"SHOULD", "SHOULD NOT", "RECOMMENDED", "MAY", and "OPTIONAL" in this
+document are to be interpreted as described in [RFC 2119][].
+
+**Note:** Out of the features mentioned above, lossy compression, lossless
+compression, transparency, metadata, color profile and animation are finalized
+and are to be considered stable. On the other hand, image fragmentation is
+experimental as of now, and is open to discussion, feedback and comments.
+The same is indicated using annotation "_status: experimental_" in the relevant
+sections of this document.
+
+Terminology &amp; Basics
+------------------------
+
+A WebP file contains either a still image (i.e., an encoded matrix of pixels)
+or an [animation](#animation). Optionally, it can also contain transparency
+information, color profile and metadata. In case we need to refer only to the
+matrix of pixels, we will call it the _canvas_ of the image.
+
+Below are additional terms used throughout this document:
+
+_Reader/Writer_
+
+: Code that reads WebP files is referred to as a _reader_, while code that
+writes them is referred to as a _writer_.
+
+_uint16_
+
+: A 16-bit, little-endian, unsigned integer.
+
+_uint24_
+
+: A 24-bit, little-endian, unsigned integer.
+
+_uint32_
+
+: A 32-bit, little-endian, unsigned integer.
+
+_FourCC_
+
+: A _FourCC_ (four-character code) is a _uint32_ created by concatenating four
+  ASCII characters in little-endian order.
+
+_1-based_
+
+: An unsigned integer field storing values offset by `-1`. e.g., Such a field
+would store value _25_ as _24_.
+
+RIFF file format
+----------------
+The WebP file format is based on the RIFF (resource interchange file format)
+document format.
+
+The basic element of a RIFF file is a _chunk_. It consists of:
+
+     0                   1                   2                   3
+     0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+    +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+    |                         Chunk FourCC                          |
+    +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+    |                          Chunk Size                           |
+    +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+    |                         Chunk Payload                         |
+    +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+
+Chunk FourCC: 32 bits
+
+: ASCII four-character code used for chunk identification.
+
+Chunk Size: 32 bits (_uint32_)
+
+: The size of the chunk not including this field, the chunk identifier or
+  padding.
+
+Chunk Payload: _Chunk Size_ bytes
+
+: The data payload. If _Chunk Size_ is odd, a single padding byte -- that
+  SHOULD be `0` -- is added.
+
+_ChunkHeader('ABCD')_
+
+: This is used to describe the _FourCC_ and _Chunk Size_ header of individual
+  chunks, where 'ABCD' is the FourCC for the chunk. This element's
+  size is 8 bytes.
+
+**Note:** RIFF has a convention that all-uppercase chunk FourCCs are standard
+chunks that apply to any RIFF file format, while FourCCs specific to a file
+format are all lowercase. WebP does not follow this convention.
+
+WebP file header
+----------------
+
+     0                   1                   2                   3
+     0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+    +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+    |      'R'      |      'I'      |      'F'      |      'F'      |
+    +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+    |                           File Size                           |
+    +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+    |      'W'      |      'E'      |      'B'      |      'P'      |
+    +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+
+'RIFF': 32 bits
+
+: The ASCII characters 'R' 'I' 'F' 'F'.
+
+File Size: 32 bits (_uint32_)
+
+: The size of the file in bytes starting at offset 8. The maximum value of
+this field is 2^32 minus 10 bytes and thus the size of the whole file is at
+most 4GiB minus 2 bytes.
+
+'WEBP': 32 bits
+
+: The ASCII characters 'W' 'E' 'B' 'P'.
+
+A WebP file MUST begin with a RIFF header with the FourCC 'WEBP'. The file size
+in the header is the total size of the chunks that follow plus `4` bytes for
+the 'WEBP' FourCC. The file SHOULD NOT contain anything after it. As the size
+of any chunk is even, the size given by the RIFF header is also even. The
+contents of individual chunks will be described in the following sections.
+
+Simple file format (lossy)
+--------------------------
+
+This layout SHOULD be used if the image requires _lossy_ encoding and does not
+require transparency or other advanced features provided by the extended format.
+Files with this layout are smaller and supported by older software.
+
+Simple WebP (lossy) file format:
+
+     0                   1                   2                   3
+     0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+    +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+    |                    WebP file header (12 bytes)                |
+    +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+    |                          VP8 chunk                            |
+    +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+
+VP8 chunk:
+
+     0                   1                   2                   3
+     0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+    +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+    |                      ChunkHeader('VP8 ')                      |
+    +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+    |                           VP8 data                            |
+    +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+
+VP8 data: _Chunk Size_ bytes
+
+: VP8 bitstream data.
+
+The VP8 bitstream format specification can be found at [VP8 Data Format and
+Decoding Guide][vp8spec]. Note that the VP8 frame header contains the VP8 frame
+width and height. That is assumed to be the width and height of the canvas.
+
+The VP8 specification describes how to decode the image into Y'CbCr
+format. To convert to RGB, Rec. 601 SHOULD be used.
+
+Simple file format (lossless)
+-----------------------------
+
+**Note:** Older readers may not support files using the lossless format.
+
+This layout SHOULD be used if the image requires _lossless_ encoding (with an
+optional transparency channel) and does not require advanced features provided
+by the extended format.
+
+Simple WebP (lossless) file format:
+
+     0                   1                   2                   3
+     0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+    +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+    |                    WebP file header (12 bytes)                |
+    +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+    |                          VP8L chunk                           |
+    +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+
+VP8L chunk:
+
+     0                   1                   2                   3
+     0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+    +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+    |                      ChunkHeader('VP8L')                      |
+    +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+    |                           VP8L data                           |
+    +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+
+VP8L data: _Chunk Size_ bytes
+
+: VP8L bitstream data.
+
+The current specification of the VP8L bitstream can be found at
+[WebP Lossless Bitstream Format][webpllspec]. Note that the VP8L header
+contains the VP8L image width and height. That is assumed to be the width
+and height of the canvas.
+
+Extended file format
+--------------------
+
+**Note:** Older readers may not support files using the extended format.
+
+An extended format file consists of:
+
+  * A 'VP8X' chunk with information about features used in the file.
+
+  * An optional 'ICCP' chunk with color profile.
+
+  * An optional 'ANIM' chunk with animation control data.
+
+  * Image data.
+
+  * An optional 'EXIF' chunk with EXIF metadata.
+
+  * An optional 'XMP ' chunk with XMP metadata.
+
+  * An optional list of [unknown chunks](#unknown-chunks). _\[status: experimental\]_
+
+For a _still image_, the _image data_ consists of a single frame, whereas for
+an _animated image_, it consists of multiple frames. More details about frames
+can be found in the [Animation](#animation) section.
+
+Moreover, each frame can be fragmented or non-fragmented, as will be described
+in the [Extended WebP file header](#extended_header) section. More details about
+fragments can be found in the [Fragments](#fragments) section.
+
+All chunks SHOULD be placed in the same order as listed above. If a chunk
+appears in the wrong place, the file is invalid, but readers MAY parse the
+file, ignoring the chunks that come too late.
+
+**Rationale:** Setting the order of chunks should allow quicker file
+parsing. For example, if an 'ALPH' chunk does not appear in its required
+position, a decoder can choose to stop searching for it. The rule of
+ignoring late chunks should make programs that need to do a full search
+give the same results as the ones stopping early.
+
+Extended WebP file header:
+{:#extended_header}
+
+     0                   1                   2                   3
+     0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+    +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+    |                   WebP file header (12 bytes)                 |
+    +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+    |                      ChunkHeader('VP8X')                      |
+    +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+    |Rsv|I|L|E|X|A|F|                   Reserved                    |
+    +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+    |          Canvas Width Minus One               |             ...
+    +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+    ...  Canvas Height Minus One    |
+    +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+
+Reserved (Rsv): 2 bits
+
+: SHOULD be `0`.
+
+ICC profile (I): 1 bit
+
+: Set if the file contains an ICC profile.
+
+Alpha (L): 1 bit
+
+: Set if any of the frames of the image contain transparency information
+("alpha").
+
+EXIF metadata (E): 1 bit
+
+: Set if the file contains EXIF metadata.
+
+XMP metadata (X): 1 bit
+
+: Set if the file contains XMP metadata.
+
+Animation (A): 1 bit
+
+: Set if this is an animated image. Data in 'ANIM' and 'ANMF' chunks should be
+used to control the animation.
+
+Image Fragmentation (F): 1 bit _\[status: experimental\]_
+
+: Set if any of the frames in the image are represented by fragments.
+
+Reserved: 24 bits
+
+: SHOULD be `0`.
+
+Canvas Width Minus One: 24 bits
+
+: _1-based_ width of the canvas in pixels.
+  The actual canvas width is '1 + Canvas Width Minus One'
+
+Canvas Height Minus One: 24 bits
+
+: _1-based_ height of the canvas in pixels.
+  The actual canvas height is '1 + Canvas Height Minus One'
+
+The product of _Canvas Width_ and _Canvas Height_ MUST be at most `2^32 - 1`.
+
+Future specifications MAY add more fields.
+
+### Chunks
+
+#### Animation
+
+An animation is controlled by ANIM and ANMF chunks.
+
+ANIM Chunk:
+{:#anim_chunk}
+
+For an animated image, this chunk contains the _global parameters_ of the
+animation.
+
+     0                   1                   2                   3
+     0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+    +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+    |                      ChunkHeader('ANIM')                      |
+    +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+    |                       Background Color                        |
+    +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+    |          Loop Count           |
+    +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+
+Background Color: 32 bits (_uint32_)
+
+: The default background color of the canvas in \[Blue, Green, Red, Alpha\]
+byte order. This color is used to fill the unused space on the canvas around the
+frames, as well as the transparent pixels of the first frame. Background color
+is also used when disposal method is `1`.
+
+**Note**: Viewers that have a preferred background against which to present the
+images (web browsers, for example) should ignore this value and use their
+preferred background color instead.
+
+Loop Count: 16 bits (_uint16_)
+
+: The number of times to loop the animation. `0` means infinitely.
+
+This chunk MUST appear if the _Animation_ flag in the VP8X chunk is set.
+If the _Animation_ flag is not set and this chunk is present, it
+SHOULD be ignored.
+
+
+ANMF chunk:
+
+For animated images, this chunk contains information about a _single_ frame.
+If the _Animation flag_ is not set, then this chunk SHOULD NOT be present.
+
+     0                   1                   2                   3
+     0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+    +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+    |                      ChunkHeader('ANMF')                      |
+    +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+    |                        Frame X                |             ...
+    +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+    ...          Frame Y            |   Frame Width Minus One     ...
+    +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+    ...             |           Frame Height Minus One              |
+    +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+    |                 Frame Duration                |  Reserved   |D|
+    +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+    |                         Frame Data                            |
+    +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+
+Frame X: 24 bits (_uint24_)
+
+: The X coordinate of the upper left corner of the frame is `Frame X * 2`
+
+Frame Y: 24 bits (_uint24_)
+
+: The Y coordinate of the upper left corner of the frame is `Frame Y * 2`
+
+Frame Width Minus One: 24 bits (_uint24_)
+
+: The _1-based_ width of the frame.
+  The frame width is `1 + Frame Width Minus One`
+
+Frame Height Minus One: 24 bits (_uint24_)
+
+: The _1-based_ height of the frame.
+  The frame height is `1 + Frame Height Minus One`
+
+Frame Duration: 24 bits (_uint24_)
+
+: The time to wait before displaying the next frame, in 1 millisecond units.
+In particular, frame duration of 0 is useful when one wants to update multiple
+areas of the canvas at once during the animation.
+
+Reserved: 7 bits
+
+: SHOULD be 0.
+
+Disposal method (D): 1 bit
+
+: Indicates how _the current frame_ is to be treated after it has been displayed
+(before rendering the next frame) on the canvas:
+
+  * `0`: Do not dispose. Leave the canvas as is.
+
+  * `1`: Dispose to background color. Fill the _rectangle_ on the canvas covered
+    by the _current frame_ with background color specified in the
+    [ANIM chunk](#anim_chunk).
+
+After disposing the current frame, render the next frame on the canvas using
+[alpha-blending](#alpha-blending). If the next frame does not have an alpha
+channel, assume alpha value of 255, effectively replacing the rectangle.
+
+**Notes**:
+
+  * The frame disposal only applies to the _frame rectangle_, that is, the
+    rectangle defined by _Frame X_, _Frame Y_, _frame width_ and _frame height_.
+    It may or may not cover the whole canvas.
+
+{:#alpha-blending}
+  * **Alpha-blending**:
+
+    Given that each of the R, G, B and A channels is 8-bit, and the RGB
+    channels are _not premultiplied_ by alpha, the formula for blending
+    'dst' onto 'src' is:
+
+~~~~~
+    blend.A = src.A + dst.A * (1 - src.A / 255)
+    if blend.A = 0 then
+      blend.RGB = 0
+    else
+      blend.RGB = (src.RGB * src.A +
+                   dst.RGB * dst.A * (1 - src.A / 255)) / blend.A
+~~~~~
+
+  * Alpha-blending SHOULD be done in linear color space, by taking into account
+    the [color profile](#color-profile) of the image. If the color profile is
+    not present, sRGB is to be assumed. (Note that sRGB also needs to be
+    linearized due to a gamma of ~2.2).
+
+Frame Data: _Chunk Size_ - `16` bytes
+
+: For a fragmented frame, it consists of multiple [fragment chunks](#fragments).
+
+: For a non-fragmented frame, it consists of:
+
+  * An optional [alpha subchunk](#alpha) for the frame.
+
+  * A [bitstream subchunk](#bitstream-vp8vp8l) for the frame.
+
+  * An optional list of [unknown chunks](#unknown-chunks).
+
+**Note**: The 'ANMF' payload, _Frame Data_ above, consists of individual
+_padded_ chunks as described by the [RIFF file format](#riff-file-format).
+
+#### Fragments _\[status: experimental\]_
+
+For images that are represented by fragments, this chunk contains data for
+a single fragment. If the _Image Fragmentation Flag_ is not set, then this chunk
+SHOULD NOT be present.
+
+     0                   1                   2                   3
+     0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+    +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+    |                      ChunkHeader('FRGM')                      |
+    +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+    |                  Fragment X                   |             ...
+    +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+    ...       Fragment Y            |         Fragment Data         |
+    +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+
+Fragment X: 24 bits (_uint24_)
+
+: The X coordinate of the upper left corner of the fragment is `Fragment X * 2`
+
+Fragment Y: 24 bits (_uint24_)
+
+: The Y coordinate of the upper left corner of the fragment is `Fragment Y * 2`
+
+Fragment Data: _Chunk Size_ - `6` bytes
+
+: It contains:
+
+  * An optional [alpha subchunk](#alpha) for the fragment.
+  * The [bitstream subchunk](#bitstream-vp8vp8l) for the fragment.
+  * An optional list of [unknown chunks](#unknown-chunks).
+
+Note: The width and height of the fragment is obtained from the bitstream
+subchunk.
+
+The fragments of a frame SHOULD have the following properties:
+
+  * They collectively cover the whole frame.
+
+  * No pair of fragments have any overlapping region on the frame.
+
+  * No portion of any fragment should be located outside of the canvas.
+
+#### Alpha
+
+     0                   1                   2                   3
+     0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+    +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+    |                      ChunkHeader('ALPH')                      |
+    +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+    |Rsv| P | F | C |     Alpha Bitstream...                        |
+    +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+
+Reserved (Rsv): 2 bits
+
+: SHOULD be `0`.
+
+Pre-processing (P): 2 bits
+
+: These INFORMATIVE bits are used to signal the pre-processing that has
+been performed during compression. The decoder can use this information to
+e.g. dither the values or smooth the gradients prior to display.
+
+  * `0`: no pre-processing
+  * `1`: level reduction
+
+Filtering method (F): 2 bits
+
+: The filtering method used:
+
+  * `0`: None.
+  * `1`: Horizontal filter.
+  * `2`: Vertical filter.
+  * `3`: Gradient filter.
+
+For each pixel, filtering is performed using the following calculations.
+Assume the alpha values surrounding the current `X` position are labeled as:
+
+     C | B |
+    ---+---+
+     A | X |
+
+We seek to compute the alpha value at position `X`. First, a prediction is
+made depending on the filtering method:
+
+  * Method `0`: predictor = 0
+  * Method `1`: predictor = A
+  * Method `2`: predictor = B
+  * Method `3`: predictor = clip(A + B - C)
+
+where `clip(v)` is equal to:
+
+  * 0    if v < 0
+  * 255  if v > 255
+  * v    otherwise
+
+The final value is derived by adding the decompressed value `X` to the
+predictor and using modulo-256 arithmetic to wrap the \[256-511\] range
+into the \[0-255\] one:
+
+`alpha = (predictor + X) % 256`
+
+There are special cases for left-most and top-most pixel positions:
+
+  * Top-left value at location (0,0) uses 0 as predictor value. Otherwise,
+  * For horizontal or gradient filtering methods, the left-most pixels at
+    location (0, y) are predicted using the location (0, y-1) just above.
+  * For vertical or gradient filtering methods, the top-most pixels at
+    location (x, 0) are predicted using the location (x-1, 0) on the left.
+
+
+Decoders are not required to use this information in any specified way.
+
+Compression method (C): 2 bits
+
+: The compression method used:
+
+  * `0`: No compression.
+  * `1`: Compressed using the WebP lossless format.
+
+Alpha bitstream: _Chunk Size_ - `1` bytes
+
+: Encoded alpha bitstream.
+
+This optional chunk contains encoded alpha data for this frame/fragment. A
+frame/fragment containing a 'VP8L' chunk SHOULD NOT contain this chunk.
+
+**Rationale**: The transparency information is already part of the 'VP8L'
+chunk.
+
+The alpha channel data is stored as uncompressed raw data (when
+compression method is '0') or compressed using the lossless format
+(when the compression method is '1').
+
+  * Raw data: consists of a byte sequence of length width * height,
+    containing all the 8-bit transparency values in scan order.
+
+  * Lossless format compression: the byte sequence is a compressed
+    image-stream (as described in the [WebP Lossless Bitstream Format]
+    [webpllspec]) of implicit dimension width x height. That is, this
+    image-stream does NOT contain any headers describing the image dimension.
+
+    **Rationale**: the dimension is already known from other sources,
+    so storing it again would be redundant and error-prone.
+
+    Once the image-stream is decoded into ARGB color values, following
+    the process described in the lossless format specification, the
+    transparency information must be extracted from the *green* channel
+    of the ARGB quadruplet.
+
+    **Rationale**: the green channel is allowed extra transformation
+    steps in the specification -- unlike the other channels -- that can
+    improve compression.
+
+#### Bitstream (VP8/VP8L)
+
+This chunk contains compressed bitstream data for a single frame/fragment.
+
+A bitstream chunk may be either (i) a VP8 chunk, using "VP8 " (note the
+significant fourth-character space) as its tag _or_ (ii) a VP8L chunk, using
+"VP8L" as its tag.
+
+The formats of VP8 and VP8L chunks are as described in sections
+[Simple file format (lossy)](#simple-file-format-lossy)
+and [Simple file format (lossless)](#simple-file-format-lossless) respectively.
+
+#### Color profile
+
+     0                   1                   2                   3
+     0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+    +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+    |                      ChunkHeader('ICCP')                      |
+    +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+    |                       Color Profile                           |
+    +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+
+Color Profile: _Chunk Size_ bytes
+
+: ICC profile.
+
+This chunk MUST appear before the image data.
+
+There SHOULD be at most one such chunk. If there are more such chunks, readers
+MAY ignore all except the first one.
+See the [ICC Specification][iccspec] for details.
+
+If this chunk is not present, sRGB SHOULD be assumed.
+
+#### Metadata
+
+Metadata can be stored in 'EXIF' or 'XMP ' chunks.
+
+There SHOULD be at most one chunk of each type ('EXIF' and 'XMP '). If there
+are more such chunks, readers MAY ignore all except the first one. Also, a file
+may possibly contain both 'EXIF' and 'XMP ' chunks.
+
+The chunks are defined as follows:
+
+EXIF chunk:
+
+     0                   1                   2                   3
+     0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+    +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+    |                      ChunkHeader('EXIF')                      |
+    +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+    |                        EXIF Metadata                          |
+    +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+
+EXIF Metadata: _Chunk Size_ bytes
+
+: image metadata in EXIF format.
+
+
+XMP chunk:
+
+     0                   1                   2                   3
+     0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+    +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+    |                      ChunkHeader('XMP ')                      |
+    +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+    |                        XMP Metadata                           |
+    +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+
+XMP Metadata: _Chunk Size_ bytes
+
+: image metadata in XMP format.
+
+Additional guidance about handling metadata can be found in the
+Metadata Working Group's [Guidelines for Handling Metadata][metadata].
+
+#### Unknown Chunks _\[status: experimental\]_
+
+A RIFF chunk (described in [this](#terminology-amp-basics) section) whose _chunk
+tag_ is different from any of the chunks described in this document, is
+considered an _unknown chunk_.
+
+**Rationale**: Allowing unknown chunks gives a provision for future extension
+of the format, and also allows storage of any application-specific data.
+
+A file MAY contain unknown chunks:
+
+  * At the end of the file as described in [Extended WebP file
+    header](#extended_header) section.
+  * At the end of FRGM and ANMF chunks as described in [Fragments](#fragments)
+    and [Animation](#animation) sections.
+
+Readers SHOULD ignore these chunks. Writers SHOULD preserve them in their
+original order (unless they specifically intend to modify these chunks).
+
+### Assembling the Canvas from fragments/frames
+
+Here we provide an overview of how a reader should assemble a canvas in case
+of a fragmented-image and in case of an animated image. The notation
+_VP8X.field_ means the field in the 'VP8X' chunk with the same description.
+
+Displaying a _fragmented image_ canvas MUST be equivalent to the following
+pseudocode: _\[status: experimental\]_
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+assert VP8X.flags.hasFragments
+canvas ← new black image of size VP8X.canvasWidth x VP8X.canvasHeight.
+frgm_params ← nil
+for chunk in image_data:
+    assert chunk.tag is "FRGM"
+    frgm_params.fragmentX = Fragment X
+    frgm_params.fragmentY = Fragment Y
+    for subchunk in 'Fragment Data':
+        if subchunk.tag == "ALPH":
+            assert alpha subchunks not found in 'Fragment Data' earlier
+            frgm_params.alpha = alpha_data
+        else if subchunk.tag == "VP8 " OR subchunk.tag == "VP8L":
+            assert bitstream subchunks not found in 'Fragment Data' earlier
+            frgm_params.bitstream = bitstream_data
+    frgm_params.fragmentWidth = Width extracted from bitstream subchunk
+    frgm_params.fragmentHeight = Height extracted from bitstream subchunk
+    assert VP8X.canvasWidth >=
+        frgm_params.fragmentX + frgm_params.fragmentWidth
+    assert VP8X.canvasHeight >=
+        frgm_params.fragmentY + frgm_params.fragmentHeight
+    assert fragment has the properties mentioned in "Image Fragments" section.
+    render fragment with frame_params.alpha and frame_params.bitstream on canvas
+    with top-left corner in (frgm_params.fragmentX, frgm_params.fragmentY).
+canvas contains the decoded canvas.
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Displaying an _animated image_ canvas MUST be equivalent to the following
+pseudocode:
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+assert VP8X.flags.hasAnimation
+canvas ← new image of size VP8X.canvasWidth x VP8X.canvasHeight with
+background color ANIM.background_color.
+loop_count ← ANIM.loopCount
+dispose_method ← ANIM.disposeMethod
+if loop_count == 0:
+    loop_count = ∞
+frame_params ← nil
+for loop = 0, ..., loop_count - 1
+    assert next chunk in image_data is ANMF
+    frame_params.frameX = Frame X
+    frame_params.frameY = Frame Y
+    frame_params.frameWidth = Frame Width Minus One + 1
+    frame_params.frameHeight = Frame Height Minus One + 1
+    frame_params.frameDuration = Frame Duration
+    assert VP8X.canvasWidth >= frame_params.frameX + frame_params.frameWidth
+    assert VP8X.canvasHeight >= frame_params.frameY + frame_params.frameHeight
+    if VP8X.flags.hasFragments and first subchunk in 'Frame Data' is FRGM
+        // Fragmented frame.
+        frame_params.{bitstream,alpha} = canvas decoded from subchunks in
+                                         'Frame Data' as per the pseudocode for
+                                         _fragmented image_ above.
+    else
+        // Non-fragmented frame.
+        for subchunk in 'Frame Data':
+            if subchunk.tag == "ALPH":
+                assert alpha subchunks not found in 'Frame Data' earlier
+                frame_params.alpha = alpha_data
+            else if subchunk.tag == "VP8 " OR subchunk.tag == "VP8L":
+                assert bitstream subchunks not found in 'Frame Data' earlier
+                frame_params.bitstream = bitstream_data
+    render frame with frame_params.alpha and frame_params.bitstream on canvas
+    with top-left corner in (frame_params.frameX, frame_params.frameY), using
+    dispose method dispose_method.
+    Show the contents of the image for frame_params.frameDuration * 1ms.
+canvas contains the decoded canvas.
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Example file layouts
+--------------------
+
+A lossy encoded image with alpha may look as follows:
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+RIFF/WEBP
+- VP8X (descriptions of features used)
+- ALPH (alpha bitstream)
+- VP8 (bitstream)
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+A losslessly encoded image may look as follows:
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+RIFF/WEBP
+- VP8X (descriptions of features used)
+- XYZW (unknown chunk)
+- VP8L (lossless bitstream)
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+A lossless image with ICC profile and XMP metadata may
+look as follows:
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+RIFF/WEBP
+- VP8X (descriptions of features used)
+- ICCP (color profile)
+- VP8L (lossless bitstream)
+- XMP  (metadata)
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+A fragmented image may look as follows:
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+RIFF/WEBP
+- VP8X (descriptions of features used)
+- FRGM (fragment1 parameters + data)
+- FRGM (fragment2 parameters + data)
+- FRGM (fragment3 parameters + data)
+- FRGM (fragment4 parameters + data)
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+An animated image with EXIF metadata may look as follows:
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+RIFF/WEBP
+- VP8X (descriptions of features used)
+- ANIM (global animation parameters)
+- ANMF (frame1 parameters + data)
+- ANMF (frame2 parameters + data)
+- ANMF (frame3 parameters + data)
+- ANMF (frame4 parameters + data)
+- EXIF (metadata)
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+[vp8spec]:  http://tools.ietf.org/html/rfc6386
+[webpllspec]: https://gerrit.chromium.org/gerrit/gitweb?p=webm/libwebp.git;a=blob;f=doc/webp-lossless-bitstream-spec.txt;hb=master
+[iccspec]: http://www.color.org/icc_specs2.xalter
+[metadata]: http://www.metadataworkinggroup.org/pdf/mwg_guidance.pdf
+[rfc 2119]: http://tools.ietf.org/html/rfc2119
--- a/doc/webp-lossless-bitstream-spec.txt
+++ b/doc/webp-lossless-bitstream-spec.txt
--- a/examples/Makefile.am
+++ b/examples/Makefile.am
@ -1,11 +1,52 @@
 AM_CPPFLAGS = -I$(top_srcdir)/src

 bin_PROGRAMS = dwebp cwebp
+if BUILD_VWEBP
+  bin_PROGRAMS += vwebp
+endif
+if WANT_MUX
+  bin_PROGRAMS += webpmux
+endif
+
+if BUILD_GIF2WEBP
+  bin_PROGRAMS += gif2webp
+endif
+
+noinst_LTLIBRARIES = libexampleutil.la
+
+libexampleutil_la_SOURCES = example_util.c example_util.h

 dwebp_SOURCES = dwebp.c stopwatch.h
-dwebp_CPPFLAGS = $(AM_CPPFLAGS) $(PNG_INCLUDES) $(JPEG_INCLUDES) $(USE_EXPERIMENTAL_CODE)
-dwebp_LDADD = ../src/libwebp.la $(PNG_LIBS) $(JPEG_LIBS)
+dwebp_CPPFLAGS  = $(AM_CPPFLAGS) $(USE_EXPERIMENTAL_CODE)
+dwebp_CPPFLAGS += $(JPEG_INCLUDES) $(PNG_INCLUDES)
+dwebp_LDADD = libexampleutil.la $(PNG_LIBS) $(JPEG_LIBS)

-cwebp_SOURCES = cwebp.c stopwatch.h
-cwebp_CPPFLAGS = $(AM_CPPFLAGS) $(PNG_INCLUDES) $(JPEG_INCLUDES) $(USE_EXPERIMENTAL_CODE)
-cwebp_LDADD = ../src/libwebp.la $(PNG_LIBS) $(JPEG_LIBS)
+cwebp_SOURCES  = cwebp.c metadata.c metadata.h stopwatch.h
+cwebp_SOURCES += jpegdec.c jpegdec.h
+cwebp_SOURCES += pngdec.c pngdec.h
+cwebp_SOURCES += tiffdec.c tiffdec.h
+cwebp_SOURCES += wicdec.c wicdec.h
+cwebp_CPPFLAGS  = $(AM_CPPFLAGS) $(USE_EXPERIMENTAL_CODE)
+cwebp_CPPFLAGS += $(JPEG_INCLUDES) $(PNG_INCLUDES) $(TIFF_INCLUDES)
+cwebp_LDADD = ../src/libwebp.la $(JPEG_LIBS) $(PNG_LIBS) $(TIFF_LIBS)
+
+gif2webp_SOURCES = gif2webp.c
+gif2webp_CPPFLAGS = $(AM_CPPFLAGS) $(USE_EXPERIMENTAL_CODE) $(GIF_INCLUDES)
+gif2webp_LDADD  = libexampleutil.la ../src/mux/libwebpmux.la ../src/libwebp.la
+gif2webp_LDADD += $(GIF_LIBS)
+
+webpmux_SOURCES = webpmux.c
+webpmux_CPPFLAGS = $(AM_CPPFLAGS) $(USE_EXPERIMENTAL_CODE)
+webpmux_LDADD = libexampleutil.la ../src/mux/libwebpmux.la ../src/libwebp.la
+
+vwebp_SOURCES = vwebp.c
+vwebp_CPPFLAGS = $(AM_CPPFLAGS) $(USE_EXPERIMENTAL_CODE) $(GL_INCLUDES)
+vwebp_LDADD = libexampleutil.la ../src/demux/libwebpdemux.la $(GL_LIBS)
+
+if BUILD_LIBWEBPDECODER
+  dwebp_LDADD += ../src/libwebpdecoder.la
+  vwebp_LDADD += ../src/libwebpdecoder.la
+else
+  dwebp_LDADD += ../src/libwebp.la
+  vwebp_LDADD += ../src/libwebp.la
+endif
--- a/examples/cwebp.c
+++ b/examples/cwebp.c
--- a/examples/dwebp.c
+++ b/examples/dwebp.c
@ -1,13 +1,13 @@
-// Copyright 2010 Google Inc.
+// Copyright 2010 Google Inc. All Rights Reserved.
 //
-// This code is licensed under the same terms as WebM:
-//  Software License Agreement:  http://www.webmproject.org/license/software/
-//  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
 // -----------------------------------------------------------------------------
 //
-//  Command-line tool for decoding a WebP image
-//
-//  Compile with:     gcc -o dwebp dwebp.c -lwebpdecode
+//  Command-line tool for decoding a WebP image.
 //
 // Author: Skal (pascal.massimino@gmail.com)

@ -38,37 +38,42 @@
 #endif

 #include "webp/decode.h"
-#include "stopwatch.h"
+#include "./example_util.h"
+#include "./stopwatch.h"

+static int verbose = 0;
+#ifndef WEBP_DLL
 #if defined(__cplusplus) || defined(c_plusplus)
 extern "C" {
 #endif

-static int verbose = 0;
-#ifndef WEBP_DLL
 extern void* VP8GetCPUInfo;   // opaque forward declaration.
+
+#if defined(__cplusplus) || defined(c_plusplus)
+}    // extern "C"
 #endif
+#endif  // WEBP_DLL

 //------------------------------------------------------------------------------

 // Output types
 typedef enum {
  PNG = 0,
+  PAM,
  PPM,
  PGM,
+  YUV,
  ALPHA_PLANE_ONLY  // this is for experimenting only
 } OutputFileFormat;

 #ifdef HAVE_WINCODEC_H

-#define IFS(fn)                \
-  do {                         \
-     if (SUCCEEDED(hr))        \
-     {                         \
-        hr = (fn);             \
-        if (FAILED(hr) && verbose)           \
-          printf(#fn " failed %08x\n", hr);  \
-     }                         \
+#define IFS(fn)                                                     \
+  do {                                                              \
+    if (SUCCEEDED(hr)) {                                            \
+      hr = (fn);                                                    \
+      if (FAILED(hr)) fprintf(stderr, #fn " failed %08lx\n", hr);   \
+    }                                                               \
  } while (0)

 #ifdef __cplusplus
@ -77,12 +82,13 @@ typedef enum {
 #define MAKE_REFGUID(x) &(x)
 #endif

-static HRESULT CreateOutputStream(const char* out_file_name,
-                                  IStream** ppStream) {
+static HRESULT CreateOutputStream(const char* out_file_name, IStream** stream) {
  HRESULT hr = S_OK;
-  IFS(SHCreateStreamOnFileA(out_file_name, STGM_WRITE | STGM_CREATE, ppStream));
-  if (FAILED(hr))
-    printf("Error opening output file %s (%08x)\n", out_file_name, hr);
+  IFS(SHCreateStreamOnFileA(out_file_name, STGM_WRITE | STGM_CREATE, stream));
+  if (FAILED(hr)) {
+    fprintf(stderr, "Error opening output file %s (%08lx)\n",
+            out_file_name, hr);
+  }
  return hr;
 }

@ -90,40 +96,42 @@ static HRESULT WriteUsingWIC(const char* out_file_name, REFGUID container_guid,
                             unsigned char* rgb, int stride,
                             uint32_t width, uint32_t height, int has_alpha) {
  HRESULT hr = S_OK;
-  IWICImagingFactory* pFactory = NULL;
-  IWICBitmapFrameEncode* pFrame = NULL;
-  IWICBitmapEncoder* pEncoder = NULL;
-  IStream* pStream = NULL;
+  IWICImagingFactory* factory = NULL;
+  IWICBitmapFrameEncode* frame = NULL;
+  IWICBitmapEncoder* encoder = NULL;
+  IStream* stream = NULL;
  WICPixelFormatGUID pixel_format = has_alpha ? GUID_WICPixelFormat32bppBGRA
                                              : GUID_WICPixelFormat24bppBGR;

  IFS(CoInitialize(NULL));
  IFS(CoCreateInstance(MAKE_REFGUID(CLSID_WICImagingFactory), NULL,
-          CLSCTX_INPROC_SERVER, MAKE_REFGUID(IID_IWICImagingFactory),
-          (LPVOID*)&pFactory));
+                       CLSCTX_INPROC_SERVER,
+                       MAKE_REFGUID(IID_IWICImagingFactory),
+                       (LPVOID*)&factory));
  if (hr == REGDB_E_CLASSNOTREG) {
-    printf("Couldn't access Windows Imaging Component (are you running \n");
-    printf("Windows XP SP3 or newer?). PNG support not available.\n");
-    printf("Use -ppm or -pgm for available PPM and PGM formats.\n");
+    fprintf(stderr,
+            "Couldn't access Windows Imaging Component (are you running "
+            "Windows XP SP3 or newer?). PNG support not available. "
+            "Use -ppm or -pgm for available PPM and PGM formats.\n");
  }
-  IFS(CreateOutputStream(out_file_name, &pStream));
-  IFS(IWICImagingFactory_CreateEncoder(pFactory, container_guid, NULL,
-          &pEncoder));
-  IFS(IWICBitmapEncoder_Initialize(pEncoder, pStream,
+  IFS(CreateOutputStream(out_file_name, &stream));
+  IFS(IWICImagingFactory_CreateEncoder(factory, container_guid, NULL,
+                                       &encoder));
+  IFS(IWICBitmapEncoder_Initialize(encoder, stream,
                                   WICBitmapEncoderNoCache));
-  IFS(IWICBitmapEncoder_CreateNewFrame(pEncoder, &pFrame, NULL));
-  IFS(IWICBitmapFrameEncode_Initialize(pFrame, NULL));
-  IFS(IWICBitmapFrameEncode_SetSize(pFrame, width, height));
-  IFS(IWICBitmapFrameEncode_SetPixelFormat(pFrame, &pixel_format));
-  IFS(IWICBitmapFrameEncode_WritePixels(pFrame, height, stride,
-          height * stride, rgb));
-  IFS(IWICBitmapFrameEncode_Commit(pFrame));
-  IFS(IWICBitmapEncoder_Commit(pEncoder));
+  IFS(IWICBitmapEncoder_CreateNewFrame(encoder, &frame, NULL));
+  IFS(IWICBitmapFrameEncode_Initialize(frame, NULL));
+  IFS(IWICBitmapFrameEncode_SetSize(frame, width, height));
+  IFS(IWICBitmapFrameEncode_SetPixelFormat(frame, &pixel_format));
+  IFS(IWICBitmapFrameEncode_WritePixels(frame, height, stride,
+                                        height * stride, rgb));
+  IFS(IWICBitmapFrameEncode_Commit(frame));
+  IFS(IWICBitmapEncoder_Commit(encoder));

-  if (pFrame != NULL) IUnknown_Release(pFrame);
-  if (pEncoder != NULL) IUnknown_Release(pEncoder);
-  if (pFactory != NULL) IUnknown_Release(pFactory);
-  if (pStream != NULL) IUnknown_Release(pStream);
+  if (frame != NULL) IUnknown_Release(frame);
+  if (encoder != NULL) IUnknown_Release(encoder);
+  if (factory != NULL) IUnknown_Release(factory);
+  if (stream != NULL) IUnknown_Release(stream);
  return hr;
 }

@ -136,8 +144,8 @@ static int WritePNG(const char* out_file_name,
  const int has_alpha = (buffer->colorspace == MODE_BGRA);

  return SUCCEEDED(WriteUsingWIC(out_file_name,
-             MAKE_REFGUID(GUID_ContainerFormatPng), rgb, stride, width,
-             height, has_alpha));
+                                 MAKE_REFGUID(GUID_ContainerFormatPng),
+                                 rgb, stride, width, height, has_alpha));
 }

 #elif defined(WEBP_HAVE_PNG)    // !HAVE_WINCODEC_H
@ -185,28 +193,32 @@ static int WritePNG(FILE* out_file, const WebPDecBuffer* const buffer) {
  return 1;
 }
 #else    // !HAVE_WINCODEC_H && !WEBP_HAVE_PNG
-
-typedef uint32_t png_uint_32;
-
 static int WritePNG(FILE* out_file, const WebPDecBuffer* const buffer) {
  (void)out_file;
  (void)buffer;
-  printf("PNG support not compiled. Please install the libpng development "
-         "package before building.\n");
-  printf("You can run with -ppm flag to decode in PPM format.\n");
+  fprintf(stderr, "PNG support not compiled. Please install the libpng "
+          "development package before building.\n");
+  fprintf(stderr, "You can run with -ppm flag to decode in PPM format.\n");
  return 0;
 }
 #endif

-static int WritePPM(FILE* fout, const WebPDecBuffer* const buffer) {
+static int WritePPM(FILE* fout, const WebPDecBuffer* const buffer, int alpha) {
  const uint32_t width = buffer->width;
  const uint32_t height = buffer->height;
  const unsigned char* const rgb = buffer->u.RGBA.rgba;
  const int stride = buffer->u.RGBA.stride;
+  const size_t bytes_per_px = alpha ? 4 : 3;
  uint32_t y;
-  fprintf(fout, "P6\n%d %d\n255\n", width, height);
+
+  if (alpha) {
+    fprintf(fout, "P7\nWIDTH %d\nHEIGHT %d\nDEPTH 4\nMAXVAL 255\n"
+                  "TUPLTYPE RGB_ALPHA\nENDHDR\n", width, height);
+  } else {
+    fprintf(fout, "P6\n%d %d\n255\n", width, height);
+  }
  for (y = 0; y < height; ++y) {
-    if (fwrite(rgb + y * stride, width, 3, fout) != 3) {
+    if (fwrite(rgb + y * stride, width, bytes_per_px, fout) != bytes_per_px) {
      return 0;
    }
  }
@ -229,32 +241,50 @@ static int WriteAlphaPlane(FILE* fout, const WebPDecBuffer* const buffer) {
  return 1;
 }

-static int WritePGM(FILE* fout, const WebPDecBuffer* const buffer) {
+// format=PGM: save a grayscale PGM file using the IMC4 layout
+// (http://www.fourcc.org/yuv.php#IMC4). This is a very convenient format for
+// viewing the samples, esp. for odd dimensions.
+// format=YUV: just save the Y/U/V/A planes sequentially without header.
+static int WritePGMOrYUV(FILE* fout, const WebPDecBuffer* const buffer,
+                         OutputFileFormat format) {
  const int width = buffer->width;
  const int height = buffer->height;
  const WebPYUVABuffer* const yuv = &buffer->u.YUVA;
-  // Save a grayscale PGM file using the IMC4 layout
-  // (http://www.fourcc.org/yuv.php#IMC4). This is a very
-  // convenient format for viewing the samples, esp. for
-  // odd dimensions.
  int ok = 1;
  int y;
+  const int pad = (format == YUV) ? 0 : 1;
  const int uv_width = (width + 1) / 2;
  const int uv_height = (height + 1) / 2;
-  const int out_stride = (width + 1) & ~1;
+  const int out_stride = (width + pad) & ~pad;
  const int a_height = yuv->a ? height : 0;
-  fprintf(fout, "P5\n%d %d\n255\n", out_stride, height + uv_height + a_height);
+  if (format == PGM) {
+    fprintf(fout, "P5\n%d %d\n255\n",
+            out_stride, height + uv_height + a_height);
+  }
  for (y = 0; ok && y < height; ++y) {
    ok &= (fwrite(yuv->y + y * yuv->y_stride, width, 1, fout) == 1);
-    if (width & 1) fputc(0, fout);    // padding byte
+    if (format == PGM) {
+      if (width & 1) fputc(0, fout);    // padding byte
+    }
  }
-  for (y = 0; ok && y < uv_height; ++y) {
-    ok &= (fwrite(yuv->u + y * yuv->u_stride, uv_width, 1, fout) == 1);
-    ok &= (fwrite(yuv->v + y * yuv->v_stride, uv_width, 1, fout) == 1);
+  if (format == PGM) {   // IMC4 layout
+    for (y = 0; ok && y < uv_height; ++y) {
+      ok &= (fwrite(yuv->u + y * yuv->u_stride, uv_width, 1, fout) == 1);
+      ok &= (fwrite(yuv->v + y * yuv->v_stride, uv_width, 1, fout) == 1);
+    }
+  } else {
+    for (y = 0; ok && y < uv_height; ++y) {
+      ok &= (fwrite(yuv->u + y * yuv->u_stride, uv_width, 1, fout) == 1);
+    }
+    for (y = 0; ok && y < uv_height; ++y) {
+      ok &= (fwrite(yuv->v + y * yuv->v_stride, uv_width, 1, fout) == 1);
+    }
  }
  for (y = 0; ok && y < a_height; ++y) {
    ok &= (fwrite(yuv->a + y * yuv->a_stride, width, 1, fout) == 1);
-    if (width & 1) fputc(0, fout);    // padding byte
+    if (format == PGM) {
+      if (width & 1) fputc(0, fout);    // padding byte
+    }
  }
  return ok;
 }
@ -286,10 +316,12 @@ static void SaveOutput(const WebPDecBuffer* const buffer,
 #else
    ok &= WritePNG(fout, buffer);
 #endif
+  } else if (format == PAM) {
+    ok &= WritePPM(fout, buffer, 1);
  } else if (format == PPM) {
-    ok &= WritePPM(fout, buffer);
-  } else if (format == PGM) {
-    ok &= WritePGM(fout, buffer);
+    ok &= WritePPM(fout, buffer, 0);
+  } else if (format == PGM || format == YUV) {
+    ok &= WritePGMOrYUV(fout, buffer, format);
  } else if (format == ALPHA_PLANE_ONLY) {
    ok &= WriteAlphaPlane(fout, buffer);
  }
@ -299,8 +331,8 @@ static void SaveOutput(const WebPDecBuffer* const buffer,
  if (ok) {
    printf("Saved file %s\n", out_file);
    if (verbose) {
-      const double time = StopwatchReadAndReset(&stop_watch);
-      printf("Time to write output: %.3fs\n", time);
+      const double write_time = StopwatchReadAndReset(&stop_watch);
+      printf("Time to write output: %.3fs\n", write_time);
    }
  } else {
    fprintf(stderr, "Error writing file %s !!\n", out_file);
@ -311,9 +343,12 @@ static void Help(void) {
  printf("Usage: dwebp in_file [options] [-o out_file]\n\n"
         "Decodes the WebP image file to PNG format [Default]\n"
         "Use following options to convert into alternate image formats:\n"
-         "  -ppm ......... save the raw RGB samples as color PPM\n"
+         "  -pam ......... save the raw RGBA samples as a color PAM\n"
+         "  -ppm ......... save the raw RGB samples as a color PPM\n"
         "  -pgm ......... save the raw YUV samples as a grayscale PGM\n"
         "                 file with IMC4 layout.\n"
+         "  -yuv ......... save the raw YUV samples in flat layout.\n"
+         "\n"
         " Other options are:\n"
         "  -version  .... print version number and exit.\n"
         "  -nofancy ..... don't use the fancy YUV420 upscaler.\n"
@ -321,9 +356,7 @@ static void Help(void) {
         "  -mt .......... use multi-threading\n"
         "  -crop <x> <y> <w> <h> ... crop output with the given rectangle\n"
         "  -scale <w> <h> .......... scale the output (*after* any cropping)\n"
-#ifdef WEBP_EXPERIMENTAL_FEATURES
         "  -alpha ....... only save the alpha plane.\n"
-#endif
         "  -h     ....... this help message.\n"
         "  -v     ....... verbose (e.g. print encoding/decoding times)\n"
 #ifndef WEBP_DLL
@ -364,15 +397,19 @@ int main(int argc, const char *argv[]) {
      config.options.no_fancy_upsampling = 1;
    } else if (!strcmp(argv[c], "-nofilter")) {
      config.options.bypass_filtering = 1;
+    } else if (!strcmp(argv[c], "-pam")) {
+      format = PAM;
    } else if (!strcmp(argv[c], "-ppm")) {
      format = PPM;
    } else if (!strcmp(argv[c], "-version")) {
      const int version = WebPGetDecoderVersion();
      printf("%d.%d.%d\n",
-        (version >> 16) & 0xff, (version >> 8) & 0xff, version & 0xff);
+             (version >> 16) & 0xff, (version >> 8) & 0xff, version & 0xff);
      return 0;
    } else if (!strcmp(argv[c], "-pgm")) {
      format = PGM;
+    } else if (!strcmp(argv[c], "-yuv")) {
+      format = YUV;
    } else if (!strcmp(argv[c], "-mt")) {
      config.options.use_threads = 1;
    } else if (!strcmp(argv[c], "-crop") && c < argc - 4) {
@ -392,7 +429,7 @@ int main(int argc, const char *argv[]) {
      VP8GetCPUInfo = NULL;
 #endif
    } else if (argv[c][0] == '-') {
-      printf("Unknown option '%s'\n", argv[c]);
+      fprintf(stderr, "Unknown option '%s'\n", argv[c]);
      Help();
      return -1;
    } else {
@ -401,7 +438,7 @@ int main(int argc, const char *argv[]) {
  }

  if (in_file == NULL) {
-    printf("missing input file!!\n");
+    fprintf(stderr, "missing input file!!\n");
    Help();
    return -1;
  }
@ -410,35 +447,26 @@ int main(int argc, const char *argv[]) {
    Stopwatch stop_watch;
    VP8StatusCode status = VP8_STATUS_OK;
    int ok;
-    uint32_t data_size = 0;
-    void* data = NULL;
-    FILE* const in = fopen(in_file, "rb");
+    size_t data_size = 0;
+    const uint8_t* data = NULL;

-    if (!in) {
-      fprintf(stderr, "cannot open input file '%s'\n", in_file);
-      return 1;
-    }
-    fseek(in, 0, SEEK_END);
-    data_size = ftell(in);
-    fseek(in, 0, SEEK_SET);
-    data = malloc(data_size);
-    ok = (fread(data, data_size, 1, in) == 1);
-    fclose(in);
-    if (!ok) {
-      fprintf(stderr, "Could not read %d bytes of data from file %s\n",
-              data_size, in_file);
-      free(data);
-      return -1;
-    }
+    if (!ExUtilReadFile(in_file, &data, &data_size)) return -1;

    if (verbose)
      StopwatchReadAndReset(&stop_watch);

-    status = WebPGetFeatures((const uint8_t*)data, data_size, bitstream);
+    status = WebPGetFeatures(data, data_size, bitstream);
    if (status != VP8_STATUS_OK) {
      goto end;
    }

+    if (bitstream->has_animation) {
+      fprintf(stderr,
+              "Error! Decoding of an animated WebP file is not supported.\n"
+              "       Use webpmux to extract the individual frames or\n"
+              "       vwebp to view this image.\n");
+    }
+
    switch (format) {
      case PNG:
 #ifdef HAVE_WINCODEC_H
@ -447,27 +475,31 @@ int main(int argc, const char *argv[]) {
        output_buffer->colorspace = bitstream->has_alpha ? MODE_RGBA : MODE_RGB;
 #endif
        break;
+      case PAM:
+        output_buffer->colorspace = MODE_RGBA;
+        break;
      case PPM:
        output_buffer->colorspace = MODE_RGB;  // drops alpha for PPM
        break;
      case PGM:
+      case YUV:
        output_buffer->colorspace = bitstream->has_alpha ? MODE_YUVA : MODE_YUV;
        break;
      case ALPHA_PLANE_ONLY:
        output_buffer->colorspace = MODE_YUVA;
        break;
      default:
-        free(data);
+        free((void*)data);
        return -1;
    }
-    status = WebPDecode((const uint8_t*)data, data_size, &config);
+    status = WebPDecode(data, data_size, &config);

    if (verbose) {
-      const double time = StopwatchReadAndReset(&stop_watch);
-      printf("Time to decode picture: %.3fs\n", time);
+      const double decode_time = StopwatchReadAndReset(&stop_watch);
+      printf("Time to decode picture: %.3fs\n", decode_time);
    }
 end:
-    free(data);
+    free((void*)data);
    ok = (status == VP8_STATUS_OK);
    if (!ok) {
      fprintf(stderr, "Decoding of %s failed.\n", in_file);
@ -493,7 +525,3 @@ int main(int argc, const char *argv[]) {
 }

 //------------------------------------------------------------------------------
-
-#if defined(__cplusplus) || defined(c_plusplus)
-}    // extern "C"
-#endif
--- a/examples/example_util.c
+++ b/examples/example_util.c
@ -0,0 +1,79 @@
+// Copyright 2012 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+//  Utility functions used by the example programs.
+//
+
+#include "./example_util.h"
+#include <stdio.h>
+#include <stdlib.h>
+
+#if defined(__cplusplus) || defined(c_plusplus)
+extern "C" {
+#endif
+
+// -----------------------------------------------------------------------------
+// File I/O
+
+int ExUtilReadFile(const char* const file_name,
+                   const uint8_t** data, size_t* data_size) {
+  int ok;
+  void* file_data;
+  size_t file_size;
+  FILE* in;
+
+  if (file_name == NULL || data == NULL || data_size == NULL) return 0;
+  *data = NULL;
+  *data_size = 0;
+
+  in = fopen(file_name, "rb");
+  if (in == NULL) {
+    fprintf(stderr, "cannot open input file '%s'\n", file_name);
+    return 0;
+  }
+  fseek(in, 0, SEEK_END);
+  file_size = ftell(in);
+  fseek(in, 0, SEEK_SET);
+  file_data = malloc(file_size);
+  if (file_data == NULL) return 0;
+  ok = (fread(file_data, file_size, 1, in) == 1);
+  fclose(in);
+
+  if (!ok) {
+    fprintf(stderr, "Could not read %d bytes of data from file %s\n",
+            (int)file_size, file_name);
+    free(file_data);
+    return 0;
+  }
+  *data = (uint8_t*)file_data;
+  *data_size = file_size;
+  return 1;
+}
+
+int ExUtilWriteFile(const char* const file_name,
+                    const uint8_t* data, size_t data_size) {
+  int ok;
+  FILE* out;
+
+  if (file_name == NULL || data == NULL) {
+    return 0;
+  }
+  out = fopen(file_name, "wb");
+  if (out == NULL) {
+    fprintf(stderr, "Error! Cannot open output file '%s'\n", file_name);
+    return 0;
+  }
+  ok = (fwrite(data, data_size, 1, out) == 1);
+  fclose(out);
+  return ok;
+}
+
+#if defined(__cplusplus) || defined(c_plusplus)
+}    // extern "C"
+#endif
--- a/examples/example_util.h
+++ b/examples/example_util.h
@ -0,0 +1,36 @@
+// Copyright 2012 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+//  Utility functions used by the example programs.
+//
+
+#ifndef WEBP_EXAMPLES_EXAMPLE_UTIL_H_
+#define WEBP_EXAMPLES_EXAMPLE_UTIL_H_
+
+#include "webp/types.h"
+
+#if defined(__cplusplus) || defined(c_plusplus)
+extern "C" {
+#endif
+
+// Allocates storage for entire file 'file_name' and returns contents and size
+// in 'data' and 'data_size'. Returns 1 on success, 0 otherwise. '*data' should
+// be deleted using free().
+int ExUtilReadFile(const char* const file_name,
+                   const uint8_t** data, size_t* data_size);
+
+// Write a data segment into a file named 'file_name'. Returns true if ok.
+int ExUtilWriteFile(const char* const file_name,
+                    const uint8_t* data, size_t data_size);
+
+#if defined(__cplusplus) || defined(c_plusplus)
+}    // extern "C"
+#endif
+
+#endif  // WEBP_EXAMPLES_EXAMPLE_UTIL_H_
--- a/examples/gif2webp.c
+++ b/examples/gif2webp.c
@ -0,0 +1,536 @@
+// Copyright 2012 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+//  simple tool to convert animated GIFs to WebP
+//
+// Authors: Skal (pascal.massimino@gmail.com)
+//          Urvang (urvang@google.com)
+
+#include <assert.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include <gif_lib.h>
+#include "webp/encode.h"
+#include "webp/mux.h"
+#include "./example_util.h"
+
+#define GIF_TRANSPARENT_MASK 0x01
+#define GIF_DISPOSE_MASK     0x07
+#define GIF_DISPOSE_SHIFT    2
+#define TRANSPARENT_COLOR    0x00ffffff
+#define WHITE_COLOR          0xffffffff
+
+//------------------------------------------------------------------------------
+
+static int transparent_index = -1;  // No transparency by default.
+
+static void ClearPicture(WebPPicture* const picture, uint32_t color) {
+  int x, y;
+  for (y = 0; y < picture->height; ++y) {
+    uint32_t* const dst = picture->argb + y * picture->argb_stride;
+    for (x = 0; x < picture->width; ++x) dst[x] = color;
+  }
+}
+
+static void Remap(const uint8_t* const src, const GifFileType* const gif,
+                  uint32_t* dst, int len) {
+  int i;
+  const GifColorType* colors;
+  const ColorMapObject* const cmap =
+      gif->Image.ColorMap ? gif->Image.ColorMap : gif->SColorMap;
+  if (cmap == NULL) return;
+  colors = cmap->Colors;
+
+  for (i = 0; i < len; ++i) {
+    const GifColorType c = colors[src[i]];
+    dst[i] = (src[i] == transparent_index) ? TRANSPARENT_COLOR
+           : c.Blue | (c.Green << 8) | (c.Red << 16) | (0xff << 24);
+  }
+}
+
+static int ReadSubImage(GifFileType* gif, WebPPicture* pic, WebPPicture* view) {
+  const GifImageDesc image_desc = gif->Image;
+  const int offset_x = image_desc.Left;
+  const int offset_y = image_desc.Top;
+  const int sub_w = image_desc.Width;
+  const int sub_h = image_desc.Height;
+  uint32_t* dst = NULL;
+  uint8_t* tmp = NULL;
+  int ok = 0;
+
+  // Use a view for the sub-picture:
+  if (!WebPPictureView(pic, offset_x, offset_y, sub_w, sub_h, view)) {
+    fprintf(stderr, "Sub-image %dx%d at position %d,%d is invalid!\n",
+            sub_w, sub_h, offset_x, offset_y);
+    goto End;
+  }
+  dst = view->argb;
+
+  tmp = (uint8_t*)malloc(sub_w * sizeof(*tmp));
+  if (tmp == NULL) goto End;
+
+  if (image_desc.Interlace) {  // Interlaced image.
+    // We need 4 passes, with the following offsets and jumps.
+    const int interlace_offsets[] = { 0, 4, 2, 1 };
+    const int interlace_jumps[]   = { 8, 8, 4, 2 };
+    int pass;
+    for (pass = 0; pass < 4; ++pass) {
+      int y;
+      for (y = interlace_offsets[pass]; y < sub_h; y += interlace_jumps[pass]) {
+        if (DGifGetLine(gif, tmp, sub_w) == GIF_ERROR) goto End;
+        Remap(tmp, gif, dst + y * view->argb_stride, sub_w);
+      }
+    }
+  } else {  // Non-interlaced image.
+    int y;
+    for (y = 0; y < sub_h; ++y) {
+      if (DGifGetLine(gif, tmp, sub_w) == GIF_ERROR) goto End;
+      Remap(tmp, gif, dst + y * view->argb_stride, sub_w);
+    }
+  }
+  // re-align the view with even offset (and adjust dimensions if needed).
+  WebPPictureView(pic, offset_x & ~1, offset_y & ~1,
+                  sub_w + (offset_x & 1), sub_h + (offset_y & 1), view);
+  ok = 1;
+
+ End:
+  free(tmp);
+  return ok;
+}
+
+static int GetBackgroundColor(const ColorMapObject* const color_map,
+                              GifWord bgcolor_idx, uint32_t* const bgcolor) {
+  if (transparent_index != -1 && bgcolor_idx == transparent_index) {
+    *bgcolor = TRANSPARENT_COLOR;  // Special case.
+    return 1;
+  } else if (color_map == NULL || color_map->Colors == NULL
+             || bgcolor_idx >= color_map->ColorCount) {
+    return 0;  // Invalid color map or index.
+  } else {
+    const GifColorType color = color_map->Colors[bgcolor_idx];
+    *bgcolor = (0xff        << 24)
+             | (color.Red   << 16)
+             | (color.Green <<  8)
+             | (color.Blue  <<  0);
+    return 1;
+  }
+}
+
+static void DisplayGifError(const GifFileType* const gif, int gif_error) {
+  // GIFLIB_MAJOR is only defined in libgif >= 4.2.0.
+  // libgif 4.2.0 has retired PrintGifError() and added GifErrorString().
+#if defined(GIFLIB_MAJOR) && defined(GIFLIB_MINOR) && \
+        ((GIFLIB_MAJOR == 4 && GIFLIB_MINOR >= 2) || GIFLIB_MAJOR > 4)
+#if GIFLIB_MAJOR >= 5
+    // Static string actually, hence the const char* cast.
+    const char* error_str = (const char*)GifErrorString(
+        (gif == NULL) ? gif_error : gif->Error);
+#else
+    const char* error_str = (const char*)GifErrorString();
+    (void)gif;
+#endif
+    if (error_str == NULL) error_str = "Unknown error";
+    fprintf(stderr, "GIFLib Error %d: %s\n", gif_error, error_str);
+#else
+    (void)gif;
+    fprintf(stderr, "GIFLib Error %d: ", gif_error);
+    PrintGifError();
+    fprintf(stderr, "\n");
+#endif
+}
+
+static const char* const kErrorMessages[] = {
+  "WEBP_MUX_NOT_FOUND", "WEBP_MUX_INVALID_ARGUMENT", "WEBP_MUX_BAD_DATA",
+  "WEBP_MUX_MEMORY_ERROR", "WEBP_MUX_NOT_ENOUGH_DATA"
+};
+
+static const char* ErrorString(WebPMuxError err) {
+  assert(err <= WEBP_MUX_NOT_FOUND && err >= WEBP_MUX_NOT_ENOUGH_DATA);
+  return kErrorMessages[-err];
+}
+
+//------------------------------------------------------------------------------
+
+static void Help(void) {
+  printf("Usage:\n");
+  printf(" gif2webp [options] gif_file -o webp_file\n");
+  printf("options:\n");
+  printf("  -h / -help  ............ this help\n");
+  printf("  -lossy ................. Encode image using lossy compression.\n");
+  printf("  -q <float> ............. quality factor (0:small..100:big)\n");
+  printf("  -m <int> ............... compression method (0=fast, 6=slowest)\n");
+  printf("  -f <int> ............... filter strength (0=off..100)\n");
+  printf("\n");
+  printf("  -version ............... print version number and exit.\n");
+  printf("  -v ..................... verbose.\n");
+  printf("  -quiet ................. don't print anything.\n");
+  printf("\n");
+}
+
+//------------------------------------------------------------------------------
+
+int main(int argc, const char *argv[]) {
+  int verbose = 0;
+  int gif_error = GIF_ERROR;
+  WebPMuxError err = WEBP_MUX_OK;
+  int ok = 0;
+  const char *in_file = NULL, *out_file = NULL;
+  FILE* out = NULL;
+  GifFileType* gif = NULL;
+  WebPPicture picture;
+  WebPMuxFrameInfo frame;
+  WebPMuxAnimParams anim = { WHITE_COLOR, 0 };
+
+  int is_first_frame = 1;
+  int done;
+  int c;
+  int quiet = 0;
+  WebPConfig config;
+  WebPMux* mux = NULL;
+  WebPData webp_data = { NULL, 0 };
+  int stored_icc = 0;  // Whether we have already stored an ICC profile.
+  int stored_xmp = 0;
+
+  memset(&frame, 0, sizeof(frame));
+  frame.id = WEBP_CHUNK_ANMF;
+  frame.dispose_method = WEBP_MUX_DISPOSE_BACKGROUND;
+
+  if (!WebPConfigInit(&config) || !WebPPictureInit(&picture)) {
+    fprintf(stderr, "Error! Version mismatch!\n");
+    return -1;
+  }
+  config.lossless = 1;  // Use lossless compression by default.
+
+  if (argc == 1) {
+    Help();
+    return 0;
+  }
+
+  for (c = 1; c < argc; ++c) {
+    if (!strcmp(argv[c], "-h") || !strcmp(argv[c], "-help")) {
+      Help();
+      return 0;
+    } else if (!strcmp(argv[c], "-o") && c < argc - 1) {
+      out_file = argv[++c];
+    } else if (!strcmp(argv[c], "-lossy")) {
+      config.lossless = 0;
+    } else if (!strcmp(argv[c], "-q") && c < argc - 1) {
+      config.quality = (float)strtod(argv[++c], NULL);
+    } else if (!strcmp(argv[c], "-m") && c < argc - 1) {
+      config.method = strtol(argv[++c], NULL, 0);
+    } else if (!strcmp(argv[c], "-f") && c < argc - 1) {
+      config.filter_strength = strtol(argv[++c], NULL, 0);
+    } else if (!strcmp(argv[c], "-version")) {
+      const int enc_version = WebPGetEncoderVersion();
+      const int mux_version = WebPGetMuxVersion();
+      printf("WebP Encoder version: %d.%d.%d\nWebP Mux version: %d.%d.%d\n",
+             (enc_version >> 16) & 0xff, (enc_version >> 8) & 0xff,
+             enc_version & 0xff, (mux_version >> 16) & 0xff,
+             (mux_version >> 8) & 0xff, mux_version & 0xff);
+      return 0;
+    } else if (!strcmp(argv[c], "-quiet")) {
+      quiet = 1;
+    } else if (!strcmp(argv[c], "-v")) {
+      verbose = 1;
+    } else if (argv[c][0] == '-') {
+      fprintf(stderr, "Error! Unknown option '%s'\n", argv[c]);
+      Help();
+      return -1;
+    } else {
+      in_file = argv[c];
+    }
+  }
+  if (!WebPValidateConfig(&config)) {
+    fprintf(stderr, "Error! Invalid configuration.\n");
+    goto End;
+  }
+
+  if (in_file == NULL) {
+    fprintf(stderr, "No input file specified!\n");
+    Help();
+    goto End;
+  }
+
+  // Start the decoder object
+#if defined(GIFLIB_MAJOR) && (GIFLIB_MAJOR >= 5)
+  // There was an API change in version 5.0.0.
+  gif = DGifOpenFileName(in_file, &gif_error);
+#else
+  gif = DGifOpenFileName(in_file);
+#endif
+  if (gif == NULL) goto End;
+
+  // Allocate picture buffer
+  picture.width = gif->SWidth;
+  picture.height = gif->SHeight;
+  picture.use_argb = 1;
+    if (!WebPPictureAlloc(&picture)) goto End;
+
+  mux = WebPMuxNew();
+  if (mux == NULL) {
+    fprintf(stderr, "ERROR: could not create a mux object.\n");
+    goto End;
+  }
+
+  // Loop over GIF images
+  done = 0;
+  do {
+    GifRecordType type;
+    if (DGifGetRecordType(gif, &type) == GIF_ERROR) goto End;
+
+    switch (type) {
+      case IMAGE_DESC_RECORD_TYPE: {
+        WebPPicture sub_image;
+        WebPMemoryWriter memory;
+
+        if (frame.dispose_method == WEBP_MUX_DISPOSE_BACKGROUND) {
+          ClearPicture(&picture, anim.bgcolor);
+        }
+
+        if (!DGifGetImageDesc(gif)) goto End;
+        if (!ReadSubImage(gif, &picture, &sub_image)) goto End;
+
+        if (!config.lossless) {
+          // We need to call BGRA variant because of the way we do Remap(). Note
+          // that 'sub_image' will no longer be a view and own some memory.
+          WebPPictureImportBGRA(
+              &sub_image, (uint8_t*)sub_image.argb,
+              sub_image.argb_stride * sizeof(*sub_image.argb));
+          sub_image.use_argb = 0;
+        } else {
+          sub_image.use_argb = 1;
+        }
+
+        sub_image.writer = WebPMemoryWrite;
+        sub_image.custom_ptr = &memory;
+        WebPMemoryWriterInit(&memory);
+        if (!WebPEncode(&config, &sub_image)) {
+          fprintf(stderr, "Error! Cannot encode picture as WebP\n");
+          fprintf(stderr, "Error code: %d\n", sub_image.error_code);
+          goto End;
+        }
+
+        // Now we have all the info about the frame, as a Graphic Control
+        // Extension Block always appears before the Image Descriptor Block.
+        // So add the frame to mux.
+        frame.x_offset = gif->Image.Left & ~1;
+        frame.y_offset = gif->Image.Top & ~1;
+        frame.bitstream.bytes = memory.mem;
+        frame.bitstream.size = memory.size;
+        err = WebPMuxPushFrame(mux, &frame, 1);
+        if (err != WEBP_MUX_OK) {
+          fprintf(stderr, "ERROR (%s): Could not add animation frame.\n",
+                  ErrorString(err));
+          goto End;
+        }
+        if (verbose) {
+          printf("Added frame %dx%d (offset:%d,%d duration:%d) ",
+                 sub_image.width, sub_image.height,
+                 frame.x_offset, frame.y_offset,
+                 frame.duration);
+          printf("dispose:%d transparent index:%d\n",
+                 frame.dispose_method, transparent_index);
+        }
+        WebPDataClear(&frame.bitstream);
+        WebPPictureFree(&sub_image);
+        break;
+      }
+      case EXTENSION_RECORD_TYPE: {
+        int extension;
+        GifByteType *data = NULL;
+        if (DGifGetExtension(gif, &extension, &data) == GIF_ERROR) {
+          goto End;
+        }
+        switch (extension) {
+          case COMMENT_EXT_FUNC_CODE: {
+            break;  // Do nothing for now.
+          }
+          case GRAPHICS_EXT_FUNC_CODE: {
+            const int flags = data[1];
+            const int dispose = (flags >> GIF_DISPOSE_SHIFT) & GIF_DISPOSE_MASK;
+            const int delay = data[2] | (data[3] << 8);  // In 10 ms units.
+            if (data[0] != 4) goto End;
+            frame.duration = delay * 10;  // Duration is in 1 ms units for WebP.
+            if (dispose == 3) {
+              fprintf(stderr, "WARNING: GIF_DISPOSE_RESTORE not supported.");
+              // failsafe. TODO(urvang): emulate the correct behaviour by
+              // recoding the whole frame.
+              frame.dispose_method = WEBP_MUX_DISPOSE_BACKGROUND;
+            } else {
+              frame.dispose_method =
+                  (dispose == 2) ? WEBP_MUX_DISPOSE_BACKGROUND
+                                 : WEBP_MUX_DISPOSE_NONE;
+            }
+            transparent_index = (flags & GIF_TRANSPARENT_MASK) ? data[4] : -1;
+            if (is_first_frame) {
+              if (!GetBackgroundColor(gif->SColorMap, gif->SBackGroundColor,
+                                      &anim.bgcolor)) {
+                fprintf(stderr, "GIF decode warning: invalid background color "
+                        "index. Assuming white background.\n");
+              }
+              ClearPicture(&picture, anim.bgcolor);
+              is_first_frame = 0;
+            }
+            break;
+          }
+          case PLAINTEXT_EXT_FUNC_CODE: {
+            break;
+          }
+          case APPLICATION_EXT_FUNC_CODE: {
+            if (data[0] != 11) break;    // Chunk is too short
+            if (!memcmp(data + 1, "NETSCAPE2.0", 11)) {
+              // Recognize and parse Netscape2.0 NAB extension for loop count.
+              if (DGifGetExtensionNext(gif, &data) == GIF_ERROR) goto End;
+              if (data == NULL) goto End;  // Loop count sub-block missing.
+              if (data[0] != 3 && data[1] != 1) break;   // wrong size/marker
+              anim.loop_count = data[2] | (data[3] << 8);
+              if (verbose) printf("Loop count: %d\n", anim.loop_count);
+            } else {  // An extension containing metadata.
+              // We only store the first encountered chunk of each type.
+              const int is_xmp =
+                  !stored_xmp && !memcmp(data + 1, "XMP DataXMP", 11);
+              const int is_icc =
+                  !stored_icc && !memcmp(data + 1, "ICCRGBG1012", 11);
+              if (is_xmp || is_icc) {
+                const char* const fourccs[2] = { "XMP " , "ICCP" };
+                const char* const features[2] = { "XMP" , "ICC" };
+                WebPData metadata = { NULL, 0 };
+                // Construct metadata from sub-blocks.
+                // Usual case (including ICC profile): In each sub-block, the
+                // first byte specifies its size in bytes (0 to 255) and the
+                // rest of the bytes contain the data.
+                // Special case for XMP data: In each sub-block, the first byte
+                // is also part of the XMP payload. XMP in GIF also has a 257
+                // byte padding data. See the XMP specification for details.
+                while (1) {
+                  WebPData prev_metadata = metadata;
+                  WebPData subblock;
+                  if (DGifGetExtensionNext(gif, &data) == GIF_ERROR) {
+                    WebPDataClear(&metadata);
+                    goto End;
+                  }
+                  if (data == NULL) break;  // Finished.
+                  subblock.size = is_xmp ? data[0] + 1 : data[0];
+                  assert(subblock.size > 0);
+                  subblock.bytes = is_xmp ? data : data + 1;
+                  metadata.bytes =
+                      (uint8_t*)realloc((void*)metadata.bytes,
+                                        prev_metadata.size + subblock.size);
+                  if (metadata.bytes == NULL) {
+                    WebPDataClear(&prev_metadata);
+                    goto End;
+                  }
+                  metadata.size += subblock.size;
+                  memcpy((void*)(metadata.bytes + prev_metadata.size),
+                         subblock.bytes, subblock.size);
+                }
+                if (is_xmp) {
+                  // XMP padding data is 0x01, 0xff, 0xfe ... 0x01, 0x00.
+                  const size_t xmp_pading_size = 257;
+                  if (metadata.size > xmp_pading_size) {
+                    metadata.size -= xmp_pading_size;
+                  }
+                }
+
+                // Add metadata chunk.
+                err = WebPMuxSetChunk(mux, fourccs[is_icc], &metadata, 1);
+                if (verbose) {
+                  printf("%s size: %d\n", features[is_icc], (int)metadata.size);
+                }
+                WebPDataClear(&metadata);
+                if (err != WEBP_MUX_OK) {
+                  fprintf(stderr, "ERROR (%s): Could not set %s chunk.\n",
+                          ErrorString(err), features[is_icc]);
+                  goto End;
+                }
+                if (is_icc) {
+                  stored_icc = 1;
+                } else if (is_xmp) {
+                  stored_xmp = 1;
+                }
+              }
+            }
+            break;
+          }
+          default: {
+            break;  // skip
+          }
+        }
+        while (data != NULL) {
+          if (DGifGetExtensionNext(gif, &data) == GIF_ERROR) goto End;
+        }
+        break;
+      }
+      case TERMINATE_RECORD_TYPE: {
+        done = 1;
+        break;
+      }
+      default: {
+        if (verbose) {
+          fprintf(stderr, "Skipping over unknown record type %d\n", type);
+        }
+        break;
+      }
+    }
+  } while (!done);
+
+  // Finish muxing
+  err = WebPMuxSetAnimationParams(mux, &anim);
+  if (err != WEBP_MUX_OK) {
+    fprintf(stderr, "ERROR (%s): Could not set animation parameters.\n",
+            ErrorString(err));
+    goto End;
+  }
+
+  err = WebPMuxAssemble(mux, &webp_data);
+  if (err != WEBP_MUX_OK) {
+    fprintf(stderr, "ERROR (%s) assembling the WebP file.\n", ErrorString(err));
+    goto End;
+  }
+  if (out_file != NULL) {
+    if (!ExUtilWriteFile(out_file, webp_data.bytes, webp_data.size)) {
+      fprintf(stderr, "Error writing output file: %s\n", out_file);
+      goto End;
+    }
+    if (!quiet) {
+      printf("Saved output file: %s\n", out_file);
+    }
+  } else {
+    if (!quiet) {
+      printf("Nothing written; use -o flag to save the result.\n");
+    }
+  }
+
+  // All OK.
+  ok = 1;
+  gif_error = GIF_OK;
+
+ End:
+  WebPDataClear(&webp_data);
+  WebPMuxDelete(mux);
+  WebPPictureFree(&picture);
+  if (out != NULL && out_file != NULL) fclose(out);
+
+  if (gif_error != GIF_OK) {
+    DisplayGifError(gif, gif_error);
+  }
+  if (gif != NULL) {
+    DGifCloseFile(gif);
+  }
+
+  return !ok;
+}
+
+//------------------------------------------------------------------------------
--- a/examples/jpegdec.c
+++ b/examples/jpegdec.c
@ -0,0 +1,294 @@
+// Copyright 2012 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// JPEG decode.
+
+#include "./jpegdec.h"
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include <stdio.h>
+
+#ifdef WEBP_HAVE_JPEG
+#include <jpeglib.h>
+#include <setjmp.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "webp/encode.h"
+#include "./metadata.h"
+
+// -----------------------------------------------------------------------------
+// Metadata processing
+
+#ifndef JPEG_APP1
+# define JPEG_APP1 (JPEG_APP0 + 1)
+#endif
+#ifndef JPEG_APP2
+# define JPEG_APP2 (JPEG_APP0 + 2)
+#endif
+
+typedef struct {
+  const uint8_t* data;
+  size_t data_length;
+  int seq;  // this segment's sequence number [1, 255] for use in reassembly.
+} ICCPSegment;
+
+static void SaveMetadataMarkers(j_decompress_ptr dinfo) {
+  const unsigned int max_marker_length = 0xffff;
+  jpeg_save_markers(dinfo, JPEG_APP1, max_marker_length);  // Exif/XMP
+  jpeg_save_markers(dinfo, JPEG_APP2, max_marker_length);  // ICC profile
+}
+
+static int CompareICCPSegments(const void* a, const void* b) {
+  const ICCPSegment* s1 = (const ICCPSegment*)a;
+  const ICCPSegment* s2 = (const ICCPSegment*)b;
+  return s1->seq - s2->seq;
+}
+
+// Extract ICC profile segments from the marker list in 'dinfo', reassembling
+// and storing them in 'iccp'.
+// Returns true on success and false for memory errors and corrupt profiles.
+static int StoreICCP(j_decompress_ptr dinfo, MetadataPayload* const iccp) {
+  // ICC.1:2010-12 (4.3.0.0) Annex B.4 Embedding ICC Profiles in JPEG files
+  static const char kICCPSignature[] = "ICC_PROFILE";
+  static const size_t kICCPSignatureLength = 12;  // signature includes '\0'
+  static const size_t kICCPSkipLength = 14;  // signature + seq & count
+  int expected_count = 0;
+  int actual_count = 0;
+  int seq_max = 0;
+  size_t total_size = 0;
+  ICCPSegment iccp_segments[255];
+  jpeg_saved_marker_ptr marker;
+
+  memset(iccp_segments, 0, sizeof(iccp_segments));
+  for (marker = dinfo->marker_list; marker != NULL; marker = marker->next) {
+    if (marker->marker == JPEG_APP2 &&
+        marker->data_length > kICCPSkipLength &&
+        !memcmp(marker->data, kICCPSignature, kICCPSignatureLength)) {
+      // ICC_PROFILE\0<seq><count>; 'seq' starts at 1.
+      const int seq = marker->data[kICCPSignatureLength];
+      const int count = marker->data[kICCPSignatureLength + 1];
+      const size_t segment_size = marker->data_length - kICCPSkipLength;
+      ICCPSegment* segment;
+
+      if (segment_size == 0 || count == 0 || seq == 0) {
+        fprintf(stderr, "[ICCP] size (%d) / count (%d) / sequence number (%d)"
+                        " cannot be 0!\n",
+                (int)segment_size, seq, count);
+        return 0;
+      }
+
+      if (expected_count == 0) {
+        expected_count = count;
+      } else if (expected_count != count) {
+        fprintf(stderr, "[ICCP] Inconsistent segment count (%d / %d)!\n",
+                expected_count, count);
+        return 0;
+      }
+
+      segment = iccp_segments + seq - 1;
+      if (segment->data_length != 0) {
+        fprintf(stderr, "[ICCP] Duplicate segment number (%d)!\n" , seq);
+        return 0;
+      }
+
+      segment->data = marker->data + kICCPSkipLength;
+      segment->data_length = segment_size;
+      segment->seq = seq;
+      total_size += segment_size;
+      if (seq > seq_max) seq_max = seq;
+      ++actual_count;
+    }
+  }
+
+  if (actual_count == 0) return 1;
+  if (seq_max != actual_count) {
+    fprintf(stderr, "[ICCP] Discontinuous segments, expected: %d actual: %d!\n",
+            actual_count, seq_max);
+    return 0;
+  }
+  if (expected_count != actual_count) {
+    fprintf(stderr, "[ICCP] Segment count: %d does not match expected: %d!\n",
+            actual_count, expected_count);
+    return 0;
+  }
+
+  // The segments may appear out of order in the file, sort them based on
+  // sequence number before assembling the payload.
+  qsort(iccp_segments, actual_count, sizeof(*iccp_segments),
+        CompareICCPSegments);
+
+  iccp->bytes = (uint8_t*)malloc(total_size);
+  if (iccp->bytes == NULL) return 0;
+  iccp->size = total_size;
+
+  {
+    int i;
+    size_t offset = 0;
+    for (i = 0; i < seq_max; ++i) {
+      memcpy(iccp->bytes + offset,
+             iccp_segments[i].data, iccp_segments[i].data_length);
+      offset += iccp_segments[i].data_length;
+    }
+  }
+  return 1;
+}
+
+// Returns true on success and false for memory errors and corrupt profiles.
+// The caller must use MetadataFree() on 'metadata' in all cases.
+static int ExtractMetadataFromJPEG(j_decompress_ptr dinfo,
+                                   Metadata* const metadata) {
+  static const struct {
+    int marker;
+    const char* signature;
+    size_t signature_length;
+    size_t storage_offset;
+  } kJPEGMetadataMap[] = {
+    // Exif 2.2 Section 4.7.2 Interoperability Structure of APP1 ...
+    { JPEG_APP1, "Exif\0",                        6, METADATA_OFFSET(exif) },
+    // XMP Specification Part 3 Section 3 Embedding XMP Metadata ... #JPEG
+    // TODO(jzern) Add support for 'ExtendedXMP'
+    { JPEG_APP1, "http://ns.adobe.com/xap/1.0/", 29, METADATA_OFFSET(xmp) },
+    { 0, NULL, 0, 0 },
+  };
+  jpeg_saved_marker_ptr marker;
+  // Treat ICC profiles separately as they may be segmented and out of order.
+  if (!StoreICCP(dinfo, &metadata->iccp)) return 0;
+
+  for (marker = dinfo->marker_list; marker != NULL; marker = marker->next) {
+    int i;
+    for (i = 0; kJPEGMetadataMap[i].marker != 0; ++i) {
+      if (marker->marker == kJPEGMetadataMap[i].marker &&
+          marker->data_length > kJPEGMetadataMap[i].signature_length &&
+          !memcmp(marker->data, kJPEGMetadataMap[i].signature,
+                  kJPEGMetadataMap[i].signature_length)) {
+        MetadataPayload* const payload =
+            (MetadataPayload*)((uint8_t*)metadata +
+                               kJPEGMetadataMap[i].storage_offset);
+
+        if (payload->bytes == NULL) {
+          const char* marker_data = (const char*)marker->data +
+                                    kJPEGMetadataMap[i].signature_length;
+          const size_t marker_data_length =
+              marker->data_length - kJPEGMetadataMap[i].signature_length;
+          if (!MetadataCopy(marker_data, marker_data_length, payload)) return 0;
+        } else {
+          fprintf(stderr, "Ignoring additional '%s' marker\n",
+                  kJPEGMetadataMap[i].signature);
+        }
+      }
+    }
+  }
+  return 1;
+}
+
+#undef JPEG_APP1
+#undef JPEG_APP2
+
+// -----------------------------------------------------------------------------
+// JPEG decoding
+
+struct my_error_mgr {
+  struct jpeg_error_mgr pub;
+  jmp_buf setjmp_buffer;
+};
+
+static void my_error_exit(j_common_ptr dinfo) {
+  struct my_error_mgr* myerr = (struct my_error_mgr*)dinfo->err;
+  dinfo->err->output_message(dinfo);
+  longjmp(myerr->setjmp_buffer, 1);
+}
+
+int ReadJPEG(FILE* in_file, WebPPicture* const pic, Metadata* const metadata) {
+  int ok = 0;
+  int stride, width, height;
+  struct jpeg_decompress_struct dinfo;
+  struct my_error_mgr jerr;
+  uint8_t* rgb = NULL;
+  JSAMPROW buffer[1];
+
+  dinfo.err = jpeg_std_error(&jerr.pub);
+  jerr.pub.error_exit = my_error_exit;
+
+  if (setjmp(jerr.setjmp_buffer)) {
+ Error:
+    MetadataFree(metadata);
+    jpeg_destroy_decompress(&dinfo);
+    goto End;
+  }
+
+  jpeg_create_decompress(&dinfo);
+  jpeg_stdio_src(&dinfo, in_file);
+  if (metadata != NULL) SaveMetadataMarkers(&dinfo);
+  jpeg_read_header(&dinfo, TRUE);
+
+  dinfo.out_color_space = JCS_RGB;
+  dinfo.dct_method = JDCT_IFAST;
+  dinfo.do_fancy_upsampling = TRUE;
+
+  jpeg_start_decompress(&dinfo);
+
+  if (dinfo.output_components != 3) {
+    goto Error;
+  }
+
+  width = dinfo.output_width;
+  height = dinfo.output_height;
+  stride = dinfo.output_width * dinfo.output_components * sizeof(*rgb);
+
+  rgb = (uint8_t*)malloc(stride * height);
+  if (rgb == NULL) {
+    goto End;
+  }
+  buffer[0] = (JSAMPLE*)rgb;
+
+  while (dinfo.output_scanline < dinfo.output_height) {
+    if (jpeg_read_scanlines(&dinfo, buffer, 1) != 1) {
+      goto End;
+    }
+    buffer[0] += stride;
+  }
+
+  if (metadata != NULL) {
+    ok = ExtractMetadataFromJPEG(&dinfo, metadata);
+    if (!ok) {
+      fprintf(stderr, "Error extracting JPEG metadata!\n");
+      goto Error;
+    }
+  }
+
+  jpeg_finish_decompress(&dinfo);
+  jpeg_destroy_decompress(&dinfo);
+
+  // WebP conversion.
+  pic->width = width;
+  pic->height = height;
+  ok = WebPPictureImportRGB(pic, rgb, stride);
+  if (!ok) goto Error;
+
+ End:
+  free(rgb);
+  return ok;
+}
+#else  // !WEBP_HAVE_JPEG
+int ReadJPEG(FILE* in_file, struct WebPPicture* const pic,
+             struct Metadata* const metadata) {
+  (void)in_file;
+  (void)pic;
+  (void)metadata;
+  fprintf(stderr, "JPEG support not compiled. Please install the libjpeg "
+          "development package before building.\n");
+  return 0;
+}
+#endif  // WEBP_HAVE_JPEG
+
+// -----------------------------------------------------------------------------
--- a/examples/jpegdec.h
+++ b/examples/jpegdec.h
@ -0,0 +1,35 @@
+// Copyright 2012 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// JPEG decode.
+
+#ifndef WEBP_EXAMPLES_JPEGDEC_H_
+#define WEBP_EXAMPLES_JPEGDEC_H_
+
+#include <stdio.h>
+#include "webp/types.h"
+
+#if defined(__cplusplus) || defined(c_plusplus)
+extern "C" {
+#endif
+
+struct Metadata;
+struct WebPPicture;
+
+// Reads a JPEG from 'in_file', returning the decoded output in 'pic'.
+// The output is RGB.
+// Returns true on success.
+int ReadJPEG(FILE* in_file, struct WebPPicture* const pic,
+             struct Metadata* const metadata);
+
+#if defined(__cplusplus) || defined(c_plusplus)
+}    // extern "C"
+#endif
+
+#endif  // WEBP_EXAMPLES_JPEGDEC_H_
--- a/examples/metadata.c
+++ b/examples/metadata.c
@ -0,0 +1,49 @@
+// Copyright 2012 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+//  Metadata types and functions.
+//
+
+#include "./metadata.h"
+
+#include <stdlib.h>
+#include <string.h>
+
+#include "webp/types.h"
+
+void MetadataInit(Metadata* const metadata) {
+  if (metadata == NULL) return;
+  memset(metadata, 0, sizeof(*metadata));
+}
+
+void MetadataPayloadDelete(MetadataPayload* const payload) {
+  if (payload == NULL) return;
+  free(payload->bytes);
+  payload->bytes = NULL;
+  payload->size = 0;
+}
+
+void MetadataFree(Metadata* const metadata) {
+  if (metadata == NULL) return;
+  MetadataPayloadDelete(&metadata->exif);
+  MetadataPayloadDelete(&metadata->iccp);
+  MetadataPayloadDelete(&metadata->xmp);
+}
+
+int MetadataCopy(const char* metadata, size_t metadata_len,
+                 MetadataPayload* const payload) {
+  if (metadata == NULL || metadata_len == 0 || payload == NULL) return 0;
+  payload->bytes = (uint8_t*)malloc(metadata_len);
+  if (payload->bytes == NULL) return 0;
+  payload->size = metadata_len;
+  memcpy(payload->bytes, metadata, metadata_len);
+  return 1;
+}
+
+// -----------------------------------------------------------------------------
--- a/examples/metadata.h
+++ b/examples/metadata.h
@ -0,0 +1,47 @@
+// Copyright 2012 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+//  Metadata types and functions.
+//
+
+#ifndef WEBP_EXAMPLES_METADATA_H_
+#define WEBP_EXAMPLES_METADATA_H_
+
+#include "webp/types.h"
+
+#if defined(__cplusplus) || defined(c_plusplus)
+extern "C" {
+#endif
+
+typedef struct MetadataPayload {
+  uint8_t* bytes;
+  size_t size;
+} MetadataPayload;
+
+typedef struct Metadata {
+  MetadataPayload exif;
+  MetadataPayload iccp;
+  MetadataPayload xmp;
+} Metadata;
+
+#define METADATA_OFFSET(x) offsetof(Metadata, x)
+
+void MetadataInit(Metadata* const metadata);
+void MetadataPayloadDelete(MetadataPayload* const payload);
+void MetadataFree(Metadata* const metadata);
+
+// Stores 'metadata' to 'payload->bytes', returns false on allocation error.
+int MetadataCopy(const char* metadata, size_t metadata_len,
+                 MetadataPayload* const payload);
+
+#if defined(__cplusplus) || defined(c_plusplus)
+}    // extern "C"
+#endif
+
+#endif  // WEBP_EXAMPLES_METADATA_H_
--- a/examples/pngdec.c
+++ b/examples/pngdec.c
@ -0,0 +1,297 @@
+// Copyright 2012 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// PNG decode.
+
+#include "./pngdec.h"
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include <stdio.h>
+
+#ifdef WEBP_HAVE_PNG
+#include <png.h>
+#include <setjmp.h>   // note: this must be included *after* png.h
+#include <stdlib.h>
+
+#include "webp/encode.h"
+#include "./metadata.h"
+
+static void PNGAPI error_function(png_structp png, png_const_charp dummy) {
+  (void)dummy;  // remove variable-unused warning
+  longjmp(png_jmpbuf(png), 1);
+}
+
+// Converts the NULL terminated 'hexstring' which contains 2-byte character
+// representations of hex values to raw data.
+// 'hexstring' may contain values consisting of [A-F][a-f][0-9] in pairs,
+// e.g., 7af2..., separated by any number of newlines.
+// 'expected_length' is the anticipated processed size.
+// On success the raw buffer is returned with its length equivalent to
+// 'expected_length'. NULL is returned if the processed length is less than
+// 'expected_length' or any character aside from those above is encountered.
+// The returned buffer must be freed by the caller.
+static uint8_t* HexStringToBytes(const char* hexstring,
+                                 size_t expected_length) {
+  const char* src = hexstring;
+  size_t actual_length = 0;
+  uint8_t* const raw_data = (uint8_t*)malloc(expected_length);
+  uint8_t* dst;
+
+  if (raw_data == NULL) return NULL;
+
+  for (dst = raw_data; actual_length < expected_length && *src != '\0'; ++src) {
+    char* end;
+    char val[3];
+    if (*src == '\n') continue;
+    val[0] = *src++;
+    val[1] = *src;
+    val[2] = '\0';
+    *dst++ = (uint8_t)strtol(val, &end, 16);
+    if (end != val + 2) break;
+    ++actual_length;
+  }
+
+  if (actual_length != expected_length) {
+    free(raw_data);
+    return NULL;
+  }
+  return raw_data;
+}
+
+static int ProcessRawProfile(const char* profile, size_t profile_len,
+                             MetadataPayload* const payload) {
+  const char* src = profile;
+  char* end;
+  int expected_length;
+
+  if (profile == NULL || profile_len == 0) return 0;
+
+  // ImageMagick formats 'raw profiles' as
+  // '\n<name>\n<length>(%8lu)\n<hex payload>\n'.
+  if (*src != '\n') {
+    fprintf(stderr, "Malformed raw profile, expected '\\n' got '\\x%.2X'\n",
+            *src);
+    return 0;
+  }
+  ++src;
+  // skip the profile name and extract the length.
+  while (*src != '\0' && *src++ != '\n') {}
+  expected_length = (int)strtol(src, &end, 10);
+  if (*end != '\n') {
+    fprintf(stderr, "Malformed raw profile, expected '\\n' got '\\x%.2X'\n",
+            *end);
+    return 0;
+  }
+  ++end;
+
+  // 'end' now points to the profile payload.
+  payload->bytes = HexStringToBytes(end, expected_length);
+  if (payload->bytes == NULL) return 0;
+  payload->size = expected_length;
+  return 1;
+}
+
+static const struct {
+  const char* name;
+  int (*process)(const char* profile, size_t profile_len,
+                 MetadataPayload* const payload);
+  size_t storage_offset;
+} kPNGMetadataMap[] = {
+  // http://www.sno.phy.queensu.ca/~phil/exiftool/TagNames/PNG.html#TextualData
+  // See also: ExifTool on CPAN.
+  { "Raw profile type exif", ProcessRawProfile, METADATA_OFFSET(exif) },
+  { "Raw profile type xmp",  ProcessRawProfile, METADATA_OFFSET(xmp) },
+  // Exiftool puts exif data in APP1 chunk, too.
+  { "Raw profile type APP1", ProcessRawProfile, METADATA_OFFSET(exif) },
+  // XMP Specification Part 3, Section 3 #PNG
+  { "XML:com.adobe.xmp",     MetadataCopy,      METADATA_OFFSET(xmp) },
+  { NULL, NULL, 0 },
+};
+
+// Looks for metadata at both the beginning and end of the PNG file, giving
+// preference to the head.
+// Returns true on success. The caller must use MetadataFree() on 'metadata' in
+// all cases.
+static int ExtractMetadataFromPNG(png_structp png,
+                                  png_infop const head_info,
+                                  png_infop const end_info,
+                                  Metadata* const metadata) {
+  int p;
+
+  for (p = 0; p < 2; ++p)  {
+    png_infop const info = (p == 0) ? head_info : end_info;
+    png_textp text = NULL;
+    const int num = png_get_text(png, info, &text, NULL);
+    int i;
+    // Look for EXIF / XMP metadata.
+    for (i = 0; i < num; ++i, ++text) {
+      int j;
+      for (j = 0; kPNGMetadataMap[j].name != NULL; ++j) {
+        if (!strcmp(text->key, kPNGMetadataMap[j].name)) {
+          MetadataPayload* const payload =
+              (MetadataPayload*)((uint8_t*)metadata +
+                                 kPNGMetadataMap[j].storage_offset);
+          png_size_t text_length;
+          switch (text->compression) {
+#ifdef PNG_iTXt_SUPPORTED
+            case PNG_ITXT_COMPRESSION_NONE:
+            case PNG_ITXT_COMPRESSION_zTXt:
+              text_length = text->itxt_length;
+              break;
+#endif
+            case PNG_TEXT_COMPRESSION_NONE:
+            case PNG_TEXT_COMPRESSION_zTXt:
+            default:
+              text_length = text->text_length;
+              break;
+          }
+          if (payload->bytes != NULL) {
+            fprintf(stderr, "Ignoring additional '%s'\n", text->key);
+          } else if (!kPNGMetadataMap[j].process(text->text, text_length,
+                                                 payload)) {
+            fprintf(stderr, "Failed to process: '%s'\n", text->key);
+            return 0;
+          }
+          break;
+        }
+      }
+    }
+    // Look for an ICC profile.
+    {
+      png_charp name;
+      int comp_type;
+#if ((PNG_LIBPNG_VER_MAJOR << 8) | PNG_LIBPNG_VER_MINOR << 0) < \
+    ((1 << 8) | (5 << 0))
+      png_charp profile;
+#else  // >= libpng 1.5.0
+      png_bytep profile;
+#endif
+      png_uint_32 len;
+
+      if (png_get_iCCP(png, info,
+                       &name, &comp_type, &profile, &len) == PNG_INFO_iCCP) {
+        if (!MetadataCopy((const char*)profile, len, &metadata->iccp)) return 0;
+      }
+    }
+  }
+
+  return 1;
+}
+
+int ReadPNG(FILE* in_file, WebPPicture* const pic, int keep_alpha,
+            Metadata* const metadata) {
+  png_structp png;
+  png_infop info = NULL;
+  png_infop end_info = NULL;
+  int color_type, bit_depth, interlaced;
+  int has_alpha;
+  int num_passes;
+  int p;
+  int ok = 0;
+  png_uint_32 width, height, y;
+  int stride;
+  uint8_t* rgb = NULL;
+
+  png = png_create_read_struct(PNG_LIBPNG_VER_STRING, 0, 0, 0);
+  if (png == NULL) {
+    goto End;
+  }
+
+  png_set_error_fn(png, 0, error_function, NULL);
+  if (setjmp(png_jmpbuf(png))) {
+ Error:
+    MetadataFree(metadata);
+    png_destroy_read_struct(&png, &info, &end_info);
+    goto End;
+  }
+
+  info = png_create_info_struct(png);
+  if (info == NULL) goto Error;
+  end_info = png_create_info_struct(png);
+  if (end_info == NULL) goto Error;
+
+  png_init_io(png, in_file);
+  png_read_info(png, info);
+  if (!png_get_IHDR(png, info,
+                    &width, &height, &bit_depth, &color_type, &interlaced,
+                    NULL, NULL)) goto Error;
+
+  png_set_strip_16(png);
+  png_set_packing(png);
+  if (color_type == PNG_COLOR_TYPE_PALETTE) png_set_palette_to_rgb(png);
+  if (color_type == PNG_COLOR_TYPE_GRAY ||
+      color_type == PNG_COLOR_TYPE_GRAY_ALPHA) {
+    if (bit_depth < 8) {
+      png_set_expand_gray_1_2_4_to_8(png);
+    }
+    png_set_gray_to_rgb(png);
+  }
+  if (png_get_valid(png, info, PNG_INFO_tRNS)) {
+    png_set_tRNS_to_alpha(png);
+    has_alpha = 1;
+  } else {
+    has_alpha = !!(color_type & PNG_COLOR_MASK_ALPHA);
+  }
+
+  if (!keep_alpha) {
+    png_set_strip_alpha(png);
+    has_alpha = 0;
+  }
+
+  num_passes = png_set_interlace_handling(png);
+  png_read_update_info(png, info);
+  stride = (has_alpha ? 4 : 3) * width * sizeof(*rgb);
+  rgb = (uint8_t*)malloc(stride * height);
+  if (rgb == NULL) goto Error;
+  for (p = 0; p < num_passes; ++p) {
+    for (y = 0; y < height; ++y) {
+      png_bytep row = rgb + y * stride;
+      png_read_rows(png, &row, NULL, 1);
+    }
+  }
+  png_read_end(png, end_info);
+
+  if (metadata != NULL &&
+      !ExtractMetadataFromPNG(png, info, end_info, metadata)) {
+    fprintf(stderr, "Error extracting PNG metadata!\n");
+    goto Error;
+  }
+
+  png_destroy_read_struct(&png, &info, &end_info);
+
+  pic->width = width;
+  pic->height = height;
+  ok = has_alpha ? WebPPictureImportRGBA(pic, rgb, stride)
+                 : WebPPictureImportRGB(pic, rgb, stride);
+
+  if (!ok) {
+    goto Error;
+  }
+
+ End:
+  free(rgb);
+  return ok;
+}
+#else  // !WEBP_HAVE_PNG
+int ReadPNG(FILE* in_file, struct WebPPicture* const pic, int keep_alpha,
+            struct Metadata* const metadata) {
+  (void)in_file;
+  (void)pic;
+  (void)keep_alpha;
+  (void)metadata;
+  fprintf(stderr, "PNG support not compiled. Please install the libpng "
+          "development package before building.\n");
+  return 0;
+}
+#endif  // WEBP_HAVE_PNG
+
+// -----------------------------------------------------------------------------
--- a/examples/pngdec.h
+++ b/examples/pngdec.h
@ -0,0 +1,35 @@
+// Copyright 2012 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// PNG decode.
+
+#ifndef WEBP_EXAMPLES_PNGDEC_H_
+#define WEBP_EXAMPLES_PNGDEC_H_
+
+#include <stdio.h>
+
+#if defined(__cplusplus) || defined(c_plusplus)
+extern "C" {
+#endif
+
+struct Metadata;
+struct WebPPicture;
+
+// Reads a PNG from 'in_file', returning the decoded output in 'pic'.
+// If 'keep_alpha' is true and the PNG has an alpha channel, the output is RGBA
+// otherwise it will be RGB.
+// Returns true on success.
+int ReadPNG(FILE* in_file, struct WebPPicture* const pic, int keep_alpha,
+            struct Metadata* const metadata);
+
+#if defined(__cplusplus) || defined(c_plusplus)
+}    // extern "C"
+#endif
+
+#endif  // WEBP_EXAMPLES_PNGDEC_H_
--- a/examples/stopwatch.h
+++ b/examples/stopwatch.h
@ -1,8 +1,10 @@
-// Copyright 2011 Google Inc.
+// Copyright 2011 Google Inc. All Rights Reserved.
 //
-// This code is licensed under the same terms as WebM:
-//  Software License Agreement:  http://www.webmproject.org/license/software/
-//  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
 // -----------------------------------------------------------------------------
 //
 //  Helper functions to measure elapsed time.
@ -12,12 +14,12 @@
 #ifndef WEBP_EXAMPLES_STOPWATCH_H_
 #define WEBP_EXAMPLES_STOPWATCH_H_

-#ifdef _WIN32
+#if defined _WIN32 && !defined __GNUC__
 #include <windows.h>

 typedef LARGE_INTEGER Stopwatch;

-static inline double StopwatchReadAndReset(Stopwatch* watch) {
+static WEBP_INLINE double StopwatchReadAndReset(Stopwatch* watch) {
  const LARGE_INTEGER old_value = *watch;
  LARGE_INTEGER freq;
  if (!QueryPerformanceCounter(watch))
@ -35,7 +37,7 @@ static inline double StopwatchReadAndReset(Stopwatch* watch) {

 typedef struct timeval Stopwatch;

-static inline double StopwatchReadAndReset(Stopwatch* watch) {
+static WEBP_INLINE double StopwatchReadAndReset(Stopwatch* watch) {
  const struct timeval old_value = *watch;
  gettimeofday(watch, NULL);
  return watch->tv_sec - old_value.tv_sec +
--- a/examples/test.webp
+++ b/examples/test.webp
--- a/examples/tiffdec.c
+++ b/examples/tiffdec.c
@ -0,0 +1,140 @@
+// Copyright 2012 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// TIFF decode.
+
+#include "./tiffdec.h"
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include <stdio.h>
+
+#ifdef WEBP_HAVE_TIFF
+#include <tiffio.h>
+
+#include "webp/encode.h"
+#include "./metadata.h"
+
+static const struct {
+  ttag_t tag;
+  size_t storage_offset;
+} kTIFFMetadataMap[] = {
+  { TIFFTAG_ICCPROFILE, METADATA_OFFSET(iccp) },
+  { TIFFTAG_XMLPACKET,  METADATA_OFFSET(xmp) },
+  { 0, 0 },
+};
+
+// Returns true on success. The caller must use MetadataFree() on 'metadata' in
+// all cases.
+static int ExtractMetadataFromTIFF(TIFF* const tif, Metadata* const metadata) {
+  int i;
+  toff_t exif_ifd_offset;
+
+  for (i = 0; kTIFFMetadataMap[i].tag != 0; ++i) {
+    MetadataPayload* const payload =
+        (MetadataPayload*)((uint8_t*)metadata +
+                           kTIFFMetadataMap[i].storage_offset);
+    void* tag_data;
+    uint32 tag_data_len;
+
+    if (TIFFGetField(tif, kTIFFMetadataMap[i].tag, &tag_data_len, &tag_data) &&
+        !MetadataCopy((const char*)tag_data, tag_data_len, payload)) {
+      return 0;
+    }
+  }
+
+  // TODO(jzern): To extract the raw EXIF directory some parsing of it would be
+  // necessary to determine the overall size. In addition, value offsets in
+  // individual directory entries may need to be updated as, depending on the
+  // type, they are file based.
+  // Exif 2.2 Section 4.6.2 Tag Structure
+  // TIFF Revision 6.0 Part 1 Section 2 TIFF Structure #Image File Directory
+  if (TIFFGetField(tif, TIFFTAG_EXIFIFD, &exif_ifd_offset)) {
+    fprintf(stderr, "Warning: EXIF extraction from TIFF is unsupported.\n");
+  }
+  return 1;
+}
+
+int ReadTIFF(const char* const filename,
+             WebPPicture* const pic, int keep_alpha,
+             Metadata* const metadata) {
+  TIFF* const tif = TIFFOpen(filename, "r");
+  uint32 width, height;
+  uint32* raster;
+  int ok = 0;
+  tdir_t dircount;
+
+  if (tif == NULL) {
+    fprintf(stderr, "Error! Cannot open TIFF file '%s'\n", filename);
+    return 0;
+  }
+
+  dircount = TIFFNumberOfDirectories(tif);
+  if (dircount > 1) {
+    fprintf(stderr, "Warning: multi-directory TIFF files are not supported.\n"
+                    "Only the first will be used, %d will be ignored.\n",
+                    dircount - 1);
+  }
+
+  if (!(TIFFGetField(tif, TIFFTAG_IMAGEWIDTH, &width) &&
+        TIFFGetField(tif, TIFFTAG_IMAGELENGTH, &height))) {
+    fprintf(stderr, "Error! Cannot retrieve TIFF image dimensions.\n");
+    return 0;
+  }
+  raster = (uint32*)_TIFFmalloc(width * height * sizeof(*raster));
+  if (raster != NULL) {
+    if (TIFFReadRGBAImageOriented(tif, width, height, raster,
+                                  ORIENTATION_TOPLEFT, 1)) {
+      const int stride = width * sizeof(*raster);
+      pic->width = width;
+      pic->height = height;
+      // TIFF data is ABGR
+#ifdef __BIG_ENDIAN__
+      TIFFSwabArrayOfLong(raster, width * height);
+#endif
+      ok = keep_alpha
+         ? WebPPictureImportRGBA(pic, (const uint8_t*)raster, stride)
+         : WebPPictureImportRGBX(pic, (const uint8_t*)raster, stride);
+    }
+    _TIFFfree(raster);
+  } else {
+    fprintf(stderr, "Error allocating TIFF RGBA memory!\n");
+  }
+
+  if (ok) {
+    if (metadata != NULL) {
+      ok = ExtractMetadataFromTIFF(tif, metadata);
+      if (!ok) {
+        fprintf(stderr, "Error extracting TIFF metadata!\n");
+        MetadataFree(metadata);
+        WebPPictureFree(pic);
+      }
+    }
+  }
+
+  TIFFClose(tif);
+  return ok;
+}
+#else  // !WEBP_HAVE_TIFF
+int ReadTIFF(const char* const filename,
+             struct WebPPicture* const pic, int keep_alpha,
+             struct Metadata* const metadata) {
+  (void)filename;
+  (void)pic;
+  (void)keep_alpha;
+  (void)metadata;
+  fprintf(stderr, "TIFF support not compiled. Please install the libtiff "
+          "development package before building.\n");
+  return 0;
+}
+#endif  // WEBP_HAVE_TIFF
+
+// -----------------------------------------------------------------------------
--- a/examples/tiffdec.h
+++ b/examples/tiffdec.h
@ -0,0 +1,34 @@
+// Copyright 2012 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// TIFF decode.
+
+#ifndef WEBP_EXAMPLES_TIFFDEC_H_
+#define WEBP_EXAMPLES_TIFFDEC_H_
+
+#if defined(__cplusplus) || defined(c_plusplus)
+extern "C" {
+#endif
+
+struct Metadata;
+struct WebPPicture;
+
+// Reads a TIFF from 'filename', returning the decoded output in 'pic'.
+// If 'keep_alpha' is true and the TIFF has an alpha channel, the output is RGBA
+// otherwise it will be RGB.
+// Returns true on success.
+int ReadTIFF(const char* const filename,
+             struct WebPPicture* const pic, int keep_alpha,
+             struct Metadata* const metadata);
+
+#if defined(__cplusplus) || defined(c_plusplus)
+}    // extern "C"
+#endif
+
+#endif  // WEBP_EXAMPLES_TIFFDEC_H_
--- a/examples/vwebp.c
+++ b/examples/vwebp.c
@ -0,0 +1,514 @@
+// Copyright 2011 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+//  Simple OpenGL-based WebP file viewer.
+//
+// Author: Skal (pascal.massimino@gmail.com)
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#if defined(HAVE_GLUT_GLUT_H)
+#include <GLUT/glut.h>
+#else
+#include <GL/glut.h>
+#ifdef FREEGLUT
+#include <GL/freeglut.h>
+#endif
+#endif
+
+#ifdef WEBP_HAVE_QCMS
+#include <qcms.h>
+#endif
+
+#include "webp/decode.h"
+#include "webp/demux.h"
+
+#include "./example_util.h"
+
+#ifdef _MSC_VER
+#define snprintf _snprintf
+#endif
+
+static void Help(void);
+
+// Unfortunate global variables. Gathered into a struct for comfort.
+static struct {
+  int has_animation;
+  int has_color_profile;
+  int done;
+  int decoding_error;
+  int print_info;
+  int use_color_profile;
+
+  int canvas_width, canvas_height;
+  int loop_count;
+  uint32_t bg_color;
+
+  const char* file_name;
+  WebPData data;
+  WebPDecoderConfig* config;
+  const WebPDecBuffer* pic;
+  WebPDemuxer* dmux;
+  WebPIterator frameiter;
+  struct {
+    int width, height;
+    int x_offset, y_offset;
+    enum WebPMuxAnimDispose dispose_method;
+  } prev_frame;
+  WebPChunkIterator iccp;
+} kParams;
+
+static void ClearPreviousPic(void) {
+  WebPFreeDecBuffer((WebPDecBuffer*)kParams.pic);
+  kParams.pic = NULL;
+}
+
+static void ClearParams(void) {
+  ClearPreviousPic();
+  WebPDataClear(&kParams.data);
+  WebPDemuxReleaseIterator(&kParams.frameiter);
+  WebPDemuxReleaseChunkIterator(&kParams.iccp);
+  WebPDemuxDelete(kParams.dmux);
+  kParams.dmux = NULL;
+}
+
+// -----------------------------------------------------------------------------
+// Color profile handling
+static int ApplyColorProfile(const WebPData* const profile,
+                             WebPDecBuffer* const rgba) {
+#ifdef WEBP_HAVE_QCMS
+  int i, ok = 0;
+  uint8_t* line;
+  uint8_t major_revision;
+  qcms_profile* input_profile = NULL;
+  qcms_profile* output_profile = NULL;
+  qcms_transform* transform = NULL;
+  const qcms_data_type input_type = QCMS_DATA_RGBA_8;
+  const qcms_data_type output_type = QCMS_DATA_RGBA_8;
+  const qcms_intent intent = QCMS_INTENT_DEFAULT;
+
+  if (profile == NULL || rgba == NULL) return 0;
+  if (profile->bytes == NULL || profile->size < 10) return 1;
+  major_revision = profile->bytes[8];
+
+  qcms_enable_iccv4();
+  input_profile = qcms_profile_from_memory(profile->bytes, profile->size);
+  // qcms_profile_is_bogus() is broken with ICCv4.
+  if (input_profile == NULL ||
+      (major_revision < 4 && qcms_profile_is_bogus(input_profile))) {
+    fprintf(stderr, "Color profile is bogus!\n");
+    goto Error;
+  }
+
+  output_profile = qcms_profile_sRGB();
+  if (output_profile == NULL) {
+    fprintf(stderr, "Error creating output color profile!\n");
+    goto Error;
+  }
+
+  qcms_profile_precache_output_transform(output_profile);
+  transform = qcms_transform_create(input_profile, input_type,
+                                    output_profile, output_type,
+                                    intent);
+  if (transform == NULL) {
+    fprintf(stderr, "Error creating color transform!\n");
+    goto Error;
+  }
+
+  line = rgba->u.RGBA.rgba;
+  for (i = 0; i < rgba->height; ++i, line += rgba->u.RGBA.stride) {
+    qcms_transform_data(transform, line, line, rgba->width);
+  }
+  ok = 1;
+
+ Error:
+  if (input_profile != NULL) qcms_profile_release(input_profile);
+  if (output_profile != NULL) qcms_profile_release(output_profile);
+  if (transform != NULL) qcms_transform_release(transform);
+  return ok;
+#else
+  (void)profile;
+  (void)rgba;
+  return 1;
+#endif  // WEBP_HAVE_QCMS
+}
+
+//------------------------------------------------------------------------------
+// File decoding
+
+static int Decode(void) {   // Fills kParams.frameiter
+  const WebPIterator* const iter = &kParams.frameiter;
+  WebPDecoderConfig* const config = kParams.config;
+  WebPDecBuffer* const output_buffer = &config->output;
+  int ok = 0;
+
+  ClearPreviousPic();
+  output_buffer->colorspace = MODE_RGBA;
+  ok = (WebPDecode(iter->fragment.bytes, iter->fragment.size,
+                   config) == VP8_STATUS_OK);
+  if (!ok) {
+    fprintf(stderr, "Decoding of frame #%d failed!\n", iter->frame_num);
+  } else {
+    kParams.pic = output_buffer;
+    if (kParams.use_color_profile) {
+      ok = ApplyColorProfile(&kParams.iccp.chunk, output_buffer);
+      if (!ok) {
+        fprintf(stderr, "Applying color profile to frame #%d failed!\n",
+                iter->frame_num);
+      }
+    }
+  }
+  return ok;
+}
+
+static void decode_callback(int what) {
+  if (what == 0 && !kParams.done) {
+    int duration = 0;
+    if (kParams.dmux != NULL) {
+      WebPIterator* const iter = &kParams.frameiter;
+      if (!WebPDemuxNextFrame(iter)) {
+        WebPDemuxReleaseIterator(iter);
+        if (WebPDemuxGetFrame(kParams.dmux, 1, iter)) {
+          --kParams.loop_count;
+          kParams.done = (kParams.loop_count == 0);
+        } else {
+          kParams.decoding_error = 1;
+          kParams.done = 1;
+          return;
+        }
+      }
+      duration = iter->duration;
+    }
+    if (!Decode()) {
+      kParams.decoding_error = 1;
+      kParams.done = 1;
+    } else {
+      glutPostRedisplay();
+      glutTimerFunc(duration, decode_callback, what);
+    }
+  }
+}
+
+//------------------------------------------------------------------------------
+// Callbacks
+
+static void HandleKey(unsigned char key, int pos_x, int pos_y) {
+  (void)pos_x;
+  (void)pos_y;
+  if (key == 'q' || key == 'Q' || key == 27 /* Esc */) {
+#ifdef FREEGLUT
+    glutLeaveMainLoop();
+#else
+    ClearParams();
+    exit(0);
+#endif
+  } else if (key == 'c') {
+    if (kParams.has_color_profile && !kParams.decoding_error) {
+      kParams.use_color_profile = 1 - kParams.use_color_profile;
+
+      if (kParams.has_animation) {
+        // Restart the completed animation to pickup the color profile change.
+        if (kParams.done && kParams.loop_count == 0) {
+          kParams.loop_count =
+              (int)WebPDemuxGetI(kParams.dmux, WEBP_FF_LOOP_COUNT) + 1;
+          kParams.done = 0;
+          // Start the decode loop immediately.
+          glutTimerFunc(0, decode_callback, 0);
+        }
+      } else {
+        Decode();
+        glutPostRedisplay();
+      }
+    }
+  } else if (key == 'i') {
+    kParams.print_info = 1 - kParams.print_info;
+    glutPostRedisplay();
+  }
+}
+
+static void HandleReshape(int width, int height) {
+  // TODO(skal): proper handling of resize, esp. for large pictures.
+  // + key control of the zoom.
+  glViewport(0, 0, width, height);
+  glMatrixMode(GL_PROJECTION);
+  glLoadIdentity();
+  glMatrixMode(GL_MODELVIEW);
+  glLoadIdentity();
+}
+
+static void PrintString(const char* const text) {
+  void* const font = GLUT_BITMAP_9_BY_15;
+  int i;
+  for (i = 0; text[i]; ++i) {
+    glutBitmapCharacter(font, text[i]);
+  }
+}
+
+static float GetColorf(uint32_t color, int shift) {
+  return (color >> shift) / 255.f;
+}
+
+static void DrawCheckerBoard(void) {
+  const int square_size = 8;  // must be a power of 2
+  int x, y;
+  GLint viewport[4];  // x, y, width, height
+
+  glPushMatrix();
+
+  glGetIntegerv(GL_VIEWPORT, viewport);
+  // shift to integer coordinates with (0,0) being top-left.
+  glOrtho(0, viewport[2], viewport[3], 0, -1, 1);
+  for (y = 0; y < viewport[3]; y += square_size) {
+    for (x = 0; x < viewport[2]; x += square_size) {
+      const GLubyte color = 128 + 64 * (!((x + y) & square_size));
+      glColor3ub(color, color, color);
+      glRecti(x, y, x + square_size, y + square_size);
+    }
+  }
+  glPopMatrix();
+}
+
+static void HandleDisplay(void) {
+  const WebPDecBuffer* const pic = kParams.pic;
+  const WebPIterator* const iter = &kParams.frameiter;
+  GLfloat xoff, yoff;
+  if (pic == NULL) return;
+  glPushMatrix();
+  glPixelZoom(1, -1);
+  xoff = (GLfloat)(2. * iter->x_offset / kParams.canvas_width);
+  yoff = (GLfloat)(2. * iter->y_offset / kParams.canvas_height);
+  glRasterPos2f(-1.f + xoff, 1.f - yoff);
+  glPixelStorei(GL_UNPACK_ALIGNMENT, 1);
+  glPixelStorei(GL_UNPACK_ROW_LENGTH, pic->u.RGBA.stride / 4);
+
+  if (kParams.prev_frame.dispose_method == WEBP_MUX_DISPOSE_BACKGROUND) {
+    // TODO(later): these offsets and those above should factor in window size.
+    //              they will be incorrect if the window is resized.
+    // glScissor() takes window coordinates (0,0 at bottom left).
+    const int window_x = kParams.prev_frame.x_offset;
+    const int window_y = kParams.canvas_height -
+                         kParams.prev_frame.y_offset -
+                         kParams.prev_frame.height;
+    glEnable(GL_SCISSOR_TEST);
+    // Only updated the requested area, not the whole canvas.
+    glScissor(window_x, window_y,
+              kParams.prev_frame.width, kParams.prev_frame.height);
+
+    glClear(GL_COLOR_BUFFER_BIT);  // use clear color
+    DrawCheckerBoard();
+
+    glDisable(GL_SCISSOR_TEST);
+  }
+  kParams.prev_frame.width = iter->width;
+  kParams.prev_frame.height = iter->height;
+  kParams.prev_frame.x_offset = iter->x_offset;
+  kParams.prev_frame.y_offset = iter->y_offset;
+  kParams.prev_frame.dispose_method = iter->dispose_method;
+
+  glDrawPixels(pic->width, pic->height,
+               GL_RGBA, GL_UNSIGNED_BYTE,
+               (GLvoid*)pic->u.RGBA.rgba);
+  if (kParams.print_info) {
+    char tmp[32];
+
+    glColor4f(0.90f, 0.0f, 0.90f, 1.0f);
+    glRasterPos2f(-0.95f, 0.90f);
+    PrintString(kParams.file_name);
+
+    snprintf(tmp, sizeof(tmp), "Dimension:%d x %d", pic->width, pic->height);
+    glColor4f(0.90f, 0.0f, 0.90f, 1.0f);
+    glRasterPos2f(-0.95f, 0.80f);
+    PrintString(tmp);
+    if (iter->x_offset != 0 || iter->y_offset != 0) {
+      snprintf(tmp, sizeof(tmp), " (offset:%d,%d)",
+               iter->x_offset, iter->y_offset);
+      glRasterPos2f(-0.95f, 0.70f);
+      PrintString(tmp);
+    }
+  }
+  glPopMatrix();
+  glFlush();
+}
+
+static void StartDisplay(void) {
+  const int width = kParams.canvas_width;
+  const int height = kParams.canvas_height;
+  glutInitDisplayMode(GLUT_RGBA);
+  glutInitWindowSize(width, height);
+  glutCreateWindow("WebP viewer");
+  glutDisplayFunc(HandleDisplay);
+  glutIdleFunc(NULL);
+  glutKeyboardFunc(HandleKey);
+  glBlendFunc(GL_SRC_ALPHA, GL_ONE_MINUS_SRC_ALPHA);
+  glEnable(GL_BLEND);
+  glClearColor(GetColorf(kParams.bg_color, 0),
+               GetColorf(kParams.bg_color, 8),
+               GetColorf(kParams.bg_color, 16),
+               GetColorf(kParams.bg_color, 24));
+  HandleReshape(width, height);
+  glClear(GL_COLOR_BUFFER_BIT);
+  DrawCheckerBoard();
+}
+
+//------------------------------------------------------------------------------
+// Main
+
+static void Help(void) {
+  printf("Usage: vwebp in_file [options]\n\n"
+         "Decodes the WebP image file and visualize it using OpenGL\n"
+         "Options are:\n"
+         "  -version  .... print version number and exit.\n"
+         "  -noicc ....... don't use the icc profile if present.\n"
+         "  -nofancy ..... don't use the fancy YUV420 upscaler.\n"
+         "  -nofilter .... disable in-loop filtering.\n"
+         "  -mt .......... use multi-threading.\n"
+         "  -info ........ print info.\n"
+         "  -h     ....... this help message.\n"
+         "\n"
+         "Keyboard shortcuts:\n"
+         "  'c' ................ toggle use of color profile.\n"
+         "  'i' ................ overlay file information.\n"
+         "  'q' / 'Q' / ESC .... quit.\n"
+        );
+}
+
+int main(int argc, char *argv[]) {
+  WebPDecoderConfig config;
+  int c;
+
+  if (!WebPInitDecoderConfig(&config)) {
+    fprintf(stderr, "Library version mismatch!\n");
+    return -1;
+  }
+  kParams.config = &config;
+  kParams.use_color_profile = 1;
+
+  for (c = 1; c < argc; ++c) {
+    if (!strcmp(argv[c], "-h") || !strcmp(argv[c], "-help")) {
+      Help();
+      return 0;
+    } else if (!strcmp(argv[c], "-noicc")) {
+      kParams.use_color_profile = 0;
+    } else if (!strcmp(argv[c], "-nofancy")) {
+      config.options.no_fancy_upsampling = 1;
+    } else if (!strcmp(argv[c], "-nofilter")) {
+      config.options.bypass_filtering = 1;
+    } else if (!strcmp(argv[c], "-info")) {
+      kParams.print_info = 1;
+    } else if (!strcmp(argv[c], "-version")) {
+      const int dec_version = WebPGetDecoderVersion();
+      const int dmux_version = WebPGetDemuxVersion();
+      printf("WebP Decoder version: %d.%d.%d\nWebP Demux version: %d.%d.%d\n",
+             (dec_version >> 16) & 0xff, (dec_version >> 8) & 0xff,
+             dec_version & 0xff, (dmux_version >> 16) & 0xff,
+             (dmux_version >> 8) & 0xff, dmux_version & 0xff);
+      return 0;
+    } else if (!strcmp(argv[c], "-mt")) {
+      config.options.use_threads = 1;
+    } else if (argv[c][0] == '-') {
+      printf("Unknown option '%s'\n", argv[c]);
+      Help();
+      return -1;
+    } else {
+      kParams.file_name = argv[c];
+    }
+  }
+
+  if (kParams.file_name == NULL) {
+    printf("missing input file!!\n");
+    Help();
+    return 0;
+  }
+
+  if (!ExUtilReadFile(kParams.file_name,
+                      &kParams.data.bytes, &kParams.data.size)) {
+    goto Error;
+  }
+
+  if (!WebPGetInfo(kParams.data.bytes, kParams.data.size, NULL, NULL)) {
+    fprintf(stderr, "Input file doesn't appear to be WebP format.\n");
+    goto Error;
+  }
+
+  kParams.dmux = WebPDemux(&kParams.data);
+  if (kParams.dmux == NULL) {
+    fprintf(stderr, "Could not create demuxing object!\n");
+    goto Error;
+  }
+
+  if (WebPDemuxGetI(kParams.dmux, WEBP_FF_FORMAT_FLAGS) & FRAGMENTS_FLAG) {
+    fprintf(stderr, "Image fragments are not supported for now!\n");
+    goto Error;
+  }
+  kParams.canvas_width = WebPDemuxGetI(kParams.dmux, WEBP_FF_CANVAS_WIDTH);
+  kParams.canvas_height = WebPDemuxGetI(kParams.dmux, WEBP_FF_CANVAS_HEIGHT);
+  if (kParams.print_info) {
+    printf("Canvas: %d x %d\n", kParams.canvas_width, kParams.canvas_height);
+  }
+
+  kParams.prev_frame.width = kParams.canvas_width;
+  kParams.prev_frame.height = kParams.canvas_height;
+  kParams.prev_frame.x_offset = kParams.prev_frame.y_offset = 0;
+  kParams.prev_frame.dispose_method = WEBP_MUX_DISPOSE_BACKGROUND;
+
+  memset(&kParams.iccp, 0, sizeof(kParams.iccp));
+  kParams.has_color_profile =
+      !!(WebPDemuxGetI(kParams.dmux, WEBP_FF_FORMAT_FLAGS) & ICCP_FLAG);
+  if (kParams.has_color_profile) {
+#ifdef WEBP_HAVE_QCMS
+    if (!WebPDemuxGetChunk(kParams.dmux, "ICCP", 1, &kParams.iccp)) goto Error;
+    printf("VP8X: Found color profile\n");
+#else
+    fprintf(stderr, "Warning: color profile present, but qcms is unavailable!\n"
+            "Build libqcms from Mozilla or Chromium and define WEBP_HAVE_QCMS "
+            "before building.\n");
+#endif
+  }
+
+  if (!WebPDemuxGetFrame(kParams.dmux, 1, &kParams.frameiter)) goto Error;
+
+  kParams.has_animation = (kParams.frameiter.num_frames > 1);
+  kParams.loop_count = (int)WebPDemuxGetI(kParams.dmux, WEBP_FF_LOOP_COUNT);
+  kParams.bg_color = WebPDemuxGetI(kParams.dmux, WEBP_FF_BACKGROUND_COLOR);
+  printf("VP8X: Found %d images in file (loop count = %d)\n",
+         kParams.frameiter.num_frames, kParams.loop_count);
+
+  // Decode first frame
+  if (!Decode()) goto Error;
+
+  // Position iterator to last frame. Next call to HandleDisplay will wrap over.
+  // We take this into account by bumping up loop_count.
+  WebPDemuxGetFrame(kParams.dmux, 0, &kParams.frameiter);
+  if (kParams.loop_count) ++kParams.loop_count;
+
+  // Start display (and timer)
+  glutInit(&argc, argv);
+#ifdef FREEGLUT
+  glutSetOption(GLUT_ACTION_ON_WINDOW_CLOSE, GLUT_ACTION_CONTINUE_EXECUTION);
+#endif
+  StartDisplay();
+
+  if (kParams.has_animation) glutTimerFunc(0, decode_callback, 0);
+  glutMainLoop();
+
+  // Should only be reached when using FREEGLUT:
+  ClearParams();
+  return 0;
+
+ Error:
+  ClearParams();
+  return -1;
+}
+
+//------------------------------------------------------------------------------
--- a/examples/webpmux.c
+++ b/examples/webpmux.c
--- a/examples/wicdec.c
+++ b/examples/wicdec.c
@ -0,0 +1,348 @@
+// Copyright 2013 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// Windows Imaging Component (WIC) decode.
+
+#include "./wicdec.h"
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include <stdio.h>
+
+#ifdef HAVE_WINCODEC_H
+#ifdef __MINGW32__
+#define INITGUID  // Without this GUIDs are declared extern and fail to link
+#endif
+#define CINTERFACE
+#define COBJMACROS
+#define _WIN32_IE 0x500  // Workaround bug in shlwapi.h when compiling C++
+                         // code with COBJMACROS.
+#include <shlwapi.h>
+#include <windows.h>
+#include <wincodec.h>
+
+#include "webp/encode.h"
+#include "./metadata.h"
+
+#define IFS(fn)                                                     \
+  do {                                                              \
+    if (SUCCEEDED(hr)) {                                            \
+      hr = (fn);                                                    \
+      if (FAILED(hr)) fprintf(stderr, #fn " failed %08lx\n", hr);   \
+    }                                                               \
+  } while (0)
+
+// modified version of DEFINE_GUID from guiddef.h.
+#define WEBP_DEFINE_GUID(name, l, w1, w2, b1, b2, b3, b4, b5, b6, b7, b8) \
+  static const GUID name = \
+      { l, w1, w2, { b1, b2,  b3,  b4,  b5,  b6,  b7,  b8 } }
+
+#ifdef __cplusplus
+#define MAKE_REFGUID(x) (x)
+#else
+#define MAKE_REFGUID(x) &(x)
+#endif
+
+typedef struct WICFormatImporter {
+  const GUID* pixel_format;
+  int bytes_per_pixel;
+  int (*import)(WebPPicture* const, const uint8_t* const, int);
+} WICFormatImporter;
+
+// From Microsoft SDK 7.0a -- wincodec.h
+// Create local copies for compatibility when building against earlier
+// versions of the SDK.
+WEBP_DEFINE_GUID(GUID_WICPixelFormat24bppBGR_,
+                 0x6fddc324, 0x4e03, 0x4bfe,
+                 0xb1, 0x85, 0x3d, 0x77, 0x76, 0x8d, 0xc9, 0x0c);
+WEBP_DEFINE_GUID(GUID_WICPixelFormat24bppRGB_,
+                 0x6fddc324, 0x4e03, 0x4bfe,
+                 0xb1, 0x85, 0x3d, 0x77, 0x76, 0x8d, 0xc9, 0x0d);
+WEBP_DEFINE_GUID(GUID_WICPixelFormat32bppBGRA_,
+                 0x6fddc324, 0x4e03, 0x4bfe,
+                 0xb1, 0x85, 0x3d, 0x77, 0x76, 0x8d, 0xc9, 0x0f);
+WEBP_DEFINE_GUID(GUID_WICPixelFormat32bppRGBA_,
+                 0xf5c7ad2d, 0x6a8d, 0x43dd,
+                 0xa7, 0xa8, 0xa2, 0x99, 0x35, 0x26, 0x1a, 0xe9);
+
+static HRESULT OpenInputStream(const char* filename, IStream** stream) {
+  HRESULT hr = S_OK;
+  IFS(SHCreateStreamOnFileA(filename, STGM_READ, stream));
+  if (FAILED(hr)) {
+    fprintf(stderr, "Error opening input file %s (%08lx)\n", filename, hr);
+  }
+  return hr;
+}
+
+// -----------------------------------------------------------------------------
+// Metadata processing
+
+// Stores the first non-zero sized color profile from 'frame' to 'iccp'.
+// Returns an HRESULT to indicate success or failure. The caller is responsible
+// for freeing 'iccp->bytes' in either case.
+static HRESULT ExtractICCP(IWICImagingFactory* const factory,
+                           IWICBitmapFrameDecode* const frame,
+                           MetadataPayload* const iccp) {
+  HRESULT hr = S_OK;
+  UINT i, count;
+  IWICColorContext** color_contexts;
+
+  IFS(IWICBitmapFrameDecode_GetColorContexts(frame, 0, NULL, &count));
+  if (FAILED(hr) || count == 0) return hr;
+
+  color_contexts = (IWICColorContext**)calloc(count, sizeof(*color_contexts));
+  if (color_contexts == NULL) return E_OUTOFMEMORY;
+  for (i = 0; SUCCEEDED(hr) && i < count; ++i) {
+    IFS(IWICImagingFactory_CreateColorContext(factory, &color_contexts[i]));
+  }
+
+  if (SUCCEEDED(hr)) {
+    UINT num_color_contexts;
+    IFS(IWICBitmapFrameDecode_GetColorContexts(frame,
+                                               count, color_contexts,
+                                               &num_color_contexts));
+    for (i = 0; SUCCEEDED(hr) && i < num_color_contexts; ++i) {
+      WICColorContextType type;
+      IFS(IWICColorContext_GetType(color_contexts[i], &type));
+      if (SUCCEEDED(hr) && type == WICColorContextProfile) {
+        UINT size;
+        IFS(IWICColorContext_GetProfileBytes(color_contexts[i],
+                                             0, NULL, &size));
+        if (size > 0) {
+          iccp->bytes = (uint8_t*)malloc(size);
+          if (iccp->bytes == NULL) {
+            hr = E_OUTOFMEMORY;
+            break;
+          }
+          iccp->size = size;
+          IFS(IWICColorContext_GetProfileBytes(color_contexts[i],
+                                               (UINT)iccp->size, iccp->bytes,
+                                               &size));
+          if (SUCCEEDED(hr) && size != iccp->size) {
+            fprintf(stderr, "Warning! ICC profile size (%u) != expected (%u)\n",
+                    size, (uint32_t)iccp->size);
+            iccp->size = size;
+          }
+          break;
+        }
+      }
+    }
+  }
+  for (i = 0; i < count; ++i) {
+    if (color_contexts[i] != NULL) IUnknown_Release(color_contexts[i]);
+  }
+  free(color_contexts);
+  return hr;
+}
+
+static HRESULT ExtractMetadata(IWICImagingFactory* const factory,
+                               IWICBitmapFrameDecode* const frame,
+                               Metadata* const metadata) {
+  // TODO(jzern): add XMP/EXIF extraction.
+  const HRESULT hr = ExtractICCP(factory, frame, &metadata->iccp);
+  if (FAILED(hr)) MetadataFree(metadata);
+  return hr;
+}
+
+// -----------------------------------------------------------------------------
+
+static int HasPalette(GUID pixel_format) {
+  return (IsEqualGUID(MAKE_REFGUID(pixel_format),
+                      MAKE_REFGUID(GUID_WICPixelFormat1bppIndexed)) ||
+          IsEqualGUID(MAKE_REFGUID(pixel_format),
+                      MAKE_REFGUID(GUID_WICPixelFormat2bppIndexed)) ||
+          IsEqualGUID(MAKE_REFGUID(pixel_format),
+                      MAKE_REFGUID(GUID_WICPixelFormat4bppIndexed)) ||
+          IsEqualGUID(MAKE_REFGUID(pixel_format),
+                      MAKE_REFGUID(GUID_WICPixelFormat8bppIndexed)));
+}
+
+static int HasAlpha(IWICImagingFactory* const factory,
+                    IWICBitmapDecoder* const decoder,
+                    IWICBitmapFrameDecode* const frame,
+                    GUID pixel_format) {
+  int has_alpha;
+  if (HasPalette(pixel_format)) {
+    IWICPalette* frame_palette = NULL;
+    IWICPalette* global_palette = NULL;
+    BOOL frame_palette_has_alpha = FALSE;
+    BOOL global_palette_has_alpha = FALSE;
+
+    // A palette may exist at the frame or container level,
+    // check IWICPalette::HasAlpha() for both if present.
+    if (SUCCEEDED(IWICImagingFactory_CreatePalette(factory, &frame_palette)) &&
+        SUCCEEDED(IWICBitmapFrameDecode_CopyPalette(frame, frame_palette))) {
+      IWICPalette_HasAlpha(frame_palette, &frame_palette_has_alpha);
+    }
+    if (SUCCEEDED(IWICImagingFactory_CreatePalette(factory, &global_palette)) &&
+        SUCCEEDED(IWICBitmapDecoder_CopyPalette(decoder, global_palette))) {
+      IWICPalette_HasAlpha(global_palette, &global_palette_has_alpha);
+    }
+    has_alpha = frame_palette_has_alpha || global_palette_has_alpha;
+
+    if (frame_palette != NULL) IUnknown_Release(frame_palette);
+    if (global_palette != NULL) IUnknown_Release(global_palette);
+  } else {
+    has_alpha = IsEqualGUID(MAKE_REFGUID(pixel_format),
+                            MAKE_REFGUID(GUID_WICPixelFormat32bppRGBA_)) ||
+                IsEqualGUID(MAKE_REFGUID(pixel_format),
+                            MAKE_REFGUID(GUID_WICPixelFormat32bppBGRA_));
+  }
+  return has_alpha;
+}
+
+int ReadPictureWithWIC(const char* const filename,
+                       WebPPicture* const pic, int keep_alpha,
+                       Metadata* const metadata) {
+  // From Microsoft SDK 6.0a -- ks.h
+  // Define a local copy to avoid link errors under mingw.
+  WEBP_DEFINE_GUID(GUID_NULL_, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
+  static const WICFormatImporter kAlphaFormatImporters[] = {
+    { &GUID_WICPixelFormat32bppBGRA_, 4, WebPPictureImportBGRA },
+    { &GUID_WICPixelFormat32bppRGBA_, 4, WebPPictureImportRGBA },
+    { NULL, 0, NULL },
+  };
+  static const WICFormatImporter kNonAlphaFormatImporters[] = {
+    { &GUID_WICPixelFormat24bppBGR_, 3, WebPPictureImportBGR },
+    { &GUID_WICPixelFormat24bppRGB_, 3, WebPPictureImportRGB },
+    { NULL, 0, NULL },
+  };
+  HRESULT hr = S_OK;
+  IWICBitmapFrameDecode* frame = NULL;
+  IWICFormatConverter* converter = NULL;
+  IWICImagingFactory* factory = NULL;
+  IWICBitmapDecoder* decoder = NULL;
+  IStream* stream = NULL;
+  UINT frame_count = 0;
+  UINT width = 0, height = 0;
+  BYTE* rgb = NULL;
+  WICPixelFormatGUID src_pixel_format = GUID_WICPixelFormatUndefined;
+  const WICFormatImporter* importer = NULL;
+  GUID src_container_format = GUID_NULL_;
+  static const GUID* kAlphaContainers[] = {
+    &GUID_ContainerFormatBmp,
+    &GUID_ContainerFormatPng,
+    &GUID_ContainerFormatTiff,
+    NULL
+  };
+  int has_alpha = 0;
+  int stride;
+
+  IFS(CoInitialize(NULL));
+  IFS(CoCreateInstance(MAKE_REFGUID(CLSID_WICImagingFactory), NULL,
+                       CLSCTX_INPROC_SERVER,
+                       MAKE_REFGUID(IID_IWICImagingFactory),
+                       (LPVOID*)&factory));
+  if (hr == REGDB_E_CLASSNOTREG) {
+    fprintf(stderr,
+            "Couldn't access Windows Imaging Component (are you running "
+            "Windows XP SP3 or newer?). Most formats not available. "
+            "Use -s for the available YUV input.\n");
+  }
+  // Prepare for image decoding.
+  IFS(OpenInputStream(filename, &stream));
+  IFS(IWICImagingFactory_CreateDecoderFromStream(
+          factory, stream, NULL,
+          WICDecodeMetadataCacheOnDemand, &decoder));
+  IFS(IWICBitmapDecoder_GetFrameCount(decoder, &frame_count));
+  if (SUCCEEDED(hr) && frame_count == 0) {
+    fprintf(stderr, "No frame found in input file.\n");
+    hr = E_FAIL;
+  }
+  IFS(IWICBitmapDecoder_GetFrame(decoder, 0, &frame));
+  IFS(IWICBitmapFrameDecode_GetPixelFormat(frame, &src_pixel_format));
+  IFS(IWICBitmapDecoder_GetContainerFormat(decoder, &src_container_format));
+
+  if (keep_alpha) {
+    const GUID** guid;
+    for (guid = kAlphaContainers; *guid != NULL; ++guid) {
+      if (IsEqualGUID(MAKE_REFGUID(src_container_format),
+                      MAKE_REFGUID(**guid))) {
+        has_alpha = HasAlpha(factory, decoder, frame, src_pixel_format);
+        break;
+      }
+    }
+  }
+
+  // Prepare for pixel format conversion (if necessary).
+  IFS(IWICImagingFactory_CreateFormatConverter(factory, &converter));
+
+  for (importer = has_alpha ? kAlphaFormatImporters : kNonAlphaFormatImporters;
+       hr == S_OK && importer->import != NULL; ++importer) {
+    BOOL can_convert;
+    const HRESULT cchr = IWICFormatConverter_CanConvert(
+        converter,
+        MAKE_REFGUID(src_pixel_format),
+        MAKE_REFGUID(*importer->pixel_format),
+        &can_convert);
+    if (SUCCEEDED(cchr) && can_convert) break;
+  }
+  if (importer->import == NULL) hr = E_FAIL;
+
+  IFS(IWICFormatConverter_Initialize(converter, (IWICBitmapSource*)frame,
+                                     importer->pixel_format,
+                                     WICBitmapDitherTypeNone,
+                                     NULL, 0.0, WICBitmapPaletteTypeCustom));
+
+  // Decode.
+  IFS(IWICFormatConverter_GetSize(converter, &width, &height));
+  stride = importer->bytes_per_pixel * width * sizeof(*rgb);
+  if (SUCCEEDED(hr)) {
+    rgb = (BYTE*)malloc(stride * height);
+    if (rgb == NULL)
+      hr = E_OUTOFMEMORY;
+  }
+  IFS(IWICFormatConverter_CopyPixels(converter, NULL,
+                                     stride, stride * height, rgb));
+
+  // WebP conversion.
+  if (SUCCEEDED(hr)) {
+    int ok;
+    pic->width = width;
+    pic->height = height;
+    ok = importer->import(pic, rgb, stride);
+    if (!ok) hr = E_FAIL;
+  }
+  if (SUCCEEDED(hr)) {
+    if (metadata != NULL) {
+      hr = ExtractMetadata(factory, frame, metadata);
+      if (FAILED(hr)) {
+        fprintf(stderr, "Error extracting image metadata using WIC!\n");
+      }
+    }
+  }
+
+  // Cleanup.
+  if (converter != NULL) IUnknown_Release(converter);
+  if (frame != NULL) IUnknown_Release(frame);
+  if (decoder != NULL) IUnknown_Release(decoder);
+  if (factory != NULL) IUnknown_Release(factory);
+  if (stream != NULL) IUnknown_Release(stream);
+  free(rgb);
+  return SUCCEEDED(hr);
+}
+#else  // !HAVE_WINCODEC_H
+int ReadPictureWithWIC(const char* const filename,
+                       struct WebPPicture* const pic, int keep_alpha,
+                       struct Metadata* const metadata) {
+  (void)filename;
+  (void)pic;
+  (void)keep_alpha;
+  (void)metadata;
+  fprintf(stderr, "Windows Imaging Component (WIC) support not compiled. "
+                  "Visual Studio and mingw-w64 builds support WIC. Make sure "
+                  "wincodec.h detection is working correctly if using autoconf "
+                  "and HAVE_WINCODEC_H is defined before building.\n");
+  return 0;
+}
+#endif  // HAVE_WINCODEC_H
+
+// -----------------------------------------------------------------------------
--- a/examples/wicdec.h
+++ b/examples/wicdec.h
@ -0,0 +1,34 @@
+// Copyright 2013 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// Windows Imaging Component (WIC) decode.
+
+#ifndef WEBP_EXAMPLES_WICDEC_H_
+#define WEBP_EXAMPLES_WICDEC_H_
+
+#if defined(__cplusplus) || defined(c_plusplus)
+extern "C" {
+#endif
+
+struct Metadata;
+struct WebPPicture;
+
+// Reads an image from 'filename', returning the decoded output in 'pic'.
+// If 'keep_alpha' is true and the image has an alpha channel, the output is
+// RGBA otherwise it will be RGB.
+// Returns true on success.
+int ReadPictureWithWIC(const char* const filename,
+                       struct WebPPicture* const pic, int keep_alpha,
+                       struct Metadata* const metadata);
+
+#if defined(__cplusplus) || defined(c_plusplus)
+}    // extern "C"
+#endif
+
+#endif  // WEBP_EXAMPLES_WICDEC_H_
--- a/iosbuild.sh
+++ b/iosbuild.sh
@ -0,0 +1,93 @@
+#!/bin/bash
+#
+# This script generates 'WebP.framework'. An iOS app can decode WebP images
+# by including 'WebP.framework'.
+#
+# Run ./iosbuild.sh to generate 'WebP.framework' under the current directory
+# (previous build will be erased if it exists).
+#
+# This script is inspired by the build script written by Carson McDonald.
+# (http://www.ioncannon.net/programming/1483/using-webp-to-reduce-native-ios-app-size/).
+
+set -e
+
+# Extract the latest SDK version from the final field of the form: iphoneosX.Y
+declare -r SDK=$(xcodebuild -showsdks \
+  | grep iphoneos | sort | tail -n 1 | awk '{print substr($NF, 9)}'
+)
+declare -r OLDPATH=${PATH}
+
+# Add iPhoneOS-V6 to the list of platforms below if you need armv6 support.
+# Note that iPhoneOS-V6 support is not available with the iOS6 SDK.
+declare -r PLATFORMS="iPhoneSimulator iPhoneOS-V7 iPhoneOS-V7s"
+declare -r SRCDIR=$(dirname $0)
+declare -r TOPDIR=$(pwd)
+declare -r BUILDDIR="${TOPDIR}/iosbuild"
+declare -r TARGETDIR="${TOPDIR}/WebP.framework"
+declare -r DEVELOPER=$(xcode-select --print-path)
+declare -r PLATFORMSROOT="${DEVELOPER}/Platforms"
+declare -r LIPO=$(xcrun -sdk iphoneos${SDK} -find lipo)
+LIBLIST=''
+
+if [[ -z "${SDK}" ]]; then
+  echo "iOS SDK not available"
+  exit 1
+elif [[ ${SDK} < 4.0 ]]; then
+  echo "You need iOS SDK version 4.0 or above"
+  exit 1
+else
+  echo "iOS SDK Version ${SDK}"
+fi
+
+rm -rf ${BUILDDIR}
+rm -rf ${TARGETDIR}
+mkdir -p ${BUILDDIR}
+mkdir -p ${TARGETDIR}/Headers/
+
+[[ -e ${SRCDIR}/configure ]] || (cd ${SRCDIR} && sh autogen.sh)
+
+for PLATFORM in ${PLATFORMS}; do
+  if [[ "${PLATFORM}" == "iPhoneOS-V7s" ]]; then
+    PLATFORM="iPhoneOS"
+    ARCH="armv7s"
+  elif [[ "${PLATFORM}" == "iPhoneOS-V7" ]]; then
+    PLATFORM="iPhoneOS"
+    ARCH="armv7"
+  elif [[ "${PLATFORM}" == "iPhoneOS-V6" ]]; then
+    PLATFORM="iPhoneOS"
+    ARCH="armv6"
+  else
+    ARCH="i386"
+  fi
+
+  ROOTDIR="${BUILDDIR}/${PLATFORM}-${SDK}-${ARCH}"
+  mkdir -p "${ROOTDIR}"
+
+  export DEVROOT="${PLATFORMSROOT}/${PLATFORM}.platform/Developer"
+  export SDKROOT="${DEVROOT}/SDKs/${PLATFORM}${SDK}.sdk"
+
+  export CFLAGS="-arch ${ARCH} -pipe -isysroot ${SDKROOT}"
+  export CXXFLAGS=${CFLAGS}
+  export LDFLAGS="-arch ${ARCH} -pipe -isysroot ${SDKROOT}"
+  export PATH="${DEVROOT}/usr/bin:${OLDPATH}"
+
+  ${SRCDIR}/configure --host=${ARCH}-apple-darwin --prefix=${ROOTDIR} \
+    --build=$(${SRCDIR}/config.guess) \
+    --disable-shared --enable-static \
+    --enable-libwebpdecoder --enable-swap-16bit-csp
+
+  # run make only in the src/ directory to create libwebpdecoder.a
+  cd src/
+  make V=0
+  make install
+
+  LIBLIST+=" ${ROOTDIR}/lib/libwebpdecoder.a"
+
+  make clean
+  cd ..
+
+  export PATH=${OLDPATH}
+done
+
+cp -a ${SRCDIR}/src/webp/* ${TARGETDIR}/Headers/
+${LIPO} -create ${LIBLIST} -output ${TARGETDIR}/WebP
--- a/m4/ax_pthread.m4
+++ b/m4/ax_pthread.m4
@ -0,0 +1,309 @@
+# ===========================================================================
+#        http://www.gnu.org/software/autoconf-archive/ax_pthread.html
+# ===========================================================================
+#
+# SYNOPSIS
+#
+#   AX_PTHREAD([ACTION-IF-FOUND[, ACTION-IF-NOT-FOUND]])
+#
+# DESCRIPTION
+#
+#   This macro figures out how to build C programs using POSIX threads. It
+#   sets the PTHREAD_LIBS output variable to the threads library and linker
+#   flags, and the PTHREAD_CFLAGS output variable to any special C compiler
+#   flags that are needed. (The user can also force certain compiler
+#   flags/libs to be tested by setting these environment variables.)
+#
+#   Also sets PTHREAD_CC to any special C compiler that is needed for
+#   multi-threaded programs (defaults to the value of CC otherwise). (This
+#   is necessary on AIX to use the special cc_r compiler alias.)
+#
+#   NOTE: You are assumed to not only compile your program with these flags,
+#   but also link it with them as well. e.g. you should link with
+#   $PTHREAD_CC $CFLAGS $PTHREAD_CFLAGS $LDFLAGS ... $PTHREAD_LIBS $LIBS
+#
+#   If you are only building threads programs, you may wish to use these
+#   variables in your default LIBS, CFLAGS, and CC:
+#
+#     LIBS="$PTHREAD_LIBS $LIBS"
+#     CFLAGS="$CFLAGS $PTHREAD_CFLAGS"
+#     CC="$PTHREAD_CC"
+#
+#   In addition, if the PTHREAD_CREATE_JOINABLE thread-attribute constant
+#   has a nonstandard name, defines PTHREAD_CREATE_JOINABLE to that name
+#   (e.g. PTHREAD_CREATE_UNDETACHED on AIX).
+#
+#   Also HAVE_PTHREAD_PRIO_INHERIT is defined if pthread is found and the
+#   PTHREAD_PRIO_INHERIT symbol is defined when compiling with
+#   PTHREAD_CFLAGS.
+#
+#   ACTION-IF-FOUND is a list of shell commands to run if a threads library
+#   is found, and ACTION-IF-NOT-FOUND is a list of commands to run it if it
+#   is not found. If ACTION-IF-FOUND is not specified, the default action
+#   will define HAVE_PTHREAD.
+#
+#   Please let the authors know if this macro fails on any platform, or if
+#   you have any other suggestions or comments. This macro was based on work
+#   by SGJ on autoconf scripts for FFTW (http://www.fftw.org/) (with help
+#   from M. Frigo), as well as ac_pthread and hb_pthread macros posted by
+#   Alejandro Forero Cuervo to the autoconf macro repository. We are also
+#   grateful for the helpful feedback of numerous users.
+#
+#   Updated for Autoconf 2.68 by Daniel Richard G.
+#
+# LICENSE
+#
+#   Copyright (c) 2008 Steven G. Johnson <stevenj@alum.mit.edu>
+#   Copyright (c) 2011 Daniel Richard G. <skunk@iSKUNK.ORG>
+#
+#   This program is free software: you can redistribute it and/or modify it
+#   under the terms of the GNU General Public License as published by the
+#   Free Software Foundation, either version 3 of the License, or (at your
+#   option) any later version.
+#
+#   This program is distributed in the hope that it will be useful, but
+#   WITHOUT ANY WARRANTY; without even the implied warranty of
+#   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General
+#   Public License for more details.
+#
+#   You should have received a copy of the GNU General Public License along
+#   with this program. If not, see <http://www.gnu.org/licenses/>.
+#
+#   As a special exception, the respective Autoconf Macro's copyright owner
+#   gives unlimited permission to copy, distribute and modify the configure
+#   scripts that are the output of Autoconf when processing the Macro. You
+#   need not follow the terms of the GNU General Public License when using
+#   or distributing such scripts, even though portions of the text of the
+#   Macro appear in them. The GNU General Public License (GPL) does govern
+#   all other use of the material that constitutes the Autoconf Macro.
+#
+#   This special exception to the GPL applies to versions of the Autoconf
+#   Macro released by the Autoconf Archive. When you make and distribute a
+#   modified version of the Autoconf Macro, you may extend this special
+#   exception to the GPL to apply to your modified version as well.
+
+#serial 18
+
+AU_ALIAS([ACX_PTHREAD], [AX_PTHREAD])
+AC_DEFUN([AX_PTHREAD], [
+AC_REQUIRE([AC_CANONICAL_HOST])
+AC_LANG_PUSH([C])
+ax_pthread_ok=no
+
+# We used to check for pthread.h first, but this fails if pthread.h
+# requires special compiler flags (e.g. on True64 or Sequent).
+# It gets checked for in the link test anyway.
+
+# First of all, check if the user has set any of the PTHREAD_LIBS,
+# etcetera environment variables, and if threads linking works using
+# them:
+if test x"$PTHREAD_LIBS$PTHREAD_CFLAGS" != x; then
+        save_CFLAGS="$CFLAGS"
+        CFLAGS="$CFLAGS $PTHREAD_CFLAGS"
+        save_LIBS="$LIBS"
+        LIBS="$PTHREAD_LIBS $LIBS"
+        AC_MSG_CHECKING([for pthread_join in LIBS=$PTHREAD_LIBS with CFLAGS=$PTHREAD_CFLAGS])
+        AC_TRY_LINK_FUNC(pthread_join, ax_pthread_ok=yes)
+        AC_MSG_RESULT($ax_pthread_ok)
+        if test x"$ax_pthread_ok" = xno; then
+                PTHREAD_LIBS=""
+                PTHREAD_CFLAGS=""
+        fi
+        LIBS="$save_LIBS"
+        CFLAGS="$save_CFLAGS"
+fi
+
+# We must check for the threads library under a number of different
+# names; the ordering is very important because some systems
+# (e.g. DEC) have both -lpthread and -lpthreads, where one of the
+# libraries is broken (non-POSIX).
+
+# Create a list of thread flags to try.  Items starting with a "-" are
+# C compiler flags, and other items are library names, except for "none"
+# which indicates that we try without any flags at all, and "pthread-config"
+# which is a program returning the flags for the Pth emulation library.
+
+ax_pthread_flags="pthreads none -Kthread -kthread lthread -pthread -pthreads -mthreads pthread --thread-safe -mt pthread-config"
+
+# The ordering *is* (sometimes) important.  Some notes on the
+# individual items follow:
+
+# pthreads: AIX (must check this before -lpthread)
+# none: in case threads are in libc; should be tried before -Kthread and
+#       other compiler flags to prevent continual compiler warnings
+# -Kthread: Sequent (threads in libc, but -Kthread needed for pthread.h)
+# -kthread: FreeBSD kernel threads (preferred to -pthread since SMP-able)
+# lthread: LinuxThreads port on FreeBSD (also preferred to -pthread)
+# -pthread: Linux/gcc (kernel threads), BSD/gcc (userland threads)
+# -pthreads: Solaris/gcc
+# -mthreads: Mingw32/gcc, Lynx/gcc
+# -mt: Sun Workshop C (may only link SunOS threads [-lthread], but it
+#      doesn't hurt to check since this sometimes defines pthreads too;
+#      also defines -D_REENTRANT)
+#      ... -mt is also the pthreads flag for HP/aCC
+# pthread: Linux, etcetera
+# --thread-safe: KAI C++
+# pthread-config: use pthread-config program (for GNU Pth library)
+
+case ${host_os} in
+        solaris*)
+
+        # On Solaris (at least, for some versions), libc contains stubbed
+        # (non-functional) versions of the pthreads routines, so link-based
+        # tests will erroneously succeed.  (We need to link with -pthreads/-mt/
+        # -lpthread.)  (The stubs are missing pthread_cleanup_push, or rather
+        # a function called by this macro, so we could check for that, but
+        # who knows whether they'll stub that too in a future libc.)  So,
+        # we'll just look for -pthreads and -lpthread first:
+
+        ax_pthread_flags="-pthreads pthread -mt -pthread $ax_pthread_flags"
+        ;;
+
+        darwin*)
+        ax_pthread_flags="-pthread $ax_pthread_flags"
+        ;;
+esac
+
+if test x"$ax_pthread_ok" = xno; then
+for flag in $ax_pthread_flags; do
+
+        case $flag in
+                none)
+                AC_MSG_CHECKING([whether pthreads work without any flags])
+                ;;
+
+                -*)
+                AC_MSG_CHECKING([whether pthreads work with $flag])
+                PTHREAD_CFLAGS="$flag"
+                ;;
+
+                pthread-config)
+                AC_CHECK_PROG(ax_pthread_config, pthread-config, yes, no)
+                if test x"$ax_pthread_config" = xno; then continue; fi
+                PTHREAD_CFLAGS="`pthread-config --cflags`"
+                PTHREAD_LIBS="`pthread-config --ldflags` `pthread-config --libs`"
+                ;;
+
+                *)
+                AC_MSG_CHECKING([for the pthreads library -l$flag])
+                PTHREAD_LIBS="-l$flag"
+                ;;
+        esac
+
+        save_LIBS="$LIBS"
+        save_CFLAGS="$CFLAGS"
+        LIBS="$PTHREAD_LIBS $LIBS"
+        CFLAGS="$CFLAGS $PTHREAD_CFLAGS"
+
+        # Check for various functions.  We must include pthread.h,
+        # since some functions may be macros.  (On the Sequent, we
+        # need a special flag -Kthread to make this header compile.)
+        # We check for pthread_join because it is in -lpthread on IRIX
+        # while pthread_create is in libc.  We check for pthread_attr_init
+        # due to DEC craziness with -lpthreads.  We check for
+        # pthread_cleanup_push because it is one of the few pthread
+        # functions on Solaris that doesn't have a non-functional libc stub.
+        # We try pthread_create on general principles.
+        AC_LINK_IFELSE([AC_LANG_PROGRAM([#include <pthread.h>
+                        static void routine(void *a) { a = 0; }
+                        static void *start_routine(void *a) { return a; }],
+                       [pthread_t th; pthread_attr_t attr;
+                        pthread_create(&th, 0, start_routine, 0);
+                        pthread_join(th, 0);
+                        pthread_attr_init(&attr);
+                        pthread_cleanup_push(routine, 0);
+                        pthread_cleanup_pop(0) /* ; */])],
+                [ax_pthread_ok=yes],
+                [])
+
+        LIBS="$save_LIBS"
+        CFLAGS="$save_CFLAGS"
+
+        AC_MSG_RESULT($ax_pthread_ok)
+        if test "x$ax_pthread_ok" = xyes; then
+                break;
+        fi
+
+        PTHREAD_LIBS=""
+        PTHREAD_CFLAGS=""
+done
+fi
+
+# Various other checks:
+if test "x$ax_pthread_ok" = xyes; then
+        save_LIBS="$LIBS"
+        LIBS="$PTHREAD_LIBS $LIBS"
+        save_CFLAGS="$CFLAGS"
+        CFLAGS="$CFLAGS $PTHREAD_CFLAGS"
+
+        # Detect AIX lossage: JOINABLE attribute is called UNDETACHED.
+        AC_MSG_CHECKING([for joinable pthread attribute])
+        attr_name=unknown
+        for attr in PTHREAD_CREATE_JOINABLE PTHREAD_CREATE_UNDETACHED; do
+            AC_LINK_IFELSE([AC_LANG_PROGRAM([#include <pthread.h>],
+                           [int attr = $attr; return attr /* ; */])],
+                [attr_name=$attr; break],
+                [])
+        done
+        AC_MSG_RESULT($attr_name)
+        if test "$attr_name" != PTHREAD_CREATE_JOINABLE; then
+            AC_DEFINE_UNQUOTED(PTHREAD_CREATE_JOINABLE, $attr_name,
+                               [Define to necessary symbol if this constant
+                                uses a non-standard name on your system.])
+        fi
+
+        AC_MSG_CHECKING([if more special flags are required for pthreads])
+        flag=no
+        case ${host_os} in
+            aix* | freebsd* | darwin*) flag="-D_THREAD_SAFE";;
+            osf* | hpux*) flag="-D_REENTRANT";;
+            solaris*)
+            if test "$GCC" = "yes"; then
+                flag="-D_REENTRANT"
+            else
+                flag="-mt -D_REENTRANT"
+            fi
+            ;;
+        esac
+        AC_MSG_RESULT(${flag})
+        if test "x$flag" != xno; then
+            PTHREAD_CFLAGS="$flag $PTHREAD_CFLAGS"
+        fi
+
+        AC_CACHE_CHECK([for PTHREAD_PRIO_INHERIT],
+            ax_cv_PTHREAD_PRIO_INHERIT, [
+                AC_LINK_IFELSE([
+                    AC_LANG_PROGRAM([[#include <pthread.h>]], [[int i = PTHREAD_PRIO_INHERIT;]])],
+                    [ax_cv_PTHREAD_PRIO_INHERIT=yes],
+                    [ax_cv_PTHREAD_PRIO_INHERIT=no])
+            ])
+        AS_IF([test "x$ax_cv_PTHREAD_PRIO_INHERIT" = "xyes"],
+            AC_DEFINE([HAVE_PTHREAD_PRIO_INHERIT], 1, [Have PTHREAD_PRIO_INHERIT.]))
+
+        LIBS="$save_LIBS"
+        CFLAGS="$save_CFLAGS"
+
+        # More AIX lossage: must compile with xlc_r or cc_r
+        if test x"$GCC" != xyes; then
+          AC_CHECK_PROGS(PTHREAD_CC, xlc_r cc_r, ${CC})
+        else
+          PTHREAD_CC=$CC
+        fi
+else
+        PTHREAD_CC="$CC"
+fi
+
+AC_SUBST(PTHREAD_LIBS)
+AC_SUBST(PTHREAD_CFLAGS)
+AC_SUBST(PTHREAD_CC)
+
+# Finally, execute ACTION-IF-FOUND/ACTION-IF-NOT-FOUND:
+if test x"$ax_pthread_ok" = xyes; then
+        ifelse([$1],,AC_DEFINE(HAVE_PTHREAD,1,[Define if you have POSIX threads libraries and header files.]),[$1])
+        :
+else
+        ax_pthread_ok=no
+        $2
+fi
+AC_LANG_POP
+])dnl AX_PTHREAD
--- a/makefile.unix
+++ b/makefile.unix
@ -2,7 +2,8 @@
 # system, for simple local building of the libraries and tools.
 # It will not install the libraries system-wide, but just create the 'cwebp'
 # and 'dwebp' tools in the examples/ directory, along with the static
-# library 'src/libwebp.a'.
+# libraries 'src/libwebp.a', 'src/libwebpdecoder.a', 'src/mux/libwebpmux.a' and
+# 'src/demux/libwebpdemux.a'.
 #
 # To build the library and examples, use:
 #    make -f makefile.unix
@ -10,23 +11,40 @@

 #### Customizable part ####

-# These flag assume you have libpng and libjpeg installed. If not, either
-# follow below install instructions or just comment out the next lines.
-EXTRA_FLAGS= -DWEBP_HAVE_PNG -DWEBP_HAVE_JPEG
-EXTRA_LIBS= -lpng -ljpeg -lz
+# These flags assume you have libpng, libjpeg, libtiff and libgif installed. If
+# not, either follow the install instructions below or just comment out the next
+# four lines.
+EXTRA_FLAGS= -DWEBP_HAVE_PNG -DWEBP_HAVE_JPEG -DWEBP_HAVE_TIFF
+DWEBP_LIBS= -lpng -lz
+CWEBP_LIBS= $(DWEBP_LIBS) -ljpeg -ltiff
+GIF_LIBS = -lgif
+
 ifeq ($(strip $(shell uname)), Darwin)
+  # Work around a problem linking tables marked as common symbols,
+  # cf., src/enc/yuv.[hc]
+  # Failure observed with: gcc 4.2.1 and 4.0.1.
+  EXTRA_FLAGS += -fno-common
+  EXTRA_FLAGS += -DHAVE_GLUT_GLUT_H
  EXTRA_FLAGS += -I/opt/local/include
  EXTRA_LIBS  += -L/opt/local/lib
+  GL_LIBS = -framework GLUT -framework OpenGL
+else
+  GL_LIBS = -lglut -lGL
 endif

+
 # To install libraries on Mac OS X:
 # 1. Install MacPorts (http://www.macports.org/install.php)
 # 2. Run "sudo port install jpeg"
 # 3. Run "sudo port install libpng"
+# 4. Run "sudo port install tiff"
+# 5. Run "sudo port install giflib"

 # To install libraries on Linux:
 # 1. Run "sudo apt-get install libjpeg62-dev"
 # 2. Run "sudo apt-get install libpng12-dev"
+# 3. Run "sudo apt-get install libtiff4-dev"
+# 4. Run "sudo apt-get install libgif-dev"

 # Uncomment for build for 32bit platform
 # Alternatively, you can just use the command
@ -36,6 +54,9 @@ endif
 # Extra flags to enable experimental features and code
 # EXTRA_FLAGS += -DWEBP_EXPERIMENTAL_FEATURES

+# Extra flags to enable byte swap for 16 bit colorspaces.
+# EXTRA_FLAGS += -DWEBP_SWAP_16BIT_CSP
+
 # Extra flags to enable multi-threading
 EXTRA_FLAGS += -DWEBP_USE_THREAD
 EXTRA_LIBS += -lpthread
@ -45,78 +66,215 @@ EXTRA_FLAGS += -Wextra -Wold-style-definition
 EXTRA_FLAGS += -Wmissing-prototypes
 EXTRA_FLAGS += -Wmissing-declarations
 EXTRA_FLAGS += -Wdeclaration-after-statement
+EXTRA_FLAGS += -Wshadow
 # EXTRA_FLAGS += -Wvla

 #### Nothing should normally be changed below this line ####

 AR = ar
 ARFLAGS = r
-CC = gcc -Isrc/ -Iexamples/ -Wall
+CC = gcc
+CPPFLAGS = -Isrc/ -Wall
 CFLAGS = -O3 -DNDEBUG $(EXTRA_FLAGS)
 INSTALL = install
-LDFLAGS = $(EXTRA_LIBS) -lm
+GROFF = /usr/bin/groff
+COL = /usr/bin/col
+LDFLAGS = $(EXTRA_LIBS) $(EXTRA_FLAGS) -lm

-DEC_OBJS = src/dec/frame.o src/dec/webp.o src/dec/quant.o src/dec/tree.o \
-           src/dec/vp8.o src/dec/idec.o src/dec/alpha.o src/dec/layer.o \
-           src/dec/io.o src/dec/buffer.o
-ENC_OBJS = src/enc/webpenc.o src/enc/syntax.o \
-           src/enc/alpha.o src/enc/layer.o \
-           src/enc/tree.o src/enc/config.o src/enc/frame.o \
-           src/enc/quant.o src/enc/iterator.o src/enc/analysis.o \
-           src/enc/cost.o src/enc/picture.o src/enc/filter.o
-DSP_OBJS = src/dsp/cpu.o src/dsp/enc.o \
-           src/dsp/enc_sse2.o src/dsp/dec.o src/dsp/dec_sse2.o \
-           src/dsp/dec_neon.o src/dsp/upsampling.o src/dsp/upsampling_sse2.o \
-           src/dsp/yuv.o
-UTILS_OBJS = src/utils/bit_reader.o src/utils/bit_writer.o src/utils/thread.o
+DEC_OBJS = \
+    src/dec/alpha.o \
+    src/dec/buffer.o \
+    src/dec/frame.o \
+    src/dec/idec.o \
+    src/dec/io.o \
+    src/dec/layer.o \
+    src/dec/quant.o \
+    src/dec/tree.o \
+    src/dec/vp8.o \
+    src/dec/vp8l.o \
+    src/dec/webp.o \

-OBJS = $(DEC_OBJS) $(ENC_OBJS) $(DSP_OBJS) $(UTILS_OBJS)
+DEMUX_OBJS = \
+    src/demux/demux.o \

-HDRS = src/webp/encode.h src/enc/vp8enci.h src/enc/cost.h \
-       src/dec/vp8i.h  \
-       src/dsp/yuv.h src/dsp/dsp.h \
-       src/utils/bit_writer.h src/utils/bit_reader.h src/utils/thread.h
+DSP_DEC_OBJS = \
+    src/dsp/cpu.o \
+    src/dsp/dec.o \
+    src/dsp/dec_neon.o \
+    src/dsp/dec_sse2.o \
+    src/dsp/lossless.o \
+    src/dsp/upsampling.o \
+    src/dsp/upsampling_neon.o \
+    src/dsp/upsampling_sse2.o \
+    src/dsp/yuv.o \

-OUTPUT = examples/cwebp examples/dwebp src/libwebp.a
+DSP_ENC_OBJS = \
+    src/dsp/enc.o \
+    src/dsp/enc_neon.o \
+    src/dsp/enc_sse2.o \

-all:ex
+ENC_OBJS = \
+    src/enc/alpha.o \
+    src/enc/analysis.o \
+    src/enc/backward_references.o \
+    src/enc/config.o \
+    src/enc/cost.o \
+    src/enc/filter.o \
+    src/enc/frame.o \
+    src/enc/histogram.o \
+    src/enc/iterator.o \
+    src/enc/layer.o \
+    src/enc/picture.o \
+    src/enc/quant.o \
+    src/enc/syntax.o \
+    src/enc/token.o \
+    src/enc/tree.o \
+    src/enc/vp8l.o \
+    src/enc/webpenc.o \
+
+EX_FORMAT_DEC_OBJS = \
+    examples/jpegdec.o \
+    examples/metadata.o \
+    examples/pngdec.o \
+    examples/tiffdec.o \
+
+EX_UTIL_OBJS = \
+    examples/example_util.o \
+
+MUX_OBJS = \
+    src/mux/muxedit.o \
+    src/mux/muxinternal.o \
+    src/mux/muxread.o \
+
+UTILS_DEC_OBJS = \
+    src/utils/bit_reader.o \
+    src/utils/color_cache.o \
+    src/utils/filters.o \
+    src/utils/huffman.o \
+    src/utils/quant_levels_dec.o \
+    src/utils/rescaler.o \
+    src/utils/thread.o \
+    src/utils/utils.o \
+
+UTILS_ENC_OBJS = \
+    src/utils/bit_writer.o \
+    src/utils/huffman_encode.o \
+    src/utils/quant_levels.o \
+
+LIBWEBPDECODER_OBJS = $(DEC_OBJS) $(DSP_DEC_OBJS) $(UTILS_DEC_OBJS)
+LIBWEBP_OBJS = $(LIBWEBPDECODER_OBJS) $(ENC_OBJS) $(DSP_ENC_OBJS) \
+               $(UTILS_ENC_OBJS)
+LIBWEBPMUX_OBJS = $(MUX_OBJS)
+LIBWEBPDEMUX_OBJS = $(DEMUX_OBJS)
+
+HDRS_INSTALLED = \
+    src/webp/decode.h \
+    src/webp/demux.h \
+    src/webp/encode.h \
+    src/webp/mux.h \
+    src/webp/mux_types.h \
+    src/webp/types.h \
+
+HDRS = \
+    src/dec/decode_vp8.h \
+    src/dec/vp8i.h \
+    src/dec/vp8li.h \
+    src/dec/webpi.h \
+    src/dsp/dsp.h \
+    src/dsp/lossless.h \
+    src/dsp/yuv.h \
+    src/enc/cost.h \
+    src/enc/vp8enci.h \
+    src/utils/bit_reader.h \
+    src/utils/bit_writer.h \
+    src/utils/color_cache.h \
+    src/utils/filters.h \
+    src/utils/huffman.h \
+    src/utils/huffman_encode.h \
+    src/utils/quant_levels.h \
+    src/utils/quant_levels_dec.h \
+    src/utils/rescaler.h \
+    src/utils/thread.h \
+    src/webp/format_constants.h \
+    $(HDRS_INSTALLED) \
+
+OUT_LIBS = examples/libexample_util.a src/libwebpdecoder.a src/libwebp.a
+OUT_EXAMPLES = examples/cwebp examples/dwebp
+EXTRA_EXAMPLES = examples/gif2webp examples/vwebp examples/webpmux
+
+OUTPUT = $(OUT_LIBS) $(OUT_EXAMPLES)
+ifeq ($(MAKECMDGOALS),clean)
+  OUTPUT += $(EXTRA_EXAMPLES)
+  OUTPUT += src/demux/libwebpdemux.a src/mux/libwebpmux.a
+endif
+
+ex: $(OUT_EXAMPLES)
+all: ex $(EXTRA_EXAMPLES)
+
+$(EX_FORMAT_DEC_OBJS): %.o: %.h

 %.o: %.c $(HDRS)
-	$(CC) $(CFLAGS) -c $< -o $@
+	$(CC) $(CFLAGS) $(CPPFLAGS) -c $< -o $@

-src/libwebp.a:  $(OBJS)
+examples/libexample_util.a: $(EX_UTIL_OBJS)
+src/libwebpdecoder.a: $(LIBWEBPDECODER_OBJS)
+src/libwebp.a: $(LIBWEBP_OBJS)
+src/mux/libwebpmux.a: $(LIBWEBPMUX_OBJS)
+src/demux/libwebpdemux.a: $(LIBWEBPDEMUX_OBJS)
+
+%.a:
 	$(AR) $(ARFLAGS) $@ $^

-ex: examples/cwebp examples/dwebp
+examples/cwebp: examples/cwebp.o $(EX_FORMAT_DEC_OBJS)
+examples/dwebp: examples/dwebp.o
+examples/gif2webp: examples/gif2webp.o
+examples/vwebp: examples/vwebp.o
+examples/webpmux: examples/webpmux.o

-examples/cwebp: examples/cwebp.o src/libwebp.a
-examples/dwebp: examples/dwebp.o src/libwebp.a
-examples/cwebp examples/dwebp:
+examples/cwebp: src/libwebp.a
+examples/cwebp: EXTRA_LIBS += $(CWEBP_LIBS)
+examples/dwebp: examples/libexample_util.a src/libwebpdecoder.a
+examples/dwebp: EXTRA_LIBS += $(DWEBP_LIBS)
+examples/gif2webp: examples/libexample_util.a src/mux/libwebpmux.a src/libwebp.a
+examples/gif2webp: EXTRA_LIBS += $(GIF_LIBS)
+examples/vwebp: examples/libexample_util.a src/demux/libwebpdemux.a
+examples/vwebp: src/libwebp.a
+examples/vwebp: EXTRA_LIBS += $(GL_LIBS)
+examples/webpmux: examples/libexample_util.a src/mux/libwebpmux.a
+examples/webpmux: src/libwebpdecoder.a
+
+$(OUT_EXAMPLES) $(EXTRA_EXAMPLES):
 	$(CC) -o $@ $^ $(LDFLAGS)

 dist: DESTDIR := dist
+dist: OUT_EXAMPLES += $(EXTRA_EXAMPLES)
 dist: all
 	$(INSTALL) -m755 -d $(DESTDIR)/include/webp \
-	    $(DESTDIR)/doc $(DESTDIR)/lib
-	$(INSTALL) -m755 -s examples/cwebp examples/dwebp $(DESTDIR)
-	$(INSTALL) -m644 src/webp/*.h $(DESTDIR)/include/webp
+	           $(DESTDIR)/doc $(DESTDIR)/lib
+	$(INSTALL) -m755 -s $(OUT_EXAMPLES) $(DESTDIR)
+	$(INSTALL) -m644 $(HDRS_INSTALLED) $(DESTDIR)/include/webp
 	$(INSTALL) -m644 src/libwebp.a $(DESTDIR)/lib
+	$(INSTALL) -m644 src/demux/libwebpdemux.a $(DESTDIR)/lib
+	$(INSTALL) -m644 src/mux/libwebpmux.a $(DESTDIR)/lib
 	umask 022; \
-	for m in man/[cd]webp.1; do \
+	for m in man/[cd]webp.1 man/gif2webp.1 man/webpmux.1; do \
 	  basenam=$$(basename $$m .1); \
-	  /usr/bin/groff -t -e -man -T utf8 $$m \
-	    | col -bx >$(DESTDIR)/doc/$${basenam}.txt; \
-	  /usr/bin/groff -t -e -man -T html $$m \
-	    | col -bx >$(DESTDIR)/doc/$${basenam}.html; \
+	  $(GROFF) -t -e -man -T utf8 $$m \
+	    | $(COL) -bx >$(DESTDIR)/doc/$${basenam}.txt; \
+	  $(GROFF) -t -e -man -T html $$m \
+	    | $(COL) -bx >$(DESTDIR)/doc/$${basenam}.html; \
 	done

 clean:
-	$(RM) ${OUTPUT} *~ \
-              src/enc/*.o src/enc/*~ \
+	$(RM) $(OUTPUT) *~ \
+              examples/*.o examples/*~ \
              src/dec/*.o src/dec/*~ \
+              src/demux/*.o src/demux/*~ \
              src/dsp/*.o src/dsp/*~ \
+              src/enc/*.o src/enc/*~ \
+              src/mux/*.o src/mux/*~ \
              src/utils/*.o src/utils/*~ \
-              examples/*.o examples/*~
+              src/webp/*~ man/*~ doc/*~ swig/*~ \

 superclean: clean
 	$(RM) -r .git *.log *.cache *~
@ -127,7 +285,8 @@ superclean: clean
 	$(RM) Makefile */Makefile */*/Makefile
 	$(RM) Makefile.in */Makefile.in */*/Makefile.in
 	$(RM) config.log autom4te.cache libtool config.h stamp-h1
-	$(RM) aclocal.m4 compile config.guess config.h.in config.sub config.status
+	$(RM) aclocal.m4 compile
+	$(RM) config.guess config.h.in config.sub config.status
 	$(RM) configure depcomp install-sh ltmain.sh missing src/libwebp.pc
 	$(RM) m4/*

--- a/man/Makefile.am
+++ b/man/Makefile.am
@ -1,2 +1,8 @@
 man_MANS = cwebp.1 dwebp.1
+if WANT_MUX
+  man_MANS += webpmux.1
+endif
+if BUILD_GIF2WEBP
+  man_MANS += gif2webp.1
+endif
 EXTRA_DIST = $(man_MANS)
--- a/man/cwebp.1
+++ b/man/cwebp.1
@ -1,10 +1,10 @@
 .\"                                      Hey, EMACS: -*- nroff -*-
-.TH CWEBP 1 "September 19, 2011"
+.TH CWEBP 1 "March 13, 2013"
 .SH NAME
 cwebp \- compress an image file to a WebP file
 .SH SYNOPSIS
 .B cwebp
-.RI [ options ] " input_file -o output_file.webp
+.RI [ options ] " input_file \-o output_file.webp
 .br
 .SH DESCRIPTION
 This manual page documents the
@ -12,13 +12,11 @@ This manual page documents the
 command.
 .PP
 \fBcwebp\fP compresses an image using the WebP format.
-Input format can be either PNG, JPEG, or raw Y'CbCr samples.
-When using PNG, the transparency information (alpha channel) is currently
-discarded.
+Input format can be either PNG, JPEG, TIFF or raw Y'CbCr samples.
 .SH OPTIONS
 The basic options are:
 .TP
-.B \-o string
+.BI \-o " string
 Specify the name of the output WebP file. If omitted, \fBcwebp\fP will
 perform compression but only report statistics.
 .TP
@ -31,19 +29,30 @@ A summary of all the possible options.
 .B \-version
 Print the version number (as major.minor.revision) and exit.
 .TP
-.B \-q float
-Specify the compression factor between 0 and 100. A small factor
-produces a smaller file with lower quality. Best quality is achieved
-using a value of 100. The default is 75.
+.BI \-q " float
+Specify the compression factor for RGB channels between 0 and 100. The default
+is 75.
+.br
+In case of lossy compression (default), a small factor produces a smaller file
+with lower quality. Best quality is achieved by using a value of 100.
+.br
+In case of lossless compression (specified by the \-lossless option), a small
+factor enables faster compression speed, but produces a larger file. Maximum
+compression is achieved by using a value of 100.
 .TP
-.B \-f int
+.BI \-alpha_q " int
+Specify the compression factor for alpha compression between 0 and 100.
+Lossless compression of alpha is achieved using a value of 100, while the lower
+values result in a lossy compression. The default is 100.
+.TP
+.BI \-f " int
 Specify the strength of the deblocking filter, between 0 (no filtering)
 and 100 (maximum filtering). A value of 0 will turn off any filtering.
 Higher value will increase the strength of the filtering process applied
-after decoding the picture. The higher the smoother the picture will
+after decoding the picture. The higher the value the smoother the picture will
 appear. Typical values are usually in the range of 20 to 50.
 .TP
-.B \-preset string
+.BI \-preset " string
 Specify a set of pre-defined parameters to suit a particular type of
 source material. Possible values are:  \fBdefault\fP, \fBphoto\fP,
 \fBpicture\fP, \fBdrawing\fP, \fBicon\fP, \fBtext\fP. Since
@ -51,21 +60,40 @@ source material. Possible values are:  \fBdefault\fP, \fBphoto\fP,
 \fB\-q\fP one), this option should preferably appear first in the
 order of the arguments.
 .TP
-.B \-sns int
+.BI \-sns " int
 Specify the amplitude of the spatial noise shaping. Spatial noise shaping
 (or \fBsns\fP for short) refers to a general collection of built-in algorithms
 used to decide which area of the picture should use relatively less bits,
 and where else to better transfer these bits. The possible range goes from
 0 (algorithm is off) to 100 (the maximal effect). The default value is 80.
 .TP
-.B \-m int
+.BI \-m " int
 Specify the compression method to use. This parameter controls the
-tradeoff between encoding speed and the compressed file size and quality.
+trade off between encoding speed and the compressed file size and quality.
 Possible values range from 0 to 6. Default value is 4.
 When higher values are used, the encoder will spend more time inspecting
 additional encoding possibilities and decide on the quality gain.
 Lower value can result is faster processing time at the expense of
-larger filesize and lower compression quality.
+larger file size and lower compression quality.
+.TP
+.B \-jpeg_like
+Change the internal parameter mapping to better match the expected size
+of JPEG compression. This flag will generally produce an output file of
+similar size to its JPEG equivalent (for the same \fB\-q\fP setting), but
+with less visual distortion.
+.TP
+.B \-mt
+Use multi-threading for encoding, if possible. This option is only effective
+when using lossy compression on a source with a transparency channel.
+.TP
+.B \-low_memory
+Reduce memory usage of lossy encoding by saving four times the compressed
+size (typically). This will make the encoding slower and the output slightly
+different in size and distortion. This flag is only effective for methods
+3 and up, and is off by default. Note that leaving this flag off will have
+some side effects on the bitstream: it forces certain bitstream features
+like number of partitions (forced to 1). Note that a more detailed report
+of bitstream size is printed by \fBcwebp\fP when using this option.
 .TP
 .B \-af
 Turns auto-filter on. This algorithm will spend additional time optimizing
@ -74,19 +102,25 @@ the filtering strength to reach a well-balanced quality.
 .SH ADDITIONAL OPTIONS
 More advanced options are:
 .TP
-.B \-sharpness int
+.BI \-sharpness " int
 Specify the sharpness of the filtering (if used).
 Range is 0 (sharpest) to 7 (least sharp). Default is 0.
 .TP
 .B \-strong
-Use a stronger filtering than the default one (if filtering is being
-used thanks to the \fB\-f\fP option). Strong filtering is off by default.
+Use strong filtering (if filtering is being used thanks to the
+\fB\-f\fP option). Strong filtering is on by default.
 .TP
-.B \-segments int
+.B \-nostrong
+Disable strong filtering (if filtering is being used thanks to the
+\fB\-f\fP option) and use simple filtering instead.
+.TP
+.BI \-segments " int
 Change the number of partitions to use during the segmentation of the
 sns algorithm. Segments should be in range 1 to 4. Default value is 4.
+This option has no effect for methods 3 and up, unless \fB\-low_memory\fP
+is used.
 .TP
-.B \-partition_limit int
+.BI \-partition_limit " int
 Degrade quality by limiting the number of bits used by some macroblocks.
 Range is 0 (no degradation, the default) to 100 (full degradation).
 Useful values are usually around 30-70 for moderately large images.
@ -105,65 +139,123 @@ If using \fB-partition_limit\fP is not enough to meet the 512k constraint, one
 should use less segments in order to save more header bits per macroblock.
 See the \fB-segments\fP option.
 .TP
-.B \-size int
+.BI \-size " int
 Specify a target size (in bytes) to try and reach for the compressed output.
 Compressor will make several pass of partial encoding in order to get as
 close as possible to this target.
 .TP
-.B \-psnr float
+.BI \-psnr " float
 Specify a target PSNR (in dB) to try and reach for the compressed output.
 Compressor will make several pass of partial encoding in order to get as
 close as possible to this target.
 .TP
-.B \-pass int
-Set a maximum number of pass to use during the dichotomy used by
+.BI \-pass " int
+Set a maximum number of passes to use during the dichotomy used by
 options \fB\-size\fP or \fB\-psnr\fP. Maximum value is 10.
 .TP
-.B \-crop x_position y_position width height
+.BI \-crop " x_position y_position width height
 Crop the source to a rectangle with top-left corner at coordinates
 (\fBx_position\fP, \fBy_position\fP) and size \fBwidth\fP x \fBheight\fP.
 This cropping area must be fully contained within the source rectangle.
 .TP
-.B \-s width height
+.BI \-s " width height
 Specify that the input file actually consists of raw Y'CbCr samples following
 the ITU-R BT.601 recommendation, in 4:2:0 linear format.
 The luma plane has size \fBwidth\fP x \fBheight\fP.
 .TP
-.B \-map int
+.BI \-map " int
 Output additional ASCII-map of encoding information. Possible map values
 range from 1 to 6. This is only meant to help debugging.
 .TP
-.B \-pre int
+.BI \-pre " int
 Specify a pre-processing filter. This option is a placeholder
 and has currently no effect.
 .TP
+.BI \-alpha_filter " string
+Specify the predictive filtering method for the alpha plane. One of 'none',
+\&'fast' or 'best', in increasing complexity and slowness order. Default is
+\&'fast'. Internally, alpha filtering is performed using four possible
+predictions (none, horizontal, vertical, gradient). The 'best' mode will try
+each mode in turn and pick the one which gives the smaller size. The 'fast'
+mode will just try to form an a-priori guess without testing all modes.
+.TP
+.BI \-alpha_method " int
+Specify the algorithm used for alpha compression: 0 or 1. Algorithm 0 denotes
+no compression, 1 uses WebP lossless format for compression. The default is 1.
+.TP
+.B \-alpha_cleanup
+Modify unseen RGB values under fully transparent area, to help compressibility.
+The default is off.
+.TP
+.B \-noalpha
+Using this option will discard the alpha channel.
+.TP
+.B \-lossless
+Encode the image without any loss.
+.TP
+.BI \-hint " string
+Specify the hint about input image type. Possible values are:
+\fBphoto\fP, \fBpicture\fP or \fBgraph\fP.
+.TP
+.BI \-metadata " string
+A comma separated list of metadata to copy from the input to the output if
+present.
+Valid values: \fBall\fP, \fBnone\fP, \fBexif\fP, \fBicc\fP, \fBxmp\fP.
+The default is \fBnone\fP.
+
+Note: each input format may not support all combinations.
+.TP
 .B \-noasm
 Disable all assembly optimizations.
 .TP
 .B \-v
 Print extra information (encoding time in particular).
 .TP
+.B \-print_psnr
+Compute and report average PSNR (Peak-Signal-To-Noise ratio).
+.TP
+.B \-print_ssim
+Compute and report average SSIM (structural similarity
+metric, see http://en.wikipedia.org/wiki/SSIM for additional details).
+.TP
+.B \-print_lsim
+Compute and report local similarity metric (sum of lowest error amongst the
+collocated pixel neighbors).
+.TP
+.B \-progress
+Report encoding progress in percent.
+.TP
 .B \-quiet
 Do not print anything.
 .TP
 .B \-short
 Only print brief information (output file size and PSNR) for testing purpose.

-.SH Examples:
-cwebp -q 70 picture.png -o picture.webp
+.SH BUGS
+Please report all bugs to our issue tracker:
+http://code.google.com/p/webp/issues
 .br
-cwebp -sns 70 -f 50 -strong -af -size 60000 picture.png -o picture.webp
+Patches welcome! See this page to get started:
+http://www.webmproject.org/code/contribute/submitting-patches/

-.SH
-.SH SEE ALSO
-.BR dwebp (1).
+.SH EXAMPLES
+cwebp \-q 50 -lossless picture.png \-o picture_lossless.webp
 .br
-Please refer to http://code.google.com/speed/webp/ for additional
-information.
-.SH AUTHOR
+cwebp \-q 70 picture_with_alpha.png \-o picture_with_alpha.webp
+.br
+cwebp \-sns 70 \-f 50 \-size 60000 picture.png \-o picture.webp
+
+.SH AUTHORS
 \fBcwebp\fP was written by the WebP team.
 .br
 The latest source tree is available at http://www.webmproject.org/code
 .PP
 This manual page was written by Pascal Massimino <pascal.massimino@gmail.com>,
 for the Debian project (and may be used by others).
+
+.SH SEE ALSO
+.BR dwebp (1),
+.BR gif2webp (1).
+.br
+Please refer to http://developers.google.com/speed/webp/ for additional
+information.
--- a/man/dwebp.1
+++ b/man/dwebp.1
@ -1,5 +1,5 @@
 .\"                                      Hey, EMACS: -*- nroff -*-
-.TH DWEBP 1 "September 19, 2011"
+.TH DWEBP 1 "February 01, 2013"
 .SH NAME
 dwebp \- decompress a WebP file to an image file
 .SH SYNOPSIS
@ -11,7 +11,7 @@ This manual page documents the
 .B dwebp
 command.
 .PP
-\fBdwebp\fP decompresses WebP files into PNG, PPM or PGM images.
+\fBdwebp\fP decompresses WebP files into PNG, PAM, PPM or PGM images.
 .SH OPTIONS
 The basic options are:
 .TP
@ -21,16 +21,25 @@ Print usage summary.
 .B \-version
 Print the version number (as major.minor.revision) and exit.
 .TP
-.B \-o string
+.BI \-o " string
 Specify the name of the output file (as PNG format by default).
 .TP
+.B \-pam
+Change the output format to PAM (retains alpha).
+.TP
 .B \-ppm
-Change the output format to PPM.
+Change the output format to PPM (discards alpha).
 .TP
 .B \-pgm
-Change the output format to PGM. The output consist of luma/chroma
+Change the output format to PGM. The output consists of luma/chroma
 samples instead of RGB, using the ICM4 layout. This option is mainly
-for verification and debugging purpose.
+for verification and debugging purposes.
+.TP
+.B \-yuv
+Change the output format to raw YUV. The output consists of
+luma/chroma-U/chroma-V samples instead of RGB, saved sequentially as
+individual planes. This option is mainly for verification and debugging
+purposes.
 .TP
 .B \-nofancy
 Don't use the fancy upscaler for YUV420. This may lead to jaggy
@ -44,7 +53,7 @@ but will make the decoding faster.
 .B \-mt
 Use multi-threading for decoding, if possible.
 .TP
-.B \-crop x_position y_position width height
+.BI \-crop " x_position y_position width height
 Crop the decoded picture to a rectangle with top-left corner at coordinates
 (\fBx_position\fP, \fBy_position\fP) and size \fBwidth\fP x \fBheight\fP.
 This cropping area must be fully contained within the source rectangle.
@ -52,11 +61,11 @@ The top-left corner will be snapped to even coordinates if needed.
 This option is meant to reduce the memory needed for cropping large images.
 Note: the cropping is applied \fIbefore\fP any scaling.
 .TP
-.B \-scale width height
-Rescale the decoded picture to dimension \fBwidth\fP x \fBheight\fP. This option is
-mostly intended to reducing the memory needed to decode large images,
-when only a small version is needed (thumbnail, preview, etc.).
-Note: scaling is applied \fIafter\fP cropping.
+.BI \-scale " width height
+Rescale the decoded picture to dimension \fBwidth\fP x \fBheight\fP. This
+option is mostly intended to reducing the memory needed to decode large images,
+when only a small version is needed (thumbnail, preview, etc.).  Note: scaling
+is applied \fIafter\fP cropping.
 .TP
 .B \-v
 Print extra information (decoding time in particular).
@ -64,21 +73,38 @@ Print extra information (decoding time in particular).
 .B \-noasm
 Disable all assembly optimizations.

-.SH Examples:
-dwebp picture.webp -o output.png
+.SH BUGS
+Please report all bugs to our issue tracker:
+http://code.google.com/p/webp/issues
 .br
-dwebp picture.webp -ppm -o output.ppm
+Patches welcome! See this page to get started:
+http://www.webmproject.org/code/contribute/submitting-patches/

-.SH
-.SH SEE ALSO
-.BR cwebp (1).
+.SH EXAMPLES
+dwebp picture.webp \-o output.png
 .br
-Please refer to http://code.google.com/speed/webp/ for additional
-information.
-.SH AUTHOR
+dwebp picture.webp \-ppm \-o output.ppm
+
+.SH AUTHORS
 \fBdwebp\fP was written by the WebP team.
 .br
 The latest source tree is available at http://www.webmproject.org/code
 .PP
 This manual page was written by Pascal Massimino <pascal.massimino@gmail.com>,
 for the Debian project (and may be used by others).
+
+.SH SEE ALSO
+.BR cwebp (1),
+.BR webpmux (1),
+.BR gif2webp (1).
+.br
+Please refer to http://developers.google.com/speed/webp/ for additional
+information.
+.SS Output file format details
+PAM: http://netpbm.sourceforge.net/doc/pam.html
+.br
+PGM: http://netpbm.sourceforge.net/doc/pgm.html
+.br
+PPM: http://netpbm.sourceforge.net/doc/ppm.html
+.br
+PNG: http://www.libpng.org/pub/png/png-sitemap.html#info
--- a/man/gif2webp.1
+++ b/man/gif2webp.1
@ -0,0 +1,96 @@
+.\"                                      Hey, EMACS: -*- nroff -*-
+.TH GIF2WEBP 1 "February 01, 2013"
+.SH NAME
+gif2webp \- Convert a GIF image to WebP
+.SH SYNOPSIS
+.B gif2webp
+.RI [ options ] " input_file.gif \-o output_file.webp
+.br
+.SH DESCRIPTION
+This manual page documents the
+.B gif2webp
+command.
+.PP
+\fBgif2webp\fP converts a GIF image to a WebP image.
+.SH OPTIONS
+The basic options are:
+.TP
+.BI \-o " string
+Specify the name of the output WebP file. If omitted, \fBgif2webp\fP will
+perform conversion but only report statistics.
+.TP
+.B \-h, \-help
+Usage information.
+.TP
+.B \-version
+Print the version number (as major.minor.revision) and exit.
+.TP
+.B \-lossy
+Encode the image using lossy compression.
+.TP
+.BI \-q " float
+Specify the compression factor for RGB channels between 0 and 100. The default
+is 75.
+.br
+In case of lossless compression (default), a small factor enables faster
+compression speed, but produces a larger file. Maximum compression is achieved
+by using a value of 100.
+.br
+In case of lossy compression (specified by the \-lossy option), a small factor
+produces a smaller file with lower quality. Best quality is achieved by using a
+value of 100.
+.TP
+.BI \-m " int
+Specify the compression method to use. This parameter controls the
+trade off between encoding speed and the compressed file size and quality.
+Possible values range from 0 to 6. Default value is 4.
+When higher values are used, the encoder will spend more time inspecting
+additional encoding possibilities and decide on the quality gain.
+Lower value can result is faster processing time at the expense of
+larger file size and lower compression quality.
+.TP
+.BI \-f " int
+For lossy encoding only (specified by the \-lossy option). Specify the strength
+of the deblocking filter, between 0 (no filtering) and 100 (maximum filtering).
+A value of 0 will turn off any filtering. Higher value will increase the
+strength of the filtering process applied after decoding the picture. The higher
+the value the smoother the picture will appear. Typical values are usually in
+the range of 20 to 50.
+.TP
+.B \-v
+Print extra information.
+.TP
+.B \-quiet
+Do not print anything.
+
+.SH BUGS
+Please report all bugs to our issue tracker:
+http://code.google.com/p/webp/issues
+.br
+Patches welcome! See this page to get started:
+http://www.webmproject.org/code/contribute/submitting-patches/
+
+.SH EXAMPLES
+gif2webp picture.gif \-o picture.webp
+.br
+gif2webp \-q 70 picture.gif \-o picture.webp
+.br
+gif2webp \-lossy \-m 3 picture.gif \-o picture_lossy.webp
+.br
+gif2webp \-lossy \-f 50 picture.gif \-o picture.webp
+
+.SH AUTHORS
+\fBgif2webp\fP was written by the WebP team.
+.br
+The latest source tree is available at http://www.webmproject.org/code
+.PP
+This manual page was written by Urvang Joshi <urvang@google.com>, for the
+Debian project (and may be used by others).
+
+.SH SEE ALSO
+.BR dwebp (1),
+.BR cwebp (1),
+.BR webpmux (1).
+.br
+Please refer to http://developers.google.com/speed/webp/ for additional
+information.
--- a/man/webpmux.1
+++ b/man/webpmux.1
@ -0,0 +1,174 @@
+.\"                                      Hey, EMACS: -*- nroff -*-
+.TH WEBPMUX 1 "March 16, 2013"
+.SH NAME
+webpmux \- command line tool to create WebP Mux/container file.
+.SH SYNOPSIS
+.B webpmux \-get
+.I GET_OPTIONS
+.I INPUT
+.B \-o
+.I OUTPUT
+.br
+.B webpmux \-set
+.I SET_OPTIONS
+.I INPUT
+.B \-o
+.I OUTPUT
+.br
+.B webpmux \-strip
+.I STRIP_OPTIONS
+.I INPUT
+.B \-o
+.I OUTPUT
+.br
+.B webpmux \-frame
+.I FRAME_OPTIONS
+.B [ \-frame ... ] [ \-loop
+.I LOOP_COUNT
+.B ]
+.br
+.RS 8
+.B [ \-bgcolor
+.I BACKGROUND_COLOR
+.B ] \-o
+.I OUTPUT
+.RE
+.br
+.B webpmux \-info
+.I INPUT
+.br
+.B webpmux [\-h|\-help]
+.br
+.B webpmux \-version
+.SH DESCRIPTION
+This manual page documents the
+.B webpmux
+command.
+.PP
+\fBwebpmux\fP can be used to create a WebP container file
+and extract/strip relevant data from the container file.
+.SH OPTIONS
+.SS GET_OPTIONS (\-get):
+.TP
+.B icc
+Get ICC profile.
+.TP
+.B exif
+Get EXIF metadata.
+.TP
+.B xmp
+Get XMP metadata.
+.TP
+.BI frame " n
+Get nth frame.
+
+.SS SET_OPTIONS (\-set)
+.TP
+.BI icc " file.icc
+Set ICC profile.
+.P
+Where: 'file.icc' contains the ICC profile to be set.
+.TP
+.BI exif " file.exif
+Set EXIF metadata.
+.P
+Where: 'file.exif' contains the EXIF metadata to be set.
+.TP
+.BI xmp " file.xmp
+Set XMP metadata.
+.P
+Where: 'file.xmp' contains the XMP metadata to be set.
+
+.SS STRIP_OPTIONS (\-strip)
+.TP
+.B icc
+Strip ICC profile.
+.TP
+.B exif
+Strip EXIF metadata.
+.TP
+.B xmp
+Strip XMP metadata.
+
+.SS FRAME_OPTIONS (\-frame)
+.TP
+.I file_i +di[+xi+yi[+mi]]
+Where: 'file_i' is the i'th frame (WebP format), 'xi','yi' specify the image
+offset for this frame, 'di' is the pause duration before next frame and 'mi' is
+the dispose method for this frame (0 for NONE or 1 for BACKGROUND).
+'mi' can be omitted and will default to 0 (NONE).
+Additionally, if 'mi' is ommitted then'xi' and 'yi' can be omitted and will
+default to +0+0.
+.TP
+.BI \-loop " n
+Loop the frames n number of times. 0 indicates the frames should loop forever.
+Valid range is 0 to 65535 [Default: 0 (infinite)].
+.TP
+.BI \-bgcolor " A,R,G,B
+Background color of the canvas.
+.br
+where: 'A', 'R', 'G' and 'B' are integers in the range 0 to 255 specifying the
+Alpha, Red, Green and Blue component values respectively
+[Default: 255,255,255,255].
+
+.SS INPUT
+.TP
+Input file in WebP format.
+
+.SS OUTPUT (\-o)
+.TP
+Output file in WebP format.
+
+.SS Note:
+.TP
+The nature of EXIF, XMP and ICC data is not checked and is assumed to be valid.
+
+.SH BUGS
+Please report all bugs to our issue tracker:
+http://code.google.com/p/webp/issues
+.br
+Patches welcome! See this page to get started:
+http://www.webmproject.org/code/contribute/submitting-patches/
+
+.SH EXAMPLES
+webpmux \-set icc image_profile.icc in.webp \-o icc_container.webp
+.br
+webpmux \-get icc icc_container.webp \-o image_profile.icc
+.br
+webpmux \-strip icc icc_container.webp \-o without_icc.webp
+.br
+webpmux \-set xmp image_metadata.xmp in.webp \-o xmp_container.webp
+.br
+webpmux \-get xmp xmp_container.webp \-o image_metadata.xmp
+.br
+webpmux \-strip xmp xmp_container.webp \-o without_xmp.webp
+.br
+webpmux \-set exif image_metadata.exif in.webp \-o exif_container.webp
+.br
+webpmux \-get exif exif_container.webp \-o image_metadata.exif
+.br
+webpmux \-strip exif exif_container.webp \-o without_exif.webp
+.br
+webpmux \-frame anim_1.webp +100 \-frame anim_2.webp +100+50+50 \-loop 10
+.br
+.RS 8
+\-bgcolor 255,255,255,255 \-o anim_container.webp
+.RE
+.br
+webpmux \-get frame 2 anim_container.webp \-o frame_2.webp
+
+.SH AUTHORS
+\fBwebpmux\fP is written by the WebP team.
+.br
+The latest source tree is available at http://www.webmproject.org/code
+.PP
+This manual page was written by Vikas Arora <vikaas.arora@gmail.com>,
+for the Debian project (and may be used by others).
+
+.SH SEE ALSO
+.BR dwebp (1),
+.BR cwebp (1),
+.BR gif2webp (1).
+.br
+Please refer to http://developers.google.com/speed/webp/ for additional
+information.
--- a/src/Makefile.am
+++ b/src/Makefile.am
@ -1,18 +1,55 @@
-SUBDIRS = dec enc dsp utils
+# The mux and demux libraries depend on libwebp, thus the '.' to force the
+# build order so it's available to them.
+SUBDIRS = dec enc dsp utils .
+if WANT_MUX
+  SUBDIRS += mux
+endif
+if WANT_DEMUX
+  SUBDIRS += demux
+endif

 AM_CPPFLAGS = -I$(top_srcdir)/src
 lib_LTLIBRARIES = libwebp.la

-libwebp_la_SOURCES =
-libwebp_la_LIBADD = dec/libwebpdecode.la \
-                    enc/libwebpencode.la \
-                    utils/libwebputils.la \
-                    dsp/libwebpdsp.la
-libwebp_la_LDFLAGS = -version-info 2:0:0
-libwebpinclude_HEADERS = webp/types.h webp/decode.h webp/decode_vp8.h \
-                         webp/encode.h
-libwebpincludedir = $(includedir)/webp
+if BUILD_LIBWEBPDECODER
+  lib_LTLIBRARIES += libwebpdecoder.la
+endif

+common_HEADERS =
+common_HEADERS += webp/decode.h
+common_HEADERS += webp/types.h
+commondir = $(includedir)/webp
+
+libwebp_la_SOURCES =
+libwebpinclude_HEADERS =
+libwebpinclude_HEADERS += webp/encode.h
+noinst_HEADERS =
+noinst_HEADERS += webp/format_constants.h
+
+libwebp_la_LIBADD =
+libwebp_la_LIBADD += dec/libwebpdecode.la
+libwebp_la_LIBADD += dsp/libwebpdsp.la
+libwebp_la_LIBADD += enc/libwebpencode.la
+libwebp_la_LIBADD += utils/libwebputils.la
+
+# Use '-no-undefined' to declare that libwebp does not depend on any libraries
+# other than the ones listed on the command line, i.e., after linking, it will
+# not have unresolved symbols. Some platforms (Windows among them) require all
+# symbols in shared libraries to be resolved at library creation.
+libwebp_la_LDFLAGS = -no-undefined -version-info 4:3:0
+libwebpincludedir = $(includedir)/webp
 pkgconfig_DATA = libwebp.pc

+if BUILD_LIBWEBPDECODER
+  libwebpdecoder_la_SOURCES =
+
+  libwebpdecoder_la_LIBADD =
+  libwebpdecoder_la_LIBADD += dec/libwebpdecode.la
+  libwebpdecoder_la_LIBADD += dsp/libwebpdspdecode.la
+  libwebpdecoder_la_LIBADD += utils/libwebputilsdecode.la
+
+  libwebpdecoder_la_LDFLAGS = -no-undefined -version-info 0:1:0
+  pkgconfig_DATA += libwebpdecoder.pc
+endif
+
 ${pkgconfig_DATA}: ${top_builddir}/config.status
--- a/src/dec/Makefile.am
+++ b/src/dec/Makefile.am
@ -1,13 +1,28 @@
 AM_CPPFLAGS = -I$(top_srcdir)/src
-
-libwebpdecode_la_SOURCES = vp8i.h webpi.h \
-                           frame.c quant.c tree.c vp8.c webp.c \
-                           idec.c alpha.c layer.c io.c buffer.c
-libwebpdecode_la_LDFLAGS = -version-info 2:0:0
-libwebpdecode_la_CPPFLAGS = $(USE_EXPERIMENTAL_CODE)
-libwebpdecodeinclude_HEADERS = ../webp/decode.h ../webp/decode_vp8.h ../webp/types.h
-libwebpdecodeincludedir = $(includedir)/webp
-
-noinst_HEADERS = vp8i.h webpi.h
-
 noinst_LTLIBRARIES = libwebpdecode.la
+
+libwebpdecode_la_SOURCES =
+libwebpdecode_la_SOURCES += alpha.c
+libwebpdecode_la_SOURCES += buffer.c
+libwebpdecode_la_SOURCES += decode_vp8.h
+libwebpdecode_la_SOURCES += frame.c
+libwebpdecode_la_SOURCES += idec.c
+libwebpdecode_la_SOURCES += io.c
+libwebpdecode_la_SOURCES += layer.c
+libwebpdecode_la_SOURCES += quant.c
+libwebpdecode_la_SOURCES += tree.c
+libwebpdecode_la_SOURCES += vp8.c
+libwebpdecode_la_SOURCES += vp8i.h
+libwebpdecode_la_SOURCES += vp8l.c
+libwebpdecode_la_SOURCES += vp8li.h
+libwebpdecode_la_SOURCES += webp.c
+libwebpdecode_la_SOURCES += webpi.h
+
+libwebpdecodeinclude_HEADERS =
+libwebpdecodeinclude_HEADERS += ../webp/decode.h
+libwebpdecodeinclude_HEADERS += ../webp/types.h
+noinst_HEADERS =
+noinst_HEADERS += ../webp/format_constants.h
+
+libwebpdecode_la_CPPFLAGS = $(USE_EXPERIMENTAL_CODE)
+libwebpdecodeincludedir = $(includedir)/webp
--- a/src/dec/alpha.c
+++ b/src/dec/alpha.c
@ -1,8 +1,10 @@
-// Copyright 2011 Google Inc.
+// Copyright 2011 Google Inc. All Rights Reserved.
 //
-// This code is licensed under the same terms as WebM:
-//  Software License Agreement:  http://www.webmproject.org/license/software/
-//  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
 // -----------------------------------------------------------------------------
 //
 // Alpha-plane decompression.
@ -10,60 +12,104 @@
 // Author: Skal (pascal.massimino@gmail.com)

 #include <stdlib.h>
-#include "vp8i.h"
-
-#ifdef WEBP_EXPERIMENTAL_FEATURES
-
-#include "zlib.h"
+#include "./vp8i.h"
+#include "./vp8li.h"
+#include "../utils/filters.h"
+#include "../utils/quant_levels_dec.h"
+#include "../webp/format_constants.h"

 #if defined(__cplusplus) || defined(c_plusplus)
 extern "C" {
 #endif

+//------------------------------------------------------------------------------
+// Decodes the compressed data 'data' of size 'data_size' into the 'output'.
+// The 'output' buffer should be pre-allocated and must be of the same
+// dimension 'height'x'width', as that of the image.
+//
+// Returns 1 on successfully decoding the compressed alpha and
+//         0 if either:
+//           error in bit-stream header (invalid compression mode or filter), or
+//           error returned by appropriate compression method.
+
+static int DecodeAlpha(const uint8_t* data, size_t data_size,
+                       int width, int height, uint8_t* output) {
+  WEBP_FILTER_TYPE filter;
+  int pre_processing;
+  int rsrv;
+  int ok = 0;
+  int method;
+  const uint8_t* const alpha_data = data + ALPHA_HEADER_LEN;
+  const size_t alpha_data_size = data_size - ALPHA_HEADER_LEN;
+
+  assert(width > 0 && height > 0);
+  assert(data != NULL && output != NULL);
+
+  if (data_size <= ALPHA_HEADER_LEN) {
+    return 0;
+  }
+
+  method = (data[0] >> 0) & 0x03;
+  filter = (data[0] >> 2) & 0x03;
+  pre_processing = (data[0] >> 4) & 0x03;
+  rsrv = (data[0] >> 6) & 0x03;
+  if (method < ALPHA_NO_COMPRESSION ||
+      method > ALPHA_LOSSLESS_COMPRESSION ||
+      filter >= WEBP_FILTER_LAST ||
+      pre_processing > ALPHA_PREPROCESSED_LEVELS ||
+      rsrv != 0) {
+    return 0;
+  }
+
+  if (method == ALPHA_NO_COMPRESSION) {
+    const size_t alpha_decoded_size = height * width;
+    ok = (alpha_data_size >= alpha_decoded_size);
+    if (ok) memcpy(output, alpha_data, alpha_decoded_size);
+  } else {
+    ok = VP8LDecodeAlphaImageStream(width, height, alpha_data, alpha_data_size,
+                                    output);
+  }
+
+  if (ok) {
+    WebPUnfilterFunc unfilter_func = WebPUnfilters[filter];
+    if (unfilter_func != NULL) {
+      // TODO(vikas): Implement on-the-fly decoding & filter mechanism to decode
+      // and apply filter per image-row.
+      unfilter_func(width, height, width, output);
+    }
+    if (pre_processing == ALPHA_PREPROCESSED_LEVELS) {
+      ok = DequantizeLevels(output, width, height);
+    }
+  }
+
+  return ok;
+}
+
 //------------------------------------------------------------------------------

 const uint8_t* VP8DecompressAlphaRows(VP8Decoder* const dec,
                                      int row, int num_rows) {
-  uint8_t* output = dec->alpha_plane_;
-  const int stride = dec->pic_hdr_.width_;
-  if (row < 0 || row + num_rows > dec->pic_hdr_.height_) {
-    return NULL;    // sanity check
+  const int width = dec->pic_hdr_.width_;
+  const int height = dec->pic_hdr_.height_;
+
+  if (row < 0 || num_rows < 0 || row + num_rows > height) {
+    return NULL;    // sanity check.
  }
+
  if (row == 0) {
-    // TODO(skal): for now, we just decompress everything during the first call.
-    // Later, we'll decode progressively, but we need to store the
-    // z_stream state.
-    const uint8_t* data = dec->alpha_data_;
-    size_t data_size = dec->alpha_data_size_;
-    const size_t output_size = stride * dec->pic_hdr_.height_;
-    int ret = Z_OK;
-    z_stream strm;
-
-    memset(&strm, 0, sizeof(strm));
-    if (inflateInit(&strm) != Z_OK) {
-      return 0;
-    }
-    strm.avail_in = data_size;
-    strm.next_in = (unsigned char*)data;
-    do {
-      strm.avail_out = output_size;
-      strm.next_out = output;
-      ret = inflate(&strm, Z_NO_FLUSH);
-      if (ret == Z_NEED_DICT || ret == Z_DATA_ERROR || ret == Z_MEM_ERROR) {
-        break;
-      }
-    } while (strm.avail_out == 0);
-
-    inflateEnd(&strm);
-    if (ret != Z_STREAM_END) {
-      return NULL;    // error
+    // Decode everything during the first call.
+    assert(!dec->is_alpha_decoded_);
+    if (!DecodeAlpha(dec->alpha_data_, (size_t)dec->alpha_data_size_,
+                     width, height, dec->alpha_plane_)) {
+      return NULL;  // Error.
    }
+    dec->is_alpha_decoded_ = 1;
  }
-  return output + row * stride;
+
+  // Return a pointer to the current decoded row.
+  return dec->alpha_plane_ + row * width;
 }

 #if defined(__cplusplus) || defined(c_plusplus)
 }    // extern "C"
 #endif
-
-#endif    // WEBP_EXPERIMENTAL_FEATURES
--- a/src/dec/buffer.c
+++ b/src/dec/buffer.c
@ -1,8 +1,10 @@
-// Copyright 2011 Google Inc.
+// Copyright 2011 Google Inc. All Rights Reserved.
 //
-// This code is licensed under the same terms as WebM:
-//  Software License Agreement:  http://www.webmproject.org/license/software/
-//  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
 // -----------------------------------------------------------------------------
 //
 // Everything about WebPDecBuffer
@ -10,8 +12,10 @@
 // Author: Skal (pascal.massimino@gmail.com)

 #include <stdlib.h>
-#include "vp8i.h"
-#include "webpi.h"
+
+#include "./vp8i.h"
+#include "./webpi.h"
+#include "../utils/utils.h"

 #if defined(__cplusplus) || defined(c_plusplus)
 extern "C" {
@ -21,33 +25,51 @@ extern "C" {
 // WebPDecBuffer

 // Number of bytes per pixel for the different color-spaces.
-static const int kModeBpp[MODE_LAST] = { 3, 4, 3, 4, 4, 2, 2, 1, 1 };
+static const int kModeBpp[MODE_LAST] = {
+  3, 4, 3, 4, 4, 2, 2,
+  4, 4, 4, 2,    // pre-multiplied modes
+  1, 1 };
+
+// Check that webp_csp_mode is within the bounds of WEBP_CSP_MODE.
+// Convert to an integer to handle both the unsigned/signed enum cases
+// without the need for casting to remove type limit warnings.
+static int IsValidColorspace(int webp_csp_mode) {
+  return (webp_csp_mode >= MODE_RGB && webp_csp_mode < MODE_LAST);
+}

 static VP8StatusCode CheckDecBuffer(const WebPDecBuffer* const buffer) {
  int ok = 1;
-  WEBP_CSP_MODE mode = buffer->colorspace;
+  const WEBP_CSP_MODE mode = buffer->colorspace;
  const int width = buffer->width;
  const int height = buffer->height;
-  if (mode >= MODE_YUV) {   // YUV checks
+  if (!IsValidColorspace(mode)) {
+    ok = 0;
+  } else if (!WebPIsRGBMode(mode)) {   // YUV checks
    const WebPYUVABuffer* const buf = &buffer->u.YUVA;
-    const int size = buf->y_stride * height;
-    const int u_size = buf->u_stride * ((height + 1) / 2);
-    const int v_size = buf->v_stride * ((height + 1) / 2);
-    const int a_size = buf->a_stride * height;
-    ok &= (size <= buf->y_size);
+    const uint64_t y_size = (uint64_t)buf->y_stride * height;
+    const uint64_t u_size = (uint64_t)buf->u_stride * ((height + 1) / 2);
+    const uint64_t v_size = (uint64_t)buf->v_stride * ((height + 1) / 2);
+    const uint64_t a_size = (uint64_t)buf->a_stride * height;
+    ok &= (y_size <= buf->y_size);
    ok &= (u_size <= buf->u_size);
    ok &= (v_size <= buf->v_size);
-    ok &= (a_size <= buf->a_size);
    ok &= (buf->y_stride >= width);
    ok &= (buf->u_stride >= (width + 1) / 2);
    ok &= (buf->v_stride >= (width + 1) / 2);
-    if (buf->a) {
+    ok &= (buf->y != NULL);
+    ok &= (buf->u != NULL);
+    ok &= (buf->v != NULL);
+    if (mode == MODE_YUVA) {
      ok &= (buf->a_stride >= width);
+      ok &= (a_size <= buf->a_size);
+      ok &= (buf->a != NULL);
    }
  } else {    // RGB checks
    const WebPRGBABuffer* const buf = &buffer->u.RGBA;
-    ok &= (buf->stride * height <= buf->size);
+    const uint64_t size = (uint64_t)buf->stride * height;
+    ok &= (size <= buf->size);
    ok &= (buf->stride >= width * kModeBpp[mode]);
+    ok &= (buf->rgba != NULL);
  }
  return ok ? VP8_STATUS_OK : VP8_STATUS_INVALID_PARAM;
 }
@ -55,24 +77,22 @@ static VP8StatusCode CheckDecBuffer(const WebPDecBuffer* const buffer) {
 static VP8StatusCode AllocateBuffer(WebPDecBuffer* const buffer) {
  const int w = buffer->width;
  const int h = buffer->height;
+  const WEBP_CSP_MODE mode = buffer->colorspace;

-  if (w <= 0 || h <= 0) {
+  if (w <= 0 || h <= 0 || !IsValidColorspace(mode)) {
    return VP8_STATUS_INVALID_PARAM;
  }

  if (!buffer->is_external_memory && buffer->private_memory == NULL) {
    uint8_t* output;
-    WEBP_CSP_MODE mode = buffer->colorspace;
-    int stride;
    int uv_stride = 0, a_stride = 0;
-    int uv_size = 0;
-    uint64_t size, a_size = 0, total_size;
+    uint64_t uv_size = 0, a_size = 0, total_size;
    // We need memory and it hasn't been allocated yet.
    // => initialize output buffer, now that dimensions are known.
-    stride = w * kModeBpp[mode];
-    size = (uint64_t)stride * h;
+    const int stride = w * kModeBpp[mode];
+    const uint64_t size = (uint64_t)stride * h;

-    if (mode >= MODE_YUV) {
+    if (!WebPIsRGBMode(mode)) {
      uv_stride = (w + 1) / 2;
      uv_size = (uint64_t)uv_stride * ((h + 1) / 2);
      if (mode == MODE_YUVA) {
@ -83,36 +103,33 @@ static VP8StatusCode AllocateBuffer(WebPDecBuffer* const buffer) {
    total_size = size + 2 * uv_size + a_size;

    // Security/sanity checks
-    if (((size_t)total_size != total_size) || (total_size >= (1ULL << 40))) {
-      return VP8_STATUS_INVALID_PARAM;
-    }
-
-    buffer->private_memory = output = (uint8_t*)malloc((size_t)total_size);
+    output = (uint8_t*)WebPSafeMalloc(total_size, sizeof(*output));
    if (output == NULL) {
      return VP8_STATUS_OUT_OF_MEMORY;
    }
+    buffer->private_memory = output;

-    if (mode >= MODE_YUV) {   // YUVA initialization
+    if (!WebPIsRGBMode(mode)) {   // YUVA initialization
      WebPYUVABuffer* const buf = &buffer->u.YUVA;
      buf->y = output;
      buf->y_stride = stride;
-      buf->y_size = size;
+      buf->y_size = (size_t)size;
      buf->u = output + size;
      buf->u_stride = uv_stride;
-      buf->u_size = uv_size;
+      buf->u_size = (size_t)uv_size;
      buf->v = output + size + uv_size;
      buf->v_stride = uv_stride;
-      buf->v_size = uv_size;
+      buf->v_size = (size_t)uv_size;
      if (mode == MODE_YUVA) {
        buf->a = output + size + 2 * uv_size;
      }
-      buf->a_size = a_size;
+      buf->a_size = (size_t)a_size;
      buf->a_stride = a_stride;
    } else {  // RGBA initialization
      WebPRGBABuffer* const buf = &buffer->u.RGBA;
      buf->rgba = output;
      buf->stride = stride;
-      buf->size = size;
+      buf->size = (size_t)size;
    }
  }
  return CheckDecBuffer(buffer);
@ -140,7 +157,7 @@ VP8StatusCode WebPAllocateDecBuffer(int w, int h,
      if (options->scaled_width <= 0 || options->scaled_height <= 0) {
        return VP8_STATUS_INVALID_PARAM;
      }
-      w  = options->scaled_width;
+      w = options->scaled_width;
      h = options->scaled_height;
    }
  }
@ -154,15 +171,17 @@ VP8StatusCode WebPAllocateDecBuffer(int w, int h,
 //------------------------------------------------------------------------------
 // constructors / destructors

-int WebPInitDecBufferInternal(WebPDecBuffer* const buffer, int version) {
-  if (version != WEBP_DECODER_ABI_VERSION) return 0;  // version mismatch
-  if (!buffer) return 0;
+int WebPInitDecBufferInternal(WebPDecBuffer* buffer, int version) {
+  if (WEBP_ABI_IS_INCOMPATIBLE(version, WEBP_DECODER_ABI_VERSION)) {
+    return 0;  // version mismatch
+  }
+  if (buffer == NULL) return 0;
  memset(buffer, 0, sizeof(*buffer));
  return 1;
 }

-void WebPFreeDecBuffer(WebPDecBuffer* const buffer) {
-  if (buffer) {
+void WebPFreeDecBuffer(WebPDecBuffer* buffer) {
+  if (buffer != NULL) {
    if (!buffer->is_external_memory)
      free(buffer->private_memory);
    buffer->private_memory = NULL;
@ -171,9 +190,9 @@ void WebPFreeDecBuffer(WebPDecBuffer* const buffer) {

 void WebPCopyDecBuffer(const WebPDecBuffer* const src,
                       WebPDecBuffer* const dst) {
-  if (src && dst) {
+  if (src != NULL && dst != NULL) {
    *dst = *src;
-    if (src->private_memory) {
+    if (src->private_memory != NULL) {
      dst->is_external_memory = 1;   // dst buffer doesn't own the memory.
      dst->private_memory = NULL;
    }
@ -182,9 +201,9 @@ void WebPCopyDecBuffer(const WebPDecBuffer* const src,

 // Copy and transfer ownership from src to dst (beware of parameter order!)
 void WebPGrabDecBuffer(WebPDecBuffer* const src, WebPDecBuffer* const dst) {
-  if (src && dst) {
+  if (src != NULL && dst != NULL) {
    *dst = *src;
-    if (src->private_memory) {
+    if (src->private_memory != NULL) {
      src->is_external_memory = 1;   // src relinquishes ownership
      src->private_memory = NULL;
    }
--- a/src/webp/decode_vp8.h
+++ b/src/webp/decode_vp8.h
@ -1,8 +1,10 @@
-// Copyright 2010 Google Inc.
+// Copyright 2010 Google Inc. All Rights Reserved.
 //
-// This code is licensed under the same terms as WebM:
-//  Software License Agreement:  http://www.webmproject.org/license/software/
-//  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
 // -----------------------------------------------------------------------------
 //
 //  Low-level API for VP8 decoder
@ -12,7 +14,7 @@
 #ifndef WEBP_WEBP_DECODE_VP8_H_
 #define WEBP_WEBP_DECODE_VP8_H_

-#include "./decode.h"
+#include "../webp/decode.h"

 #if defined(__cplusplus) || defined(c_plusplus)
 extern "C" {
@ -82,7 +84,7 @@ struct VP8Io {
  int fancy_upsampling;

  // Input buffer.
-  uint32_t data_size;
+  size_t data_size;
  const uint8_t* data;

  // If true, in-loop filtering will not be performed even if present in the
@ -99,56 +101,81 @@ struct VP8Io {
  int use_scaling;
  int scaled_width, scaled_height;

-  // pointer to the alpha data (if present) corresponding to the rows
+  // If non NULL, pointer to the alpha data (if present) corresponding to the
+  // start of the current row (That is: it is pre-offset by mb_y and takes
+  // cropping into account).
  const uint8_t* a;
 };

 // Internal, version-checked, entry point
-WEBP_EXTERN(int) VP8InitIoInternal(VP8Io* const, int);
+int VP8InitIoInternal(VP8Io* const, int);

 // Set the custom IO function pointers and user-data. The setter for IO hooks
 // should be called before initiating incremental decoding. Returns true if
 // WebPIDecoder object is successfully modified, false otherwise.
-WEBP_EXTERN(int) WebPISetIOHooks(WebPIDecoder* const idec,
-                                 VP8IoPutHook put,
-                                 VP8IoSetupHook setup,
-                                 VP8IoTeardownHook teardown,
-                                 void* user_data);
+int WebPISetIOHooks(WebPIDecoder* const idec,
+                    VP8IoPutHook put,
+                    VP8IoSetupHook setup,
+                    VP8IoTeardownHook teardown,
+                    void* user_data);

 // Main decoding object. This is an opaque structure.
 typedef struct VP8Decoder VP8Decoder;

 // Create a new decoder object.
-WEBP_EXTERN(VP8Decoder*) VP8New(void);
+VP8Decoder* VP8New(void);

 // Must be called to make sure 'io' is initialized properly.
 // Returns false in case of version mismatch. Upon such failure, no other
 // decoding function should be called (VP8Decode, VP8GetHeaders, ...)
-static inline int VP8InitIo(VP8Io* const io) {
+static WEBP_INLINE int VP8InitIo(VP8Io* const io) {
  return VP8InitIoInternal(io, WEBP_DECODER_ABI_VERSION);
 }

 // Start decoding a new picture. Returns true if ok.
-WEBP_EXTERN(int) VP8GetHeaders(VP8Decoder* const dec, VP8Io* const io);
+int VP8GetHeaders(VP8Decoder* const dec, VP8Io* const io);

 // Decode a picture. Will call VP8GetHeaders() if it wasn't done already.
 // Returns false in case of error.
-WEBP_EXTERN(int) VP8Decode(VP8Decoder* const dec, VP8Io* const io);
+int VP8Decode(VP8Decoder* const dec, VP8Io* const io);

 // Return current status of the decoder:
-WEBP_EXTERN(VP8StatusCode) VP8Status(VP8Decoder* const dec);
+VP8StatusCode VP8Status(VP8Decoder* const dec);

 // return readable string corresponding to the last status.
-WEBP_EXTERN(const char*) VP8StatusMessage(VP8Decoder* const dec);
+const char* VP8StatusMessage(VP8Decoder* const dec);

 // Resets the decoder in its initial state, reclaiming memory.
 // Not a mandatory call between calls to VP8Decode().
-WEBP_EXTERN(void) VP8Clear(VP8Decoder* const dec);
+void VP8Clear(VP8Decoder* const dec);

 // Destroy the decoder object.
-WEBP_EXTERN(void) VP8Delete(VP8Decoder* const dec);
+void VP8Delete(VP8Decoder* const dec);

 //------------------------------------------------------------------------------
+// Miscellaneous VP8/VP8L bitstream probing functions.
+
+// Returns true if the next 3 bytes in data contain the VP8 signature.
+WEBP_EXTERN(int) VP8CheckSignature(const uint8_t* const data, size_t data_size);
+
+// Validates the VP8 data-header and retrieves basic header information viz
+// width and height. Returns 0 in case of formatting error. *width/*height
+// can be passed NULL.
+WEBP_EXTERN(int) VP8GetInfo(
+    const uint8_t* data,
+    size_t data_size,    // data available so far
+    size_t chunk_size,   // total data size expected in the chunk
+    int* const width, int* const height);
+
+// Returns true if the next byte(s) in data is a VP8L signature.
+WEBP_EXTERN(int) VP8LCheckSignature(const uint8_t* const data, size_t size);
+
+// Validates the VP8L data-header and retrieves basic header information viz
+// width, height and alpha. Returns 0 in case of formatting error.
+// width/height/has_alpha can be passed NULL.
+WEBP_EXTERN(int) VP8LGetInfo(
+    const uint8_t* data, size_t data_size,  // data available so far
+    int* const width, int* const height, int* const has_alpha);

 #if defined(__cplusplus) || defined(c_plusplus)
 }    // extern "C"
--- a/src/dec/frame.c
+++ b/src/dec/frame.c
@ -1,8 +1,10 @@
-// Copyright 2010 Google Inc.
+// Copyright 2010 Google Inc. All Rights Reserved.
 //
-// This code is licensed under the same terms as WebM:
-//  Software License Agreement:  http://www.webmproject.org/license/software/
-//  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
 // -----------------------------------------------------------------------------
 //
 // Frame-reconstruction function. Memory allocation.
@ -11,6 +13,7 @@

 #include <stdlib.h>
 #include "./vp8i.h"
+#include "../utils/utils.h"

 #if defined(__cplusplus) || defined(c_plusplus)
 extern "C" {
@ -19,54 +22,7 @@ extern "C" {
 #define ALIGN_MASK (32 - 1)

 //------------------------------------------------------------------------------
-// For multi-threaded decoding we need to use 3 rows of 16 pixels as delay line.
-//
-// Reason is: the deblocking filter cannot deblock the bottom horizontal edges
-// immediately, and needs to wait for first few rows of the next macroblock to
-// be decoded. Hence, deblocking is lagging behind by 4 or 8 pixels (depending
-// on strength).
-// With two threads, the vertical positions of the rows being decoded are:
-// Decode:  [ 0..15][16..31][32..47][48..63][64..79][...
-// Deblock:         [ 0..11][12..27][28..43][44..59][...
-// If we use two threads and two caches of 16 pixels, the sequence would be:
-// Decode:  [ 0..15][16..31][ 0..15!!][16..31][ 0..15][...
-// Deblock:         [ 0..11][12..27!!][-4..11][12..27][...
-// The problem occurs during row [12..15!!] that both the decoding and
-// deblocking threads are writing simultaneously.
-// With 3 cache lines, one get a safe write pattern:
-// Decode:  [ 0..15][16..31][32..47][ 0..15][16..31][32..47][0..
-// Deblock:         [ 0..11][12..27][28..43][-4..11][12..27][28...
-// Note that multi-threaded output _without_ deblocking can make use of two
-// cache lines of 16 pixels only, since there's no lagging behind. The decoding
-// and output process have non-concurrent writing:
-// Decode:  [ 0..15][16..31][ 0..15][16..31][...
-// io->put:         [ 0..15][16..31][ 0..15][...
-
-#define MT_CACHE_LINES 3
-#define ST_CACHE_LINES 1   // 1 cache row only for single-threaded case
-
-// Initialize multi/single-thread worker
-static int InitThreadContext(VP8Decoder* const dec) {
-  dec->cache_id_ = 0;
-  if (dec->use_threads_) {
-    WebPWorker* const worker = &dec->worker_;
-    if (!WebPWorkerReset(worker)) {
-      return VP8SetError(dec, VP8_STATUS_OUT_OF_MEMORY,
-                         "thread initialization failed.");
-    }
-    worker->data1 = dec;
-    worker->data2 = (void*)&dec->thread_ctx_.io_;
-    worker->hook = (WebPWorkerHook)VP8FinishRow;
-    dec->num_caches_ =
-      (dec->filter_type_ > 0) ? MT_CACHE_LINES : MT_CACHE_LINES - 1;
-  } else {
-    dec->num_caches_ = ST_CACHE_LINES;
-  }
-  return 1;
-}
-
-//------------------------------------------------------------------------------
-// Memory setup
+// Filtering

 // kFilterExtraRows[] = How many extra lines are needed on the MB boundary
 // for caching, given a filtering level.
@ -75,125 +31,7 @@ static int InitThreadContext(VP8Decoder* const dec) {
 //                 U/V, so it's 8 samples total (because of the 2x upsampling).
 static const uint8_t kFilterExtraRows[3] = { 0, 2, 8 };

-static int AllocateMemory(VP8Decoder* const dec) {
-  const int num_caches = dec->num_caches_;
-  const int mb_w = dec->mb_w_;
-  const int intra_pred_mode_size = 4 * mb_w * sizeof(uint8_t);
-  const int top_size = (16 + 8 + 8) * mb_w;
-  const int mb_info_size = (mb_w + 1) * sizeof(VP8MB);
-  const int f_info_size =
-    (dec->filter_type_ > 0) ?
-        mb_w * (dec->use_threads_ ? 2 : 1) * sizeof(VP8FInfo)
-      : 0;
-  const int yuv_size = YUV_SIZE * sizeof(*dec->yuv_b_);
-  const int coeffs_size = 384 * sizeof(*dec->coeffs_);
-  const int cache_height = (16 * num_caches
-                         + kFilterExtraRows[dec->filter_type_]) * 3 / 2;
-  const int cache_size = top_size * cache_height;
-  const int alpha_size =
-    dec->alpha_data_ ? (dec->pic_hdr_.width_ * dec->pic_hdr_.height_) : 0;
-  const int needed = intra_pred_mode_size
-                   + top_size + mb_info_size + f_info_size
-                   + yuv_size + coeffs_size
-                   + cache_size + alpha_size + ALIGN_MASK;
-  uint8_t* mem;
-
-  if (needed > dec->mem_size_) {
-    free(dec->mem_);
-    dec->mem_size_ = 0;
-    dec->mem_ = (uint8_t*)malloc(needed);
-    if (dec->mem_ == NULL) {
-      return VP8SetError(dec, VP8_STATUS_OUT_OF_MEMORY,
-                         "no memory during frame initialization.");
-    }
-    dec->mem_size_ = needed;
-  }
-
-  mem = (uint8_t*)dec->mem_;
-  dec->intra_t_ = (uint8_t*)mem;
-  mem += intra_pred_mode_size;
-
-  dec->y_t_ = (uint8_t*)mem;
-  mem += 16 * mb_w;
-  dec->u_t_ = (uint8_t*)mem;
-  mem += 8 * mb_w;
-  dec->v_t_ = (uint8_t*)mem;
-  mem += 8 * mb_w;
-
-  dec->mb_info_ = ((VP8MB*)mem) + 1;
-  mem += mb_info_size;
-
-  dec->f_info_ = f_info_size ? (VP8FInfo*)mem : NULL;
-  mem += f_info_size;
-  dec->thread_ctx_.id_ = 0;
-  dec->thread_ctx_.f_info_ = dec->f_info_;
-  if (dec->use_threads_) {
-    // secondary cache line. The deblocking process need to make use of the
-    // filtering strength from previous macroblock row, while the new ones
-    // are being decoded in parallel. We'll just swap the pointers.
-    dec->thread_ctx_.f_info_ += mb_w;
-  }
-
-  mem = (uint8_t*)((uintptr_t)(mem + ALIGN_MASK) & ~ALIGN_MASK);
-  assert((yuv_size & ALIGN_MASK) == 0);
-  dec->yuv_b_ = (uint8_t*)mem;
-  mem += yuv_size;
-
-  dec->coeffs_ = (int16_t*)mem;
-  mem += coeffs_size;
-
-  dec->cache_y_stride_ = 16 * mb_w;
-  dec->cache_uv_stride_ = 8 * mb_w;
-  {
-    const int extra_rows = kFilterExtraRows[dec->filter_type_];
-    const int extra_y = extra_rows * dec->cache_y_stride_;
-    const int extra_uv = (extra_rows / 2) * dec->cache_uv_stride_;
-    dec->cache_y_ = ((uint8_t*)mem) + extra_y;
-    dec->cache_u_ = dec->cache_y_
-                  + 16 * num_caches * dec->cache_y_stride_ + extra_uv;
-    dec->cache_v_ = dec->cache_u_
-                  + 8 * num_caches * dec->cache_uv_stride_ + extra_uv;
-    dec->cache_id_ = 0;
-  }
-  mem += cache_size;
-
-  // alpha plane
-  dec->alpha_plane_ = alpha_size ? (uint8_t*)mem : NULL;
-  mem += alpha_size;
-
-  // note: left-info is initialized once for all.
-  memset(dec->mb_info_ - 1, 0, mb_info_size);
-
-  // initialize top
-  memset(dec->intra_t_, B_DC_PRED, intra_pred_mode_size);
-
-  return 1;
-}
-
-static void InitIo(VP8Decoder* const dec, VP8Io* io) {
-  // prepare 'io'
-  io->mb_y = 0;
-  io->y = dec->cache_y_;
-  io->u = dec->cache_u_;
-  io->v = dec->cache_v_;
-  io->y_stride = dec->cache_y_stride_;
-  io->uv_stride = dec->cache_uv_stride_;
-  io->fancy_upsampling = 0;    // default
-  io->a = NULL;
-}
-
-int VP8InitFrame(VP8Decoder* const dec, VP8Io* io) {
-  if (!InitThreadContext(dec)) return 0;  // call first. Sets dec->num_caches_.
-  if (!AllocateMemory(dec)) return 0;
-  InitIo(dec, io);
-  VP8DspInit();  // Init critical function pointers and look-up tables.
-  return 1;
-}
-
-//------------------------------------------------------------------------------
-// Filtering
-
-static inline int hev_thresh_from_level(int level, int keyframe) {
+static WEBP_INLINE int hev_thresh_from_level(int level, int keyframe) {
  if (keyframe) {
    return (level >= 40) ? 2 : (level >= 15) ? 1 : 0;
  } else {
@ -261,53 +99,50 @@ static void FilterRow(const VP8Decoder* const dec) {
 }

 //------------------------------------------------------------------------------
+// Precompute the filtering strength for each segment and each i4x4/i16x16 mode.

-void VP8StoreBlock(VP8Decoder* const dec) {
+static void PrecomputeFilterStrengths(VP8Decoder* const dec) {
  if (dec->filter_type_ > 0) {
-    VP8FInfo* const info = dec->f_info_ + dec->mb_x_;
-    const int skip = dec->mb_info_[dec->mb_x_].skip_;
-    int level = dec->filter_levels_[dec->segment_];
-    if (dec->filter_hdr_.use_lf_delta_) {
-      // TODO(skal): only CURRENT is handled for now.
-      level += dec->filter_hdr_.ref_lf_delta_[0];
-      if (dec->is_i4x4_) {
-        level += dec->filter_hdr_.mode_lf_delta_[0];
-      }
-    }
-    level = (level < 0) ? 0 : (level > 63) ? 63 : level;
-    info->f_level_ = level;
-
-    if (dec->filter_hdr_.sharpness_ > 0) {
-      if (dec->filter_hdr_.sharpness_ > 4) {
-        level >>= 2;
+    int s;
+    const VP8FilterHeader* const hdr = &dec->filter_hdr_;
+    for (s = 0; s < NUM_MB_SEGMENTS; ++s) {
+      int i4x4;
+      // First, compute the initial level
+      int base_level;
+      if (dec->segment_hdr_.use_segment_) {
+        base_level = dec->segment_hdr_.filter_strength_[s];
+        if (!dec->segment_hdr_.absolute_delta_) {
+          base_level += hdr->level_;
+        }
      } else {
-        level >>= 1;
+        base_level = hdr->level_;
      }
-      if (level > 9 - dec->filter_hdr_.sharpness_) {
-        level = 9 - dec->filter_hdr_.sharpness_;
-      }
-    }
+      for (i4x4 = 0; i4x4 <= 1; ++i4x4) {
+        VP8FInfo* const info = &dec->fstrengths_[s][i4x4];
+        int level = base_level;
+        if (hdr->use_lf_delta_) {
+          // TODO(skal): only CURRENT is handled for now.
+          level += hdr->ref_lf_delta_[0];
+          if (i4x4) {
+            level += hdr->mode_lf_delta_[0];
+          }
+        }
+        level = (level < 0) ? 0 : (level > 63) ? 63 : level;
+        info->f_level_ = level;

-    info->f_ilevel_ = (level < 1) ? 1 : level;
-    info->f_inner_ = (!skip || dec->is_i4x4_);
-  }
-  {
-    // Transfer samples to row cache
-    int y;
-    const int y_offset = dec->cache_id_ * 16 * dec->cache_y_stride_;
-    const int uv_offset = dec->cache_id_ * 8 * dec->cache_uv_stride_;
-    uint8_t* const ydst = dec->cache_y_ + dec->mb_x_ * 16 + y_offset;
-    uint8_t* const udst = dec->cache_u_ + dec->mb_x_ * 8 + uv_offset;
-    uint8_t* const vdst = dec->cache_v_ + dec->mb_x_ * 8 + uv_offset;
-    for (y = 0; y < 16; ++y) {
-      memcpy(ydst + y * dec->cache_y_stride_,
-             dec->yuv_b_ + Y_OFF + y * BPS, 16);
-    }
-    for (y = 0; y < 8; ++y) {
-      memcpy(udst + y * dec->cache_uv_stride_,
-           dec->yuv_b_ + U_OFF + y * BPS, 8);
-      memcpy(vdst + y * dec->cache_uv_stride_,
-           dec->yuv_b_ + V_OFF + y * BPS, 8);
+        if (hdr->sharpness_ > 0) {
+          if (hdr->sharpness_ > 4) {
+            level >>= 2;
+          } else {
+            level >>= 1;
+          }
+          if (level > 9 - hdr->sharpness_) {
+            level = 9 - hdr->sharpness_;
+          }
+        }
+        info->f_ilevel_ = (level < 1) ? 1 : level;
+        info->f_inner_ = 0;
+      }
    }
  }
 }
@ -326,7 +161,7 @@ void VP8StoreBlock(VP8Decoder* const dec) {
 #define MACROBLOCK_VPOS(mb_y)  ((mb_y) * 16)    // vertical position of a MB

 // Finalize and transmit a complete row. Return false in case of user-abort.
-int VP8FinishRow(VP8Decoder* const dec, VP8Io* io) {
+static int FinishRow(VP8Decoder* const dec, VP8Io* const io) {
  int ok = 1;
  const VP8ThreadContext* const ctx = &dec->thread_ctx_;
  const int extra_y_rows = kFilterExtraRows[dec->filter_type_];
@ -365,15 +200,18 @@ int VP8FinishRow(VP8Decoder* const dec, VP8Io* io) {
      y_end = io->crop_bottom;    // make sure we don't overflow on last row.
    }
    io->a = NULL;
-#ifdef WEBP_EXPERIMENTAL_FEATURES
-    if (dec->alpha_data_) {
+    if (dec->alpha_data_ != NULL && y_start < y_end) {
+      // TODO(skal): several things to correct here:
+      // * testing presence of alpha with dec->alpha_data_ is not a good idea
+      // * we're actually decompressing the full plane only once. It should be
+      //   more obvious from signature.
+      // * we could free alpha_data_ right after this call, but we don't own.
      io->a = VP8DecompressAlphaRows(dec, y_start, y_end - y_start);
      if (io->a == NULL) {
        return VP8SetError(dec, VP8_STATUS_BITSTREAM_ERROR,
                           "Could not decode alpha data.");
      }
    }
-#endif
    if (y_start < io->crop_top) {
      const int delta_y = io->crop_top - y_start;
      y_start = io->crop_top;
@ -381,7 +219,7 @@ int VP8FinishRow(VP8Decoder* const dec, VP8Io* io) {
      io->y += dec->cache_y_stride_ * delta_y;
      io->u += dec->cache_uv_stride_ * (delta_y >> 1);
      io->v += dec->cache_uv_stride_ * (delta_y >> 1);
-      if (io->a) {
+      if (io->a != NULL) {
        io->a += io->width * delta_y;
      }
    }
@ -389,7 +227,7 @@ int VP8FinishRow(VP8Decoder* const dec, VP8Io* io) {
      io->y += io->crop_left;
      io->u += io->crop_left >> 1;
      io->v += io->crop_left >> 1;
-      if (io->a) {
+      if (io->a != NULL) {
        io->a += io->crop_left;
      }
      io->mb_y = y_start - io->crop_top;
@ -421,7 +259,7 @@ int VP8ProcessRow(VP8Decoder* const dec, VP8Io* const io) {
    // ctx->id_ and ctx->f_info_ are already set
    ctx->mb_y_ = dec->mb_y_;
    ctx->filter_row_ = dec->filter_row_;
-    ok = VP8FinishRow(dec, io);
+    ok = FinishRow(dec, io);
  } else {
    WebPWorker* const worker = &dec->worker_;
    // Finish previous job *before* updating context
@ -482,8 +320,13 @@ VP8StatusCode VP8EnterCritical(VP8Decoder* const dec, VP8Io* const io) {
      dec->tl_mb_y_ = 0;
    } else {
      // For simple filter, we can filter only the cropped region.
-      dec->tl_mb_y_ = io->crop_top >> 4;
-      dec->tl_mb_x_ = io->crop_left >> 4;
+      // We include 'extra_pixels' on the other side of the boundary, since
+      // vertical or horizontal filtering of the previous macroblock can
+      // modify some abutting pixels.
+      dec->tl_mb_x_ = (io->crop_left - extra_pixels) >> 4;
+      dec->tl_mb_y_ = (io->crop_top - extra_pixels) >> 4;
+      if (dec->tl_mb_x_ < 0) dec->tl_mb_x_ = 0;
+      if (dec->tl_mb_y_ < 0) dec->tl_mb_y_ = 0;
    }
    // We need some 'extra' pixels on the right/bottom.
    dec->br_mb_y_ = (io->crop_bottom + 15 + extra_pixels) >> 4;
@ -495,6 +338,7 @@ VP8StatusCode VP8EnterCritical(VP8Decoder* const dec, VP8Io* const io) {
      dec->br_mb_y_ = dec->mb_h_;
    }
  }
+  PrecomputeFilterStrengths(dec);
  return VP8_STATUS_OK;
 }

@ -510,6 +354,178 @@ int VP8ExitCritical(VP8Decoder* const dec, VP8Io* const io) {
  return ok;
 }

+//------------------------------------------------------------------------------
+// For multi-threaded decoding we need to use 3 rows of 16 pixels as delay line.
+//
+// Reason is: the deblocking filter cannot deblock the bottom horizontal edges
+// immediately, and needs to wait for first few rows of the next macroblock to
+// be decoded. Hence, deblocking is lagging behind by 4 or 8 pixels (depending
+// on strength).
+// With two threads, the vertical positions of the rows being decoded are:
+// Decode:  [ 0..15][16..31][32..47][48..63][64..79][...
+// Deblock:         [ 0..11][12..27][28..43][44..59][...
+// If we use two threads and two caches of 16 pixels, the sequence would be:
+// Decode:  [ 0..15][16..31][ 0..15!!][16..31][ 0..15][...
+// Deblock:         [ 0..11][12..27!!][-4..11][12..27][...
+// The problem occurs during row [12..15!!] that both the decoding and
+// deblocking threads are writing simultaneously.
+// With 3 cache lines, one get a safe write pattern:
+// Decode:  [ 0..15][16..31][32..47][ 0..15][16..31][32..47][0..
+// Deblock:         [ 0..11][12..27][28..43][-4..11][12..27][28...
+// Note that multi-threaded output _without_ deblocking can make use of two
+// cache lines of 16 pixels only, since there's no lagging behind. The decoding
+// and output process have non-concurrent writing:
+// Decode:  [ 0..15][16..31][ 0..15][16..31][...
+// io->put:         [ 0..15][16..31][ 0..15][...
+
+#define MT_CACHE_LINES 3
+#define ST_CACHE_LINES 1   // 1 cache row only for single-threaded case
+
+// Initialize multi/single-thread worker
+static int InitThreadContext(VP8Decoder* const dec) {
+  dec->cache_id_ = 0;
+  if (dec->use_threads_) {
+    WebPWorker* const worker = &dec->worker_;
+    if (!WebPWorkerReset(worker)) {
+      return VP8SetError(dec, VP8_STATUS_OUT_OF_MEMORY,
+                         "thread initialization failed.");
+    }
+    worker->data1 = dec;
+    worker->data2 = (void*)&dec->thread_ctx_.io_;
+    worker->hook = (WebPWorkerHook)FinishRow;
+    dec->num_caches_ =
+      (dec->filter_type_ > 0) ? MT_CACHE_LINES : MT_CACHE_LINES - 1;
+  } else {
+    dec->num_caches_ = ST_CACHE_LINES;
+  }
+  return 1;
+}
+
+#undef MT_CACHE_LINES
+#undef ST_CACHE_LINES
+
+//------------------------------------------------------------------------------
+// Memory setup
+
+static int AllocateMemory(VP8Decoder* const dec) {
+  const int num_caches = dec->num_caches_;
+  const int mb_w = dec->mb_w_;
+  // Note: we use 'size_t' when there's no overflow risk, uint64_t otherwise.
+  const size_t intra_pred_mode_size = 4 * mb_w * sizeof(uint8_t);
+  const size_t top_size = (16 + 8 + 8) * mb_w;
+  const size_t mb_info_size = (mb_w + 1) * sizeof(VP8MB);
+  const size_t f_info_size =
+      (dec->filter_type_ > 0) ?
+          mb_w * (dec->use_threads_ ? 2 : 1) * sizeof(VP8FInfo)
+        : 0;
+  const size_t yuv_size = YUV_SIZE * sizeof(*dec->yuv_b_);
+  const size_t coeffs_size = 384 * sizeof(*dec->coeffs_);
+  const size_t cache_height = (16 * num_caches
+                            + kFilterExtraRows[dec->filter_type_]) * 3 / 2;
+  const size_t cache_size = top_size * cache_height;
+  // alpha_size is the only one that scales as width x height.
+  const uint64_t alpha_size = (dec->alpha_data_ != NULL) ?
+      (uint64_t)dec->pic_hdr_.width_ * dec->pic_hdr_.height_ : 0ULL;
+  const uint64_t needed = (uint64_t)intra_pred_mode_size
+                        + top_size + mb_info_size + f_info_size
+                        + yuv_size + coeffs_size
+                        + cache_size + alpha_size + ALIGN_MASK;
+  uint8_t* mem;
+
+  if (needed != (size_t)needed) return 0;  // check for overflow
+  if (needed > dec->mem_size_) {
+    free(dec->mem_);
+    dec->mem_size_ = 0;
+    dec->mem_ = WebPSafeMalloc(needed, sizeof(uint8_t));
+    if (dec->mem_ == NULL) {
+      return VP8SetError(dec, VP8_STATUS_OUT_OF_MEMORY,
+                         "no memory during frame initialization.");
+    }
+    // down-cast is ok, thanks to WebPSafeAlloc() above.
+    dec->mem_size_ = (size_t)needed;
+  }
+
+  mem = (uint8_t*)dec->mem_;
+  dec->intra_t_ = (uint8_t*)mem;
+  mem += intra_pred_mode_size;
+
+  dec->y_t_ = (uint8_t*)mem;
+  mem += 16 * mb_w;
+  dec->u_t_ = (uint8_t*)mem;
+  mem += 8 * mb_w;
+  dec->v_t_ = (uint8_t*)mem;
+  mem += 8 * mb_w;
+
+  dec->mb_info_ = ((VP8MB*)mem) + 1;
+  mem += mb_info_size;
+
+  dec->f_info_ = f_info_size ? (VP8FInfo*)mem : NULL;
+  mem += f_info_size;
+  dec->thread_ctx_.id_ = 0;
+  dec->thread_ctx_.f_info_ = dec->f_info_;
+  if (dec->use_threads_) {
+    // secondary cache line. The deblocking process need to make use of the
+    // filtering strength from previous macroblock row, while the new ones
+    // are being decoded in parallel. We'll just swap the pointers.
+    dec->thread_ctx_.f_info_ += mb_w;
+  }
+
+  mem = (uint8_t*)((uintptr_t)(mem + ALIGN_MASK) & ~ALIGN_MASK);
+  assert((yuv_size & ALIGN_MASK) == 0);
+  dec->yuv_b_ = (uint8_t*)mem;
+  mem += yuv_size;
+
+  dec->coeffs_ = (int16_t*)mem;
+  mem += coeffs_size;
+
+  dec->cache_y_stride_ = 16 * mb_w;
+  dec->cache_uv_stride_ = 8 * mb_w;
+  {
+    const int extra_rows = kFilterExtraRows[dec->filter_type_];
+    const int extra_y = extra_rows * dec->cache_y_stride_;
+    const int extra_uv = (extra_rows / 2) * dec->cache_uv_stride_;
+    dec->cache_y_ = ((uint8_t*)mem) + extra_y;
+    dec->cache_u_ = dec->cache_y_
+                  + 16 * num_caches * dec->cache_y_stride_ + extra_uv;
+    dec->cache_v_ = dec->cache_u_
+                  + 8 * num_caches * dec->cache_uv_stride_ + extra_uv;
+    dec->cache_id_ = 0;
+  }
+  mem += cache_size;
+
+  // alpha plane
+  dec->alpha_plane_ = alpha_size ? (uint8_t*)mem : NULL;
+  mem += alpha_size;
+  assert(mem <= (uint8_t*)dec->mem_ + dec->mem_size_);
+
+  // note: left-info is initialized once for all.
+  memset(dec->mb_info_ - 1, 0, mb_info_size);
+
+  // initialize top
+  memset(dec->intra_t_, B_DC_PRED, intra_pred_mode_size);
+
+  return 1;
+}
+
+static void InitIo(VP8Decoder* const dec, VP8Io* io) {
+  // prepare 'io'
+  io->mb_y = 0;
+  io->y = dec->cache_y_;
+  io->u = dec->cache_u_;
+  io->v = dec->cache_v_;
+  io->y_stride = dec->cache_y_stride_;
+  io->uv_stride = dec->cache_uv_stride_;
+  io->a = NULL;
+}
+
+int VP8InitFrame(VP8Decoder* const dec, VP8Io* io) {
+  if (!InitThreadContext(dec)) return 0;  // call first. Sets dec->num_caches_.
+  if (!AllocateMemory(dec)) return 0;
+  InitIo(dec, io);
+  VP8DspInit();  // Init critical function pointers and look-up tables.
+  return 1;
+}
+
 //------------------------------------------------------------------------------
 // Main reconstruction function.

@ -520,7 +536,7 @@ static const int kScan[16] = {
  0 + 12 * BPS,  4 + 12 * BPS, 8 + 12 * BPS, 12 + 12 * BPS
 };

-static inline int CheckMode(VP8Decoder* const dec, int mode) {
+static WEBP_INLINE int CheckMode(VP8Decoder* const dec, int mode) {
  if (mode == B_DC_PRED) {
    if (dec->mb_x_ == 0) {
      return (dec->mb_y_ == 0) ? B_DC_PRED_NOTOPLEFT : B_DC_PRED_NOLEFT;
@ -531,11 +547,12 @@ static inline int CheckMode(VP8Decoder* const dec, int mode) {
  return mode;
 }

-static inline void Copy32b(uint8_t* dst, uint8_t* src) {
+static WEBP_INLINE void Copy32b(uint8_t* dst, uint8_t* src) {
  *(uint32_t*)dst = *(uint32_t*)src;
 }

 void VP8ReconstructBlock(VP8Decoder* const dec) {
+  int j;
  uint8_t* const y_dst = dec->yuv_b_ + Y_OFF;
  uint8_t* const u_dst = dec->yuv_b_ + U_OFF;
  uint8_t* const v_dst = dec->yuv_b_ + V_OFF;
@ -543,7 +560,6 @@ void VP8ReconstructBlock(VP8Decoder* const dec) {
  // Rotate in the left samples from previously decoded block. We move four
  // pixels at a time for alignment reason, and because of in-loop filter.
  if (dec->mb_x_ > 0) {
-    int j;
    for (j = -1; j < 16; ++j) {
      Copy32b(&y_dst[j * BPS - 4], &y_dst[j * BPS + 12]);
    }
@ -552,7 +568,6 @@ void VP8ReconstructBlock(VP8Decoder* const dec) {
      Copy32b(&v_dst[j * BPS - 4], &v_dst[j * BPS + 4]);
    }
  } else {
-    int j;
    for (j = 0; j < 16; ++j) {
      y_dst[j * BPS - 1] = 129;
    }
@ -655,6 +670,21 @@ void VP8ReconstructBlock(VP8Decoder* const dec) {
      }
    }
  }
+  // Transfer reconstructed samples from yuv_b_ cache to final destination.
+  {
+    const int y_offset = dec->cache_id_ * 16 * dec->cache_y_stride_;
+    const int uv_offset = dec->cache_id_ * 8 * dec->cache_uv_stride_;
+    uint8_t* const y_out = dec->cache_y_ + dec->mb_x_ * 16 + y_offset;
+    uint8_t* const u_out = dec->cache_u_ + dec->mb_x_ * 8 + uv_offset;
+    uint8_t* const v_out = dec->cache_v_ + dec->mb_x_ * 8 + uv_offset;
+    for (j = 0; j < 16; ++j) {
+      memcpy(y_out + j * dec->cache_y_stride_, y_dst + j * BPS, 16);
+    }
+    for (j = 0; j < 8; ++j) {
+      memcpy(u_out + j * dec->cache_uv_stride_, u_dst + j * BPS, 8);
+      memcpy(v_out + j * dec->cache_uv_stride_, v_dst + j * BPS, 8);
+    }
+  }
 }

 //------------------------------------------------------------------------------
--- a/src/dec/idec.c
+++ b/src/dec/idec.c
@ -1,8 +1,10 @@
-// Copyright 2011 Google Inc.
+// Copyright 2011 Google Inc. All Rights Reserved.
 //
-// This code is licensed under the same terms as WebM:
-//  Software License Agreement:  http://www.webmproject.org/license/software/
-//  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
 // -----------------------------------------------------------------------------
 //
 // Incremental decoding
@ -13,13 +15,16 @@
 #include <string.h>
 #include <stdlib.h>

-#include "webpi.h"
-#include "vp8i.h"
+#include "./webpi.h"
+#include "./vp8i.h"
+#include "../utils/utils.h"

 #if defined(__cplusplus) || defined(c_plusplus)
 extern "C" {
 #endif

+// In append mode, buffer allocations increase as multiples of this value.
+// Needs to be a power of 2.
 #define CHUNK_SIZE 4096
 #define MAX_MB_SIZE 4096

@ -33,6 +38,8 @@ typedef enum {
  STATE_VP8_FRAME_HEADER,  // For VP8 Frame header (within VP8 chunk).
  STATE_VP8_PARTS0,
  STATE_VP8_DATA,
+  STATE_VP8L_HEADER,
+  STATE_VP8L_DATA,
  STATE_DONE,
  STATE_ERROR
 } DecState;
@ -47,8 +54,8 @@ typedef enum {
 // storage for partition #0 and partial data (in a rolling fashion)
 typedef struct {
  MemBufferMode mode_;  // Operation mode
-  uint32_t start_;      // start location of the data to be decoded
-  uint32_t end_;        // end location
+  size_t start_;        // start location of the data to be decoded
+  size_t end_;          // end location
  size_t buf_size_;     // size of the allocated buffer
  uint8_t* buf_;        // We don't own this buffer in case WebPIUpdate()

@ -59,12 +66,13 @@ typedef struct {
 struct WebPIDecoder {
  DecState state_;         // current decoding state
  WebPDecParams params_;   // Params to store output info
-  VP8Decoder* dec_;
+  int is_lossless_;        // for down-casting 'dec_'.
+  void* dec_;              // either a VP8Decoder or a VP8LDecoder instance
  VP8Io io_;

  MemBuffer mem_;          // input memory buffer.
  WebPDecBuffer output_;   // output buffer (when no external one is supplied)
-  uint32_t vp8_size_;      // VP8 size extracted from VP8 Header.
+  size_t chunk_size_;      // Compressed VP8/VP8L size extracted from Header.
 };

 // MB context to restore in case VP8DecodeMB() fails
@ -80,102 +88,129 @@ typedef struct {
 //------------------------------------------------------------------------------
 // MemBuffer: incoming data handling

-#define REMAP(PTR, OLD_BASE, NEW_BASE) (PTR) = (NEW_BASE) + ((PTR) - OLD_BASE)
+static void RemapBitReader(VP8BitReader* const br, ptrdiff_t offset) {
+  if (br->buf_ != NULL) {
+    br->buf_ += offset;
+    br->buf_end_ += offset;
+  }
+}

-static inline size_t MemDataSize(const MemBuffer* mem) {
+static WEBP_INLINE size_t MemDataSize(const MemBuffer* mem) {
  return (mem->end_ - mem->start_);
 }

+// Check if we need to preserve the compressed alpha data, as it may not have
+// been decoded yet.
+static int NeedCompressedAlpha(const WebPIDecoder* const idec) {
+  if (idec->state_ == STATE_PRE_VP8) {
+    // We haven't parsed the headers yet, so we don't know whether the image is
+    // lossy or lossless. This also means that we haven't parsed the ALPH chunk.
+    return 0;
+  }
+  if (idec->is_lossless_) {
+    return 0;  // ALPH chunk is not present for lossless images.
+  } else {
+    const VP8Decoder* const dec = (VP8Decoder*)idec->dec_;
+    assert(dec != NULL);  // Must be true as idec->state_ != STATE_PRE_VP8.
+    return (dec->alpha_data_ != NULL) && !dec->is_alpha_decoded_;
+  }
+}
+
+static void DoRemap(WebPIDecoder* const idec, ptrdiff_t offset) {
+  MemBuffer* const mem = &idec->mem_;
+  const uint8_t* const new_base = mem->buf_ + mem->start_;
+  // note: for VP8, setting up idec->io_ is only really needed at the beginning
+  // of the decoding, till partition #0 is complete.
+  idec->io_.data = new_base;
+  idec->io_.data_size = MemDataSize(mem);
+
+  if (idec->dec_ != NULL) {
+    if (!idec->is_lossless_) {
+      VP8Decoder* const dec = (VP8Decoder*)idec->dec_;
+      const int last_part = dec->num_parts_ - 1;
+      if (offset != 0) {
+        int p;
+        for (p = 0; p <= last_part; ++p) {
+          RemapBitReader(dec->parts_ + p, offset);
+        }
+        // Remap partition #0 data pointer to new offset, but only in MAP
+        // mode (in APPEND mode, partition #0 is copied into a fixed memory).
+        if (mem->mode_ == MEM_MODE_MAP) {
+          RemapBitReader(&dec->br_, offset);
+        }
+      }
+      assert(last_part >= 0);
+      dec->parts_[last_part].buf_end_ = mem->buf_ + mem->end_;
+      if (NeedCompressedAlpha(idec)) dec->alpha_data_ += offset;
+    } else {    // Resize lossless bitreader
+      VP8LDecoder* const dec = (VP8LDecoder*)idec->dec_;
+      VP8LBitReaderSetBuffer(&dec->br_, new_base, MemDataSize(mem));
+    }
+  }
+}
+
 // Appends data to the end of MemBuffer->buf_. It expands the allocated memory
 // size if required and also updates VP8BitReader's if new memory is allocated.
 static int AppendToMemBuffer(WebPIDecoder* const idec,
                             const uint8_t* const data, size_t data_size) {
+  VP8Decoder* const dec = (VP8Decoder*)idec->dec_;
  MemBuffer* const mem = &idec->mem_;
-  VP8Decoder* const dec = idec->dec_;
-  const int last_part = dec->num_parts_ - 1;
+  const int need_compressed_alpha = NeedCompressedAlpha(idec);
+  const uint8_t* const old_start = mem->buf_ + mem->start_;
+  const uint8_t* const old_base =
+      need_compressed_alpha ? dec->alpha_data_ : old_start;
  assert(mem->mode_ == MEM_MODE_APPEND);
+  if (data_size > MAX_CHUNK_PAYLOAD) {
+    // security safeguard: trying to allocate more than what the format
+    // allows for a chunk should be considered a smoke smell.
+    return 0;
+  }

  if (mem->end_ + data_size > mem->buf_size_) {  // Need some free memory
-    int p;
-    uint8_t* new_buf = NULL;
-    const int num_chunks = (MemDataSize(mem) + data_size + CHUNK_SIZE - 1)
-        / CHUNK_SIZE;
-    const size_t new_size = num_chunks * CHUNK_SIZE;
-    const uint8_t* const base = mem->buf_ + mem->start_;
-
-    new_buf = (uint8_t*)malloc(new_size);
-    if (!new_buf) return 0;
-    memcpy(new_buf, base, MemDataSize(mem));
-
-    // adjust VP8BitReader pointers
-    for (p = 0; p <= last_part; ++p) {
-      if (dec->parts_[p].buf_) {
-        REMAP(dec->parts_[p].buf_, base, new_buf);
-        REMAP(dec->parts_[p].buf_end_, base, new_buf);
-      }
-    }
-
-    // adjust memory pointers
+    const size_t new_mem_start = old_start - old_base;
+    const size_t current_size = MemDataSize(mem) + new_mem_start;
+    const uint64_t new_size = (uint64_t)current_size + data_size;
+    const uint64_t extra_size = (new_size + CHUNK_SIZE - 1) & ~(CHUNK_SIZE - 1);
+    uint8_t* const new_buf =
+        (uint8_t*)WebPSafeMalloc(extra_size, sizeof(*new_buf));
+    if (new_buf == NULL) return 0;
+    memcpy(new_buf, old_base, current_size);
    free(mem->buf_);
    mem->buf_ = new_buf;
-    mem->buf_size_ = new_size;
-
-    mem->end_ = MemDataSize(mem);
-    mem->start_ = 0;
+    mem->buf_size_ = (size_t)extra_size;
+    mem->start_ = new_mem_start;
+    mem->end_ = current_size;
  }

  memcpy(mem->buf_ + mem->end_, data, data_size);
  mem->end_ += data_size;
  assert(mem->end_ <= mem->buf_size_);
-  dec->parts_[last_part].buf_end_ = mem->buf_ + mem->end_;

-  // note: setting up idec->io_ is only really needed at the beginning
-  // of the decoding, till partition #0 is complete.
-  idec->io_.data = mem->buf_ + mem->start_;
-  idec->io_.data_size = MemDataSize(mem);
+  DoRemap(idec, mem->buf_ + mem->start_ - old_start);
  return 1;
 }

 static int RemapMemBuffer(WebPIDecoder* const idec,
                          const uint8_t* const data, size_t data_size) {
-  int p;
  MemBuffer* const mem = &idec->mem_;
-  VP8Decoder* const dec = idec->dec_;
-  const int last_part = dec->num_parts_ - 1;
-  const uint8_t* base = mem->buf_;
-
+  const uint8_t* const old_buf = mem->buf_;
+  const uint8_t* const old_start = old_buf + mem->start_;
  assert(mem->mode_ == MEM_MODE_MAP);
-  if (data_size < mem->buf_size_) {
-    return 0;  // we cannot remap to a shorter buffer!
-  }

-  for (p = 0; p <= last_part; ++p) {
-    if (dec->parts_[p].buf_) {
-      REMAP(dec->parts_[p].buf_, base, data);
-      REMAP(dec->parts_[p].buf_end_, base, data);
-    }
-  }
-  dec->parts_[last_part].buf_end_ = data + data_size;
-
-  // Remap partition #0 data pointer to new offset.
-  if (dec->br_.buf_) {
-    REMAP(dec->br_.buf_, base, data);
-    REMAP(dec->br_.buf_end_, base, data);
-  }
+  if (data_size < mem->buf_size_) return 0;  // can't remap to a shorter buffer!

  mem->buf_ = (uint8_t*)data;
  mem->end_ = mem->buf_size_ = data_size;

-  idec->io_.data = data;
-  idec->io_.data_size = data_size;
+  DoRemap(idec, mem->buf_ + mem->start_ - old_start);
  return 1;
 }

 static void InitMemBuffer(MemBuffer* const mem) {
  mem->mode_       = MEM_MODE_NONE;
-  mem->buf_        = 0;
+  mem->buf_        = NULL;
  mem->buf_size_   = 0;
-  mem->part0_buf_  = 0;
+  mem->part0_buf_  = NULL;
  mem->part0_size_ = 0;
 }

@ -197,8 +232,6 @@ static int CheckMemBufferMode(MemBuffer* const mem, MemBufferMode expected) {
  return 1;
 }

-#undef REMAP
-
 //------------------------------------------------------------------------------
 // Macroblock-decoding contexts

@ -244,81 +277,111 @@ static VP8StatusCode IDecError(WebPIDecoder* const idec, VP8StatusCode error) {
 }

 static void ChangeState(WebPIDecoder* const idec, DecState new_state,
-                        uint32_t consumed_bytes) {
+                        size_t consumed_bytes) {
+  MemBuffer* const mem = &idec->mem_;
  idec->state_ = new_state;
-  idec->mem_.start_ += consumed_bytes;
-  assert(idec->mem_.start_ <= idec->mem_.end_);
+  mem->start_ += consumed_bytes;
+  assert(mem->start_ <= mem->end_);
+  idec->io_.data = mem->buf_ + mem->start_;
+  idec->io_.data_size = MemDataSize(mem);
 }

 // Headers
 static VP8StatusCode DecodeWebPHeaders(WebPIDecoder* const idec) {
-  const uint8_t* data = idec->mem_.buf_ + idec->mem_.start_;
-  uint32_t curr_size = MemDataSize(&idec->mem_);
-  uint32_t vp8_size;
-  uint32_t bytes_skipped;
+  MemBuffer* const mem = &idec->mem_;
+  const uint8_t* data = mem->buf_ + mem->start_;
+  size_t curr_size = MemDataSize(mem);
  VP8StatusCode status;
+  WebPHeaderStructure headers;

-  status = WebPParseHeaders(&data, &curr_size, &vp8_size, &bytes_skipped);
+  headers.data = data;
+  headers.data_size = curr_size;
+  status = WebPParseHeaders(&headers);
  if (status == VP8_STATUS_NOT_ENOUGH_DATA) {
    return VP8_STATUS_SUSPENDED;  // We haven't found a VP8 chunk yet.
-  } else if (status == VP8_STATUS_OK) {
-    idec->vp8_size_ = vp8_size;
-    ChangeState(idec, STATE_VP8_FRAME_HEADER, bytes_skipped);
-    return VP8_STATUS_OK;  // We have skipped all pre-VP8 chunks.
-  } else {
+  } else if (status != VP8_STATUS_OK) {
    return IDecError(idec, status);
  }
+
+  idec->chunk_size_ = headers.compressed_size;
+  idec->is_lossless_ = headers.is_lossless;
+  if (!idec->is_lossless_) {
+    VP8Decoder* const dec = VP8New();
+    if (dec == NULL) {
+      return VP8_STATUS_OUT_OF_MEMORY;
+    }
+    idec->dec_ = dec;
+#ifdef WEBP_USE_THREAD
+    dec->use_threads_ = (idec->params_.options != NULL) &&
+                        (idec->params_.options->use_threads > 0);
+#else
+    dec->use_threads_ = 0;
+#endif
+    dec->alpha_data_ = headers.alpha_data;
+    dec->alpha_data_size_ = headers.alpha_data_size;
+    ChangeState(idec, STATE_VP8_FRAME_HEADER, headers.offset);
+  } else {
+    VP8LDecoder* const dec = VP8LNew();
+    if (dec == NULL) {
+      return VP8_STATUS_OUT_OF_MEMORY;
+    }
+    idec->dec_ = dec;
+    ChangeState(idec, STATE_VP8L_HEADER, headers.offset);
+  }
+  return VP8_STATUS_OK;
 }

 static VP8StatusCode DecodeVP8FrameHeader(WebPIDecoder* const idec) {
  const uint8_t* data = idec->mem_.buf_ + idec->mem_.start_;
-  const uint32_t curr_size = MemDataSize(&idec->mem_);
+  const size_t curr_size = MemDataSize(&idec->mem_);
  uint32_t bits;

  if (curr_size < VP8_FRAME_HEADER_SIZE) {
    // Not enough data bytes to extract VP8 Frame Header.
    return VP8_STATUS_SUSPENDED;
  }
-  if (!VP8GetInfo(data, curr_size, idec->vp8_size_, NULL, NULL, NULL)) {
+  if (!VP8GetInfo(data, curr_size, idec->chunk_size_, NULL, NULL)) {
    return IDecError(idec, VP8_STATUS_BITSTREAM_ERROR);
  }

  bits = data[0] | (data[1] << 8) | (data[2] << 16);
  idec->mem_.part0_size_ = (bits >> 5) + VP8_FRAME_HEADER_SIZE;

-  idec->io_.data_size = curr_size;
  idec->io_.data = data;
+  idec->io_.data_size = curr_size;
  idec->state_ = STATE_VP8_PARTS0;
  return VP8_STATUS_OK;
 }

 // Partition #0
-static int CopyParts0Data(WebPIDecoder* idec) {
-  VP8BitReader* const br = &idec->dec_->br_;
+static int CopyParts0Data(WebPIDecoder* const idec) {
+  VP8Decoder* const dec = (VP8Decoder*)idec->dec_;
+  VP8BitReader* const br = &dec->br_;
  const size_t psize = br->buf_end_ - br->buf_;
  MemBuffer* const mem = &idec->mem_;
-  assert(!mem->part0_buf_);
+  assert(!idec->is_lossless_);
+  assert(mem->part0_buf_ == NULL);
  assert(psize > 0);
-  assert(psize <= mem->part0_size_);
+  assert(psize <= mem->part0_size_);  // Format limit: no need for runtime check
  if (mem->mode_ == MEM_MODE_APPEND) {
    // We copy and grab ownership of the partition #0 data.
    uint8_t* const part0_buf = (uint8_t*)malloc(psize);
-    if (!part0_buf) {
+    if (part0_buf == NULL) {
      return 0;
    }
    memcpy(part0_buf, br->buf_, psize);
    mem->part0_buf_ = part0_buf;
-    mem->start_ += psize;
    br->buf_ = part0_buf;
    br->buf_end_ = part0_buf + psize;
  } else {
    // Else: just keep pointers to the partition #0's data in dec_->br_.
  }
+  mem->start_ += psize;
  return 1;
 }

 static VP8StatusCode DecodePartition0(WebPIDecoder* const idec) {
-  VP8Decoder* const dec = idec->dec_;
+  VP8Decoder* const dec = (VP8Decoder*)idec->dec_;
  VP8Io* const io = &idec->io_;
  const WebPDecParams* const params = &idec->params_;
  WebPDecBuffer* const output = params->output;
@ -366,13 +429,11 @@ static VP8StatusCode DecodePartition0(WebPIDecoder* const idec) {

 // Remaining partitions
 static VP8StatusCode DecodeRemaining(WebPIDecoder* const idec) {
-  VP8BitReader* br;
-  VP8Decoder* const dec = idec->dec_;
+  VP8Decoder* const dec = (VP8Decoder*)idec->dec_;
  VP8Io* const io = &idec->io_;

  assert(dec->ready_);

-  br = &dec->br_;
  for (; dec->mb_y_ < dec->mb_h_; ++dec->mb_y_) {
    VP8BitReader* token_br = &dec->parts_[dec->mb_y_ & (dec->num_parts_ - 1)];
    if (dec->mb_x_ == 0) {
@ -390,9 +451,8 @@ static VP8StatusCode DecodeRemaining(WebPIDecoder* const idec) {
        }
        return VP8_STATUS_SUSPENDED;
      }
+      // Reconstruct and emit samples.
      VP8ReconstructBlock(dec);
-      // Store data and save block's filtering params
-      VP8StoreBlock(dec);

      // Release buffer only if there is only one partition
      if (dec->num_parts_ == 1) {
@ -415,13 +475,69 @@ static VP8StatusCode DecodeRemaining(WebPIDecoder* const idec) {
  return VP8_STATUS_OK;
 }

+static int ErrorStatusLossless(WebPIDecoder* const idec, VP8StatusCode status) {
+  if (status == VP8_STATUS_SUSPENDED || status == VP8_STATUS_NOT_ENOUGH_DATA) {
+    return VP8_STATUS_SUSPENDED;
+  }
+  return IDecError(idec, status);
+}
+
+static VP8StatusCode DecodeVP8LHeader(WebPIDecoder* const idec) {
+  VP8Io* const io = &idec->io_;
+  VP8LDecoder* const dec = (VP8LDecoder*)idec->dec_;
+  const WebPDecParams* const params = &idec->params_;
+  WebPDecBuffer* const output = params->output;
+  size_t curr_size = MemDataSize(&idec->mem_);
+  assert(idec->is_lossless_);
+
+  // Wait until there's enough data for decoding header.
+  if (curr_size < (idec->chunk_size_ >> 3)) {
+    return VP8_STATUS_SUSPENDED;
+  }
+  if (!VP8LDecodeHeader(dec, io)) {
+    return ErrorStatusLossless(idec, dec->status_);
+  }
+  // Allocate/verify output buffer now.
+  dec->status_ = WebPAllocateDecBuffer(io->width, io->height, params->options,
+                                       output);
+  if (dec->status_ != VP8_STATUS_OK) {
+    return IDecError(idec, dec->status_);
+  }
+
+  idec->state_ = STATE_VP8L_DATA;
+  return VP8_STATUS_OK;
+}
+
+static VP8StatusCode DecodeVP8LData(WebPIDecoder* const idec) {
+  VP8LDecoder* const dec = (VP8LDecoder*)idec->dec_;
+  const size_t curr_size = MemDataSize(&idec->mem_);
+  assert(idec->is_lossless_);
+
+  // At present Lossless decoder can't decode image incrementally. So wait till
+  // all the image data is aggregated before image can be decoded.
+  if (curr_size < idec->chunk_size_) {
+    return VP8_STATUS_SUSPENDED;
+  }
+
+  if (!VP8LDecodeImage(dec)) {
+    return ErrorStatusLossless(idec, dec->status_);
+  }
+
+  idec->state_ = STATE_DONE;
+
+  return VP8_STATUS_OK;
+}
+
  // Main decoding loop
 static VP8StatusCode IDecode(WebPIDecoder* idec) {
  VP8StatusCode status = VP8_STATUS_SUSPENDED;
-  assert(idec->dec_);

  if (idec->state_ == STATE_PRE_VP8) {
    status = DecodeWebPHeaders(idec);
+  } else {
+    if (idec->dec_ == NULL) {
+      return VP8_STATUS_SUSPENDED;    // can't continue if we have no decoder.
+    }
  }
  if (idec->state_ == STATE_VP8_FRAME_HEADER) {
    status = DecodeVP8FrameHeader(idec);
@ -432,25 +548,26 @@ static VP8StatusCode IDecode(WebPIDecoder* idec) {
  if (idec->state_ == STATE_VP8_DATA) {
    status = DecodeRemaining(idec);
  }
+  if (idec->state_ == STATE_VP8L_HEADER) {
+    status = DecodeVP8LHeader(idec);
+  }
+  if (idec->state_ == STATE_VP8L_DATA) {
+    status = DecodeVP8LData(idec);
+  }
  return status;
 }

 //------------------------------------------------------------------------------
 // Public functions

-WebPIDecoder* WebPINewDecoder(WebPDecBuffer* const output_buffer) {
-  WebPIDecoder* idec = (WebPIDecoder*)calloc(1, sizeof(WebPIDecoder));
+WebPIDecoder* WebPINewDecoder(WebPDecBuffer* output_buffer) {
+  WebPIDecoder* idec = (WebPIDecoder*)calloc(1, sizeof(*idec));
  if (idec == NULL) {
    return NULL;
  }

-  idec->dec_ = VP8New();
-  if (idec->dec_ == NULL) {
-    free(idec);
-    return NULL;
-  }
-
  idec->state_ = STATE_PRE_VP8;
+  idec->chunk_size_ = 0;

  InitMemBuffer(&idec->mem_);
  WebPInitDecBuffer(&idec->output_);
@ -460,19 +577,11 @@ WebPIDecoder* WebPINewDecoder(WebPDecBuffer* const output_buffer) {
  idec->params_.output = output_buffer ? output_buffer : &idec->output_;
  WebPInitCustomIo(&idec->params_, &idec->io_);  // Plug the I/O functions.

-#ifdef WEBP_USE_THREAD
-  idec->dec_->use_threads_ = idec->params_.options &&
-                             (idec->params_.options->use_threads > 0);
-#else
-  idec->dec_->use_threads_ = 0;
-#endif
-  idec->vp8_size_ = 0;
-
  return idec;
 }

-WebPIDecoder* WebPIDecode(const uint8_t* data, uint32_t data_size,
-                          WebPDecoderConfig* const config) {
+WebPIDecoder* WebPIDecode(const uint8_t* data, size_t data_size,
+                          WebPDecoderConfig* config) {
  WebPIDecoder* idec;

  // Parse the bitstream's features, if requested:
@ -483,7 +592,7 @@ WebPIDecoder* WebPIDecode(const uint8_t* data, uint32_t data_size,
  }
  // Create an instance of the incremental decoder
  idec = WebPINewDecoder(config ? &config->output : NULL);
-  if (!idec) {
+  if (idec == NULL) {
    return NULL;
  }
  // Finish initialization
@ -493,9 +602,15 @@ WebPIDecoder* WebPIDecode(const uint8_t* data, uint32_t data_size,
  return idec;
 }

-void WebPIDelete(WebPIDecoder* const idec) {
-  if (!idec) return;
-  VP8Delete(idec->dec_);
+void WebPIDelete(WebPIDecoder* idec) {
+  if (idec == NULL) return;
+  if (idec->dec_ != NULL) {
+    if (!idec->is_lossless_) {
+      VP8Delete(idec->dec_);
+    } else {
+      VP8LDelete(idec->dec_);
+    }
+  }
  ClearMemBuffer(&idec->mem_);
  WebPFreeDecBuffer(&idec->output_);
  free(idec);
@ -504,34 +619,58 @@ void WebPIDelete(WebPIDecoder* const idec) {
 //------------------------------------------------------------------------------
 // Wrapper toward WebPINewDecoder

-WebPIDecoder* WebPINew(WEBP_CSP_MODE mode) {
-  WebPIDecoder* const idec = WebPINewDecoder(NULL);
-  if (!idec) return NULL;
-  idec->output_.colorspace = mode;
-  return idec;
-}
-
 WebPIDecoder* WebPINewRGB(WEBP_CSP_MODE mode, uint8_t* output_buffer,
-                          int output_buffer_size, int output_stride) {
+                          size_t output_buffer_size, int output_stride) {
+  const int is_external_memory = (output_buffer != NULL);
  WebPIDecoder* idec;
+
  if (mode >= MODE_YUV) return NULL;
+  if (!is_external_memory) {    // Overwrite parameters to sane values.
+    output_buffer_size = 0;
+    output_stride = 0;
+  } else {  // A buffer was passed. Validate the other params.
+    if (output_stride == 0 || output_buffer_size == 0) {
+      return NULL;   // invalid parameter.
+    }
+  }
  idec = WebPINewDecoder(NULL);
-  if (!idec) return NULL;
+  if (idec == NULL) return NULL;
  idec->output_.colorspace = mode;
-  idec->output_.is_external_memory = 1;
+  idec->output_.is_external_memory = is_external_memory;
  idec->output_.u.RGBA.rgba = output_buffer;
  idec->output_.u.RGBA.stride = output_stride;
  idec->output_.u.RGBA.size = output_buffer_size;
  return idec;
 }

-WebPIDecoder* WebPINewYUV(uint8_t* luma, int luma_size, int luma_stride,
-                          uint8_t* u, int u_size, int u_stride,
-                          uint8_t* v, int v_size, int v_stride) {
-  WebPIDecoder* const idec = WebPINewDecoder(NULL);
-  if (!idec) return NULL;
-  idec->output_.colorspace = MODE_YUV;
-  idec->output_.is_external_memory = 1;
+WebPIDecoder* WebPINewYUVA(uint8_t* luma, size_t luma_size, int luma_stride,
+                           uint8_t* u, size_t u_size, int u_stride,
+                           uint8_t* v, size_t v_size, int v_stride,
+                           uint8_t* a, size_t a_size, int a_stride) {
+  const int is_external_memory = (luma != NULL);
+  WebPIDecoder* idec;
+  WEBP_CSP_MODE colorspace;
+
+  if (!is_external_memory) {    // Overwrite parameters to sane values.
+    luma_size = u_size = v_size = a_size = 0;
+    luma_stride = u_stride = v_stride = a_stride = 0;
+    u = v = a = NULL;
+    colorspace = MODE_YUVA;
+  } else {  // A luma buffer was passed. Validate the other parameters.
+    if (u == NULL || v == NULL) return NULL;
+    if (luma_size == 0 || u_size == 0 || v_size == 0) return NULL;
+    if (luma_stride == 0 || u_stride == 0 || v_stride == 0) return NULL;
+    if (a != NULL) {
+      if (a_size == 0 || a_stride == 0) return NULL;
+    }
+    colorspace = (a == NULL) ? MODE_YUV : MODE_YUVA;
+  }
+
+  idec = WebPINewDecoder(NULL);
+  if (idec == NULL) return NULL;
+
+  idec->output_.colorspace = colorspace;
+  idec->output_.is_external_memory = is_external_memory;
  idec->output_.u.YUVA.y = luma;
  idec->output_.u.YUVA.y_stride = luma_stride;
  idec->output_.u.YUVA.y_size = luma_size;
@ -541,16 +680,25 @@ WebPIDecoder* WebPINewYUV(uint8_t* luma, int luma_size, int luma_stride,
  idec->output_.u.YUVA.v = v;
  idec->output_.u.YUVA.v_stride = v_stride;
  idec->output_.u.YUVA.v_size = v_size;
+  idec->output_.u.YUVA.a = a;
+  idec->output_.u.YUVA.a_stride = a_stride;
+  idec->output_.u.YUVA.a_size = a_size;
  return idec;
 }

+WebPIDecoder* WebPINewYUV(uint8_t* luma, size_t luma_size, int luma_stride,
+                          uint8_t* u, size_t u_size, int u_stride,
+                          uint8_t* v, size_t v_size, int v_stride) {
+  return WebPINewYUVA(luma, luma_size, luma_stride,
+                      u, u_size, u_stride,
+                      v, v_size, v_stride,
+                      NULL, 0, 0);
+}
+
 //------------------------------------------------------------------------------

 static VP8StatusCode IDecCheckStatus(const WebPIDecoder* const idec) {
  assert(idec);
-  if (idec->dec_ == NULL) {
-    return VP8_STATUS_USER_ABORT;
-  }
  if (idec->state_ == STATE_ERROR) {
    return VP8_STATUS_BITSTREAM_ERROR;
  }
@ -560,8 +708,8 @@ static VP8StatusCode IDecCheckStatus(const WebPIDecoder* const idec) {
  return VP8_STATUS_SUSPENDED;
 }

-VP8StatusCode WebPIAppend(WebPIDecoder* const idec, const uint8_t* data,
-                          uint32_t data_size) {
+VP8StatusCode WebPIAppend(WebPIDecoder* idec,
+                          const uint8_t* data, size_t data_size) {
  VP8StatusCode status;
  if (idec == NULL || data == NULL) {
    return VP8_STATUS_INVALID_PARAM;
@ -581,8 +729,8 @@ VP8StatusCode WebPIAppend(WebPIDecoder* const idec, const uint8_t* data,
  return IDecode(idec);
 }

-VP8StatusCode WebPIUpdate(WebPIDecoder* const idec, const uint8_t* data,
-                          uint32_t data_size) {
+VP8StatusCode WebPIUpdate(WebPIDecoder* idec,
+                          const uint8_t* data, size_t data_size) {
  VP8StatusCode status;
  if (idec == NULL || data == NULL) {
    return VP8_STATUS_INVALID_PARAM;
@ -605,61 +753,67 @@ VP8StatusCode WebPIUpdate(WebPIDecoder* const idec, const uint8_t* data,
 //------------------------------------------------------------------------------

 static const WebPDecBuffer* GetOutputBuffer(const WebPIDecoder* const idec) {
-  if (!idec || !idec->dec_ || idec->state_ <= STATE_VP8_PARTS0) {
+  if (idec == NULL || idec->dec_ == NULL) {
+    return NULL;
+  }
+  if (idec->state_ <= STATE_VP8_PARTS0) {
    return NULL;
  }
  return idec->params_.output;
 }

-const WebPDecBuffer* WebPIDecodedArea(const WebPIDecoder* const idec,
-                                      int* const left, int* const top,
-                                      int* const width, int* const height) {
+const WebPDecBuffer* WebPIDecodedArea(const WebPIDecoder* idec,
+                                      int* left, int* top,
+                                      int* width, int* height) {
  const WebPDecBuffer* const src = GetOutputBuffer(idec);
-  if (left) *left = 0;
-  if (top) *top = 0;
+  if (left != NULL) *left = 0;
+  if (top != NULL) *top = 0;
  // TODO(skal): later include handling of rotations.
  if (src) {
-    if (width) *width = src->width;
-    if (height) *height = idec->params_.last_y;
+    if (width != NULL) *width = src->width;
+    if (height != NULL) *height = idec->params_.last_y;
  } else {
-    if (width) *width = 0;
-    if (height) *height = 0;
+    if (width != NULL) *width = 0;
+    if (height != NULL) *height = 0;
  }
  return src;
 }

-uint8_t* WebPIDecGetRGB(const WebPIDecoder* const idec, int* last_y,
+uint8_t* WebPIDecGetRGB(const WebPIDecoder* idec, int* last_y,
                        int* width, int* height, int* stride) {
  const WebPDecBuffer* const src = GetOutputBuffer(idec);
-  if (!src) return NULL;
+  if (src == NULL) return NULL;
  if (src->colorspace >= MODE_YUV) {
    return NULL;
  }

-  if (last_y) *last_y = idec->params_.last_y;
-  if (width) *width = src->width;
-  if (height) *height = src->height;
-  if (stride) *stride = src->u.RGBA.stride;
+  if (last_y != NULL) *last_y = idec->params_.last_y;
+  if (width != NULL) *width = src->width;
+  if (height != NULL) *height = src->height;
+  if (stride != NULL) *stride = src->u.RGBA.stride;

  return src->u.RGBA.rgba;
 }

-uint8_t* WebPIDecGetYUV(const WebPIDecoder* const idec, int* last_y,
-                        uint8_t** u, uint8_t** v,
-                        int* width, int* height, int *stride, int* uv_stride) {
+uint8_t* WebPIDecGetYUVA(const WebPIDecoder* idec, int* last_y,
+                         uint8_t** u, uint8_t** v, uint8_t** a,
+                         int* width, int* height,
+                         int* stride, int* uv_stride, int* a_stride) {
  const WebPDecBuffer* const src = GetOutputBuffer(idec);
-  if (!src) return NULL;
+  if (src == NULL) return NULL;
  if (src->colorspace < MODE_YUV) {
    return NULL;
  }

-  if (last_y) *last_y = idec->params_.last_y;
-  if (u) *u = src->u.YUVA.u;
-  if (v) *v = src->u.YUVA.v;
-  if (width) *width = src->width;
-  if (height) *height = src->height;
-  if (stride) *stride = src->u.YUVA.y_stride;
-  if (uv_stride) *uv_stride = src->u.YUVA.u_stride;
+  if (last_y != NULL) *last_y = idec->params_.last_y;
+  if (u != NULL) *u = src->u.YUVA.u;
+  if (v != NULL) *v = src->u.YUVA.v;
+  if (a != NULL) *a = src->u.YUVA.a;
+  if (width != NULL) *width = src->width;
+  if (height != NULL) *height = src->height;
+  if (stride != NULL) *stride = src->u.YUVA.y_stride;
+  if (uv_stride != NULL) *uv_stride = src->u.YUVA.u_stride;
+  if (a_stride != NULL) *a_stride = src->u.YUVA.a_stride;

  return src->u.YUVA.y;
 }
@ -669,7 +823,7 @@ int WebPISetIOHooks(WebPIDecoder* const idec,
                    VP8IoSetupHook setup,
                    VP8IoTeardownHook teardown,
                    void* user_data) {
-  if (!idec || !idec->dec_ || idec->state_ > STATE_PRE_VP8) {
+  if (idec == NULL || idec->state_ > STATE_PRE_VP8) {
    return 0;
  }

--- a/src/dec/io.c
+++ b/src/dec/io.c
@ -1,8 +1,10 @@
-// Copyright 2011 Google Inc.
+// Copyright 2011 Google Inc. All Rights Reserved.
 //
-// This code is licensed under the same terms as WebM:
-//  Software License Agreement:  http://www.webmproject.org/license/software/
-//  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
 // -----------------------------------------------------------------------------
 //
 // functions for sample output.
@ -32,11 +34,12 @@ static int EmitYUV(const VP8Io* const io, WebPDecParams* const p) {
  const int mb_w = io->mb_w;
  const int mb_h = io->mb_h;
  const int uv_w = (mb_w + 1) / 2;
+  const int uv_h = (mb_h + 1) / 2;
  int j;
  for (j = 0; j < mb_h; ++j) {
    memcpy(y_dst + j * buf->y_stride, io->y + j * io->y_stride, mb_w);
  }
-  for (j = 0; j < (mb_h + 1) / 2; ++j) {
+  for (j = 0; j < uv_h; ++j) {
    memcpy(u_dst + j * buf->u_stride, io->u + j * io->uv_stride, uv_w);
    memcpy(v_dst + j * buf->v_stride, io->v + j * io->uv_stride, uv_w);
  }
@ -103,16 +106,14 @@ static int EmitFancyRGB(const VP8Io* const io, WebPDecParams* const p) {
  int num_lines_out = io->mb_h;   // a priori guess
  const WebPRGBABuffer* const buf = &p->output->u.RGBA;
  uint8_t* dst = buf->rgba + io->mb_y * buf->stride;
-  const WebPUpsampleLinePairFunc upsample =
-      io->a ? WebPUpsamplersKeepAlpha[p->output->colorspace]
-            : WebPUpsamplers[p->output->colorspace];
+  WebPUpsampleLinePairFunc upsample = WebPUpsamplers[p->output->colorspace];
  const uint8_t* cur_y = io->y;
  const uint8_t* cur_u = io->u;
  const uint8_t* cur_v = io->v;
  const uint8_t* top_u = p->tmp_u;
  const uint8_t* top_v = p->tmp_v;
  int y = io->mb_y;
-  int y_end = io->mb_y + io->mb_h;
+  const int y_end = io->mb_y + io->mb_h;
  const int mb_w = io->mb_w;
  const int uv_w = (mb_w + 1) / 2;

@ -121,11 +122,9 @@ static int EmitFancyRGB(const VP8Io* const io, WebPDecParams* const p) {
    upsample(NULL, cur_y, cur_u, cur_v, cur_u, cur_v, NULL, dst, mb_w);
  } else {
    // We can finish the left-over line from previous call.
-    // Warning! Don't overwrite the alpha values (if any), as they
-    // are not lagging one line behind but are already written.
    upsample(p->tmp_y, cur_y, top_u, top_v, cur_u, cur_v,
             dst - buf->stride, dst, mb_w);
-    num_lines_out++;
+    ++num_lines_out;
  }
  // Loop over each output pairs of row.
  for (; y + 2 < y_end; y += 2) {
@ -153,7 +152,7 @@ static int EmitFancyRGB(const VP8Io* const io, WebPDecParams* const p) {
    // Process the very last row of even-sized picture
    if (!(y_end & 1)) {
      upsample(cur_y, NULL, cur_u, cur_v, cur_u, cur_v,
-              dst + buf->stride, NULL, mb_w);
+               dst + buf->stride, NULL, mb_w);
    }
  }
  return num_lines_out;
@ -163,151 +162,130 @@ static int EmitFancyRGB(const VP8Io* const io, WebPDecParams* const p) {

 //------------------------------------------------------------------------------

-#ifdef WEBP_EXPERIMENTAL_FEATURES
 static int EmitAlphaYUV(const VP8Io* const io, WebPDecParams* const p) {
+  const uint8_t* alpha = io->a;
+  const WebPYUVABuffer* const buf = &p->output->u.YUVA;
  const int mb_w = io->mb_w;
  const int mb_h = io->mb_h;
-  int j;
-  const WebPYUVABuffer* const buf = &p->output->u.YUVA;
  uint8_t* dst = buf->a + io->mb_y * buf->a_stride;
-  const uint8_t* alpha = io->a;
-  if (alpha) {
+  int j;
+
+  if (alpha != NULL) {
    for (j = 0; j < mb_h; ++j) {
      memcpy(dst, alpha, mb_w * sizeof(*dst));
      alpha += io->width;
      dst += buf->a_stride;
    }
+  } else if (buf->a != NULL) {
+    // the user requested alpha, but there is none, set it to opaque.
+    for (j = 0; j < mb_h; ++j) {
+      memset(dst, 0xff, mb_w * sizeof(*dst));
+      dst += buf->a_stride;
+    }
  }
  return 0;
 }

+static int GetAlphaSourceRow(const VP8Io* const io,
+                             const uint8_t** alpha, int* const num_rows) {
+  int start_y = io->mb_y;
+  *num_rows = io->mb_h;
+
+  // Compensate for the 1-line delay of the fancy upscaler.
+  // This is similar to EmitFancyRGB().
+  if (io->fancy_upsampling) {
+    if (start_y == 0) {
+      // We don't process the last row yet. It'll be done during the next call.
+      --*num_rows;
+    } else {
+      --start_y;
+      // Fortunately, *alpha data is persistent, so we can go back
+      // one row and finish alpha blending, now that the fancy upscaler
+      // completed the YUV->RGB interpolation.
+      *alpha -= io->width;
+    }
+    if (io->crop_top + io->mb_y + io->mb_h == io->crop_bottom) {
+      // If it's the very last call, we process all the remaining rows!
+      *num_rows = io->crop_bottom - io->crop_top - start_y;
+    }
+  }
+  return start_y;
+}
+
 static int EmitAlphaRGB(const VP8Io* const io, WebPDecParams* const p) {
-  const int mb_w = io->mb_w;
-  const int mb_h = io->mb_h;
-  int i, j;
-  const WebPRGBABuffer* const buf = &p->output->u.RGBA;
-  uint8_t* dst = buf->rgba + io->mb_y * buf->stride;
  const uint8_t* alpha = io->a;
-  if (alpha) {
-    for (j = 0; j < mb_h; ++j) {
+  if (alpha != NULL) {
+    const int mb_w = io->mb_w;
+    const WEBP_CSP_MODE colorspace = p->output->colorspace;
+    const int alpha_first =
+        (colorspace == MODE_ARGB || colorspace == MODE_Argb);
+    const WebPRGBABuffer* const buf = &p->output->u.RGBA;
+    int num_rows;
+    const int start_y = GetAlphaSourceRow(io, &alpha, &num_rows);
+    uint8_t* const base_rgba = buf->rgba + start_y * buf->stride;
+    uint8_t* dst = base_rgba + (alpha_first ? 0 : 3);
+    uint32_t alpha_mask = 0xff;
+    int i, j;
+
+    for (j = 0; j < num_rows; ++j) {
      for (i = 0; i < mb_w; ++i) {
-        dst[4 * i + 3] = alpha[i];
+        const uint32_t alpha_value = alpha[i];
+        dst[4 * i] = alpha_value;
+        alpha_mask &= alpha_value;
      }
      alpha += io->width;
      dst += buf->stride;
    }
+    // alpha_mask is < 0xff if there's non-trivial alpha to premultiply with.
+    if (alpha_mask != 0xff && WebPIsPremultipliedMode(colorspace)) {
+      WebPApplyAlphaMultiply(base_rgba, alpha_first,
+                             mb_w, num_rows, buf->stride);
+    }
  }
  return 0;
 }

-#endif    /* WEBP_EXPERIMENTAL_FEATURES */
+static int EmitAlphaRGBA4444(const VP8Io* const io, WebPDecParams* const p) {
+  const uint8_t* alpha = io->a;
+  if (alpha != NULL) {
+    const int mb_w = io->mb_w;
+    const WEBP_CSP_MODE colorspace = p->output->colorspace;
+    const WebPRGBABuffer* const buf = &p->output->u.RGBA;
+    int num_rows;
+    const int start_y = GetAlphaSourceRow(io, &alpha, &num_rows);
+    uint8_t* const base_rgba = buf->rgba + start_y * buf->stride;
+    uint8_t* alpha_dst = base_rgba + 1;
+    uint32_t alpha_mask = 0x0f;
+    int i, j;

-//------------------------------------------------------------------------------
-// Simple picture rescaler
-
-// TODO(skal): start a common library for encoder and decoder, and factorize
-// this code in.
-
-#define RFIX 30
-#define MULT(x,y) (((int64_t)(x) * (y) + (1 << (RFIX - 1))) >> RFIX)
-
-static void InitRescaler(WebPRescaler* const wrk,
-                         int src_width, int src_height,
-                         uint8_t* dst,
-                         int dst_width, int dst_height, int dst_stride,
-                         int x_add, int x_sub, int y_add, int y_sub,
-                         int32_t* work) {
-  wrk->x_expand = (src_width < dst_width);
-  wrk->src_width = src_width;
-  wrk->src_height = src_height;
-  wrk->dst_width = dst_width;
-  wrk->dst_height = dst_height;
-  wrk->dst = dst;
-  wrk->dst_stride = dst_stride;
-  // for 'x_expand', we use bilinear interpolation
-  wrk->x_add = wrk->x_expand ? (x_sub - 1) : x_add - x_sub;
-  wrk->x_sub = wrk->x_expand ? (x_add - 1) : x_sub;
-  wrk->y_accum = y_add;
-  wrk->y_add = y_add;
-  wrk->y_sub = y_sub;
-  wrk->fx_scale = (1 << RFIX) / x_sub;
-  wrk->fy_scale = (1 << RFIX) / y_sub;
-  wrk->fxy_scale = wrk->x_expand ?
-      ((int64_t)dst_height << RFIX) / (x_sub * src_height) :
-      ((int64_t)dst_height << RFIX) / (x_add * src_height);
-  wrk->irow = work;
-  wrk->frow = work + dst_width;
-}
-
-static inline void ImportRow(const uint8_t* const src,
-                             WebPRescaler* const wrk) {
-  int x_in = 0;
-  int x_out;
-  int accum = 0;
-  if (!wrk->x_expand) {
-    int sum = 0;
-    for (x_out = 0; x_out < wrk->dst_width; ++x_out) {
-      accum += wrk->x_add;
-      for (; accum > 0; accum -= wrk->x_sub) {
-        sum += src[x_in++];
-      }
-      {        // Emit next horizontal pixel.
-        const int32_t base = src[x_in++];
-        const int32_t frac = base * (-accum);
-        wrk->frow[x_out] = (sum + base) * wrk->x_sub - frac;
-        // fresh fractional start for next pixel
-        sum = MULT(frac, wrk->fx_scale);
+    for (j = 0; j < num_rows; ++j) {
+      for (i = 0; i < mb_w; ++i) {
+        // Fill in the alpha value (converted to 4 bits).
+        const uint32_t alpha_value = alpha[i] >> 4;
+        alpha_dst[2 * i] = (alpha_dst[2 * i] & 0xf0) | alpha_value;
+        alpha_mask &= alpha_value;
      }
+      alpha += io->width;
+      alpha_dst += buf->stride;
    }
-  } else {        // simple bilinear interpolation
-    int left = src[0], right = src[0];
-    for (x_out = 0; x_out < wrk->dst_width; ++x_out) {
-      if (accum < 0) {
-        left = right;
-        right = src[++x_in];
-        accum += wrk->x_add;
-      }
-      wrk->frow[x_out] = right * wrk->x_add + (left - right) * accum;
-      accum -= wrk->x_sub;
+    if (alpha_mask != 0x0f && WebPIsPremultipliedMode(colorspace)) {
+      WebPApplyAlphaMultiply4444(base_rgba, mb_w, num_rows, buf->stride);
    }
  }
-  // Accumulate the new row's contribution
-  for (x_out = 0; x_out < wrk->dst_width; ++x_out) {
-    wrk->irow[x_out] += wrk->frow[x_out];
-  }
+  return 0;
 }

-static void ExportRow(WebPRescaler* const wrk) {
-  int x_out;
-  const int yscale = wrk->fy_scale * (-wrk->y_accum);
-  assert(wrk->y_accum <= 0);
-  for (x_out = 0; x_out < wrk->dst_width; ++x_out) {
-    const int frac = MULT(wrk->frow[x_out], yscale);
-    const int v = (int)MULT(wrk->irow[x_out] - frac, wrk->fxy_scale);
-    wrk->dst[x_out] = (!(v & ~0xff)) ? v : (v < 0) ? 0 : 255;
-    wrk->irow[x_out] = frac;   // new fractional start
-  }
-  wrk->y_accum += wrk->y_add;
-  wrk->dst += wrk->dst_stride;
-}
-
-#undef MULT
-#undef RFIX
-
 //------------------------------------------------------------------------------
 // YUV rescaling (no final RGB conversion needed)

 static int Rescale(const uint8_t* src, int src_stride,
                   int new_lines, WebPRescaler* const wrk) {
  int num_lines_out = 0;
-  while (new_lines-- > 0) {    // import new contribution of one source row.
-    ImportRow(src, wrk);
-    src += src_stride;
-    wrk->y_accum -= wrk->y_sub;
-    while (wrk->y_accum <= 0) {      // emit output row(s)
-      ExportRow(wrk);
-      num_lines_out++;
-    }
+  while (new_lines > 0) {    // import new contributions of source rows.
+    const int lines_in = WebPRescalerImport(wrk, new_lines, src, src_stride);
+    src += lines_in * src_stride;
+    new_lines -= lines_in;
+    num_lines_out += WebPRescalerExport(wrk);    // emit output row(s)
  }
  return num_lines_out;
 }
@ -322,19 +300,14 @@ static int EmitRescaledYUV(const VP8Io* const io, WebPDecParams* const p) {
 }

 static int EmitRescaledAlphaYUV(const VP8Io* const io, WebPDecParams* const p) {
-  if (io->a) {
+  if (io->a != NULL) {
    Rescale(io->a, io->width, io->mb_h, &p->scaler_a);
  }
  return 0;
 }

-static int IsAlphaMode(WEBP_CSP_MODE mode) {
-  return (mode == MODE_RGBA || mode == MODE_BGRA || mode == MODE_ARGB ||
-          mode == MODE_RGBA_4444 || mode == MODE_YUVA);
-}
-
 static int InitYUVRescaler(const VP8Io* const io, WebPDecParams* const p) {
-  const int has_alpha = IsAlphaMode(p->output->colorspace);
+  const int has_alpha = WebPIsAlphaMode(p->output->colorspace);
  const WebPYUVABuffer* const buf = &p->output->u.YUVA;
  const int out_width  = io->scaled_width;
  const int out_height = io->scaled_height;
@ -356,26 +329,27 @@ static int InitYUVRescaler(const VP8Io* const io, WebPDecParams* const p) {
    return 0;   // memory error
  }
  work = (int32_t*)p->memory;
-  InitRescaler(&p->scaler_y, io->mb_w, io->mb_h,
-               buf->y, out_width, out_height, buf->y_stride,
-               io->mb_w, out_width, io->mb_h, out_height,
-               work);
-  InitRescaler(&p->scaler_u, uv_in_width, uv_in_height,
-               buf->u, uv_out_width, uv_out_height, buf->u_stride,
-               uv_in_width, uv_out_width,
-               uv_in_height, uv_out_height,
-               work + work_size);
-  InitRescaler(&p->scaler_v, uv_in_width, uv_in_height,
-               buf->v, uv_out_width, uv_out_height, buf->v_stride,
-               uv_in_width, uv_out_width,
-               uv_in_height, uv_out_height,
-               work + work_size + uv_work_size);
+  WebPRescalerInit(&p->scaler_y, io->mb_w, io->mb_h,
+                   buf->y, out_width, out_height, buf->y_stride, 1,
+                   io->mb_w, out_width, io->mb_h, out_height,
+                   work);
+  WebPRescalerInit(&p->scaler_u, uv_in_width, uv_in_height,
+                   buf->u, uv_out_width, uv_out_height, buf->u_stride, 1,
+                   uv_in_width, uv_out_width,
+                   uv_in_height, uv_out_height,
+                   work + work_size);
+  WebPRescalerInit(&p->scaler_v, uv_in_width, uv_in_height,
+                   buf->v, uv_out_width, uv_out_height, buf->v_stride, 1,
+                   uv_in_width, uv_out_width,
+                   uv_in_height, uv_out_height,
+                   work + work_size + uv_work_size);
  p->emit = EmitRescaledYUV;
+
  if (has_alpha) {
-    InitRescaler(&p->scaler_a, io->mb_w, io->mb_h,
-                 buf->a, out_width, out_height, buf->a_stride,
-                 io->mb_w, out_width, io->mb_h, out_height,
-                 work + work_size + 2 * uv_work_size);
+    WebPRescalerInit(&p->scaler_a, io->mb_w, io->mb_h,
+                     buf->a, out_width, out_height, buf->a_stride, 1,
+                     io->mb_w, out_width, io->mb_h, out_height,
+                     work + work_size + 2 * uv_work_size);
    p->emit_alpha = EmitRescaledAlphaYUV;
  }
  return 1;
@ -384,20 +358,6 @@ static int InitYUVRescaler(const VP8Io* const io, WebPDecParams* const p) {
 //------------------------------------------------------------------------------
 // RGBA rescaling

-// import new contributions until one row is ready to be output, or all input
-// is consumed.
-static int Import(const uint8_t* src, int src_stride,
-                  int new_lines, WebPRescaler* const wrk) {
-  int num_lines_in = 0;
-  while (num_lines_in < new_lines && wrk->y_accum > 0) {
-    ImportRow(src, wrk);
-    src += src_stride;
-    ++num_lines_in;
-    wrk->y_accum -= wrk->y_sub;
-  }
-  return num_lines_in;
-}
-
 static int ExportRGB(WebPDecParams* const p, int y_pos) {
  const WebPYUV444Converter convert =
      WebPYUV444Converters[p->output->colorspace];
@ -406,16 +366,17 @@ static int ExportRGB(WebPDecParams* const p, int y_pos) {
  int num_lines_out = 0;
  // For RGB rescaling, because of the YUV420, current scan position
  // U/V can be +1/-1 line from the Y one.  Hence the double test.
-  while (p->scaler_y.y_accum <= 0 && p->scaler_u.y_accum <= 0) {
+  while (WebPRescalerHasPendingOutput(&p->scaler_y) &&
+         WebPRescalerHasPendingOutput(&p->scaler_u)) {
    assert(p->last_y + y_pos + num_lines_out < p->output->height);
    assert(p->scaler_u.y_accum == p->scaler_v.y_accum);
-    ExportRow(&p->scaler_y);
-    ExportRow(&p->scaler_u);
-    ExportRow(&p->scaler_v);
+    WebPRescalerExportRow(&p->scaler_y);
+    WebPRescalerExportRow(&p->scaler_u);
+    WebPRescalerExportRow(&p->scaler_v);
    convert(p->scaler_y.dst, p->scaler_u.dst, p->scaler_v.dst,
            dst, p->scaler_y.dst_width);
    dst += buf->stride;
-    num_lines_out++;
+    ++num_lines_out;
  }
  return num_lines_out;
 }
@ -426,12 +387,15 @@ static int EmitRescaledRGB(const VP8Io* const io, WebPDecParams* const p) {
  int j = 0, uv_j = 0;
  int num_lines_out = 0;
  while (j < mb_h) {
-    const int y_lines_in = Import(io->y + j * io->y_stride, io->y_stride,
-                                  mb_h - j, &p->scaler_y);
-    const int u_lines_in = Import(io->u + uv_j * io->uv_stride, io->uv_stride,
-                                  uv_mb_h - uv_j, &p->scaler_u);
-    const int v_lines_in = Import(io->v + uv_j * io->uv_stride, io->uv_stride,
-                                  uv_mb_h - uv_j, &p->scaler_v);
+    const int y_lines_in =
+        WebPRescalerImport(&p->scaler_y, mb_h - j,
+                           io->y + j * io->y_stride, io->y_stride);
+    const int u_lines_in =
+        WebPRescalerImport(&p->scaler_u, uv_mb_h - uv_j,
+                           io->u + uv_j * io->uv_stride, io->uv_stride);
+    const int v_lines_in =
+        WebPRescalerImport(&p->scaler_v, uv_mb_h - uv_j,
+                           io->v + uv_j * io->uv_stride, io->uv_stride);
    (void)v_lines_in;   // remove a gcc warning
    assert(u_lines_in == v_lines_in);
    j += y_lines_in;
@ -443,34 +407,80 @@ static int EmitRescaledRGB(const VP8Io* const io, WebPDecParams* const p) {

 static int ExportAlpha(WebPDecParams* const p, int y_pos) {
  const WebPRGBABuffer* const buf = &p->output->u.RGBA;
-  uint8_t* dst = buf->rgba + (p->last_y + y_pos) * buf->stride;
+  uint8_t* const base_rgba = buf->rgba + (p->last_y + y_pos) * buf->stride;
+  const WEBP_CSP_MODE colorspace = p->output->colorspace;
+  const int alpha_first =
+      (colorspace == MODE_ARGB || colorspace == MODE_Argb);
+  uint8_t* dst = base_rgba + (alpha_first ? 0 : 3);
  int num_lines_out = 0;
-  while (p->scaler_a.y_accum <= 0) {
+  const int is_premult_alpha = WebPIsPremultipliedMode(colorspace);
+  uint32_t alpha_mask = 0xff;
+  const int width = p->scaler_a.dst_width;
+
+  while (WebPRescalerHasPendingOutput(&p->scaler_a)) {
    int i;
    assert(p->last_y + y_pos + num_lines_out < p->output->height);
-    ExportRow(&p->scaler_a);
-    for (i = 0; i < p->scaler_a.dst_width; ++i) {
-      dst[4 * i + 3] = p->scaler_a.dst[i];
+    WebPRescalerExportRow(&p->scaler_a);
+    for (i = 0; i < width; ++i) {
+      const uint32_t alpha_value = p->scaler_a.dst[i];
+      dst[4 * i] = alpha_value;
+      alpha_mask &= alpha_value;
    }
    dst += buf->stride;
-    num_lines_out++;
+    ++num_lines_out;
+  }
+  if (is_premult_alpha && alpha_mask != 0xff) {
+    WebPApplyAlphaMultiply(base_rgba, alpha_first,
+                           width, num_lines_out, buf->stride);
+  }
+  return num_lines_out;
+}
+
+static int ExportAlphaRGBA4444(WebPDecParams* const p, int y_pos) {
+  const WebPRGBABuffer* const buf = &p->output->u.RGBA;
+  uint8_t* const base_rgba = buf->rgba + (p->last_y + y_pos) * buf->stride;
+  uint8_t* alpha_dst = base_rgba + 1;
+  int num_lines_out = 0;
+  const WEBP_CSP_MODE colorspace = p->output->colorspace;
+  const int width = p->scaler_a.dst_width;
+  const int is_premult_alpha = WebPIsPremultipliedMode(colorspace);
+  uint32_t alpha_mask = 0x0f;
+
+  while (WebPRescalerHasPendingOutput(&p->scaler_a)) {
+    int i;
+    assert(p->last_y + y_pos + num_lines_out < p->output->height);
+    WebPRescalerExportRow(&p->scaler_a);
+    for (i = 0; i < width; ++i) {
+      // Fill in the alpha value (converted to 4 bits).
+      const uint32_t alpha_value = p->scaler_a.dst[i] >> 4;
+      alpha_dst[2 * i] = (alpha_dst[2 * i] & 0xf0) | alpha_value;
+      alpha_mask &= alpha_value;
+    }
+    alpha_dst += buf->stride;
+    ++num_lines_out;
+  }
+  if (is_premult_alpha && alpha_mask != 0x0f) {
+    WebPApplyAlphaMultiply4444(base_rgba, width, num_lines_out, buf->stride);
  }
  return num_lines_out;
 }

 static int EmitRescaledAlphaRGB(const VP8Io* const io, WebPDecParams* const p) {
-  if (io->a) {
-    int j = 0, pos = 0;
+  if (io->a != NULL) {
+    WebPRescaler* const scaler = &p->scaler_a;
+    int j = 0;
+    int pos = 0;
    while (j < io->mb_h) {
-      j += Import(io->a + j * io->width, io->width, io->mb_h - j, &p->scaler_a);
-      pos += ExportAlpha(p, pos);
+      j += WebPRescalerImport(scaler, io->mb_h - j,
+                              io->a + j * io->width, io->width);
+      pos += p->emit_alpha_row(p, pos);
    }
  }
  return 0;
 }

 static int InitRGBRescaler(const VP8Io* const io, WebPDecParams* const p) {
-  const int has_alpha = IsAlphaMode(p->output->colorspace);
+  const int has_alpha = WebPIsAlphaMode(p->output->colorspace);
  const int out_width  = io->scaled_width;
  const int out_height = io->scaled_height;
  const int uv_in_width  = (io->mb_w + 1) >> 1;
@ -486,33 +496,38 @@ static int InitRGBRescaler(const VP8Io* const io, WebPDecParams* const p) {
    tmp_size1 += work_size;
    tmp_size2 += out_width;
  }
-  p->memory =
-      calloc(1, tmp_size1 * sizeof(*work) + tmp_size2 * sizeof(*tmp));
+  p->memory = calloc(1, tmp_size1 * sizeof(*work) + tmp_size2 * sizeof(*tmp));
  if (p->memory == NULL) {
    return 0;   // memory error
  }
  work = (int32_t*)p->memory;
  tmp = (uint8_t*)(work + tmp_size1);
-  InitRescaler(&p->scaler_y, io->mb_w, io->mb_h,
-               tmp + 0 * out_width, out_width, out_height, 0,
-               io->mb_w, out_width, io->mb_h, out_height,
-               work + 0 * work_size);
-  InitRescaler(&p->scaler_u, uv_in_width, uv_in_height,
-               tmp + 1 * out_width, out_width, out_height, 0,
-               io->mb_w, 2 * out_width, io->mb_h, 2 * out_height,
-               work + 1 * work_size);
-  InitRescaler(&p->scaler_v, uv_in_width, uv_in_height,
-               tmp + 2 * out_width, out_width, out_height, 0,
-               io->mb_w, 2 * out_width, io->mb_h, 2 * out_height,
-               work + 2 * work_size);
+  WebPRescalerInit(&p->scaler_y, io->mb_w, io->mb_h,
+                   tmp + 0 * out_width, out_width, out_height, 0, 1,
+                   io->mb_w, out_width, io->mb_h, out_height,
+                   work + 0 * work_size);
+  WebPRescalerInit(&p->scaler_u, uv_in_width, uv_in_height,
+                   tmp + 1 * out_width, out_width, out_height, 0, 1,
+                   io->mb_w, 2 * out_width, io->mb_h, 2 * out_height,
+                   work + 1 * work_size);
+  WebPRescalerInit(&p->scaler_v, uv_in_width, uv_in_height,
+                   tmp + 2 * out_width, out_width, out_height, 0, 1,
+                   io->mb_w, 2 * out_width, io->mb_h, 2 * out_height,
+                   work + 2 * work_size);
  p->emit = EmitRescaledRGB;

  if (has_alpha) {
-    InitRescaler(&p->scaler_a, io->mb_w, io->mb_h,
-                 tmp + 3 * out_width, out_width, out_height, 0,
-                 io->mb_w, out_width, io->mb_h, out_height,
-                 work + 3 * work_size);
+    WebPRescalerInit(&p->scaler_a, io->mb_w, io->mb_h,
+                     tmp + 3 * out_width, out_width, out_height, 0, 1,
+                     io->mb_w, out_width, io->mb_h, out_height,
+                     work + 3 * work_size);
    p->emit_alpha = EmitRescaledAlphaRGB;
+    if (p->output->colorspace == MODE_RGBA_4444 ||
+        p->output->colorspace == MODE_rgbA_4444) {
+      p->emit_alpha_row = ExportAlphaRGBA4444;
+    } else {
+      p->emit_alpha_row = ExportAlpha;
+    }
  }
  return 1;
 }
@ -520,67 +535,17 @@ static int InitRGBRescaler(const VP8Io* const io, WebPDecParams* const p) {
 //------------------------------------------------------------------------------
 // Default custom functions

-// Setup crop_xxx fields, mb_w and mb_h
-static int InitFromOptions(const WebPDecoderOptions* const options,
-                           VP8Io* const io) {
-  const int W = io->width;
-  const int H = io->height;
-  int x = 0, y = 0, w = W, h = H;
-
-  // Cropping
-  io->use_cropping = (options != NULL) && (options->use_cropping > 0);
-  if (io->use_cropping) {
-    w = options->crop_width;
-    h = options->crop_height;
-    // TODO(skal): take colorspace into account. Don't assume YUV420.
-    x = options->crop_left & ~1;
-    y = options->crop_top & ~1;
-    if (x < 0 || y < 0 || w <= 0 || h <= 0 || x + w > W || y + h > H) {
-      return 0;  // out of frame boundary error
-    }
-  }
-  io->crop_left   = x;
-  io->crop_top    = y;
-  io->crop_right  = x + w;
-  io->crop_bottom = y + h;
-  io->mb_w = w;
-  io->mb_h = h;
-
-  // Scaling
-  io->use_scaling = (options != NULL) && (options->use_scaling > 0);
-  if (io->use_scaling) {
-    if (options->scaled_width <= 0 || options->scaled_height <= 0) {
-      return 0;
-    }
-    io->scaled_width = options->scaled_width;
-    io->scaled_height = options->scaled_height;
-  }
-
-  // Filter
-  io->bypass_filtering = options && options->bypass_filtering;
-
-  // Fancy upsampler
-#ifdef FANCY_UPSAMPLING
-  io->fancy_upsampling = (options == NULL) || (!options->no_fancy_upsampling);
-#endif
-
-  if (io->use_scaling) {
-    // disable filter (only for large downscaling ratio).
-    io->bypass_filtering = (io->scaled_width < W * 3 / 4) &&
-                           (io->scaled_height < H * 3 / 4);
-    io->fancy_upsampling = 0;
-  }
-  return 1;
-}
-
 static int CustomSetup(VP8Io* io) {
  WebPDecParams* const p = (WebPDecParams*)io->opaque;
-  const int is_rgb = (p->output->colorspace < MODE_YUV);
+  const WEBP_CSP_MODE colorspace = p->output->colorspace;
+  const int is_rgb = WebPIsRGBMode(colorspace);
+  const int is_alpha = WebPIsAlphaMode(colorspace);

  p->memory = NULL;
  p->emit = NULL;
  p->emit_alpha = NULL;
-  if (!InitFromOptions(p->options, io)) {
+  p->emit_alpha_row = NULL;
+  if (!WebPIoInitFromOptions(p->options, io, is_alpha ? MODE_YUV : MODE_YUVA)) {
    return 0;
  }

@ -609,12 +574,14 @@ static int CustomSetup(VP8Io* io) {
    } else {
      p->emit = EmitYUV;
    }
-#ifdef WEBP_EXPERIMENTAL_FEATURES
-    if (IsAlphaMode(p->output->colorspace)) {
-      // We need transparency output
-      p->emit_alpha = is_rgb ? EmitAlphaRGB : EmitAlphaYUV;
+    if (is_alpha) {  // need transparency output
+      if (WebPIsPremultipliedMode(colorspace)) WebPInitPremultiply();
+      p->emit_alpha =
+          (colorspace == MODE_RGBA_4444 || colorspace == MODE_rgbA_4444) ?
+              EmitAlphaRGBA4444
+          : is_rgb ? EmitAlphaRGB
+          : EmitAlphaYUV;
    }
-#endif
  }

  if (is_rgb) {
@ -626,7 +593,7 @@ static int CustomSetup(VP8Io* io) {
 //------------------------------------------------------------------------------

 static int CustomPut(const VP8Io* io) {
-  WebPDecParams* p = (WebPDecParams*)io->opaque;
+  WebPDecParams* const p = (WebPDecParams*)io->opaque;
  const int mb_w = io->mb_w;
  const int mb_h = io->mb_h;
  int num_lines_out;
--- a/src/dec/layer.c
+++ b/src/dec/layer.c
@ -1,8 +1,10 @@
-// Copyright 2011 Google Inc.
+// Copyright 2011 Google Inc. All Rights Reserved.
 //
-// This code is licensed under the same terms as WebM:
-//  Software License Agreement:  http://www.webmproject.org/license/software/
-//  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
 // -----------------------------------------------------------------------------
 //
 // Enhancement layer (for YUV444/422)
@ -11,7 +13,8 @@

 #include <assert.h>
 #include <stdlib.h>
-#include "vp8i.h"
+
+#include "./vp8i.h"

 #if defined(__cplusplus) || defined(c_plusplus)
 extern "C" {
--- a/src/dec/quant.c
+++ b/src/dec/quant.c
@ -1,21 +1,23 @@
-// Copyright 2010 Google Inc.
+// Copyright 2010 Google Inc. All Rights Reserved.
 //
-// This code is licensed under the same terms as WebM:
-//  Software License Agreement:  http://www.webmproject.org/license/software/
-//  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
 // -----------------------------------------------------------------------------
 //
 // Quantizer initialization
 //
 // Author: Skal (pascal.massimino@gmail.com)

-#include "vp8i.h"
+#include "./vp8i.h"

 #if defined(__cplusplus) || defined(c_plusplus)
 extern "C" {
 #endif

-static inline int clip(int v, int M) {
+static WEBP_INLINE int clip(int v, int M) {
  return v < 0 ? 0 : v > M ? M : v;
 }

@ -94,8 +96,10 @@ void VP8ParseQuant(VP8Decoder* const dec) {
      m->y1_mat_[1] = kAcTable[clip(q + 0,       127)];

      m->y2_mat_[0] = kDcTable[clip(q + dqy2_dc, 127)] * 2;
-      // TODO(skal): make it another table?
-      m->y2_mat_[1] = kAcTable[clip(q + dqy2_ac, 127)] * 155 / 100;
+      // For all x in [0..284], x*155/100 is bitwise equal to (x*101581) >> 16.
+      // The smallest precision for that is '(x*6349) >> 12' but 16 is a good
+      // word size.
+      m->y2_mat_[1] = (kAcTable[clip(q + dqy2_ac, 127)] * 101581) >> 16;
      if (m->y2_mat_[1] < 8) m->y2_mat_[1] = 8;

      m->uv_mat_[0] = kDcTable[clip(q + dquv_dc, 117)];
--- a/src/dec/tree.c
+++ b/src/dec/tree.c
@ -1,8 +1,10 @@
-// Copyright 2010 Google Inc.
+// Copyright 2010 Google Inc. All Rights Reserved.
 //
-// This code is licensed under the same terms as WebM:
-//  Software License Agreement:  http://www.webmproject.org/license/software/
-//  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
 // -----------------------------------------------------------------------------
 //
 // Coding trees and probas
@ -59,8 +61,8 @@ static const int8_t kMVRef[8] = {
 };

 static const int8_t kMVRef4[6] = {
-  -LEFT4, 1
-    -ABOVE4, 2
+  -LEFT4, 1,
+    -ABOVE4, 2,
      -ZERO4, -NEW4
 };
 #endif
--- a/src/dec/vp8.c
+++ b/src/dec/vp8.c
@ -1,8 +1,10 @@
-// Copyright 2010 Google Inc.
+// Copyright 2010 Google Inc. All Rights Reserved.
 //
-// This code is licensed under the same terms as WebM:
-//  Software License Agreement:  http://www.webmproject.org/license/software/
-//  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
 // -----------------------------------------------------------------------------
 //
 // main entry for the decoder
@ -10,8 +12,11 @@
 // Author: Skal (pascal.massimino@gmail.com)

 #include <stdlib.h>
-#include "vp8i.h"
-#include "webpi.h"
+
+#include "./vp8i.h"
+#include "./vp8li.h"
+#include "./webpi.h"
+#include "../utils/bit_reader.h"

 #if defined(__cplusplus) || defined(c_plusplus)
 extern "C" {
@ -32,20 +37,22 @@ static void SetOk(VP8Decoder* const dec) {
 }

 int VP8InitIoInternal(VP8Io* const io, int version) {
-  if (version != WEBP_DECODER_ABI_VERSION)
+  if (WEBP_ABI_IS_INCOMPATIBLE(version, WEBP_DECODER_ABI_VERSION)) {
    return 0;  // mismatch error
-  if (io) {
+  }
+  if (io != NULL) {
    memset(io, 0, sizeof(*io));
  }
  return 1;
 }

 VP8Decoder* VP8New(void) {
-  VP8Decoder* dec = (VP8Decoder*)calloc(1, sizeof(VP8Decoder));
-  if (dec) {
+  VP8Decoder* const dec = (VP8Decoder*)calloc(1, sizeof(*dec));
+  if (dec != NULL) {
    SetOk(dec);
    WebPWorkerInit(&dec->worker_);
    dec->ready_ = 0;
+    dec->num_parts_ = 1;
  }
  return dec;
 }
@ -56,35 +63,46 @@ VP8StatusCode VP8Status(VP8Decoder* const dec) {
 }

 const char* VP8StatusMessage(VP8Decoder* const dec) {
-  if (!dec) return "no object";
+  if (dec == NULL) return "no object";
  if (!dec->error_msg_) return "OK";
  return dec->error_msg_;
 }

 void VP8Delete(VP8Decoder* const dec) {
-  if (dec) {
+  if (dec != NULL) {
    VP8Clear(dec);
    free(dec);
  }
 }

 int VP8SetError(VP8Decoder* const dec,
-                VP8StatusCode error, const char * const msg) {
-  dec->status_ = error;
-  dec->error_msg_ = msg;
-  dec->ready_ = 0;
+                VP8StatusCode error, const char* const msg) {
+  // TODO This check would be unnecessary if alpha decompression was separated
+  // from VP8ProcessRow/FinishRow. This avoids setting 'dec->status_' to
+  // something other than VP8_STATUS_BITSTREAM_ERROR on alpha decompression
+  // failure.
+  if (dec->status_ == VP8_STATUS_OK) {
+    dec->status_ = error;
+    dec->error_msg_ = msg;
+    dec->ready_ = 0;
+  }
  return 0;
 }

 //------------------------------------------------------------------------------

-int VP8GetInfo(const uint8_t* data, uint32_t data_size, uint32_t chunk_size,
-               int* width, int* height, int* has_alpha) {
-  if (data_size < 10) {
+int VP8CheckSignature(const uint8_t* const data, size_t data_size) {
+  return (data_size >= 3 &&
+          data[0] == 0x9d && data[1] == 0x01 && data[2] == 0x2a);
+}
+
+int VP8GetInfo(const uint8_t* data, size_t data_size, size_t chunk_size,
+               int* const width, int* const height) {
+  if (data == NULL || data_size < VP8_FRAME_HEADER_SIZE) {
    return 0;         // not enough data
  }
  // check signature
-  if (data[3] != 0x9d || data[4] != 0x01 || data[5] != 0x2a) {
+  if (!VP8CheckSignature(data + 3, data_size - 3)) {
    return 0;         // Wrong signature.
  } else {
    const uint32_t bits = data[0] | (data[1] << 8) | (data[2] << 16);
@ -92,14 +110,6 @@ int VP8GetInfo(const uint8_t* data, uint32_t data_size, uint32_t chunk_size,
    const int w = ((data[7] << 8) | data[6]) & 0x3fff;
    const int h = ((data[9] << 8) | data[8]) & 0x3fff;

-    if (has_alpha) {
-#ifdef WEBP_EXPERIMENTAL_FEATURES
-      if (data_size < 11) return 0;
-      *has_alpha = !!(data[10] & 0x80);    // the colorspace_ bit
-#else
-      *has_alpha = 0;
-#endif
-    }
    if (!key_frame) {   // Not a keyframe.
      return 0;
    }
@ -129,7 +139,7 @@ int VP8GetInfo(const uint8_t* data, uint32_t data_size, uint32_t chunk_size,
 // Header parsing

 static void ResetSegmentHeader(VP8SegmentHeader* const hdr) {
-  assert(hdr);
+  assert(hdr != NULL);
  hdr->use_segment_ = 0;
  hdr->update_map_ = 0;
  hdr->absolute_delta_ = 1;
@ -140,8 +150,8 @@ static void ResetSegmentHeader(VP8SegmentHeader* const hdr) {
 // Paragraph 9.3
 static int ParseSegmentHeader(VP8BitReader* br,
                              VP8SegmentHeader* hdr, VP8Proba* proba) {
-  assert(br);
-  assert(hdr);
+  assert(br != NULL);
+  assert(hdr != NULL);
  hdr->use_segment_ = VP8Get(br);
  if (hdr->use_segment_) {
    hdr->update_map_ = VP8Get(br);
@ -177,7 +187,7 @@ static int ParseSegmentHeader(VP8BitReader* br,
 // is returned, and this is an unrecoverable error.
 // If the partitions were positioned ok, VP8_STATUS_OK is returned.
 static VP8StatusCode ParsePartitions(VP8Decoder* const dec,
-                                     const uint8_t* buf, uint32_t size) {
+                                     const uint8_t* buf, size_t size) {
  VP8BitReader* const br = &dec->br_;
  const uint8_t* sz = buf;
  const uint8_t* buf_end = buf + size;
@ -228,33 +238,18 @@ static int ParseFilterHeader(VP8BitReader* br, VP8Decoder* const dec) {
    }
  }
  dec->filter_type_ = (hdr->level_ == 0) ? 0 : hdr->simple_ ? 1 : 2;
-  if (dec->filter_type_ > 0) {    // precompute filter levels per segment
-    if (dec->segment_hdr_.use_segment_) {
-      int s;
-      for (s = 0; s < NUM_MB_SEGMENTS; ++s) {
-        int strength = dec->segment_hdr_.filter_strength_[s];
-        if (!dec->segment_hdr_.absolute_delta_) {
-          strength += hdr->level_;
-        }
-        dec->filter_levels_[s] = strength;
-      }
-    } else {
-      dec->filter_levels_[0] = hdr->level_;
-    }
-  }
  return !br->eof_;
 }

 // Topmost call
 int VP8GetHeaders(VP8Decoder* const dec, VP8Io* const io) {
  const uint8_t* buf;
-  uint32_t buf_size;
-  uint32_t vp8_chunk_size;
-  uint32_t bytes_skipped;
+  size_t buf_size;
  VP8FrameHeader* frm_hdr;
  VP8PictureHeader* pic_hdr;
  VP8BitReader* br;
  VP8StatusCode status;
+  WebPHeaderStructure headers;

  if (dec == NULL) {
    return 0;
@ -265,16 +260,32 @@ int VP8GetHeaders(VP8Decoder* const dec, VP8Io* const io) {
                       "null VP8Io passed to VP8GetHeaders()");
  }

-  buf = io->data;
-  buf_size = io->data_size;
-
  // Process Pre-VP8 chunks.
-  status = WebPParseHeaders(&buf, &buf_size, &vp8_chunk_size, &bytes_skipped);
+  headers.data = io->data;
+  headers.data_size = io->data_size;
+  status = WebPParseHeaders(&headers);
  if (status != VP8_STATUS_OK) {
    return VP8SetError(dec, status, "Incorrect/incomplete header.");
  }
+  if (headers.is_lossless) {
+    return VP8SetError(dec, VP8_STATUS_BITSTREAM_ERROR,
+                       "Unexpected lossless format encountered.");
+  }
+
+  if (dec->alpha_data_ == NULL) {
+    assert(dec->alpha_data_size_ == 0);
+    // We have NOT set alpha data yet. Set it now.
+    // (This is to ensure that dec->alpha_data_ is NOT reset to NULL if
+    // WebPParseHeaders() is called more than once, as in incremental decoding
+    // case.)
+    dec->alpha_data_ = headers.alpha_data;
+    dec->alpha_data_size_ = headers.alpha_data_size;
+  }

  // Process the VP8 frame header.
+  buf = headers.data + headers.offset;
+  buf_size = headers.data_size - headers.offset;
+  assert(headers.data_size >= headers.offset);  // WebPParseHeaders' guarantee
  if (buf_size < 4) {
    return VP8SetError(dec, VP8_STATUS_NOT_ENOUGH_DATA,
                       "Truncated header.");
@ -305,7 +316,7 @@ int VP8GetHeaders(VP8Decoder* const dec, VP8Io* const io) {
      return VP8SetError(dec, VP8_STATUS_NOT_ENOUGH_DATA,
                         "cannot parse picture header");
    }
-    if (buf[0] != 0x9d || buf[1] != 0x01 || buf[2] != 0x2a) {
+    if (!VP8CheckSignature(buf, buf_size)) {
      return VP8SetError(dec, VP8_STATUS_BITSTREAM_ERROR,
                         "Bad code word");
    }
@ -342,9 +353,6 @@ int VP8GetHeaders(VP8Decoder* const dec, VP8Io* const io) {
                       "bad partition length");
  }

-  dec->alpha_data_ = NULL;
-  dec->alpha_data_size_ = 0;
-
  br = &dec->br_;
  VP8InitBitReader(br, buf, buf + frm_hdr->partition_length_);
  buf += frm_hdr->partition_length_;
@ -418,17 +426,9 @@ int VP8GetHeaders(VP8Decoder* const dec, VP8Io* const io) {

    if (frm_hdr->partition_length_ < kTrailerSize ||
        ext_buf[kTrailerSize - 1] != kTrailerMarker) {
- Error:
      return VP8SetError(dec, VP8_STATUS_BITSTREAM_ERROR,
                         "RIFF: Inconsistent extra information.");
    }
-    // Alpha
-    size = (ext_buf[4] << 0) | (ext_buf[5] << 8) | (ext_buf[6] << 16);
-    if (frm_hdr->partition_length_ < size + kTrailerSize) {
-      goto Error;
-    }
-    dec->alpha_data_ = (size > 0) ? ext_buf - size : NULL;
-    dec->alpha_data_size_ = size;

    // Layer
    size = (ext_buf[0] << 0) | (ext_buf[1] << 8) | (ext_buf[2] << 16);
@ -446,7 +446,7 @@ int VP8GetHeaders(VP8Decoder* const dec, VP8Io* const io) {
 //------------------------------------------------------------------------------
 // Residual decoding (Paragraph 13.2 / 13.3)

-static const uint8_t kBands[16 + 1] = {
+static const int kBands[16 + 1] = {
  0, 1, 2, 3, 6, 4, 5, 6, 6, 6, 6, 6, 6, 6, 6, 7,
  0  // extra entry as sentinel
 };
@ -462,63 +462,69 @@ static const uint8_t kZigzag[16] = {
 };

 typedef const uint8_t (*ProbaArray)[NUM_CTX][NUM_PROBAS];  // for const-casting
+typedef const uint8_t (*ProbaCtxArray)[NUM_PROBAS];
+
+// See section 13-2: http://tools.ietf.org/html/rfc6386#section-13.2
+static int GetLargeValue(VP8BitReader* const br, const uint8_t* const p) {
+  int v;
+  if (!VP8GetBit(br, p[3])) {
+    if (!VP8GetBit(br, p[4])) {
+      v = 2;
+    } else {
+      v = 3 + VP8GetBit(br, p[5]);
+    }
+  } else {
+    if (!VP8GetBit(br, p[6])) {
+      if (!VP8GetBit(br, p[7])) {
+        v = 5 + VP8GetBit(br, 159);
+      } else {
+        v = 7 + 2 * VP8GetBit(br, 165);
+        v += VP8GetBit(br, 145);
+      }
+    } else {
+      const uint8_t* tab;
+      const int bit1 = VP8GetBit(br, p[8]);
+      const int bit0 = VP8GetBit(br, p[9 + bit1]);
+      const int cat = 2 * bit1 + bit0;
+      v = 0;
+      for (tab = kCat3456[cat]; *tab; ++tab) {
+        v += v + VP8GetBit(br, *tab);
+      }
+      v += 3 + (8 << cat);
+    }
+  }
+  return v;
+}

 // Returns the position of the last non-zero coeff plus one
 // (and 0 if there's no coeff at all)
 static int GetCoeffs(VP8BitReader* const br, ProbaArray prob,
-                     int ctx, const uint16_t dq[2], int n, int16_t* out) {
-  const uint8_t* p = prob[kBands[n]][ctx];
+                     int ctx, const quant_t dq, int n, int16_t* out) {
+  // n is either 0 or 1 here. kBands[n] is not necessary for extracting '*p'.
+  const uint8_t* p = prob[n][ctx];
  if (!VP8GetBit(br, p[0])) {   // first EOB is more a 'CBP' bit.
    return 0;
  }
-  while (1) {
-    ++n;
+  for (; n < 16; ++n) {
+    const ProbaCtxArray p_ctx = prob[kBands[n + 1]];
    if (!VP8GetBit(br, p[1])) {
-      p = prob[kBands[n]][0];
+      p = p_ctx[0];
    } else {  // non zero coeff
-      int v, j;
+      int v;
      if (!VP8GetBit(br, p[2])) {
-        p = prob[kBands[n]][1];
        v = 1;
+        p = p_ctx[1];
      } else {
-        if (!VP8GetBit(br, p[3])) {
-          if (!VP8GetBit(br, p[4])) {
-            v = 2;
-          } else {
-            v = 3 + VP8GetBit(br, p[5]);
-          }
-        } else {
-          if (!VP8GetBit(br, p[6])) {
-            if (!VP8GetBit(br, p[7])) {
-              v = 5 + VP8GetBit(br, 159);
-            } else {
-              v = 7 + 2 * VP8GetBit(br, 165);
-              v += VP8GetBit(br, 145);
-            }
-          } else {
-            const uint8_t* tab;
-            const int bit1 = VP8GetBit(br, p[8]);
-            const int bit0 = VP8GetBit(br, p[9 + bit1]);
-            const int cat = 2 * bit1 + bit0;
-            v = 0;
-            for (tab = kCat3456[cat]; *tab; ++tab) {
-              v += v + VP8GetBit(br, *tab);
-            }
-            v += 3 + (8 << cat);
-          }
-        }
-        p = prob[kBands[n]][2];
+        v = GetLargeValue(br, p);
+        p = p_ctx[2];
      }
-      j = kZigzag[n - 1];
-      out[j] = VP8GetSigned(br, v) * dq[j > 0];
-      if (n == 16 || !VP8GetBit(br, p[0])) {   // EOB
-        return n;
+      out[kZigzag[n]] = VP8GetSigned(br, v) * dq[n > 0];
+      if (n < 15 && !VP8GetBit(br, p[0])) {   // EOB
+        return n + 1;
      }
    }
-    if (n == 16) {
-      return 16;
-    }
  }
+  return 16;
 }

 // Alias-safe way of converting 4bytes to 32bits.
@ -556,6 +562,7 @@ static void ParseResiduals(VP8Decoder* const dec,
  uint32_t non_zero_dc = 0;
  int x, y, ch;

+  nz_dc.i32 = nz_ac.i32 = 0;
  memset(dst, 0, 384 * sizeof(*dst));
  if (!dec->is_i4x4_) {    // parse DC
    int16_t dc[16] = { 0 };
@ -656,6 +663,12 @@ int VP8DecodeMB(VP8Decoder* const dec, VP8BitReader* const token_br) {
    dec->non_zero_ac_ = 0;
  }

+  if (dec->filter_type_ > 0) {  // store filter info
+    VP8FInfo* const finfo = dec->f_info_ + dec->mb_x_;
+    *finfo = dec->fstrengths_[dec->segment_][dec->is_i4x4_];
+    finfo->f_inner_ = (!info->skip_ || dec->is_i4x4_);
+  }
+
  return (!token_br->eof_);
 }

@ -679,10 +692,8 @@ static int ParseFrame(VP8Decoder* const dec, VP8Io* io) {
        return VP8SetError(dec, VP8_STATUS_NOT_ENOUGH_DATA,
                           "Premature end-of-file encountered.");
      }
+      // Reconstruct and emit samples.
      VP8ReconstructBlock(dec);
-
-      // Store data and save block's filtering params
-      VP8StoreBlock(dec);
    }
    if (!VP8ProcessRow(dec, io)) {
      return VP8SetError(dec, VP8_STATUS_USER_ABORT, "Output aborted.");
@ -747,7 +758,7 @@ int VP8Decode(VP8Decoder* const dec, VP8Io* const io) {
  }

  dec->ready_ = 0;
-  return 1;
+  return ok;
 }

 void VP8Clear(VP8Decoder* const dec) {
--- a/src/dec/vp8i.h
+++ b/src/dec/vp8i.h
@ -1,8 +1,10 @@
-// Copyright 2010 Google Inc.
+// Copyright 2010 Google Inc. All Rights Reserved.
 //
-// This code is licensed under the same terms as WebM:
-//  Software License Agreement:  http://www.webmproject.org/license/software/
-//  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
 // -----------------------------------------------------------------------------
 //
 // VP8 decoder: internal header.
@ -13,6 +15,7 @@
 #define WEBP_DEC_VP8I_H_

 #include <string.h>     // for memcpy()
+#include "./vp8li.h"
 #include "../utils/bit_reader.h"
 #include "../utils/thread.h"
 #include "../dsp/dsp.h"
@ -26,8 +29,8 @@ extern "C" {

 // version numbers
 #define DEC_MAJ_VERSION 0
-#define DEC_MIN_VERSION 1
-#define DEC_REV_VERSION 3
+#define DEC_MIN_VERSION 3
+#define DEC_REV_VERSION 1

 #define ONLY_KEYFRAME_CODE      // to remove any code related to P-Frames

@ -156,14 +159,15 @@ typedef struct {  // filter specs
 } VP8FInfo;

 typedef struct {  // used for syntax-parsing
-  unsigned int nz_;          // non-zero AC/DC coeffs
+  unsigned int nz_:24;       // non-zero AC/DC coeffs (24bit)
  unsigned int dc_nz_:1;     // non-zero DC coeffs
  unsigned int skip_:1;      // block type
 } VP8MB;

 // Dequantization matrices
+typedef int quant_t[2];      // [DC / AC].  Can be 'uint16_t[2]' too (~slower).
 typedef struct {
-  uint16_t y1_mat_[2], y2_mat_[2], uv_mat_[2];    // [DC / AC]
+  quant_t y1_mat_, y2_mat_, uv_mat_;
 } VP8QuantMatrix;

 // Persistent information needed by the parallel processing
@ -250,7 +254,7 @@ struct VP8Decoder {

  // main memory chunk for the above data. Persistent.
  void* mem_;
-  int mem_size_;
+  size_t mem_size_;

  // Per macroblock non-persistent infos.
  int mb_x_, mb_y_;       // current position, in macroblock units
@ -267,14 +271,15 @@ struct VP8Decoder {
  uint32_t non_zero_ac_;

  // Filtering side-info
-  int filter_type_;                         // 0=off, 1=simple, 2=complex
-  int filter_row_;                          // per-row flag
-  uint8_t filter_levels_[NUM_MB_SEGMENTS];  // precalculated per-segment
+  int filter_type_;                          // 0=off, 1=simple, 2=complex
+  int filter_row_;                           // per-row flag
+  VP8FInfo fstrengths_[NUM_MB_SEGMENTS][2];  // precalculated per-segment/type

  // extensions
  const uint8_t* alpha_data_;   // compressed alpha data (if present)
  size_t alpha_data_size_;
-  uint8_t* alpha_plane_;        // output
+  int is_alpha_decoded_;  // true if alpha_data_ is decoded in alpha_plane_
+  uint8_t* alpha_plane_;        // output. Persistent, contains the whole data.

  int layer_colorspace_;
  const uint8_t* layer_data_;   // compressed layer data (if present)
@ -286,15 +291,7 @@ struct VP8Decoder {

 // in vp8.c
 int VP8SetError(VP8Decoder* const dec,
-                VP8StatusCode error, const char * const msg);
-
-// Validates the VP8 data-header and retrieve basic header information viz width
-// and height. Returns 0 in case of formatting error. *width/*height/*has_alpha
-// can be passed NULL.
-int VP8GetInfo(const uint8_t* data,
-               uint32_t data_size,    // data available so far
-               uint32_t chunk_size,   // total data size expect in the chunk
-               int *width, int *height, int *has_alpha);
+                VP8StatusCode error, const char* const msg);

 // in tree.c
 void VP8ResetProba(VP8Proba* const proba);
@ -316,14 +313,8 @@ VP8StatusCode VP8EnterCritical(VP8Decoder* const dec, VP8Io* const io);
 // Must always be called in pair with VP8EnterCritical().
 // Returns false in case of error.
 int VP8ExitCritical(VP8Decoder* const dec, VP8Io* const io);
-// Filter the decoded macroblock row (if needed)
-int VP8FinishRow(VP8Decoder* const dec, VP8Io* io);   // multi threaded call
 // Process the last decoded row (filtering + output)
 int VP8ProcessRow(VP8Decoder* const dec, VP8Io* const io);
-// Store a block, along with filtering params
-void VP8StoreBlock(VP8Decoder* const dec);
-// Finalize and transmit a complete row. Return false in case of user-abort.
-int VP8FinishRow(VP8Decoder* const dec, VP8Io* const io);
 // To be called at the start of a new scanline, to initialize predictors.
 void VP8InitScanline(VP8Decoder* const dec);
 // Decode one macroblock. Returns false if there is not enough data.
--- a/src/dec/vp8l.c
+++ b/src/dec/vp8l.c
--- a/src/dec/vp8li.h
+++ b/src/dec/vp8li.h
@ -0,0 +1,124 @@
+// Copyright 2012 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// Lossless decoder: internal header.
+//
+// Author: Skal (pascal.massimino@gmail.com)
+//         Vikas Arora(vikaas.arora@gmail.com)
+
+#ifndef WEBP_DEC_VP8LI_H_
+#define WEBP_DEC_VP8LI_H_
+
+#include <string.h>     // for memcpy()
+#include "./webpi.h"
+#include "../utils/bit_reader.h"
+#include "../utils/color_cache.h"
+#include "../utils/huffman.h"
+#include "../webp/format_constants.h"
+
+#if defined(__cplusplus) || defined(c_plusplus)
+extern "C" {
+#endif
+
+typedef enum {
+  READ_DATA = 0,
+  READ_HDR = 1,
+  READ_DIM = 2
+} VP8LDecodeState;
+
+typedef struct VP8LTransform VP8LTransform;
+struct VP8LTransform {
+  VP8LImageTransformType type_;   // transform type.
+  int                    bits_;   // subsampling bits defining transform window.
+  int                    xsize_;  // transform window X index.
+  int                    ysize_;  // transform window Y index.
+  uint32_t              *data_;   // transform data.
+};
+
+typedef struct {
+  HuffmanTree htrees_[HUFFMAN_CODES_PER_META_CODE];
+} HTreeGroup;
+
+typedef struct {
+  int             color_cache_size_;
+  VP8LColorCache  color_cache_;
+
+  int             huffman_mask_;
+  int             huffman_subsample_bits_;
+  int             huffman_xsize_;
+  uint32_t       *huffman_image_;
+  int             num_htree_groups_;
+  HTreeGroup     *htree_groups_;
+} VP8LMetadata;
+
+typedef struct {
+  VP8StatusCode    status_;
+  VP8LDecodeState  action_;
+  VP8LDecodeState  state_;
+  VP8Io           *io_;
+
+  const WebPDecBuffer *output_;    // shortcut to io->opaque->output
+
+  uint32_t        *pixels_;        // Internal data: either uint8_t* for alpha
+                                   // or uint32_t* for BGRA.
+  uint32_t        *argb_cache_;    // Scratch buffer for temporary BGRA storage.
+
+  VP8LBitReader    br_;
+
+  int              width_;
+  int              height_;
+  int              last_row_;      // last input row decoded so far.
+  int              last_out_row_;  // last row output so far.
+
+  VP8LMetadata     hdr_;
+
+  int              next_transform_;
+  VP8LTransform    transforms_[NUM_TRANSFORMS];
+  // or'd bitset storing the transforms types.
+  uint32_t         transforms_seen_;
+
+  uint8_t         *rescaler_memory;  // Working memory for rescaling work.
+  WebPRescaler    *rescaler;         // Common rescaler for all channels.
+} VP8LDecoder;
+
+//------------------------------------------------------------------------------
+// internal functions. Not public.
+
+// in vp8l.c
+
+// Decodes a raw image stream (without header) and store the alpha data
+// into *output, which must be of size width x height. Returns false in case
+// of error.
+int VP8LDecodeAlphaImageStream(int width, int height, const uint8_t* const data,
+                               size_t data_size, uint8_t* const output);
+
+// Allocates and initialize a new lossless decoder instance.
+VP8LDecoder* VP8LNew(void);
+
+// Decodes the image header. Returns false in case of error.
+int VP8LDecodeHeader(VP8LDecoder* const dec, VP8Io* const io);
+
+// Decodes an image. It's required to decode the lossless header before calling
+// this function. Returns false in case of error, with updated dec->status_.
+int VP8LDecodeImage(VP8LDecoder* const dec);
+
+// Resets the decoder in its initial state, reclaiming memory.
+// Preserves the dec->status_ value.
+void VP8LClear(VP8LDecoder* const dec);
+
+// Clears and deallocate a lossless decoder instance.
+void VP8LDelete(VP8LDecoder* const dec);
+
+//------------------------------------------------------------------------------
+
+#if defined(__cplusplus) || defined(c_plusplus)
+}    // extern "C"
+#endif
+
+#endif  /* WEBP_DEC_VP8LI_H_ */
--- a/src/dec/webp.c
+++ b/src/dec/webp.c
@ -1,8 +1,10 @@
-// Copyright 2010 Google Inc.
+// Copyright 2010 Google Inc. All Rights Reserved.
 //
-// This code is licensed under the same terms as WebM:
-//  Software License Agreement:  http://www.webmproject.org/license/software/
-//  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
 // -----------------------------------------------------------------------------
 //
 // Main decoding functions for WEBP images.
@ -10,8 +12,11 @@
 // Author: Skal (pascal.massimino@gmail.com)

 #include <stdlib.h>
-#include "vp8i.h"
-#include "webpi.h"
+
+#include "./vp8i.h"
+#include "./vp8li.h"
+#include "./webpi.h"
+#include "../webp/mux_types.h"  // ALPHA_FLAG

 #if defined(__cplusplus) || defined(c_plusplus)
 extern "C" {
@ -24,105 +29,151 @@ extern "C" {
 //   4...7   size of image data (including metadata) starting at offset 8
 //   8...11  "WEBP"   our form-type signature
 // The RIFF container (12 bytes) is followed by appropriate chunks:
-//   12..15  "VP8 ": 4-bytes tags, describing the raw video format used
+//   12..15  "VP8 ": 4-bytes tags, signaling the use of VP8 video format
 //   16..19  size of the raw VP8 image data, starting at offset 20
 //   20....  the VP8 bytes
 // Or,
+//   12..15  "VP8L": 4-bytes tags, signaling the use of VP8L lossless format
+//   16..19  size of the raw VP8L image data, starting at offset 20
+//   20....  the VP8L bytes
+// Or,
 //   12..15  "VP8X": 4-bytes tags, describing the extended-VP8 chunk.
 //   16..19  size of the VP8X chunk starting at offset 20.
 //   20..23  VP8X flags bit-map corresponding to the chunk-types present.
-//   24..27  Width of the Canvas Image.
-//   28..31  Height of the Canvas Image.
-// There can be extra chunks after the "VP8X" chunk (ICCP, TILE, FRM, VP8,
-// META  ...)
-// All 32-bits sizes are in little-endian order.
-// Note: chunk data must be padded to multiple of 2 in size
+//   24..26  Width of the Canvas Image.
+//   27..29  Height of the Canvas Image.
+// There can be extra chunks after the "VP8X" chunk (ICCP, FRGM, ANMF, VP8,
+// VP8L, XMP, EXIF  ...)
+// All sizes are in little-endian order.
+// Note: chunk data size must be padded to multiple of 2 when written.

-static inline uint32_t get_le32(const uint8_t* const data) {
-  return data[0] | (data[1] << 8) | (data[2] << 16) | (data[3] << 24);
+static WEBP_INLINE uint32_t get_le24(const uint8_t* const data) {
+  return data[0] | (data[1] << 8) | (data[2] << 16);
 }

-VP8StatusCode WebPParseRIFF(const uint8_t** data, uint32_t* data_size,
-                            uint32_t* riff_size) {
-  assert(data);
-  assert(data_size);
-  assert(riff_size);
+static WEBP_INLINE uint32_t get_le32(const uint8_t* const data) {
+  return (uint32_t)get_le24(data) | (data[3] << 24);
+}

-  if (*data_size >= RIFF_HEADER_SIZE &&
-      !memcmp(*data, "RIFF", TAG_SIZE)) {
+// Validates the RIFF container (if detected) and skips over it.
+// If a RIFF container is detected,
+// Returns VP8_STATUS_BITSTREAM_ERROR for invalid header, and
+//         VP8_STATUS_OK otherwise.
+// In case there are not enough bytes (partial RIFF container), return 0 for
+// *riff_size. Else return the RIFF size extracted from the header.
+static VP8StatusCode ParseRIFF(const uint8_t** const data,
+                               size_t* const data_size,
+                               size_t* const riff_size) {
+  assert(data != NULL);
+  assert(data_size != NULL);
+  assert(riff_size != NULL);
+
+  *riff_size = 0;  // Default: no RIFF present.
+  if (*data_size >= RIFF_HEADER_SIZE && !memcmp(*data, "RIFF", TAG_SIZE)) {
    if (memcmp(*data + 8, "WEBP", TAG_SIZE)) {
      return VP8_STATUS_BITSTREAM_ERROR;  // Wrong image file signature.
    } else {
-      *riff_size = get_le32(*data + TAG_SIZE);
+      const uint32_t size = get_le32(*data + TAG_SIZE);
      // Check that we have at least one chunk (i.e "WEBP" + "VP8?nnnn").
-      if (*riff_size < TAG_SIZE + CHUNK_HEADER_SIZE) {
+      if (size < TAG_SIZE + CHUNK_HEADER_SIZE) {
+        return VP8_STATUS_BITSTREAM_ERROR;
+      }
+      if (size > MAX_CHUNK_PAYLOAD) {
        return VP8_STATUS_BITSTREAM_ERROR;
      }
      // We have a RIFF container. Skip it.
+      *riff_size = size;
      *data += RIFF_HEADER_SIZE;
      *data_size -= RIFF_HEADER_SIZE;
    }
-  } else {
-    *riff_size = 0;  // Did not get full RIFF Header.
  }
  return VP8_STATUS_OK;
 }

-VP8StatusCode WebPParseVP8X(const uint8_t** data, uint32_t* data_size,
-                            uint32_t* bytes_skipped,
-                            int* width, int* height, uint32_t* flags) {
-  assert(data);
-  assert(data_size);
-  assert(bytes_skipped);
+// Validates the VP8X header and skips over it.
+// Returns VP8_STATUS_BITSTREAM_ERROR for invalid VP8X header,
+//         VP8_STATUS_NOT_ENOUGH_DATA in case of insufficient data, and
+//         VP8_STATUS_OK otherwise.
+// If a VP8X chunk is found, found_vp8x is set to true and *width_ptr,
+// *height_ptr and *flags_ptr are set to the corresponding values extracted
+// from the VP8X chunk.
+static VP8StatusCode ParseVP8X(const uint8_t** const data,
+                               size_t* const data_size,
+                               int* const found_vp8x,
+                               int* const width_ptr, int* const height_ptr,
+                               uint32_t* const flags_ptr) {
+  const uint32_t vp8x_size = CHUNK_HEADER_SIZE + VP8X_CHUNK_SIZE;
+  assert(data != NULL);
+  assert(data_size != NULL);
+  assert(found_vp8x != NULL);

-  *bytes_skipped = 0;
+  *found_vp8x = 0;

-  if (*data_size < CHUNK_HEADER_SIZE + VP8X_CHUNK_SIZE) {
+  if (*data_size < CHUNK_HEADER_SIZE) {
    return VP8_STATUS_NOT_ENOUGH_DATA;  // Insufficient data.
  }

  if (!memcmp(*data, "VP8X", TAG_SIZE)) {
+    int width, height;
+    uint32_t flags;
    const uint32_t chunk_size = get_le32(*data + TAG_SIZE);
    if (chunk_size != VP8X_CHUNK_SIZE) {
      return VP8_STATUS_BITSTREAM_ERROR;  // Wrong chunk size.
    }
-    if (flags) {
-      *flags = get_le32(*data + 8);
+
+    // Verify if enough data is available to validate the VP8X chunk.
+    if (*data_size < vp8x_size) {
+      return VP8_STATUS_NOT_ENOUGH_DATA;  // Insufficient data.
    }
-    if (width) {
-      *width = get_le32(*data + 12);
+    flags = get_le32(*data + 8);
+    width = 1 + get_le24(*data + 12);
+    height = 1 + get_le24(*data + 15);
+    if (width * (uint64_t)height >= MAX_IMAGE_AREA) {
+      return VP8_STATUS_BITSTREAM_ERROR;  // image is too large
    }
-    if (height) {
-      *height = get_le32(*data + 16);
-    }
-    // We have consumed 20 bytes from VP8X. Skip them.
-    *bytes_skipped = CHUNK_HEADER_SIZE + VP8X_CHUNK_SIZE;
-    *data += *bytes_skipped;
-    *data_size -= *bytes_skipped;
+
+    if (flags_ptr != NULL) *flags_ptr = flags;
+    if (width_ptr != NULL) *width_ptr = width;
+    if (height_ptr != NULL) *height_ptr = height;
+    // Skip over VP8X header bytes.
+    *data += vp8x_size;
+    *data_size -= vp8x_size;
+    *found_vp8x = 1;
  }
  return VP8_STATUS_OK;
 }

-VP8StatusCode WebPParseOptionalChunks(const uint8_t** data, uint32_t* data_size,
-                                      uint32_t riff_size,
-                                      uint32_t* bytes_skipped) {
+// Skips to the next VP8/VP8L chunk header in the data given the size of the
+// RIFF chunk 'riff_size'.
+// Returns VP8_STATUS_BITSTREAM_ERROR if any invalid chunk size is encountered,
+//         VP8_STATUS_NOT_ENOUGH_DATA in case of insufficient data, and
+//         VP8_STATUS_OK otherwise.
+// If an alpha chunk is found, *alpha_data and *alpha_size are set
+// appropriately.
+static VP8StatusCode ParseOptionalChunks(const uint8_t** const data,
+                                         size_t* const data_size,
+                                         size_t const riff_size,
+                                         const uint8_t** const alpha_data,
+                                         size_t* const alpha_size) {
  const uint8_t* buf;
-  uint32_t buf_size;
-
-  assert(data);
-  assert(data_size);
-  assert(bytes_skipped);
-
+  size_t buf_size;
+  uint32_t total_size = TAG_SIZE +           // "WEBP".
+                        CHUNK_HEADER_SIZE +  // "VP8Xnnnn".
+                        VP8X_CHUNK_SIZE;     // data.
+  assert(data != NULL);
+  assert(data_size != NULL);
  buf = *data;
  buf_size = *data_size;
-  *bytes_skipped = 0;
+
+  assert(alpha_data != NULL);
+  assert(alpha_size != NULL);
+  *alpha_data = NULL;
+  *alpha_size = 0;

  while (1) {
    uint32_t chunk_size;
-    uint32_t cur_skip_size;
-    const uint32_t bytes_skipped_header = TAG_SIZE +           // "WEBP".
-                                          CHUNK_HEADER_SIZE +  // "VP8Xnnnn".
-                                          VP8X_CHUNK_SIZE;     // Data.
+    uint32_t disk_chunk_size;   // chunk_size with padding
+
    *data = buf;
    *data_size = buf_size;

@ -131,119 +182,217 @@ VP8StatusCode WebPParseOptionalChunks(const uint8_t** data, uint32_t* data_size,
    }

    chunk_size = get_le32(buf + TAG_SIZE);
-    cur_skip_size = CHUNK_HEADER_SIZE + chunk_size;
+    if (chunk_size > MAX_CHUNK_PAYLOAD) {
+      return VP8_STATUS_BITSTREAM_ERROR;          // Not a valid chunk size.
+    }
+    // For odd-sized chunk-payload, there's one byte padding at the end.
+    disk_chunk_size = (CHUNK_HEADER_SIZE + chunk_size + 1) & ~1;
+    total_size += disk_chunk_size;

-    // Check that total bytes skipped along with current chunk size
-    // does not exceed riff_size.
-    if (riff_size > 0 &&
-        (bytes_skipped_header + *bytes_skipped + cur_skip_size > riff_size)) {
-      return VP8_STATUS_BITSTREAM_ERROR;  // Not a valid chunk size.
+    // Check that total bytes skipped so far does not exceed riff_size.
+    if (riff_size > 0 && (total_size > riff_size)) {
+      return VP8_STATUS_BITSTREAM_ERROR;          // Not a valid chunk size.
    }

-    if (buf_size < cur_skip_size) {  // Insufficient data.
+    // Start of a (possibly incomplete) VP8/VP8L chunk implies that we have
+    // parsed all the optional chunks.
+    // Note: This check must occur before the check 'buf_size < disk_chunk_size'
+    // below to allow incomplete VP8/VP8L chunks.
+    if (!memcmp(buf, "VP8 ", TAG_SIZE) ||
+        !memcmp(buf, "VP8L", TAG_SIZE)) {
+      return VP8_STATUS_OK;
+    }
+
+    if (buf_size < disk_chunk_size) {             // Insufficient data.
      return VP8_STATUS_NOT_ENOUGH_DATA;
    }

-    if (!memcmp(buf, "VP8 ", TAG_SIZE)) {  // A valid VP8 header.
-      return VP8_STATUS_OK;  // Found.
+    if (!memcmp(buf, "ALPH", TAG_SIZE)) {         // A valid ALPH header.
+      *alpha_data = buf + CHUNK_HEADER_SIZE;
+      *alpha_size = chunk_size;
    }

-    // We have a full & valid chunk; skip it.
-    buf += cur_skip_size;
-    buf_size -= cur_skip_size;
-    *bytes_skipped += cur_skip_size;
+    // We have a full and valid chunk; skip it.
+    buf += disk_chunk_size;
+    buf_size -= disk_chunk_size;
  }
 }

-VP8StatusCode WebPParseVP8Header(const uint8_t** data, uint32_t* data_size,
-                                 uint32_t riff_size, uint32_t* bytes_skipped,
-                                 uint32_t* vp8_chunk_size) {
-  assert(data);
-  assert(data_size);
-  assert(bytes_skipped);
-  assert(vp8_chunk_size);
-
-  *bytes_skipped = 0;
-  *vp8_chunk_size = 0;
+// Validates the VP8/VP8L Header ("VP8 nnnn" or "VP8L nnnn") and skips over it.
+// Returns VP8_STATUS_BITSTREAM_ERROR for invalid (chunk larger than
+//         riff_size) VP8/VP8L header,
+//         VP8_STATUS_NOT_ENOUGH_DATA in case of insufficient data, and
+//         VP8_STATUS_OK otherwise.
+// If a VP8/VP8L chunk is found, *chunk_size is set to the total number of bytes
+// extracted from the VP8/VP8L chunk header.
+// The flag '*is_lossless' is set to 1 in case of VP8L chunk / raw VP8L data.
+static VP8StatusCode ParseVP8Header(const uint8_t** const data_ptr,
+                                    size_t* const data_size,
+                                    size_t riff_size,
+                                    size_t* const chunk_size,
+                                    int* const is_lossless) {
+  const uint8_t* const data = *data_ptr;
+  const int is_vp8 = !memcmp(data, "VP8 ", TAG_SIZE);
+  const int is_vp8l = !memcmp(data, "VP8L", TAG_SIZE);
+  const uint32_t minimal_size =
+      TAG_SIZE + CHUNK_HEADER_SIZE;  // "WEBP" + "VP8 nnnn" OR
+                                     // "WEBP" + "VP8Lnnnn"
+  assert(data != NULL);
+  assert(data_size != NULL);
+  assert(chunk_size != NULL);
+  assert(is_lossless != NULL);

  if (*data_size < CHUNK_HEADER_SIZE) {
    return VP8_STATUS_NOT_ENOUGH_DATA;  // Insufficient data.
  }

-  if (!memcmp(*data, "VP8 ", TAG_SIZE)) {
-    *vp8_chunk_size = get_le32(*data + TAG_SIZE);
-    if (riff_size >= TAG_SIZE + CHUNK_HEADER_SIZE &&  // "WEBP" + "VP8 nnnn".
-        (*vp8_chunk_size > riff_size - (TAG_SIZE + CHUNK_HEADER_SIZE))) {
+  if (is_vp8 || is_vp8l) {
+    // Bitstream contains VP8/VP8L header.
+    const uint32_t size = get_le32(data + TAG_SIZE);
+    if ((riff_size >= minimal_size) && (size > riff_size - minimal_size)) {
      return VP8_STATUS_BITSTREAM_ERROR;  // Inconsistent size information.
    }
-    // We have consumed CHUNK_HEADER_SIZE bytes from VP8 Header. Skip them.
-    *bytes_skipped = CHUNK_HEADER_SIZE;
-    *data += *bytes_skipped;
-    *data_size -= *bytes_skipped;
+    // Skip over CHUNK_HEADER_SIZE bytes from VP8/VP8L Header.
+    *chunk_size = size;
+    *data_ptr += CHUNK_HEADER_SIZE;
+    *data_size -= CHUNK_HEADER_SIZE;
+    *is_lossless = is_vp8l;
+  } else {
+    // Raw VP8/VP8L bitstream (no header).
+    *is_lossless = VP8LCheckSignature(data, *data_size);
+    *chunk_size = *data_size;
  }
+
  return VP8_STATUS_OK;
 }

-VP8StatusCode WebPParseHeaders(const uint8_t** data, uint32_t* data_size,
-                               uint32_t* vp8_size, uint32_t* bytes_skipped) {
-  const uint8_t* buf;
-  uint32_t buf_size;
-  uint32_t riff_size;
-  uint32_t vp8_size_tmp;
-  uint32_t optional_data_size;
-  uint32_t vp8x_skip_size;
-  uint32_t vp8_skip_size;
+//------------------------------------------------------------------------------
+
+// Fetch '*width', '*height', '*has_alpha' and fill out 'headers' based on
+// 'data'. All the output parameters may be NULL. If 'headers' is NULL only the
+// minimal amount will be read to fetch the remaining parameters.
+// If 'headers' is non-NULL this function will attempt to locate both alpha
+// data (with or without a VP8X chunk) and the bitstream chunk (VP8/VP8L).
+// Note: The following chunk sequences (before the raw VP8/VP8L data) are
+// considered valid by this function:
+// RIFF + VP8(L)
+// RIFF + VP8X + (optional chunks) + VP8(L)
+// ALPH + VP8 <-- Not a valid WebP format: only allowed for internal purpose.
+// VP8(L)     <-- Not a valid WebP format: only allowed for internal purpose.
+static VP8StatusCode ParseHeadersInternal(const uint8_t* data,
+                                          size_t data_size,
+                                          int* const width,
+                                          int* const height,
+                                          int* const has_alpha,
+                                          int* const has_animation,
+                                          WebPHeaderStructure* const headers) {
+  int found_riff = 0;
+  int found_vp8x = 0;
  VP8StatusCode status;
+  WebPHeaderStructure hdrs;

-  assert(data);
-  assert(data_size);
-  assert(vp8_size);
-  assert(bytes_skipped);
-
-  buf = *data;
-  buf_size = *data_size;
-
-  *vp8_size = 0;
-  *bytes_skipped = 0;
-
-  if (buf == NULL || buf_size < RIFF_HEADER_SIZE) {
+  if (data == NULL || data_size < RIFF_HEADER_SIZE) {
    return VP8_STATUS_NOT_ENOUGH_DATA;
  }
+  memset(&hdrs, 0, sizeof(hdrs));
+  hdrs.data = data;
+  hdrs.data_size = data_size;

  // Skip over RIFF header.
-  if (WebPParseRIFF(&buf, &buf_size, &riff_size) != VP8_STATUS_OK) {
-    return VP8_STATUS_BITSTREAM_ERROR;  // Wrong RIFF Header.
-  }
-
-  // Skip over VP8X header.
-  status = WebPParseVP8X(&buf, &buf_size, &vp8x_skip_size, NULL, NULL, NULL);
+  status = ParseRIFF(&data, &data_size, &hdrs.riff_size);
  if (status != VP8_STATUS_OK) {
-    return status;  // Wrong VP8X Chunk / Insufficient data.
+    return status;   // Wrong RIFF header / insufficient data.
  }
-  if (vp8x_skip_size > 0) {
-    // Skip over optional chunks.
-    status = WebPParseOptionalChunks(&buf, &buf_size, riff_size,
-                                     &optional_data_size);
+  found_riff = (hdrs.riff_size > 0);
+
+  // Skip over VP8X.
+  {
+    uint32_t flags = 0;
+    status = ParseVP8X(&data, &data_size, &found_vp8x, width, height, &flags);
    if (status != VP8_STATUS_OK) {
-      return status;  // Found an invalid chunk size / Insufficient data.
+      return status;  // Wrong VP8X / insufficient data.
+    }
+    if (!found_riff && found_vp8x) {
+      // Note: This restriction may be removed in the future, if it becomes
+      // necessary to send VP8X chunk to the decoder.
+      return VP8_STATUS_BITSTREAM_ERROR;
+    }
+    if (has_alpha != NULL) *has_alpha = !!(flags & ALPHA_FLAG);
+    if (has_animation != NULL) *has_animation = !!(flags & ANIMATION_FLAG);
+    if (found_vp8x && headers == NULL) {
+      return VP8_STATUS_OK;  // Return features from VP8X header.
    }
  }

-  // Skip over VP8 chunk header.
-  status = WebPParseVP8Header(&buf, &buf_size, riff_size, &vp8_skip_size,
-                              &vp8_size_tmp);
-  if (status != VP8_STATUS_OK) {
-    return status;  // Invalid VP8 header / Insufficient data.
-  }
-  if (vp8_skip_size > 0) {
-    *vp8_size = vp8_size_tmp;
+  if (data_size < TAG_SIZE) return VP8_STATUS_NOT_ENOUGH_DATA;
+
+  // Skip over optional chunks if data started with "RIFF + VP8X" or "ALPH".
+  if ((found_riff && found_vp8x) ||
+      (!found_riff && !found_vp8x && !memcmp(data, "ALPH", TAG_SIZE))) {
+    status = ParseOptionalChunks(&data, &data_size, hdrs.riff_size,
+                                 &hdrs.alpha_data, &hdrs.alpha_data_size);
+    if (status != VP8_STATUS_OK) {
+      return status;  // Found an invalid chunk size / insufficient data.
+    }
  }

-  *bytes_skipped = buf - *data;
-  assert(*bytes_skipped == *data_size - buf_size);
-  *data = buf;
-  *data_size = buf_size;
-  return VP8_STATUS_OK;
+  // Skip over VP8/VP8L header.
+  status = ParseVP8Header(&data, &data_size, hdrs.riff_size,
+                          &hdrs.compressed_size, &hdrs.is_lossless);
+  if (status != VP8_STATUS_OK) {
+    return status;  // Wrong VP8/VP8L chunk-header / insufficient data.
+  }
+  if (hdrs.compressed_size > MAX_CHUNK_PAYLOAD) {
+    return VP8_STATUS_BITSTREAM_ERROR;
+  }
+
+  if (!hdrs.is_lossless) {
+    if (data_size < VP8_FRAME_HEADER_SIZE) {
+      return VP8_STATUS_NOT_ENOUGH_DATA;
+    }
+    // Validates raw VP8 data.
+    if (!VP8GetInfo(data, data_size,
+                    (uint32_t)hdrs.compressed_size, width, height)) {
+      return VP8_STATUS_BITSTREAM_ERROR;
+    }
+  } else {
+    if (data_size < VP8L_FRAME_HEADER_SIZE) {
+      return VP8_STATUS_NOT_ENOUGH_DATA;
+    }
+    // Validates raw VP8L data.
+    if (!VP8LGetInfo(data, data_size, width, height, has_alpha)) {
+      return VP8_STATUS_BITSTREAM_ERROR;
+    }
+  }
+
+  if (has_alpha != NULL) {
+    // If the data did not contain a VP8X/VP8L chunk the only definitive way
+    // to set this is by looking for alpha data (from an ALPH chunk).
+    *has_alpha |= (hdrs.alpha_data != NULL);
+  }
+  if (headers != NULL) {
+    *headers = hdrs;
+    headers->offset = data - headers->data;
+    assert((uint64_t)(data - headers->data) < MAX_CHUNK_PAYLOAD);
+    assert(headers->offset == headers->data_size - data_size);
+  }
+  return VP8_STATUS_OK;  // Return features from VP8 header.
+}
+
+VP8StatusCode WebPParseHeaders(WebPHeaderStructure* const headers) {
+  VP8StatusCode status;
+  int has_animation = 0;
+  assert(headers != NULL);
+  // fill out headers, ignore width/height/has_alpha.
+  status = ParseHeadersInternal(headers->data, headers->data_size,
+                                NULL, NULL, NULL, &has_animation, headers);
+  if (status == VP8_STATUS_OK || status == VP8_STATUS_NOT_ENOUGH_DATA) {
+    // TODO(jzern): full support of animation frames will require API additions.
+    if (has_animation) {
+      status = VP8_STATUS_UNSUPPORTED_FEATURE;
+    }
+  }
+  return status;
 }

 //------------------------------------------------------------------------------
@ -259,43 +408,72 @@ void WebPResetDecParams(WebPDecParams* const params) {
 // "Into" decoding variants

 // Main flow
-static VP8StatusCode DecodeInto(const uint8_t* data, uint32_t data_size,
+static VP8StatusCode DecodeInto(const uint8_t* const data, size_t data_size,
                                WebPDecParams* const params) {
-  VP8Decoder* dec = VP8New();
-  VP8StatusCode status = VP8_STATUS_OK;
+  VP8StatusCode status;
  VP8Io io;
+  WebPHeaderStructure headers;

-  assert(params);
-  if (dec == NULL) {
-    return VP8_STATUS_INVALID_PARAM;
+  headers.data = data;
+  headers.data_size = data_size;
+  status = WebPParseHeaders(&headers);   // Process Pre-VP8 chunks.
+  if (status != VP8_STATUS_OK) {
+    return status;
  }

+  assert(params != NULL);
  VP8InitIo(&io);
-  io.data = data;
-  io.data_size = data_size;
+  io.data = headers.data + headers.offset;
+  io.data_size = headers.data_size - headers.offset;
  WebPInitCustomIo(params, &io);  // Plug the I/O functions.

+  if (!headers.is_lossless) {
+    VP8Decoder* const dec = VP8New();
+    if (dec == NULL) {
+      return VP8_STATUS_OUT_OF_MEMORY;
+    }
 #ifdef WEBP_USE_THREAD
-  dec->use_threads_ = params->options && (params->options->use_threads > 0);
+    dec->use_threads_ = params->options && (params->options->use_threads > 0);
 #else
-  dec->use_threads_ = 0;
+    dec->use_threads_ = 0;
 #endif
+    dec->alpha_data_ = headers.alpha_data;
+    dec->alpha_data_size_ = headers.alpha_data_size;

-  // Decode bitstream header, update io->width/io->height.
-  if (!VP8GetHeaders(dec, &io)) {
-    status = VP8_STATUS_BITSTREAM_ERROR;
-  } else {
-    // Allocate/check output buffers.
-    status = WebPAllocateDecBuffer(io.width, io.height, params->options,
-                                   params->output);
-    if (status == VP8_STATUS_OK) {
-      // Decode
-      if (!VP8Decode(dec, &io)) {
-        status = dec->status_;
+    // Decode bitstream header, update io->width/io->height.
+    if (!VP8GetHeaders(dec, &io)) {
+      status = dec->status_;   // An error occurred. Grab error status.
+    } else {
+      // Allocate/check output buffers.
+      status = WebPAllocateDecBuffer(io.width, io.height, params->options,
+                                     params->output);
+      if (status == VP8_STATUS_OK) {  // Decode
+        if (!VP8Decode(dec, &io)) {
+          status = dec->status_;
+        }
      }
    }
+    VP8Delete(dec);
+  } else {
+    VP8LDecoder* const dec = VP8LNew();
+    if (dec == NULL) {
+      return VP8_STATUS_OUT_OF_MEMORY;
+    }
+    if (!VP8LDecodeHeader(dec, &io)) {
+      status = dec->status_;   // An error occurred. Grab error status.
+    } else {
+      // Allocate/check output buffers.
+      status = WebPAllocateDecBuffer(io.width, io.height, params->options,
+                                     params->output);
+      if (status == VP8_STATUS_OK) {  // Decode
+        if (!VP8LDecodeImage(dec)) {
+          status = dec->status_;
+        }
+      }
+    }
+    VP8LDelete(dec);
  }
-  VP8Delete(dec);
+
  if (status != VP8_STATUS_OK) {
    WebPFreeDecBuffer(params->output);
  }
@ -304,8 +482,10 @@ static VP8StatusCode DecodeInto(const uint8_t* data, uint32_t data_size,

 // Helpers
 static uint8_t* DecodeIntoRGBABuffer(WEBP_CSP_MODE colorspace,
-                                     const uint8_t* data, uint32_t data_size,
-                                     uint8_t* rgba, int stride, int size) {
+                                     const uint8_t* const data,
+                                     size_t data_size,
+                                     uint8_t* const rgba,
+                                     int stride, size_t size) {
  WebPDecParams params;
  WebPDecBuffer buf;
  if (rgba == NULL) {
@ -325,35 +505,35 @@ static uint8_t* DecodeIntoRGBABuffer(WEBP_CSP_MODE colorspace,
  return rgba;
 }

-uint8_t* WebPDecodeRGBInto(const uint8_t* data, uint32_t data_size,
-                           uint8_t* output, int size, int stride) {
+uint8_t* WebPDecodeRGBInto(const uint8_t* data, size_t data_size,
+                           uint8_t* output, size_t size, int stride) {
  return DecodeIntoRGBABuffer(MODE_RGB, data, data_size, output, stride, size);
 }

-uint8_t* WebPDecodeRGBAInto(const uint8_t* data, uint32_t data_size,
-                            uint8_t* output, int size, int stride) {
+uint8_t* WebPDecodeRGBAInto(const uint8_t* data, size_t data_size,
+                            uint8_t* output, size_t size, int stride) {
  return DecodeIntoRGBABuffer(MODE_RGBA, data, data_size, output, stride, size);
 }

-uint8_t* WebPDecodeARGBInto(const uint8_t* data, uint32_t data_size,
-                            uint8_t* output, int size, int stride) {
+uint8_t* WebPDecodeARGBInto(const uint8_t* data, size_t data_size,
+                            uint8_t* output, size_t size, int stride) {
  return DecodeIntoRGBABuffer(MODE_ARGB, data, data_size, output, stride, size);
 }

-uint8_t* WebPDecodeBGRInto(const uint8_t* data, uint32_t data_size,
-                           uint8_t* output, int size, int stride) {
+uint8_t* WebPDecodeBGRInto(const uint8_t* data, size_t data_size,
+                           uint8_t* output, size_t size, int stride) {
  return DecodeIntoRGBABuffer(MODE_BGR, data, data_size, output, stride, size);
 }

-uint8_t* WebPDecodeBGRAInto(const uint8_t* data, uint32_t data_size,
-                            uint8_t* output, int size, int stride) {
+uint8_t* WebPDecodeBGRAInto(const uint8_t* data, size_t data_size,
+                            uint8_t* output, size_t size, int stride) {
  return DecodeIntoRGBABuffer(MODE_BGRA, data, data_size, output, stride, size);
 }

-uint8_t* WebPDecodeYUVInto(const uint8_t* data, uint32_t data_size,
-                           uint8_t* luma, int luma_size, int luma_stride,
-                           uint8_t* u, int u_size, int u_stride,
-                           uint8_t* v, int v_size, int v_stride) {
+uint8_t* WebPDecodeYUVInto(const uint8_t* data, size_t data_size,
+                           uint8_t* luma, size_t luma_size, int luma_stride,
+                           uint8_t* u, size_t u_size, int u_stride,
+                           uint8_t* v, size_t v_size, int v_stride) {
  WebPDecParams params;
  WebPDecBuffer output;
  if (luma == NULL) return NULL;
@ -379,9 +559,9 @@ uint8_t* WebPDecodeYUVInto(const uint8_t* data, uint32_t data_size,

 //------------------------------------------------------------------------------

-static uint8_t* Decode(WEBP_CSP_MODE mode, const uint8_t* data,
-                       uint32_t data_size, int* width, int* height,
-                       WebPDecBuffer* keep_info) {
+static uint8_t* Decode(WEBP_CSP_MODE mode, const uint8_t* const data,
+                       size_t data_size, int* const width, int* const height,
+                       WebPDecBuffer* const keep_info) {
  WebPDecParams params;
  WebPDecBuffer output;

@ -394,53 +574,53 @@ static uint8_t* Decode(WEBP_CSP_MODE mode, const uint8_t* data,
  if (!WebPGetInfo(data, data_size, &output.width, &output.height)) {
    return NULL;
  }
-  if (width) *width = output.width;
-  if (height) *height = output.height;
+  if (width != NULL) *width = output.width;
+  if (height != NULL) *height = output.height;

  // Decode
  if (DecodeInto(data, data_size, &params) != VP8_STATUS_OK) {
    return NULL;
  }
-  if (keep_info) {    // keep track of the side-info
+  if (keep_info != NULL) {    // keep track of the side-info
    WebPCopyDecBuffer(&output, keep_info);
  }
  // return decoded samples (don't clear 'output'!)
-  return (mode >= MODE_YUV) ? output.u.YUVA.y : output.u.RGBA.rgba;
+  return WebPIsRGBMode(mode) ? output.u.RGBA.rgba : output.u.YUVA.y;
 }

-uint8_t* WebPDecodeRGB(const uint8_t* data, uint32_t data_size,
+uint8_t* WebPDecodeRGB(const uint8_t* data, size_t data_size,
                       int* width, int* height) {
  return Decode(MODE_RGB, data, data_size, width, height, NULL);
 }

-uint8_t* WebPDecodeRGBA(const uint8_t* data, uint32_t data_size,
+uint8_t* WebPDecodeRGBA(const uint8_t* data, size_t data_size,
                        int* width, int* height) {
  return Decode(MODE_RGBA, data, data_size, width, height, NULL);
 }

-uint8_t* WebPDecodeARGB(const uint8_t* data, uint32_t data_size,
+uint8_t* WebPDecodeARGB(const uint8_t* data, size_t data_size,
                        int* width, int* height) {
  return Decode(MODE_ARGB, data, data_size, width, height, NULL);
 }

-uint8_t* WebPDecodeBGR(const uint8_t* data, uint32_t data_size,
+uint8_t* WebPDecodeBGR(const uint8_t* data, size_t data_size,
                       int* width, int* height) {
  return Decode(MODE_BGR, data, data_size, width, height, NULL);
 }

-uint8_t* WebPDecodeBGRA(const uint8_t* data, uint32_t data_size,
+uint8_t* WebPDecodeBGRA(const uint8_t* data, size_t data_size,
                        int* width, int* height) {
  return Decode(MODE_BGRA, data, data_size, width, height, NULL);
 }

-uint8_t* WebPDecodeYUV(const uint8_t* data, uint32_t data_size,
+uint8_t* WebPDecodeYUV(const uint8_t* data, size_t data_size,
                       int* width, int* height, uint8_t** u, uint8_t** v,
                       int* stride, int* uv_stride) {
  WebPDecBuffer output;   // only to preserve the side-infos
  uint8_t* const out = Decode(MODE_YUV, data, data_size,
                              width, height, &output);

-  if (out) {
+  if (out != NULL) {
    const WebPYUVABuffer* const buf = &output.u.YUVA;
    *u = buf->u;
    *v = buf->v;
@ -452,69 +632,29 @@ uint8_t* WebPDecodeYUV(const uint8_t* data, uint32_t data_size,
 }

 static void DefaultFeatures(WebPBitstreamFeatures* const features) {
-  assert(features);
+  assert(features != NULL);
  memset(features, 0, sizeof(*features));
  features->bitstream_version = 0;
 }

-static VP8StatusCode GetFeatures(const uint8_t* data, uint32_t data_size,
+static VP8StatusCode GetFeatures(const uint8_t* const data, size_t data_size,
                                 WebPBitstreamFeatures* const features) {
-  uint32_t vp8_chunk_size = 0;
-  uint32_t riff_size = 0;
-  uint32_t flags = 0;
-  uint32_t vp8x_skip_size = 0;
-  uint32_t vp8_skip_size = 0;
-  VP8StatusCode status;
-
-  if (features == NULL) {
+  if (features == NULL || data == NULL) {
    return VP8_STATUS_INVALID_PARAM;
  }
  DefaultFeatures(features);

-  if (data == NULL) {
-    return VP8_STATUS_INVALID_PARAM;
-  }
-
-  // Skip over RIFF header.
-  status = WebPParseRIFF(&data, &data_size, &riff_size);
-  if (status != VP8_STATUS_OK) {
-    return status;   // Wrong RIFF Header / Insufficient data.
-  }
-
-  // Skip over VP8X.
-  status = WebPParseVP8X(&data, &data_size, &vp8x_skip_size, &features->width,
-                         &features->height, &flags);
-  if (status != VP8_STATUS_OK) {
-    return status;  // Wrong VP8X / insufficient data.
-
-  }
-  if (vp8x_skip_size > 0) {
-    return VP8_STATUS_OK;  // Return features from VP8X header.
-  }
-
-  // Skip over VP8 header.
-  status = WebPParseVP8Header(&data, &data_size, riff_size, &vp8_skip_size,
-                              &vp8_chunk_size);
-  if (status != VP8_STATUS_OK) {
-    return status;  // Wrong VP8 Chunk-header / insufficient data.
-  }
-  if (vp8_skip_size == 0) {
-    vp8_chunk_size = data_size;  // No VP8 chunk wrapper over raw VP8 data.
-  }
-
-  // Validates raw VP8 data.
-  if (!VP8GetInfo(data, data_size, vp8_chunk_size,
-                  &features->width, &features->height, &features->has_alpha)) {
-    return VP8_STATUS_BITSTREAM_ERROR;
-  }
-
-  return VP8_STATUS_OK;  // Return features from VP8 header.
+  // Only parse enough of the data to retrieve the features.
+  return ParseHeadersInternal(data, data_size,
+                              &features->width, &features->height,
+                              &features->has_alpha, &features->has_animation,
+                              NULL);
 }

 //------------------------------------------------------------------------------
 // WebPGetInfo()

-int WebPGetInfo(const uint8_t* data, uint32_t data_size,
+int WebPGetInfo(const uint8_t* data, size_t data_size,
                int* width, int* height) {
  WebPBitstreamFeatures features;

@ -522,10 +662,10 @@ int WebPGetInfo(const uint8_t* data, uint32_t data_size,
    return 0;
  }

-  if (width) {
+  if (width != NULL) {
    *width  = features.width;
  }
-  if (height) {
+  if (height != NULL) {
    *height = features.height;
  }

@ -535,9 +675,9 @@ int WebPGetInfo(const uint8_t* data, uint32_t data_size,
 //------------------------------------------------------------------------------
 // Advance decoding API

-int WebPInitDecoderConfigInternal(WebPDecoderConfig* const config,
+int WebPInitDecoderConfigInternal(WebPDecoderConfig* config,
                                  int version) {
-  if (version != WEBP_DECODER_ABI_VERSION) {
+  if (WEBP_ABI_IS_INCOMPATIBLE(version, WEBP_DECODER_ABI_VERSION)) {
    return 0;   // version mismatch
  }
  if (config == NULL) {
@ -549,37 +689,31 @@ int WebPInitDecoderConfigInternal(WebPDecoderConfig* const config,
  return 1;
 }

-VP8StatusCode WebPGetFeaturesInternal(const uint8_t* data, uint32_t data_size,
-                                      WebPBitstreamFeatures* const features,
+VP8StatusCode WebPGetFeaturesInternal(const uint8_t* data, size_t data_size,
+                                      WebPBitstreamFeatures* features,
                                      int version) {
-  VP8StatusCode status;
-  if (version != WEBP_DECODER_ABI_VERSION) {
+  if (WEBP_ABI_IS_INCOMPATIBLE(version, WEBP_DECODER_ABI_VERSION)) {
    return VP8_STATUS_INVALID_PARAM;   // version mismatch
  }
  if (features == NULL) {
    return VP8_STATUS_INVALID_PARAM;
  }
-
-  status = GetFeatures(data, data_size, features);
-  if (status == VP8_STATUS_NOT_ENOUGH_DATA) {
-    return VP8_STATUS_BITSTREAM_ERROR;  // Not enough data treated as error.
-  }
-  return status;
+  return GetFeatures(data, data_size, features);
 }

-VP8StatusCode WebPDecode(const uint8_t* data, uint32_t data_size,
-                         WebPDecoderConfig* const config) {
+VP8StatusCode WebPDecode(const uint8_t* data, size_t data_size,
+                         WebPDecoderConfig* config) {
  WebPDecParams params;
  VP8StatusCode status;

-  if (!config) {
+  if (config == NULL) {
    return VP8_STATUS_INVALID_PARAM;
  }

  status = GetFeatures(data, data_size, &config->input);
  if (status != VP8_STATUS_OK) {
    if (status == VP8_STATUS_NOT_ENOUGH_DATA) {
-      return VP8_STATUS_BITSTREAM_ERROR;  // Not enough data treated as error.
+      return VP8_STATUS_BITSTREAM_ERROR;  // Not-enough-data treated as error.
    }
    return status;
  }
@ -592,6 +726,66 @@ VP8StatusCode WebPDecode(const uint8_t* data, uint32_t data_size,
  return status;
 }

+//------------------------------------------------------------------------------
+// Cropping and rescaling.
+
+int WebPIoInitFromOptions(const WebPDecoderOptions* const options,
+                          VP8Io* const io, WEBP_CSP_MODE src_colorspace) {
+  const int W = io->width;
+  const int H = io->height;
+  int x = 0, y = 0, w = W, h = H;
+
+  // Cropping
+  io->use_cropping = (options != NULL) && (options->use_cropping > 0);
+  if (io->use_cropping) {
+    w = options->crop_width;
+    h = options->crop_height;
+    x = options->crop_left;
+    y = options->crop_top;
+    if (!WebPIsRGBMode(src_colorspace)) {   // only snap for YUV420 or YUV422
+      x &= ~1;
+      y &= ~1;    // TODO(later): only for YUV420, not YUV422.
+    }
+    if (x < 0 || y < 0 || w <= 0 || h <= 0 || x + w > W || y + h > H) {
+      return 0;  // out of frame boundary error
+    }
+  }
+  io->crop_left   = x;
+  io->crop_top    = y;
+  io->crop_right  = x + w;
+  io->crop_bottom = y + h;
+  io->mb_w = w;
+  io->mb_h = h;
+
+  // Scaling
+  io->use_scaling = (options != NULL) && (options->use_scaling > 0);
+  if (io->use_scaling) {
+    if (options->scaled_width <= 0 || options->scaled_height <= 0) {
+      return 0;
+    }
+    io->scaled_width = options->scaled_width;
+    io->scaled_height = options->scaled_height;
+  }
+
+  // Filter
+  io->bypass_filtering = options && options->bypass_filtering;
+
+  // Fancy upsampler
+#ifdef FANCY_UPSAMPLING
+  io->fancy_upsampling = (options == NULL) || (!options->no_fancy_upsampling);
+#endif
+
+  if (io->use_scaling) {
+    // disable filter (only for large downscaling ratio).
+    io->bypass_filtering = (io->scaled_width < W * 3 / 4) &&
+                           (io->scaled_height < H * 3 / 4);
+    io->fancy_upsampling = 0;
+  }
+  return 1;
+}
+
+//------------------------------------------------------------------------------
+
 #if defined(__cplusplus) || defined(c_plusplus)
 }    // extern "C"
 #endif
--- a/src/dec/webpi.h
+++ b/src/dec/webpi.h
@ -1,8 +1,10 @@
-// Copyright 2011 Google Inc.
+// Copyright 2011 Google Inc. All Rights Reserved.
 //
-// This code is licensed under the same terms as WebM:
-//  Software License Agreement:  http://www.webmproject.org/license/software/
-//  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
 // -----------------------------------------------------------------------------
 //
 // Internal header: WebP decoding parameters and custom IO on buffer
@ -16,29 +18,15 @@
 extern "C" {
 #endif

-#include "../webp/decode_vp8.h"
+#include "../utils/rescaler.h"
+#include "./decode_vp8.h"

 //------------------------------------------------------------------------------
 // WebPDecParams: Decoding output parameters. Transient internal object.

 typedef struct WebPDecParams WebPDecParams;
 typedef int (*OutputFunc)(const VP8Io* const io, WebPDecParams* const p);
-
-// Structure use for on-the-fly rescaling
-typedef struct {
-  int x_expand;               // true if we're expanding in the x direction
-  int fy_scale, fx_scale;     // fixed-point scaling factor
-  int64_t fxy_scale;          // ''
-  // we need hpel-precise add/sub increments, for the downsampled U/V planes.
-  int y_accum;                // vertical accumulator
-  int y_add, y_sub;           // vertical increments (add ~= src, sub ~= dst)
-  int x_add, x_sub;           // horizontal increments (add ~= src, sub ~= dst)
-  int src_width, src_height;  // source dimensions
-  int dst_width, dst_height;  // destination dimensions
-  uint8_t* dst;
-  int dst_stride;
-  int32_t* irow, *frow;       // work buffer
-} WebPRescaler;
+typedef int (*OutputRowFunc)(WebPDecParams* const p, int y_pos);

 struct WebPDecParams {
  WebPDecBuffer* output;             // output buffer.
@ -49,9 +37,11 @@ struct WebPDecParams {
  const WebPDecoderOptions* options;  // if not NULL, use alt decoding features
  // rescalers
  WebPRescaler scaler_y, scaler_u, scaler_v, scaler_a;
-  void* memory;               // overall scratch memory for the output work.
-  OutputFunc emit;            // output RGB or YUV samples
-  OutputFunc emit_alpha;      // output alpha channel
+  void* memory;                  // overall scratch memory for the output work.
+
+  OutputFunc emit;               // output RGB or YUV samples
+  OutputFunc emit_alpha;         // output alpha channel
+  OutputRowFunc emit_alpha_row;  // output one line of rescaled alpha values
 };

 // Should be called first, before any use of the WebPDecParams object.
@ -60,69 +50,25 @@ void WebPResetDecParams(WebPDecParams* const params);
 //------------------------------------------------------------------------------
 // Header parsing helpers

-#define TAG_SIZE 4
-#define CHUNK_HEADER_SIZE 8
-#define RIFF_HEADER_SIZE 12
-#define FRAME_CHUNK_SIZE 20
-#define LOOP_CHUNK_SIZE 4
-#define TILE_CHUNK_SIZE 8
-#define VP8X_CHUNK_SIZE 12
-#define VP8_FRAME_HEADER_SIZE 10  // Size of the frame header within VP8 data.
+// Structure storing a description of the RIFF headers.
+typedef struct {
+  const uint8_t* data;         // input buffer
+  size_t data_size;            // input buffer size
+  size_t offset;               // offset to main data chunk (VP8 or VP8L)
+  const uint8_t* alpha_data;   // points to alpha chunk (if present)
+  size_t alpha_data_size;      // alpha chunk size
+  size_t compressed_size;      // VP8/VP8L compressed data size
+  size_t riff_size;            // size of the riff payload (or 0 if absent)
+  int is_lossless;             // true if a VP8L chunk is present
+} WebPHeaderStructure;

-// Validates the RIFF container (if detected) and skips over it.
-// If a RIFF container is detected,
-// Returns VP8_STATUS_BITSTREAM_ERROR for invalid header, and
-//         VP8_STATUS_OK otherwise.
-// In case there are not enough bytes (partial RIFF container), return 0 for
-// riff_size. Else return the riff_size extracted from the header.
-VP8StatusCode WebPParseRIFF(const uint8_t** data, uint32_t* data_size,
-                            uint32_t* riff_size);
-
-// Validates the VP8X Header and skips over it.
-// Returns VP8_STATUS_BITSTREAM_ERROR for invalid VP8X header,
-//         VP8_STATUS_NOT_ENOUGH_DATA in case of insufficient data, and
-//         VP8_STATUS_OK otherwise.
-// If a VP8 chunk is found, bytes_skipped is set to the total number of bytes
-// that are skipped; also Width, Height & Flags are set to the corresponding
-// fields extracted from the VP8X chunk.
-VP8StatusCode WebPParseVP8X(const uint8_t** data, uint32_t* data_size,
-                            uint32_t* bytes_skipped,
-                            int* width, int* height, uint32_t* flags);
-
-// Skips to the next VP8 chunk header in the data given the size of the RIFF
-// chunk 'riff_size'.
-// Returns VP8_STATUS_BITSTREAM_ERROR if any invalid chunk size is encountered,
-//         VP8_STATUS_NOT_ENOUGH_DATA in case of insufficient data, and
-//         VP8_STATUS_OK otherwise.
-// If a VP8 chunk is found, bytes_skipped is set to the total number of bytes
-// that are skipped.
-VP8StatusCode WebPParseOptionalChunks(const uint8_t** data, uint32_t* data_size,
-                                      uint32_t riff_size,
-                                      uint32_t* bytes_skipped);
-
-// Validates the VP8 Header ("VP8 nnnn") and skips over it.
-// Returns VP8_STATUS_BITSTREAM_ERROR for invalid (vp8_chunk_size greater than
-//         riff_size) VP8 header,
-//         VP8_STATUS_NOT_ENOUGH_DATA in case of insufficient data, and
-//         VP8_STATUS_OK otherwise.
-// If a VP8 chunk is found, bytes_skipped is set to the total number of bytes
-// that are skipped and vp8_chunk_size is set to the corresponding size
-// extracted from the VP8 chunk header.
-// For a partial VP8 chunk, vp8_chunk_size is set to 0.
-VP8StatusCode WebPParseVP8Header(const uint8_t** data, uint32_t* data_size,
-                                 uint32_t riff_size, uint32_t* bytes_skipped,
-                                 uint32_t* vp8_chunk_size);
-
-// Skips over all valid chunks prior to the first VP8 frame header.
-// Returns VP8_STATUS_OK on success,
-//         VP8_STATUS_BITSTREAM_ERROR if an invalid header/chunk is found, and
-//         VP8_STATUS_NOT_ENOUGH_DATA if case of insufficient data.
-// Also, data, data_size, vp8_size & bytes_skipped are updated appropriately
-// on success, where
-// vp8_size is the size of VP8 chunk data (extracted from VP8 chunk header) and
-// bytes_skipped is set to the total number of bytes that are skipped.
-VP8StatusCode WebPParseHeaders(const uint8_t** data, uint32_t* data_size,
-                               uint32_t* vp8_size, uint32_t* bytes_skipped);
+// Skips over all valid chunks prior to the first VP8/VP8L frame header.
+// Returns: VP8_STATUS_OK, VP8_STATUS_BITSTREAM_ERROR (invalid header/chunk),
+// VP8_STATUS_NOT_ENOUGH_DATA (partial input) or VP8_STATUS_UNSUPPORTED_FEATURE
+// in the case of non-decodable features (animation for instance).
+// In 'headers', compressed_size, offset, alpha_data, alpha_size, and lossless
+// fields are updated appropriately upon success.
+VP8StatusCode WebPParseHeaders(WebPHeaderStructure* const headers);

 //------------------------------------------------------------------------------
 // Misc utils
@ -131,6 +77,11 @@ VP8StatusCode WebPParseHeaders(const uint8_t** data, uint32_t* data_size,
 // hooks will use the supplied 'params' as io->opaque handle.
 void WebPInitCustomIo(WebPDecParams* const params, VP8Io* const io);

+// Setup crop_xxx fields, mb_w and mb_h in io. 'src_colorspace' refers
+// to the *compressed* format, not the output one.
+int WebPIoInitFromOptions(const WebPDecoderOptions* const options,
+                          VP8Io* const io, WEBP_CSP_MODE src_colorspace);
+
 //------------------------------------------------------------------------------
 // Internal functions regarding WebPDecBuffer memory (in buffer.c).
 // Don't really need to be externally visible for now.
@ -154,6 +105,8 @@ void WebPCopyDecBuffer(const WebPDecBuffer* const src,
 // Copy and transfer ownership from src to dst (beware of parameter order!)
 void WebPGrabDecBuffer(WebPDecBuffer* const src, WebPDecBuffer* const dst);

+
+
 //------------------------------------------------------------------------------

 #if defined(__cplusplus) || defined(c_plusplus)
--- a/src/demux/Makefile.am
+++ b/src/demux/Makefile.am
@ -0,0 +1,15 @@
+AM_CPPFLAGS = -I$(top_srcdir)/src
+lib_LTLIBRARIES = libwebpdemux.la
+
+libwebpdemux_la_SOURCES =
+libwebpdemux_la_SOURCES += demux.c
+
+libwebpdemuxinclude_HEADERS =
+libwebpdemuxinclude_HEADERS += ../webp/demux.h
+libwebpdemuxinclude_HEADERS += ../webp/mux_types.h
+libwebpdemuxinclude_HEADERS += ../webp/types.h
+
+libwebpdemux_la_LIBADD = ../libwebp.la
+libwebpdemux_la_LDFLAGS = -no-undefined -version-info 0:1:0
+libwebpdemuxincludedir = $(includedir)/webp
+pkgconfig_DATA = libwebpdemux.pc
--- a/src/demux/demux.c
+++ b/src/demux/demux.c
@ -0,0 +1,954 @@
+// Copyright 2012 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+//  WebP container demux.
+//
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include <assert.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "../utils/utils.h"
+#include "../webp/decode.h"     // WebPGetFeatures
+#include "../webp/demux.h"
+#include "../webp/format_constants.h"
+
+#if defined(__cplusplus) || defined(c_plusplus)
+extern "C" {
+#endif
+
+#define DMUX_MAJ_VERSION 0
+#define DMUX_MIN_VERSION 1
+#define DMUX_REV_VERSION 1
+
+typedef struct {
+  size_t start_;        // start location of the data
+  size_t end_;          // end location
+  size_t riff_end_;     // riff chunk end location, can be > end_.
+  size_t buf_size_;     // size of the buffer
+  const uint8_t* buf_;
+} MemBuffer;
+
+typedef struct {
+  size_t offset_;
+  size_t size_;
+} ChunkData;
+
+typedef struct Frame {
+  int x_offset_, y_offset_;
+  int width_, height_;
+  int duration_;
+  WebPMuxAnimDispose dispose_method_;
+  int is_fragment_;  // this is a frame fragment (and not a full frame).
+  int frame_num_;  // the referent frame number for use in assembling fragments.
+  int complete_;   // img_components_ contains a full image.
+  ChunkData img_components_[2];  // 0=VP8{,L} 1=ALPH
+  struct Frame* next_;
+} Frame;
+
+typedef struct Chunk {
+  ChunkData data_;
+  struct Chunk* next_;
+} Chunk;
+
+struct WebPDemuxer {
+  MemBuffer mem_;
+  WebPDemuxState state_;
+  int is_ext_format_;
+  uint32_t feature_flags_;
+  int canvas_width_, canvas_height_;
+  int loop_count_;
+  uint32_t bgcolor_;
+  int num_frames_;
+  Frame* frames_;
+  Frame** frames_tail_;
+  Chunk* chunks_;  // non-image chunks
+};
+
+typedef enum {
+  PARSE_OK,
+  PARSE_NEED_MORE_DATA,
+  PARSE_ERROR
+} ParseStatus;
+
+typedef struct ChunkParser {
+  uint8_t id[4];
+  ParseStatus (*parse)(WebPDemuxer* const dmux);
+  int (*valid)(const WebPDemuxer* const dmux);
+} ChunkParser;
+
+static ParseStatus ParseSingleImage(WebPDemuxer* const dmux);
+static ParseStatus ParseVP8X(WebPDemuxer* const dmux);
+static int IsValidSimpleFormat(const WebPDemuxer* const dmux);
+static int IsValidExtendedFormat(const WebPDemuxer* const dmux);
+
+static const ChunkParser kMasterChunks[] = {
+  { { 'V', 'P', '8', ' ' }, ParseSingleImage, IsValidSimpleFormat },
+  { { 'V', 'P', '8', 'L' }, ParseSingleImage, IsValidSimpleFormat },
+  { { 'V', 'P', '8', 'X' }, ParseVP8X,        IsValidExtendedFormat },
+  { { '0', '0', '0', '0' }, NULL,             NULL },
+};
+
+//------------------------------------------------------------------------------
+
+int WebPGetDemuxVersion(void) {
+  return (DMUX_MAJ_VERSION << 16) | (DMUX_MIN_VERSION << 8) | DMUX_REV_VERSION;
+}
+
+// -----------------------------------------------------------------------------
+// MemBuffer
+
+static int RemapMemBuffer(MemBuffer* const mem,
+                          const uint8_t* data, size_t size) {
+  if (size < mem->buf_size_) return 0;  // can't remap to a shorter buffer!
+
+  mem->buf_ = data;
+  mem->end_ = mem->buf_size_ = size;
+  return 1;
+}
+
+static int InitMemBuffer(MemBuffer* const mem,
+                         const uint8_t* data, size_t size) {
+  memset(mem, 0, sizeof(*mem));
+  return RemapMemBuffer(mem, data, size);
+}
+
+// Return the remaining data size available in 'mem'.
+static WEBP_INLINE size_t MemDataSize(const MemBuffer* const mem) {
+  return (mem->end_ - mem->start_);
+}
+
+// Return true if 'size' exceeds the end of the RIFF chunk.
+static WEBP_INLINE int SizeIsInvalid(const MemBuffer* const mem, size_t size) {
+  return (size > mem->riff_end_ - mem->start_);
+}
+
+static WEBP_INLINE void Skip(MemBuffer* const mem, size_t size) {
+  mem->start_ += size;
+}
+
+static WEBP_INLINE void Rewind(MemBuffer* const mem, size_t size) {
+  mem->start_ -= size;
+}
+
+static WEBP_INLINE const uint8_t* GetBuffer(MemBuffer* const mem) {
+  return mem->buf_ + mem->start_;
+}
+
+// Read from 'mem' and skip the read bytes.
+static WEBP_INLINE uint8_t ReadByte(MemBuffer* const mem) {
+  const uint8_t byte = mem->buf_[mem->start_];
+  Skip(mem, 1);
+  return byte;
+}
+
+static WEBP_INLINE int ReadLE16s(MemBuffer* const mem) {
+  const uint8_t* const data = mem->buf_ + mem->start_;
+  const int val = GetLE16(data);
+  Skip(mem, 2);
+  return val;
+}
+
+static WEBP_INLINE int ReadLE24s(MemBuffer* const mem) {
+  const uint8_t* const data = mem->buf_ + mem->start_;
+  const int val = GetLE24(data);
+  Skip(mem, 3);
+  return val;
+}
+
+static WEBP_INLINE uint32_t ReadLE32(MemBuffer* const mem) {
+  const uint8_t* const data = mem->buf_ + mem->start_;
+  const uint32_t val = GetLE32(data);
+  Skip(mem, 4);
+  return val;
+}
+
+// -----------------------------------------------------------------------------
+// Secondary chunk parsing
+
+static void AddChunk(WebPDemuxer* const dmux, Chunk* const chunk) {
+  Chunk** c = &dmux->chunks_;
+  while (*c != NULL) c = &(*c)->next_;
+  *c = chunk;
+  chunk->next_ = NULL;
+}
+
+// Add a frame to the end of the list, ensuring the last frame is complete.
+// Returns true on success, false otherwise.
+static int AddFrame(WebPDemuxer* const dmux, Frame* const frame) {
+  const Frame* const last_frame = *dmux->frames_tail_;
+  if (last_frame != NULL && !last_frame->complete_) return 0;
+
+  *dmux->frames_tail_ = frame;
+  frame->next_ = NULL;
+  dmux->frames_tail_ = &frame->next_;
+  return 1;
+}
+
+// Store image bearing chunks to 'frame'.
+// If 'has_vp8l_alpha' is not NULL, it will be set to true if the frame is a
+// lossless image with alpha.
+static ParseStatus StoreFrame(int frame_num, uint32_t min_size,
+                              MemBuffer* const mem, Frame* const frame,
+                              int* const has_vp8l_alpha) {
+  int alpha_chunks = 0;
+  int image_chunks = 0;
+  int done = (MemDataSize(mem) < min_size);
+  ParseStatus status = PARSE_OK;
+
+  if (has_vp8l_alpha != NULL) *has_vp8l_alpha = 0;  // Default.
+
+  if (done) return PARSE_NEED_MORE_DATA;
+
+  do {
+    const size_t chunk_start_offset = mem->start_;
+    const uint32_t fourcc = ReadLE32(mem);
+    const uint32_t payload_size = ReadLE32(mem);
+    const uint32_t payload_size_padded = payload_size + (payload_size & 1);
+    const size_t payload_available = (payload_size_padded > MemDataSize(mem))
+                                   ? MemDataSize(mem) : payload_size_padded;
+    const size_t chunk_size = CHUNK_HEADER_SIZE + payload_available;
+
+    if (payload_size > MAX_CHUNK_PAYLOAD) return PARSE_ERROR;
+    if (SizeIsInvalid(mem, payload_size_padded)) return PARSE_ERROR;
+    if (payload_size_padded > MemDataSize(mem)) status = PARSE_NEED_MORE_DATA;
+
+    switch (fourcc) {
+      case MKFOURCC('A', 'L', 'P', 'H'):
+        if (alpha_chunks == 0) {
+          ++alpha_chunks;
+          frame->img_components_[1].offset_ = chunk_start_offset;
+          frame->img_components_[1].size_ = chunk_size;
+          frame->frame_num_ = frame_num;
+          Skip(mem, payload_available);
+        } else {
+          goto Done;
+        }
+        break;
+      case MKFOURCC('V', 'P', '8', 'L'):
+        if (alpha_chunks > 0) return PARSE_ERROR;  // VP8L has its own alpha
+        // fall through
+      case MKFOURCC('V', 'P', '8', ' '):
+        if (image_chunks == 0) {
+          // Extract the bitstream features, tolerating failures when the data
+          // is incomplete.
+          WebPBitstreamFeatures features;
+          const VP8StatusCode vp8_status =
+              WebPGetFeatures(mem->buf_ + chunk_start_offset, chunk_size,
+                              &features);
+          if (status == PARSE_NEED_MORE_DATA &&
+              vp8_status == VP8_STATUS_NOT_ENOUGH_DATA) {
+            return PARSE_NEED_MORE_DATA;
+          } else if (vp8_status != VP8_STATUS_OK) {
+            // We have enough data, and yet WebPGetFeatures() failed.
+            return PARSE_ERROR;
+          }
+          ++image_chunks;
+          frame->img_components_[0].offset_ = chunk_start_offset;
+          frame->img_components_[0].size_ = chunk_size;
+          frame->width_ = features.width;
+          frame->height_ = features.height;
+          if (has_vp8l_alpha != NULL) *has_vp8l_alpha = features.has_alpha;
+          frame->frame_num_ = frame_num;
+          frame->complete_ = (status == PARSE_OK);
+          Skip(mem, payload_available);
+        } else {
+          goto Done;
+        }
+        break;
+ Done:
+      default:
+        // Restore fourcc/size when moving up one level in parsing.
+        Rewind(mem, CHUNK_HEADER_SIZE);
+        done = 1;
+        break;
+    }
+
+    if (mem->start_ == mem->riff_end_) {
+      done = 1;
+    } else if (MemDataSize(mem) < CHUNK_HEADER_SIZE) {
+      status = PARSE_NEED_MORE_DATA;
+    }
+  } while (!done && status == PARSE_OK);
+
+  return status;
+}
+
+// Creates a new Frame if 'actual_size' is within bounds and 'mem' contains
+// enough data ('min_size') to parse the payload.
+// Returns PARSE_OK on success with *frame pointing to the new Frame.
+// Returns PARSE_NEED_MORE_DATA with insufficient data, PARSE_ERROR otherwise.
+static ParseStatus NewFrame(const MemBuffer* const mem,
+                            uint32_t min_size, uint32_t actual_size,
+                            Frame** frame) {
+  if (SizeIsInvalid(mem, min_size)) return PARSE_ERROR;
+  if (actual_size < min_size) return PARSE_ERROR;
+  if (MemDataSize(mem) < min_size)  return PARSE_NEED_MORE_DATA;
+
+  *frame = (Frame*)calloc(1, sizeof(**frame));
+  return (*frame == NULL) ? PARSE_ERROR : PARSE_OK;
+}
+
+// Parse a 'ANMF' chunk and any image bearing chunks that immediately follow.
+// 'frame_chunk_size' is the previously validated, padded chunk size.
+static ParseStatus ParseAnimationFrame(
+    WebPDemuxer* const dmux, uint32_t frame_chunk_size) {
+  const int has_frames = !!(dmux->feature_flags_ & ANIMATION_FLAG);
+  const uint32_t anmf_payload_size = frame_chunk_size - ANMF_CHUNK_SIZE;
+  int added_frame = 0;
+  MemBuffer* const mem = &dmux->mem_;
+  Frame* frame;
+  ParseStatus status =
+      NewFrame(mem, ANMF_CHUNK_SIZE, frame_chunk_size, &frame);
+  if (status != PARSE_OK) return status;
+
+  frame->x_offset_       = 2 * ReadLE24s(mem);
+  frame->y_offset_       = 2 * ReadLE24s(mem);
+  frame->width_          = 1 + ReadLE24s(mem);
+  frame->height_         = 1 + ReadLE24s(mem);
+  frame->duration_       = ReadLE24s(mem);
+  frame->dispose_method_ = (WebPMuxAnimDispose)(ReadByte(mem) & 1);
+  if (frame->width_ * (uint64_t)frame->height_ >= MAX_IMAGE_AREA) {
+    free(frame);
+    return PARSE_ERROR;
+  }
+
+  // Store a frame only if the animation flag is set there is some data for
+  // this frame is available.
+  status = StoreFrame(dmux->num_frames_ + 1, anmf_payload_size, mem, frame,
+                      NULL);
+  if (status != PARSE_ERROR && has_frames && frame->frame_num_ > 0) {
+    added_frame = AddFrame(dmux, frame);
+    if (added_frame) {
+      ++dmux->num_frames_;
+    } else {
+      status = PARSE_ERROR;
+    }
+  }
+
+  if (!added_frame) free(frame);
+  return status;
+}
+
+#ifdef WEBP_EXPERIMENTAL_FEATURES
+// Parse a 'FRGM' chunk and any image bearing chunks that immediately follow.
+// 'fragment_chunk_size' is the previously validated, padded chunk size.
+static ParseStatus ParseFragment(WebPDemuxer* const dmux,
+                                 uint32_t fragment_chunk_size) {
+  const int frame_num = 1;  // All fragments belong to the 1st (and only) frame.
+  const int has_fragments = !!(dmux->feature_flags_ & FRAGMENTS_FLAG);
+  const uint32_t frgm_payload_size = fragment_chunk_size - FRGM_CHUNK_SIZE;
+  int added_fragment = 0;
+  MemBuffer* const mem = &dmux->mem_;
+  Frame* frame;
+  ParseStatus status =
+      NewFrame(mem, FRGM_CHUNK_SIZE, fragment_chunk_size, &frame);
+  if (status != PARSE_OK) return status;
+
+  frame->is_fragment_ = 1;
+  frame->x_offset_ = 2 * ReadLE24s(mem);
+  frame->y_offset_ = 2 * ReadLE24s(mem);
+
+  // Store a fragment only if the fragments flag is set there is some data for
+  // this fragment is available.
+  status = StoreFrame(frame_num, frgm_payload_size, mem, frame, NULL);
+  if (status != PARSE_ERROR && has_fragments && frame->frame_num_ > 0) {
+    added_fragment = AddFrame(dmux, frame);
+    if (!added_fragment) {
+      status = PARSE_ERROR;
+    } else {
+      dmux->num_frames_ = 1;
+    }
+  }
+
+  if (!added_fragment) free(frame);
+  return status;
+}
+#endif  // WEBP_EXPERIMENTAL_FEATURES
+
+// General chunk storage, starting with the header at 'start_offset', allowing
+// the user to request the payload via a fourcc string. 'size' includes the
+// header and the unpadded payload size.
+// Returns true on success, false otherwise.
+static int StoreChunk(WebPDemuxer* const dmux,
+                      size_t start_offset, uint32_t size) {
+  Chunk* const chunk = (Chunk*)calloc(1, sizeof(*chunk));
+  if (chunk == NULL) return 0;
+
+  chunk->data_.offset_ = start_offset;
+  chunk->data_.size_ = size;
+  AddChunk(dmux, chunk);
+  return 1;
+}
+
+// -----------------------------------------------------------------------------
+// Primary chunk parsing
+
+static int ReadHeader(MemBuffer* const mem) {
+  const size_t min_size = RIFF_HEADER_SIZE + CHUNK_HEADER_SIZE;
+  uint32_t riff_size;
+
+  // Basic file level validation.
+  if (MemDataSize(mem) < min_size) return 0;
+  if (memcmp(GetBuffer(mem), "RIFF", CHUNK_SIZE_BYTES) ||
+      memcmp(GetBuffer(mem) + CHUNK_HEADER_SIZE, "WEBP", CHUNK_SIZE_BYTES)) {
+    return 0;
+  }
+
+  riff_size = GetLE32(GetBuffer(mem) + TAG_SIZE);
+  if (riff_size < CHUNK_HEADER_SIZE) return 0;
+  if (riff_size > MAX_CHUNK_PAYLOAD) return 0;
+
+  // There's no point in reading past the end of the RIFF chunk
+  mem->riff_end_ = riff_size + CHUNK_HEADER_SIZE;
+  if (mem->buf_size_ > mem->riff_end_) {
+    mem->buf_size_ = mem->end_ = mem->riff_end_;
+  }
+
+  Skip(mem, RIFF_HEADER_SIZE);
+  return 1;
+}
+
+static ParseStatus ParseSingleImage(WebPDemuxer* const dmux) {
+  const size_t min_size = CHUNK_HEADER_SIZE;
+  MemBuffer* const mem = &dmux->mem_;
+  Frame* frame;
+  ParseStatus status;
+  int has_vp8l_alpha = 0;  // Frame contains a lossless image with alpha.
+
+  if (dmux->frames_ != NULL) return PARSE_ERROR;
+  if (SizeIsInvalid(mem, min_size)) return PARSE_ERROR;
+  if (MemDataSize(mem) < min_size) return PARSE_NEED_MORE_DATA;
+
+  frame = (Frame*)calloc(1, sizeof(*frame));
+  if (frame == NULL) return PARSE_ERROR;
+
+  // For the single image case we allow parsing of a partial frame, but we need
+  // at least CHUNK_HEADER_SIZE for parsing.
+  status = StoreFrame(1, CHUNK_HEADER_SIZE, &dmux->mem_, frame,
+                      &has_vp8l_alpha);
+  if (status != PARSE_ERROR) {
+    const int has_alpha = !!(dmux->feature_flags_ & ALPHA_FLAG);
+    // Clear any alpha when the alpha flag is missing.
+    if (!has_alpha && frame->img_components_[1].size_ > 0) {
+      frame->img_components_[1].offset_ = 0;
+      frame->img_components_[1].size_ = 0;
+    }
+
+    // Use the frame width/height as the canvas values for non-vp8x files.
+    // Also, set ALPHA_FLAG if this is a lossless image with alpha.
+    if (!dmux->is_ext_format_ && frame->width_ > 0 && frame->height_ > 0) {
+      dmux->state_ = WEBP_DEMUX_PARSED_HEADER;
+      dmux->canvas_width_ = frame->width_;
+      dmux->canvas_height_ = frame->height_;
+      dmux->feature_flags_ |= has_vp8l_alpha ? ALPHA_FLAG : 0;
+    }
+    AddFrame(dmux, frame);
+    dmux->num_frames_ = 1;
+  } else {
+    free(frame);
+  }
+
+  return status;
+}
+
+static ParseStatus ParseVP8X(WebPDemuxer* const dmux) {
+  MemBuffer* const mem = &dmux->mem_;
+  int anim_chunks = 0;
+  uint32_t vp8x_size;
+  ParseStatus status = PARSE_OK;
+
+  if (MemDataSize(mem) < CHUNK_HEADER_SIZE) return PARSE_NEED_MORE_DATA;
+
+  dmux->is_ext_format_ = 1;
+  Skip(mem, TAG_SIZE);  // VP8X
+  vp8x_size = ReadLE32(mem);
+  if (vp8x_size > MAX_CHUNK_PAYLOAD) return PARSE_ERROR;
+  if (vp8x_size < VP8X_CHUNK_SIZE) return PARSE_ERROR;
+  vp8x_size += vp8x_size & 1;
+  if (SizeIsInvalid(mem, vp8x_size)) return PARSE_ERROR;
+  if (MemDataSize(mem) < vp8x_size) return PARSE_NEED_MORE_DATA;
+
+  dmux->feature_flags_ = ReadByte(mem);
+  Skip(mem, 3);  // Reserved.
+  dmux->canvas_width_  = 1 + ReadLE24s(mem);
+  dmux->canvas_height_ = 1 + ReadLE24s(mem);
+  if (dmux->canvas_width_ * (uint64_t)dmux->canvas_height_ >= MAX_IMAGE_AREA) {
+    return PARSE_ERROR;  // image final dimension is too large
+  }
+  Skip(mem, vp8x_size - VP8X_CHUNK_SIZE);  // skip any trailing data.
+  dmux->state_ = WEBP_DEMUX_PARSED_HEADER;
+
+  if (SizeIsInvalid(mem, CHUNK_HEADER_SIZE)) return PARSE_ERROR;
+  if (MemDataSize(mem) < CHUNK_HEADER_SIZE) return PARSE_NEED_MORE_DATA;
+
+  do {
+    int store_chunk = 1;
+    const size_t chunk_start_offset = mem->start_;
+    const uint32_t fourcc = ReadLE32(mem);
+    const uint32_t chunk_size = ReadLE32(mem);
+    const uint32_t chunk_size_padded = chunk_size + (chunk_size & 1);
+
+    if (chunk_size > MAX_CHUNK_PAYLOAD) return PARSE_ERROR;
+    if (SizeIsInvalid(mem, chunk_size_padded)) return PARSE_ERROR;
+
+    switch (fourcc) {
+      case MKFOURCC('V', 'P', '8', 'X'): {
+        return PARSE_ERROR;
+      }
+      case MKFOURCC('A', 'L', 'P', 'H'):
+      case MKFOURCC('V', 'P', '8', ' '):
+      case MKFOURCC('V', 'P', '8', 'L'): {
+        // check that this isn't an animation (all frames should be in an ANMF).
+        if (anim_chunks > 0) return PARSE_ERROR;
+
+        Rewind(mem, CHUNK_HEADER_SIZE);
+        status = ParseSingleImage(dmux);
+        break;
+      }
+      case MKFOURCC('A', 'N', 'I', 'M'): {
+        if (chunk_size_padded < ANIM_CHUNK_SIZE) return PARSE_ERROR;
+
+        if (MemDataSize(mem) < chunk_size_padded) {
+          status = PARSE_NEED_MORE_DATA;
+        } else if (anim_chunks == 0) {
+          ++anim_chunks;
+          dmux->bgcolor_ = ReadLE32(mem);
+          dmux->loop_count_ = ReadLE16s(mem);
+          Skip(mem, chunk_size_padded - ANIM_CHUNK_SIZE);
+        } else {
+          store_chunk = 0;
+          goto Skip;
+        }
+        break;
+      }
+      case MKFOURCC('A', 'N', 'M', 'F'): {
+        if (anim_chunks == 0) return PARSE_ERROR;  // 'ANIM' precedes frames.
+        status = ParseAnimationFrame(dmux, chunk_size_padded);
+        break;
+      }
+#ifdef WEBP_EXPERIMENTAL_FEATURES
+      case MKFOURCC('F', 'R', 'G', 'M'): {
+        status = ParseFragment(dmux, chunk_size_padded);
+        break;
+      }
+#endif
+      case MKFOURCC('I', 'C', 'C', 'P'): {
+        store_chunk = !!(dmux->feature_flags_ & ICCP_FLAG);
+        goto Skip;
+      }
+      case MKFOURCC('X', 'M', 'P', ' '): {
+        store_chunk = !!(dmux->feature_flags_ & XMP_FLAG);
+        goto Skip;
+      }
+      case MKFOURCC('E', 'X', 'I', 'F'): {
+        store_chunk = !!(dmux->feature_flags_ & EXIF_FLAG);
+        goto Skip;
+      }
+ Skip:
+      default: {
+        if (chunk_size_padded <= MemDataSize(mem)) {
+          if (store_chunk) {
+            // Store only the chunk header and unpadded size as only the payload
+            // will be returned to the user.
+            if (!StoreChunk(dmux, chunk_start_offset,
+                            CHUNK_HEADER_SIZE + chunk_size)) {
+              return PARSE_ERROR;
+            }
+          }
+          Skip(mem, chunk_size_padded);
+        } else {
+          status = PARSE_NEED_MORE_DATA;
+        }
+      }
+    }
+
+    if (mem->start_ == mem->riff_end_) {
+      break;
+    } else if (MemDataSize(mem) < CHUNK_HEADER_SIZE) {
+      status = PARSE_NEED_MORE_DATA;
+    }
+  } while (status == PARSE_OK);
+
+  return status;
+}
+
+// -----------------------------------------------------------------------------
+// Format validation
+
+static int IsValidSimpleFormat(const WebPDemuxer* const dmux) {
+  const Frame* const frame = dmux->frames_;
+  if (dmux->state_ == WEBP_DEMUX_PARSING_HEADER) return 1;
+
+  if (dmux->canvas_width_ <= 0 || dmux->canvas_height_ <= 0) return 0;
+  if (dmux->state_ == WEBP_DEMUX_DONE && frame == NULL) return 0;
+
+  if (frame->width_ <= 0 || frame->height_ <= 0) return 0;
+  return 1;
+}
+
+static int IsValidExtendedFormat(const WebPDemuxer* const dmux) {
+  const int has_fragments = !!(dmux->feature_flags_ & FRAGMENTS_FLAG);
+  const int has_frames = !!(dmux->feature_flags_ & ANIMATION_FLAG);
+  const Frame* f;
+
+  if (dmux->state_ == WEBP_DEMUX_PARSING_HEADER) return 1;
+
+  if (dmux->canvas_width_ <= 0 || dmux->canvas_height_ <= 0) return 0;
+  if (dmux->loop_count_ < 0) return 0;
+  if (dmux->state_ == WEBP_DEMUX_DONE && dmux->frames_ == NULL) return 0;
+
+  for (f = dmux->frames_; f != NULL; f = f->next_) {
+    const int cur_frame_set = f->frame_num_;
+    int frame_count = 0, fragment_count = 0;
+
+    // Check frame properties and if the image is composed of fragments that
+    // each fragment came from a fragment.
+    for (; f != NULL && f->frame_num_ == cur_frame_set; f = f->next_) {
+      const ChunkData* const image = f->img_components_;
+      const ChunkData* const alpha = f->img_components_ + 1;
+
+      if (!has_fragments && f->is_fragment_) return 0;
+      if (!has_frames && f->frame_num_ > 1) return 0;
+      if (f->x_offset_ < 0 || f->y_offset_ < 0) return 0;
+      if (f->complete_) {
+        if (alpha->size_ == 0 && image->size_ == 0) return 0;
+        // Ensure alpha precedes image bitstream.
+        if (alpha->size_ > 0 && alpha->offset_ > image->offset_) {
+          return 0;
+        }
+
+        if (f->width_ <= 0 || f->height_ <= 0) return 0;
+      } else {
+        // There shouldn't be a partial frame in a complete file.
+        if (dmux->state_ == WEBP_DEMUX_DONE) return 0;
+
+        // Ensure alpha precedes image bitstream.
+        if (alpha->size_ > 0 && image->size_ > 0 &&
+            alpha->offset_ > image->offset_) {
+          return 0;
+        }
+        // There shouldn't be any frames after an incomplete one.
+        if (f->next_ != NULL) return 0;
+      }
+
+      fragment_count += f->is_fragment_;
+      ++frame_count;
+    }
+    if (!has_fragments && frame_count > 1) return 0;
+    if (fragment_count > 0 && frame_count != fragment_count) return 0;
+    if (f == NULL) break;
+  }
+  return 1;
+}
+
+// -----------------------------------------------------------------------------
+// WebPDemuxer object
+
+static void InitDemux(WebPDemuxer* const dmux, const MemBuffer* const mem) {
+  dmux->state_ = WEBP_DEMUX_PARSING_HEADER;
+  dmux->loop_count_ = 1;
+  dmux->bgcolor_ = 0xFFFFFFFF;  // White background by default.
+  dmux->canvas_width_ = -1;
+  dmux->canvas_height_ = -1;
+  dmux->frames_tail_ = &dmux->frames_;
+  dmux->mem_ = *mem;
+}
+
+WebPDemuxer* WebPDemuxInternal(const WebPData* data, int allow_partial,
+                               WebPDemuxState* state, int version) {
+  const ChunkParser* parser;
+  int partial;
+  ParseStatus status = PARSE_ERROR;
+  MemBuffer mem;
+  WebPDemuxer* dmux;
+
+  if (WEBP_ABI_IS_INCOMPATIBLE(version, WEBP_DEMUX_ABI_VERSION)) return NULL;
+  if (data == NULL || data->bytes == NULL || data->size == 0) return NULL;
+
+  if (!InitMemBuffer(&mem, data->bytes, data->size)) return NULL;
+  if (!ReadHeader(&mem)) return NULL;
+
+  partial = (mem.buf_size_ < mem.riff_end_);
+  if (!allow_partial && partial) return NULL;
+
+  dmux = (WebPDemuxer*)calloc(1, sizeof(*dmux));
+  if (dmux == NULL) return NULL;
+  InitDemux(dmux, &mem);
+
+  for (parser = kMasterChunks; parser->parse != NULL; ++parser) {
+    if (!memcmp(parser->id, GetBuffer(&dmux->mem_), TAG_SIZE)) {
+      status = parser->parse(dmux);
+      if (status == PARSE_OK) dmux->state_ = WEBP_DEMUX_DONE;
+      if (status == PARSE_NEED_MORE_DATA && !partial) status = PARSE_ERROR;
+      if (status != PARSE_ERROR && !parser->valid(dmux)) status = PARSE_ERROR;
+      break;
+    }
+  }
+  if (state) *state = dmux->state_;
+
+  if (status == PARSE_ERROR) {
+    WebPDemuxDelete(dmux);
+    return NULL;
+  }
+  return dmux;
+}
+
+void WebPDemuxDelete(WebPDemuxer* dmux) {
+  Chunk* c;
+  Frame* f;
+  if (dmux == NULL) return;
+
+  for (f = dmux->frames_; f != NULL;) {
+    Frame* const cur_frame = f;
+    f = f->next_;
+    free(cur_frame);
+  }
+  for (c = dmux->chunks_; c != NULL;) {
+    Chunk* const cur_chunk = c;
+    c = c->next_;
+    free(cur_chunk);
+  }
+  free(dmux);
+}
+
+// -----------------------------------------------------------------------------
+
+uint32_t WebPDemuxGetI(const WebPDemuxer* dmux, WebPFormatFeature feature) {
+  if (dmux == NULL) return 0;
+
+  switch (feature) {
+    case WEBP_FF_FORMAT_FLAGS:     return dmux->feature_flags_;
+    case WEBP_FF_CANVAS_WIDTH:     return (uint32_t)dmux->canvas_width_;
+    case WEBP_FF_CANVAS_HEIGHT:    return (uint32_t)dmux->canvas_height_;
+    case WEBP_FF_LOOP_COUNT:       return (uint32_t)dmux->loop_count_;
+    case WEBP_FF_BACKGROUND_COLOR: return dmux->bgcolor_;
+    case WEBP_FF_FRAME_COUNT:      return (uint32_t)dmux->num_frames_;
+  }
+  return 0;
+}
+
+// -----------------------------------------------------------------------------
+// Frame iteration
+
+// Find the first 'frame_num' frame. There may be multiple such frames in a
+// fragmented frame.
+static const Frame* GetFrame(const WebPDemuxer* const dmux, int frame_num) {
+  const Frame* f;
+  for (f = dmux->frames_; f != NULL; f = f->next_) {
+    if (frame_num == f->frame_num_) break;
+  }
+  return f;
+}
+
+// Returns fragment 'fragment_num' and the total count.
+static const Frame* GetFragment(
+    const Frame* const frame_set, int fragment_num, int* const count) {
+  const int this_frame = frame_set->frame_num_;
+  const Frame* f = frame_set;
+  const Frame* fragment = NULL;
+  int total;
+
+  for (total = 0; f != NULL && f->frame_num_ == this_frame; f = f->next_) {
+    if (++total == fragment_num) fragment = f;
+  }
+  *count = total;
+  return fragment;
+}
+
+static const uint8_t* GetFramePayload(const uint8_t* const mem_buf,
+                                      const Frame* const frame,
+                                      size_t* const data_size) {
+  *data_size = 0;
+  if (frame != NULL) {
+    const ChunkData* const image = frame->img_components_;
+    const ChunkData* const alpha = frame->img_components_ + 1;
+    size_t start_offset = image->offset_;
+    *data_size = image->size_;
+
+    // if alpha exists it precedes image, update the size allowing for
+    // intervening chunks.
+    if (alpha->size_ > 0) {
+      const size_t inter_size = (image->offset_ > 0)
+                              ? image->offset_ - (alpha->offset_ + alpha->size_)
+                              : 0;
+      start_offset = alpha->offset_;
+      *data_size  += alpha->size_ + inter_size;
+    }
+    return mem_buf + start_offset;
+  }
+  return NULL;
+}
+
+// Create a whole 'frame' from VP8 (+ alpha) or lossless.
+static int SynthesizeFrame(const WebPDemuxer* const dmux,
+                           const Frame* const first_frame,
+                           int fragment_num, WebPIterator* const iter) {
+  const uint8_t* const mem_buf = dmux->mem_.buf_;
+  int num_fragments;
+  size_t payload_size = 0;
+  const Frame* const fragment =
+      GetFragment(first_frame, fragment_num, &num_fragments);
+  const uint8_t* const payload =
+      GetFramePayload(mem_buf, fragment, &payload_size);
+  if (payload == NULL) return 0;
+  assert(first_frame != NULL);
+
+  iter->frame_num      = first_frame->frame_num_;
+  iter->num_frames     = dmux->num_frames_;
+  iter->fragment_num   = fragment_num;
+  iter->num_fragments  = num_fragments;
+  iter->x_offset       = fragment->x_offset_;
+  iter->y_offset       = fragment->y_offset_;
+  iter->width          = fragment->width_;
+  iter->height         = fragment->height_;
+  iter->duration       = fragment->duration_;
+  iter->dispose_method = fragment->dispose_method_;
+  iter->complete       = fragment->complete_;
+  iter->fragment.bytes = payload;
+  iter->fragment.size  = payload_size;
+  // TODO(jzern): adjust offsets for 'FRGM's embedded in 'ANMF's
+  return 1;
+}
+
+static int SetFrame(int frame_num, WebPIterator* const iter) {
+  const Frame* frame;
+  const WebPDemuxer* const dmux = (WebPDemuxer*)iter->private_;
+  if (dmux == NULL || frame_num < 0) return 0;
+  if (frame_num > dmux->num_frames_) return 0;
+  if (frame_num == 0) frame_num = dmux->num_frames_;
+
+  frame = GetFrame(dmux, frame_num);
+  if (frame == NULL) return 0;
+
+  return SynthesizeFrame(dmux, frame, 1, iter);
+}
+
+int WebPDemuxGetFrame(const WebPDemuxer* dmux, int frame, WebPIterator* iter) {
+  if (iter == NULL) return 0;
+
+  memset(iter, 0, sizeof(*iter));
+  iter->private_ = (void*)dmux;
+  return SetFrame(frame, iter);
+}
+
+int WebPDemuxNextFrame(WebPIterator* iter) {
+  if (iter == NULL) return 0;
+  return SetFrame(iter->frame_num + 1, iter);
+}
+
+int WebPDemuxPrevFrame(WebPIterator* iter) {
+  if (iter == NULL) return 0;
+  if (iter->frame_num <= 1) return 0;
+  return SetFrame(iter->frame_num - 1, iter);
+}
+
+int WebPDemuxSelectFragment(WebPIterator* iter, int fragment_num) {
+  if (iter != NULL && iter->private_ != NULL && fragment_num > 0) {
+    const WebPDemuxer* const dmux = (WebPDemuxer*)iter->private_;
+    const Frame* const frame = GetFrame(dmux, iter->frame_num);
+    if (frame == NULL) return 0;
+
+    return SynthesizeFrame(dmux, frame, fragment_num, iter);
+  }
+  return 0;
+}
+
+void WebPDemuxReleaseIterator(WebPIterator* iter) {
+  (void)iter;
+}
+
+// -----------------------------------------------------------------------------
+// Chunk iteration
+
+static int ChunkCount(const WebPDemuxer* const dmux, const char fourcc[4]) {
+  const uint8_t* const mem_buf = dmux->mem_.buf_;
+  const Chunk* c;
+  int count = 0;
+  for (c = dmux->chunks_; c != NULL; c = c->next_) {
+    const uint8_t* const header = mem_buf + c->data_.offset_;
+    if (!memcmp(header, fourcc, TAG_SIZE)) ++count;
+  }
+  return count;
+}
+
+static const Chunk* GetChunk(const WebPDemuxer* const dmux,
+                             const char fourcc[4], int chunk_num) {
+  const uint8_t* const mem_buf = dmux->mem_.buf_;
+  const Chunk* c;
+  int count = 0;
+  for (c = dmux->chunks_; c != NULL; c = c->next_) {
+    const uint8_t* const header = mem_buf + c->data_.offset_;
+    if (!memcmp(header, fourcc, TAG_SIZE)) ++count;
+    if (count == chunk_num) break;
+  }
+  return c;
+}
+
+static int SetChunk(const char fourcc[4], int chunk_num,
+                    WebPChunkIterator* const iter) {
+  const WebPDemuxer* const dmux = (WebPDemuxer*)iter->private_;
+  int count;
+
+  if (dmux == NULL || fourcc == NULL || chunk_num < 0) return 0;
+  count = ChunkCount(dmux, fourcc);
+  if (count == 0) return 0;
+  if (chunk_num == 0) chunk_num = count;
+
+  if (chunk_num <= count) {
+    const uint8_t* const mem_buf = dmux->mem_.buf_;
+    const Chunk* const chunk = GetChunk(dmux, fourcc, chunk_num);
+    iter->chunk.bytes = mem_buf + chunk->data_.offset_ + CHUNK_HEADER_SIZE;
+    iter->chunk.size  = chunk->data_.size_ - CHUNK_HEADER_SIZE;
+    iter->num_chunks  = count;
+    iter->chunk_num   = chunk_num;
+    return 1;
+  }
+  return 0;
+}
+
+int WebPDemuxGetChunk(const WebPDemuxer* dmux,
+                      const char fourcc[4], int chunk_num,
+                      WebPChunkIterator* iter) {
+  if (iter == NULL) return 0;
+
+  memset(iter, 0, sizeof(*iter));
+  iter->private_ = (void*)dmux;
+  return SetChunk(fourcc, chunk_num, iter);
+}
+
+int WebPDemuxNextChunk(WebPChunkIterator* iter) {
+  if (iter != NULL) {
+    const char* const fourcc =
+        (const char*)iter->chunk.bytes - CHUNK_HEADER_SIZE;
+    return SetChunk(fourcc, iter->chunk_num + 1, iter);
+  }
+  return 0;
+}
+
+int WebPDemuxPrevChunk(WebPChunkIterator* iter) {
+  if (iter != NULL && iter->chunk_num > 1) {
+    const char* const fourcc =
+        (const char*)iter->chunk.bytes - CHUNK_HEADER_SIZE;
+    return SetChunk(fourcc, iter->chunk_num - 1, iter);
+  }
+  return 0;
+}
+
+void WebPDemuxReleaseChunkIterator(WebPChunkIterator* iter) {
+  (void)iter;
+}
+
+#if defined(__cplusplus) || defined(c_plusplus)
+}  // extern "C"
+#endif
--- a/src/demux/libwebpdemux.pc.in
+++ b/src/demux/libwebpdemux.pc.in
@ -0,0 +1,11 @@
+prefix=@prefix@
+exec_prefix=@exec_prefix@
+libdir=@libdir@
+includedir=@includedir@
+
+Name: libwebpdemux
+Description: Library for parsing the WebP graphics format container
+Version: @PACKAGE_VERSION@
+Requires: libwebp >= 0.2.0
+Cflags: -I${includedir}
+Libs: -L${libdir} -lwebpdemux
--- a/src/dsp/Makefile.am
+++ b/src/dsp/Makefile.am
@ -1,14 +1,44 @@
 AM_CPPFLAGS = -I$(top_srcdir)/src
-
-libwebpdsp_la_SOURCES = dsp.h cpu.c \
-                        enc.c enc_sse2.c \
-                        dec.c dec_sse2.c dec_neon.c \
-                        upsampling.c upsampling_sse2.c \
-                        yuv.h yuv.c
-libwebpdsp_la_LDFLAGS = -version-info 0:0:0 -lm
-libwebpdsp_la_CPPFLAGS = $(USE_EXPERIMENTAL_CODE)
-libwebpdspinclude_HEADERS = ../webp/types.h
-libwebpdspincludedir = $(includedir)/webp
-
-noinst_HEADERS = dsp.h yuv.h
 noinst_LTLIBRARIES = libwebpdsp.la
+
+if BUILD_LIBWEBPDECODER
+  noinst_LTLIBRARIES += libwebpdspdecode.la
+endif
+
+common_HEADERS = ../webp/types.h
+commondir = $(includedir)/webp
+
+COMMON_SOURCES =
+COMMON_SOURCES += cpu.c
+COMMON_SOURCES += dec.c
+COMMON_SOURCES += dec_neon.c
+COMMON_SOURCES += dec_sse2.c
+COMMON_SOURCES += dsp.h
+COMMON_SOURCES += lossless.c
+COMMON_SOURCES += lossless.h
+COMMON_SOURCES += upsampling.c
+COMMON_SOURCES += upsampling_neon.c
+COMMON_SOURCES += upsampling_sse2.c
+COMMON_SOURCES += yuv.c
+COMMON_SOURCES += yuv.h
+
+ENC_SOURCES =
+ENC_SOURCES += enc.c
+ENC_SOURCES += enc_neon.c
+ENC_SOURCES += enc_sse2.c
+
+libwebpdsp_la_SOURCES = $(COMMON_SOURCES) $(ENC_SOURCES)
+
+noinst_HEADERS =
+noinst_HEADERS += ../dec/decode_vp8.h
+noinst_HEADERS += ../webp/decode.h
+
+libwebpdsp_la_LDFLAGS = -lm
+libwebpdsp_la_CPPFLAGS = $(USE_EXPERIMENTAL_CODE) $(USE_SWAP_16BIT_CSP)
+
+if BUILD_LIBWEBPDECODER
+  libwebpdspdecode_la_SOURCES = $(COMMON_SOURCES)
+
+  libwebpdspdecode_la_LDFLAGS = $(libwebpdsp_la_LDFLAGS)
+  libwebpdspdecode_la_CPPFLAGS = $(libwebpdsp_la_CPPFLAGS)
+endif
--- a/src/dsp/cpu.c
+++ b/src/dsp/cpu.c
@ -1,18 +1,22 @@
-// Copyright 2011 Google Inc.
+// Copyright 2011 Google Inc. All Rights Reserved.
 //
-// This code is licensed under the same terms as WebM:
-//  Software License Agreement:  http://www.webmproject.org/license/software/
-//  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
 // -----------------------------------------------------------------------------
 //
 // CPU detection
 //
 // Author: Christian Duvivier (cduvivier@google.com)

-#include <stddef.h>  // for NULL
-
 #include "./dsp.h"

+#if defined(__ANDROID__)
+#include <cpu-features.h>
+#endif
+
 #if defined(__cplusplus) || defined(c_plusplus)
 extern "C" {
 #endif
@ -21,8 +25,9 @@ extern "C" {
 // SSE2 detection.
 //

-#if defined(__pic__) && defined(__i386__)
-static inline void GetCPUInfo(int cpu_info[4], int info_type) {
+// apple/darwin gcc-4.0.1 defines __PIC__, but not __pic__ with -fPIC.
+#if (defined(__pic__) || defined(__PIC__)) && defined(__i386__)
+static WEBP_INLINE void GetCPUInfo(int cpu_info[4], int info_type) {
  __asm__ volatile (
    "mov %%ebx, %%edi\n"
    "cpuid\n"
@ -31,17 +36,17 @@ static inline void GetCPUInfo(int cpu_info[4], int info_type) {
    : "a"(info_type));
 }
 #elif defined(__i386__) || defined(__x86_64__)
-static inline void GetCPUInfo(int cpu_info[4], int info_type) {
+static WEBP_INLINE void GetCPUInfo(int cpu_info[4], int info_type) {
  __asm__ volatile (
    "cpuid\n"
    : "=a"(cpu_info[0]), "=b"(cpu_info[1]), "=c"(cpu_info[2]), "=d"(cpu_info[3])
    : "a"(info_type));
 }
-#elif defined(_MSC_VER)  // Visual C++
+#elif defined(WEBP_MSC_SSE2)
 #define GetCPUInfo __cpuid
 #endif

-#if defined(__i386__) || defined(__x86_64__) || defined(_MSC_VER)
+#if defined(__i386__) || defined(__x86_64__) || defined(WEBP_MSC_SSE2)
 static int x86CPUInfo(CPUFeature feature) {
  int cpu_info[4];
  GetCPUInfo(cpu_info, 1);
@ -54,10 +59,22 @@ static int x86CPUInfo(CPUFeature feature) {
  return 0;
 }
 VP8CPUInfo VP8GetCPUInfo = x86CPUInfo;
+#elif defined(WEBP_ANDROID_NEON)
+static int AndroidCPUInfo(CPUFeature feature) {
+  const AndroidCpuFamily cpu_family = android_getCpuFamily();
+  const uint64_t cpu_features = android_getCpuFeatures();
+  if (feature == kNEON) {
+    return (cpu_family == ANDROID_CPU_FAMILY_ARM &&
+            0 != (cpu_features & ANDROID_CPU_ARM_FEATURE_NEON));
+  }
+  return 0;
+}
+VP8CPUInfo VP8GetCPUInfo = AndroidCPUInfo;
 #elif defined(__ARM_NEON__)
 // define a dummy function to enable turning off NEON at runtime by setting
 // VP8DecGetCPUInfo = NULL
 static int armCPUInfo(CPUFeature feature) {
+  (void)feature;
  return 1;
 }
 VP8CPUInfo VP8GetCPUInfo = armCPUInfo;
--- a/src/dsp/dec.c
+++ b/src/dsp/dec.c
@ -1,8 +1,10 @@
-// Copyright 2010 Google Inc.
+// Copyright 2010 Google Inc. All Rights Reserved.
 //
-// This code is licensed under the same terms as WebM:
-//  Software License Agreement:  http://www.webmproject.org/license/software/
-//  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
 // -----------------------------------------------------------------------------
 //
 // Speed-critical decoding functions.
@ -49,7 +51,7 @@ static void DspInitTables(void) {
  }
 }

-static inline uint8_t clip_8b(int v) {
+static WEBP_INLINE uint8_t clip_8b(int v) {
  return (!(v & ~0xff)) ? v : (v < 0) ? 0 : 255;
 }

@ -171,7 +173,7 @@ void (*VP8TransformWHT)(const int16_t* in, int16_t* out) = TransformWHT;

 #define DST(x, y) dst[(x) + (y) * BPS]

-static inline void TrueMotion(uint8_t *dst, int size) {
+static WEBP_INLINE void TrueMotion(uint8_t *dst, int size) {
  const uint8_t* top = dst - BPS;
  const uint8_t* const clip0 = clip1 + 255 - top[-1];
  int y;
@ -206,7 +208,7 @@ static void HE16(uint8_t *dst) {     // horizontal
  }
 }

-static inline void Put16(int v, uint8_t* dst) {
+static WEBP_INLINE void Put16(int v, uint8_t* dst) {
  int j;
  for (j = 0; j < 16; ++j) {
    memset(dst + j * BPS, v, 16);
@ -426,11 +428,16 @@ static void HE8uv(uint8_t *dst) {    // horizontal
 }

 // helper for chroma-DC predictions
-static inline void Put8x8uv(uint64_t v, uint8_t* dst) {
+static WEBP_INLINE void Put8x8uv(uint8_t value, uint8_t* dst) {
  int j;
+#ifndef WEBP_REFERENCE_IMPLEMENTATION
+  const uint64_t v = (uint64_t)value * 0x0101010101010101ULL;
  for (j = 0; j < 8; ++j) {
    *(uint64_t*)(dst + j * BPS) = v;
  }
+#else
+  for (j = 0; j < 8; ++j) memset(dst + j * BPS, value, 8);
+#endif
 }

 static void DC8uv(uint8_t *dst) {     // DC
@ -439,7 +446,7 @@ static void DC8uv(uint8_t *dst) {     // DC
  for (i = 0; i < 8; ++i) {
    dc0 += dst[i - BPS] + dst[-1 + i * BPS];
  }
-  Put8x8uv((uint64_t)((dc0 >> 4) * 0x0101010101010101ULL), dst);
+  Put8x8uv(dc0 >> 4, dst);
 }

 static void DC8uvNoLeft(uint8_t *dst) {   // DC with no left samples
@ -448,7 +455,7 @@ static void DC8uvNoLeft(uint8_t *dst) {   // DC with no left samples
  for (i = 0; i < 8; ++i) {
    dc0 += dst[i - BPS];
  }
-  Put8x8uv((uint64_t)((dc0 >> 3) * 0x0101010101010101ULL), dst);
+  Put8x8uv(dc0 >> 3, dst);
 }

 static void DC8uvNoTop(uint8_t *dst) {  // DC with no top samples
@ -457,26 +464,26 @@ static void DC8uvNoTop(uint8_t *dst) {  // DC with no top samples
  for (i = 0; i < 8; ++i) {
    dc0 += dst[-1 + i * BPS];
  }
-  Put8x8uv((uint64_t)((dc0 >> 3) * 0x0101010101010101ULL), dst);
+  Put8x8uv(dc0 >> 3, dst);
 }

 static void DC8uvNoTopLeft(uint8_t *dst) {    // DC with nothing
-  Put8x8uv(0x8080808080808080ULL, dst);
+  Put8x8uv(0x80, dst);
 }

 //------------------------------------------------------------------------------
 // default C implementations

-VP8PredFunc VP8PredLuma4[/* NUM_BMODES */] = {
+const VP8PredFunc VP8PredLuma4[NUM_BMODES] = {
  DC4, TM4, VE4, HE4, RD4, VR4, LD4, VL4, HD4, HU4
 };

-VP8PredFunc VP8PredLuma16[/*NUM_B_DC_MODES */] = {
+const VP8PredFunc VP8PredLuma16[NUM_B_DC_MODES] = {
  DC16, TM16, VE16, HE16,
  DC16NoTop, DC16NoLeft, DC16NoTopLeft
 };

-VP8PredFunc VP8PredChroma8[/*NUM_B_DC_MODES */] = {
+const VP8PredFunc VP8PredChroma8[NUM_B_DC_MODES] = {
  DC8uv, TM8uv, VE8uv, HE8uv,
  DC8uvNoTop, DC8uvNoLeft, DC8uvNoTopLeft
 };
@ -485,7 +492,7 @@ VP8PredFunc VP8PredChroma8[/*NUM_B_DC_MODES */] = {
 // Edge filtering functions

 // 4 pixels in, 2 pixels out
-static inline void do_filter2(uint8_t* p, int step) {
+static WEBP_INLINE void do_filter2(uint8_t* p, int step) {
  const int p1 = p[-2*step], p0 = p[-step], q0 = p[0], q1 = p[step];
  const int a = 3 * (q0 - p0) + sclip1[1020 + p1 - q1];
  const int a1 = sclip2[112 + ((a + 4) >> 3)];
@ -495,7 +502,7 @@ static inline void do_filter2(uint8_t* p, int step) {
 }

 // 4 pixels in, 4 pixels out
-static inline void do_filter4(uint8_t* p, int step) {
+static WEBP_INLINE void do_filter4(uint8_t* p, int step) {
  const int p1 = p[-2*step], p0 = p[-step], q0 = p[0], q1 = p[step];
  const int a = 3 * (q0 - p0);
  const int a1 = sclip2[112 + ((a + 4) >> 3)];
@ -508,7 +515,7 @@ static inline void do_filter4(uint8_t* p, int step) {
 }

 // 6 pixels in, 6 pixels out
-static inline void do_filter6(uint8_t* p, int step) {
+static WEBP_INLINE void do_filter6(uint8_t* p, int step) {
  const int p2 = p[-3*step], p1 = p[-2*step], p0 = p[-step];
  const int q0 = p[0], q1 = p[step], q2 = p[2*step];
  const int a = sclip1[1020 + 3 * (q0 - p0) + sclip1[1020 + p1 - q1]];
@ -523,17 +530,18 @@ static inline void do_filter6(uint8_t* p, int step) {
  p[ 2*step] = clip1[255 + q2 - a3];
 }

-static inline int hev(const uint8_t* p, int step, int thresh) {
+static WEBP_INLINE int hev(const uint8_t* p, int step, int thresh) {
  const int p1 = p[-2*step], p0 = p[-step], q0 = p[0], q1 = p[step];
  return (abs0[255 + p1 - p0] > thresh) || (abs0[255 + q1 - q0] > thresh);
 }

-static inline int needs_filter(const uint8_t* p, int step, int thresh) {
+static WEBP_INLINE int needs_filter(const uint8_t* p, int step, int thresh) {
  const int p1 = p[-2*step], p0 = p[-step], q0 = p[0], q1 = p[step];
  return (2 * abs0[255 + p0 - q0] + abs1[255 + p1 - q1]) <= thresh;
 }

-static inline int needs_filter2(const uint8_t* p, int step, int t, int it) {
+static WEBP_INLINE int needs_filter2(const uint8_t* p,
+                                     int step, int t, int it) {
  const int p3 = p[-4*step], p2 = p[-3*step], p1 = p[-2*step], p0 = p[-step];
  const int q0 = p[0], q1 = p[step], q2 = p[2*step], q3 = p[3*step];
  if ((2 * abs0[255 + p0 - q0] + abs1[255 + p1 - q1]) > t)
@ -583,8 +591,9 @@ static void SimpleHFilter16i(uint8_t* p, int stride, int thresh) {
 //------------------------------------------------------------------------------
 // Complex In-loop filtering (Paragraph 15.3)

-static inline void FilterLoop26(uint8_t* p, int hstride, int vstride, int size,
-                                int thresh, int ithresh, int hev_thresh) {
+static WEBP_INLINE void FilterLoop26(uint8_t* p,
+                                     int hstride, int vstride, int size,
+                                     int thresh, int ithresh, int hev_thresh) {
  while (size-- > 0) {
    if (needs_filter2(p, hstride, thresh, ithresh)) {
      if (hev(p, hstride, hev_thresh)) {
@ -597,8 +606,9 @@ static inline void FilterLoop26(uint8_t* p, int hstride, int vstride, int size,
  }
 }

-static inline void FilterLoop24(uint8_t* p, int hstride, int vstride, int size,
-                                int thresh, int ithresh, int hev_thresh) {
+static WEBP_INLINE void FilterLoop24(uint8_t* p,
+                                     int hstride, int vstride, int size,
+                                     int thresh, int ithresh, int hev_thresh) {
  while (size-- > 0) {
    if (needs_filter2(p, hstride, thresh, ithresh)) {
      if (hev(p, hstride, hev_thresh)) {
@ -712,11 +722,11 @@ void VP8DspInit(void) {

  // If defined, use CPUInfo() to overwrite some pointers with faster versions.
  if (VP8GetCPUInfo) {
-#if defined(__SSE2__) || defined(_MSC_VER)
+#if defined(WEBP_USE_SSE2)
    if (VP8GetCPUInfo(kSSE2)) {
      VP8DspInitSSE2();
    }
-#elif defined(__GNUC__) && defined(__ARM_NEON__)
+#elif defined(WEBP_USE_NEON)
    if (VP8GetCPUInfo(kNEON)) {
      VP8DspInitNEON();
    }
--- a/src/dsp/dec_neon.c
+++ b/src/dsp/dec_neon.c
@ -1,22 +1,27 @@
-// Copyright 2011 Google Inc.
+// Copyright 2012 Google Inc. All Rights Reserved.
 //
-// This code is licensed under the same terms as WebM:
-//  Software License Agreement:  http://www.webmproject.org/license/software/
-//  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
 // -----------------------------------------------------------------------------
 //
 // ARM NEON version of dsp functions and loop filtering.
 //
-// Author: somnath@google.com (Somnath Banerjee)
+// Authors: Somnath Banerjee (somnath@google.com)
+//          Johann Koenig (johannkoenig@google.com)

-#if defined(__GNUC__) && defined(__ARM_NEON__)
-
-#include "../dec/vp8i.h"
+#include "./dsp.h"

 #if defined(__cplusplus) || defined(c_plusplus)
 extern "C" {
 #endif

+#if defined(WEBP_USE_NEON)
+
+#include "../dec/vp8i.h"
+
 #define QRegs "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",                  \
              "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"

@ -76,7 +81,7 @@ extern "C" {
  "vld4.8   {" #c1"[6], " #c2"[6], " #c3"[6], " #c4"[6]}," #b1 "," #stride"\n" \
  "vld4.8   {" #c1"[7], " #c2"[7], " #c3"[7], " #c4"[7]}," #b2 "," #stride"\n"

-#define STORE8x2(c1, c2, p,stride)                                             \
+#define STORE8x2(c1, c2, p, stride)                                            \
  "vst2.8   {" #c1"[0], " #c2"[0]}," #p "," #stride " \n"                      \
  "vst2.8   {" #c1"[1], " #c2"[1]}," #p "," #stride " \n"                      \
  "vst2.8   {" #c1"[2], " #c2"[2]}," #p "," #stride " \n"                      \
@ -152,17 +157,251 @@ static void SimpleHFilter16iNEON(uint8_t* p, int stride, int thresh) {
  }
 }

+//-----------------------------------------------------------------------------
+// Inverse transforms (Paragraph 14.4)
+
+static void TransformOneNEON(const int16_t *in, uint8_t *dst) {
+  const int kBPS = BPS;
+  const int16_t constants[] = {20091, 17734, 0, 0};
+  /* kC1, kC2. Padded because vld1.16 loads 8 bytes
+   * Technically these are unsigned but vqdmulh is only available in signed.
+   * vqdmulh returns high half (effectively >> 16) but also doubles the value,
+   * changing the >> 16 to >> 15 and requiring an additional >> 1.
+   * We use this to our advantage with kC2. The canonical value is 35468.
+   * However, the high bit is set so treating it as signed will give incorrect
+   * results. We avoid this by down shifting by 1 here to clear the highest bit.
+   * Combined with the doubling effect of vqdmulh we get >> 16.
+   * This can not be applied to kC1 because the lowest bit is set. Down shifting
+   * the constant would reduce precision.
+   */
+
+  /* libwebp uses a trick to avoid some extra addition that libvpx does.
+   * Instead of:
+   * temp2 = ip[12] + ((ip[12] * cospi8sqrt2minus1) >> 16);
+   * libwebp adds 1 << 16 to cospi8sqrt2minus1 (kC1). However, this causes the
+   * same issue with kC1 and vqdmulh that we work around by down shifting kC2
+   */
+
+  /* Adapted from libvpx: vp8/common/arm/neon/shortidct4x4llm_neon.asm */
+  __asm__ volatile (
+    "vld1.16         {q1, q2}, [%[in]]           \n"
+    "vld1.16         {d0}, [%[constants]]        \n"
+
+    /* d2: in[0]
+     * d3: in[8]
+     * d4: in[4]
+     * d5: in[12]
+     */
+    "vswp            d3, d4                      \n"
+
+    /* q8 = {in[4], in[12]} * kC1 * 2 >> 16
+     * q9 = {in[4], in[12]} * kC2 >> 16
+     */
+    "vqdmulh.s16     q8, q2, d0[0]               \n"
+    "vqdmulh.s16     q9, q2, d0[1]               \n"
+
+    /* d22 = a = in[0] + in[8]
+     * d23 = b = in[0] - in[8]
+     */
+    "vqadd.s16       d22, d2, d3                 \n"
+    "vqsub.s16       d23, d2, d3                 \n"
+
+    /* The multiplication should be x * kC1 >> 16
+     * However, with vqdmulh we get x * kC1 * 2 >> 16
+     * (multiply, double, return high half)
+     * We avoided this in kC2 by pre-shifting the constant.
+     * q8 = in[4]/[12] * kC1 >> 16
+     */
+    "vshr.s16        q8, q8, #1                  \n"
+
+    /* Add {in[4], in[12]} back after the multiplication. This is handled by
+     * adding 1 << 16 to kC1 in the libwebp C code.
+     */
+    "vqadd.s16       q8, q2, q8                  \n"
+
+    /* d20 = c = in[4]*kC2 - in[12]*kC1
+     * d21 = d = in[4]*kC1 + in[12]*kC2
+     */
+    "vqsub.s16       d20, d18, d17               \n"
+    "vqadd.s16       d21, d19, d16               \n"
+
+    /* d2 = tmp[0] = a + d
+     * d3 = tmp[1] = b + c
+     * d4 = tmp[2] = b - c
+     * d5 = tmp[3] = a - d
+     */
+    "vqadd.s16       d2, d22, d21                \n"
+    "vqadd.s16       d3, d23, d20                \n"
+    "vqsub.s16       d4, d23, d20                \n"
+    "vqsub.s16       d5, d22, d21                \n"
+
+    "vzip.16         q1, q2                      \n"
+    "vzip.16         q1, q2                      \n"
+
+    "vswp            d3, d4                      \n"
+
+    /* q8 = {tmp[4], tmp[12]} * kC1 * 2 >> 16
+     * q9 = {tmp[4], tmp[12]} * kC2 >> 16
+     */
+    "vqdmulh.s16     q8, q2, d0[0]               \n"
+    "vqdmulh.s16     q9, q2, d0[1]               \n"
+
+    /* d22 = a = tmp[0] + tmp[8]
+     * d23 = b = tmp[0] - tmp[8]
+     */
+    "vqadd.s16       d22, d2, d3                 \n"
+    "vqsub.s16       d23, d2, d3                 \n"
+
+    /* See long winded explanations prior */
+    "vshr.s16        q8, q8, #1                  \n"
+    "vqadd.s16       q8, q2, q8                  \n"
+
+    /* d20 = c = in[4]*kC2 - in[12]*kC1
+     * d21 = d = in[4]*kC1 + in[12]*kC2
+     */
+    "vqsub.s16       d20, d18, d17               \n"
+    "vqadd.s16       d21, d19, d16               \n"
+
+    /* d2 = tmp[0] = a + d
+     * d3 = tmp[1] = b + c
+     * d4 = tmp[2] = b - c
+     * d5 = tmp[3] = a - d
+     */
+    "vqadd.s16       d2, d22, d21                \n"
+    "vqadd.s16       d3, d23, d20                \n"
+    "vqsub.s16       d4, d23, d20                \n"
+    "vqsub.s16       d5, d22, d21                \n"
+
+    "vld1.32         d6[0], [%[dst]], %[kBPS]    \n"
+    "vld1.32         d6[1], [%[dst]], %[kBPS]    \n"
+    "vld1.32         d7[0], [%[dst]], %[kBPS]    \n"
+    "vld1.32         d7[1], [%[dst]], %[kBPS]    \n"
+
+    "sub         %[dst], %[dst], %[kBPS], lsl #2 \n"
+
+    /* (val) + 4 >> 3 */
+    "vrshr.s16       d2, d2, #3                  \n"
+    "vrshr.s16       d3, d3, #3                  \n"
+    "vrshr.s16       d4, d4, #3                  \n"
+    "vrshr.s16       d5, d5, #3                  \n"
+
+    "vzip.16         q1, q2                      \n"
+    "vzip.16         q1, q2                      \n"
+
+    /* Must accumulate before saturating */
+    "vmovl.u8        q8, d6                      \n"
+    "vmovl.u8        q9, d7                      \n"
+
+    "vqadd.s16       q1, q1, q8                  \n"
+    "vqadd.s16       q2, q2, q9                  \n"
+
+    "vqmovun.s16     d0, q1                      \n"
+    "vqmovun.s16     d1, q2                      \n"
+
+    "vst1.32         d0[0], [%[dst]], %[kBPS]    \n"
+    "vst1.32         d0[1], [%[dst]], %[kBPS]    \n"
+    "vst1.32         d1[0], [%[dst]], %[kBPS]    \n"
+    "vst1.32         d1[1], [%[dst]]             \n"
+
+    : [in] "+r"(in), [dst] "+r"(dst)  /* modified registers */
+    : [kBPS] "r"(kBPS), [constants] "r"(constants)  /* constants */
+    : "memory", "q0", "q1", "q2", "q8", "q9", "q10", "q11"  /* clobbered */
+  );
+}
+
+static void TransformTwoNEON(const int16_t* in, uint8_t* dst, int do_two) {
+  TransformOneNEON(in, dst);
+  if (do_two) {
+    TransformOneNEON(in + 16, dst + 4);
+  }
+}
+
+static void TransformWHT(const int16_t* in, int16_t* out) {
+  const int kStep = 32;  // The store is only incrementing the pointer as if we
+                         // had stored a single byte.
+  __asm__ volatile (
+    // part 1
+    // load data into q0, q1
+    "vld1.16         {q0, q1}, [%[in]]           \n"
+
+    "vaddl.s16       q2, d0, d3                  \n" // a0 = in[0] + in[12]
+    "vaddl.s16       q3, d1, d2                  \n" // a1 = in[4] + in[8]
+    "vsubl.s16       q4, d1, d2                  \n" // a2 = in[4] - in[8]
+    "vsubl.s16       q5, d0, d3                  \n" // a3 = in[0] - in[12]
+
+    "vadd.s32        q0, q2, q3                  \n" // tmp[0] = a0 + a1
+    "vsub.s32        q2, q2, q3                  \n" // tmp[8] = a0 - a1
+    "vadd.s32        q1, q5, q4                  \n" // tmp[4] = a3 + a2
+    "vsub.s32        q3, q5, q4                  \n" // tmp[12] = a3 - a2
+
+    // Transpose
+    // q0 = tmp[0, 4, 8, 12], q1 = tmp[2, 6, 10, 14]
+    // q2 = tmp[1, 5, 9, 13], q3 = tmp[3, 7, 11, 15]
+    "vswp            d1, d4                      \n" // vtrn.64 q0, q2
+    "vswp            d3, d6                      \n" // vtrn.64 q1, q3
+    "vtrn.32         q0, q1                      \n"
+    "vtrn.32         q2, q3                      \n"
+
+    "vmov.s32        q4, #3                      \n" // dc = 3
+    "vadd.s32        q0, q0, q4                  \n" // dc = tmp[0] + 3
+    "vadd.s32        q6, q0, q3                  \n" // a0 = dc + tmp[3]
+    "vadd.s32        q7, q1, q2                  \n" // a1 = tmp[1] + tmp[2]
+    "vsub.s32        q8, q1, q2                  \n" // a2 = tmp[1] - tmp[2]
+    "vsub.s32        q9, q0, q3                  \n" // a3 = dc - tmp[3]
+
+    "vadd.s32        q0, q6, q7                  \n"
+    "vshrn.s32       d0, q0, #3                  \n" // (a0 + a1) >> 3
+    "vadd.s32        q1, q9, q8                  \n"
+    "vshrn.s32       d1, q1, #3                  \n" // (a3 + a2) >> 3
+    "vsub.s32        q2, q6, q7                  \n"
+    "vshrn.s32       d2, q2, #3                  \n" // (a0 - a1) >> 3
+    "vsub.s32        q3, q9, q8                  \n"
+    "vshrn.s32       d3, q3, #3                  \n" // (a3 - a2) >> 3
+
+    // set the results to output
+    "vst1.16         d0[0], [%[out]], %[kStep]   \n"
+    "vst1.16         d1[0], [%[out]], %[kStep]   \n"
+    "vst1.16         d2[0], [%[out]], %[kStep]   \n"
+    "vst1.16         d3[0], [%[out]], %[kStep]   \n"
+    "vst1.16         d0[1], [%[out]], %[kStep]   \n"
+    "vst1.16         d1[1], [%[out]], %[kStep]   \n"
+    "vst1.16         d2[1], [%[out]], %[kStep]   \n"
+    "vst1.16         d3[1], [%[out]], %[kStep]   \n"
+    "vst1.16         d0[2], [%[out]], %[kStep]   \n"
+    "vst1.16         d1[2], [%[out]], %[kStep]   \n"
+    "vst1.16         d2[2], [%[out]], %[kStep]   \n"
+    "vst1.16         d3[2], [%[out]], %[kStep]   \n"
+    "vst1.16         d0[3], [%[out]], %[kStep]   \n"
+    "vst1.16         d1[3], [%[out]], %[kStep]   \n"
+    "vst1.16         d2[3], [%[out]], %[kStep]   \n"
+    "vst1.16         d3[3], [%[out]], %[kStep]   \n"
+
+    : [out] "+r"(out)  // modified registers
+    : [in] "r"(in), [kStep] "r"(kStep)  // constants
+    : "memory", "q0", "q1", "q2", "q3", "q4",
+      "q5", "q6", "q7", "q8", "q9"  // clobbered
+  );
+}
+
+#endif   // WEBP_USE_NEON
+
+//------------------------------------------------------------------------------
+// Entry point
+
 extern void VP8DspInitNEON(void);

 void VP8DspInitNEON(void) {
+#if defined(WEBP_USE_NEON)
+  VP8Transform = TransformTwoNEON;
+  VP8TransformWHT = TransformWHT;
+
  VP8SimpleVFilter16 = SimpleVFilter16NEON;
  VP8SimpleHFilter16 = SimpleHFilter16NEON;
  VP8SimpleVFilter16i = SimpleVFilter16iNEON;
  VP8SimpleHFilter16i = SimpleHFilter16iNEON;
+#endif   // WEBP_USE_NEON
 }

 #if defined(__cplusplus) || defined(c_plusplus)
 }    // extern "C"
 #endif
-
-#endif   // __GNUC__ && __ARM_NEON__
--- a/src/dsp/dec_sse2.c
+++ b/src/dsp/dec_sse2.c
@ -1,8 +1,10 @@
-// Copyright 2011 Google Inc.
+// Copyright 2011 Google Inc. All Rights Reserved.
 //
-// This code is licensed under the same terms as WebM:
-//  Software License Agreement:  http://www.webmproject.org/license/software/
-//  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
 // -----------------------------------------------------------------------------
 //
 // SSE2 version of some decoding functions (idct, loop filtering).
@ -10,15 +12,17 @@
 // Author: somnath@google.com (Somnath Banerjee)
 //         cduvivier@google.com (Christian Duvivier)

-#if defined(__SSE2__) || defined(_MSC_VER)
-
-#include <emmintrin.h>
-#include "../dec/vp8i.h"
+#include "./dsp.h"

 #if defined(__cplusplus) || defined(c_plusplus)
 extern "C" {
 #endif

+#if defined(WEBP_USE_SSE2)
+
+#include <emmintrin.h>
+#include "../dec/vp8i.h"
+
 //------------------------------------------------------------------------------
 // Transforms (Paragraph 14.4)

@ -192,7 +196,7 @@ static void TransformSSE2(const int16_t* in, uint8_t* dst, int do_two) {

  // Add inverse transform to 'dst' and store.
  {
-    const __m128i zero = _mm_set1_epi16(0);
+    const __m128i zero = _mm_setzero_si128();
    // Load the reference(s).
    __m128i dst0, dst1, dst2, dst3;
    if (do_two) {
@ -276,14 +280,14 @@ static void TransformSSE2(const int16_t* in, uint8_t* dst, int do_two) {

 #define GET_NOTHEV(p1, p0, q0, q1, hev_thresh, not_hev) {                      \
  const __m128i zero = _mm_setzero_si128();                                    \
-  const __m128i t1 = MM_ABS(p1, p0);                                           \
-  const __m128i t2 = MM_ABS(q1, q0);                                           \
+  const __m128i t_1 = MM_ABS(p1, p0);                                          \
+  const __m128i t_2 = MM_ABS(q1, q0);                                          \
                                                                               \
  const __m128i h = _mm_set1_epi8(hev_thresh);                                 \
-  const __m128i t3 = _mm_subs_epu8(t1, h);  /* abs(p1 - p0) - hev_tresh */     \
-  const __m128i t4 = _mm_subs_epu8(t2, h);  /* abs(q1 - q0) - hev_tresh */     \
+  const __m128i t_3 = _mm_subs_epu8(t_1, h);  /* abs(p1 - p0) - hev_tresh */   \
+  const __m128i t_4 = _mm_subs_epu8(t_2, h);  /* abs(q1 - q0) - hev_tresh */   \
                                                                               \
-  not_hev = _mm_or_si128(t3, t4);                                              \
+  not_hev = _mm_or_si128(t_3, t_4);                                            \
  not_hev = _mm_cmpeq_epi8(not_hev, zero); /* not_hev <= t1 && not_hev <= t2 */\
 }

@ -312,13 +316,13 @@ static void TransformSSE2(const int16_t* in, uint8_t* dst, int do_two) {

 // Updates values of 2 pixels at MB edge during complex filtering.
 // Update operations:
-// q = q - a and p = p + a; where a = [(a_hi >> 7), (a_lo >> 7)]
+// q = q - delta and p = p + delta; where delta = [(a_hi >> 7), (a_lo >> 7)]
 #define UPDATE_2PIXELS(pi, qi, a_lo, a_hi) {                                   \
  const __m128i a_lo7 = _mm_srai_epi16(a_lo, 7);                               \
  const __m128i a_hi7 = _mm_srai_epi16(a_hi, 7);                               \
-  const __m128i a = _mm_packs_epi16(a_lo7, a_hi7);                             \
-  pi = _mm_adds_epi8(pi, a);                                                   \
-  qi = _mm_subs_epi8(qi, a);                                                   \
+  const __m128i delta = _mm_packs_epi16(a_lo7, a_hi7);                         \
+  pi = _mm_adds_epi8(pi, delta);                                               \
+  qi = _mm_subs_epi8(qi, delta);                                               \
 }

 static void NeedsFilter(const __m128i* p1, const __m128i* p0, const __m128i* q0,
@ -341,8 +345,8 @@ static void NeedsFilter(const __m128i* p1, const __m128i* p0, const __m128i* q0,
 // Edge filtering functions

 // Applies filter on 2 pixels (p0 and q0)
-static inline void DoFilter2(const __m128i* p1, __m128i* p0, __m128i* q0,
-                             const __m128i* q1, int thresh) {
+static WEBP_INLINE void DoFilter2(const __m128i* p1, __m128i* p0, __m128i* q0,
+                                  const __m128i* q1, int thresh) {
  __m128i a, mask;
  const __m128i sign_bit = _mm_set1_epi8(0x80);
  const __m128i p1s = _mm_xor_si128(*p1, sign_bit);
@ -362,8 +366,9 @@ static inline void DoFilter2(const __m128i* p1, __m128i* p0, __m128i* q0,
 }

 // Applies filter on 4 pixels (p1, p0, q0 and q1)
-static inline void DoFilter4(__m128i* p1, __m128i *p0, __m128i* q0, __m128i* q1,
-                             const __m128i* mask, int hev_thresh) {
+static WEBP_INLINE void DoFilter4(__m128i* p1, __m128i *p0,
+                                  __m128i* q0, __m128i* q1,
+                                  const __m128i* mask, int hev_thresh) {
  __m128i not_hev;
  __m128i t1, t2, t3;
  const __m128i sign_bit = _mm_set1_epi8(0x80);
@ -408,9 +413,9 @@ static inline void DoFilter4(__m128i* p1, __m128i *p0, __m128i* q0, __m128i* q1,
 }

 // Applies filter on 6 pixels (p2, p1, p0, q0, q1 and q2)
-static inline void DoFilter6(__m128i *p2, __m128i* p1, __m128i *p0,
-                             __m128i* q0, __m128i* q1, __m128i *q2,
-                             const __m128i* mask, int hev_thresh) {
+static WEBP_INLINE void DoFilter6(__m128i *p2, __m128i* p1, __m128i *p0,
+                                  __m128i* q0, __m128i* q1, __m128i *q2,
+                                  const __m128i* mask, int hev_thresh) {
  __m128i a, not_hev;
  const __m128i sign_bit = _mm_set1_epi8(0x80);

@ -466,8 +471,8 @@ static inline void DoFilter6(__m128i *p2, __m128i* p1, __m128i *p0,
 //
 // TODO(somnath): Investigate _mm_shuffle* also see if it can be broken into
 // two Load4x4() to avoid code duplication.
-static inline void Load8x4(const uint8_t* b, int stride,
-                           __m128i* p, __m128i* q) {
+static WEBP_INLINE void Load8x4(const uint8_t* b, int stride,
+                                __m128i* p, __m128i* q) {
  __m128i t1, t2;

  // Load 0th, 1st, 4th and 5th rows
@ -506,9 +511,10 @@ static inline void Load8x4(const uint8_t* b, int stride,
  *q = _mm_unpackhi_epi32(t1, t2);
 }

-static inline void Load16x4(const uint8_t* r0, const uint8_t* r8, int stride,
-                            __m128i* p1, __m128i* p0,
-                            __m128i* q0, __m128i* q1) {
+static WEBP_INLINE void Load16x4(const uint8_t* r0, const uint8_t* r8,
+                                 int stride,
+                                 __m128i* p1, __m128i* p0,
+                                 __m128i* q0, __m128i* q1) {
  __m128i t1, t2;
  // Assume the pixels around the edge (|) are numbered as follows
  //                00 01 | 02 03
@ -540,7 +546,7 @@ static inline void Load16x4(const uint8_t* r0, const uint8_t* r8, int stride,
  *q1 = _mm_unpackhi_epi64(t2, *q1);
 }

-static inline void Store4x4(__m128i* x, uint8_t* dst, int stride) {
+static WEBP_INLINE void Store4x4(__m128i* x, uint8_t* dst, int stride) {
  int i;
  for (i = 0; i < 4; ++i, dst += stride) {
    *((int32_t*)dst) = _mm_cvtsi128_si32(*x);
@ -549,8 +555,9 @@ static inline void Store4x4(__m128i* x, uint8_t* dst, int stride) {
 }

 // Transpose back and store
-static inline void Store16x4(uint8_t* r0, uint8_t* r8, int stride, __m128i* p1,
-                             __m128i* p0, __m128i* q0, __m128i* q1) {
+static WEBP_INLINE void Store16x4(uint8_t* r0, uint8_t* r8, int stride,
+                                  __m128i* p1, __m128i* p0,
+                                  __m128i* q0, __m128i* q1) {
  __m128i t1;

  // p0 = 71 70 61 60 51 50 41 40 31 30 21 20 11 10 01 00
@ -871,9 +878,15 @@ static void HFilter8iSSE2(uint8_t* u, uint8_t* v, int stride,
  Store16x4(u, v, stride, &p1, &p0, &q0, &q1);
 }

+#endif   // WEBP_USE_SSE2
+
+//------------------------------------------------------------------------------
+// Entry point
+
 extern void VP8DspInitSSE2(void);

 void VP8DspInitSSE2(void) {
+#if defined(WEBP_USE_SSE2)
  VP8Transform = TransformSSE2;

  VP8VFilter16 = VFilter16SSE2;
@ -889,10 +902,9 @@ void VP8DspInitSSE2(void) {
  VP8SimpleHFilter16 = SimpleHFilter16SSE2;
  VP8SimpleVFilter16i = SimpleVFilter16iSSE2;
  VP8SimpleHFilter16i = SimpleHFilter16iSSE2;
+#endif   // WEBP_USE_SSE2
 }

 #if defined(__cplusplus) || defined(c_plusplus)
 }    // extern "C"
 #endif
-
-#endif   //__SSE2__ || _MSC_VER
--- a/src/dsp/dsp.h
+++ b/src/dsp/dsp.h
@ -1,8 +1,10 @@
-// Copyright 2011 Google Inc.
+// Copyright 2011 Google Inc. All Rights Reserved.
 //
-// This code is licensed under the same terms as WebM:
-//  Software License Agreement:  http://www.webmproject.org/license/software/
-//  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
 // -----------------------------------------------------------------------------
 //
 //   Speed-critical functions.
@ -21,6 +23,22 @@ extern "C" {
 //------------------------------------------------------------------------------
 // CPU detection

+#if defined(_MSC_VER) && (defined(_M_X64) || defined(_M_IX86))
+#define WEBP_MSC_SSE2  // Visual C++ SSE2 targets
+#endif
+
+#if defined(__SSE2__) || defined(WEBP_MSC_SSE2)
+#define WEBP_USE_SSE2
+#endif
+
+#if defined(__ANDROID__) && defined(__ARM_ARCH_7A__)
+#define WEBP_ANDROID_NEON  // Android targets that might support NEON
+#endif
+
+#if defined(__ARM_NEON__) || defined(WEBP_ANDROID_NEON)
+#define WEBP_USE_NEON
+#endif
+
 typedef enum {
  kSSE2,
  kSSE3,
@ -33,8 +51,6 @@ extern VP8CPUInfo VP8GetCPUInfo;
 //------------------------------------------------------------------------------
 // Encoding

-int VP8GetAlpha(const int histo[]);
-
 // Transforms
 // VP8Idct: Does one of two inverse transforms. If do_two is set, the transforms
 //          will be done for (ref, in, dst) and (ref + 4, in + 16, dst + 4).
@ -63,18 +79,17 @@ extern VP8WMetric VP8TDisto4x4, VP8TDisto16x16;

 typedef void (*VP8BlockCopy)(const uint8_t* src, uint8_t* dst);
 extern VP8BlockCopy VP8Copy4x4;
-extern VP8BlockCopy VP8Copy8x8;
-extern VP8BlockCopy VP8Copy16x16;
 // Quantization
 struct VP8Matrix;   // forward declaration
 typedef int (*VP8QuantizeBlock)(int16_t in[16], int16_t out[16],
                                int n, const struct VP8Matrix* const mtx);
 extern VP8QuantizeBlock VP8EncQuantizeBlock;

-// Compute susceptibility based on DCT-coeff histograms:
-// the higher, the "easier" the macroblock is to compress.
-typedef int (*VP8CHisto)(const uint8_t* ref, const uint8_t* pred,
-                         int start_block, int end_block);
+// Collect histogram for susceptibility calculation and accumulate in histo[].
+struct VP8Histogram;
+typedef void (*VP8CHisto)(const uint8_t* ref, const uint8_t* pred,
+                          int start_block, int end_block,
+                          struct VP8Histogram* const histo);
 extern const int VP8DspScan[16 + 4 + 4];
 extern VP8CHisto VP8CollectHistogram;

@ -90,14 +105,14 @@ extern VP8DecIdct2 VP8Transform;
 extern VP8DecIdct VP8TransformUV;
 extern VP8DecIdct VP8TransformDC;
 extern VP8DecIdct VP8TransformDCUV;
-extern void (*VP8TransformWHT)(const int16_t* in, int16_t* out);
+extern VP8WHT VP8TransformWHT;

 // *dst is the destination block, with stride BPS. Boundary samples are
 // assumed accessible when needed.
 typedef void (*VP8PredFunc)(uint8_t* dst);
-extern VP8PredFunc VP8PredLuma16[/* NUM_B_DC_MODES */];
-extern VP8PredFunc VP8PredChroma8[/* NUM_B_DC_MODES */];
-extern VP8PredFunc VP8PredLuma4[/* NUM_BMODES */];
+extern const VP8PredFunc VP8PredLuma16[/* NUM_B_DC_MODES */];
+extern const VP8PredFunc VP8PredChroma8[/* NUM_B_DC_MODES */];
+extern const VP8PredFunc VP8PredLuma4[/* NUM_BMODES */];

 // simple filter (only for luma)
 typedef void (*VP8SimpleFilterFunc)(uint8_t* p, int stride, int thresh);
@ -124,28 +139,30 @@ extern VP8ChromaFilterFunc VP8VFilter8i;  // filtering u and v altogether
 extern VP8ChromaFilterFunc VP8HFilter8i;

 // must be called before anything using the above
-extern void VP8DspInit(void);
+void VP8DspInit(void);

 //------------------------------------------------------------------------------
 // WebP I/O

 #define FANCY_UPSAMPLING   // undefined to remove fancy upsampling support

-#ifdef FANCY_UPSAMPLING
 typedef void (*WebPUpsampleLinePairFunc)(
    const uint8_t* top_y, const uint8_t* bottom_y,
    const uint8_t* top_u, const uint8_t* top_v,
    const uint8_t* cur_u, const uint8_t* cur_v,
    uint8_t* top_dst, uint8_t* bottom_dst, int len);

+#ifdef FANCY_UPSAMPLING

 // Fancy upsampling functions to convert YUV to RGB(A) modes
 extern WebPUpsampleLinePairFunc WebPUpsamplers[/* MODE_LAST */];
-extern WebPUpsampleLinePairFunc WebPUpsamplersKeepAlpha[/* MODE_LAST */];

 // Initializes SSE2 version of the fancy upsamplers.
 void WebPInitUpsamplersSSE2(void);

+// NEON version
+void WebPInitUpsamplersNEON(void);
+
 #endif    // FANCY_UPSAMPLING

 // Point-sampling methods.
@ -156,6 +173,11 @@ typedef void (*WebPSampleLinePairFunc)(

 extern const WebPSampleLinePairFunc WebPSamplers[/* MODE_LAST */];

+// General function for converting two lines of ARGB or RGBA.
+// 'alpha_is_last' should be true if 0xff000000 is stored in memory as
+// as 0x00, 0x00, 0x00, 0xff (little endian).
+WebPUpsampleLinePairFunc WebPGetLinePairConverter(int alpha_is_last);
+
 // YUV444->RGB converters
 typedef void (*WebPYUV444Converter)(const uint8_t* y,
                                    const uint8_t* u, const uint8_t* v,
@ -166,6 +188,24 @@ extern const WebPYUV444Converter WebPYUV444Converters[/* MODE_LAST */];
 // Main function to be called
 void WebPInitUpsamplers(void);

+//------------------------------------------------------------------------------
+// Pre-multiply planes with alpha values
+
+// Apply alpha pre-multiply on an rgba, bgra or argb plane of size w * h.
+// alpha_first should be 0 for argb, 1 for rgba or bgra (where alpha is last).
+extern void (*WebPApplyAlphaMultiply)(
+    uint8_t* rgba, int alpha_first, int w, int h, int stride);
+
+// Same, buf specifically for RGBA4444 format
+extern void (*WebPApplyAlphaMultiply4444)(
+    uint8_t* rgba4444, int w, int h, int stride);
+
+// To be called first before using the above.
+void WebPInitPremultiply(void);
+
+void WebPInitPremultiplySSE2(void);   // should not be called directly.
+void WebPInitPremultiplyNEON(void);
+
 //------------------------------------------------------------------------------

 #if defined(__cplusplus) || defined(c_plusplus)
--- a/src/dsp/enc.c
+++ b/src/dsp/enc.c
@ -1,45 +1,36 @@
-// Copyright 2011 Google Inc.
+// Copyright 2011 Google Inc. All Rights Reserved.
 //
-// This code is licensed under the same terms as WebM:
-//  Software License Agreement:  http://www.webmproject.org/license/software/
-//  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
 // -----------------------------------------------------------------------------
 //
 // Speed-critical encoding functions.
 //
 // Author: Skal (pascal.massimino@gmail.com)

+#include <stdlib.h>  // for abs()
+#include "./dsp.h"
 #include "../enc/vp8enci.h"

 #if defined(__cplusplus) || defined(c_plusplus)
 extern "C" {
 #endif

+static WEBP_INLINE uint8_t clip_8b(int v) {
+  return (!(v & ~0xff)) ? v : (v < 0) ? 0 : 255;
+}
+
+static WEBP_INLINE int clip_max(int v, int max) {
+  return (v > max) ? max : v;
+}
+
 //------------------------------------------------------------------------------
 // Compute susceptibility based on DCT-coeff histograms:
 // the higher, the "easier" the macroblock is to compress.

-static int ClipAlpha(int alpha) {
-  return alpha < 0 ? 0 : alpha > 255 ? 255 : alpha;
-}
-
-int VP8GetAlpha(const int histo[MAX_COEFF_THRESH + 1]) {
-  int num = 0, den = 0, val = 0;
-  int k;
-  int alpha;
-  // note: changing this loop to avoid the numerous "k + 1" slows things down.
-  for (k = 0; k < MAX_COEFF_THRESH; ++k) {
-    if (histo[k + 1]) {
-      val += histo[k + 1];
-      num += val * (k + 1);
-      den += (k + 1) * (k + 1);
-    }
-  }
-  // we scale the value to a usable [0..255] range
-  alpha = den ? 10 * num / den - 5 : 0;
-  return ClipAlpha(alpha);
-}
-
 const int VP8DspScan[16 + 4 + 4] = {
  // Luma
  0 +  0 * BPS,  4 +  0 * BPS, 8 +  0 * BPS, 12 +  0 * BPS,
@ -51,27 +42,23 @@ const int VP8DspScan[16 + 4 + 4] = {
  8 + 0 * BPS,  12 + 0 * BPS, 8 + 4 * BPS, 12 + 4 * BPS     // V
 };

-static int CollectHistogram(const uint8_t* ref, const uint8_t* pred,
-                            int start_block, int end_block) {
-  int histo[MAX_COEFF_THRESH + 1] = { 0 };
-  int16_t out[16];
-  int j, k;
+static void CollectHistogram(const uint8_t* ref, const uint8_t* pred,
+                             int start_block, int end_block,
+                             VP8Histogram* const histo) {
+  int j;
  for (j = start_block; j < end_block; ++j) {
+    int k;
+    int16_t out[16];
+
    VP8FTransform(ref + VP8DspScan[j], pred + VP8DspScan[j], out);

-    // Convert coefficients to bin (within out[]).
+    // Convert coefficients to bin.
    for (k = 0; k < 16; ++k) {
-      const int v = abs(out[k]) >> 2;
-      out[k] = (v > MAX_COEFF_THRESH) ? MAX_COEFF_THRESH : v;
-    }
-
-    // Use bin to update histogram.
-    for (k = 0; k < 16; ++k) {
-      histo[out[k]]++;
+      const int v = abs(out[k]) >> 3;  // TODO(skal): add rounding?
+      const int clipped_value = clip_max(v, MAX_COEFF_THRESH);
+      histo->distribution[clipped_value]++;
    }
  }
-
-  return VP8GetAlpha(histo);
 }

 //------------------------------------------------------------------------------
@ -87,15 +74,12 @@ static void InitTables(void) {
  if (!tables_ok) {
    int i;
    for (i = -255; i <= 255 + 255; ++i) {
-      clip1[255 + i] = (i < 0) ? 0 : (i > 255) ? 255 : i;
+      clip1[255 + i] = clip_8b(i);
    }
    tables_ok = 1;
  }
 }

-static inline uint8_t clip_8b(int v) {
-  return (!(v & ~0xff)) ? v : v < 0 ? 0 : 255;
-}

 //------------------------------------------------------------------------------
 // Transforms (Paragraph 14.4)
@ -107,8 +91,8 @@ static const int kC1 = 20091 + (1 << 16);
 static const int kC2 = 35468;
 #define MUL(a, b) (((a) * (b)) >> 16)

-static inline void ITransformOne(const uint8_t* ref, const int16_t* in,
-                                 uint8_t* dst) {
+static WEBP_INLINE void ITransformOne(const uint8_t* ref, const int16_t* in,
+                                      uint8_t* dst) {
  int C[4 * 4], *tmp;
  int i;
  tmp = C;
@ -152,25 +136,25 @@ static void FTransform(const uint8_t* src, const uint8_t* ref, int16_t* out) {
  int i;
  int tmp[16];
  for (i = 0; i < 4; ++i, src += BPS, ref += BPS) {
-    const int d0 = src[0] - ref[0];
+    const int d0 = src[0] - ref[0];   // 9bit dynamic range ([-255,255])
    const int d1 = src[1] - ref[1];
    const int d2 = src[2] - ref[2];
    const int d3 = src[3] - ref[3];
-    const int a0 = (d0 + d3) << 3;
-    const int a1 = (d1 + d2) << 3;
-    const int a2 = (d1 - d2) << 3;
-    const int a3 = (d0 - d3) << 3;
-    tmp[0 + i * 4] = (a0 + a1);
-    tmp[1 + i * 4] = (a2 * 2217 + a3 * 5352 + 14500) >> 12;
-    tmp[2 + i * 4] = (a0 - a1);
-    tmp[3 + i * 4] = (a3 * 2217 - a2 * 5352 +  7500) >> 12;
+    const int a0 = (d0 + d3);         // 10b                      [-510,510]
+    const int a1 = (d1 + d2);
+    const int a2 = (d1 - d2);
+    const int a3 = (d0 - d3);
+    tmp[0 + i * 4] = (a0 + a1) * 8;   // 14b                      [-8160,8160]
+    tmp[1 + i * 4] = (a2 * 2217 + a3 * 5352 + 1812) >> 9;      // [-7536,7542]
+    tmp[2 + i * 4] = (a0 - a1) * 8;
+    tmp[3 + i * 4] = (a3 * 2217 - a2 * 5352 +  937) >> 9;
  }
  for (i = 0; i < 4; ++i) {
-    const int a0 = (tmp[0 + i] + tmp[12 + i]);
+    const int a0 = (tmp[0 + i] + tmp[12 + i]);  // 15b
    const int a1 = (tmp[4 + i] + tmp[ 8 + i]);
    const int a2 = (tmp[4 + i] - tmp[ 8 + i]);
    const int a3 = (tmp[0 + i] - tmp[12 + i]);
-    out[0 + i] = (a0 + a1 + 7) >> 4;
+    out[0 + i] = (a0 + a1 + 7) >> 4;            // 12b
    out[4 + i] = ((a2 * 2217 + a3 * 5352 + 12000) >> 16) + (a3 != 0);
    out[8 + i] = (a0 - a1 + 7) >> 4;
    out[12+ i] = ((a3 * 2217 - a2 * 5352 + 51000) >> 16);
@ -205,31 +189,32 @@ static void ITransformWHT(const int16_t* in, int16_t* out) {
 }

 static void FTransformWHT(const int16_t* in, int16_t* out) {
-  int tmp[16];
+  // input is 12b signed
+  int16_t tmp[16];
  int i;
  for (i = 0; i < 4; ++i, in += 64) {
-    const int a0 = (in[0 * 16] + in[2 * 16]) << 2;
-    const int a1 = (in[1 * 16] + in[3 * 16]) << 2;
-    const int a2 = (in[1 * 16] - in[3 * 16]) << 2;
-    const int a3 = (in[0 * 16] - in[2 * 16]) << 2;
-    tmp[0 + i * 4] = (a0 + a1) + (a0 != 0);
+    const int a0 = (in[0 * 16] + in[2 * 16]);  // 13b
+    const int a1 = (in[1 * 16] + in[3 * 16]);
+    const int a2 = (in[1 * 16] - in[3 * 16]);
+    const int a3 = (in[0 * 16] - in[2 * 16]);
+    tmp[0 + i * 4] = a0 + a1;   // 14b
    tmp[1 + i * 4] = a3 + a2;
    tmp[2 + i * 4] = a3 - a2;
    tmp[3 + i * 4] = a0 - a1;
  }
  for (i = 0; i < 4; ++i) {
-    const int a0 = (tmp[0 + i] + tmp[8 + i]);
+    const int a0 = (tmp[0 + i] + tmp[8 + i]);  // 15b
    const int a1 = (tmp[4 + i] + tmp[12+ i]);
    const int a2 = (tmp[4 + i] - tmp[12+ i]);
    const int a3 = (tmp[0 + i] - tmp[8 + i]);
-    const int b0 = a0 + a1;
+    const int b0 = a0 + a1;    // 16b
    const int b1 = a3 + a2;
    const int b2 = a3 - a2;
    const int b3 = a0 - a1;
-    out[ 0 + i] = (b0 + (b0 > 0) + 3) >> 3;
-    out[ 4 + i] = (b1 + (b1 > 0) + 3) >> 3;
-    out[ 8 + i] = (b2 + (b2 > 0) + 3) >> 3;
-    out[12 + i] = (b3 + (b3 > 0) + 3) >> 3;
+    out[ 0 + i] = b0 >> 1;     // 15b
+    out[ 4 + i] = b1 >> 1;
+    out[ 8 + i] = b2 >> 1;
+    out[12 + i] = b3 >> 1;
  }
 }

@ -241,14 +226,15 @@ static void FTransformWHT(const int16_t* in, int16_t* out) {

 #define DST(x, y) dst[(x) + (y) * BPS]

-static inline void Fill(uint8_t* dst, int value, int size) {
+static WEBP_INLINE void Fill(uint8_t* dst, int value, int size) {
  int j;
  for (j = 0; j < size; ++j) {
    memset(dst + j * BPS, value, size);
  }
 }

-static inline void VerticalPred(uint8_t* dst, const uint8_t* top, int size) {
+static WEBP_INLINE void VerticalPred(uint8_t* dst,
+                                     const uint8_t* top, int size) {
  int j;
  if (top) {
    for (j = 0; j < size; ++j) memcpy(dst + j * BPS, top, size);
@ -257,7 +243,8 @@ static inline void VerticalPred(uint8_t* dst, const uint8_t* top, int size) {
  }
 }

-static inline void HorizontalPred(uint8_t* dst, const uint8_t* left, int size) {
+static WEBP_INLINE void HorizontalPred(uint8_t* dst,
+                                       const uint8_t* left, int size) {
  if (left) {
    int j;
    for (j = 0; j < size; ++j) {
@ -268,8 +255,8 @@ static inline void HorizontalPred(uint8_t* dst, const uint8_t* left, int size) {
  }
 }

-static inline void TrueMotion(uint8_t* dst, const uint8_t* left,
-                              const uint8_t* top, int size) {
+static WEBP_INLINE void TrueMotion(uint8_t* dst, const uint8_t* left,
+                                   const uint8_t* top, int size) {
  int y;
  if (left) {
    if (top) {
@ -298,9 +285,9 @@ static inline void TrueMotion(uint8_t* dst, const uint8_t* left,
  }
 }

-static inline void DCMode(uint8_t* dst, const uint8_t* left,
-                          const uint8_t* top,
-                          int size, int round, int shift) {
+static WEBP_INLINE void DCMode(uint8_t* dst, const uint8_t* left,
+                               const uint8_t* top,
+                               int size, int round, int shift) {
  int DC = 0;
  int j;
  if (top) {
@ -543,7 +530,8 @@ static void Intra4Preds(uint8_t* dst, const uint8_t* top) {
 //------------------------------------------------------------------------------
 // Metric

-static inline int GetSSE(const uint8_t* a, const uint8_t* b, int w, int h) {
+static WEBP_INLINE int GetSSE(const uint8_t* a, const uint8_t* b,
+                              int w, int h) {
  int count = 0;
  int y, x;
  for (y = 0; y < h; ++y) {
@ -584,30 +572,30 @@ static int TTransform(const uint8_t* in, const uint16_t* w) {
  int i;
  // horizontal pass
  for (i = 0; i < 4; ++i, in += BPS) {
-    const int a0 = (in[0] + in[2]) << 2;
-    const int a1 = (in[1] + in[3]) << 2;
-    const int a2 = (in[1] - in[3]) << 2;
-    const int a3 = (in[0] - in[2]) << 2;
-    tmp[0 + i * 4] = a0 + a1 + (a0 != 0);
+    const int a0 = in[0] + in[2];
+    const int a1 = in[1] + in[3];
+    const int a2 = in[1] - in[3];
+    const int a3 = in[0] - in[2];
+    tmp[0 + i * 4] = a0 + a1;
    tmp[1 + i * 4] = a3 + a2;
    tmp[2 + i * 4] = a3 - a2;
    tmp[3 + i * 4] = a0 - a1;
  }
  // vertical pass
  for (i = 0; i < 4; ++i, ++w) {
-    const int a0 = (tmp[0 + i] + tmp[8 + i]);
-    const int a1 = (tmp[4 + i] + tmp[12+ i]);
-    const int a2 = (tmp[4 + i] - tmp[12+ i]);
-    const int a3 = (tmp[0 + i] - tmp[8 + i]);
+    const int a0 = tmp[0 + i] + tmp[8 + i];
+    const int a1 = tmp[4 + i] + tmp[12+ i];
+    const int a2 = tmp[4 + i] - tmp[12+ i];
+    const int a3 = tmp[0 + i] - tmp[8 + i];
    const int b0 = a0 + a1;
    const int b1 = a3 + a2;
    const int b2 = a3 - a2;
    const int b3 = a0 - a1;
-    // abs((b + (b<0) + 3) >> 3) = (abs(b) + 3) >> 3
-    sum += w[ 0] * ((abs(b0) + 3) >> 3);
-    sum += w[ 4] * ((abs(b1) + 3) >> 3);
-    sum += w[ 8] * ((abs(b2) + 3) >> 3);
-    sum += w[12] * ((abs(b3) + 3) >> 3);
+
+    sum += w[ 0] * abs(b0);
+    sum += w[ 4] * abs(b1);
+    sum += w[ 8] * abs(b2);
+    sum += w[12] * abs(b3);
  }
  return sum;
 }
@ -616,7 +604,7 @@ static int Disto4x4(const uint8_t* const a, const uint8_t* const b,
                    const uint16_t* const w) {
  const int sum1 = TTransform(a, w);
  const int sum2 = TTransform(b, w);
-  return (abs(sum2 - sum1) + 8) >> 4;
+  return abs(sum2 - sum1) >> 5;
 }

 static int Disto16x16(const uint8_t* const a, const uint8_t* const b,
@ -646,13 +634,13 @@ static int QuantizeBlock(int16_t in[16], int16_t out[16],
  for (; n < 16; ++n) {
    const int j = kZigzag[n];
    const int sign = (in[j] < 0);
-    int coeff = (sign ? -in[j] : in[j]) + mtx->sharpen_[j];
-    if (coeff > 2047) coeff = 2047;
+    const int coeff = (sign ? -in[j] : in[j]) + mtx->sharpen_[j];
    if (coeff > mtx->zthresh_[j]) {
      const int Q = mtx->q_[j];
      const int iQ = mtx->iq_[j];
      const int B = mtx->bias_[j];
      out[n] = QUANTDIV(coeff, iQ, B);
+      if (out[n] > MAX_LEVEL) out[n] = MAX_LEVEL;
      if (sign) out[n] = -out[n];
      in[j] = out[n] * Q;
      if (out[n]) last = n;
@ -667,7 +655,7 @@ static int QuantizeBlock(int16_t in[16], int16_t out[16],
 //------------------------------------------------------------------------------
 // Block copy

-static inline void Copy(const uint8_t* src, uint8_t* dst, int size) {
+static WEBP_INLINE void Copy(const uint8_t* src, uint8_t* dst, int size) {
  int y;
  for (y = 0; y < size; ++y) {
    memcpy(dst, src, size);
@ -677,8 +665,6 @@ static inline void Copy(const uint8_t* src, uint8_t* dst, int size) {
 }

 static void Copy4x4(const uint8_t* src, uint8_t* dst) { Copy(src, dst, 4); }
-static void Copy8x8(const uint8_t* src, uint8_t* dst) { Copy(src, dst, 8); }
-static void Copy16x16(const uint8_t* src, uint8_t* dst) { Copy(src, dst, 16); }

 //------------------------------------------------------------------------------
 // Initialization
@ -701,10 +687,9 @@ VP8WMetric VP8TDisto4x4;
 VP8WMetric VP8TDisto16x16;
 VP8QuantizeBlock VP8EncQuantizeBlock;
 VP8BlockCopy VP8Copy4x4;
-VP8BlockCopy VP8Copy8x8;
-VP8BlockCopy VP8Copy16x16;

 extern void VP8EncDspInitSSE2(void);
+extern void VP8EncDspInitNEON(void);

 void VP8EncDspInit(void) {
  InitTables();
@ -726,15 +711,17 @@ void VP8EncDspInit(void) {
  VP8TDisto16x16 = Disto16x16;
  VP8EncQuantizeBlock = QuantizeBlock;
  VP8Copy4x4 = Copy4x4;
-  VP8Copy8x8 = Copy8x8;
-  VP8Copy16x16 = Copy16x16;

  // If defined, use CPUInfo() to overwrite some pointers with faster versions.
  if (VP8GetCPUInfo) {
-#if defined(__SSE2__) || defined(_MSC_VER)
+#if defined(WEBP_USE_SSE2)
    if (VP8GetCPUInfo(kSSE2)) {
      VP8EncDspInitSSE2();
    }
+#elif defined(WEBP_USE_NEON)
+    if (VP8GetCPUInfo(kNEON)) {
+      VP8EncDspInitNEON();
+    }
 #endif
  }
 }
--- a/src/dsp/enc_neon.c
+++ b/src/dsp/enc_neon.c
@ -0,0 +1,639 @@
+// Copyright 2012 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// ARM NEON version of speed-critical encoding functions.
+//
+// adapted from libvpx (http://www.webmproject.org/code/)
+
+#include "./dsp.h"
+
+#if defined(__cplusplus) || defined(c_plusplus)
+extern "C" {
+#endif
+
+#if defined(WEBP_USE_NEON)
+
+#include "../enc/vp8enci.h"
+
+//------------------------------------------------------------------------------
+// Transforms (Paragraph 14.4)
+
+// Inverse transform.
+// This code is pretty much the same as TransformOneNEON in the decoder, except
+// for subtraction to *ref. See the comments there for algorithmic explanations.
+static void ITransformOne(const uint8_t* ref,
+                          const int16_t* in, uint8_t* dst) {
+  const int kBPS = BPS;
+  const int16_t kC1C2[] = { 20091, 17734, 0, 0 };  // kC1 / (kC2 >> 1) / 0 / 0
+
+  __asm__ volatile (
+    "vld1.16         {q1, q2}, [%[in]]           \n"
+    "vld1.16         {d0}, [%[kC1C2]]            \n"
+
+    // d2: in[0]
+    // d3: in[8]
+    // d4: in[4]
+    // d5: in[12]
+    "vswp            d3, d4                      \n"
+
+    // q8 = {in[4], in[12]} * kC1 * 2 >> 16
+    // q9 = {in[4], in[12]} * kC2 >> 16
+    "vqdmulh.s16     q8, q2, d0[0]               \n"
+    "vqdmulh.s16     q9, q2, d0[1]               \n"
+
+    // d22 = a = in[0] + in[8]
+    // d23 = b = in[0] - in[8]
+    "vqadd.s16       d22, d2, d3                 \n"
+    "vqsub.s16       d23, d2, d3                 \n"
+
+    //  q8 = in[4]/[12] * kC1 >> 16
+    "vshr.s16        q8, q8, #1                  \n"
+
+    // Add {in[4], in[12]} back after the multiplication.
+    "vqadd.s16       q8, q2, q8                  \n"
+
+    // d20 = c = in[4]*kC2 - in[12]*kC1
+    // d21 = d = in[4]*kC1 + in[12]*kC2
+    "vqsub.s16       d20, d18, d17               \n"
+    "vqadd.s16       d21, d19, d16               \n"
+
+    // d2 = tmp[0] = a + d
+    // d3 = tmp[1] = b + c
+    // d4 = tmp[2] = b - c
+    // d5 = tmp[3] = a - d
+    "vqadd.s16       d2, d22, d21                \n"
+    "vqadd.s16       d3, d23, d20                \n"
+    "vqsub.s16       d4, d23, d20                \n"
+    "vqsub.s16       d5, d22, d21                \n"
+
+    "vzip.16         q1, q2                      \n"
+    "vzip.16         q1, q2                      \n"
+
+    "vswp            d3, d4                      \n"
+
+    // q8 = {tmp[4], tmp[12]} * kC1 * 2 >> 16
+    // q9 = {tmp[4], tmp[12]} * kC2 >> 16
+    "vqdmulh.s16     q8, q2, d0[0]               \n"
+    "vqdmulh.s16     q9, q2, d0[1]               \n"
+
+    // d22 = a = tmp[0] + tmp[8]
+    // d23 = b = tmp[0] - tmp[8]
+    "vqadd.s16       d22, d2, d3                 \n"
+    "vqsub.s16       d23, d2, d3                 \n"
+
+    "vshr.s16        q8, q8, #1                  \n"
+    "vqadd.s16       q8, q2, q8                  \n"
+
+    // d20 = c = in[4]*kC2 - in[12]*kC1
+    // d21 = d = in[4]*kC1 + in[12]*kC2
+    "vqsub.s16       d20, d18, d17               \n"
+    "vqadd.s16       d21, d19, d16               \n"
+
+    // d2 = tmp[0] = a + d
+    // d3 = tmp[1] = b + c
+    // d4 = tmp[2] = b - c
+    // d5 = tmp[3] = a - d
+    "vqadd.s16       d2, d22, d21                \n"
+    "vqadd.s16       d3, d23, d20                \n"
+    "vqsub.s16       d4, d23, d20                \n"
+    "vqsub.s16       d5, d22, d21                \n"
+
+    "vld1.32         d6[0], [%[ref]], %[kBPS]    \n"
+    "vld1.32         d6[1], [%[ref]], %[kBPS]    \n"
+    "vld1.32         d7[0], [%[ref]], %[kBPS]    \n"
+    "vld1.32         d7[1], [%[ref]], %[kBPS]    \n"
+
+    "sub         %[ref], %[ref], %[kBPS], lsl #2 \n"
+
+    // (val) + 4 >> 3
+    "vrshr.s16       d2, d2, #3                  \n"
+    "vrshr.s16       d3, d3, #3                  \n"
+    "vrshr.s16       d4, d4, #3                  \n"
+    "vrshr.s16       d5, d5, #3                  \n"
+
+    "vzip.16         q1, q2                      \n"
+    "vzip.16         q1, q2                      \n"
+
+    // Must accumulate before saturating
+    "vmovl.u8        q8, d6                      \n"
+    "vmovl.u8        q9, d7                      \n"
+
+    "vqadd.s16       q1, q1, q8                  \n"
+    "vqadd.s16       q2, q2, q9                  \n"
+
+    "vqmovun.s16     d0, q1                      \n"
+    "vqmovun.s16     d1, q2                      \n"
+
+    "vst1.32         d0[0], [%[dst]], %[kBPS]    \n"
+    "vst1.32         d0[1], [%[dst]], %[kBPS]    \n"
+    "vst1.32         d1[0], [%[dst]], %[kBPS]    \n"
+    "vst1.32         d1[1], [%[dst]]             \n"
+
+    : [in] "+r"(in), [dst] "+r"(dst)               // modified registers
+    : [kBPS] "r"(kBPS), [kC1C2] "r"(kC1C2), [ref] "r"(ref)  // constants
+    : "memory", "q0", "q1", "q2", "q8", "q9", "q10", "q11"  // clobbered
+  );
+}
+
+static void ITransform(const uint8_t* ref,
+                       const int16_t* in, uint8_t* dst, int do_two) {
+  ITransformOne(ref, in, dst);
+  if (do_two) {
+    ITransformOne(ref + 4, in + 16, dst + 4);
+  }
+}
+
+// Same code as dec_neon.c
+static void ITransformWHT(const int16_t* in, int16_t* out) {
+  const int kStep = 32;  // The store is only incrementing the pointer as if we
+                         // had stored a single byte.
+  __asm__ volatile (
+    // part 1
+    // load data into q0, q1
+    "vld1.16         {q0, q1}, [%[in]]           \n"
+
+    "vaddl.s16       q2, d0, d3                  \n" // a0 = in[0] + in[12]
+    "vaddl.s16       q3, d1, d2                  \n" // a1 = in[4] + in[8]
+    "vsubl.s16       q4, d1, d2                  \n" // a2 = in[4] - in[8]
+    "vsubl.s16       q5, d0, d3                  \n" // a3 = in[0] - in[12]
+
+    "vadd.s32        q0, q2, q3                  \n" // tmp[0] = a0 + a1
+    "vsub.s32        q2, q2, q3                  \n" // tmp[8] = a0 - a1
+    "vadd.s32        q1, q5, q4                  \n" // tmp[4] = a3 + a2
+    "vsub.s32        q3, q5, q4                  \n" // tmp[12] = a3 - a2
+
+    // Transpose
+    // q0 = tmp[0, 4, 8, 12], q1 = tmp[2, 6, 10, 14]
+    // q2 = tmp[1, 5, 9, 13], q3 = tmp[3, 7, 11, 15]
+    "vswp            d1, d4                      \n" // vtrn.64 q0, q2
+    "vswp            d3, d6                      \n" // vtrn.64 q1, q3
+    "vtrn.32         q0, q1                      \n"
+    "vtrn.32         q2, q3                      \n"
+
+    "vmov.s32        q4, #3                      \n" // dc = 3
+    "vadd.s32        q0, q0, q4                  \n" // dc = tmp[0] + 3
+    "vadd.s32        q6, q0, q3                  \n" // a0 = dc + tmp[3]
+    "vadd.s32        q7, q1, q2                  \n" // a1 = tmp[1] + tmp[2]
+    "vsub.s32        q8, q1, q2                  \n" // a2 = tmp[1] - tmp[2]
+    "vsub.s32        q9, q0, q3                  \n" // a3 = dc - tmp[3]
+
+    "vadd.s32        q0, q6, q7                  \n"
+    "vshrn.s32       d0, q0, #3                  \n" // (a0 + a1) >> 3
+    "vadd.s32        q1, q9, q8                  \n"
+    "vshrn.s32       d1, q1, #3                  \n" // (a3 + a2) >> 3
+    "vsub.s32        q2, q6, q7                  \n"
+    "vshrn.s32       d2, q2, #3                  \n" // (a0 - a1) >> 3
+    "vsub.s32        q3, q9, q8                  \n"
+    "vshrn.s32       d3, q3, #3                  \n" // (a3 - a2) >> 3
+
+    // set the results to output
+    "vst1.16         d0[0], [%[out]], %[kStep]      \n"
+    "vst1.16         d1[0], [%[out]], %[kStep]      \n"
+    "vst1.16         d2[0], [%[out]], %[kStep]      \n"
+    "vst1.16         d3[0], [%[out]], %[kStep]      \n"
+    "vst1.16         d0[1], [%[out]], %[kStep]      \n"
+    "vst1.16         d1[1], [%[out]], %[kStep]      \n"
+    "vst1.16         d2[1], [%[out]], %[kStep]      \n"
+    "vst1.16         d3[1], [%[out]], %[kStep]      \n"
+    "vst1.16         d0[2], [%[out]], %[kStep]      \n"
+    "vst1.16         d1[2], [%[out]], %[kStep]      \n"
+    "vst1.16         d2[2], [%[out]], %[kStep]      \n"
+    "vst1.16         d3[2], [%[out]], %[kStep]      \n"
+    "vst1.16         d0[3], [%[out]], %[kStep]      \n"
+    "vst1.16         d1[3], [%[out]], %[kStep]      \n"
+    "vst1.16         d2[3], [%[out]], %[kStep]      \n"
+    "vst1.16         d3[3], [%[out]], %[kStep]      \n"
+
+    : [out] "+r"(out)  // modified registers
+    : [in] "r"(in), [kStep] "r"(kStep)  // constants
+    : "memory", "q0", "q1", "q2", "q3", "q4",
+      "q5", "q6", "q7", "q8", "q9" // clobbered
+  );
+}
+
+// Forward transform.
+
+// adapted from vp8/encoder/arm/neon/shortfdct_neon.asm
+static const int16_t kCoeff16[] = {
+  5352,  5352,  5352, 5352, 2217,  2217,  2217, 2217
+};
+static const int32_t kCoeff32[] = {
+   1812,  1812,  1812,  1812,
+    937,   937,   937,   937,
+  12000, 12000, 12000, 12000,
+  51000, 51000, 51000, 51000
+};
+
+static void FTransform(const uint8_t* src, const uint8_t* ref,
+                       int16_t* out) {
+  const int kBPS = BPS;
+  const uint8_t* src_ptr = src;
+  const uint8_t* ref_ptr = ref;
+  const int16_t* coeff16 = kCoeff16;
+  const int32_t* coeff32 = kCoeff32;
+
+  __asm__ volatile (
+    // load src into q4, q5 in high half
+    "vld1.8 {d8},  [%[src_ptr]], %[kBPS]      \n"
+    "vld1.8 {d10}, [%[src_ptr]], %[kBPS]      \n"
+    "vld1.8 {d9},  [%[src_ptr]], %[kBPS]      \n"
+    "vld1.8 {d11}, [%[src_ptr]]               \n"
+
+    // load ref into q6, q7 in high half
+    "vld1.8 {d12}, [%[ref_ptr]], %[kBPS]      \n"
+    "vld1.8 {d14}, [%[ref_ptr]], %[kBPS]      \n"
+    "vld1.8 {d13}, [%[ref_ptr]], %[kBPS]      \n"
+    "vld1.8 {d15}, [%[ref_ptr]]               \n"
+
+    // Pack the high values in to q4 and q6
+    "vtrn.32     q4, q5                       \n"
+    "vtrn.32     q6, q7                       \n"
+
+    // d[0-3] = src - ref
+    "vsubl.u8    q0, d8, d12                  \n"
+    "vsubl.u8    q1, d9, d13                  \n"
+
+    // load coeff16 into q8(d16=5352, d17=2217)
+    "vld1.16     {q8}, [%[coeff16]]           \n"
+
+    // load coeff32 high half into q9 = 1812, q10 = 937
+    "vld1.32     {q9, q10}, [%[coeff32]]!     \n"
+
+    // load coeff32 low half into q11=12000, q12=51000
+    "vld1.32     {q11,q12}, [%[coeff32]]      \n"
+
+    // part 1
+    // Transpose. Register dN is the same as dN in C
+    "vtrn.32         d0, d2                   \n"
+    "vtrn.32         d1, d3                   \n"
+    "vtrn.16         d0, d1                   \n"
+    "vtrn.16         d2, d3                   \n"
+
+    "vadd.s16        d4, d0, d3               \n" // a0 = d0 + d3
+    "vadd.s16        d5, d1, d2               \n" // a1 = d1 + d2
+    "vsub.s16        d6, d1, d2               \n" // a2 = d1 - d2
+    "vsub.s16        d7, d0, d3               \n" // a3 = d0 - d3
+
+    "vadd.s16        d0, d4, d5               \n" // a0 + a1
+    "vshl.s16        d0, d0, #3               \n" // temp[0+i*4] = (a0+a1) << 3
+    "vsub.s16        d2, d4, d5               \n" // a0 - a1
+    "vshl.s16        d2, d2, #3               \n" // (temp[2+i*4] = (a0-a1) << 3
+
+    "vmlal.s16       q9, d7, d16              \n" // a3*5352 + 1812
+    "vmlal.s16       q10, d7, d17             \n" // a3*2217 + 937
+    "vmlal.s16       q9, d6, d17              \n" // a2*2217 + a3*5352 + 1812
+    "vmlsl.s16       q10, d6, d16             \n" // a3*2217 + 937 - a2*5352
+
+    // temp[1+i*4] = (d2*2217 + d3*5352 + 1812) >> 9
+    // temp[3+i*4] = (d3*2217 + 937 - d2*5352) >> 9
+    "vshrn.s32       d1, q9, #9               \n"
+    "vshrn.s32       d3, q10, #9              \n"
+
+    // part 2
+    // transpose d0=ip[0], d1=ip[4], d2=ip[8], d3=ip[12]
+    "vtrn.32         d0, d2                   \n"
+    "vtrn.32         d1, d3                   \n"
+    "vtrn.16         d0, d1                   \n"
+    "vtrn.16         d2, d3                   \n"
+
+    "vmov.s16        d26, #7                  \n"
+
+    "vadd.s16        d4, d0, d3               \n" // a1 = ip[0] + ip[12]
+    "vadd.s16        d5, d1, d2               \n" // b1 = ip[4] + ip[8]
+    "vsub.s16        d6, d1, d2               \n" // c1 = ip[4] - ip[8]
+    "vadd.s16        d4, d4, d26              \n" // a1 + 7
+    "vsub.s16        d7, d0, d3               \n" // d1 = ip[0] - ip[12]
+
+    "vadd.s16        d0, d4, d5               \n" // op[0] = a1 + b1 + 7
+    "vsub.s16        d2, d4, d5               \n" // op[8] = a1 - b1 + 7
+
+    "vmlal.s16       q11, d7, d16             \n" // d1*5352 + 12000
+    "vmlal.s16       q12, d7, d17             \n" // d1*2217 + 51000
+
+    "vceq.s16        d4, d7, #0               \n"
+
+    "vshr.s16        d0, d0, #4               \n"
+    "vshr.s16        d2, d2, #4               \n"
+
+    "vmlal.s16       q11, d6, d17             \n" // c1*2217 + d1*5352 + 12000
+    "vmlsl.s16       q12, d6, d16             \n" // d1*2217 - c1*5352 + 51000
+
+    "vmvn            d4, d4                   \n" // !(d1 == 0)
+    // op[4] = (c1*2217 + d1*5352 + 12000)>>16
+    "vshrn.s32       d1, q11, #16             \n"
+    // op[4] += (d1!=0)
+    "vsub.s16        d1, d1, d4               \n"
+    // op[12]= (d1*2217 - c1*5352 + 51000)>>16
+    "vshrn.s32       d3, q12, #16             \n"
+
+    // set result to out array
+    "vst1.16         {q0, q1}, [%[out]]   \n"
+    : [src_ptr] "+r"(src_ptr), [ref_ptr] "+r"(ref_ptr),
+      [coeff32] "+r"(coeff32)          // modified registers
+    : [kBPS] "r"(kBPS), [coeff16] "r"(coeff16),
+      [out] "r"(out)                   // constants
+    : "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9",
+      "q10", "q11", "q12", "q13"       // clobbered
+  );
+}
+
+static void FTransformWHT(const int16_t* in, int16_t* out) {
+  const int kStep = 32;
+  __asm__ volatile (
+    // d0 = in[0 * 16] , d1 = in[1 * 16]
+    // d2 = in[2 * 16] , d3 = in[3 * 16]
+    "vld1.16         d0[0], [%[in]], %[kStep]   \n"
+    "vld1.16         d1[0], [%[in]], %[kStep]   \n"
+    "vld1.16         d2[0], [%[in]], %[kStep]   \n"
+    "vld1.16         d3[0], [%[in]], %[kStep]   \n"
+    "vld1.16         d0[1], [%[in]], %[kStep]   \n"
+    "vld1.16         d1[1], [%[in]], %[kStep]   \n"
+    "vld1.16         d2[1], [%[in]], %[kStep]   \n"
+    "vld1.16         d3[1], [%[in]], %[kStep]   \n"
+    "vld1.16         d0[2], [%[in]], %[kStep]   \n"
+    "vld1.16         d1[2], [%[in]], %[kStep]   \n"
+    "vld1.16         d2[2], [%[in]], %[kStep]   \n"
+    "vld1.16         d3[2], [%[in]], %[kStep]   \n"
+    "vld1.16         d0[3], [%[in]], %[kStep]   \n"
+    "vld1.16         d1[3], [%[in]], %[kStep]   \n"
+    "vld1.16         d2[3], [%[in]], %[kStep]   \n"
+    "vld1.16         d3[3], [%[in]], %[kStep]   \n"
+
+    "vaddl.s16       q2, d0, d2                 \n" // a0=(in[0*16]+in[2*16])
+    "vaddl.s16       q3, d1, d3                 \n" // a1=(in[1*16]+in[3*16])
+    "vsubl.s16       q4, d1, d3                 \n" // a2=(in[1*16]-in[3*16])
+    "vsubl.s16       q5, d0, d2                 \n" // a3=(in[0*16]-in[2*16])
+
+    "vqadd.s32       q6, q2, q3                 \n" // a0 + a1
+    "vqadd.s32       q7, q5, q4                 \n" // a3 + a2
+    "vqsub.s32       q8, q5, q4                 \n" // a3 - a2
+    "vqsub.s32       q9, q2, q3                 \n" // a0 - a1
+
+    // Transpose
+    // q6 = tmp[0, 1,  2,  3] ; q7 = tmp[ 4,  5,  6,  7]
+    // q8 = tmp[8, 9, 10, 11] ; q9 = tmp[12, 13, 14, 15]
+    "vswp            d13, d16                   \n" // vtrn.64 q0, q2
+    "vswp            d15, d18                   \n" // vtrn.64 q1, q3
+    "vtrn.32         q6, q7                     \n"
+    "vtrn.32         q8, q9                     \n"
+
+    "vqadd.s32       q0, q6, q8                 \n" // a0 = tmp[0] + tmp[8]
+    "vqadd.s32       q1, q7, q9                 \n" // a1 = tmp[4] + tmp[12]
+    "vqsub.s32       q2, q7, q9                 \n" // a2 = tmp[4] - tmp[12]
+    "vqsub.s32       q3, q6, q8                 \n" // a3 = tmp[0] - tmp[8]
+
+    "vqadd.s32       q4, q0, q1                 \n" // b0 = a0 + a1
+    "vqadd.s32       q5, q3, q2                 \n" // b1 = a3 + a2
+    "vqsub.s32       q6, q3, q2                 \n" // b2 = a3 - a2
+    "vqsub.s32       q7, q0, q1                 \n" // b3 = a0 - a1
+
+    "vshrn.s32       d18, q4, #1                \n" // b0 >> 1
+    "vshrn.s32       d19, q5, #1                \n" // b1 >> 1
+    "vshrn.s32       d20, q6, #1                \n" // b2 >> 1
+    "vshrn.s32       d21, q7, #1                \n" // b3 >> 1
+
+    "vst1.16         {q9, q10}, [%[out]]        \n"
+
+    : [in] "+r"(in)
+    : [kStep] "r"(kStep), [out] "r"(out)
+    : "memory", "q0", "q1", "q2", "q3", "q4", "q5",
+      "q6", "q7", "q8", "q9", "q10"       // clobbered
+  ) ;
+}
+
+//------------------------------------------------------------------------------
+// Texture distortion
+//
+// We try to match the spectral content (weighted) between source and
+// reconstructed samples.
+
+// Hadamard transform
+// Returns the weighted sum of the absolute value of transformed coefficients.
+// This uses a TTransform helper function in C
+static int Disto4x4(const uint8_t* const a, const uint8_t* const b,
+                    const uint16_t* const w) {
+  const int kBPS = BPS;
+  const uint8_t* A = a;
+  const uint8_t* B = b;
+  const uint16_t* W = w;
+  int sum;
+  __asm__ volatile (
+    "vld1.32         d0[0], [%[a]], %[kBPS]   \n"
+    "vld1.32         d0[1], [%[a]], %[kBPS]   \n"
+    "vld1.32         d2[0], [%[a]], %[kBPS]   \n"
+    "vld1.32         d2[1], [%[a]]            \n"
+
+    "vld1.32         d1[0], [%[b]], %[kBPS]   \n"
+    "vld1.32         d1[1], [%[b]], %[kBPS]   \n"
+    "vld1.32         d3[0], [%[b]], %[kBPS]   \n"
+    "vld1.32         d3[1], [%[b]]            \n"
+
+    // a d0/d2, b d1/d3
+    // d0/d1: 01 01 01 01
+    // d2/d3: 23 23 23 23
+    // But: it goes 01 45 23 67
+    // Notice the middle values are transposed
+    "vtrn.16         q0, q1                   \n"
+
+    // {a0, a1} = {in[0] + in[2], in[1] + in[3]}
+    "vaddl.u8        q2, d0, d2               \n"
+    "vaddl.u8        q10, d1, d3              \n"
+    // {a3, a2} = {in[0] - in[2], in[1] - in[3]}
+    "vsubl.u8        q3, d0, d2               \n"
+    "vsubl.u8        q11, d1, d3              \n"
+
+    // tmp[0] = a0 + a1
+    "vpaddl.s16      q0, q2                   \n"
+    "vpaddl.s16      q8, q10                  \n"
+
+    // tmp[1] = a3 + a2
+    "vpaddl.s16      q1, q3                   \n"
+    "vpaddl.s16      q9, q11                  \n"
+
+    // No pair subtract
+    // q2 = {a0, a3}
+    // q3 = {a1, a2}
+    "vtrn.16         q2, q3                   \n"
+    "vtrn.16         q10, q11                 \n"
+
+    // {tmp[3], tmp[2]} = {a0 - a1, a3 - a2}
+    "vsubl.s16       q12, d4, d6              \n"
+    "vsubl.s16       q13, d5, d7              \n"
+    "vsubl.s16       q14, d20, d22            \n"
+    "vsubl.s16       q15, d21, d23            \n"
+
+    // separate tmp[3] and tmp[2]
+    // q12 = tmp[3]
+    // q13 = tmp[2]
+    "vtrn.32         q12, q13                 \n"
+    "vtrn.32         q14, q15                 \n"
+
+    // Transpose tmp for a
+    "vswp            d1, d26                  \n" // vtrn.64
+    "vswp            d3, d24                  \n" // vtrn.64
+    "vtrn.32         q0, q1                   \n"
+    "vtrn.32         q13, q12                 \n"
+
+    // Transpose tmp for b
+    "vswp            d17, d30                 \n" // vtrn.64
+    "vswp            d19, d28                 \n" // vtrn.64
+    "vtrn.32         q8, q9                   \n"
+    "vtrn.32         q15, q14                 \n"
+
+    // The first Q register is a, the second b.
+    // q0/8 tmp[0-3]
+    // q13/15 tmp[4-7]
+    // q1/9 tmp[8-11]
+    // q12/14 tmp[12-15]
+
+    // These are still in 01 45 23 67 order. We fix it easily in the addition
+    // case but the subtraction propegates them.
+    "vswp            d3, d27                  \n"
+    "vswp            d19, d31                 \n"
+
+    // a0 = tmp[0] + tmp[8]
+    "vadd.s32        q2, q0, q1               \n"
+    "vadd.s32        q3, q8, q9               \n"
+
+    // a1 = tmp[4] + tmp[12]
+    "vadd.s32        q10, q13, q12            \n"
+    "vadd.s32        q11, q15, q14            \n"
+
+    // a2 = tmp[4] - tmp[12]
+    "vsub.s32        q13, q13, q12            \n"
+    "vsub.s32        q15, q15, q14            \n"
+
+    // a3 = tmp[0] - tmp[8]
+    "vsub.s32        q0, q0, q1               \n"
+    "vsub.s32        q8, q8, q9               \n"
+
+    // b0 = a0 + a1
+    "vadd.s32        q1, q2, q10              \n"
+    "vadd.s32        q9, q3, q11              \n"
+
+    // b1 = a3 + a2
+    "vadd.s32        q12, q0, q13             \n"
+    "vadd.s32        q14, q8, q15             \n"
+
+    // b2 = a3 - a2
+    "vsub.s32        q0, q0, q13              \n"
+    "vsub.s32        q8, q8, q15              \n"
+
+    // b3 = a0 - a1
+    "vsub.s32        q2, q2, q10              \n"
+    "vsub.s32        q3, q3, q11              \n"
+
+    "vld1.64         {q10, q11}, [%[w]]       \n"
+
+    // abs(b0)
+    "vabs.s32        q1, q1                   \n"
+    "vabs.s32        q9, q9                   \n"
+    // abs(b1)
+    "vabs.s32        q12, q12                 \n"
+    "vabs.s32        q14, q14                 \n"
+    // abs(b2)
+    "vabs.s32        q0, q0                   \n"
+    "vabs.s32        q8, q8                   \n"
+    // abs(b3)
+    "vabs.s32        q2, q2                   \n"
+    "vabs.s32        q3, q3                   \n"
+
+    // expand w before using.
+    "vmovl.u16       q13, d20                 \n"
+    "vmovl.u16       q15, d21                 \n"
+
+    // w[0] * abs(b0)
+    "vmul.u32        q1, q1, q13              \n"
+    "vmul.u32        q9, q9, q13              \n"
+
+    // w[4] * abs(b1)
+    "vmla.u32        q1, q12, q15             \n"
+    "vmla.u32        q9, q14, q15             \n"
+
+    // expand w before using.
+    "vmovl.u16       q13, d22                 \n"
+    "vmovl.u16       q15, d23                 \n"
+
+    // w[8] * abs(b1)
+    "vmla.u32        q1, q0, q13              \n"
+    "vmla.u32        q9, q8, q13              \n"
+
+    // w[12] * abs(b1)
+    "vmla.u32        q1, q2, q15              \n"
+    "vmla.u32        q9, q3, q15              \n"
+
+    // Sum the arrays
+    "vpaddl.u32      q1, q1                   \n"
+    "vpaddl.u32      q9, q9                   \n"
+    "vadd.u64        d2, d3                   \n"
+    "vadd.u64        d18, d19                 \n"
+
+    // Hadamard transform needs 4 bits of extra precision (2 bits in each
+    // direction) for dynamic raw. Weights w[] are 16bits at max, so the maximum
+    // precision for coeff is 8bit of input + 4bits of Hadamard transform +
+    // 16bits for w[] + 2 bits of abs() summation.
+    //
+    // This uses a maximum of 31 bits (signed). Discarding the top 32 bits is
+    // A-OK.
+
+    // sum2 - sum1
+    "vsub.u32        d0, d2, d18              \n"
+    // abs(sum2 - sum1)
+    "vabs.s32        d0, d0                   \n"
+    // abs(sum2 - sum1) >> 5
+    "vshr.u32        d0, #5                   \n"
+
+    // It would be better to move the value straight into r0 but I'm not
+    // entirely sure how this works with inline assembly.
+    "vmov.32         %[sum], d0[0]            \n"
+
+    : [sum] "=r"(sum), [a] "+r"(A), [b] "+r"(B), [w] "+r"(W)
+    : [kBPS] "r"(kBPS)
+    : "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9",
+      "q10", "q11", "q12", "q13", "q14", "q15"  // clobbered
+  ) ;
+
+  return sum;
+}
+
+static int Disto16x16(const uint8_t* const a, const uint8_t* const b,
+                      const uint16_t* const w) {
+  int D = 0;
+  int x, y;
+  for (y = 0; y < 16 * BPS; y += 4 * BPS) {
+    for (x = 0; x < 16; x += 4) {
+      D += Disto4x4(a + x + y, b + x + y, w);
+    }
+  }
+  return D;
+}
+
+#endif   // WEBP_USE_NEON
+
+//------------------------------------------------------------------------------
+// Entry point
+
+extern void VP8EncDspInitNEON(void);
+
+void VP8EncDspInitNEON(void) {
+#if defined(WEBP_USE_NEON)
+  VP8ITransform = ITransform;
+  VP8FTransform = FTransform;
+
+  VP8ITransformWHT = ITransformWHT;
+  VP8FTransformWHT = FTransformWHT;
+
+  VP8TDisto4x4 = Disto4x4;
+  VP8TDisto16x16 = Disto16x16;
+#endif   // WEBP_USE_NEON
+}
+
+#if defined(__cplusplus) || defined(c_plusplus)
+}    // extern "C"
+#endif
--- a/src/dsp/enc_sse2.c
+++ b/src/dsp/enc_sse2.c
@ -1,34 +1,70 @@
-// Copyright 2011 Google Inc.
+// Copyright 2011 Google Inc. All Rights Reserved.
 //
-// This code is licensed under the same terms as WebM:
-//  Software License Agreement:  http://www.webmproject.org/license/software/
-//  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
 // -----------------------------------------------------------------------------
 //
 // SSE2 version of speed-critical encoding functions.
 //
 // Author: Christian Duvivier (cduvivier@google.com)

-#if defined(__SSE2__) || defined(_MSC_VER)
+#include "./dsp.h"
+
+#if defined(__cplusplus) || defined(c_plusplus)
+extern "C" {
+#endif
+
+#if defined(WEBP_USE_SSE2)
+#include <stdlib.h>  // for abs()
 #include <emmintrin.h>

 #include "../enc/vp8enci.h"

-#if defined(__cplusplus) || defined(c_plusplus)
-extern "C" {
+//------------------------------------------------------------------------------
+// Quite useful macro for debugging. Left here for convenience.
+
+#if 0
+#include <stdio.h>
+static void PrintReg(const __m128i r, const char* const name, int size) {
+  int n;
+  union {
+    __m128i r;
+    uint8_t i8[16];
+    uint16_t i16[8];
+    uint32_t i32[4];
+    uint64_t i64[2];
+  } tmp;
+  tmp.r = r;
+  printf("%s\t: ", name);
+  if (size == 8) {
+    for (n = 0; n < 16; ++n) printf("%.2x ", tmp.i8[n]);
+  } else if (size == 16) {
+    for (n = 0; n < 8; ++n) printf("%.4x ", tmp.i16[n]);
+  } else if (size == 32) {
+    for (n = 0; n < 4; ++n) printf("%.8x ", tmp.i32[n]);
+  } else {
+    for (n = 0; n < 2; ++n) printf("%.16lx ", tmp.i64[n]);
+  }
+  printf("\n");
+}
 #endif

 //------------------------------------------------------------------------------
 // Compute susceptibility based on DCT-coeff histograms:
 // the higher, the "easier" the macroblock is to compress.

-static int CollectHistogramSSE2(const uint8_t* ref, const uint8_t* pred,
-                                int start_block, int end_block) {
-  int histo[MAX_COEFF_THRESH + 1] = { 0 };
-  int16_t out[16];
-  int j, k;
+static void CollectHistogramSSE2(const uint8_t* ref, const uint8_t* pred,
+                                 int start_block, int end_block,
+                                 VP8Histogram* const histo) {
  const __m128i max_coeff_thresh = _mm_set1_epi16(MAX_COEFF_THRESH);
+  int j;
  for (j = start_block; j < end_block; ++j) {
+    int16_t out[16];
+    int k;
+
    VP8FTransform(ref + VP8DspScan[j], pred + VP8DspScan[j], out);

    // Convert coefficients to bin (within out[]).
@ -44,9 +80,9 @@ static int CollectHistogramSSE2(const uint8_t* ref, const uint8_t* pred,
      const __m128i xor1 = _mm_xor_si128(out1, sign1);
      const __m128i abs0 = _mm_sub_epi16(xor0, sign0);
      const __m128i abs1 = _mm_sub_epi16(xor1, sign1);
-      // v = abs(out) >> 2
-      const __m128i v0 = _mm_srai_epi16(abs0, 2);
-      const __m128i v1 = _mm_srai_epi16(abs1, 2);
+      // v = abs(out) >> 3
+      const __m128i v0 = _mm_srai_epi16(abs0, 3);
+      const __m128i v1 = _mm_srai_epi16(abs1, 3);
      // bin = min(v, MAX_COEFF_THRESH)
      const __m128i bin0 = _mm_min_epi16(v0, max_coeff_thresh);
      const __m128i bin1 = _mm_min_epi16(v1, max_coeff_thresh);
@ -55,13 +91,11 @@ static int CollectHistogramSSE2(const uint8_t* ref, const uint8_t* pred,
      _mm_storeu_si128((__m128i*)&out[8], bin1);
    }

-    // Use bin to update histogram.
+    // Convert coefficients to bin.
    for (k = 0; k < 16; ++k) {
-      histo[out[k]]++;
+      histo->distribution[out[k]]++;
    }
  }
-
-  return VP8GetAlpha(histo);
 }

 //------------------------------------------------------------------------------
@ -240,7 +274,7 @@ static void ITransformSSE2(const uint8_t* ref, const int16_t* in, uint8_t* dst,

  // Add inverse transform to 'ref' and store.
  {
-    const __m128i zero = _mm_set1_epi16(0);
+    const __m128i zero = _mm_setzero_si128();
    // Load the reference(s).
    __m128i ref0, ref1, ref2, ref3;
    if (do_two) {
@ -292,17 +326,23 @@ static void FTransformSSE2(const uint8_t* src, const uint8_t* ref,
                           int16_t* out) {
  const __m128i zero = _mm_setzero_si128();
  const __m128i seven = _mm_set1_epi16(7);
-  const __m128i k7500 = _mm_set1_epi32(7500);
-  const __m128i k14500 = _mm_set1_epi32(14500);
+  const __m128i k937 = _mm_set1_epi32(937);
+  const __m128i k1812 = _mm_set1_epi32(1812);
  const __m128i k51000 = _mm_set1_epi32(51000);
  const __m128i k12000_plus_one = _mm_set1_epi32(12000 + (1 << 16));
  const __m128i k5352_2217 = _mm_set_epi16(5352,  2217, 5352,  2217,
                                           5352,  2217, 5352,  2217);
  const __m128i k2217_5352 = _mm_set_epi16(2217, -5352, 2217, -5352,
                                           2217, -5352, 2217, -5352);
-
+  const __m128i k88p = _mm_set_epi16(8, 8, 8, 8, 8, 8, 8, 8);
+  const __m128i k88m = _mm_set_epi16(-8, 8, -8, 8, -8, 8, -8, 8);
+  const __m128i k5352_2217p = _mm_set_epi16(2217, 5352, 2217, 5352,
+                                            2217, 5352, 2217, 5352);
+  const __m128i k5352_2217m = _mm_set_epi16(-5352, 2217, -5352, 2217,
+                                            -5352, 2217, -5352, 2217);
  __m128i v01, v32;

+
  // Difference between src and ref and initial transpose.
  {
    // Load src and convert to 16b.
@ -323,73 +363,52 @@ static void FTransformSSE2(const uint8_t* src, const uint8_t* ref,
    const __m128i ref_1 = _mm_unpacklo_epi8(ref1, zero);
    const __m128i ref_2 = _mm_unpacklo_epi8(ref2, zero);
    const __m128i ref_3 = _mm_unpacklo_epi8(ref3, zero);
-    // Compute difference.
+    // Compute difference. -> 00 01 02 03 00 00 00 00
    const __m128i diff0 = _mm_sub_epi16(src_0, ref_0);
    const __m128i diff1 = _mm_sub_epi16(src_1, ref_1);
    const __m128i diff2 = _mm_sub_epi16(src_2, ref_2);
    const __m128i diff3 = _mm_sub_epi16(src_3, ref_3);

-    // Transpose.
+
+    // Unpack and shuffle
    // 00 01 02 03   0 0 0 0
    // 10 11 12 13   0 0 0 0
    // 20 21 22 23   0 0 0 0
    // 30 31 32 33   0 0 0 0
-    const __m128i transpose0_0 = _mm_unpacklo_epi16(diff0, diff1);
-    const __m128i transpose0_1 = _mm_unpacklo_epi16(diff2, diff3);
-    // 00 10 01 11   02 12 03 13
-    // 20 30 21 31   22 32 23 33
-    const __m128i v23 = _mm_unpackhi_epi32(transpose0_0, transpose0_1);
-    v01 = _mm_unpacklo_epi32(transpose0_0, transpose0_1);
-    v32 = _mm_shuffle_epi32(v23, _MM_SHUFFLE(1, 0, 3, 2));
-    // a02 a12 a22 a32   a03 a13 a23 a33
-    // a00 a10 a20 a30   a01 a11 a21 a31
-    // a03 a13 a23 a33   a02 a12 a22 a32
-  }
+    const __m128i shuf01 = _mm_unpacklo_epi32(diff0, diff1);
+    const __m128i shuf23 = _mm_unpacklo_epi32(diff2, diff3);
+    // 00 01 10 11 02 03 12 13
+    // 20 21 30 31 22 23 32 33
+    const __m128i shuf01_p =
+        _mm_shufflehi_epi16(shuf01, _MM_SHUFFLE(2, 3, 0, 1));
+    const __m128i shuf23_p =
+        _mm_shufflehi_epi16(shuf23, _MM_SHUFFLE(2, 3, 0, 1));
+    // 00 01 10 11 03 02 13 12
+    // 20 21 30 31 23 22 33 32
+    const __m128i s01 = _mm_unpacklo_epi64(shuf01_p, shuf23_p);
+    const __m128i s32 = _mm_unpackhi_epi64(shuf01_p, shuf23_p);
+    // 00 01 10 11 20 21 30 31
+    // 03 02 13 12 23 22 33 32
+    const __m128i a01 = _mm_add_epi16(s01, s32);
+    const __m128i a32 = _mm_sub_epi16(s01, s32);
+    // [d0 + d3 | d1 + d2 | ...] = [a0 a1 | a0' a1' | ... ]
+    // [d0 - d3 | d1 - d2 | ...] = [a3 a2 | a3' a2' | ... ]

-  // First pass and subsequent transpose.
-  {
-    // Same operations are done on the (0,3) and (1,2) pairs.
-    // b0 = (a0 + a3) << 3
-    // b1 = (a1 + a2) << 3
-    // b3 = (a0 - a3) << 3
-    // b2 = (a1 - a2) << 3
-    const __m128i a01 = _mm_add_epi16(v01, v32);
-    const __m128i a32 = _mm_sub_epi16(v01, v32);
-    const __m128i b01 = _mm_slli_epi16(a01, 3);
-    const __m128i b32 = _mm_slli_epi16(a32, 3);
-    const __m128i b11 = _mm_unpackhi_epi64(b01, b01);
-    const __m128i b22 = _mm_unpackhi_epi64(b32, b32);
-
-    // e0 = b0 + b1
-    // e2 = b0 - b1
-    const __m128i e0 = _mm_add_epi16(b01, b11);
-    const __m128i e2 = _mm_sub_epi16(b01, b11);
-    const __m128i e02 = _mm_unpacklo_epi64(e0, e2);
-
-    // e1 = (b3 * 5352 + b2 * 2217 + 14500) >> 12
-    // e3 = (b3 * 2217 - b2 * 5352 +  7500) >> 12
-    const __m128i b23 = _mm_unpacklo_epi16(b22, b32);
-    const __m128i c1 = _mm_madd_epi16(b23, k5352_2217);
-    const __m128i c3 = _mm_madd_epi16(b23, k2217_5352);
-    const __m128i d1 = _mm_add_epi32(c1, k14500);
-    const __m128i d3 = _mm_add_epi32(c3, k7500);
-    const __m128i e1 = _mm_srai_epi32(d1, 12);
-    const __m128i e3 = _mm_srai_epi32(d3, 12);
-    const __m128i e13 = _mm_packs_epi32(e1, e3);
-
-    // Transpose.
-    // 00 01 02 03  20 21 22 23
-    // 10 11 12 13  30 31 32 33
-    const __m128i transpose0_0 = _mm_unpacklo_epi16(e02, e13);
-    const __m128i transpose0_1 = _mm_unpackhi_epi16(e02, e13);
-    // 00 10 01 11   02 12 03 13
-    // 20 30 21 31   22 32 23 33
-    const __m128i v23 = _mm_unpackhi_epi32(transpose0_0, transpose0_1);
-    v01 = _mm_unpacklo_epi32(transpose0_0, transpose0_1);
-    v32 = _mm_shuffle_epi32(v23, _MM_SHUFFLE(1, 0, 3, 2));
-    // 02 12 22 32   03 13 23 33
-    // 00 10 20 30   01 11 21 31
-    // 03 13 23 33   02 12 22 32
+    const __m128i tmp0 = _mm_madd_epi16(a01, k88p);  // [ (a0 + a1) << 3, ... ]
+    const __m128i tmp2 = _mm_madd_epi16(a01, k88m);  // [ (a0 - a1) << 3, ... ]
+    const __m128i tmp1_1 = _mm_madd_epi16(a32, k5352_2217p);
+    const __m128i tmp3_1 = _mm_madd_epi16(a32, k5352_2217m);
+    const __m128i tmp1_2 = _mm_add_epi32(tmp1_1, k1812);
+    const __m128i tmp3_2 = _mm_add_epi32(tmp3_1, k937);
+    const __m128i tmp1   = _mm_srai_epi32(tmp1_2, 9);
+    const __m128i tmp3   = _mm_srai_epi32(tmp3_2, 9);
+    const __m128i s03 = _mm_packs_epi32(tmp0, tmp2);
+    const __m128i s12 = _mm_packs_epi32(tmp1, tmp3);
+    const __m128i s_lo = _mm_unpacklo_epi16(s03, s12);   // 0 1 0 1 0 1...
+    const __m128i s_hi = _mm_unpackhi_epi16(s03, s12);   // 2 3 2 3 2 3
+    const __m128i v23 = _mm_unpackhi_epi32(s_lo, s_hi);
+    v01 = _mm_unpacklo_epi32(s_lo, s_hi);
+    v32 = _mm_shuffle_epi32(v23, _MM_SHUFFLE(1, 0, 3, 2));  // 3 2 3 2 3 2..
  }

  // Second pass
@ -403,13 +422,12 @@ static void FTransformSSE2(const uint8_t* src, const uint8_t* ref,
    const __m128i a32 = _mm_sub_epi16(v01, v32);
    const __m128i a11 = _mm_unpackhi_epi64(a01, a01);
    const __m128i a22 = _mm_unpackhi_epi64(a32, a32);
+    const __m128i a01_plus_7 = _mm_add_epi16(a01, seven);

    // d0 = (a0 + a1 + 7) >> 4;
    // d2 = (a0 - a1 + 7) >> 4;
-    const __m128i b0 = _mm_add_epi16(a01, a11);
-    const __m128i b2 = _mm_sub_epi16(a01, a11);
-    const __m128i c0 = _mm_add_epi16(b0, seven);
-    const __m128i c2 = _mm_add_epi16(b2, seven);
+    const __m128i c0 = _mm_add_epi16(a01_plus_7, a11);
+    const __m128i c2 = _mm_sub_epi16(a01_plus_7, a11);
    const __m128i d0 = _mm_srai_epi16(c0, 4);
    const __m128i d2 = _mm_srai_epi16(c2, 4);

@ -427,6 +445,7 @@ static void FTransformSSE2(const uint8_t* src, const uint8_t* ref,
    // f1 = f1 + (a3 != 0);
    // The compare will return (0xffff, 0) for (==0, !=0). To turn that into the
    // desired (0, 1), we add one earlier through k12000_plus_one.
+    // -> f1 = f1 + 1 - (a3 == 0)
    const __m128i g1 = _mm_add_epi16(f1, _mm_cmpeq_epi16(a32, zero));

    _mm_storel_epi64((__m128i*)&out[ 0], d0);
@ -436,13 +455,137 @@ static void FTransformSSE2(const uint8_t* src, const uint8_t* ref,
  }
 }

+static void FTransformWHTSSE2(const int16_t* in, int16_t* out) {
+  int16_t tmp[16];
+  int i;
+  for (i = 0; i < 4; ++i, in += 64) {
+    const int a0 = (in[0 * 16] + in[2 * 16]);
+    const int a1 = (in[1 * 16] + in[3 * 16]);
+    const int a2 = (in[1 * 16] - in[3 * 16]);
+    const int a3 = (in[0 * 16] - in[2 * 16]);
+    tmp[0 + i * 4] = a0 + a1;
+    tmp[1 + i * 4] = a3 + a2;
+    tmp[2 + i * 4] = a3 - a2;
+    tmp[3 + i * 4] = a0 - a1;
+  }
+  {
+    const __m128i src0 = _mm_loadl_epi64((__m128i*)&tmp[0]);
+    const __m128i src1 = _mm_loadl_epi64((__m128i*)&tmp[4]);
+    const __m128i src2 = _mm_loadl_epi64((__m128i*)&tmp[8]);
+    const __m128i src3 = _mm_loadl_epi64((__m128i*)&tmp[12]);
+    const __m128i a0 = _mm_add_epi16(src0, src2);
+    const __m128i a1 = _mm_add_epi16(src1, src3);
+    const __m128i a2 = _mm_sub_epi16(src1, src3);
+    const __m128i a3 = _mm_sub_epi16(src0, src2);
+    const __m128i b0 = _mm_srai_epi16(_mm_adds_epi16(a0, a1), 1);
+    const __m128i b1 = _mm_srai_epi16(_mm_adds_epi16(a3, a2), 1);
+    const __m128i b2 = _mm_srai_epi16(_mm_subs_epi16(a3, a2), 1);
+    const __m128i b3 = _mm_srai_epi16(_mm_subs_epi16(a0, a1), 1);
+    _mm_storel_epi64((__m128i*)&out[ 0], b0);
+    _mm_storel_epi64((__m128i*)&out[ 4], b1);
+    _mm_storel_epi64((__m128i*)&out[ 8], b2);
+    _mm_storel_epi64((__m128i*)&out[12], b3);
+  }
+}
+
 //------------------------------------------------------------------------------
 // Metric

-static int SSE4x4SSE2(const uint8_t* a, const uint8_t* b) {
-  const __m128i zero = _mm_set1_epi16(0);
+static int SSE_Nx4SSE2(const uint8_t* a, const uint8_t* b,
+                       int num_quads, int do_16) {
+  const __m128i zero = _mm_setzero_si128();
+  __m128i sum1 = zero;
+  __m128i sum2 = zero;

-  // Load values.
+  while (num_quads-- > 0) {
+    // Note: for the !do_16 case, we read 16 pixels instead of 8 but that's ok,
+    // thanks to buffer over-allocation to that effect.
+    const __m128i a0 = _mm_loadu_si128((__m128i*)&a[BPS * 0]);
+    const __m128i a1 = _mm_loadu_si128((__m128i*)&a[BPS * 1]);
+    const __m128i a2 = _mm_loadu_si128((__m128i*)&a[BPS * 2]);
+    const __m128i a3 = _mm_loadu_si128((__m128i*)&a[BPS * 3]);
+    const __m128i b0 = _mm_loadu_si128((__m128i*)&b[BPS * 0]);
+    const __m128i b1 = _mm_loadu_si128((__m128i*)&b[BPS * 1]);
+    const __m128i b2 = _mm_loadu_si128((__m128i*)&b[BPS * 2]);
+    const __m128i b3 = _mm_loadu_si128((__m128i*)&b[BPS * 3]);
+
+    // compute clip0(a-b) and clip0(b-a)
+    const __m128i a0p = _mm_subs_epu8(a0, b0);
+    const __m128i a0m = _mm_subs_epu8(b0, a0);
+    const __m128i a1p = _mm_subs_epu8(a1, b1);
+    const __m128i a1m = _mm_subs_epu8(b1, a1);
+    const __m128i a2p = _mm_subs_epu8(a2, b2);
+    const __m128i a2m = _mm_subs_epu8(b2, a2);
+    const __m128i a3p = _mm_subs_epu8(a3, b3);
+    const __m128i a3m = _mm_subs_epu8(b3, a3);
+
+    // compute |a-b| with 8b arithmetic as clip0(a-b) | clip0(b-a)
+    const __m128i diff0 = _mm_or_si128(a0p, a0m);
+    const __m128i diff1 = _mm_or_si128(a1p, a1m);
+    const __m128i diff2 = _mm_or_si128(a2p, a2m);
+    const __m128i diff3 = _mm_or_si128(a3p, a3m);
+
+    // unpack (only four operations, instead of eight)
+    const __m128i low0 = _mm_unpacklo_epi8(diff0, zero);
+    const __m128i low1 = _mm_unpacklo_epi8(diff1, zero);
+    const __m128i low2 = _mm_unpacklo_epi8(diff2, zero);
+    const __m128i low3 = _mm_unpacklo_epi8(diff3, zero);
+
+    // multiply with self
+    const __m128i low_madd0 = _mm_madd_epi16(low0, low0);
+    const __m128i low_madd1 = _mm_madd_epi16(low1, low1);
+    const __m128i low_madd2 = _mm_madd_epi16(low2, low2);
+    const __m128i low_madd3 = _mm_madd_epi16(low3, low3);
+
+    // collect in a cascading way
+    const __m128i low_sum0 = _mm_add_epi32(low_madd0, low_madd1);
+    const __m128i low_sum1 = _mm_add_epi32(low_madd2, low_madd3);
+    sum1 = _mm_add_epi32(sum1, low_sum0);
+    sum2 = _mm_add_epi32(sum2, low_sum1);
+
+    if (do_16) {  // if necessary, process the higher 8 bytes similarly
+      const __m128i hi0 = _mm_unpackhi_epi8(diff0, zero);
+      const __m128i hi1 = _mm_unpackhi_epi8(diff1, zero);
+      const __m128i hi2 = _mm_unpackhi_epi8(diff2, zero);
+      const __m128i hi3 = _mm_unpackhi_epi8(diff3, zero);
+
+      const __m128i hi_madd0 = _mm_madd_epi16(hi0, hi0);
+      const __m128i hi_madd1 = _mm_madd_epi16(hi1, hi1);
+      const __m128i hi_madd2 = _mm_madd_epi16(hi2, hi2);
+      const __m128i hi_madd3 = _mm_madd_epi16(hi3, hi3);
+      const __m128i hi_sum0 = _mm_add_epi32(hi_madd0, hi_madd1);
+      const __m128i hi_sum1 = _mm_add_epi32(hi_madd2, hi_madd3);
+      sum1 = _mm_add_epi32(sum1, hi_sum0);
+      sum2 = _mm_add_epi32(sum2, hi_sum1);
+    }
+    a += 4 * BPS;
+    b += 4 * BPS;
+  }
+  {
+    int32_t tmp[4];
+    const __m128i sum = _mm_add_epi32(sum1, sum2);
+    _mm_storeu_si128((__m128i*)tmp, sum);
+    return (tmp[3] + tmp[2] + tmp[1] + tmp[0]);
+  }
+}
+
+static int SSE16x16SSE2(const uint8_t* a, const uint8_t* b) {
+  return SSE_Nx4SSE2(a, b, 4, 1);
+}
+
+static int SSE16x8SSE2(const uint8_t* a, const uint8_t* b) {
+  return SSE_Nx4SSE2(a, b, 2, 1);
+}
+
+static int SSE8x8SSE2(const uint8_t* a, const uint8_t* b) {
+  return SSE_Nx4SSE2(a, b, 2, 0);
+}
+
+static int SSE4x4SSE2(const uint8_t* a, const uint8_t* b) {
+  const __m128i zero = _mm_setzero_si128();
+
+  // Load values. Note that we read 8 pixels instead of 4,
+  // but the a/b buffers are over-allocated to that effect.
  const __m128i a0 = _mm_loadl_epi64((__m128i*)&a[BPS * 0]);
  const __m128i a1 = _mm_loadl_epi64((__m128i*)&a[BPS * 1]);
  const __m128i a2 = _mm_loadl_epi64((__m128i*)&a[BPS * 2]);
@ -480,6 +623,7 @@ static int SSE4x4SSE2(const uint8_t* a, const uint8_t* b) {
  const __m128i sum0 = _mm_add_epi32(madd0, madd1);
  const __m128i sum1 = _mm_add_epi32(madd2, madd3);
  const __m128i sum2 = _mm_add_epi32(sum0, sum1);
+
  int32_t tmp[4];
  _mm_storeu_si128((__m128i*)tmp, sum2);
  return (tmp[3] + tmp[2] + tmp[1] + tmp[0]);
@ -499,8 +643,6 @@ static int TTransformSSE2(const uint8_t* inA, const uint8_t* inB,
  int32_t sum[4];
  __m128i tmp_0, tmp_1, tmp_2, tmp_3;
  const __m128i zero = _mm_setzero_si128();
-  const __m128i one = _mm_set1_epi16(1);
-  const __m128i three = _mm_set1_epi16(3);

  // Load, combine and tranpose inputs.
  {
@ -547,17 +689,14 @@ static int TTransformSSE2(const uint8_t* inA, const uint8_t* inB,
  // Horizontal pass and subsequent transpose.
  {
    // Calculate a and b (two 4x4 at once).
-    const __m128i a0 = _mm_slli_epi16(_mm_add_epi16(tmp_0, tmp_2), 2);
-    const __m128i a1 = _mm_slli_epi16(_mm_add_epi16(tmp_1, tmp_3), 2);
-    const __m128i a2 = _mm_slli_epi16(_mm_sub_epi16(tmp_1, tmp_3), 2);
-    const __m128i a3 = _mm_slli_epi16(_mm_sub_epi16(tmp_0, tmp_2), 2);
-    // b0_extra = (a0 != 0);
-    const __m128i b0_extra = _mm_andnot_si128(_mm_cmpeq_epi16 (a0, zero), one);
-    const __m128i b0_base = _mm_add_epi16(a0, a1);
+    const __m128i a0 = _mm_add_epi16(tmp_0, tmp_2);
+    const __m128i a1 = _mm_add_epi16(tmp_1, tmp_3);
+    const __m128i a2 = _mm_sub_epi16(tmp_1, tmp_3);
+    const __m128i a3 = _mm_sub_epi16(tmp_0, tmp_2);
+    const __m128i b0 = _mm_add_epi16(a0, a1);
    const __m128i b1 = _mm_add_epi16(a3, a2);
    const __m128i b2 = _mm_sub_epi16(a3, a2);
    const __m128i b3 = _mm_sub_epi16(a0, a1);
-    const __m128i b0 = _mm_add_epi16(b0_base, b0_extra);
    // a00 a01 a02 a03   b00 b01 b02 b03
    // a10 a11 a12 a13   b10 b11 b12 b13
    // a20 a21 a22 a23   b20 b21 b22 b23
@ -632,19 +771,6 @@ static int TTransformSSE2(const uint8_t* inA, const uint8_t* inB,
      B_b2 = _mm_sub_epi16(B_b2, sign_B_b2);
    }

-    // b = abs(b) + 3
-    A_b0 = _mm_add_epi16(A_b0, three);
-    A_b2 = _mm_add_epi16(A_b2, three);
-    B_b0 = _mm_add_epi16(B_b0, three);
-    B_b2 = _mm_add_epi16(B_b2, three);
-
-    // abs((b + (b<0) + 3) >> 3) = (abs(b) + 3) >> 3
-    // b = (abs(b) + 3) >> 3
-    A_b0 = _mm_srai_epi16(A_b0, 3);
-    A_b2 = _mm_srai_epi16(A_b2, 3);
-    B_b0 = _mm_srai_epi16(B_b0, 3);
-    B_b2 = _mm_srai_epi16(B_b2, 3);
-
    // weighted sums
    A_b0 = _mm_madd_epi16(A_b0, w_0);
    A_b2 = _mm_madd_epi16(A_b2, w_8);
@ -663,7 +789,7 @@ static int TTransformSSE2(const uint8_t* inA, const uint8_t* inB,
 static int Disto4x4SSE2(const uint8_t* const a, const uint8_t* const b,
                        const uint16_t* const w) {
  const int diff_sum = TTransformSSE2(a, b, w);
-  return (abs(diff_sum) + 8) >> 4;
+  return abs(diff_sum) >> 5;
 }

 static int Disto16x16SSE2(const uint8_t* const a, const uint8_t* const b,
@ -678,7 +804,6 @@ static int Disto16x16SSE2(const uint8_t* const a, const uint8_t* const b,
  return D;
 }

-
 //------------------------------------------------------------------------------
 // Quantization
 //
@ -686,9 +811,8 @@ static int Disto16x16SSE2(const uint8_t* const a, const uint8_t* const b,
 // Simple quantization
 static int QuantizeBlockSSE2(int16_t in[16], int16_t out[16],
                             int n, const VP8Matrix* const mtx) {
-  const __m128i max_coeff_2047 = _mm_set1_epi16(2047);
-  const __m128i zero = _mm_set1_epi16(0);
-  __m128i sign0, sign8;
+  const __m128i max_coeff_2047 = _mm_set1_epi16(MAX_LEVEL);
+  const __m128i zero = _mm_setzero_si128();
  __m128i coeff0, coeff8;
  __m128i out0, out8;
  __m128i packed_out;
@ -710,8 +834,8 @@ static int QuantizeBlockSSE2(int16_t in[16], int16_t out[16],
  const __m128i zthresh8 = _mm_loadu_si128((__m128i*)&mtx->zthresh_[8]);

  // sign(in) = in >> 15  (0x0000 if positive, 0xffff if negative)
-  sign0 = _mm_srai_epi16(in0, 15);
-  sign8 = _mm_srai_epi16(in8, 15);
+  const __m128i sign0 = _mm_srai_epi16(in0, 15);
+  const __m128i sign8 = _mm_srai_epi16(in8, 15);

  // coeff = abs(in) = (in ^ sign) - sign
  coeff0 = _mm_xor_si128(in0, sign0);
@ -723,10 +847,6 @@ static int QuantizeBlockSSE2(int16_t in[16], int16_t out[16],
  coeff0 = _mm_add_epi16(coeff0, sharpen0);
  coeff8 = _mm_add_epi16(coeff8, sharpen8);

-  // if (coeff > 2047) coeff = 2047
-  coeff0 = _mm_min_epi16(coeff0, max_coeff_2047);
-  coeff8 = _mm_min_epi16(coeff8, max_coeff_2047);
-
  // out = (coeff * iQ + B) >> QFIX;
  {
    // doing calculations with 32b precision (QFIX=17)
@ -754,9 +874,14 @@ static int QuantizeBlockSSE2(int16_t in[16], int16_t out[16],
    out_04 = _mm_srai_epi32(out_04, QFIX);
    out_08 = _mm_srai_epi32(out_08, QFIX);
    out_12 = _mm_srai_epi32(out_12, QFIX);
+
    // pack result as 16b
    out0 = _mm_packs_epi32(out_00, out_04);
    out8 = _mm_packs_epi32(out_08, out_12);
+
+    // if (coeff > 2047) coeff = 2047
+    out0 = _mm_min_epi16(out0, max_coeff_2047);
+    out8 = _mm_min_epi16(out8, max_coeff_2047);
  }

  // get sign back (if (sign[j]) out_n = -out_n)
@ -816,19 +941,29 @@ static int QuantizeBlockSSE2(int16_t in[16], int16_t out[16],
  }
 }

+#endif   // WEBP_USE_SSE2
+
+//------------------------------------------------------------------------------
+// Entry point
+
 extern void VP8EncDspInitSSE2(void);
+
 void VP8EncDspInitSSE2(void) {
+#if defined(WEBP_USE_SSE2)
  VP8CollectHistogram = CollectHistogramSSE2;
  VP8EncQuantizeBlock = QuantizeBlockSSE2;
  VP8ITransform = ITransformSSE2;
  VP8FTransform = FTransformSSE2;
+  VP8FTransformWHT = FTransformWHTSSE2;
+  VP8SSE16x16 = SSE16x16SSE2;
+  VP8SSE16x8 = SSE16x8SSE2;
+  VP8SSE8x8 = SSE8x8SSE2;
  VP8SSE4x4 = SSE4x4SSE2;
  VP8TDisto4x4 = Disto4x4SSE2;
  VP8TDisto16x16 = Disto16x16SSE2;
+#endif   // WEBP_USE_SSE2
 }

 #if defined(__cplusplus) || defined(c_plusplus)
 }    // extern "C"
 #endif
-
-#endif   //__SSE2__
--- a/src/dsp/lossless.c
+++ b/src/dsp/lossless.c
--- a/src/dsp/lossless.h
+++ b/src/dsp/lossless.h
@ -0,0 +1,104 @@
+// Copyright 2012 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// Image transforms and color space conversion methods for lossless decoder.
+//
+// Authors: Vikas Arora (vikaas.arora@gmail.com)
+//          Jyrki Alakuijala (jyrki@google.com)
+
+#ifndef WEBP_DSP_LOSSLESS_H_
+#define WEBP_DSP_LOSSLESS_H_
+
+#include "../webp/types.h"
+#include "../webp/decode.h"
+
+#if defined(__cplusplus) || defined(c_plusplus)
+extern "C" {
+#endif
+
+//------------------------------------------------------------------------------
+// Image transforms.
+
+struct VP8LTransform;  // Defined in dec/vp8li.h.
+
+// Performs inverse transform of data given transform information, start and end
+// rows. Transform will be applied to rows [row_start, row_end[.
+// The *in and *out pointers refer to source and destination data respectively
+// corresponding to the intermediate row (row_start).
+void VP8LInverseTransform(const struct VP8LTransform* const transform,
+                          int row_start, int row_end,
+                          const uint32_t* const in, uint32_t* const out);
+
+// Similar to the static method ColorIndexInverseTransform() that is part of
+// lossless.c, but used only for alpha decoding. It takes uint8_t (rather than
+// uint32_t) arguments for 'src' and 'dst'.
+void VP8LColorIndexInverseTransformAlpha(
+    const struct VP8LTransform* const transform, int y_start, int y_end,
+    const uint8_t* src, uint8_t* dst);
+
+// Subtracts green from blue and red channels.
+void VP8LSubtractGreenFromBlueAndRed(uint32_t* argb_data, int num_pixs);
+
+void VP8LResidualImage(int width, int height, int bits,
+                       uint32_t* const argb, uint32_t* const argb_scratch,
+                       uint32_t* const image);
+
+void VP8LColorSpaceTransform(int width, int height, int bits, int step,
+                             uint32_t* const argb, uint32_t* image);
+
+//------------------------------------------------------------------------------
+// Color space conversion.
+
+// Converts from BGRA to other color spaces.
+void VP8LConvertFromBGRA(const uint32_t* const in_data, int num_pixels,
+                         WEBP_CSP_MODE out_colorspace, uint8_t* const rgba);
+
+//------------------------------------------------------------------------------
+// Misc methods.
+
+// Computes sampled size of 'size' when sampling using 'sampling bits'.
+static WEBP_INLINE uint32_t VP8LSubSampleSize(uint32_t size,
+                                              uint32_t sampling_bits) {
+  return (size + (1 << sampling_bits) - 1) >> sampling_bits;
+}
+
+// Faster logarithm for integers. Small values use a look-up table.
+#define LOG_LOOKUP_IDX_MAX 256
+extern const float kLog2Table[LOG_LOOKUP_IDX_MAX];
+extern const float kSLog2Table[LOG_LOOKUP_IDX_MAX];
+extern float VP8LFastLog2Slow(int v);
+extern float VP8LFastSLog2Slow(int v);
+static WEBP_INLINE float VP8LFastLog2(int v) {
+  return (v < LOG_LOOKUP_IDX_MAX) ? kLog2Table[v] : VP8LFastLog2Slow(v);
+}
+// Fast calculation of v * log2(v) for integer input.
+static WEBP_INLINE float VP8LFastSLog2(int v) {
+  return (v < LOG_LOOKUP_IDX_MAX) ? kSLog2Table[v] : VP8LFastSLog2Slow(v);
+}
+
+
+// In-place difference of each component with mod 256.
+static WEBP_INLINE uint32_t VP8LSubPixels(uint32_t a, uint32_t b) {
+  const uint32_t alpha_and_green =
+      0x00ff00ffu + (a & 0xff00ff00u) - (b & 0xff00ff00u);
+  const uint32_t red_and_blue =
+      0xff00ff00u + (a & 0x00ff00ffu) - (b & 0x00ff00ffu);
+  return (alpha_and_green & 0xff00ff00u) | (red_and_blue & 0x00ff00ffu);
+}
+
+void VP8LBundleColorMap(const uint8_t* const row, int width,
+                        int xbits, uint32_t* const dst);
+
+//------------------------------------------------------------------------------
+
+#if defined(__cplusplus) || defined(c_plusplus)
+}    // extern "C"
+#endif
+
+#endif  // WEBP_DSP_LOSSLESS_H_
--- a/src/dsp/upsampling.c
+++ b/src/dsp/upsampling.c
@ -1,8 +1,10 @@
-// Copyright 2011 Google Inc.
+// Copyright 2011 Google Inc. All Rights Reserved.
 //
-// This code is licensed under the same terms as WebM:
-//  Software License Agreement:  http://www.webmproject.org/license/software/
-//  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
 // -----------------------------------------------------------------------------
 //
 // YUV to RGB upsampling functions.
@ -11,7 +13,6 @@

 #include "./dsp.h"
 #include "./yuv.h"
-#include "../dec/webpi.h"

 #if defined(__cplusplus) || defined(c_plusplus)
 extern "C" {
@ -24,7 +25,6 @@ extern "C" {

 // Fancy upsampling functions to convert YUV to RGB
 WebPUpsampleLinePairFunc WebPUpsamplers[MODE_LAST];
-WebPUpsampleLinePairFunc WebPUpsamplersKeepAlpha[MODE_LAST];

 // Given samples laid out in a square as:
 //  [a b]
@ -34,7 +34,7 @@ WebPUpsampleLinePairFunc WebPUpsamplersKeepAlpha[MODE_LAST];
 //  ([3*a +   b + 9*c + 3*d      a + 3*b + 3*c + 9*d]   [8 8]) / 16

 // We process u and v together stashed into 32bit (16bit each).
-#define LOAD_UV(u,v) ((u) | ((v) << 16))
+#define LOAD_UV(u, v) ((u) | ((v) << 16))

 #define UPSAMPLE_FUNC(FUNC_NAME, FUNC, XSTEP)                                  \
 static void FUNC_NAME(const uint8_t* top_y, const uint8_t* bottom_y,           \
@ -101,11 +101,6 @@ UPSAMPLE_FUNC(UpsampleBgraLinePair, VP8YuvToBgra, 4)
 UPSAMPLE_FUNC(UpsampleArgbLinePair, VP8YuvToArgb, 4)
 UPSAMPLE_FUNC(UpsampleRgba4444LinePair, VP8YuvToRgba4444, 2)
 UPSAMPLE_FUNC(UpsampleRgb565LinePair,  VP8YuvToRgb565,  2)
-// These two don't erase the alpha value
-UPSAMPLE_FUNC(UpsampleRgbKeepAlphaLinePair, VP8YuvToRgb, 4)
-UPSAMPLE_FUNC(UpsampleBgrKeepAlphaLinePair, VP8YuvToBgr, 4)
-UPSAMPLE_FUNC(UpsampleArgbKeepAlphaLinePair, VP8YuvToArgbKeepA, 4)
-UPSAMPLE_FUNC(UpsampleRgba4444KeepAlphaLinePair, VP8YuvToRgba4444KeepA, 2)

 #undef LOAD_UV
 #undef UPSAMPLE_FUNC
@ -156,9 +151,55 @@ const WebPSampleLinePairFunc WebPSamplers[MODE_LAST] = {
  SampleBgraLinePair,      // MODE_BGRA
  SampleArgbLinePair,      // MODE_ARGB
  SampleRgba4444LinePair,  // MODE_RGBA_4444
-  SampleRgb565LinePair     // MODE_RGB_565
+  SampleRgb565LinePair,    // MODE_RGB_565
+  SampleRgbaLinePair,      // MODE_rgbA
+  SampleBgraLinePair,      // MODE_bgrA
+  SampleArgbLinePair,      // MODE_Argb
+  SampleRgba4444LinePair   // MODE_rgbA_4444
 };

+//------------------------------------------------------------------------------
+
+#if !defined(FANCY_UPSAMPLING)
+#define DUAL_SAMPLE_FUNC(FUNC_NAME, FUNC)                                      \
+static void FUNC_NAME(const uint8_t* top_y, const uint8_t* bot_y,              \
+                      const uint8_t* top_u, const uint8_t* top_v,              \
+                      const uint8_t* bot_u, const uint8_t* bot_v,              \
+                      uint8_t* top_dst, uint8_t* bot_dst, int len) {           \
+  const int half_len = len >> 1;                                               \
+  int x;                                                                       \
+  if (top_dst != NULL) {                                                       \
+    for (x = 0; x < half_len; ++x) {                                           \
+      FUNC(top_y[2 * x + 0], top_u[x], top_v[x], top_dst + 8 * x + 0);         \
+      FUNC(top_y[2 * x + 1], top_u[x], top_v[x], top_dst + 8 * x + 4);         \
+    }                                                                          \
+    if (len & 1) FUNC(top_y[2 * x + 0], top_u[x], top_v[x], top_dst + 8 * x);  \
+  }                                                                            \
+  if (bot_dst != NULL) {                                                       \
+    for (x = 0; x < half_len; ++x) {                                           \
+      FUNC(bot_y[2 * x + 0], bot_u[x], bot_v[x], bot_dst + 8 * x + 0);         \
+      FUNC(bot_y[2 * x + 1], bot_u[x], bot_v[x], bot_dst + 8 * x + 4);         \
+    }                                                                          \
+    if (len & 1) FUNC(bot_y[2 * x + 0], bot_u[x], bot_v[x], bot_dst + 8 * x);  \
+  }                                                                            \
+}
+
+DUAL_SAMPLE_FUNC(DualLineSamplerBGRA, VP8YuvToBgra)
+DUAL_SAMPLE_FUNC(DualLineSamplerARGB, VP8YuvToArgb)
+#undef DUAL_SAMPLE_FUNC
+
+#endif  // !FANCY_UPSAMPLING
+
+WebPUpsampleLinePairFunc WebPGetLinePairConverter(int alpha_is_last) {
+  WebPInitUpsamplers();
+  VP8YUVInit();
+#ifdef FANCY_UPSAMPLING
+  return WebPUpsamplers[alpha_is_last ? MODE_BGRA : MODE_ARGB];
+#else
+  return (alpha_is_last ? DualLineSamplerBGRA : DualLineSamplerARGB);
+#endif
+}
+
 //------------------------------------------------------------------------------
 // YUV444 converter

@ -186,9 +227,89 @@ const WebPYUV444Converter WebPYUV444Converters[MODE_LAST] = {
  Yuv444ToBgra,      // MODE_BGRA
  Yuv444ToArgb,      // MODE_ARGB
  Yuv444ToRgba4444,  // MODE_RGBA_4444
-  Yuv444ToRgb565     // MODE_RGB_565
+  Yuv444ToRgb565,    // MODE_RGB_565
+  Yuv444ToRgba,      // MODE_rgbA
+  Yuv444ToBgra,      // MODE_bgrA
+  Yuv444ToArgb,      // MODE_Argb
+  Yuv444ToRgba4444   // MODE_rgbA_4444
 };

+//------------------------------------------------------------------------------
+// Premultiplied modes
+
+// non dithered-modes
+
+// (x * a * 32897) >> 23 is bit-wise equivalent to (int)(x * a / 255.)
+// for all 8bit x or a. For bit-wise equivalence to (int)(x * a / 255. + .5),
+// one can use instead: (x * a * 65793 + (1 << 23)) >> 24
+#if 1     // (int)(x * a / 255.)
+#define MULTIPLIER(a)   ((a) * 32897UL)
+#define PREMULTIPLY(x, m) (((x) * (m)) >> 23)
+#else     // (int)(x * a / 255. + .5)
+#define MULTIPLIER(a) ((a) * 65793UL)
+#define PREMULTIPLY(x, m) (((x) * (m) + (1UL << 23)) >> 24)
+#endif
+
+static void ApplyAlphaMultiply(uint8_t* rgba, int alpha_first,
+                               int w, int h, int stride) {
+  while (h-- > 0) {
+    uint8_t* const rgb = rgba + (alpha_first ? 1 : 0);
+    const uint8_t* const alpha = rgba + (alpha_first ? 0 : 3);
+    int i;
+    for (i = 0; i < w; ++i) {
+      const uint32_t a = alpha[4 * i];
+      if (a != 0xff) {
+        const uint32_t mult = MULTIPLIER(a);
+        rgb[4 * i + 0] = PREMULTIPLY(rgb[4 * i + 0], mult);
+        rgb[4 * i + 1] = PREMULTIPLY(rgb[4 * i + 1], mult);
+        rgb[4 * i + 2] = PREMULTIPLY(rgb[4 * i + 2], mult);
+      }
+    }
+    rgba += stride;
+  }
+}
+#undef MULTIPLIER
+#undef PREMULTIPLY
+
+// rgbA4444
+
+#define MULTIPLIER(a)  ((a) * 0x1111)    // 0x1111 ~= (1 << 16) / 15
+
+static WEBP_INLINE uint8_t dither_hi(uint8_t x) {
+  return (x & 0xf0) | (x >> 4);
+}
+
+static WEBP_INLINE uint8_t dither_lo(uint8_t x) {
+  return (x & 0x0f) | (x << 4);
+}
+
+static WEBP_INLINE uint8_t multiply(uint8_t x, uint32_t m) {
+  return (x * m) >> 16;
+}
+
+static void ApplyAlphaMultiply4444(uint8_t* rgba4444,
+                                   int w, int h, int stride) {
+  while (h-- > 0) {
+    int i;
+    for (i = 0; i < w; ++i) {
+      const uint8_t a = (rgba4444[2 * i + 1] & 0x0f);
+      const uint32_t mult = MULTIPLIER(a);
+      const uint8_t r = multiply(dither_hi(rgba4444[2 * i + 0]), mult);
+      const uint8_t g = multiply(dither_lo(rgba4444[2 * i + 0]), mult);
+      const uint8_t b = multiply(dither_hi(rgba4444[2 * i + 1]), mult);
+      rgba4444[2 * i + 0] = (r & 0xf0) | ((g >> 4) & 0x0f);
+      rgba4444[2 * i + 1] = (b & 0xf0) | a;
+    }
+    rgba4444 += stride;
+  }
+}
+#undef MULTIPLIER
+
+void (*WebPApplyAlphaMultiply)(uint8_t*, int, int, int, int)
+    = ApplyAlphaMultiply;
+void (*WebPApplyAlphaMultiply4444)(uint8_t*, int, int, int)
+    = ApplyAlphaMultiply4444;
+
 //------------------------------------------------------------------------------
 // Main call

@ -202,20 +323,42 @@ void WebPInitUpsamplers(void) {
  WebPUpsamplers[MODE_RGBA_4444] = UpsampleRgba4444LinePair;
  WebPUpsamplers[MODE_RGB_565]   = UpsampleRgb565LinePair;

-  WebPUpsamplersKeepAlpha[MODE_RGB]       = UpsampleRgbLinePair;
-  WebPUpsamplersKeepAlpha[MODE_RGBA]      = UpsampleRgbKeepAlphaLinePair;
-  WebPUpsamplersKeepAlpha[MODE_BGR]       = UpsampleBgrLinePair;
-  WebPUpsamplersKeepAlpha[MODE_BGRA]      = UpsampleBgrKeepAlphaLinePair;
-  WebPUpsamplersKeepAlpha[MODE_ARGB]      = UpsampleArgbKeepAlphaLinePair;
-  WebPUpsamplersKeepAlpha[MODE_RGBA_4444] = UpsampleRgba4444KeepAlphaLinePair;
-  WebPUpsamplersKeepAlpha[MODE_RGB_565]   = UpsampleRgb565LinePair;
-
  // If defined, use CPUInfo() to overwrite some pointers with faster versions.
-  if (VP8GetCPUInfo) {
-#if defined(__SSE2__) || defined(_MSC_VER)
+  if (VP8GetCPUInfo != NULL) {
+#if defined(WEBP_USE_SSE2)
    if (VP8GetCPUInfo(kSSE2)) {
      WebPInitUpsamplersSSE2();
    }
+#endif
+#if defined(WEBP_USE_NEON)
+    if (VP8GetCPUInfo(kNEON)) {
+      WebPInitUpsamplersNEON();
+    }
+#endif
+  }
+#endif  // FANCY_UPSAMPLING
+}
+
+void WebPInitPremultiply(void) {
+  WebPApplyAlphaMultiply = ApplyAlphaMultiply;
+  WebPApplyAlphaMultiply4444 = ApplyAlphaMultiply4444;
+
+#ifdef FANCY_UPSAMPLING
+  WebPUpsamplers[MODE_rgbA]      = UpsampleRgbaLinePair;
+  WebPUpsamplers[MODE_bgrA]      = UpsampleBgraLinePair;
+  WebPUpsamplers[MODE_Argb]      = UpsampleArgbLinePair;
+  WebPUpsamplers[MODE_rgbA_4444] = UpsampleRgba4444LinePair;
+
+  if (VP8GetCPUInfo != NULL) {
+#if defined(WEBP_USE_SSE2)
+    if (VP8GetCPUInfo(kSSE2)) {
+      WebPInitPremultiplySSE2();
+    }
+#endif
+#if defined(WEBP_USE_NEON)
+    if (VP8GetCPUInfo(kNEON)) {
+      WebPInitPremultiplyNEON();
+    }
 #endif
  }
 #endif  // FANCY_UPSAMPLING
--- a/src/dsp/upsampling_neon.c
+++ b/src/dsp/upsampling_neon.c
@ -0,0 +1,294 @@
+// Copyright 2011 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// NEON version of YUV to RGB upsampling functions.
+//
+// Author: mans@mansr.com (Mans Rullgard)
+// Based on SSE code by: somnath@google.com (Somnath Banerjee)
+
+#include "./dsp.h"
+
+#if defined(__cplusplus) || defined(c_plusplus)
+extern "C" {
+#endif
+
+#if defined(WEBP_USE_NEON)
+
+#include <assert.h>
+#include <arm_neon.h>
+#include <string.h>
+#include "./yuv.h"
+
+#ifdef FANCY_UPSAMPLING
+
+// Loads 9 pixels each from rows r1 and r2 and generates 16 pixels.
+#define UPSAMPLE_16PIXELS(r1, r2, out) {                                \
+  uint8x8_t a = vld1_u8(r1);                                            \
+  uint8x8_t b = vld1_u8(r1 + 1);                                        \
+  uint8x8_t c = vld1_u8(r2);                                            \
+  uint8x8_t d = vld1_u8(r2 + 1);                                        \
+                                                                        \
+  uint16x8_t al = vshll_n_u8(a, 1);                                     \
+  uint16x8_t bl = vshll_n_u8(b, 1);                                     \
+  uint16x8_t cl = vshll_n_u8(c, 1);                                     \
+  uint16x8_t dl = vshll_n_u8(d, 1);                                     \
+                                                                        \
+  uint8x8_t diag1, diag2;                                               \
+  uint16x8_t sl;                                                        \
+                                                                        \
+  /* a + b + c + d */                                                   \
+  sl = vaddl_u8(a,  b);                                                 \
+  sl = vaddw_u8(sl, c);                                                 \
+  sl = vaddw_u8(sl, d);                                                 \
+                                                                        \
+  al = vaddq_u16(sl, al); /* 3a +  b +  c +  d */                       \
+  bl = vaddq_u16(sl, bl); /*  a + 3b +  c +  d */                       \
+                                                                        \
+  al = vaddq_u16(al, dl); /* 3a +  b +  c + 3d */                       \
+  bl = vaddq_u16(bl, cl); /*  a + 3b + 3c +  d */                       \
+                                                                        \
+  diag2 = vshrn_n_u16(al, 3);                                           \
+  diag1 = vshrn_n_u16(bl, 3);                                           \
+                                                                        \
+  a = vrhadd_u8(a, diag1);                                              \
+  b = vrhadd_u8(b, diag2);                                              \
+  c = vrhadd_u8(c, diag2);                                              \
+  d = vrhadd_u8(d, diag1);                                              \
+                                                                        \
+  {                                                                     \
+    const uint8x8x2_t a_b = {{ a, b }};                                 \
+    const uint8x8x2_t c_d = {{ c, d }};                                 \
+    vst2_u8(out,      a_b);                                             \
+    vst2_u8(out + 32, c_d);                                             \
+  }                                                                     \
+}
+
+// Turn the macro into a function for reducing code-size when non-critical
+static void Upsample16Pixels(const uint8_t *r1, const uint8_t *r2,
+                             uint8_t *out) {
+  UPSAMPLE_16PIXELS(r1, r2, out);
+}
+
+#define UPSAMPLE_LAST_BLOCK(tb, bb, num_pixels, out) {                  \
+  uint8_t r1[9], r2[9];                                                 \
+  memcpy(r1, (tb), (num_pixels));                                       \
+  memcpy(r2, (bb), (num_pixels));                                       \
+  /* replicate last byte */                                             \
+  memset(r1 + (num_pixels), r1[(num_pixels) - 1], 9 - (num_pixels));    \
+  memset(r2 + (num_pixels), r2[(num_pixels) - 1], 9 - (num_pixels));    \
+  Upsample16Pixels(r1, r2, out);                                        \
+}
+
+#define CY  76283
+#define CVR 89858
+#define CUG 22014
+#define CVG 45773
+#define CUB 113618
+
+static const int16_t coef[4] = { CVR / 4, CUG, CVG / 2, CUB / 4 };
+
+#define CONVERT8(FMT, XSTEP, N, src_y, src_uv, out, cur_x) {            \
+  int i;                                                                \
+  for (i = 0; i < N; i += 8) {                                          \
+    int off = ((cur_x) + i) * XSTEP;                                    \
+    uint8x8_t y  = vld1_u8(src_y + (cur_x)  + i);                       \
+    uint8x8_t u  = vld1_u8((src_uv) + i);                               \
+    uint8x8_t v  = vld1_u8((src_uv) + i + 16);                          \
+    int16x8_t yy = vreinterpretq_s16_u16(vsubl_u8(y, u16));             \
+    int16x8_t uu = vreinterpretq_s16_u16(vsubl_u8(u, u128));            \
+    int16x8_t vv = vreinterpretq_s16_u16(vsubl_u8(v, u128));            \
+                                                                        \
+    int16x8_t ud = vshlq_n_s16(uu, 1);                                  \
+    int16x8_t vd = vshlq_n_s16(vv, 1);                                  \
+                                                                        \
+    int32x4_t vrl = vqdmlal_lane_s16(vshll_n_s16(vget_low_s16(vv), 1),  \
+                                     vget_low_s16(vd),  cf16, 0);       \
+    int32x4_t vrh = vqdmlal_lane_s16(vshll_n_s16(vget_high_s16(vv), 1), \
+                                     vget_high_s16(vd), cf16, 0);       \
+    int16x8_t vr = vcombine_s16(vrshrn_n_s32(vrl, 16),                  \
+                                vrshrn_n_s32(vrh, 16));                 \
+                                                                        \
+    int32x4_t vl = vmovl_s16(vget_low_s16(vv));                         \
+    int32x4_t vh = vmovl_s16(vget_high_s16(vv));                        \
+    int32x4_t ugl = vmlal_lane_s16(vl, vget_low_s16(uu),  cf16, 1);     \
+    int32x4_t ugh = vmlal_lane_s16(vh, vget_high_s16(uu), cf16, 1);     \
+    int32x4_t gcl = vqdmlal_lane_s16(ugl, vget_low_s16(vv),  cf16, 2);  \
+    int32x4_t gch = vqdmlal_lane_s16(ugh, vget_high_s16(vv), cf16, 2);  \
+    int16x8_t gc = vcombine_s16(vrshrn_n_s32(gcl, 16),                  \
+                                vrshrn_n_s32(gch, 16));                 \
+                                                                        \
+    int32x4_t ubl = vqdmlal_lane_s16(vshll_n_s16(vget_low_s16(uu), 1),  \
+                                     vget_low_s16(ud),  cf16, 3);       \
+    int32x4_t ubh = vqdmlal_lane_s16(vshll_n_s16(vget_high_s16(uu), 1), \
+                                     vget_high_s16(ud), cf16, 3);       \
+    int16x8_t ub = vcombine_s16(vrshrn_n_s32(ubl, 16),                  \
+                                vrshrn_n_s32(ubh, 16));                 \
+                                                                        \
+    int32x4_t rl = vaddl_s16(vget_low_s16(yy),  vget_low_s16(vr));      \
+    int32x4_t rh = vaddl_s16(vget_high_s16(yy), vget_high_s16(vr));     \
+    int32x4_t gl = vsubl_s16(vget_low_s16(yy),  vget_low_s16(gc));      \
+    int32x4_t gh = vsubl_s16(vget_high_s16(yy), vget_high_s16(gc));     \
+    int32x4_t bl = vaddl_s16(vget_low_s16(yy),  vget_low_s16(ub));      \
+    int32x4_t bh = vaddl_s16(vget_high_s16(yy), vget_high_s16(ub));     \
+                                                                        \
+    rl = vmulq_lane_s32(rl, cf32, 0);                                   \
+    rh = vmulq_lane_s32(rh, cf32, 0);                                   \
+    gl = vmulq_lane_s32(gl, cf32, 0);                                   \
+    gh = vmulq_lane_s32(gh, cf32, 0);                                   \
+    bl = vmulq_lane_s32(bl, cf32, 0);                                   \
+    bh = vmulq_lane_s32(bh, cf32, 0);                                   \
+                                                                        \
+    y = vqmovun_s16(vcombine_s16(vrshrn_n_s32(rl, 16),                  \
+                                 vrshrn_n_s32(rh, 16)));                \
+    u = vqmovun_s16(vcombine_s16(vrshrn_n_s32(gl, 16),                  \
+                                 vrshrn_n_s32(gh, 16)));                \
+    v = vqmovun_s16(vcombine_s16(vrshrn_n_s32(bl, 16),                  \
+                                 vrshrn_n_s32(bh, 16)));                \
+    STR_ ## FMT(out + off, y, u, v);                                    \
+  }                                                                     \
+}
+
+#define v255 vmov_n_u8(255)
+
+#define STR_Rgb(out, r, g, b) do {                                      \
+  const uint8x8x3_t r_g_b = {{ r, g, b }};                              \
+  vst3_u8(out, r_g_b);                                                  \
+} while (0)
+
+#define STR_Bgr(out, r, g, b) do {                                      \
+  const uint8x8x3_t b_g_r = {{ b, g, r }};                              \
+  vst3_u8(out, b_g_r);                                                  \
+} while (0)
+
+#define STR_Rgba(out, r, g, b) do {                                     \
+  const uint8x8x4_t r_g_b_v255 = {{ r, g, b, v255 }};                   \
+  vst4_u8(out, r_g_b_v255);                                             \
+} while (0)
+
+#define STR_Bgra(out, r, g, b) do {                                     \
+  const uint8x8x4_t b_g_r_v255 = {{ b, g, r, v255 }};                   \
+  vst4_u8(out, b_g_r_v255);                                             \
+} while (0)
+
+#define CONVERT1(FMT, XSTEP, N, src_y, src_uv, rgb, cur_x) {            \
+  int i;                                                                \
+  for (i = 0; i < N; i++) {                                             \
+    int off = ((cur_x) + i) * XSTEP;                                    \
+    int y = src_y[(cur_x) + i];                                         \
+    int u = (src_uv)[i];                                                \
+    int v = (src_uv)[i + 16];                                           \
+    VP8YuvTo ## FMT(y, u, v, rgb + off);                                \
+  }                                                                     \
+}
+
+#define CONVERT2RGB_8(FMT, XSTEP, top_y, bottom_y, uv,                  \
+                      top_dst, bottom_dst, cur_x, len) {                \
+  if (top_y) {                                                          \
+    CONVERT8(FMT, XSTEP, len, top_y, uv, top_dst, cur_x)                \
+  }                                                                     \
+  if (bottom_y) {                                                       \
+    CONVERT8(FMT, XSTEP, len, bottom_y, (uv) + 32, bottom_dst, cur_x)   \
+  }                                                                     \
+}
+
+#define CONVERT2RGB_1(FMT, XSTEP, top_y, bottom_y, uv,                  \
+                      top_dst, bottom_dst, cur_x, len) {                \
+  if (top_y) {                                                          \
+    CONVERT1(FMT, XSTEP, len, top_y, uv, top_dst, cur_x);               \
+  }                                                                     \
+  if (bottom_y) {                                                       \
+    CONVERT1(FMT, XSTEP, len, bottom_y, (uv) + 32, bottom_dst, cur_x);  \
+  }                                                                     \
+}
+
+#define NEON_UPSAMPLE_FUNC(FUNC_NAME, FMT, XSTEP)                       \
+static void FUNC_NAME(const uint8_t *top_y, const uint8_t *bottom_y,    \
+                      const uint8_t *top_u, const uint8_t *top_v,       \
+                      const uint8_t *cur_u, const uint8_t *cur_v,       \
+                      uint8_t *top_dst, uint8_t *bottom_dst, int len) { \
+  int block;                                                            \
+  /* 16 byte aligned array to cache reconstructed u and v */            \
+  uint8_t uv_buf[2 * 32 + 15];                                          \
+  uint8_t *const r_uv = (uint8_t*)((uintptr_t)(uv_buf + 15) & ~15);     \
+  const int uv_len = (len + 1) >> 1;                                    \
+  /* 9 pixels must be read-able for each block */                       \
+  const int num_blocks = (uv_len - 1) >> 3;                             \
+  const int leftover = uv_len - num_blocks * 8;                         \
+  const int last_pos = 1 + 16 * num_blocks;                             \
+                                                                        \
+  const int u_diag = ((top_u[0] + cur_u[0]) >> 1) + 1;                  \
+  const int v_diag = ((top_v[0] + cur_v[0]) >> 1) + 1;                  \
+                                                                        \
+  const int16x4_t cf16 = vld1_s16(coef);                                \
+  const int32x2_t cf32 = vmov_n_s32(CY);                                \
+  const uint8x8_t u16  = vmov_n_u8(16);                                 \
+  const uint8x8_t u128 = vmov_n_u8(128);                                \
+                                                                        \
+  /* Treat the first pixel in regular way */                            \
+  if (top_y) {                                                          \
+    const int u0 = (top_u[0] + u_diag) >> 1;                            \
+    const int v0 = (top_v[0] + v_diag) >> 1;                            \
+    VP8YuvTo ## FMT(top_y[0], u0, v0, top_dst);                         \
+  }                                                                     \
+  if (bottom_y) {                                                       \
+    const int u0 = (cur_u[0] + u_diag) >> 1;                            \
+    const int v0 = (cur_v[0] + v_diag) >> 1;                            \
+    VP8YuvTo ## FMT(bottom_y[0], u0, v0, bottom_dst);                   \
+  }                                                                     \
+                                                                        \
+  for (block = 0; block < num_blocks; ++block) {                        \
+    UPSAMPLE_16PIXELS(top_u, cur_u, r_uv);                              \
+    UPSAMPLE_16PIXELS(top_v, cur_v, r_uv + 16);                         \
+    CONVERT2RGB_8(FMT, XSTEP, top_y, bottom_y, r_uv,                    \
+                  top_dst, bottom_dst, 16 * block + 1, 16);             \
+    top_u += 8;                                                         \
+    cur_u += 8;                                                         \
+    top_v += 8;                                                         \
+    cur_v += 8;                                                         \
+  }                                                                     \
+                                                                        \
+  UPSAMPLE_LAST_BLOCK(top_u, cur_u, leftover, r_uv);                    \
+  UPSAMPLE_LAST_BLOCK(top_v, cur_v, leftover, r_uv + 16);               \
+  CONVERT2RGB_1(FMT, XSTEP, top_y, bottom_y, r_uv,                      \
+                top_dst, bottom_dst, last_pos, len - last_pos);         \
+}
+
+// NEON variants of the fancy upsampler.
+NEON_UPSAMPLE_FUNC(UpsampleRgbLinePairNEON,  Rgb,  3)
+NEON_UPSAMPLE_FUNC(UpsampleBgrLinePairNEON,  Bgr,  3)
+NEON_UPSAMPLE_FUNC(UpsampleRgbaLinePairNEON, Rgba, 4)
+NEON_UPSAMPLE_FUNC(UpsampleBgraLinePairNEON, Bgra, 4)
+
+#endif  // FANCY_UPSAMPLING
+
+#endif   // WEBP_USE_NEON
+
+//------------------------------------------------------------------------------
+
+extern WebPUpsampleLinePairFunc WebPUpsamplers[/* MODE_LAST */];
+
+void WebPInitUpsamplersNEON(void) {
+#if defined(WEBP_USE_NEON)
+  WebPUpsamplers[MODE_RGB]  = UpsampleRgbLinePairNEON;
+  WebPUpsamplers[MODE_RGBA] = UpsampleRgbaLinePairNEON;
+  WebPUpsamplers[MODE_BGR]  = UpsampleBgrLinePairNEON;
+  WebPUpsamplers[MODE_BGRA] = UpsampleBgraLinePairNEON;
+#endif   // WEBP_USE_NEON
+}
+
+void WebPInitPremultiplyNEON(void) {
+#if defined(WEBP_USE_NEON)
+  WebPUpsamplers[MODE_rgbA] = UpsampleRgbaLinePairNEON;
+  WebPUpsamplers[MODE_bgrA] = UpsampleBgraLinePairNEON;
+#endif   // WEBP_USE_NEON
+}
+
+#if defined(__cplusplus) || defined(c_plusplus)
+}    // extern "C"
+#endif
--- a/src/dsp/upsampling_sse2.c
+++ b/src/dsp/upsampling_sse2.c
@ -1,27 +1,29 @@
-// Copyright 2011 Google Inc.
+// Copyright 2011 Google Inc. All Rights Reserved.
 //
-// This code is licensed under the same terms as WebM:
-//  Software License Agreement:  http://www.webmproject.org/license/software/
-//  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
 // -----------------------------------------------------------------------------
 //
 // SSE2 version of YUV to RGB upsampling functions.
 //
 // Author: somnath@google.com (Somnath Banerjee)

-#if defined(__SSE2__) || defined(_MSC_VER)
-
-#include <assert.h>
-#include <emmintrin.h>
-#include <string.h>
 #include "./dsp.h"
-#include "./yuv.h"
-#include "../dec/webpi.h"

 #if defined(__cplusplus) || defined(c_plusplus)
 extern "C" {
 #endif

+#if defined(WEBP_USE_SSE2)
+
+#include <assert.h>
+#include <emmintrin.h>
+#include <string.h>
+#include "./yuv.h"
+
 #ifdef FANCY_UPSAMPLING

 // We compute (9*a + 3*b + 3*c + d + 8) / 16 as follows
@ -51,12 +53,12 @@ extern "C" {

 // pack and store two alterning pixel rows
 #define PACK_AND_STORE(a, b, da, db, out) do {                                 \
-  const __m128i ta = _mm_avg_epu8(a, da);  /* (9a + 3b + 3c +  d + 8) / 16 */  \
-  const __m128i tb = _mm_avg_epu8(b, db);  /* (3a + 9b +  c + 3d + 8) / 16 */  \
-  const __m128i t1 = _mm_unpacklo_epi8(ta, tb);                                \
-  const __m128i t2 = _mm_unpackhi_epi8(ta, tb);                                \
-  _mm_store_si128(((__m128i*)(out)) + 0, t1);                                  \
-  _mm_store_si128(((__m128i*)(out)) + 1, t2);                                  \
+  const __m128i t_a = _mm_avg_epu8(a, da);  /* (9a + 3b + 3c +  d + 8) / 16 */ \
+  const __m128i t_b = _mm_avg_epu8(b, db);  /* (3a + 9b +  c + 3d + 8) / 16 */ \
+  const __m128i t_1 = _mm_unpacklo_epi8(t_a, t_b);                             \
+  const __m128i t_2 = _mm_unpackhi_epi8(t_a, t_b);                             \
+  _mm_store_si128(((__m128i*)(out)) + 0, t_1);                                 \
+  _mm_store_si128(((__m128i*)(out)) + 1, t_2);                                 \
 } while (0)

 // Loads 17 pixels each from rows r1 and r2 and generates 32 pixels.
@ -128,7 +130,7 @@ static void FUNC_NAME(const uint8_t* top_y, const uint8_t* bottom_y,           \
                      const uint8_t* top_u, const uint8_t* top_v,              \
                      const uint8_t* cur_u, const uint8_t* cur_v,              \
                      uint8_t* top_dst, uint8_t* bottom_dst, int len) {        \
-  int b;                                                                       \
+  int block;                                                                   \
  /* 16 byte aligned array to cache reconstructed u and v */                   \
  uint8_t uv_buf[4 * 32 + 15];                                                 \
  uint8_t* const r_uv = (uint8_t*)((uintptr_t)(uv_buf + 15) & ~15);            \
@ -154,11 +156,11 @@ static void FUNC_NAME(const uint8_t* top_y, const uint8_t* bottom_y,           \
    FUNC(bottom_y[0], u0, v0, bottom_dst);                                     \
  }                                                                            \
                                                                               \
-  for (b = 0; b < num_blocks; ++b) {                                           \
+  for (block = 0; block < num_blocks; ++block) {                               \
    UPSAMPLE_32PIXELS(top_u, cur_u, r_uv + 0 * 32);                            \
    UPSAMPLE_32PIXELS(top_v, cur_v, r_uv + 1 * 32);                            \
    CONVERT2RGB(FUNC, XSTEP, top_y, bottom_y, r_uv, top_dst, bottom_dst,       \
-                32 * b + 1, 32)                                                \
+                32 * block + 1, 32)                                            \
    top_u += 16;                                                               \
    cur_u += 16;                                                               \
    top_v += 16;                                                               \
@ -176,9 +178,6 @@ SSE2_UPSAMPLE_FUNC(UpsampleRgbLinePairSSE2,  VP8YuvToRgb,  3)
 SSE2_UPSAMPLE_FUNC(UpsampleBgrLinePairSSE2,  VP8YuvToBgr,  3)
 SSE2_UPSAMPLE_FUNC(UpsampleRgbaLinePairSSE2, VP8YuvToRgba, 4)
 SSE2_UPSAMPLE_FUNC(UpsampleBgraLinePairSSE2, VP8YuvToBgra, 4)
-// These two don't erase the alpha value
-SSE2_UPSAMPLE_FUNC(UpsampleRgbKeepAlphaLinePairSSE2, VP8YuvToRgb, 4)
-SSE2_UPSAMPLE_FUNC(UpsampleBgrKeepAlphaLinePairSSE2, VP8YuvToBgr, 4)

 #undef GET_M
 #undef PACK_AND_STORE
@ -187,29 +186,32 @@ SSE2_UPSAMPLE_FUNC(UpsampleBgrKeepAlphaLinePairSSE2, VP8YuvToBgr, 4)
 #undef CONVERT2RGB
 #undef SSE2_UPSAMPLE_FUNC

+#endif  // FANCY_UPSAMPLING
+
+#endif   // WEBP_USE_SSE2
+
 //------------------------------------------------------------------------------

 extern WebPUpsampleLinePairFunc WebPUpsamplers[/* MODE_LAST */];
-extern WebPUpsampleLinePairFunc WebPUpsamplersKeepAlpha[/* MODE_LAST */];
-
-#endif  // FANCY_UPSAMPLING

 void WebPInitUpsamplersSSE2(void) {
-#ifdef FANCY_UPSAMPLING
+#if defined(WEBP_USE_SSE2)
  WebPUpsamplers[MODE_RGB]  = UpsampleRgbLinePairSSE2;
  WebPUpsamplers[MODE_RGBA] = UpsampleRgbaLinePairSSE2;
  WebPUpsamplers[MODE_BGR]  = UpsampleBgrLinePairSSE2;
  WebPUpsamplers[MODE_BGRA] = UpsampleBgraLinePairSSE2;
+#endif   // WEBP_USE_SSE2
+}

-  WebPUpsamplersKeepAlpha[MODE_RGB]  = UpsampleRgbLinePairSSE2;
-  WebPUpsamplersKeepAlpha[MODE_RGBA] = UpsampleRgbKeepAlphaLinePairSSE2;
-  WebPUpsamplersKeepAlpha[MODE_BGR]  = UpsampleBgrLinePairSSE2;
-  WebPUpsamplersKeepAlpha[MODE_BGRA] = UpsampleBgrKeepAlphaLinePairSSE2;
-#endif  // FANCY_UPSAMPLING
+void WebPInitPremultiplySSE2(void) {
+#if defined(WEBP_USE_SSE2)
+  WebPUpsamplers[MODE_rgbA] = UpsampleRgbaLinePairSSE2;
+  WebPUpsamplers[MODE_bgrA] = UpsampleBgraLinePairSSE2;
+#endif   // WEBP_USE_SSE2
 }

 #if defined(__cplusplus) || defined(c_plusplus)
 }    // extern "C"
 #endif

-#endif   //__SSE2__ || _MSC_VER
+
--- a/src/dsp/yuv.c
+++ b/src/dsp/yuv.c
@ -1,8 +1,10 @@
-// Copyright 2010 Google Inc.
+// Copyright 2010 Google Inc. All Rights Reserved.
 //
-// This code is licensed under the same terms as WebM:
-//  Software License Agreement:  http://www.webmproject.org/license/software/
-//  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
 // -----------------------------------------------------------------------------
 //
 // YUV->RGB conversion function
@ -15,7 +17,7 @@
 extern "C" {
 #endif

-enum { YUV_HALF = 1 << (YUV_FIX - 1) };
+#ifdef WEBP_YUV_USE_TABLE

 int16_t VP8kVToR[256], VP8kUToB[256];
 int32_t VP8kVToG[256], VP8kUToG[256];
@ -24,7 +26,7 @@ uint8_t VP8kClip4Bits[YUV_RANGE_MAX - YUV_RANGE_MIN];

 static int done = 0;

-static inline uint8_t clip(int v, int max_value) {
+static WEBP_INLINE uint8_t clip(int v, int max_value) {
  return v < 0 ? 0 : v > max_value ? max_value : v;
 }

@ -33,6 +35,7 @@ void VP8YUVInit(void) {
  if (done) {
    return;
  }
+#ifndef USE_YUVj
  for (i = 0; i < 256; ++i) {
    VP8kVToR[i] = (89858 * (i - 128) + YUV_HALF) >> YUV_FIX;
    VP8kUToG[i] = -22014 * (i - 128) + YUV_HALF;
@ -44,9 +47,29 @@ void VP8YUVInit(void) {
    VP8kClip[i - YUV_RANGE_MIN] = clip(k, 255);
    VP8kClip4Bits[i - YUV_RANGE_MIN] = clip((k + 8) >> 4, 15);
  }
+#else
+  for (i = 0; i < 256; ++i) {
+    VP8kVToR[i] = (91881 * (i - 128) + YUV_HALF) >> YUV_FIX;
+    VP8kUToG[i] = -22554 * (i - 128) + YUV_HALF;
+    VP8kVToG[i] = -46802 * (i - 128);
+    VP8kUToB[i] = (116130 * (i - 128) + YUV_HALF) >> YUV_FIX;
+  }
+  for (i = YUV_RANGE_MIN; i < YUV_RANGE_MAX; ++i) {
+    const int k = i;
+    VP8kClip[i - YUV_RANGE_MIN] = clip(k, 255);
+    VP8kClip4Bits[i - YUV_RANGE_MIN] = clip((k + 8) >> 4, 15);
+  }
+#endif
+
  done = 1;
 }

+#else
+
+void VP8YUVInit(void) {}
+
+#endif  // WEBP_YUV_USE_TABLE
+
 #if defined(__cplusplus) || defined(c_plusplus)
 }    // extern "C"
 #endif
--- a/src/dsp/yuv.h
+++ b/src/dsp/yuv.h
@ -1,34 +1,77 @@
-// Copyright 2010 Google Inc.
+// Copyright 2010 Google Inc. All Rights Reserved.
 //
-// This code is licensed under the same terms as WebM:
-//  Software License Agreement:  http://www.webmproject.org/license/software/
-//  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
 // -----------------------------------------------------------------------------
 //
-// inline YUV->RGB conversion function
+// inline YUV<->RGB conversion function
+//
+// The exact naming is Y'CbCr, following the ITU-R BT.601 standard.
+// More information at: http://en.wikipedia.org/wiki/YCbCr
+// Y = 0.2569 * R + 0.5044 * G + 0.0979 * B + 16
+// U = -0.1483 * R - 0.2911 * G + 0.4394 * B + 128
+// V = 0.4394 * R - 0.3679 * G - 0.0715 * B + 128
+// We use 16bit fixed point operations for RGB->YUV conversion.
+//
+// For the Y'CbCr to RGB conversion, the BT.601 specification reads:
+//   R = 1.164 * (Y-16) + 1.596 * (V-128)
+//   G = 1.164 * (Y-16) - 0.813 * (V-128) - 0.391 * (U-128)
+//   B = 1.164 * (Y-16)                   + 2.018 * (U-128)
+// where Y is in the [16,235] range, and U/V in the [16,240] range.
+// In the table-lookup version (WEBP_YUV_USE_TABLE), the common factor
+// "1.164 * (Y-16)" can be handled as an offset in the VP8kClip[] table.
+// So in this case the formulae should be read as:
+//   R = 1.164 * [Y + 1.371 * (V-128)                  ] - 18.624
+//   G = 1.164 * [Y - 0.698 * (V-128) - 0.336 * (U-128)] - 18.624
+//   B = 1.164 * [Y                   + 1.733 * (U-128)] - 18.624
+// once factorized. Here too, 16bit fixed precision is used.
 //
 // Author: Skal (pascal.massimino@gmail.com)

 #ifndef WEBP_DSP_YUV_H_
 #define WEBP_DSP_YUV_H_

-#include "../webp/decode_vp8.h"
+#include "../dec/decode_vp8.h"
+
+// Define the following to use the LUT-based code:
+#define WEBP_YUV_USE_TABLE
+
+#if defined(WEBP_EXPERIMENTAL_FEATURES)
+// Do NOT activate this feature for real compression. This is only experimental!
+// This flag is for comparison purpose against JPEG's "YUVj" natural colorspace.
+// This colorspace is close to Rec.601's Y'CbCr model with the notable
+// difference of allowing larger range for luma/chroma.
+// See http://en.wikipedia.org/wiki/YCbCr#JPEG_conversion paragraph, and its
+// difference with http://en.wikipedia.org/wiki/YCbCr#ITU-R_BT.601_conversion
+// #define USE_YUVj
+#endif
+
+//------------------------------------------------------------------------------
+// YUV -> RGB conversion

 #if defined(__cplusplus) || defined(c_plusplus)
 extern "C" {
 #endif

 enum { YUV_FIX = 16,                // fixed-point precision
+       YUV_HALF = 1 << (YUV_FIX - 1),
+       YUV_MASK = (256 << YUV_FIX) - 1,
       YUV_RANGE_MIN = -227,        // min value of r/g/b output
       YUV_RANGE_MAX = 256 + 226    // max value of r/g/b output
 };
+
+#ifdef WEBP_YUV_USE_TABLE
+
 extern int16_t VP8kVToR[256], VP8kUToB[256];
 extern int32_t VP8kVToG[256], VP8kUToG[256];
 extern uint8_t VP8kClip[YUV_RANGE_MAX - YUV_RANGE_MIN];
 extern uint8_t VP8kClip4Bits[YUV_RANGE_MAX - YUV_RANGE_MIN];

-static inline void VP8YuvToRgb(uint8_t y, uint8_t u, uint8_t v,
-                               uint8_t* const rgb) {
+static WEBP_INLINE void VP8YuvToRgb(uint8_t y, uint8_t u, uint8_t v,
+                                    uint8_t* const rgb) {
  const int r_off = VP8kVToR[v];
  const int g_off = (VP8kVToG[v] + VP8kUToG[u]) >> YUV_FIX;
  const int b_off = VP8kUToB[u];
@ -37,48 +80,8 @@ static inline void VP8YuvToRgb(uint8_t y, uint8_t u, uint8_t v,
  rgb[2] = VP8kClip[y + b_off - YUV_RANGE_MIN];
 }

-static inline void VP8YuvToRgb565(uint8_t y, uint8_t u, uint8_t v,
-                                  uint8_t* const rgb) {
-  const int r_off = VP8kVToR[v];
-  const int g_off = (VP8kVToG[v] + VP8kUToG[u]) >> YUV_FIX;
-  const int b_off = VP8kUToB[u];
-  rgb[0] = ((VP8kClip[y + r_off - YUV_RANGE_MIN] & 0xf8) |
-            (VP8kClip[y + g_off - YUV_RANGE_MIN] >> 5));
-  rgb[1] = (((VP8kClip[y + g_off - YUV_RANGE_MIN] << 3) & 0xe0) |
-            (VP8kClip[y + b_off - YUV_RANGE_MIN] >> 3));
-}
-
-static inline void VP8YuvToArgbKeepA(uint8_t y, uint8_t u, uint8_t v,
-                                     uint8_t* const argb) {
-  // Don't update Aplha (argb[0])
-  VP8YuvToRgb(y, u, v, argb + 1);
-}
-
-static inline void VP8YuvToArgb(uint8_t y, uint8_t u, uint8_t v,
-                                uint8_t* const argb) {
-  argb[0] = 0xff;
-  VP8YuvToArgbKeepA(y, u, v, argb);
-}
-
-static inline void VP8YuvToRgba4444KeepA(uint8_t y, uint8_t u, uint8_t v,
-                                         uint8_t* const argb) {
-  const int r_off = VP8kVToR[v];
-  const int g_off = (VP8kVToG[v] + VP8kUToG[u]) >> YUV_FIX;
-  const int b_off = VP8kUToB[u];
-  // Don't update Aplha (last 4 bits of argb[1])
-  argb[0] = ((VP8kClip4Bits[y + r_off - YUV_RANGE_MIN] << 4) |
-             VP8kClip4Bits[y + g_off - YUV_RANGE_MIN]);
-  argb[1] = (argb[1] & 0x0f) | (VP8kClip4Bits[y + b_off - YUV_RANGE_MIN] << 4);
-}
-
-static inline void VP8YuvToRgba4444(uint8_t y, uint8_t u, uint8_t v,
-                                    uint8_t* const argb) {
-  argb[1] = 0x0f;
-  VP8YuvToRgba4444KeepA(y, u, v, argb);
-}
-
-static inline void VP8YuvToBgr(uint8_t y, uint8_t u, uint8_t v,
-                               uint8_t* const bgr) {
+static WEBP_INLINE void VP8YuvToBgr(uint8_t y, uint8_t u, uint8_t v,
+                                    uint8_t* const bgr) {
  const int r_off = VP8kVToR[v];
  const int g_off = (VP8kVToG[v] + VP8kUToG[u]) >> YUV_FIX;
  const int b_off = VP8kUToB[u];
@ -87,14 +90,137 @@ static inline void VP8YuvToBgr(uint8_t y, uint8_t u, uint8_t v,
  bgr[2] = VP8kClip[y + r_off - YUV_RANGE_MIN];
 }

-static inline void VP8YuvToBgra(uint8_t y, uint8_t u, uint8_t v,
-                                uint8_t* const bgra) {
+static WEBP_INLINE void VP8YuvToRgb565(uint8_t y, uint8_t u, uint8_t v,
+                                       uint8_t* const rgb) {
+  const int r_off = VP8kVToR[v];
+  const int g_off = (VP8kVToG[v] + VP8kUToG[u]) >> YUV_FIX;
+  const int b_off = VP8kUToB[u];
+  const uint8_t rg = ((VP8kClip[y + r_off - YUV_RANGE_MIN] & 0xf8) |
+                      (VP8kClip[y + g_off - YUV_RANGE_MIN] >> 5));
+  const uint8_t gb = (((VP8kClip[y + g_off - YUV_RANGE_MIN] << 3) & 0xe0) |
+                      (VP8kClip[y + b_off - YUV_RANGE_MIN] >> 3));
+#ifdef WEBP_SWAP_16BIT_CSP
+  rgb[0] = gb;
+  rgb[1] = rg;
+#else
+  rgb[0] = rg;
+  rgb[1] = gb;
+#endif
+}
+
+static WEBP_INLINE void VP8YuvToRgba4444(uint8_t y, uint8_t u, uint8_t v,
+                                         uint8_t* const argb) {
+  const int r_off = VP8kVToR[v];
+  const int g_off = (VP8kVToG[v] + VP8kUToG[u]) >> YUV_FIX;
+  const int b_off = VP8kUToB[u];
+  const uint8_t rg = ((VP8kClip4Bits[y + r_off - YUV_RANGE_MIN] << 4) |
+                      VP8kClip4Bits[y + g_off - YUV_RANGE_MIN]);
+  const uint8_t ba = (VP8kClip4Bits[y + b_off - YUV_RANGE_MIN] << 4) | 0x0f;
+#ifdef WEBP_SWAP_16BIT_CSP
+  argb[0] = ba;
+  argb[1] = rg;
+#else
+  argb[0] = rg;
+  argb[1] = ba;
+#endif
+}
+
+#else   // Table-free version (slower on x86)
+
+// These constants are 16b fixed-point version of ITU-R BT.601 constants
+#define kYScale 76309      // 1.164 = 255 / 219
+#define kVToR   104597     // 1.596 = 255 / 112 * 0.701
+#define kUToG   25674      // 0.391 = 255 / 112 * 0.886 * 0.114 / 0.587
+#define kVToG   53278      // 0.813 = 255 / 112 * 0.701 * 0.299 / 0.587
+#define kUToB   132201     // 2.018 = 255 / 112 * 0.886
+#define kRCst (-kYScale * 16 - kVToR * 128 + YUV_HALF)
+#define kGCst (-kYScale * 16 + kUToG * 128 + kVToG * 128 + YUV_HALF)
+#define kBCst (-kYScale * 16 - kUToB * 128 + YUV_HALF)
+
+static WEBP_INLINE uint8_t VP8Clip8(int v) {
+  return ((v & ~YUV_MASK) == 0) ? (uint8_t)(v >> YUV_FIX)
+                                : (v < 0) ? 0u : 255u;
+}
+
+static WEBP_INLINE uint8_t VP8ClipN(int v, int N) {  // clip to N bits
+  return ((v & ~YUV_MASK) == 0) ? (uint8_t)(v >> (YUV_FIX + (8 - N)))
+                                : (v < 0) ? 0u : (255u >> (8 - N));
+}
+
+static WEBP_INLINE int VP8YUVToR(int y, int v) {
+  return kYScale * y + kVToR * v + kRCst;
+}
+
+static WEBP_INLINE int VP8YUVToG(int y, int u, int v) {
+  return kYScale * y - kUToG * u - kVToG * v + kGCst;
+}
+
+static WEBP_INLINE int VP8YUVToB(int y, int u) {
+  return kYScale * y  + kUToB * u + kBCst;
+}
+
+static WEBP_INLINE void VP8YuvToRgb(uint8_t y, uint8_t u, uint8_t v,
+                                    uint8_t* const rgb) {
+  rgb[0] = VP8Clip8(VP8YUVToR(y, v));
+  rgb[1] = VP8Clip8(VP8YUVToG(y, u, v));
+  rgb[2] = VP8Clip8(VP8YUVToB(y, u));
+}
+
+static WEBP_INLINE void VP8YuvToBgr(uint8_t y, uint8_t u, uint8_t v,
+                                    uint8_t* const bgr) {
+  bgr[0] = VP8Clip8(VP8YUVToB(y, u));
+  bgr[1] = VP8Clip8(VP8YUVToG(y, u, v));
+  bgr[2] = VP8Clip8(VP8YUVToR(y, v));
+}
+
+static WEBP_INLINE void VP8YuvToRgb565(uint8_t y, uint8_t u, uint8_t v,
+                                       uint8_t* const rgb) {
+  const int r = VP8Clip8(VP8YUVToR(y, u));
+  const int g = VP8ClipN(VP8YUVToG(y, u, v), 6);
+  const int b = VP8ClipN(VP8YUVToB(y, v), 5);
+  const uint8_t rg = (r & 0xf8) | (g >> 3);
+  const uint8_t gb = (g << 5) | b;
+#ifdef WEBP_SWAP_16BIT_CSP
+  rgb[0] = gb;
+  rgb[1] = rg;
+#else
+  rgb[0] = rg;
+  rgb[1] = gb;
+#endif
+}
+
+static WEBP_INLINE void VP8YuvToRgba4444(uint8_t y, uint8_t u, uint8_t v,
+                                         uint8_t* const argb) {
+  const int r = VP8Clip8(VP8YUVToR(y, u));
+  const int g = VP8ClipN(VP8YUVToG(y, u, v), 4);
+  const int b = VP8Clip8(VP8YUVToB(y, v));
+  const uint8_t rg = (r & 0xf0) | g;
+  const uint8_t ba = b | 0x0f;   // overwrite the lower 4 bits
+#ifdef WEBP_SWAP_16BIT_CSP
+  argb[0] = ba;
+  argb[1] = rg;
+#else
+  argb[0] = rg;
+  argb[1] = ba;
+#endif
+}
+
+#endif  // WEBP_YUV_USE_TABLE
+
+static WEBP_INLINE void VP8YuvToArgb(uint8_t y, uint8_t u, uint8_t v,
+                                     uint8_t* const argb) {
+  argb[0] = 0xff;
+  VP8YuvToRgb(y, u, v, argb + 1);
+}
+
+static WEBP_INLINE void VP8YuvToBgra(uint8_t y, uint8_t u, uint8_t v,
+                                     uint8_t* const bgra) {
  VP8YuvToBgr(y, u, v, bgra);
  bgra[3] = 0xff;
 }

-static inline void VP8YuvToRgba(uint8_t y, uint8_t u, uint8_t v,
-                                uint8_t* const rgba) {
+static WEBP_INLINE void VP8YuvToRgba(uint8_t y, uint8_t u, uint8_t v,
+                                     uint8_t* const rgba) {
  VP8YuvToRgb(y, u, v, rgba);
  rgba[3] = 0xff;
 }
@ -102,6 +228,55 @@ static inline void VP8YuvToRgba(uint8_t y, uint8_t u, uint8_t v,
 // Must be called before everything, to initialize the tables.
 void VP8YUVInit(void);

+//------------------------------------------------------------------------------
+// RGB -> YUV conversion
+
+static WEBP_INLINE int VP8ClipUV(int v) {
+  v = (v + (257 << (YUV_FIX + 2 - 1))) >> (YUV_FIX + 2);
+  return ((v & ~0xff) == 0) ? v : (v < 0) ? 0 : 255;
+}
+
+#ifndef USE_YUVj
+
+static WEBP_INLINE int VP8RGBToY(int r, int g, int b) {
+  const int kRound = (1 << (YUV_FIX - 1)) + (16 << YUV_FIX);
+  const int luma = 16839 * r + 33059 * g + 6420 * b;
+  return (luma + kRound) >> YUV_FIX;  // no need to clip
+}
+
+static WEBP_INLINE int VP8RGBToU(int r, int g, int b) {
+  const int u = -9719 * r - 19081 * g + 28800 * b;
+  return VP8ClipUV(u);
+}
+
+static WEBP_INLINE int VP8RGBToV(int r, int g, int b) {
+  const int v = +28800 * r - 24116 * g - 4684 * b;
+  return VP8ClipUV(v);
+}
+
+#else
+
+// This JPEG-YUV colorspace, only for comparison!
+// These are also 16-bit precision coefficients from Rec.601, but with full
+// [0..255] output range.
+static WEBP_INLINE int VP8RGBToY(int r, int g, int b) {
+  const int kRound = (1 << (YUV_FIX - 1));
+  const int luma = 19595 * r + 38470 * g + 7471 * b;
+  return (luma + kRound) >> YUV_FIX;  // no need to clip
+}
+
+static WEBP_INLINE int VP8RGBToU(int r, int g, int b) {
+  const int u = -11058 * r - 21710 * g + 32768 * b;
+  return VP8ClipUV(u);
+}
+
+static WEBP_INLINE int VP8RGBToV(int r, int g, int b) {
+  const int v = 32768 * r - 27439 * g - 5329 * b;
+  return VP8ClipUV(v);
+}
+
+#endif    // USE_YUVj
+
 #if defined(__cplusplus) || defined(c_plusplus)
 }    // extern "C"
 #endif
--- a/src/enc/Makefile.am
+++ b/src/enc/Makefile.am
@ -1,13 +1,33 @@
 AM_CPPFLAGS = -I$(top_srcdir)/src
-
-libwebpencode_la_SOURCES = analysis.c config.c cost.c cost.h filter.c \
-                           frame.c iterator.c picture.c quant.c  \
-                           syntax.c tree.c vp8enci.h webpenc.c alpha.c \
-                           layer.c
-libwebpencode_la_LDFLAGS = -version-info 2:0:0 -lm
-libwebpencode_la_CPPFLAGS = $(USE_EXPERIMENTAL_CODE)
-libwebpencodeinclude_HEADERS = ../webp/encode.h ../webp/types.h
-libwebpencodeincludedir = $(includedir)/webp
-
-noinst_HEADERS = cost.h vp8enci.h
 noinst_LTLIBRARIES = libwebpencode.la
+
+libwebpencode_la_SOURCES =
+libwebpencode_la_SOURCES += alpha.c
+libwebpencode_la_SOURCES += analysis.c
+libwebpencode_la_SOURCES += backward_references.c
+libwebpencode_la_SOURCES += config.c
+libwebpencode_la_SOURCES += cost.c
+libwebpencode_la_SOURCES += cost.h
+libwebpencode_la_SOURCES += filter.c
+libwebpencode_la_SOURCES += frame.c
+libwebpencode_la_SOURCES += histogram.c
+libwebpencode_la_SOURCES += iterator.c
+libwebpencode_la_SOURCES += layer.c
+libwebpencode_la_SOURCES += picture.c
+libwebpencode_la_SOURCES += quant.c
+libwebpencode_la_SOURCES += syntax.c
+libwebpencode_la_SOURCES += token.c
+libwebpencode_la_SOURCES += tree.c
+libwebpencode_la_SOURCES += vp8enci.h
+libwebpencode_la_SOURCES += vp8l.c
+libwebpencode_la_SOURCES += webpenc.c
+
+libwebpencodeinclude_HEADERS =
+libwebpencodeinclude_HEADERS += ../webp/encode.h
+libwebpencodeinclude_HEADERS += ../webp/types.h
+noinst_HEADERS =
+noinst_HEADERS += ../webp/format_constants.h
+
+libwebpencode_la_LDFLAGS = -lm
+libwebpencode_la_CPPFLAGS = $(USE_EXPERIMENTAL_CODE)
+libwebpencodeincludedir = $(includedir)/webp
--- a/src/enc/alpha.c
+++ b/src/enc/alpha.c
@ -1,8 +1,10 @@
-// Copyright 2011 Google Inc.
+// Copyright 2011 Google Inc. All Rights Reserved.
 //
-// This code is licensed under the same terms as WebM:
-//  Software License Agreement:  http://www.webmproject.org/license/software/
-//  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
 // -----------------------------------------------------------------------------
 //
 // Alpha-plane compression.
@ -11,102 +13,394 @@

 #include <assert.h>
 #include <stdlib.h>
-#include "vp8enci.h"

-#ifdef WEBP_EXPERIMENTAL_FEATURES
-#include "zlib.h"
-#endif
+#include "./vp8enci.h"
+#include "../utils/filters.h"
+#include "../utils/quant_levels.h"
+#include "../webp/format_constants.h"

 #if defined(__cplusplus) || defined(c_plusplus)
 extern "C" {
 #endif

-#ifdef WEBP_EXPERIMENTAL_FEATURES
+// -----------------------------------------------------------------------------
+// Encodes the given alpha data via specified compression method 'method'.
+// The pre-processing (quantization) is performed if 'quality' is less than 100.
+// For such cases, the encoding is lossy. The valid range is [0, 100] for
+// 'quality' and [0, 1] for 'method':
+//   'method = 0' - No compression;
+//   'method = 1' - Use lossless coder on the alpha plane only
+// 'filter' values [0, 4] correspond to prediction modes none, horizontal,
+// vertical & gradient filters. The prediction mode 4 will try all the
+// prediction modes 0 to 3 and pick the best one.
+// 'effort_level': specifies how much effort must be spent to try and reduce
+//  the compressed output size. In range 0 (quick) to 6 (slow).
+//
+// 'output' corresponds to the buffer containing compressed alpha data.
+//          This buffer is allocated by this method and caller should call
+//          free(*output) when done.
+// 'output_size' corresponds to size of this compressed alpha buffer.
+//
+// Returns 1 on successfully encoding the alpha and
+//         0 if either:
+//           invalid quality or method, or
+//           memory allocation for the compressed data fails.
+
+#include "../enc/vp8li.h"
+
+static int EncodeLossless(const uint8_t* const data, int width, int height,
+                          int effort_level,  // in [0..6] range
+                          VP8BitWriter* const bw,
+                          WebPAuxStats* const stats) {
+  int ok = 0;
+  WebPConfig config;
+  WebPPicture picture;
+  VP8LBitWriter tmp_bw;
+
+  WebPPictureInit(&picture);
+  picture.width = width;
+  picture.height = height;
+  picture.use_argb = 1;
+  picture.stats = stats;
+  if (!WebPPictureAlloc(&picture)) return 0;
+
+  // Transfer the alpha values to the green channel.
+  {
+    int i, j;
+    uint32_t* dst = picture.argb;
+    const uint8_t* src = data;
+    for (j = 0; j < picture.height; ++j) {
+      for (i = 0; i < picture.width; ++i) {
+        dst[i] = (src[i] << 8) | 0xff000000u;
+      }
+      src += width;
+      dst += picture.argb_stride;
+    }
+  }
+
+  WebPConfigInit(&config);
+  config.lossless = 1;
+  config.method = effort_level;  // impact is very small
+  // Set a moderate default quality setting for alpha.
+  config.quality = 10.f * effort_level;
+  assert(config.quality >= 0 && config.quality <= 100.f);
+
+  ok = VP8LBitWriterInit(&tmp_bw, (width * height) >> 3);
+  ok = ok && (VP8LEncodeStream(&config, &picture, &tmp_bw) == VP8_ENC_OK);
+  WebPPictureFree(&picture);
+  if (ok) {
+    const uint8_t* const buffer = VP8LBitWriterFinish(&tmp_bw);
+    const size_t buffer_size = VP8LBitWriterNumBytes(&tmp_bw);
+    VP8BitWriterAppend(bw, buffer, buffer_size);
+  }
+  VP8LBitWriterDestroy(&tmp_bw);
+  return ok && !bw->error_;
+}
+
+// -----------------------------------------------------------------------------
+
+static int EncodeAlphaInternal(const uint8_t* const data, int width, int height,
+                               int method, int filter, int reduce_levels,
+                               int effort_level,  // in [0..6] range
+                               uint8_t* const tmp_alpha,
+                               VP8BitWriter* const bw,
+                               WebPAuxStats* const stats) {
+  int ok = 0;
+  const uint8_t* alpha_src;
+  WebPFilterFunc filter_func;
+  uint8_t header;
+  size_t expected_size;
+  const size_t data_size = width * height;
+
+  assert((uint64_t)data_size == (uint64_t)width * height);  // as per spec
+  assert(filter >= 0 && filter < WEBP_FILTER_LAST);
+  assert(method >= ALPHA_NO_COMPRESSION);
+  assert(method <= ALPHA_LOSSLESS_COMPRESSION);
+  assert(sizeof(header) == ALPHA_HEADER_LEN);
+  // TODO(skal): have a common function and #define's to validate alpha params.
+
+  expected_size =
+      (method == ALPHA_NO_COMPRESSION) ? (ALPHA_HEADER_LEN + data_size)
+                                       : (data_size >> 5);
+  header = method | (filter << 2);
+  if (reduce_levels) header |= ALPHA_PREPROCESSED_LEVELS << 4;
+
+  VP8BitWriterInit(bw, expected_size);
+  VP8BitWriterAppend(bw, &header, ALPHA_HEADER_LEN);
+
+  filter_func = WebPFilters[filter];
+  if (filter_func != NULL) {
+    filter_func(data, width, height, width, tmp_alpha);
+    alpha_src = tmp_alpha;
+  }  else {
+    alpha_src = data;
+  }
+
+  if (method == ALPHA_NO_COMPRESSION) {
+    ok = VP8BitWriterAppend(bw, alpha_src, width * height);
+    ok = ok && !bw->error_;
+  } else {
+    ok = EncodeLossless(alpha_src, width, height, effort_level, bw, stats);
+    VP8BitWriterFinish(bw);
+  }
+  return ok;
+}
+
+// -----------------------------------------------------------------------------
+
+// TODO(skal): move to dsp/ ?
+static void CopyPlane(const uint8_t* src, int src_stride,
+                      uint8_t* dst, int dst_stride, int width, int height) {
+  while (height-- > 0) {
+    memcpy(dst, src, width);
+    src += src_stride;
+    dst += dst_stride;
+  }
+}
+
+static int GetNumColors(const uint8_t* data, int width, int height,
+                        int stride) {
+  int j;
+  int colors = 0;
+  uint8_t color[256] = { 0 };
+
+  for (j = 0; j < height; ++j) {
+    int i;
+    const uint8_t* const p = data + j * stride;
+    for (i = 0; i < width; ++i) {
+      color[p[i]] = 1;
+    }
+  }
+  for (j = 0; j < 256; ++j) {
+    if (color[j] > 0) ++colors;
+  }
+  return colors;
+}
+
+static int EncodeAlpha(VP8Encoder* const enc,
+                       int quality, int method, int filter,
+                       int effort_level,
+                       uint8_t** const output, size_t* const output_size) {
+  const WebPPicture* const pic = enc->pic_;
+  const int width = pic->width;
+  const int height = pic->height;
+
+  uint8_t* quant_alpha = NULL;
+  const size_t data_size = width * height;
+  uint64_t sse = 0;
+  int ok = 1;
+  const int reduce_levels = (quality < 100);
+
+  // quick sanity checks
+  assert((uint64_t)data_size == (uint64_t)width * height);  // as per spec
+  assert(enc != NULL && pic != NULL && pic->a != NULL);
+  assert(output != NULL && output_size != NULL);
+  assert(width > 0 && height > 0);
+  assert(pic->a_stride >= width);
+  assert(filter >= WEBP_FILTER_NONE && filter <= WEBP_FILTER_FAST);
+
+  if (quality < 0 || quality > 100) {
+    return 0;
+  }
+
+  if (method < ALPHA_NO_COMPRESSION || method > ALPHA_LOSSLESS_COMPRESSION) {
+    return 0;
+  }
+
+  quant_alpha = (uint8_t*)malloc(data_size);
+  if (quant_alpha == NULL) {
+    return 0;
+  }
+
+  // Extract alpha data (width x height) from raw_data (stride x height).
+  CopyPlane(pic->a, pic->a_stride, quant_alpha, width, width, height);
+
+  if (reduce_levels) {  // No Quantization required for 'quality = 100'.
+    // 16 alpha levels gives quite a low MSE w.r.t original alpha plane hence
+    // mapped to moderate quality 70. Hence Quality:[0, 70] -> Levels:[2, 16]
+    // and Quality:]70, 100] -> Levels:]16, 256].
+    const int alpha_levels = (quality <= 70) ? (2 + quality / 5)
+                                             : (16 + (quality - 70) * 8);
+    ok = QuantizeLevels(quant_alpha, width, height, alpha_levels, &sse);
+  }
+
+  if (ok) {
+    VP8BitWriter bw;
+    int test_filter;
+    uint8_t* filtered_alpha = NULL;
+    int try_filter_none = (effort_level > 3);
+
+    if (filter == WEBP_FILTER_FAST) {  // Quick estimate of the best candidate.
+      const int kMinColorsForFilterNone = 16;
+      const int kMaxColorsForFilterNone = 192;
+      const int num_colors = GetNumColors(quant_alpha, width, height, width);
+      // For low number of colors, NONE yeilds better compression.
+      filter = (num_colors <= kMinColorsForFilterNone) ? WEBP_FILTER_NONE :
+               EstimateBestFilter(quant_alpha, width, height, width);
+      // For large number of colors, try FILTER_NONE in addition to the best
+      // filter as well.
+      if (num_colors > kMaxColorsForFilterNone) {
+        try_filter_none = 1;
+      }
+    }
+
+    // Test for WEBP_FILTER_NONE for higher effort levels.
+    if (try_filter_none || filter == WEBP_FILTER_NONE) {
+      ok = EncodeAlphaInternal(quant_alpha, width, height,
+                               method, WEBP_FILTER_NONE, reduce_levels,
+                               effort_level, NULL, &bw, pic->stats);
+
+      if (!ok) {
+        VP8BitWriterWipeOut(&bw);
+        goto End;
+      }
+    }
+    // Stop?
+    if (filter == WEBP_FILTER_NONE) {
+      goto Ok;
+    }
+
+    filtered_alpha = (uint8_t*)malloc(data_size);
+    ok = (filtered_alpha != NULL);
+    if (!ok) {
+      goto End;
+    }
+
+    // Try the other mode(s).
+    {
+      WebPAuxStats best_stats;
+      size_t best_score = try_filter_none ?
+                          VP8BitWriterSize(&bw) : (size_t)~0U;
+      int wipe_tmp_bw = try_filter_none;
+
+      memset(&best_stats, 0, sizeof(best_stats));  // prevent spurious warning
+      if (pic->stats != NULL) best_stats = *pic->stats;
+      for (test_filter =
+           try_filter_none ? WEBP_FILTER_HORIZONTAL : WEBP_FILTER_NONE;
+           ok && (test_filter <= WEBP_FILTER_GRADIENT);
+           ++test_filter) {
+        VP8BitWriter tmp_bw;
+        if (filter != WEBP_FILTER_BEST && test_filter != filter) {
+          continue;
+        }
+        ok = EncodeAlphaInternal(quant_alpha, width, height,
+                                 method, test_filter, reduce_levels,
+                                 effort_level, filtered_alpha, &tmp_bw,
+                                 pic->stats);
+        if (ok) {
+          const size_t score = VP8BitWriterSize(&tmp_bw);
+          if (score < best_score) {
+            // swap bitwriter objects.
+            VP8BitWriter tmp = tmp_bw;
+            tmp_bw = bw;
+            bw = tmp;
+            best_score = score;
+            if (pic->stats != NULL) best_stats = *pic->stats;
+          }
+        } else {
+          VP8BitWriterWipeOut(&bw);
+        }
+        if (wipe_tmp_bw) {
+          VP8BitWriterWipeOut(&tmp_bw);
+        }
+        wipe_tmp_bw = 1;  // For next filter trial for WEBP_FILTER_BEST.
+      }
+      if (pic->stats != NULL) *pic->stats = best_stats;
+    }
+ Ok:
+    if (ok) {
+      *output_size = VP8BitWriterSize(&bw);
+      *output = VP8BitWriterBuf(&bw);
+      if (pic->stats != NULL) {         // need stats?
+        pic->stats->coded_size += (int)(*output_size);
+        enc->sse_[3] = sse;
+      }
+    }
+    free(filtered_alpha);
+  }
+ End:
+  free(quant_alpha);
+  return ok;
+}

-#define CHUNK_SIZE 8192

 //------------------------------------------------------------------------------
+// Main calls

-static int CompressAlpha(const uint8_t* data, size_t data_size,
-                         uint8_t** output, size_t* output_size,
-                         int algo) {
-  int ret = Z_OK;
-  z_stream strm;
-  unsigned char chunk[CHUNK_SIZE];
-
-  *output = NULL;
-  *output_size = 0;
-  memset(&strm, 0, sizeof(strm));
-  if (deflateInit(&strm, algo ? Z_BEST_SPEED : Z_BEST_COMPRESSION) != Z_OK) {
+static int CompressAlphaJob(VP8Encoder* const enc, void* dummy) {
+  const WebPConfig* config = enc->config_;
+  uint8_t* alpha_data = NULL;
+  size_t alpha_size = 0;
+  const int effort_level = config->method;  // maps to [0..6]
+  const WEBP_FILTER_TYPE filter =
+      (config->alpha_filtering == 0) ? WEBP_FILTER_NONE :
+      (config->alpha_filtering == 1) ? WEBP_FILTER_FAST :
+                                       WEBP_FILTER_BEST;
+  if (!EncodeAlpha(enc, config->alpha_quality, config->alpha_compression,
+                   filter, effort_level, &alpha_data, &alpha_size)) {
    return 0;
  }
-  strm.next_in = (unsigned char*)data;
-  strm.avail_in = data_size;
-  do {
-    size_t size_out;
-
-    strm.next_out = chunk;
-    strm.avail_out = CHUNK_SIZE;
-    ret = deflate(&strm, Z_FINISH);
-    if (ret == Z_STREAM_ERROR) {
-      break;
-    }
-    size_out = CHUNK_SIZE - strm.avail_out;
-    if (size_out) {
-      size_t new_size = *output_size + size_out;
-      uint8_t* new_output = realloc(*output, new_size);
-      if (new_output == NULL) {
-        ret = Z_MEM_ERROR;
-        break;
-      }
-      memcpy(new_output + *output_size, chunk, size_out);
-      *output_size = new_size;
-      *output = new_output;
-    }
-  } while (ret != Z_STREAM_END || strm.avail_out == 0);
-
-  deflateEnd(&strm);
-  if (ret != Z_STREAM_END) {
-    free(*output);
-    output_size = 0;
+  if (alpha_size != (uint32_t)alpha_size) {  // Sanity check.
+    free(alpha_data);
    return 0;
  }
+  enc->alpha_data_size_ = (uint32_t)alpha_size;
+  enc->alpha_data_ = alpha_data;
+  (void)dummy;
  return 1;
 }

-#endif    /* WEBP_EXPERIMENTAL_FEATURES */
-
-void VP8EncInitAlpha(VP8Encoder* enc) {
-  enc->has_alpha_ = (enc->pic_->a != NULL);
+void VP8EncInitAlpha(VP8Encoder* const enc) {
+  enc->has_alpha_ = WebPPictureHasTransparency(enc->pic_);
  enc->alpha_data_ = NULL;
  enc->alpha_data_size_ = 0;
+  if (enc->thread_level_ > 0) {
+    WebPWorker* const worker = &enc->alpha_worker_;
+    WebPWorkerInit(worker);
+    worker->data1 = enc;
+    worker->data2 = NULL;
+    worker->hook = (WebPWorkerHook)CompressAlphaJob;
+  }
 }

-void VP8EncCodeAlphaBlock(VP8EncIterator* it) {
-  (void)it;
-  // Nothing for now. We just ZLIB-compress in the end.
-}
-
-int VP8EncFinishAlpha(VP8Encoder* enc) {
+int VP8EncStartAlpha(VP8Encoder* const enc) {
  if (enc->has_alpha_) {
-#ifdef WEBP_EXPERIMENTAL_FEATURES
-    const WebPPicture* pic = enc->pic_;
-    assert(pic->a);
-    if (!CompressAlpha(pic->a, pic->width * pic->height,
-                       &enc->alpha_data_, &enc->alpha_data_size_,
-                       enc->config_->alpha_compression)) {
-      return 0;
+    if (enc->thread_level_ > 0) {
+      WebPWorker* const worker = &enc->alpha_worker_;
+      if (!WebPWorkerReset(worker)) {    // Makes sure worker is good to go.
+        return 0;
+      }
+      WebPWorkerLaunch(worker);
+      return 1;
+    } else {
+      return CompressAlphaJob(enc, NULL);   // just do the job right away
    }
-#endif
  }
  return 1;
 }

-void VP8EncDeleteAlpha(VP8Encoder* enc) {
+int VP8EncFinishAlpha(VP8Encoder* const enc) {
+  if (enc->has_alpha_) {
+    if (enc->thread_level_ > 0) {
+      WebPWorker* const worker = &enc->alpha_worker_;
+      if (!WebPWorkerSync(worker)) return 0;  // error
+    }
+  }
+  return WebPReportProgress(enc->pic_, enc->percent_ + 20, &enc->percent_);
+}
+
+int VP8EncDeleteAlpha(VP8Encoder* const enc) {
+  int ok = 1;
+  if (enc->thread_level_ > 0) {
+    WebPWorker* const worker = &enc->alpha_worker_;
+    ok = WebPWorkerSync(worker);  // finish anything left in flight
+    WebPWorkerEnd(worker);  // still need to end the worker, even if !ok
+  }
  free(enc->alpha_data_);
  enc->alpha_data_ = NULL;
  enc->alpha_data_size_ = 0;
  enc->has_alpha_ = 0;
+  return ok;
 }

 #if defined(__cplusplus) || defined(c_plusplus)
--- a/src/enc/analysis.c
+++ b/src/enc/analysis.c
@ -1,8 +1,10 @@
-// Copyright 2011 Google Inc.
+// Copyright 2011 Google Inc. All Rights Reserved.
 //
-// This code is licensed under the same terms as WebM:
-//  Software License Agreement:  http://www.webmproject.org/license/software/
-//  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
 // -----------------------------------------------------------------------------
 //
 // Macroblock analysis
@ -13,8 +15,9 @@
 #include <string.h>
 #include <assert.h>

-#include "vp8enci.h"
-#include "cost.h"
+#include "./vp8enci.h"
+#include "./cost.h"
+#include "../utils/utils.h"

 #if defined(__cplusplus) || defined(c_plusplus)
 extern "C" {
@ -22,10 +25,6 @@ extern "C" {

 #define MAX_ITERS_K_MEANS  6

-static int ClipAlpha(int alpha) {
-  return alpha < 0 ? 0 : alpha > 255 ? 255 : alpha;
-}
-
 //------------------------------------------------------------------------------
 // Smooth the segment map by replacing isolated block by the majority of its
 // neighbours.
@ -35,7 +34,8 @@ static void SmoothSegmentMap(VP8Encoder* const enc) {
  const int w = enc->mb_w_;
  const int h = enc->mb_h_;
  const int majority_cnt_3_x_3_grid = 5;
-  uint8_t* tmp = (uint8_t*)malloc(w * h * sizeof(uint8_t));
+  uint8_t* const tmp = (uint8_t*)WebPSafeMalloc((uint64_t)w * h, sizeof(*tmp));
+  assert((uint64_t)(w * h) == (uint64_t)w * h);   // no overflow, as per spec

  if (tmp == NULL) return;
  for (y = 1; y < h - 1; ++y) {
@ -70,50 +70,10 @@ static void SmoothSegmentMap(VP8Encoder* const enc) {
 }

 //------------------------------------------------------------------------------
-// Finalize Segment probability based on the coding tree
+// set segment susceptibility alpha_ / beta_

-static int GetProba(int a, int b) {
-  int proba;
-  const int total = a + b;
-  if (total == 0) return 255;  // that's the default probability.
-  proba = (255 * a + total / 2) / total;
-  return proba;
-}
-
-static void SetSegmentProbas(VP8Encoder* const enc) {
-  int p[NUM_MB_SEGMENTS] = { 0 };
-  int n;
-
-  for (n = 0; n < enc->mb_w_ * enc->mb_h_; ++n) {
-    const VP8MBInfo* const mb = &enc->mb_info_[n];
-    p[mb->segment_]++;
-  }
-  if (enc->pic_->stats) {
-    for (n = 0; n < NUM_MB_SEGMENTS; ++n) {
-      enc->pic_->stats->segment_size[n] = p[n];
-    }
-  }
-  if (enc->segment_hdr_.num_segments_ > 1) {
-    uint8_t* const probas = enc->proba_.segments_;
-    probas[0] = GetProba(p[0] + p[1], p[2] + p[3]);
-    probas[1] = GetProba(p[0], p[1]);
-    probas[2] = GetProba(p[2], p[3]);
-
-    enc->segment_hdr_.update_map_ =
-        (probas[0] != 255) || (probas[1] != 255) || (probas[2] != 255);
-    enc->segment_hdr_.size_ =
-      p[0] * (VP8BitCost(0, probas[0]) + VP8BitCost(0, probas[1])) +
-      p[1] * (VP8BitCost(0, probas[0]) + VP8BitCost(1, probas[1])) +
-      p[2] * (VP8BitCost(1, probas[0]) + VP8BitCost(0, probas[2])) +
-      p[3] * (VP8BitCost(1, probas[0]) + VP8BitCost(1, probas[2]));
-  } else {
-    enc->segment_hdr_.update_map_ = 0;
-    enc->segment_hdr_.size_ = 0;
-  }
-}
-
-static inline int clip(int v, int m, int M) {
-  return v < m ? m : v > M ? M : v;
+static WEBP_INLINE int clip(int v, int m, int M) {
+  return (v < m) ? m : (v > M) ? M : v;
 }

 static void SetSegmentAlphas(VP8Encoder* const enc,
@ -139,23 +99,64 @@ static void SetSegmentAlphas(VP8Encoder* const enc,
  }
 }

+//------------------------------------------------------------------------------
+// Compute susceptibility based on DCT-coeff histograms:
+// the higher, the "easier" the macroblock is to compress.
+
+#define MAX_ALPHA 255                // 8b of precision for susceptibilities.
+#define ALPHA_SCALE (2 * MAX_ALPHA)  // scaling factor for alpha.
+#define DEFAULT_ALPHA (-1)
+#define IS_BETTER_ALPHA(alpha, best_alpha) ((alpha) > (best_alpha))
+
+static int FinalAlphaValue(int alpha) {
+  alpha = MAX_ALPHA - alpha;
+  return clip(alpha, 0, MAX_ALPHA);
+}
+
+static int GetAlpha(const VP8Histogram* const histo) {
+  int max_value = 0, last_non_zero = 1;
+  int k;
+  int alpha;
+  for (k = 0; k <= MAX_COEFF_THRESH; ++k) {
+    const int value = histo->distribution[k];
+    if (value > 0) {
+      if (value > max_value) max_value = value;
+      last_non_zero = k;
+    }
+  }
+  // 'alpha' will later be clipped to [0..MAX_ALPHA] range, clamping outer
+  // values which happen to be mostly noise. This leaves the maximum precision
+  // for handling the useful small values which contribute most.
+  alpha = (max_value > 1) ? ALPHA_SCALE * last_non_zero / max_value : 0;
+  return alpha;
+}
+
+static void MergeHistograms(const VP8Histogram* const in,
+                            VP8Histogram* const out) {
+  int i;
+  for (i = 0; i <= MAX_COEFF_THRESH; ++i) {
+    out->distribution[i] += in->distribution[i];
+  }
+}
+
 //------------------------------------------------------------------------------
 // Simplified k-Means, to assign Nb segments based on alpha-histogram

-static void AssignSegments(VP8Encoder* const enc, const int alphas[256]) {
+static void AssignSegments(VP8Encoder* const enc,
+                           const int alphas[MAX_ALPHA + 1]) {
  const int nb = enc->segment_hdr_.num_segments_;
  int centers[NUM_MB_SEGMENTS];
-  int weighted_average;
-  int map[256];
+  int weighted_average = 0;
+  int map[MAX_ALPHA + 1];
  int a, n, k;
-  int min_a = 0, max_a = 255, range_a;
+  int min_a = 0, max_a = MAX_ALPHA, range_a;
  // 'int' type is ok for histo, and won't overflow
  int accum[NUM_MB_SEGMENTS], dist_accum[NUM_MB_SEGMENTS];

  // bracket the input
-  for (n = 0; n < 256 && alphas[n] == 0; ++n) {}
+  for (n = 0; n <= MAX_ALPHA && alphas[n] == 0; ++n) {}
  min_a = n;
-  for (n = 255; n > min_a && alphas[n] == 0; --n) {}
+  for (n = MAX_ALPHA; n > min_a && alphas[n] == 0; --n) {}
  max_a = n;
  range_a = max_a - min_a;

@ -206,9 +207,9 @@ static void AssignSegments(VP8Encoder* const enc, const int alphas[256]) {
  // Map each original value to the closest centroid
  for (n = 0; n < enc->mb_w_ * enc->mb_h_; ++n) {
    VP8MBInfo* const mb = &enc->mb_info_[n];
-    const int a = mb->alpha_;
-    mb->segment_ = map[a];
-    mb->alpha_ = centers[map[a]];     // just for the record.
+    const int alpha = mb->alpha_;
+    mb->segment_ = map[alpha];
+    mb->alpha_ = centers[map[alpha]];  // for the record.
  }

  if (nb > 1) {
@ -216,7 +217,6 @@ static void AssignSegments(VP8Encoder* const enc, const int alphas[256]) {
    if (smooth) SmoothSegmentMap(enc);
  }

-  SetSegmentProbas(enc);                             // Assign final proba
  SetSegmentAlphas(enc, centers, weighted_average);  // pick some alphas.
 }

@ -225,24 +225,32 @@ static void AssignSegments(VP8Encoder* const enc, const int alphas[256]) {
 // susceptibility and set best modes for this macroblock.
 // Segment assignment is done later.

-// Number of modes to inspect for alpha_ evaluation. For high-quality settings,
-// we don't need to test all the possible modes during the analysis phase.
+// Number of modes to inspect for alpha_ evaluation. For high-quality settings
+// (method >= FAST_ANALYSIS_METHOD) we don't need to test all the possible modes
+// during the analysis phase.
+#define FAST_ANALYSIS_METHOD 4  // method above which we do partial analysis
 #define MAX_INTRA16_MODE 2
 #define MAX_INTRA4_MODE  2
 #define MAX_UV_MODE      2

 static int MBAnalyzeBestIntra16Mode(VP8EncIterator* const it) {
-  const int max_mode = (it->enc_->method_ >= 3) ? MAX_INTRA16_MODE : 4;
+  const int max_mode =
+      (it->enc_->method_ >= FAST_ANALYSIS_METHOD) ? MAX_INTRA16_MODE
+                                                  : NUM_PRED_MODES;
  int mode;
-  int best_alpha = -1;
+  int best_alpha = DEFAULT_ALPHA;
  int best_mode = 0;

  VP8MakeLuma16Preds(it);
  for (mode = 0; mode < max_mode; ++mode) {
-    const int alpha = VP8CollectHistogram(it->yuv_in_ + Y_OFF,
-                                          it->yuv_p_ + VP8I16ModeOffsets[mode],
-                                          0, 16);
-    if (alpha > best_alpha) {
+    VP8Histogram histo = { { 0 } };
+    int alpha;
+
+    VP8CollectHistogram(it->yuv_in_ + Y_OFF,
+                        it->yuv_p_ + VP8I16ModeOffsets[mode],
+                        0, 16, &histo);
+    alpha = GetAlpha(&histo);
+    if (IS_BETTER_ALPHA(alpha, best_alpha)) {
      best_alpha = alpha;
      best_mode = mode;
    }
@ -253,47 +261,64 @@ static int MBAnalyzeBestIntra16Mode(VP8EncIterator* const it) {

 static int MBAnalyzeBestIntra4Mode(VP8EncIterator* const it,
                                   int best_alpha) {
-  int modes[16];
-  const int max_mode = (it->enc_->method_ >= 3) ? MAX_INTRA4_MODE : NUM_BMODES;
-  int i4_alpha = 0;
+  uint8_t modes[16];
+  const int max_mode =
+      (it->enc_->method_ >= FAST_ANALYSIS_METHOD) ? MAX_INTRA4_MODE
+                                                  : NUM_BMODES;
+  int i4_alpha;
+  VP8Histogram total_histo = { { 0 } };
+  int cur_histo = 0;
+
  VP8IteratorStartI4(it);
  do {
    int mode;
-    int best_mode_alpha = -1;
+    int best_mode_alpha = DEFAULT_ALPHA;
+    VP8Histogram histos[2];
    const uint8_t* const src = it->yuv_in_ + Y_OFF + VP8Scan[it->i4_];

    VP8MakeIntra4Preds(it);
    for (mode = 0; mode < max_mode; ++mode) {
-      const int alpha = VP8CollectHistogram(src,
-                                            it->yuv_p_ + VP8I4ModeOffsets[mode],
-                                            0, 1);
-      if (alpha > best_mode_alpha) {
+      int alpha;
+
+      memset(&histos[cur_histo], 0, sizeof(histos[cur_histo]));
+      VP8CollectHistogram(src, it->yuv_p_ + VP8I4ModeOffsets[mode],
+                          0, 1, &histos[cur_histo]);
+      alpha = GetAlpha(&histos[cur_histo]);
+      if (IS_BETTER_ALPHA(alpha, best_mode_alpha)) {
        best_mode_alpha = alpha;
        modes[it->i4_] = mode;
+        cur_histo ^= 1;   // keep track of best histo so far.
      }
    }
-    i4_alpha += best_mode_alpha;
+    // accumulate best histogram
+    MergeHistograms(&histos[cur_histo ^ 1], &total_histo);
    // Note: we reuse the original samples for predictors
  } while (VP8IteratorRotateI4(it, it->yuv_in_ + Y_OFF));

-  if (i4_alpha > best_alpha) {
+  i4_alpha = GetAlpha(&total_histo);
+  if (IS_BETTER_ALPHA(i4_alpha, best_alpha)) {
    VP8SetIntra4Mode(it, modes);
-    best_alpha = ClipAlpha(i4_alpha);
+    best_alpha = i4_alpha;
  }
  return best_alpha;
 }

 static int MBAnalyzeBestUVMode(VP8EncIterator* const it) {
-  int best_alpha = -1;
+  int best_alpha = DEFAULT_ALPHA;
  int best_mode = 0;
-  const int max_mode = (it->enc_->method_ >= 3) ? MAX_UV_MODE : 4;
+  const int max_mode =
+      (it->enc_->method_ >= FAST_ANALYSIS_METHOD) ? MAX_UV_MODE
+                                                  : NUM_PRED_MODES;
  int mode;
  VP8MakeChroma8Preds(it);
  for (mode = 0; mode < max_mode; ++mode) {
-    const int alpha = VP8CollectHistogram(it->yuv_in_ + U_OFF,
-                                          it->yuv_p_ + VP8UVModeOffsets[mode],
-                                          16, 16 + 4 + 4);
-    if (alpha > best_alpha) {
+    VP8Histogram histo = { { 0 } };
+    int alpha;
+    VP8CollectHistogram(it->yuv_in_ + U_OFF,
+                        it->yuv_p_ + VP8UVModeOffsets[mode],
+                        16, 16 + 4 + 4, &histo);
+    alpha = GetAlpha(&histo);
+    if (IS_BETTER_ALPHA(alpha, best_alpha)) {
      best_alpha = alpha;
      best_mode = mode;
    }
@ -303,7 +328,8 @@ static int MBAnalyzeBestUVMode(VP8EncIterator* const it) {
 }

 static void MBAnalyze(VP8EncIterator* const it,
-                      int alphas[256], int* const uv_alpha) {
+                      int alphas[MAX_ALPHA + 1],
+                      int* const alpha, int* const uv_alpha) {
  const VP8Encoder* const enc = it->enc_;
  int best_alpha, best_uv_alpha;

@ -312,7 +338,7 @@ static void MBAnalyze(VP8EncIterator* const it,
  VP8SetSegment(it, 0);      // default segment, spec-wise.

  best_alpha = MBAnalyzeBestIntra16Mode(it);
-  if (enc->method_ != 3) {
+  if (enc->method_ >= 5) {
    // We go and make a fast decision for intra4/intra16.
    // It's usually not a good and definitive pick, but helps seeding the stats
    // about level bit-cost.
@ -322,10 +348,22 @@ static void MBAnalyze(VP8EncIterator* const it,
  best_uv_alpha = MBAnalyzeBestUVMode(it);

  // Final susceptibility mix
-  best_alpha = (best_alpha + best_uv_alpha + 1) / 2;
+  best_alpha = (3 * best_alpha + best_uv_alpha + 2) >> 2;
+  best_alpha = FinalAlphaValue(best_alpha);
  alphas[best_alpha]++;
+  it->mb_->alpha_ = best_alpha;   // for later remapping.
+
+  // Accumulate for later complexity analysis.
+  *alpha += best_alpha;   // mixed susceptibility (not just luma)
  *uv_alpha += best_uv_alpha;
-  it->mb_->alpha_ = best_alpha;   // Informative only.
+}
+
+static void DefaultMBInfo(VP8MBInfo* const mb) {
+  mb->type_ = 1;     // I16x16
+  mb->uv_mode_ = 0;
+  mb->skip_ = 0;     // not skipped
+  mb->segment_ = 0;  // default segment
+  mb->alpha_ = 0;
 }

 //------------------------------------------------------------------------------
@ -338,21 +376,44 @@ static void MBAnalyze(VP8EncIterator* const it,
 // and decide intra4/intra16, but that's usually almost always a bad choice at
 // this stage.

+static void ResetAllMBInfo(VP8Encoder* const enc) {
+  int n;
+  for (n = 0; n < enc->mb_w_ * enc->mb_h_; ++n) {
+    DefaultMBInfo(&enc->mb_info_[n]);
+  }
+  // Default susceptibilities.
+  enc->dqm_[0].alpha_ = 0;
+  enc->dqm_[0].beta_ = 0;
+  // Note: we can't compute this alpha_ / uv_alpha_.
+  WebPReportProgress(enc->pic_, enc->percent_ + 20, &enc->percent_);
+}
+
 int VP8EncAnalyze(VP8Encoder* const enc) {
-  int alphas[256] = { 0 };
-  VP8EncIterator it;
-
-  VP8IteratorInit(enc, &it);
+  int ok = 1;
+  const int do_segments =
+      enc->config_->emulate_jpeg_size ||   // We need the complexity evaluation.
+      (enc->segment_hdr_.num_segments_ > 1) ||
+      (enc->method_ == 0);  // for method 0, we need preds_[] to be filled.
+  enc->alpha_ = 0;
  enc->uv_alpha_ = 0;
-  do {
-    VP8IteratorImport(&it);
-    MBAnalyze(&it, alphas, &enc->uv_alpha_);
-    // Let's pretend we have perfect lossless reconstruction.
-  } while (VP8IteratorNext(&it, it.yuv_in_));
-  enc->uv_alpha_ /= enc->mb_w_ * enc->mb_h_;
-  AssignSegments(enc, alphas);
+  if (do_segments) {
+    int alphas[MAX_ALPHA + 1] = { 0 };
+    VP8EncIterator it;

-  return 1;
+    VP8IteratorInit(enc, &it);
+    do {
+      VP8IteratorImport(&it);
+      MBAnalyze(&it, alphas, &enc->alpha_, &enc->uv_alpha_);
+      ok = VP8IteratorProgress(&it, 20);
+      // Let's pretend we have perfect lossless reconstruction.
+    } while (ok && VP8IteratorNext(&it, it.yuv_in_));
+    enc->alpha_ /= enc->mb_w_ * enc->mb_h_;
+    enc->uv_alpha_ /= enc->mb_w_ * enc->mb_h_;
+    if (ok) AssignSegments(enc, alphas);
+  } else {   // Use only one default segment.
+    ResetAllMBInfo(enc);
+  }
+  return ok;
 }

 #if defined(__cplusplus) || defined(c_plusplus)
--- a/src/enc/backward_references.c
+++ b/src/enc/backward_references.c
@ -0,0 +1,894 @@
+// Copyright 2012 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// Author: Jyrki Alakuijala (jyrki@google.com)
+//
+
+#include <assert.h>
+#include <math.h>
+#include <stdio.h>
+
+#include "./backward_references.h"
+#include "./histogram.h"
+#include "../dsp/lossless.h"
+#include "../utils/color_cache.h"
+#include "../utils/utils.h"
+
+#define VALUES_IN_BYTE 256
+
+#define HASH_BITS 18
+#define HASH_SIZE (1 << HASH_BITS)
+#define HASH_MULTIPLIER (0xc6a4a7935bd1e995ULL)
+
+// 1M window (4M bytes) minus 120 special codes for short distances.
+#define WINDOW_SIZE ((1 << 20) - 120)
+
+// Bounds for the match length.
+#define MIN_LENGTH 2
+#define MAX_LENGTH 4096
+
+typedef struct {
+  // Stores the most recently added position with the given hash value.
+  int32_t hash_to_first_index_[HASH_SIZE];
+  // chain_[pos] stores the previous position with the same hash value
+  // for every pixel in the image.
+  int32_t* chain_;
+} HashChain;
+
+// -----------------------------------------------------------------------------
+
+static const uint8_t plane_to_code_lut[128] = {
+ 96,   73,  55,  39,  23,  13,   5,  1,  255, 255, 255, 255, 255, 255, 255, 255,
+ 101,  78,  58,  42,  26,  16,   8,  2,    0,   3,  9,   17,  27,  43,  59,  79,
+ 102,  86,  62,  46,  32,  20,  10,  6,    4,   7,  11,  21,  33,  47,  63,  87,
+ 105,  90,  70,  52,  37,  28,  18,  14,  12,  15,  19,  29,  38,  53,  71,  91,
+ 110,  99,  82,  66,  48,  35,  30,  24,  22,  25,  31,  36,  49,  67,  83, 100,
+ 115, 108,  94,  76,  64,  50,  44,  40,  34,  41,  45,  51,  65,  77,  95, 109,
+ 118, 113, 103,  92,  80,  68,  60,  56,  54,  57,  61,  69,  81,  93, 104, 114,
+ 119, 116, 111, 106,  97,  88,  84,  74,  72,  75,  85,  89,  98, 107, 112, 117
+};
+
+static int DistanceToPlaneCode(int xsize, int dist) {
+  const int yoffset = dist / xsize;
+  const int xoffset = dist - yoffset * xsize;
+  if (xoffset <= 8 && yoffset < 8) {
+    return plane_to_code_lut[yoffset * 16 + 8 - xoffset] + 1;
+  } else if (xoffset > xsize - 8 && yoffset < 7) {
+    return plane_to_code_lut[(yoffset + 1) * 16 + 8 + (xsize - xoffset)] + 1;
+  }
+  return dist + 120;
+}
+
+static WEBP_INLINE int FindMatchLength(const uint32_t* const array1,
+                                       const uint32_t* const array2,
+                                       const int max_limit) {
+  int match_len = 0;
+  while (match_len < max_limit && array1[match_len] == array2[match_len]) {
+    ++match_len;
+  }
+  return match_len;
+}
+
+// -----------------------------------------------------------------------------
+//  VP8LBackwardRefs
+
+void VP8LInitBackwardRefs(VP8LBackwardRefs* const refs) {
+  if (refs != NULL) {
+    refs->refs = NULL;
+    refs->size = 0;
+    refs->max_size = 0;
+  }
+}
+
+void VP8LClearBackwardRefs(VP8LBackwardRefs* const refs) {
+  if (refs != NULL) {
+    free(refs->refs);
+    VP8LInitBackwardRefs(refs);
+  }
+}
+
+int VP8LBackwardRefsAlloc(VP8LBackwardRefs* const refs, int max_size) {
+  assert(refs != NULL);
+  refs->size = 0;
+  refs->max_size = 0;
+  refs->refs = (PixOrCopy*)WebPSafeMalloc((uint64_t)max_size,
+                                          sizeof(*refs->refs));
+  if (refs->refs == NULL) return 0;
+  refs->max_size = max_size;
+  return 1;
+}
+
+// -----------------------------------------------------------------------------
+// Hash chains
+
+static WEBP_INLINE uint64_t GetPixPairHash64(const uint32_t* const argb) {
+  uint64_t key = ((uint64_t)(argb[1]) << 32) | argb[0];
+  key = (key * HASH_MULTIPLIER) >> (64 - HASH_BITS);
+  return key;
+}
+
+static int HashChainInit(HashChain* const p, int size) {
+  int i;
+  p->chain_ = (int*)WebPSafeMalloc((uint64_t)size, sizeof(*p->chain_));
+  if (p->chain_ == NULL) {
+    return 0;
+  }
+  for (i = 0; i < size; ++i) {
+    p->chain_[i] = -1;
+  }
+  for (i = 0; i < HASH_SIZE; ++i) {
+    p->hash_to_first_index_[i] = -1;
+  }
+  return 1;
+}
+
+static void HashChainDelete(HashChain* const p) {
+  if (p != NULL) {
+    free(p->chain_);
+    free(p);
+  }
+}
+
+// Insertion of two pixels at a time.
+static void HashChainInsert(HashChain* const p,
+                            const uint32_t* const argb, int pos) {
+  const uint64_t hash_code = GetPixPairHash64(argb);
+  p->chain_[pos] = p->hash_to_first_index_[hash_code];
+  p->hash_to_first_index_[hash_code] = pos;
+}
+
+static void GetParamsForHashChainFindCopy(int quality, int xsize,
+                                          int cache_bits, int* window_size,
+                                          int* iter_pos, int* iter_limit) {
+  const int iter_mult = (quality < 27) ? 1 : 1 + ((quality - 27) >> 4);
+  const int iter_neg = -iter_mult * (quality >> 1);
+  // Limit the backward-ref window size for lower qualities.
+  const int max_window_size = (quality > 50) ? WINDOW_SIZE
+                            : (quality > 25) ? (xsize << 8)
+                            : (xsize << 4);
+  assert(xsize > 0);
+  *window_size = (max_window_size > WINDOW_SIZE) ? WINDOW_SIZE
+               : max_window_size;
+  *iter_pos = 8 + (quality >> 3);
+  // For lower entropy images, the rigourous search loop in HashChainFindCopy
+  // can be relaxed.
+  *iter_limit = (cache_bits > 0) ? iter_neg : iter_neg / 2;
+}
+
+static int HashChainFindCopy(const HashChain* const p,
+                             int base_position, int xsize_signed,
+                             const uint32_t* const argb, int maxlen,
+                             int window_size, int iter_pos, int iter_limit,
+                             int* const distance_ptr,
+                             int* const length_ptr) {
+  const uint32_t* const argb_start = argb + base_position;
+  uint64_t best_val = 0;
+  uint32_t best_length = 1;
+  uint32_t best_distance = 0;
+  const uint32_t xsize = (uint32_t)xsize_signed;
+  const int min_pos =
+      (base_position > window_size) ? base_position - window_size : 0;
+  int pos;
+  assert(xsize > 0);
+  for (pos = p->hash_to_first_index_[GetPixPairHash64(argb_start)];
+       pos >= min_pos;
+       pos = p->chain_[pos]) {
+    uint64_t val;
+    uint32_t curr_length;
+    uint32_t distance;
+    if (iter_pos < 0) {
+      if (iter_pos < iter_limit || best_val >= 0xff0000) {
+        break;
+      }
+    }
+    --iter_pos;
+    if (argb[pos + best_length - 1] != argb_start[best_length - 1]) {
+      continue;
+    }
+    curr_length = FindMatchLength(argb + pos, argb_start, maxlen);
+    if (curr_length < best_length) {
+      continue;
+    }
+    distance = (uint32_t)(base_position - pos);
+    val = curr_length << 16;
+    // Favoring 2d locality here gives savings for certain images.
+    if (distance < 9 * xsize) {
+      const uint32_t y = distance / xsize;
+      uint32_t x = distance % xsize;
+      if (x > (xsize >> 1)) {
+        x = xsize - x;
+      }
+      if (x <= 7) {
+        val += 9 * 9 + 9 * 9;
+        val -= y * y + x * x;
+      }
+    }
+    if (best_val < val) {
+      best_val = val;
+      best_length = curr_length;
+      best_distance = distance;
+      if (curr_length >= MAX_LENGTH) {
+        break;
+      }
+      if ((best_distance == 1 || distance == xsize) &&
+          best_length >= 128) {
+        break;
+      }
+    }
+  }
+  *distance_ptr = (int)best_distance;
+  *length_ptr = best_length;
+  return (best_length >= MIN_LENGTH);
+}
+
+static WEBP_INLINE void PushBackCopy(VP8LBackwardRefs* const refs, int length) {
+  int size = refs->size;
+  while (length >= MAX_LENGTH) {
+    refs->refs[size++] = PixOrCopyCreateCopy(1, MAX_LENGTH);
+    length -= MAX_LENGTH;
+  }
+  if (length > 0) {
+    refs->refs[size++] = PixOrCopyCreateCopy(1, length);
+  }
+  refs->size = size;
+}
+
+static void BackwardReferencesRle(int xsize, int ysize,
+                                  const uint32_t* const argb,
+                                  VP8LBackwardRefs* const refs) {
+  const int pix_count = xsize * ysize;
+  int match_len = 0;
+  int i;
+  refs->size = 0;
+  PushBackCopy(refs, match_len);    // i=0 case
+  refs->refs[refs->size++] = PixOrCopyCreateLiteral(argb[0]);
+  for (i = 1; i < pix_count; ++i) {
+    if (argb[i] == argb[i - 1]) {
+      ++match_len;
+    } else {
+      PushBackCopy(refs, match_len);
+      match_len = 0;
+      refs->refs[refs->size++] = PixOrCopyCreateLiteral(argb[i]);
+    }
+  }
+  PushBackCopy(refs, match_len);
+}
+
+static int BackwardReferencesHashChain(int xsize, int ysize,
+                                       const uint32_t* const argb,
+                                       int cache_bits, int quality,
+                                       VP8LBackwardRefs* const refs) {
+  int i;
+  int ok = 0;
+  int cc_init = 0;
+  const int use_color_cache = (cache_bits > 0);
+  const int pix_count = xsize * ysize;
+  HashChain* const hash_chain = (HashChain*)malloc(sizeof(*hash_chain));
+  VP8LColorCache hashers;
+  int window_size = WINDOW_SIZE;
+  int iter_pos = 1;
+  int iter_limit = -1;
+
+  if (hash_chain == NULL) return 0;
+  if (use_color_cache) {
+    cc_init = VP8LColorCacheInit(&hashers, cache_bits);
+    if (!cc_init) goto Error;
+  }
+
+  if (!HashChainInit(hash_chain, pix_count)) goto Error;
+
+  refs->size = 0;
+  GetParamsForHashChainFindCopy(quality, xsize, cache_bits,
+                                &window_size, &iter_pos, &iter_limit);
+  for (i = 0; i < pix_count; ) {
+    // Alternative#1: Code the pixels starting at 'i' using backward reference.
+    int offset = 0;
+    int len = 0;
+    if (i < pix_count - 1) {  // FindCopy(i,..) reads pixels at [i] and [i + 1].
+      int maxlen = pix_count - i;
+      if (maxlen > MAX_LENGTH) {
+        maxlen = MAX_LENGTH;
+      }
+      HashChainFindCopy(hash_chain, i, xsize, argb, maxlen,
+                        window_size, iter_pos, iter_limit,
+                        &offset, &len);
+    }
+    if (len >= MIN_LENGTH) {
+      // Alternative#2: Insert the pixel at 'i' as literal, and code the
+      // pixels starting at 'i + 1' using backward reference.
+      int offset2 = 0;
+      int len2 = 0;
+      int k;
+      HashChainInsert(hash_chain, &argb[i], i);
+      if (i < pix_count - 2) {  // FindCopy(i+1,..) reads [i + 1] and [i + 2].
+        int maxlen = pix_count - (i + 1);
+        if (maxlen > MAX_LENGTH) {
+          maxlen = MAX_LENGTH;
+        }
+        HashChainFindCopy(hash_chain, i + 1, xsize, argb, maxlen,
+                          window_size, iter_pos, iter_limit,
+                          &offset2, &len2);
+        if (len2 > len + 1) {
+          const uint32_t pixel = argb[i];
+          // Alternative#2 is a better match. So push pixel at 'i' as literal.
+          if (use_color_cache && VP8LColorCacheContains(&hashers, pixel)) {
+            const int ix = VP8LColorCacheGetIndex(&hashers, pixel);
+            refs->refs[refs->size] = PixOrCopyCreateCacheIdx(ix);
+          } else {
+            refs->refs[refs->size] = PixOrCopyCreateLiteral(pixel);
+          }
+          ++refs->size;
+          if (use_color_cache) VP8LColorCacheInsert(&hashers, pixel);
+          i++;  // Backward reference to be done for next pixel.
+          len = len2;
+          offset = offset2;
+        }
+      }
+      if (len >= MAX_LENGTH) {
+        len = MAX_LENGTH - 1;
+      }
+      refs->refs[refs->size++] = PixOrCopyCreateCopy(offset, len);
+      if (use_color_cache) {
+        for (k = 0; k < len; ++k) {
+          VP8LColorCacheInsert(&hashers, argb[i + k]);
+        }
+      }
+      // Add to the hash_chain (but cannot add the last pixel).
+      {
+        const int last = (len < pix_count - 1 - i) ? len : pix_count - 1 - i;
+        for (k = 1; k < last; ++k) {
+          HashChainInsert(hash_chain, &argb[i + k], i + k);
+        }
+      }
+      i += len;
+    } else {
+      const uint32_t pixel = argb[i];
+      if (use_color_cache && VP8LColorCacheContains(&hashers, pixel)) {
+        // push pixel as a PixOrCopyCreateCacheIdx pixel
+        const int ix = VP8LColorCacheGetIndex(&hashers, pixel);
+        refs->refs[refs->size] = PixOrCopyCreateCacheIdx(ix);
+      } else {
+        refs->refs[refs->size] = PixOrCopyCreateLiteral(pixel);
+      }
+      ++refs->size;
+      if (use_color_cache) VP8LColorCacheInsert(&hashers, pixel);
+      if (i + 1 < pix_count) {
+        HashChainInsert(hash_chain, &argb[i], i);
+      }
+      ++i;
+    }
+  }
+  ok = 1;
+Error:
+  if (cc_init) VP8LColorCacheClear(&hashers);
+  HashChainDelete(hash_chain);
+  return ok;
+}
+
+// -----------------------------------------------------------------------------
+
+typedef struct {
+  double alpha_[VALUES_IN_BYTE];
+  double red_[VALUES_IN_BYTE];
+  double literal_[PIX_OR_COPY_CODES_MAX];
+  double blue_[VALUES_IN_BYTE];
+  double distance_[NUM_DISTANCE_CODES];
+} CostModel;
+
+static int BackwardReferencesTraceBackwards(
+    int xsize, int ysize, int recursive_cost_model,
+    const uint32_t* const argb, int quality, int cache_bits,
+    VP8LBackwardRefs* const refs);
+
+static void ConvertPopulationCountTableToBitEstimates(
+    int num_symbols, const int population_counts[], double output[]) {
+  int sum = 0;
+  int nonzeros = 0;
+  int i;
+  for (i = 0; i < num_symbols; ++i) {
+    sum += population_counts[i];
+    if (population_counts[i] > 0) {
+      ++nonzeros;
+    }
+  }
+  if (nonzeros <= 1) {
+    memset(output, 0, num_symbols * sizeof(*output));
+  } else {
+    const double logsum = VP8LFastLog2(sum);
+    for (i = 0; i < num_symbols; ++i) {
+      output[i] = logsum - VP8LFastLog2(population_counts[i]);
+    }
+  }
+}
+
+static int CostModelBuild(CostModel* const m, int xsize, int ysize,
+                          int recursion_level, const uint32_t* const argb,
+                          int quality, int cache_bits) {
+  int ok = 0;
+  VP8LHistogram histo;
+  VP8LBackwardRefs refs;
+
+  if (!VP8LBackwardRefsAlloc(&refs, xsize * ysize)) goto Error;
+
+  if (recursion_level > 0) {
+    if (!BackwardReferencesTraceBackwards(xsize, ysize, recursion_level - 1,
+                                          argb, quality, cache_bits, &refs)) {
+      goto Error;
+    }
+  } else {
+    if (!BackwardReferencesHashChain(xsize, ysize, argb, cache_bits, quality,
+                                     &refs)) {
+      goto Error;
+    }
+  }
+  VP8LHistogramCreate(&histo, &refs, cache_bits);
+  ConvertPopulationCountTableToBitEstimates(
+      VP8LHistogramNumCodes(&histo), histo.literal_, m->literal_);
+  ConvertPopulationCountTableToBitEstimates(
+      VALUES_IN_BYTE, histo.red_, m->red_);
+  ConvertPopulationCountTableToBitEstimates(
+      VALUES_IN_BYTE, histo.blue_, m->blue_);
+  ConvertPopulationCountTableToBitEstimates(
+      VALUES_IN_BYTE, histo.alpha_, m->alpha_);
+  ConvertPopulationCountTableToBitEstimates(
+      NUM_DISTANCE_CODES, histo.distance_, m->distance_);
+  ok = 1;
+
+ Error:
+  VP8LClearBackwardRefs(&refs);
+  return ok;
+}
+
+static WEBP_INLINE double GetLiteralCost(const CostModel* const m, uint32_t v) {
+  return m->alpha_[v >> 24] +
+         m->red_[(v >> 16) & 0xff] +
+         m->literal_[(v >> 8) & 0xff] +
+         m->blue_[v & 0xff];
+}
+
+static WEBP_INLINE double GetCacheCost(const CostModel* const m, uint32_t idx) {
+  const int literal_idx = VALUES_IN_BYTE + NUM_LENGTH_CODES + idx;
+  return m->literal_[literal_idx];
+}
+
+static WEBP_INLINE double GetLengthCost(const CostModel* const m,
+                                        uint32_t length) {
+  int code, extra_bits_count, extra_bits_value;
+  PrefixEncode(length, &code, &extra_bits_count, &extra_bits_value);
+  return m->literal_[VALUES_IN_BYTE + code] + extra_bits_count;
+}
+
+static WEBP_INLINE double GetDistanceCost(const CostModel* const m,
+                                          uint32_t distance) {
+  int code, extra_bits_count, extra_bits_value;
+  PrefixEncode(distance, &code, &extra_bits_count, &extra_bits_value);
+  return m->distance_[code] + extra_bits_count;
+}
+
+static int BackwardReferencesHashChainDistanceOnly(
+    int xsize, int ysize, int recursive_cost_model, const uint32_t* const argb,
+    int quality, int cache_bits, uint32_t* const dist_array) {
+  int i;
+  int ok = 0;
+  int cc_init = 0;
+  const int pix_count = xsize * ysize;
+  const int use_color_cache = (cache_bits > 0);
+  float* const cost =
+      (float*)WebPSafeMalloc((uint64_t)pix_count, sizeof(*cost));
+  CostModel* cost_model = (CostModel*)malloc(sizeof(*cost_model));
+  HashChain* hash_chain = (HashChain*)malloc(sizeof(*hash_chain));
+  VP8LColorCache hashers;
+  const double mul0 = (recursive_cost_model != 0) ? 1.0 : 0.68;
+  const double mul1 = (recursive_cost_model != 0) ? 1.0 : 0.82;
+  const int min_distance_code = 2;  // TODO(vikasa): tune as function of quality
+  int window_size = WINDOW_SIZE;
+  int iter_pos = 1;
+  int iter_limit = -1;
+
+  if (cost == NULL || cost_model == NULL || hash_chain == NULL) goto Error;
+
+  if (!HashChainInit(hash_chain, pix_count)) goto Error;
+
+  if (use_color_cache) {
+    cc_init = VP8LColorCacheInit(&hashers, cache_bits);
+    if (!cc_init) goto Error;
+  }
+
+  if (!CostModelBuild(cost_model, xsize, ysize, recursive_cost_model, argb,
+                      quality, cache_bits)) {
+    goto Error;
+  }
+
+  for (i = 0; i < pix_count; ++i) cost[i] = 1e38f;
+
+  // We loop one pixel at a time, but store all currently best points to
+  // non-processed locations from this point.
+  dist_array[0] = 0;
+  GetParamsForHashChainFindCopy(quality, xsize, cache_bits,
+                                &window_size, &iter_pos, &iter_limit);
+  for (i = 0; i < pix_count; ++i) {
+    double prev_cost = 0.0;
+    int shortmax;
+    if (i > 0) {
+      prev_cost = cost[i - 1];
+    }
+    for (shortmax = 0; shortmax < 2; ++shortmax) {
+      int offset = 0;
+      int len = 0;
+      if (i < pix_count - 1) {  // FindCopy reads pixels at [i] and [i + 1].
+        int maxlen = shortmax ? 2 : MAX_LENGTH;
+        if (maxlen > pix_count - i) {
+          maxlen = pix_count - i;
+        }
+        HashChainFindCopy(hash_chain, i, xsize, argb, maxlen,
+                          window_size, iter_pos, iter_limit,
+                          &offset, &len);
+      }
+      if (len >= MIN_LENGTH) {
+        const int code = DistanceToPlaneCode(xsize, offset);
+        const double distance_cost =
+            prev_cost + GetDistanceCost(cost_model, code);
+        int k;
+        for (k = 1; k < len; ++k) {
+          const double cost_val = distance_cost + GetLengthCost(cost_model, k);
+          if (cost[i + k] > cost_val) {
+            cost[i + k] = (float)cost_val;
+            dist_array[i + k] = k + 1;
+          }
+        }
+        // This if is for speedup only. It roughly doubles the speed, and
+        // makes compression worse by .1 %.
+        if (len >= 128 && code <= min_distance_code) {
+          // Long copy for short distances, let's skip the middle
+          // lookups for better copies.
+          // 1) insert the hashes.
+          if (use_color_cache) {
+            for (k = 0; k < len; ++k) {
+              VP8LColorCacheInsert(&hashers, argb[i + k]);
+            }
+          }
+          // 2) Add to the hash_chain (but cannot add the last pixel)
+          {
+            const int last = (len + i < pix_count - 1) ? len + i
+                                                       : pix_count - 1;
+            for (k = i; k < last; ++k) {
+              HashChainInsert(hash_chain, &argb[k], k);
+            }
+          }
+          // 3) jump.
+          i += len - 1;  // for loop does ++i, thus -1 here.
+          goto next_symbol;
+        }
+      }
+    }
+    if (i < pix_count - 1) {
+      HashChainInsert(hash_chain, &argb[i], i);
+    }
+    {
+      // inserting a literal pixel
+      double cost_val = prev_cost;
+      if (use_color_cache && VP8LColorCacheContains(&hashers, argb[i])) {
+        const int ix = VP8LColorCacheGetIndex(&hashers, argb[i]);
+        cost_val += GetCacheCost(cost_model, ix) * mul0;
+      } else {
+        cost_val += GetLiteralCost(cost_model, argb[i]) * mul1;
+      }
+      if (cost[i] > cost_val) {
+        cost[i] = (float)cost_val;
+        dist_array[i] = 1;  // only one is inserted.
+      }
+      if (use_color_cache) VP8LColorCacheInsert(&hashers, argb[i]);
+    }
+ next_symbol: ;
+  }
+  // Last pixel still to do, it can only be a single step if not reached
+  // through cheaper means already.
+  ok = 1;
+Error:
+  if (cc_init) VP8LColorCacheClear(&hashers);
+  HashChainDelete(hash_chain);
+  free(cost_model);
+  free(cost);
+  return ok;
+}
+
+// We pack the path at the end of *dist_array and return
+// a pointer to this part of the array. Example:
+// dist_array = [1x2xx3x2] => packed [1x2x1232], chosen_path = [1232]
+static void TraceBackwards(uint32_t* const dist_array,
+                           int dist_array_size,
+                           uint32_t** const chosen_path,
+                           int* const chosen_path_size) {
+  uint32_t* path = dist_array + dist_array_size;
+  uint32_t* cur = dist_array + dist_array_size - 1;
+  while (cur >= dist_array) {
+    const int k = *cur;
+    --path;
+    *path = k;
+    cur -= k;
+  }
+  *chosen_path = path;
+  *chosen_path_size = (int)(dist_array + dist_array_size - path);
+}
+
+static int BackwardReferencesHashChainFollowChosenPath(
+    int xsize, int ysize, const uint32_t* const argb,
+    int quality, int cache_bits,
+    const uint32_t* const chosen_path, int chosen_path_size,
+    VP8LBackwardRefs* const refs) {
+  const int pix_count = xsize * ysize;
+  const int use_color_cache = (cache_bits > 0);
+  int size = 0;
+  int i = 0;
+  int k;
+  int ix;
+  int ok = 0;
+  int cc_init = 0;
+  int window_size = WINDOW_SIZE;
+  int iter_pos = 1;
+  int iter_limit = -1;
+  HashChain* hash_chain = (HashChain*)malloc(sizeof(*hash_chain));
+  VP8LColorCache hashers;
+
+  if (hash_chain == NULL || !HashChainInit(hash_chain, pix_count)) {
+    goto Error;
+  }
+  if (use_color_cache) {
+    cc_init = VP8LColorCacheInit(&hashers, cache_bits);
+    if (!cc_init) goto Error;
+  }
+
+  refs->size = 0;
+  GetParamsForHashChainFindCopy(quality, xsize, cache_bits,
+                                &window_size, &iter_pos, &iter_limit);
+  for (ix = 0; ix < chosen_path_size; ++ix, ++size) {
+    int offset = 0;
+    int len = 0;
+    int maxlen = chosen_path[ix];
+    if (maxlen != 1) {
+      HashChainFindCopy(hash_chain, i, xsize, argb, maxlen,
+                        window_size, iter_pos, iter_limit,
+                        &offset, &len);
+      assert(len == maxlen);
+      refs->refs[size] = PixOrCopyCreateCopy(offset, len);
+      if (use_color_cache) {
+        for (k = 0; k < len; ++k) {
+          VP8LColorCacheInsert(&hashers, argb[i + k]);
+        }
+      }
+      {
+        const int last = (len < pix_count - 1 - i) ? len : pix_count - 1 - i;
+        for (k = 0; k < last; ++k) {
+          HashChainInsert(hash_chain, &argb[i + k], i + k);
+        }
+      }
+      i += len;
+    } else {
+      if (use_color_cache && VP8LColorCacheContains(&hashers, argb[i])) {
+        // push pixel as a color cache index
+        const int idx = VP8LColorCacheGetIndex(&hashers, argb[i]);
+        refs->refs[size] = PixOrCopyCreateCacheIdx(idx);
+      } else {
+        refs->refs[size] = PixOrCopyCreateLiteral(argb[i]);
+      }
+      if (use_color_cache) VP8LColorCacheInsert(&hashers, argb[i]);
+      if (i + 1 < pix_count) {
+        HashChainInsert(hash_chain, &argb[i], i);
+      }
+      ++i;
+    }
+  }
+  assert(size <= refs->max_size);
+  refs->size = size;
+  ok = 1;
+Error:
+  if (cc_init) VP8LColorCacheClear(&hashers);
+  HashChainDelete(hash_chain);
+  return ok;
+}
+
+// Returns 1 on success.
+static int BackwardReferencesTraceBackwards(int xsize, int ysize,
+                                            int recursive_cost_model,
+                                            const uint32_t* const argb,
+                                            int quality, int cache_bits,
+                                            VP8LBackwardRefs* const refs) {
+  int ok = 0;
+  const int dist_array_size = xsize * ysize;
+  uint32_t* chosen_path = NULL;
+  int chosen_path_size = 0;
+  uint32_t* dist_array =
+      (uint32_t*)WebPSafeMalloc((uint64_t)dist_array_size, sizeof(*dist_array));
+
+  if (dist_array == NULL) goto Error;
+
+  if (!BackwardReferencesHashChainDistanceOnly(
+      xsize, ysize, recursive_cost_model, argb, quality, cache_bits,
+      dist_array)) {
+    goto Error;
+  }
+  TraceBackwards(dist_array, dist_array_size, &chosen_path, &chosen_path_size);
+  if (!BackwardReferencesHashChainFollowChosenPath(
+      xsize, ysize, argb, quality, cache_bits, chosen_path, chosen_path_size,
+      refs)) {
+    goto Error;
+  }
+  ok = 1;
+ Error:
+  free(dist_array);
+  return ok;
+}
+
+static void BackwardReferences2DLocality(int xsize,
+                                         VP8LBackwardRefs* const refs) {
+  int i;
+  for (i = 0; i < refs->size; ++i) {
+    if (PixOrCopyIsCopy(&refs->refs[i])) {
+      const int dist = refs->refs[i].argb_or_distance;
+      const int transformed_dist = DistanceToPlaneCode(xsize, dist);
+      refs->refs[i].argb_or_distance = transformed_dist;
+    }
+  }
+}
+
+int VP8LGetBackwardReferences(int width, int height,
+                              const uint32_t* const argb,
+                              int quality, int cache_bits, int use_2d_locality,
+                              VP8LBackwardRefs* const best) {
+  int ok = 0;
+  int lz77_is_useful;
+  VP8LBackwardRefs refs_rle, refs_lz77;
+  const int num_pix = width * height;
+
+  VP8LBackwardRefsAlloc(&refs_rle, num_pix);
+  VP8LBackwardRefsAlloc(&refs_lz77, num_pix);
+  VP8LInitBackwardRefs(best);
+  if (refs_rle.refs == NULL || refs_lz77.refs == NULL) {
+ Error1:
+    VP8LClearBackwardRefs(&refs_rle);
+    VP8LClearBackwardRefs(&refs_lz77);
+    goto End;
+  }
+
+  if (!BackwardReferencesHashChain(width, height, argb, cache_bits, quality,
+                                   &refs_lz77)) {
+    goto End;
+  }
+  // Backward Reference using RLE only.
+  BackwardReferencesRle(width, height, argb, &refs_rle);
+
+  {
+    double bit_cost_lz77, bit_cost_rle;
+    VP8LHistogram* const histo = (VP8LHistogram*)malloc(sizeof(*histo));
+    if (histo == NULL) goto Error1;
+    // Evaluate lz77 coding
+    VP8LHistogramCreate(histo, &refs_lz77, cache_bits);
+    bit_cost_lz77 = VP8LHistogramEstimateBits(histo);
+    // Evaluate RLE coding
+    VP8LHistogramCreate(histo, &refs_rle, cache_bits);
+    bit_cost_rle = VP8LHistogramEstimateBits(histo);
+    // Decide if LZ77 is useful.
+    lz77_is_useful = (bit_cost_lz77 < bit_cost_rle);
+    free(histo);
+  }
+
+  // Choose appropriate backward reference.
+  if (lz77_is_useful) {
+    // TraceBackwards is costly. Don't execute it at lower quality (q <= 10).
+    const int try_lz77_trace_backwards = (quality > 10);
+    *best = refs_lz77;   // default guess: lz77 is better
+    VP8LClearBackwardRefs(&refs_rle);
+    if (try_lz77_trace_backwards) {
+      // Set recursion level for large images using a color cache.
+      const int recursion_level =
+          (num_pix < 320 * 200) && (cache_bits > 0) ? 1 : 0;
+      VP8LBackwardRefs refs_trace;
+      if (!VP8LBackwardRefsAlloc(&refs_trace, num_pix)) {
+        goto End;
+      }
+      if (BackwardReferencesTraceBackwards(width, height, recursion_level, argb,
+                                           quality, cache_bits, &refs_trace)) {
+        VP8LClearBackwardRefs(&refs_lz77);
+        *best = refs_trace;
+      }
+    }
+  } else {
+    VP8LClearBackwardRefs(&refs_lz77);
+    *best = refs_rle;
+  }
+
+  if (use_2d_locality) BackwardReferences2DLocality(width, best);
+
+  ok = 1;
+
+ End:
+  if (!ok) {
+    VP8LClearBackwardRefs(best);
+  }
+  return ok;
+}
+
+// Returns 1 on success.
+static int ComputeCacheHistogram(const uint32_t* const argb,
+                                 int xsize, int ysize,
+                                 const VP8LBackwardRefs* const refs,
+                                 int cache_bits,
+                                 VP8LHistogram* const histo) {
+  int pixel_index = 0;
+  int i;
+  uint32_t k;
+  VP8LColorCache hashers;
+  const int use_color_cache = (cache_bits > 0);
+  int cc_init = 0;
+
+  if (use_color_cache) {
+    cc_init = VP8LColorCacheInit(&hashers, cache_bits);
+    if (!cc_init) return 0;
+  }
+
+  for (i = 0; i < refs->size; ++i) {
+    const PixOrCopy* const v = &refs->refs[i];
+    if (PixOrCopyIsLiteral(v)) {
+      if (use_color_cache &&
+          VP8LColorCacheContains(&hashers, argb[pixel_index])) {
+        // push pixel as a cache index
+        const int ix = VP8LColorCacheGetIndex(&hashers, argb[pixel_index]);
+        const PixOrCopy token = PixOrCopyCreateCacheIdx(ix);
+        VP8LHistogramAddSinglePixOrCopy(histo, &token);
+      } else {
+        VP8LHistogramAddSinglePixOrCopy(histo, v);
+      }
+    } else {
+      VP8LHistogramAddSinglePixOrCopy(histo, v);
+    }
+    if (use_color_cache) {
+      for (k = 0; k < PixOrCopyLength(v); ++k) {
+        VP8LColorCacheInsert(&hashers, argb[pixel_index + k]);
+      }
+    }
+    pixel_index += PixOrCopyLength(v);
+  }
+  assert(pixel_index == xsize * ysize);
+  (void)xsize;  // xsize is not used in non-debug compilations otherwise.
+  (void)ysize;  // ysize is not used in non-debug compilations otherwise.
+  if (cc_init) VP8LColorCacheClear(&hashers);
+  return 1;
+}
+
+// Returns how many bits are to be used for a color cache.
+int VP8LCalculateEstimateForCacheSize(const uint32_t* const argb,
+                                      int xsize, int ysize,
+                                      int* const best_cache_bits) {
+  int ok = 0;
+  int cache_bits;
+  double lowest_entropy = 1e99;
+  VP8LBackwardRefs refs;
+  static const double kSmallPenaltyForLargeCache = 4.0;
+  static const int quality = 30;
+  if (!VP8LBackwardRefsAlloc(&refs, xsize * ysize) ||
+      !BackwardReferencesHashChain(xsize, ysize, argb, 0, quality, &refs)) {
+    goto Error;
+  }
+  for (cache_bits = 0; cache_bits <= MAX_COLOR_CACHE_BITS; ++cache_bits) {
+    double cur_entropy;
+    VP8LHistogram histo;
+    VP8LHistogramInit(&histo, cache_bits);
+    ComputeCacheHistogram(argb, xsize, ysize, &refs, cache_bits, &histo);
+    cur_entropy = VP8LHistogramEstimateBits(&histo) +
+        kSmallPenaltyForLargeCache * cache_bits;
+    if (cache_bits == 0 || cur_entropy < lowest_entropy) {
+      *best_cache_bits = cache_bits;
+      lowest_entropy = cur_entropy;
+    }
+  }
+  ok = 1;
+ Error:
+  VP8LClearBackwardRefs(&refs);
+  return ok;
+}
--- a/src/enc/backward_references.h
+++ b/src/enc/backward_references.h
@ -0,0 +1,219 @@
+// Copyright 2012 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// Author: Jyrki Alakuijala (jyrki@google.com)
+//
+
+#ifndef WEBP_ENC_BACKWARD_REFERENCES_H_
+#define WEBP_ENC_BACKWARD_REFERENCES_H_
+
+#include <assert.h>
+#include <stdlib.h>
+#include "../webp/types.h"
+#include "../webp/format_constants.h"
+
+#if defined(__cplusplus) || defined(c_plusplus)
+extern "C" {
+#endif
+
+// The spec allows 11, we use 9 bits to reduce memory consumption in encoding.
+// Having 9 instead of 11 only removes about 0.25 % of compression density.
+#define MAX_COLOR_CACHE_BITS 9
+
+// Max ever number of codes we'll use:
+#define PIX_OR_COPY_CODES_MAX \
+    (NUM_LITERAL_CODES + NUM_LENGTH_CODES + (1 << MAX_COLOR_CACHE_BITS))
+
+// -----------------------------------------------------------------------------
+// PrefixEncode()
+
+// use GNU builtins where available.
+#if defined(__GNUC__) && \
+    ((__GNUC__ == 3 && __GNUC_MINOR__ >= 4) || __GNUC__ >= 4)
+static WEBP_INLINE int BitsLog2Floor(uint32_t n) {
+  assert(n != 0);
+  return 31 ^ __builtin_clz(n);
+}
+#elif defined(_MSC_VER) && (defined(_M_X64) || defined(_M_IX86))
+#include <intrin.h>
+#pragma intrinsic(_BitScanReverse)
+
+static WEBP_INLINE int BitsLog2Floor(uint32_t n) {
+  unsigned long first_set_bit;
+  assert(n != 0);
+  _BitScanReverse(&first_set_bit, n);
+  return first_set_bit;
+}
+#else
+// Returns (int)floor(log2(n)). n must be > 0.
+static WEBP_INLINE int BitsLog2Floor(uint32_t n) {
+  int log = 0;
+  uint32_t value = n;
+  int i;
+
+  assert(n != 0);
+  for (i = 4; i >= 0; --i) {
+    const int shift = (1 << i);
+    const uint32_t x = value >> shift;
+    if (x != 0) {
+      value = x;
+      log += shift;
+    }
+  }
+  return log;
+}
+#endif
+
+static WEBP_INLINE int VP8LBitsLog2Ceiling(uint32_t n) {
+  const int log_floor = BitsLog2Floor(n);
+  if (n == (n & ~(n - 1)))  // zero or a power of two.
+    return log_floor;
+  else
+    return log_floor + 1;
+}
+
+// Splitting of distance and length codes into prefixes and
+// extra bits. The prefixes are encoded with an entropy code
+// while the extra bits are stored just as normal bits.
+static WEBP_INLINE void PrefixEncode(int distance, int* const code,
+                                     int* const extra_bits_count,
+                                     int* const extra_bits_value) {
+  if (distance > 2) {  // Collect the two most significant bits.
+    const int highest_bit = BitsLog2Floor(--distance);
+    const int second_highest_bit = (distance >> (highest_bit - 1)) & 1;
+    *extra_bits_count = highest_bit - 1;
+    *extra_bits_value = distance & ((1 << *extra_bits_count) - 1);
+    *code = 2 * highest_bit + second_highest_bit;
+  } else {
+    *extra_bits_count = 0;
+    *extra_bits_value = 0;
+    *code = (distance == 2) ? 1 : 0;
+  }
+}
+
+// -----------------------------------------------------------------------------
+// PixOrCopy
+
+enum Mode {
+  kLiteral,
+  kCacheIdx,
+  kCopy,
+  kNone
+};
+
+typedef struct {
+  // mode as uint8_t to make the memory layout to be exactly 8 bytes.
+  uint8_t mode;
+  uint16_t len;
+  uint32_t argb_or_distance;
+} PixOrCopy;
+
+static WEBP_INLINE PixOrCopy PixOrCopyCreateCopy(uint32_t distance,
+                                                 uint16_t len) {
+  PixOrCopy retval;
+  retval.mode = kCopy;
+  retval.argb_or_distance = distance;
+  retval.len = len;
+  return retval;
+}
+
+static WEBP_INLINE PixOrCopy PixOrCopyCreateCacheIdx(int idx) {
+  PixOrCopy retval;
+  assert(idx >= 0);
+  assert(idx < (1 << MAX_COLOR_CACHE_BITS));
+  retval.mode = kCacheIdx;
+  retval.argb_or_distance = idx;
+  retval.len = 1;
+  return retval;
+}
+
+static WEBP_INLINE PixOrCopy PixOrCopyCreateLiteral(uint32_t argb) {
+  PixOrCopy retval;
+  retval.mode = kLiteral;
+  retval.argb_or_distance = argb;
+  retval.len = 1;
+  return retval;
+}
+
+static WEBP_INLINE int PixOrCopyIsLiteral(const PixOrCopy* const p) {
+  return (p->mode == kLiteral);
+}
+
+static WEBP_INLINE int PixOrCopyIsCacheIdx(const PixOrCopy* const p) {
+  return (p->mode == kCacheIdx);
+}
+
+static WEBP_INLINE int PixOrCopyIsCopy(const PixOrCopy* const p) {
+  return (p->mode == kCopy);
+}
+
+static WEBP_INLINE uint32_t PixOrCopyLiteral(const PixOrCopy* const p,
+                                             int component) {
+  assert(p->mode == kLiteral);
+  return (p->argb_or_distance >> (component * 8)) & 0xff;
+}
+
+static WEBP_INLINE uint32_t PixOrCopyLength(const PixOrCopy* const p) {
+  return p->len;
+}
+
+static WEBP_INLINE uint32_t PixOrCopyArgb(const PixOrCopy* const p) {
+  assert(p->mode == kLiteral);
+  return p->argb_or_distance;
+}
+
+static WEBP_INLINE uint32_t PixOrCopyCacheIdx(const PixOrCopy* const p) {
+  assert(p->mode == kCacheIdx);
+  assert(p->argb_or_distance < (1U << MAX_COLOR_CACHE_BITS));
+  return p->argb_or_distance;
+}
+
+static WEBP_INLINE uint32_t PixOrCopyDistance(const PixOrCopy* const p) {
+  assert(p->mode == kCopy);
+  return p->argb_or_distance;
+}
+
+// -----------------------------------------------------------------------------
+// VP8LBackwardRefs
+
+typedef struct {
+  PixOrCopy* refs;
+  int size;      // currently used
+  int max_size;  // maximum capacity
+} VP8LBackwardRefs;
+
+// Initialize the object. Must be called first. 'refs' can be NULL.
+void VP8LInitBackwardRefs(VP8LBackwardRefs* const refs);
+
+// Release memory and re-initialize the object. 'refs' can be NULL.
+void VP8LClearBackwardRefs(VP8LBackwardRefs* const refs);
+
+// Allocate 'max_size' references. Returns false in case of memory error.
+int VP8LBackwardRefsAlloc(VP8LBackwardRefs* const refs, int max_size);
+
+// -----------------------------------------------------------------------------
+// Main entry points
+
+// Evaluates best possible backward references for specified quality.
+// Further optimize for 2D locality if use_2d_locality flag is set.
+int VP8LGetBackwardReferences(int width, int height,
+                              const uint32_t* const argb,
+                              int quality, int cache_bits, int use_2d_locality,
+                              VP8LBackwardRefs* const best);
+
+// Produce an estimate for a good color cache size for the image.
+int VP8LCalculateEstimateForCacheSize(const uint32_t* const argb,
+                                      int xsize, int ysize,
+                                      int* const best_cache_bits);
+
+#if defined(__cplusplus) || defined(c_plusplus)
+}
+#endif
+
+#endif  // WEBP_ENC_BACKWARD_REFERENCES_H_
--- a/src/enc/config.c
+++ b/src/enc/config.c
@ -1,15 +1,16 @@
-// Copyright 2011 Google Inc.
+// Copyright 2011 Google Inc. All Rights Reserved.
 //
-// This code is licensed under the same terms as WebM:
-//  Software License Agreement:  http://www.webmproject.org/license/software/
-//  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
 // -----------------------------------------------------------------------------
 //
 // Coding tools configuration
 //
 // Author: Skal (pascal.massimino@gmail.com)

-#include <assert.h>
 #include "../webp/encode.h"

 #if defined(__cplusplus) || defined(c_plusplus)
@ -20,9 +21,9 @@ extern "C" {
 // WebPConfig
 //------------------------------------------------------------------------------

-int WebPConfigInitInternal(WebPConfig* const config,
+int WebPConfigInitInternal(WebPConfig* config,
                           WebPPreset preset, float quality, int version) {
-  if (version != WEBP_ENCODER_ABI_VERSION) {
+  if (WEBP_ABI_IS_INCOMPATIBLE(version, WEBP_ENCODER_ABI_VERSION)) {
    return 0;   // caller/system version mismatch!
  }
  if (config == NULL) return 0;
@ -32,17 +33,24 @@ int WebPConfigInitInternal(WebPConfig* const config,
  config->target_PSNR = 0.;
  config->method = 4;
  config->sns_strength = 50;
-  config->filter_strength = 20;   // default: light filtering
+  config->filter_strength = 60;   // rather high filtering, helps w/ gradients.
  config->filter_sharpness = 0;
-  config->filter_type = 0;        // default: simple
+  config->filter_type = 1;        // default: strong (so U/V is filtered too)
  config->partitions = 0;
  config->segments = 4;
  config->pass = 1;
  config->show_compressed = 0;
  config->preprocessing = 0;
  config->autofilter = 0;
-  config->alpha_compression = 0;
  config->partition_limit = 0;
+  config->alpha_compression = 1;
+  config->alpha_filtering = 1;
+  config->alpha_quality = 100;
+  config->lossless = 0;
+  config->image_hint = WEBP_HINT_DEFAULT;
+  config->emulate_jpeg_size = 0;
+  config->thread_level = 0;
+  config->low_memory = 0;

  // TODO(skal): tune.
  switch (preset) {
@ -77,7 +85,7 @@ int WebPConfigInitInternal(WebPConfig* const config,
  return WebPValidateConfig(config);
 }

-int WebPValidateConfig(const WebPConfig* const config) {
+int WebPValidateConfig(const WebPConfig* config) {
  if (config == NULL) return 0;
  if (config->quality < 0 || config->quality > 100)
    return 0;
@ -111,6 +119,20 @@ int WebPValidateConfig(const WebPConfig* const config) {
    return 0;
  if (config->alpha_compression < 0)
    return 0;
+  if (config->alpha_filtering < 0)
+    return 0;
+  if (config->alpha_quality < 0 || config->alpha_quality > 100)
+    return 0;
+  if (config->lossless < 0 || config->lossless > 1)
+    return 0;
+  if (config->image_hint >= WEBP_HINT_LAST)
+    return 0;
+  if (config->emulate_jpeg_size < 0 || config->emulate_jpeg_size > 1)
+    return 0;
+  if (config->thread_level < 0 || config->thread_level > 1)
+    return 0;
+  if (config->low_memory < 0 || config->low_memory > 1)
+    return 0;
  return 1;
 }

--- a/src/enc/cost.c
+++ b/src/enc/cost.c
@ -1,17 +1,17 @@
-// Copyright 2011 Google Inc.
+// Copyright 2011 Google Inc. All Rights Reserved.
 //
-// This code is licensed under the same terms as WebM:
-//  Software License Agreement:  http://www.webmproject.org/license/software/
-//  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
 // -----------------------------------------------------------------------------
 //
 // Cost tables for level and modes
 //
 // Author: Skal (pascal.massimino@gmail.com)

-#include <assert.h>
-
-#include "cost.h"
+#include "./cost.h"

 #if defined(__cplusplus) || defined(c_plusplus)
 extern "C" {
@ -52,9 +52,9 @@ const uint16_t VP8EntropyCost[256] = {
 //------------------------------------------------------------------------------
 // Level cost tables

-// For each given level, the following table given the pattern of contexts
-// to use for coding it (in [][0]) as well as the bit value to use for
-// each context (in [][1]).
+// For each given level, the following table gives the pattern of contexts to
+// use for coding it (in [][0]) as well as the bit value to use for each
+// context (in [][1]).
 const uint16_t VP8LevelCodes[MAX_VARIABLE_LEVEL][2] = {
                  {0x001, 0x000}, {0x007, 0x001}, {0x00f, 0x005},
  {0x00f, 0x00d}, {0x033, 0x003}, {0x033, 0x003}, {0x033, 0x023},
@ -77,7 +77,7 @@ const uint16_t VP8LevelCodes[MAX_VARIABLE_LEVEL][2] = {

 // fixed costs for coding levels, deduce from the coding tree.
 // This is only the part that doesn't depend on the probability state.
-const uint16_t VP8LevelFixedCosts[2048] = {
+const uint16_t VP8LevelFixedCosts[MAX_LEVEL + 1] = {
     0,  256,  256,  256,  256,  432,  618,  630,
   731,  640,  640,  828,  901,  948, 1021, 1101,
  1174, 1221, 1294, 1042, 1085, 1115, 1158, 1202,
@ -356,9 +356,12 @@ static int VariableLevelCost(int level, const uint8_t probas[NUM_PROBAS]) {

 void VP8CalculateLevelCosts(VP8Proba* const proba) {
  int ctype, band, ctx;
+
+  if (!proba->dirty_) return;  // nothing to do.
+
  for (ctype = 0; ctype < NUM_TYPES; ++ctype) {
    for (band = 0; band < NUM_BANDS; ++band) {
-      for(ctx = 0; ctx < NUM_CTX; ++ctx) {
+      for (ctx = 0; ctx < NUM_CTX; ++ctx) {
        const uint8_t* const p = proba->coeffs_[ctype][band][ctx];
        uint16_t* const table = proba->level_cost_[ctype][band][ctx];
        const int cost_base = VP8BitCost(1, p[1]);
@ -372,6 +375,7 @@ void VP8CalculateLevelCosts(VP8Proba* const proba) {
      }
    }
  }
+  proba->dirty_ = 0;
 }

 //------------------------------------------------------------------------------
--- a/src/enc/cost.h
+++ b/src/enc/cost.h
@ -1,8 +1,10 @@
-// Copyright 2011 Google Inc.
+// Copyright 2011 Google Inc. All Rights Reserved.
 //
-// This code is licensed under the same terms as WebM:
-//  Software License Agreement:  http://www.webmproject.org/license/software/
-//  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
 // -----------------------------------------------------------------------------
 //
 // Cost tables for level and modes.
@ -12,32 +14,27 @@
 #ifndef WEBP_ENC_COST_H_
 #define WEBP_ENC_COST_H_

-#include "vp8enci.h"
+#include "./vp8enci.h"

 #if defined(__cplusplus) || defined(c_plusplus)
 extern "C" {
 #endif

-extern const uint16_t VP8LevelFixedCosts[2048];   // approximate cost per level
+// approximate cost per level:
+extern const uint16_t VP8LevelFixedCosts[MAX_LEVEL + 1];
 extern const uint16_t VP8EntropyCost[256];        // 8bit fixed-point log(p)

 // Cost of coding one event with probability 'proba'.
-static inline int VP8BitCost(int bit, uint8_t proba) {
+static WEBP_INLINE int VP8BitCost(int bit, uint8_t proba) {
  return !bit ? VP8EntropyCost[proba] : VP8EntropyCost[255 - proba];
 }

-// Cost of coding 'nb' 1's and 'total-nb' 0's using 'proba' probability.
-static inline uint64_t VP8BranchCost(uint64_t nb, uint64_t total,
-                                     uint8_t proba) {
-  return nb * VP8BitCost(1, proba) + (total - nb) * VP8BitCost(0, proba);
-}
-
 // Level cost calculations
 extern const uint16_t VP8LevelCodes[MAX_VARIABLE_LEVEL][2];
 void VP8CalculateLevelCosts(VP8Proba* const proba);
-static inline int VP8LevelCost(const uint16_t* const table, int level) {
+static WEBP_INLINE int VP8LevelCost(const uint16_t* const table, int level) {
  return VP8LevelFixedCosts[level]
-       + table[level > MAX_VARIABLE_LEVEL ? MAX_VARIABLE_LEVEL : level];
+       + table[(level > MAX_VARIABLE_LEVEL) ? MAX_VARIABLE_LEVEL : level];
 }

 // Mode costs
--- a/src/enc/enc.c
+++ b/src/enc/enc.c
@ -1,770 +0,0 @@
-// Copyright 2011 Google Inc.
-//
-// This code is licensed under the same terms as WebM:
-//  Software License Agreement:  http://www.webmproject.org/license/software/
-//  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
-// -----------------------------------------------------------------------------
-//
-// speed-critical functions.
-//
-// Author: Skal (pascal.massimino@gmail.com)
-
-#include <assert.h>
-#include "vp8enci.h"
-
-#if defined(__cplusplus) || defined(c_plusplus)
-extern "C" {
-#endif
-
-//------------------------------------------------------------------------------
-// Compute susceptibility based on DCT-coeff histograms:
-// the higher, the "easier" the macroblock is to compress.
-
-static int ClipAlpha(int alpha) {
-  return alpha < 0 ? 0 : alpha > 255 ? 255 : alpha;
-}
-
-int VP8GetAlpha(const int histo[MAX_COEFF_THRESH + 1]) {
-  int num = 0, den = 0, val = 0;
-  int k;
-  int alpha;
-  // note: changing this loop to avoid the numerous "k + 1" slows things down.
-  for (k = 0; k < MAX_COEFF_THRESH; ++k) {
-    if (histo[k + 1]) {
-      val += histo[k + 1];
-      num += val * (k + 1);
-      den += (k + 1) * (k + 1);
-    }
-  }
-  // we scale the value to a usable [0..255] range
-  alpha = den ? 10 * num / den - 5 : 0;
-  return ClipAlpha(alpha);
-}
-
-static int CollectHistogram(const uint8_t* ref, const uint8_t* pred,
-                            int start_block, int end_block) {
-  int histo[MAX_COEFF_THRESH + 1] = { 0 };
-  int16_t out[16];
-  int j, k;
-  for (j = start_block; j < end_block; ++j) {
-    VP8FTransform(ref + VP8Scan[j], pred + VP8Scan[j], out);
-
-    // Convert coefficients to bin (within out[]).
-    for (k = 0; k < 16; ++k) {
-      const int v = abs(out[k]) >> 2;
-      out[k] = (v > MAX_COEFF_THRESH) ? MAX_COEFF_THRESH : v;
-    }
-
-    // Use bin to update histogram.
-    for (k = 0; k < 16; ++k) {
-      histo[out[k]]++;
-    }
-  }
-
-  return VP8GetAlpha(histo);
-}
-
-//------------------------------------------------------------------------------
-// run-time tables (~4k)
-
-static uint8_t clip1[255 + 510 + 1];    // clips [-255,510] to [0,255]
-
-// We declare this variable 'volatile' to prevent instruction reordering
-// and make sure it's set to true _last_ (so as to be thread-safe)
-static volatile int tables_ok = 0;
-
-static void InitTables(void) {
-  if (!tables_ok) {
-    int i;
-    for (i = -255; i <= 255 + 255; ++i) {
-      clip1[255 + i] = (i < 0) ? 0 : (i > 255) ? 255 : i;
-    }
-    tables_ok = 1;
-  }
-}
-
-static inline uint8_t clip_8b(int v) {
-  return (!(v & ~0xff)) ? v : v < 0 ? 0 : 255;
-}
-
-//------------------------------------------------------------------------------
-// Transforms (Paragraph 14.4)
-
-#define STORE(x, y, v) \
-  dst[(x) + (y) * BPS] = clip_8b(ref[(x) + (y) * BPS] + ((v) >> 3))
-
-static const int kC1 = 20091 + (1 << 16);
-static const int kC2 = 35468;
-#define MUL(a, b) (((a) * (b)) >> 16)
-
-static inline void ITransformOne(const uint8_t* ref, const int16_t* in,
-                                 uint8_t* dst) {
-  int C[4 * 4], *tmp;
-  int i;
-  tmp = C;
-  for (i = 0; i < 4; ++i) {    // vertical pass
-    const int a = in[0] + in[8];
-    const int b = in[0] - in[8];
-    const int c = MUL(in[4], kC2) - MUL(in[12], kC1);
-    const int d = MUL(in[4], kC1) + MUL(in[12], kC2);
-    tmp[0] = a + d;
-    tmp[1] = b + c;
-    tmp[2] = b - c;
-    tmp[3] = a - d;
-    tmp += 4;
-    in++;
-  }
-
-  tmp = C;
-  for (i = 0; i < 4; ++i) {    // horizontal pass
-    const int dc = tmp[0] + 4;
-    const int a =  dc +  tmp[8];
-    const int b =  dc -  tmp[8];
-    const int c = MUL(tmp[4], kC2) - MUL(tmp[12], kC1);
-    const int d = MUL(tmp[4], kC1) + MUL(tmp[12], kC2);
-    STORE(0, i, a + d);
-    STORE(1, i, b + c);
-    STORE(2, i, b - c);
-    STORE(3, i, a - d);
-    tmp++;
-  }
-}
-
-static void ITransform(const uint8_t* ref, const int16_t* in, uint8_t* dst,
-                       int do_two) {
-  ITransformOne(ref, in, dst);
-  if (do_two) {
-    ITransformOne(ref + 4, in + 16, dst + 4);
-  }
-}
-
-static void FTransform(const uint8_t* src, const uint8_t* ref, int16_t* out) {
-  int i;
-  int tmp[16];
-  for (i = 0; i < 4; ++i, src += BPS, ref += BPS) {
-    const int d0 = src[0] - ref[0];
-    const int d1 = src[1] - ref[1];
-    const int d2 = src[2] - ref[2];
-    const int d3 = src[3] - ref[3];
-    const int a0 = (d0 + d3) << 3;
-    const int a1 = (d1 + d2) << 3;
-    const int a2 = (d1 - d2) << 3;
-    const int a3 = (d0 - d3) << 3;
-    tmp[0 + i * 4] = (a0 + a1);
-    tmp[1 + i * 4] = (a2 * 2217 + a3 * 5352 + 14500) >> 12;
-    tmp[2 + i * 4] = (a0 - a1);
-    tmp[3 + i * 4] = (a3 * 2217 - a2 * 5352 +  7500) >> 12;
-  }
-  for (i = 0; i < 4; ++i) {
-    const int a0 = (tmp[0 + i] + tmp[12 + i]);
-    const int a1 = (tmp[4 + i] + tmp[ 8 + i]);
-    const int a2 = (tmp[4 + i] - tmp[ 8 + i]);
-    const int a3 = (tmp[0 + i] - tmp[12 + i]);
-    out[0 + i] = (a0 + a1 + 7) >> 4;
-    out[4 + i] = ((a2 * 2217 + a3 * 5352 + 12000) >> 16) + (a3 != 0);
-    out[8 + i] = (a0 - a1 + 7) >> 4;
-    out[12+ i] = ((a3 * 2217 - a2 * 5352 + 51000) >> 16);
-  }
-}
-
-static void ITransformWHT(const int16_t* in, int16_t* out) {
-  int tmp[16];
-  int i;
-  for (i = 0; i < 4; ++i) {
-    const int a0 = in[0 + i] + in[12 + i];
-    const int a1 = in[4 + i] + in[ 8 + i];
-    const int a2 = in[4 + i] - in[ 8 + i];
-    const int a3 = in[0 + i] - in[12 + i];
-    tmp[0  + i] = a0 + a1;
-    tmp[8  + i] = a0 - a1;
-    tmp[4  + i] = a3 + a2;
-    tmp[12 + i] = a3 - a2;
-  }
-  for (i = 0; i < 4; ++i) {
-    const int dc = tmp[0 + i * 4] + 3;    // w/ rounder
-    const int a0 = dc             + tmp[3 + i * 4];
-    const int a1 = tmp[1 + i * 4] + tmp[2 + i * 4];
-    const int a2 = tmp[1 + i * 4] - tmp[2 + i * 4];
-    const int a3 = dc             - tmp[3 + i * 4];
-    out[ 0] = (a0 + a1) >> 3;
-    out[16] = (a3 + a2) >> 3;
-    out[32] = (a0 - a1) >> 3;
-    out[48] = (a3 - a2) >> 3;
-    out += 64;
-  }
-}
-
-static void FTransformWHT(const int16_t* in, int16_t* out) {
-  int tmp[16];
-  int i;
-  for (i = 0; i < 4; ++i, in += 64) {
-    const int a0 = (in[0 * 16] + in[2 * 16]) << 2;
-    const int a1 = (in[1 * 16] + in[3 * 16]) << 2;
-    const int a2 = (in[1 * 16] - in[3 * 16]) << 2;
-    const int a3 = (in[0 * 16] - in[2 * 16]) << 2;
-    tmp[0 + i * 4] = (a0 + a1) + (a0 != 0);
-    tmp[1 + i * 4] = a3 + a2;
-    tmp[2 + i * 4] = a3 - a2;
-    tmp[3 + i * 4] = a0 - a1;
-  }
-  for (i = 0; i < 4; ++i) {
-    const int a0 = (tmp[0 + i] + tmp[8 + i]);
-    const int a1 = (tmp[4 + i] + tmp[12+ i]);
-    const int a2 = (tmp[4 + i] - tmp[12+ i]);
-    const int a3 = (tmp[0 + i] - tmp[8 + i]);
-    const int b0 = a0 + a1;
-    const int b1 = a3 + a2;
-    const int b2 = a3 - a2;
-    const int b3 = a0 - a1;
-    out[ 0 + i] = (b0 + (b0 > 0) + 3) >> 3;
-    out[ 4 + i] = (b1 + (b1 > 0) + 3) >> 3;
-    out[ 8 + i] = (b2 + (b2 > 0) + 3) >> 3;
-    out[12 + i] = (b3 + (b3 > 0) + 3) >> 3;
-  }
-}
-
-#undef MUL
-#undef STORE
-
-//------------------------------------------------------------------------------
-// Intra predictions
-
-#define OUT(x, y) dst[(x) + (y) * BPS]
-
-static inline void Fill(uint8_t* dst, int value, int size) {
-  int j;
-  for (j = 0; j < size; ++j) {
-    memset(dst + j * BPS, value, size);
-  }
-}
-
-static inline void VerticalPred(uint8_t* dst, const uint8_t* top, int size) {
-  int j;
-  if (top) {
-    for (j = 0; j < size; ++j) memcpy(dst + j * BPS, top, size);
-  } else {
-    Fill(dst, 127, size);
-  }
-}
-
-static inline void HorizontalPred(uint8_t* dst, const uint8_t* left, int size) {
-  if (left) {
-    int j;
-    for (j = 0; j < size; ++j) {
-      memset(dst + j * BPS, left[j], size);
-    }
-  } else {
-    Fill(dst, 129, size);
-  }
-}
-
-static inline void TrueMotion(uint8_t* dst, const uint8_t* left,
-                              const uint8_t* top, int size) {
-  int y;
-  if (left) {
-    if (top) {
-      const uint8_t* const clip = clip1 + 255 - left[-1];
-      for (y = 0; y < size; ++y) {
-        const uint8_t* const clip_table = clip + left[y];
-        int x;
-        for (x = 0; x < size; ++x) {
-          dst[x] = clip_table[top[x]];
-        }
-        dst += BPS;
-      }
-    } else {
-      HorizontalPred(dst, left, size);
-    }
-  } else {
-    // true motion without left samples (hence: with default 129 value)
-    // is equivalent to VE prediction where you just copy the top samples.
-    // Note that if top samples are not available, the default value is
-    // then 129, and not 127 as in the VerticalPred case.
-    if (top) {
-      VerticalPred(dst, top, size);
-    } else {
-      Fill(dst, 129, size);
-    }
-  }
-}
-
-static inline void DCMode(uint8_t* dst, const uint8_t* left,
-                          const uint8_t* top,
-                          int size, int round, int shift) {
-  int DC = 0;
-  int j;
-  if (top) {
-    for (j = 0; j < size; ++j) DC += top[j];
-    if (left) {   // top and left present
-      for (j = 0; j < size; ++j) DC += left[j];
-    } else {      // top, but no left
-      DC += DC;
-    }
-    DC = (DC + round) >> shift;
-  } else if (left) {   // left but no top
-    for (j = 0; j < size; ++j) DC += left[j];
-    DC += DC;
-    DC = (DC + round) >> shift;
-  } else {   // no top, no left, nothing.
-    DC = 0x80;
-  }
-  Fill(dst, DC, size);
-}
-
-//------------------------------------------------------------------------------
-// Chroma 8x8 prediction (paragraph 12.2)
-
-static void IntraChromaPreds(uint8_t* dst, const uint8_t* left,
-                             const uint8_t* top) {
-  // U block
-  DCMode(C8DC8 + dst, left, top, 8, 8, 4);
-  VerticalPred(C8VE8 + dst, top, 8);
-  HorizontalPred(C8HE8 + dst, left, 8);
-  TrueMotion(C8TM8 + dst, left, top, 8);
-  // V block
-  dst += 8;
-  if (top) top += 8;
-  if (left) left += 16;
-  DCMode(C8DC8 + dst, left, top, 8, 8, 4);
-  VerticalPred(C8VE8 + dst, top, 8);
-  HorizontalPred(C8HE8 + dst, left, 8);
-  TrueMotion(C8TM8 + dst, left, top, 8);
-}
-
-//------------------------------------------------------------------------------
-// luma 16x16 prediction (paragraph 12.3)
-
-static void Intra16Preds(uint8_t* dst,
-                         const uint8_t* left, const uint8_t* top) {
-  DCMode(I16DC16 + dst, left, top, 16, 16, 5);
-  VerticalPred(I16VE16 + dst, top, 16);
-  HorizontalPred(I16HE16 + dst, left, 16);
-  TrueMotion(I16TM16 + dst, left, top, 16);
-}
-
-//------------------------------------------------------------------------------
-// luma 4x4 prediction
-
-#define AVG3(a, b, c) (((a) + 2 * (b) + (c) + 2) >> 2)
-#define AVG2(a, b) (((a) + (b) + 1) >> 1)
-
-static void VE4(uint8_t* dst, const uint8_t* top) {    // vertical
-  const uint8_t vals[4] = {
-    AVG3(top[-1], top[0], top[1]),
-    AVG3(top[ 0], top[1], top[2]),
-    AVG3(top[ 1], top[2], top[3]),
-    AVG3(top[ 2], top[3], top[4])
-  };
-  int i;
-  for (i = 0; i < 4; ++i) {
-    memcpy(dst + i * BPS, vals, 4);
-  }
-}
-
-static void HE4(uint8_t* dst, const uint8_t* top) {    // horizontal
-  const int X = top[-1];
-  const int I = top[-2];
-  const int J = top[-3];
-  const int K = top[-4];
-  const int L = top[-5];
-  *(uint32_t*)(dst + 0 * BPS) = 0x01010101U * AVG3(X, I, J);
-  *(uint32_t*)(dst + 1 * BPS) = 0x01010101U * AVG3(I, J, K);
-  *(uint32_t*)(dst + 2 * BPS) = 0x01010101U * AVG3(J, K, L);
-  *(uint32_t*)(dst + 3 * BPS) = 0x01010101U * AVG3(K, L, L);
-}
-
-static void DC4(uint8_t* dst, const uint8_t* top) {
-  uint32_t dc = 4;
-  int i;
-  for (i = 0; i < 4; ++i) dc += top[i] + top[-5 + i];
-  Fill(dst, dc >> 3, 4);
-}
-
-static void RD4(uint8_t* dst, const uint8_t* top) {
-  const int X = top[-1];
-  const int I = top[-2];
-  const int J = top[-3];
-  const int K = top[-4];
-  const int L = top[-5];
-  const int A = top[0];
-  const int B = top[1];
-  const int C = top[2];
-  const int D = top[3];
-  OUT(0, 3)                                     = AVG3(J, K, L);
-  OUT(0, 2) = OUT(1, 3)                         = AVG3(I, J, K);
-  OUT(0, 1) = OUT(1, 2) = OUT(2, 3)             = AVG3(X, I, J);
-  OUT(0, 0) = OUT(1, 1) = OUT(2, 2) = OUT(3, 3) = AVG3(A, X, I);
-  OUT(1, 0) = OUT(2, 1) = OUT(3, 2)             = AVG3(B, A, X);
-  OUT(2, 0) = OUT(3, 1)                         = AVG3(C, B, A);
-  OUT(3, 0)                                     = AVG3(D, C, B);
-}
-
-static void LD4(uint8_t* dst, const uint8_t* top) {
-  const int A = top[0];
-  const int B = top[1];
-  const int C = top[2];
-  const int D = top[3];
-  const int E = top[4];
-  const int F = top[5];
-  const int G = top[6];
-  const int H = top[7];
-  OUT(0, 0)                                     = AVG3(A, B, C);
-  OUT(1, 0) = OUT(0, 1)                         = AVG3(B, C, D);
-  OUT(2, 0) = OUT(1, 1) = OUT(0, 2)             = AVG3(C, D, E);
-  OUT(3, 0) = OUT(2, 1) = OUT(1, 2) = OUT(0, 3) = AVG3(D, E, F);
-  OUT(3, 1) = OUT(2, 2) = OUT(1, 3)             = AVG3(E, F, G);
-  OUT(3, 2) = OUT(2, 3)                         = AVG3(F, G, H);
-  OUT(3, 3)                                     = AVG3(G, H, H);
-}
-
-static void VR4(uint8_t* dst, const uint8_t* top) {
-  const int X = top[-1];
-  const int I = top[-2];
-  const int J = top[-3];
-  const int K = top[-4];
-  const int A = top[0];
-  const int B = top[1];
-  const int C = top[2];
-  const int D = top[3];
-  OUT(0, 0) = OUT(1, 2) = AVG2(X, A);
-  OUT(1, 0) = OUT(2, 2) = AVG2(A, B);
-  OUT(2, 0) = OUT(3, 2) = AVG2(B, C);
-  OUT(3, 0)             = AVG2(C, D);
-
-  OUT(0, 3) =             AVG3(K, J, I);
-  OUT(0, 2) =             AVG3(J, I, X);
-  OUT(0, 1) = OUT(1, 3) = AVG3(I, X, A);
-  OUT(1, 1) = OUT(2, 3) = AVG3(X, A, B);
-  OUT(2, 1) = OUT(3, 3) = AVG3(A, B, C);
-  OUT(3, 1) =             AVG3(B, C, D);
-}
-
-static void VL4(uint8_t* dst, const uint8_t* top) {
-  const int A = top[0];
-  const int B = top[1];
-  const int C = top[2];
-  const int D = top[3];
-  const int E = top[4];
-  const int F = top[5];
-  const int G = top[6];
-  const int H = top[7];
-  OUT(0, 0) =             AVG2(A, B);
-  OUT(1, 0) = OUT(0, 2) = AVG2(B, C);
-  OUT(2, 0) = OUT(1, 2) = AVG2(C, D);
-  OUT(3, 0) = OUT(2, 2) = AVG2(D, E);
-
-  OUT(0, 1) =             AVG3(A, B, C);
-  OUT(1, 1) = OUT(0, 3) = AVG3(B, C, D);
-  OUT(2, 1) = OUT(1, 3) = AVG3(C, D, E);
-  OUT(3, 1) = OUT(2, 3) = AVG3(D, E, F);
-              OUT(3, 2) = AVG3(E, F, G);
-              OUT(3, 3) = AVG3(F, G, H);
-}
-
-static void HU4(uint8_t* dst, const uint8_t* top) {
-  const int I = top[-2];
-  const int J = top[-3];
-  const int K = top[-4];
-  const int L = top[-5];
-  OUT(0, 0) =             AVG2(I, J);
-  OUT(2, 0) = OUT(0, 1) = AVG2(J, K);
-  OUT(2, 1) = OUT(0, 2) = AVG2(K, L);
-  OUT(1, 0) =             AVG3(I, J, K);
-  OUT(3, 0) = OUT(1, 1) = AVG3(J, K, L);
-  OUT(3, 1) = OUT(1, 2) = AVG3(K, L, L);
-  OUT(3, 2) = OUT(2, 2) =
-  OUT(0, 3) = OUT(1, 3) = OUT(2, 3) = OUT(3, 3) = L;
-}
-
-static void HD4(uint8_t* dst, const uint8_t* top) {
-  const int X = top[-1];
-  const int I = top[-2];
-  const int J = top[-3];
-  const int K = top[-4];
-  const int L = top[-5];
-  const int A = top[0];
-  const int B = top[1];
-  const int C = top[2];
-
-  OUT(0, 0) = OUT(2, 1) = AVG2(I, X);
-  OUT(0, 1) = OUT(2, 2) = AVG2(J, I);
-  OUT(0, 2) = OUT(2, 3) = AVG2(K, J);
-  OUT(0, 3)             = AVG2(L, K);
-
-  OUT(3, 0)             = AVG3(A, B, C);
-  OUT(2, 0)             = AVG3(X, A, B);
-  OUT(1, 0) = OUT(3, 1) = AVG3(I, X, A);
-  OUT(1, 1) = OUT(3, 2) = AVG3(J, I, X);
-  OUT(1, 2) = OUT(3, 3) = AVG3(K, J, I);
-  OUT(1, 3)             = AVG3(L, K, J);
-}
-
-static void TM4(uint8_t* dst, const uint8_t* top) {
-  int x, y;
-  const uint8_t* const clip = clip1 + 255 - top[-1];
-  for (y = 0; y < 4; ++y) {
-    const uint8_t* const clip_table = clip + top[-2 - y];
-    for (x = 0; x < 4; ++x) {
-      dst[x] = clip_table[top[x]];
-    }
-    dst += BPS;
-  }
-}
-
-#undef AVG3
-#undef AVG2
-
-// Left samples are top[-5 .. -2], top_left is top[-1], top are
-// located at top[0..3], and top right is top[4..7]
-static void Intra4Preds(uint8_t* dst, const uint8_t* top) {
-  DC4(I4DC4 + dst, top);
-  TM4(I4TM4 + dst, top);
-  VE4(I4VE4 + dst, top);
-  HE4(I4HE4 + dst, top);
-  RD4(I4RD4 + dst, top);
-  VR4(I4VR4 + dst, top);
-  LD4(I4LD4 + dst, top);
-  VL4(I4VL4 + dst, top);
-  HD4(I4HD4 + dst, top);
-  HU4(I4HU4 + dst, top);
-}
-
-//------------------------------------------------------------------------------
-// Metric
-
-static inline int GetSSE(const uint8_t* a, const uint8_t* b, int w, int h) {
-  int count = 0;
-  int y, x;
-  for (y = 0; y < h; ++y) {
-    for (x = 0; x < w; ++x) {
-      const int diff = (int)a[x] - b[x];
-      count += diff * diff;
-    }
-    a += BPS;
-    b += BPS;
-  }
-  return count;
-}
-
-static int SSE16x16(const uint8_t* a, const uint8_t* b) {
-  return GetSSE(a, b, 16, 16);
-}
-static int SSE16x8(const uint8_t* a, const uint8_t* b) {
-  return GetSSE(a, b, 16, 8);
-}
-static int SSE8x8(const uint8_t* a, const uint8_t* b) {
-  return GetSSE(a, b, 8, 8);
-}
-static int SSE4x4(const uint8_t* a, const uint8_t* b) {
-  return GetSSE(a, b, 4, 4);
-}
-
-//------------------------------------------------------------------------------
-// Texture distortion
-//
-// We try to match the spectral content (weighted) between source and
-// reconstructed samples.
-
-// Hadamard transform
-// Returns the weighted sum of the absolute value of transformed coefficients.
-static int TTransform(const uint8_t* in, const uint16_t* w) {
-  int sum = 0;
-  int tmp[16];
-  int i;
-  // horizontal pass
-  for (i = 0; i < 4; ++i, in += BPS) {
-    const int a0 = (in[0] + in[2]) << 2;
-    const int a1 = (in[1] + in[3]) << 2;
-    const int a2 = (in[1] - in[3]) << 2;
-    const int a3 = (in[0] - in[2]) << 2;
-    tmp[0 + i * 4] = a0 + a1 + (a0 != 0);
-    tmp[1 + i * 4] = a3 + a2;
-    tmp[2 + i * 4] = a3 - a2;
-    tmp[3 + i * 4] = a0 - a1;
-  }
-  // vertical pass
-  for (i = 0; i < 4; ++i, ++w) {
-    const int a0 = (tmp[0 + i] + tmp[8 + i]);
-    const int a1 = (tmp[4 + i] + tmp[12+ i]);
-    const int a2 = (tmp[4 + i] - tmp[12+ i]);
-    const int a3 = (tmp[0 + i] - tmp[8 + i]);
-    const int b0 = a0 + a1;
-    const int b1 = a3 + a2;
-    const int b2 = a3 - a2;
-    const int b3 = a0 - a1;
-    // abs((b + (b<0) + 3) >> 3) = (abs(b) + 3) >> 3
-    sum += w[ 0] * ((abs(b0) + 3) >> 3);
-    sum += w[ 4] * ((abs(b1) + 3) >> 3);
-    sum += w[ 8] * ((abs(b2) + 3) >> 3);
-    sum += w[12] * ((abs(b3) + 3) >> 3);
-  }
-  return sum;
-}
-
-static int Disto4x4(const uint8_t* const a, const uint8_t* const b,
-                    const uint16_t* const w) {
-  const int sum1 = TTransform(a, w);
-  const int sum2 = TTransform(b, w);
-  return (abs(sum2 - sum1) + 8) >> 4;
-}
-
-static int Disto16x16(const uint8_t* const a, const uint8_t* const b,
-                      const uint16_t* const w) {
-  int D = 0;
-  int x, y;
-  for (y = 0; y < 16 * BPS; y += 4 * BPS) {
-    for (x = 0; x < 16; x += 4) {
-      D += Disto4x4(a + x + y, b + x + y, w);
-    }
-  }
-  return D;
-}
-
-//------------------------------------------------------------------------------
-// Quantization
-//
-
-// Simple quantization
-static int QuantizeBlock(int16_t in[16], int16_t out[16],
-                         int n, const VP8Matrix* const mtx) {
-  int last = -1;
-  for (; n < 16; ++n) {
-    const int j = VP8Zigzag[n];
-    const int sign = (in[j] < 0);
-    int coeff = (sign ? -in[j] : in[j]) + mtx->sharpen_[j];
-    if (coeff > 2047) coeff = 2047;
-    if (coeff > mtx->zthresh_[j]) {
-      const int Q = mtx->q_[j];
-      const int iQ = mtx->iq_[j];
-      const int B = mtx->bias_[j];
-      out[n] = QUANTDIV(coeff, iQ, B);
-      if (sign) out[n] = -out[n];
-      in[j] = out[n] * Q;
-      if (out[n]) last = n;
-    } else {
-      out[n] = 0;
-      in[j] = 0;
-    }
-  }
-  return (last >= 0);
-}
-
-//------------------------------------------------------------------------------
-// Block copy
-
-static inline void Copy(const uint8_t* src, uint8_t* dst, int size) {
-  int y;
-  for (y = 0; y < size; ++y) {
-    memcpy(dst, src, size);
-    src += BPS;
-    dst += BPS;
-  }
-}
-
-static void Copy4x4(const uint8_t* src, uint8_t* dst) { Copy(src, dst, 4); }
-static void Copy8x8(const uint8_t* src, uint8_t* dst) { Copy(src, dst, 8); }
-static void Copy16x16(const uint8_t* src, uint8_t* dst) { Copy(src, dst, 16); }
-
-//------------------------------------------------------------------------------
-// SSE2 detection.
-//
-
-#if defined(__pic__) && defined(__i386__)
-static inline void GetCPUInfo(int cpu_info[4], int info_type) {
-  __asm__ volatile (
-    "mov %%ebx, %%edi\n"
-    "cpuid\n"
-    "xchg %%edi, %%ebx\n"
-    : "=a"(cpu_info[0]), "=D"(cpu_info[1]), "=c"(cpu_info[2]), "=d"(cpu_info[3])
-    : "a"(info_type));
-}
-#elif defined(__i386__) || defined(__x86_64__)
-static inline void GetCPUInfo(int cpu_info[4], int info_type) {
-  __asm__ volatile (
-    "cpuid\n"
-    : "=a"(cpu_info[0]), "=b"(cpu_info[1]), "=c"(cpu_info[2]), "=d"(cpu_info[3])
-    : "a"(info_type));
-}
-#elif defined(_MSC_VER)  // Visual C++
-#define GetCPUInfo __cpuid
-#endif
-
-#if defined(__i386__) || defined(__x86_64__) || defined(_MSC_VER)
-static int x86CPUInfo(CPUFeature feature) {
-  int cpu_info[4];
-  GetCPUInfo(cpu_info, 1);
-  if (feature == kSSE2) {
-    return 0 != (cpu_info[3] & 0x04000000);
-  }
-  if (feature == kSSE3) {
-    return 0 != (cpu_info[2] & 0x00000001);
-  }
-  return 0;
-}
-VP8CPUInfo VP8EncGetCPUInfo = x86CPUInfo;
-#else
-VP8CPUInfo VP8EncGetCPUInfo = NULL;
-#endif
-
-// Speed-critical function pointers. We have to initialize them to the default
-// implementations within VP8EncDspInit().
-VP8CHisto VP8CollectHistogram;
-VP8Idct VP8ITransform;
-VP8Fdct VP8FTransform;
-VP8WHT VP8ITransformWHT;
-VP8WHT VP8FTransformWHT;
-VP8Intra4Preds VP8EncPredLuma4;
-VP8IntraPreds VP8EncPredLuma16;
-VP8IntraPreds VP8EncPredChroma8;
-VP8Metric VP8SSE16x16;
-VP8Metric VP8SSE8x8;
-VP8Metric VP8SSE16x8;
-VP8Metric VP8SSE4x4;
-VP8WMetric VP8TDisto4x4;
-VP8WMetric VP8TDisto16x16;
-VP8QuantizeBlock VP8EncQuantizeBlock;
-VP8BlockCopy VP8Copy4x4;
-VP8BlockCopy VP8Copy8x8;
-VP8BlockCopy VP8Copy16x16;
-
-extern void VP8EncDspInitSSE2(void);
-
-void VP8EncDspInit(void) {
-  InitTables();
-
-  // default C implementations
-  VP8CollectHistogram = CollectHistogram;
-  VP8ITransform = ITransform;
-  VP8FTransform = FTransform;
-  VP8ITransformWHT = ITransformWHT;
-  VP8FTransformWHT = FTransformWHT;
-  VP8EncPredLuma4 = Intra4Preds;
-  VP8EncPredLuma16 = Intra16Preds;
-  VP8EncPredChroma8 = IntraChromaPreds;
-  VP8SSE16x16 = SSE16x16;
-  VP8SSE8x8 = SSE8x8;
-  VP8SSE16x8 = SSE16x8;
-  VP8SSE4x4 = SSE4x4;
-  VP8TDisto4x4 = Disto4x4;
-  VP8TDisto16x16 = Disto16x16;
-  VP8EncQuantizeBlock = QuantizeBlock;
-  VP8Copy4x4 = Copy4x4;
-  VP8Copy8x8 = Copy8x8;
-  VP8Copy16x16 = Copy16x16;
-
-  // If defined, use CPUInfo() to overwrite some pointers with faster versions.
-  if (VP8EncGetCPUInfo) {
-    if (VP8EncGetCPUInfo(kSSE2)) {
-#if defined(__SSE2__) || defined(_MSC_VER)
-      VP8EncDspInitSSE2();
-#endif
-    }
-    if (VP8EncGetCPUInfo(kSSE3)) {
-      // later we'll plug some SSE3 variant here
-    }
-  }
-}
-
-#if defined(__cplusplus) || defined(c_plusplus)
-}    // extern "C"
-#endif
--- a/src/enc/enc_sse2.c
+++ b/src/enc/enc_sse2.c
@ -1,834 +0,0 @@
-// Copyright 2011 Google Inc.
-//
-// This code is licensed under the same terms as WebM:
-//  Software License Agreement:  http://www.webmproject.org/license/software/
-//  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
-// -----------------------------------------------------------------------------
-//
-// SSE2 version of speed-critical functions.
-//
-// Author: Christian Duvivier (cduvivier@google.com)
-
-#if defined(__SSE2__) || defined(_MSC_VER)
-#include <emmintrin.h>
-
-#include "vp8enci.h"
-
-#if defined(__cplusplus) || defined(c_plusplus)
-extern "C" {
-#endif
-
-//------------------------------------------------------------------------------
-// Compute susceptibility based on DCT-coeff histograms:
-// the higher, the "easier" the macroblock is to compress.
-
-static int CollectHistogramSSE2(const uint8_t* ref, const uint8_t* pred,
-                                int start_block, int end_block) {
-  int histo[MAX_COEFF_THRESH + 1] = { 0 };
-  int16_t out[16];
-  int j, k;
-  const __m128i max_coeff_thresh = _mm_set1_epi16(MAX_COEFF_THRESH);
-  for (j = start_block; j < end_block; ++j) {
-    VP8FTransform(ref + VP8Scan[j], pred + VP8Scan[j], out);
-
-    // Convert coefficients to bin (within out[]).
-    {
-      // Load.
-      const __m128i out0 = _mm_loadu_si128((__m128i*)&out[0]);
-      const __m128i out1 = _mm_loadu_si128((__m128i*)&out[8]);
-      // sign(out) = out >> 15  (0x0000 if positive, 0xffff if negative)
-      const __m128i sign0 = _mm_srai_epi16(out0, 15);
-      const __m128i sign1 = _mm_srai_epi16(out1, 15);
-      // abs(out) = (out ^ sign) - sign
-      const __m128i xor0 = _mm_xor_si128(out0, sign0);
-      const __m128i xor1 = _mm_xor_si128(out1, sign1);
-      const __m128i abs0 = _mm_sub_epi16(xor0, sign0);
-      const __m128i abs1 = _mm_sub_epi16(xor1, sign1);
-      // v = abs(out) >> 2
-      const __m128i v0 = _mm_srai_epi16(abs0, 2);
-      const __m128i v1 = _mm_srai_epi16(abs1, 2);
-      // bin = min(v, MAX_COEFF_THRESH)
-      const __m128i bin0 = _mm_min_epi16(v0, max_coeff_thresh);
-      const __m128i bin1 = _mm_min_epi16(v1, max_coeff_thresh);
-      // Store.
-      _mm_storeu_si128((__m128i*)&out[0], bin0);
-      _mm_storeu_si128((__m128i*)&out[8], bin1);
-    }
-
-    // Use bin to update histogram.
-    for (k = 0; k < 16; ++k) {
-      histo[out[k]]++;
-    }
-  }
-
-  return VP8GetAlpha(histo);
-}
-
-//------------------------------------------------------------------------------
-// Transforms (Paragraph 14.4)
-
-// Does one or two inverse transforms.
-static void ITransformSSE2(const uint8_t* ref, const int16_t* in, uint8_t* dst,
-                           int do_two) {
-  // This implementation makes use of 16-bit fixed point versions of two
-  // multiply constants:
-  //    K1 = sqrt(2) * cos (pi/8) ~= 85627 / 2^16
-  //    K2 = sqrt(2) * sin (pi/8) ~= 35468 / 2^16
-  //
-  // To be able to use signed 16-bit integers, we use the following trick to
-  // have constants within range:
-  // - Associated constants are obtained by subtracting the 16-bit fixed point
-  //   version of one:
-  //      k = K - (1 << 16)  =>  K = k + (1 << 16)
-  //      K1 = 85267  =>  k1 =  20091
-  //      K2 = 35468  =>  k2 = -30068
-  // - The multiplication of a variable by a constant become the sum of the
-  //   variable and the multiplication of that variable by the associated
-  //   constant:
-  //      (x * K) >> 16 = (x * (k + (1 << 16))) >> 16 = ((x * k ) >> 16) + x
-  const __m128i k1 = _mm_set1_epi16(20091);
-  const __m128i k2 = _mm_set1_epi16(-30068);
-  __m128i T0, T1, T2, T3;
-
-  // Load and concatenate the transform coefficients (we'll do two inverse
-  // transforms in parallel). In the case of only one inverse transform, the
-  // second half of the vectors will just contain random value we'll never
-  // use nor store.
-  __m128i in0, in1, in2, in3;
-  {
-    in0 = _mm_loadl_epi64((__m128i*)&in[0]);
-    in1 = _mm_loadl_epi64((__m128i*)&in[4]);
-    in2 = _mm_loadl_epi64((__m128i*)&in[8]);
-    in3 = _mm_loadl_epi64((__m128i*)&in[12]);
-    // a00 a10 a20 a30   x x x x
-    // a01 a11 a21 a31   x x x x
-    // a02 a12 a22 a32   x x x x
-    // a03 a13 a23 a33   x x x x
-    if (do_two) {
-      const __m128i inB0 = _mm_loadl_epi64((__m128i*)&in[16]);
-      const __m128i inB1 = _mm_loadl_epi64((__m128i*)&in[20]);
-      const __m128i inB2 = _mm_loadl_epi64((__m128i*)&in[24]);
-      const __m128i inB3 = _mm_loadl_epi64((__m128i*)&in[28]);
-      in0 = _mm_unpacklo_epi64(in0, inB0);
-      in1 = _mm_unpacklo_epi64(in1, inB1);
-      in2 = _mm_unpacklo_epi64(in2, inB2);
-      in3 = _mm_unpacklo_epi64(in3, inB3);
-      // a00 a10 a20 a30   b00 b10 b20 b30
-      // a01 a11 a21 a31   b01 b11 b21 b31
-      // a02 a12 a22 a32   b02 b12 b22 b32
-      // a03 a13 a23 a33   b03 b13 b23 b33
-    }
-  }
-
-  // Vertical pass and subsequent transpose.
-  {
-    // First pass, c and d calculations are longer because of the "trick"
-    // multiplications.
-    const __m128i a = _mm_add_epi16(in0, in2);
-    const __m128i b = _mm_sub_epi16(in0, in2);
-    // c = MUL(in1, K2) - MUL(in3, K1) = MUL(in1, k2) - MUL(in3, k1) + in1 - in3
-    const __m128i c1 = _mm_mulhi_epi16(in1, k2);
-    const __m128i c2 = _mm_mulhi_epi16(in3, k1);
-    const __m128i c3 = _mm_sub_epi16(in1, in3);
-    const __m128i c4 = _mm_sub_epi16(c1, c2);
-    const __m128i c = _mm_add_epi16(c3, c4);
-    // d = MUL(in1, K1) + MUL(in3, K2) = MUL(in1, k1) + MUL(in3, k2) + in1 + in3
-    const __m128i d1 = _mm_mulhi_epi16(in1, k1);
-    const __m128i d2 = _mm_mulhi_epi16(in3, k2);
-    const __m128i d3 = _mm_add_epi16(in1, in3);
-    const __m128i d4 = _mm_add_epi16(d1, d2);
-    const __m128i d = _mm_add_epi16(d3, d4);
-
-    // Second pass.
-    const __m128i tmp0 = _mm_add_epi16(a, d);
-    const __m128i tmp1 = _mm_add_epi16(b, c);
-    const __m128i tmp2 = _mm_sub_epi16(b, c);
-    const __m128i tmp3 = _mm_sub_epi16(a, d);
-
-    // Transpose the two 4x4.
-    // a00 a01 a02 a03   b00 b01 b02 b03
-    // a10 a11 a12 a13   b10 b11 b12 b13
-    // a20 a21 a22 a23   b20 b21 b22 b23
-    // a30 a31 a32 a33   b30 b31 b32 b33
-    const __m128i transpose0_0 = _mm_unpacklo_epi16(tmp0, tmp1);
-    const __m128i transpose0_1 = _mm_unpacklo_epi16(tmp2, tmp3);
-    const __m128i transpose0_2 = _mm_unpackhi_epi16(tmp0, tmp1);
-    const __m128i transpose0_3 = _mm_unpackhi_epi16(tmp2, tmp3);
-    // a00 a10 a01 a11   a02 a12 a03 a13
-    // a20 a30 a21 a31   a22 a32 a23 a33
-    // b00 b10 b01 b11   b02 b12 b03 b13
-    // b20 b30 b21 b31   b22 b32 b23 b33
-    const __m128i transpose1_0 = _mm_unpacklo_epi32(transpose0_0, transpose0_1);
-    const __m128i transpose1_1 = _mm_unpacklo_epi32(transpose0_2, transpose0_3);
-    const __m128i transpose1_2 = _mm_unpackhi_epi32(transpose0_0, transpose0_1);
-    const __m128i transpose1_3 = _mm_unpackhi_epi32(transpose0_2, transpose0_3);
-    // a00 a10 a20 a30 a01 a11 a21 a31
-    // b00 b10 b20 b30 b01 b11 b21 b31
-    // a02 a12 a22 a32 a03 a13 a23 a33
-    // b02 b12 a22 b32 b03 b13 b23 b33
-    T0 = _mm_unpacklo_epi64(transpose1_0, transpose1_1);
-    T1 = _mm_unpackhi_epi64(transpose1_0, transpose1_1);
-    T2 = _mm_unpacklo_epi64(transpose1_2, transpose1_3);
-    T3 = _mm_unpackhi_epi64(transpose1_2, transpose1_3);
-    // a00 a10 a20 a30   b00 b10 b20 b30
-    // a01 a11 a21 a31   b01 b11 b21 b31
-    // a02 a12 a22 a32   b02 b12 b22 b32
-    // a03 a13 a23 a33   b03 b13 b23 b33
-  }
-
-  // Horizontal pass and subsequent transpose.
-  {
-    // First pass, c and d calculations are longer because of the "trick"
-    // multiplications.
-    const __m128i four = _mm_set1_epi16(4);
-    const __m128i dc = _mm_add_epi16(T0, four);
-    const __m128i a =  _mm_add_epi16(dc, T2);
-    const __m128i b =  _mm_sub_epi16(dc, T2);
-    // c = MUL(T1, K2) - MUL(T3, K1) = MUL(T1, k2) - MUL(T3, k1) + T1 - T3
-    const __m128i c1 = _mm_mulhi_epi16(T1, k2);
-    const __m128i c2 = _mm_mulhi_epi16(T3, k1);
-    const __m128i c3 = _mm_sub_epi16(T1, T3);
-    const __m128i c4 = _mm_sub_epi16(c1, c2);
-    const __m128i c = _mm_add_epi16(c3, c4);
-    // d = MUL(T1, K1) + MUL(T3, K2) = MUL(T1, k1) + MUL(T3, k2) + T1 + T3
-    const __m128i d1 = _mm_mulhi_epi16(T1, k1);
-    const __m128i d2 = _mm_mulhi_epi16(T3, k2);
-    const __m128i d3 = _mm_add_epi16(T1, T3);
-    const __m128i d4 = _mm_add_epi16(d1, d2);
-    const __m128i d = _mm_add_epi16(d3, d4);
-
-    // Second pass.
-    const __m128i tmp0 = _mm_add_epi16(a, d);
-    const __m128i tmp1 = _mm_add_epi16(b, c);
-    const __m128i tmp2 = _mm_sub_epi16(b, c);
-    const __m128i tmp3 = _mm_sub_epi16(a, d);
-    const __m128i shifted0 = _mm_srai_epi16(tmp0, 3);
-    const __m128i shifted1 = _mm_srai_epi16(tmp1, 3);
-    const __m128i shifted2 = _mm_srai_epi16(tmp2, 3);
-    const __m128i shifted3 = _mm_srai_epi16(tmp3, 3);
-
-    // Transpose the two 4x4.
-    // a00 a01 a02 a03   b00 b01 b02 b03
-    // a10 a11 a12 a13   b10 b11 b12 b13
-    // a20 a21 a22 a23   b20 b21 b22 b23
-    // a30 a31 a32 a33   b30 b31 b32 b33
-    const __m128i transpose0_0 = _mm_unpacklo_epi16(shifted0, shifted1);
-    const __m128i transpose0_1 = _mm_unpacklo_epi16(shifted2, shifted3);
-    const __m128i transpose0_2 = _mm_unpackhi_epi16(shifted0, shifted1);
-    const __m128i transpose0_3 = _mm_unpackhi_epi16(shifted2, shifted3);
-    // a00 a10 a01 a11   a02 a12 a03 a13
-    // a20 a30 a21 a31   a22 a32 a23 a33
-    // b00 b10 b01 b11   b02 b12 b03 b13
-    // b20 b30 b21 b31   b22 b32 b23 b33
-    const __m128i transpose1_0 = _mm_unpacklo_epi32(transpose0_0, transpose0_1);
-    const __m128i transpose1_1 = _mm_unpacklo_epi32(transpose0_2, transpose0_3);
-    const __m128i transpose1_2 = _mm_unpackhi_epi32(transpose0_0, transpose0_1);
-    const __m128i transpose1_3 = _mm_unpackhi_epi32(transpose0_2, transpose0_3);
-    // a00 a10 a20 a30 a01 a11 a21 a31
-    // b00 b10 b20 b30 b01 b11 b21 b31
-    // a02 a12 a22 a32 a03 a13 a23 a33
-    // b02 b12 a22 b32 b03 b13 b23 b33
-    T0 = _mm_unpacklo_epi64(transpose1_0, transpose1_1);
-    T1 = _mm_unpackhi_epi64(transpose1_0, transpose1_1);
-    T2 = _mm_unpacklo_epi64(transpose1_2, transpose1_3);
-    T3 = _mm_unpackhi_epi64(transpose1_2, transpose1_3);
-    // a00 a10 a20 a30   b00 b10 b20 b30
-    // a01 a11 a21 a31   b01 b11 b21 b31
-    // a02 a12 a22 a32   b02 b12 b22 b32
-    // a03 a13 a23 a33   b03 b13 b23 b33
-  }
-
-  // Add inverse transform to 'ref' and store.
-  {
-    const __m128i zero = _mm_set1_epi16(0);
-    // Load the reference(s).
-    __m128i ref0, ref1, ref2, ref3;
-    if (do_two) {
-      // Load eight bytes/pixels per line.
-      ref0 = _mm_loadl_epi64((__m128i*)&ref[0 * BPS]);
-      ref1 = _mm_loadl_epi64((__m128i*)&ref[1 * BPS]);
-      ref2 = _mm_loadl_epi64((__m128i*)&ref[2 * BPS]);
-      ref3 = _mm_loadl_epi64((__m128i*)&ref[3 * BPS]);
-    } else {
-      // Load four bytes/pixels per line.
-      ref0 = _mm_cvtsi32_si128(*(int*)&ref[0 * BPS]);
-      ref1 = _mm_cvtsi32_si128(*(int*)&ref[1 * BPS]);
-      ref2 = _mm_cvtsi32_si128(*(int*)&ref[2 * BPS]);
-      ref3 = _mm_cvtsi32_si128(*(int*)&ref[3 * BPS]);
-    }
-    // Convert to 16b.
-    ref0 = _mm_unpacklo_epi8(ref0, zero);
-    ref1 = _mm_unpacklo_epi8(ref1, zero);
-    ref2 = _mm_unpacklo_epi8(ref2, zero);
-    ref3 = _mm_unpacklo_epi8(ref3, zero);
-    // Add the inverse transform(s).
-    ref0 = _mm_add_epi16(ref0, T0);
-    ref1 = _mm_add_epi16(ref1, T1);
-    ref2 = _mm_add_epi16(ref2, T2);
-    ref3 = _mm_add_epi16(ref3, T3);
-    // Unsigned saturate to 8b.
-    ref0 = _mm_packus_epi16(ref0, ref0);
-    ref1 = _mm_packus_epi16(ref1, ref1);
-    ref2 = _mm_packus_epi16(ref2, ref2);
-    ref3 = _mm_packus_epi16(ref3, ref3);
-    // Store the results.
-    if (do_two) {
-      // Store eight bytes/pixels per line.
-      _mm_storel_epi64((__m128i*)&dst[0 * BPS], ref0);
-      _mm_storel_epi64((__m128i*)&dst[1 * BPS], ref1);
-      _mm_storel_epi64((__m128i*)&dst[2 * BPS], ref2);
-      _mm_storel_epi64((__m128i*)&dst[3 * BPS], ref3);
-    } else {
-      // Store four bytes/pixels per line.
-      *((int32_t *)&dst[0 * BPS]) = _mm_cvtsi128_si32(ref0);
-      *((int32_t *)&dst[1 * BPS]) = _mm_cvtsi128_si32(ref1);
-      *((int32_t *)&dst[2 * BPS]) = _mm_cvtsi128_si32(ref2);
-      *((int32_t *)&dst[3 * BPS]) = _mm_cvtsi128_si32(ref3);
-    }
-  }
-}
-
-static void FTransformSSE2(const uint8_t* src, const uint8_t* ref,
-                           int16_t* out) {
-  const __m128i zero = _mm_setzero_si128();
-  const __m128i seven = _mm_set1_epi16(7);
-  const __m128i k7500 = _mm_set1_epi32(7500);
-  const __m128i k14500 = _mm_set1_epi32(14500);
-  const __m128i k51000 = _mm_set1_epi32(51000);
-  const __m128i k12000_plus_one = _mm_set1_epi32(12000 + (1 << 16));
-  const __m128i k5352_2217 = _mm_set_epi16(5352,  2217, 5352,  2217,
-                                           5352,  2217, 5352,  2217);
-  const __m128i k2217_5352 = _mm_set_epi16(2217, -5352, 2217, -5352,
-                                           2217, -5352, 2217, -5352);
-
-  __m128i v01, v32;
-
-  // Difference between src and ref and initial transpose.
-  {
-    // Load src and convert to 16b.
-    const __m128i src0 = _mm_loadl_epi64((__m128i*)&src[0 * BPS]);
-    const __m128i src1 = _mm_loadl_epi64((__m128i*)&src[1 * BPS]);
-    const __m128i src2 = _mm_loadl_epi64((__m128i*)&src[2 * BPS]);
-    const __m128i src3 = _mm_loadl_epi64((__m128i*)&src[3 * BPS]);
-    const __m128i src_0 = _mm_unpacklo_epi8(src0, zero);
-    const __m128i src_1 = _mm_unpacklo_epi8(src1, zero);
-    const __m128i src_2 = _mm_unpacklo_epi8(src2, zero);
-    const __m128i src_3 = _mm_unpacklo_epi8(src3, zero);
-    // Load ref and convert to 16b.
-    const __m128i ref0 = _mm_loadl_epi64((__m128i*)&ref[0 * BPS]);
-    const __m128i ref1 = _mm_loadl_epi64((__m128i*)&ref[1 * BPS]);
-    const __m128i ref2 = _mm_loadl_epi64((__m128i*)&ref[2 * BPS]);
-    const __m128i ref3 = _mm_loadl_epi64((__m128i*)&ref[3 * BPS]);
-    const __m128i ref_0 = _mm_unpacklo_epi8(ref0, zero);
-    const __m128i ref_1 = _mm_unpacklo_epi8(ref1, zero);
-    const __m128i ref_2 = _mm_unpacklo_epi8(ref2, zero);
-    const __m128i ref_3 = _mm_unpacklo_epi8(ref3, zero);
-    // Compute difference.
-    const __m128i diff0 = _mm_sub_epi16(src_0, ref_0);
-    const __m128i diff1 = _mm_sub_epi16(src_1, ref_1);
-    const __m128i diff2 = _mm_sub_epi16(src_2, ref_2);
-    const __m128i diff3 = _mm_sub_epi16(src_3, ref_3);
-
-    // Transpose.
-    // 00 01 02 03   0 0 0 0
-    // 10 11 12 13   0 0 0 0
-    // 20 21 22 23   0 0 0 0
-    // 30 31 32 33   0 0 0 0
-    const __m128i transpose0_0 = _mm_unpacklo_epi16(diff0, diff1);
-    const __m128i transpose0_1 = _mm_unpacklo_epi16(diff2, diff3);
-    // 00 10 01 11   02 12 03 13
-    // 20 30 21 31   22 32 23 33
-    const __m128i v23 = _mm_unpackhi_epi32(transpose0_0, transpose0_1);
-    v01 = _mm_unpacklo_epi32(transpose0_0, transpose0_1);
-    v32 = _mm_shuffle_epi32(v23, _MM_SHUFFLE(1, 0, 3, 2));
-    // a02 a12 a22 a32   a03 a13 a23 a33
-    // a00 a10 a20 a30   a01 a11 a21 a31
-    // a03 a13 a23 a33   a02 a12 a22 a32
-  }
-
-  // First pass and subsequent transpose.
-  {
-    // Same operations are done on the (0,3) and (1,2) pairs.
-    // b0 = (a0 + a3) << 3
-    // b1 = (a1 + a2) << 3
-    // b3 = (a0 - a3) << 3
-    // b2 = (a1 - a2) << 3
-    const __m128i a01 = _mm_add_epi16(v01, v32);
-    const __m128i a32 = _mm_sub_epi16(v01, v32);
-    const __m128i b01 = _mm_slli_epi16(a01, 3);
-    const __m128i b32 = _mm_slli_epi16(a32, 3);
-    const __m128i b11 = _mm_unpackhi_epi64(b01, b01);
-    const __m128i b22 = _mm_unpackhi_epi64(b32, b32);
-
-    // e0 = b0 + b1
-    // e2 = b0 - b1
-    const __m128i e0 = _mm_add_epi16(b01, b11);
-    const __m128i e2 = _mm_sub_epi16(b01, b11);
-    const __m128i e02 = _mm_unpacklo_epi64(e0, e2);
-
-    // e1 = (b3 * 5352 + b2 * 2217 + 14500) >> 12
-    // e3 = (b3 * 2217 - b2 * 5352 +  7500) >> 12
-    const __m128i b23 = _mm_unpacklo_epi16(b22, b32);
-    const __m128i c1 = _mm_madd_epi16(b23, k5352_2217);
-    const __m128i c3 = _mm_madd_epi16(b23, k2217_5352);
-    const __m128i d1 = _mm_add_epi32(c1, k14500);
-    const __m128i d3 = _mm_add_epi32(c3, k7500);
-    const __m128i e1 = _mm_srai_epi32(d1, 12);
-    const __m128i e3 = _mm_srai_epi32(d3, 12);
-    const __m128i e13 = _mm_packs_epi32(e1, e3);
-
-    // Transpose.
-    // 00 01 02 03  20 21 22 23
-    // 10 11 12 13  30 31 32 33
-    const __m128i transpose0_0 = _mm_unpacklo_epi16(e02, e13);
-    const __m128i transpose0_1 = _mm_unpackhi_epi16(e02, e13);
-    // 00 10 01 11   02 12 03 13
-    // 20 30 21 31   22 32 23 33
-    const __m128i v23 = _mm_unpackhi_epi32(transpose0_0, transpose0_1);
-    v01 = _mm_unpacklo_epi32(transpose0_0, transpose0_1);
-    v32 = _mm_shuffle_epi32(v23, _MM_SHUFFLE(1, 0, 3, 2));
-    // 02 12 22 32   03 13 23 33
-    // 00 10 20 30   01 11 21 31
-    // 03 13 23 33   02 12 22 32
-  }
-
-  // Second pass
-  {
-    // Same operations are done on the (0,3) and (1,2) pairs.
-    // a0 = v0 + v3
-    // a1 = v1 + v2
-    // a3 = v0 - v3
-    // a2 = v1 - v2
-    const __m128i a01 = _mm_add_epi16(v01, v32);
-    const __m128i a32 = _mm_sub_epi16(v01, v32);
-    const __m128i a11 = _mm_unpackhi_epi64(a01, a01);
-    const __m128i a22 = _mm_unpackhi_epi64(a32, a32);
-
-    // d0 = (a0 + a1 + 7) >> 4;
-    // d2 = (a0 - a1 + 7) >> 4;
-    const __m128i b0 = _mm_add_epi16(a01, a11);
-    const __m128i b2 = _mm_sub_epi16(a01, a11);
-    const __m128i c0 = _mm_add_epi16(b0, seven);
-    const __m128i c2 = _mm_add_epi16(b2, seven);
-    const __m128i d0 = _mm_srai_epi16(c0, 4);
-    const __m128i d2 = _mm_srai_epi16(c2, 4);
-
-    // f1 = ((b3 * 5352 + b2 * 2217 + 12000) >> 16)
-    // f3 = ((b3 * 2217 - b2 * 5352 + 51000) >> 16)
-    const __m128i b23 = _mm_unpacklo_epi16(a22, a32);
-    const __m128i c1 = _mm_madd_epi16(b23, k5352_2217);
-    const __m128i c3 = _mm_madd_epi16(b23, k2217_5352);
-    const __m128i d1 = _mm_add_epi32(c1, k12000_plus_one);
-    const __m128i d3 = _mm_add_epi32(c3, k51000);
-    const __m128i e1 = _mm_srai_epi32(d1, 16);
-    const __m128i e3 = _mm_srai_epi32(d3, 16);
-    const __m128i f1 = _mm_packs_epi32(e1, e1);
-    const __m128i f3 = _mm_packs_epi32(e3, e3);
-    // f1 = f1 + (a3 != 0);
-    // The compare will return (0xffff, 0) for (==0, !=0). To turn that into the
-    // desired (0, 1), we add one earlier through k12000_plus_one.
-    const __m128i g1 = _mm_add_epi16(f1, _mm_cmpeq_epi16(a32, zero));
-
-    _mm_storel_epi64((__m128i*)&out[ 0], d0);
-    _mm_storel_epi64((__m128i*)&out[ 4], g1);
-    _mm_storel_epi64((__m128i*)&out[ 8], d2);
-    _mm_storel_epi64((__m128i*)&out[12], f3);
-  }
-}
-
-//------------------------------------------------------------------------------
-// Metric
-
-static int SSE4x4SSE2(const uint8_t* a, const uint8_t* b) {
-  const __m128i zero = _mm_set1_epi16(0);
-
-  // Load values.
-  const __m128i a0 = _mm_loadl_epi64((__m128i*)&a[BPS * 0]);
-  const __m128i a1 = _mm_loadl_epi64((__m128i*)&a[BPS * 1]);
-  const __m128i a2 = _mm_loadl_epi64((__m128i*)&a[BPS * 2]);
-  const __m128i a3 = _mm_loadl_epi64((__m128i*)&a[BPS * 3]);
-  const __m128i b0 = _mm_loadl_epi64((__m128i*)&b[BPS * 0]);
-  const __m128i b1 = _mm_loadl_epi64((__m128i*)&b[BPS * 1]);
-  const __m128i b2 = _mm_loadl_epi64((__m128i*)&b[BPS * 2]);
-  const __m128i b3 = _mm_loadl_epi64((__m128i*)&b[BPS * 3]);
-
-  // Combine pair of lines and convert to 16b.
-  const __m128i a01 = _mm_unpacklo_epi32(a0, a1);
-  const __m128i a23 = _mm_unpacklo_epi32(a2, a3);
-  const __m128i b01 = _mm_unpacklo_epi32(b0, b1);
-  const __m128i b23 = _mm_unpacklo_epi32(b2, b3);
-  const __m128i a01s = _mm_unpacklo_epi8(a01, zero);
-  const __m128i a23s = _mm_unpacklo_epi8(a23, zero);
-  const __m128i b01s = _mm_unpacklo_epi8(b01, zero);
-  const __m128i b23s = _mm_unpacklo_epi8(b23, zero);
-
-  // Compute differences; (a-b)^2 = (abs(a-b))^2 = (sat8(a-b) + sat8(b-a))^2
-  // TODO(cduvivier): Dissassemble and figure out why this is fastest. We don't
-  //                  need absolute values, there is no need to do calculation
-  //                  in 8bit as we are already in 16bit, ... Yet this is what
-  //                  benchmarks the fastest!
-  const __m128i d0 = _mm_subs_epu8(a01s, b01s);
-  const __m128i d1 = _mm_subs_epu8(b01s, a01s);
-  const __m128i d2 = _mm_subs_epu8(a23s, b23s);
-  const __m128i d3 = _mm_subs_epu8(b23s, a23s);
-
-  // Square and add them all together.
-  const __m128i madd0 = _mm_madd_epi16(d0, d0);
-  const __m128i madd1 = _mm_madd_epi16(d1, d1);
-  const __m128i madd2 = _mm_madd_epi16(d2, d2);
-  const __m128i madd3 = _mm_madd_epi16(d3, d3);
-  const __m128i sum0 = _mm_add_epi32(madd0, madd1);
-  const __m128i sum1 = _mm_add_epi32(madd2, madd3);
-  const __m128i sum2 = _mm_add_epi32(sum0, sum1);
-  int32_t tmp[4];
-  _mm_storeu_si128((__m128i*)tmp, sum2);
-  return (tmp[3] + tmp[2] + tmp[1] + tmp[0]);
-}
-
-//------------------------------------------------------------------------------
-// Texture distortion
-//
-// We try to match the spectral content (weighted) between source and
-// reconstructed samples.
-
-// Hadamard transform
-// Returns the difference between the weighted sum of the absolute value of
-// transformed coefficients.
-static int TTransformSSE2(const uint8_t* inA, const uint8_t* inB,
-                          const uint16_t* const w) {
-  int32_t sum[4];
-  __m128i tmp_0, tmp_1, tmp_2, tmp_3;
-  const __m128i zero = _mm_setzero_si128();
-  const __m128i one = _mm_set1_epi16(1);
-  const __m128i three = _mm_set1_epi16(3);
-
-  // Load, combine and tranpose inputs.
-  {
-    const __m128i inA_0 = _mm_loadl_epi64((__m128i*)&inA[BPS * 0]);
-    const __m128i inA_1 = _mm_loadl_epi64((__m128i*)&inA[BPS * 1]);
-    const __m128i inA_2 = _mm_loadl_epi64((__m128i*)&inA[BPS * 2]);
-    const __m128i inA_3 = _mm_loadl_epi64((__m128i*)&inA[BPS * 3]);
-    const __m128i inB_0 = _mm_loadl_epi64((__m128i*)&inB[BPS * 0]);
-    const __m128i inB_1 = _mm_loadl_epi64((__m128i*)&inB[BPS * 1]);
-    const __m128i inB_2 = _mm_loadl_epi64((__m128i*)&inB[BPS * 2]);
-    const __m128i inB_3 = _mm_loadl_epi64((__m128i*)&inB[BPS * 3]);
-
-    // Combine inA and inB (we'll do two transforms in parallel).
-    const __m128i inAB_0 = _mm_unpacklo_epi8(inA_0, inB_0);
-    const __m128i inAB_1 = _mm_unpacklo_epi8(inA_1, inB_1);
-    const __m128i inAB_2 = _mm_unpacklo_epi8(inA_2, inB_2);
-    const __m128i inAB_3 = _mm_unpacklo_epi8(inA_3, inB_3);
-    // a00 b00 a01 b01 a02 b03 a03 b03   0 0 0 0 0 0 0 0
-    // a10 b10 a11 b11 a12 b12 a13 b13   0 0 0 0 0 0 0 0
-    // a20 b20 a21 b21 a22 b22 a23 b23   0 0 0 0 0 0 0 0
-    // a30 b30 a31 b31 a32 b32 a33 b33   0 0 0 0 0 0 0 0
-
-    // Transpose the two 4x4, discarding the filling zeroes.
-    const __m128i transpose0_0 = _mm_unpacklo_epi8(inAB_0, inAB_2);
-    const __m128i transpose0_1 = _mm_unpacklo_epi8(inAB_1, inAB_3);
-    // a00 a20  b00 b20  a01 a21  b01 b21  a02 a22  b02 b22  a03 a23  b03 b23
-    // a10 a30  b10 b30  a11 a31  b11 b31  a12 a32  b12 b32  a13 a33  b13 b33
-    const __m128i transpose1_0 = _mm_unpacklo_epi8(transpose0_0, transpose0_1);
-    const __m128i transpose1_1 = _mm_unpackhi_epi8(transpose0_0, transpose0_1);
-    // a00 a10 a20 a30  b00 b10 b20 b30  a01 a11 a21 a31  b01 b11 b21 b31
-    // a02 a12 a22 a32  b02 b12 b22 b32  a03 a13 a23 a33  b03 b13 b23 b33
-
-    // Convert to 16b.
-    tmp_0 = _mm_unpacklo_epi8(transpose1_0, zero);
-    tmp_1 = _mm_unpackhi_epi8(transpose1_0, zero);
-    tmp_2 = _mm_unpacklo_epi8(transpose1_1, zero);
-    tmp_3 = _mm_unpackhi_epi8(transpose1_1, zero);
-    // a00 a10 a20 a30   b00 b10 b20 b30
-    // a01 a11 a21 a31   b01 b11 b21 b31
-    // a02 a12 a22 a32   b02 b12 b22 b32
-    // a03 a13 a23 a33   b03 b13 b23 b33
-  }
-
-  // Horizontal pass and subsequent transpose.
-  {
-    // Calculate a and b (two 4x4 at once).
-    const __m128i a0 = _mm_slli_epi16(_mm_add_epi16(tmp_0, tmp_2), 2);
-    const __m128i a1 = _mm_slli_epi16(_mm_add_epi16(tmp_1, tmp_3), 2);
-    const __m128i a2 = _mm_slli_epi16(_mm_sub_epi16(tmp_1, tmp_3), 2);
-    const __m128i a3 = _mm_slli_epi16(_mm_sub_epi16(tmp_0, tmp_2), 2);
-    // b0_extra = (a0 != 0);
-    const __m128i b0_extra = _mm_andnot_si128(_mm_cmpeq_epi16 (a0, zero), one);
-    const __m128i b0_base = _mm_add_epi16(a0, a1);
-    const __m128i b1 = _mm_add_epi16(a3, a2);
-    const __m128i b2 = _mm_sub_epi16(a3, a2);
-    const __m128i b3 = _mm_sub_epi16(a0, a1);
-    const __m128i b0 = _mm_add_epi16(b0_base, b0_extra);
-    // a00 a01 a02 a03   b00 b01 b02 b03
-    // a10 a11 a12 a13   b10 b11 b12 b13
-    // a20 a21 a22 a23   b20 b21 b22 b23
-    // a30 a31 a32 a33   b30 b31 b32 b33
-
-    // Transpose the two 4x4.
-    const __m128i transpose0_0 = _mm_unpacklo_epi16(b0, b1);
-    const __m128i transpose0_1 = _mm_unpacklo_epi16(b2, b3);
-    const __m128i transpose0_2 = _mm_unpackhi_epi16(b0, b1);
-    const __m128i transpose0_3 = _mm_unpackhi_epi16(b2, b3);
-    // a00 a10 a01 a11   a02 a12 a03 a13
-    // a20 a30 a21 a31   a22 a32 a23 a33
-    // b00 b10 b01 b11   b02 b12 b03 b13
-    // b20 b30 b21 b31   b22 b32 b23 b33
-    const __m128i transpose1_0 = _mm_unpacklo_epi32(transpose0_0, transpose0_1);
-    const __m128i transpose1_1 = _mm_unpacklo_epi32(transpose0_2, transpose0_3);
-    const __m128i transpose1_2 = _mm_unpackhi_epi32(transpose0_0, transpose0_1);
-    const __m128i transpose1_3 = _mm_unpackhi_epi32(transpose0_2, transpose0_3);
-    // a00 a10 a20 a30 a01 a11 a21 a31
-    // b00 b10 b20 b30 b01 b11 b21 b31
-    // a02 a12 a22 a32 a03 a13 a23 a33
-    // b02 b12 a22 b32 b03 b13 b23 b33
-    tmp_0 = _mm_unpacklo_epi64(transpose1_0, transpose1_1);
-    tmp_1 = _mm_unpackhi_epi64(transpose1_0, transpose1_1);
-    tmp_2 = _mm_unpacklo_epi64(transpose1_2, transpose1_3);
-    tmp_3 = _mm_unpackhi_epi64(transpose1_2, transpose1_3);
-    // a00 a10 a20 a30   b00 b10 b20 b30
-    // a01 a11 a21 a31   b01 b11 b21 b31
-    // a02 a12 a22 a32   b02 b12 b22 b32
-    // a03 a13 a23 a33   b03 b13 b23 b33
-  }
-
-  // Vertical pass and difference of weighted sums.
-  {
-    // Load all inputs.
-    // TODO(cduvivier): Make variable declarations and allocations aligned so
-    //                  we can use _mm_load_si128 instead of _mm_loadu_si128.
-    const __m128i w_0 = _mm_loadu_si128((__m128i*)&w[0]);
-    const __m128i w_8 = _mm_loadu_si128((__m128i*)&w[8]);
-
-    // Calculate a and b (two 4x4 at once).
-    const __m128i a0 = _mm_add_epi16(tmp_0, tmp_2);
-    const __m128i a1 = _mm_add_epi16(tmp_1, tmp_3);
-    const __m128i a2 = _mm_sub_epi16(tmp_1, tmp_3);
-    const __m128i a3 = _mm_sub_epi16(tmp_0, tmp_2);
-    const __m128i b0 = _mm_add_epi16(a0, a1);
-    const __m128i b1 = _mm_add_epi16(a3, a2);
-    const __m128i b2 = _mm_sub_epi16(a3, a2);
-    const __m128i b3 = _mm_sub_epi16(a0, a1);
-
-    // Separate the transforms of inA and inB.
-    __m128i A_b0 = _mm_unpacklo_epi64(b0, b1);
-    __m128i A_b2 = _mm_unpacklo_epi64(b2, b3);
-    __m128i B_b0 = _mm_unpackhi_epi64(b0, b1);
-    __m128i B_b2 = _mm_unpackhi_epi64(b2, b3);
-
-    {
-      // sign(b) = b >> 15  (0x0000 if positive, 0xffff if negative)
-      const __m128i sign_A_b0 = _mm_srai_epi16(A_b0, 15);
-      const __m128i sign_A_b2 = _mm_srai_epi16(A_b2, 15);
-      const __m128i sign_B_b0 = _mm_srai_epi16(B_b0, 15);
-      const __m128i sign_B_b2 = _mm_srai_epi16(B_b2, 15);
-
-      // b = abs(b) = (b ^ sign) - sign
-      A_b0 = _mm_xor_si128(A_b0, sign_A_b0);
-      A_b2 = _mm_xor_si128(A_b2, sign_A_b2);
-      B_b0 = _mm_xor_si128(B_b0, sign_B_b0);
-      B_b2 = _mm_xor_si128(B_b2, sign_B_b2);
-      A_b0 = _mm_sub_epi16(A_b0, sign_A_b0);
-      A_b2 = _mm_sub_epi16(A_b2, sign_A_b2);
-      B_b0 = _mm_sub_epi16(B_b0, sign_B_b0);
-      B_b2 = _mm_sub_epi16(B_b2, sign_B_b2);
-    }
-
-    // b = abs(b) + 3
-    A_b0 = _mm_add_epi16(A_b0, three);
-    A_b2 = _mm_add_epi16(A_b2, three);
-    B_b0 = _mm_add_epi16(B_b0, three);
-    B_b2 = _mm_add_epi16(B_b2, three);
-
-    // abs((b + (b<0) + 3) >> 3) = (abs(b) + 3) >> 3
-    // b = (abs(b) + 3) >> 3
-    A_b0 = _mm_srai_epi16(A_b0, 3);
-    A_b2 = _mm_srai_epi16(A_b2, 3);
-    B_b0 = _mm_srai_epi16(B_b0, 3);
-    B_b2 = _mm_srai_epi16(B_b2, 3);
-
-    // weighted sums
-    A_b0 = _mm_madd_epi16(A_b0, w_0);
-    A_b2 = _mm_madd_epi16(A_b2, w_8);
-    B_b0 = _mm_madd_epi16(B_b0, w_0);
-    B_b2 = _mm_madd_epi16(B_b2, w_8);
-    A_b0 = _mm_add_epi32(A_b0, A_b2);
-    B_b0 = _mm_add_epi32(B_b0, B_b2);
-
-    // difference of weighted sums
-    A_b0 = _mm_sub_epi32(A_b0, B_b0);
-    _mm_storeu_si128((__m128i*)&sum[0], A_b0);
-  }
-  return sum[0] + sum[1] + sum[2] + sum[3];
-}
-
-static int Disto4x4SSE2(const uint8_t* const a, const uint8_t* const b,
-                        const uint16_t* const w) {
-  const int diff_sum = TTransformSSE2(a, b, w);
-  return (abs(diff_sum) + 8) >> 4;
-}
-
-static int Disto16x16SSE2(const uint8_t* const a, const uint8_t* const b,
-                          const uint16_t* const w) {
-  int D = 0;
-  int x, y;
-  for (y = 0; y < 16 * BPS; y += 4 * BPS) {
-    for (x = 0; x < 16; x += 4) {
-      D += Disto4x4SSE2(a + x + y, b + x + y, w);
-    }
-  }
-  return D;
-}
-
-
-//------------------------------------------------------------------------------
-// Quantization
-//
-
-// Simple quantization
-static int QuantizeBlockSSE2(int16_t in[16], int16_t out[16],
-                             int n, const VP8Matrix* const mtx) {
-  const __m128i max_coeff_2047 = _mm_set1_epi16(2047);
-  const __m128i zero = _mm_set1_epi16(0);
-  __m128i sign0, sign8;
-  __m128i coeff0, coeff8;
-  __m128i out0, out8;
-  __m128i packed_out;
-
-  // Load all inputs.
-  // TODO(cduvivier): Make variable declarations and allocations aligned so that
-  //                  we can use _mm_load_si128 instead of _mm_loadu_si128.
-  __m128i in0 = _mm_loadu_si128((__m128i*)&in[0]);
-  __m128i in8 = _mm_loadu_si128((__m128i*)&in[8]);
-  const __m128i sharpen0 = _mm_loadu_si128((__m128i*)&mtx->sharpen_[0]);
-  const __m128i sharpen8 = _mm_loadu_si128((__m128i*)&mtx->sharpen_[8]);
-  const __m128i iq0 = _mm_loadu_si128((__m128i*)&mtx->iq_[0]);
-  const __m128i iq8 = _mm_loadu_si128((__m128i*)&mtx->iq_[8]);
-  const __m128i bias0 = _mm_loadu_si128((__m128i*)&mtx->bias_[0]);
-  const __m128i bias8 = _mm_loadu_si128((__m128i*)&mtx->bias_[8]);
-  const __m128i q0 = _mm_loadu_si128((__m128i*)&mtx->q_[0]);
-  const __m128i q8 = _mm_loadu_si128((__m128i*)&mtx->q_[8]);
-  const __m128i zthresh0 = _mm_loadu_si128((__m128i*)&mtx->zthresh_[0]);
-  const __m128i zthresh8 = _mm_loadu_si128((__m128i*)&mtx->zthresh_[8]);
-
-  // sign(in) = in >> 15  (0x0000 if positive, 0xffff if negative)
-  sign0 = _mm_srai_epi16(in0, 15);
-  sign8 = _mm_srai_epi16(in8, 15);
-
-  // coeff = abs(in) = (in ^ sign) - sign
-  coeff0 = _mm_xor_si128(in0, sign0);
-  coeff8 = _mm_xor_si128(in8, sign8);
-  coeff0 = _mm_sub_epi16(coeff0, sign0);
-  coeff8 = _mm_sub_epi16(coeff8, sign8);
-
-  // coeff = abs(in) + sharpen
-  coeff0 = _mm_add_epi16(coeff0, sharpen0);
-  coeff8 = _mm_add_epi16(coeff8, sharpen8);
-
-  // if (coeff > 2047) coeff = 2047
-  coeff0 = _mm_min_epi16(coeff0, max_coeff_2047);
-  coeff8 = _mm_min_epi16(coeff8, max_coeff_2047);
-
-  // out = (coeff * iQ + B) >> QFIX;
-  {
-    // doing calculations with 32b precision (QFIX=17)
-    // out = (coeff * iQ)
-    __m128i coeff_iQ0H = _mm_mulhi_epu16(coeff0, iq0);
-    __m128i coeff_iQ0L = _mm_mullo_epi16(coeff0, iq0);
-    __m128i coeff_iQ8H = _mm_mulhi_epu16(coeff8, iq8);
-    __m128i coeff_iQ8L = _mm_mullo_epi16(coeff8, iq8);
-    __m128i out_00 = _mm_unpacklo_epi16(coeff_iQ0L, coeff_iQ0H);
-    __m128i out_04 = _mm_unpackhi_epi16(coeff_iQ0L, coeff_iQ0H);
-    __m128i out_08 = _mm_unpacklo_epi16(coeff_iQ8L, coeff_iQ8H);
-    __m128i out_12 = _mm_unpackhi_epi16(coeff_iQ8L, coeff_iQ8H);
-    // expand bias from 16b to 32b
-    __m128i bias_00 = _mm_unpacklo_epi16(bias0, zero);
-    __m128i bias_04 = _mm_unpackhi_epi16(bias0, zero);
-    __m128i bias_08 = _mm_unpacklo_epi16(bias8, zero);
-    __m128i bias_12 = _mm_unpackhi_epi16(bias8, zero);
-    // out = (coeff * iQ + B)
-    out_00 = _mm_add_epi32(out_00, bias_00);
-    out_04 = _mm_add_epi32(out_04, bias_04);
-    out_08 = _mm_add_epi32(out_08, bias_08);
-    out_12 = _mm_add_epi32(out_12, bias_12);
-    // out = (coeff * iQ + B) >> QFIX;
-    out_00 = _mm_srai_epi32(out_00, QFIX);
-    out_04 = _mm_srai_epi32(out_04, QFIX);
-    out_08 = _mm_srai_epi32(out_08, QFIX);
-    out_12 = _mm_srai_epi32(out_12, QFIX);
-    // pack result as 16b
-    out0 = _mm_packs_epi32(out_00, out_04);
-    out8 = _mm_packs_epi32(out_08, out_12);
-  }
-
-  // get sign back (if (sign[j]) out_n = -out_n)
-  out0 = _mm_xor_si128(out0, sign0);
-  out8 = _mm_xor_si128(out8, sign8);
-  out0 = _mm_sub_epi16(out0, sign0);
-  out8 = _mm_sub_epi16(out8, sign8);
-
-  // in = out * Q
-  in0 = _mm_mullo_epi16(out0, q0);
-  in8 = _mm_mullo_epi16(out8, q8);
-
-  // if (coeff <= mtx->zthresh_) {in=0; out=0;}
-  {
-    __m128i cmp0 = _mm_cmpgt_epi16(coeff0, zthresh0);
-    __m128i cmp8 = _mm_cmpgt_epi16(coeff8, zthresh8);
-    in0 = _mm_and_si128(in0, cmp0);
-    in8 = _mm_and_si128(in8, cmp8);
-    _mm_storeu_si128((__m128i*)&in[0], in0);
-    _mm_storeu_si128((__m128i*)&in[8], in8);
-    out0 = _mm_and_si128(out0, cmp0);
-    out8 = _mm_and_si128(out8, cmp8);
-  }
-
-  // zigzag the output before storing it.
-  //
-  // The zigzag pattern can almost be reproduced with a small sequence of
-  // shuffles. After it, we only need to swap the 7th (ending up in third
-  // position instead of twelfth) and 8th values.
-  {
-    __m128i outZ0, outZ8;
-    outZ0 = _mm_shufflehi_epi16(out0,  _MM_SHUFFLE(2, 1, 3, 0));
-    outZ0 = _mm_shuffle_epi32  (outZ0, _MM_SHUFFLE(3, 1, 2, 0));
-    outZ0 = _mm_shufflehi_epi16(outZ0, _MM_SHUFFLE(3, 1, 0, 2));
-    outZ8 = _mm_shufflelo_epi16(out8,  _MM_SHUFFLE(3, 0, 2, 1));
-    outZ8 = _mm_shuffle_epi32  (outZ8, _MM_SHUFFLE(3, 1, 2, 0));
-    outZ8 = _mm_shufflelo_epi16(outZ8, _MM_SHUFFLE(1, 3, 2, 0));
-    _mm_storeu_si128((__m128i*)&out[0], outZ0);
-    _mm_storeu_si128((__m128i*)&out[8], outZ8);
-    packed_out = _mm_packs_epi16(outZ0, outZ8);
-  }
-  {
-    const int16_t outZ_12 = out[12];
-    const int16_t outZ_3 = out[3];
-    out[3] = outZ_12;
-    out[12] = outZ_3;
-  }
-
-  // detect if all 'out' values are zeroes or not
-  {
-    int32_t tmp[4];
-    _mm_storeu_si128((__m128i*)tmp, packed_out);
-    if (n) {
-      tmp[0] &= ~0xff;
-    }
-    return (tmp[3] || tmp[2] || tmp[1] || tmp[0]);
-  }
-}
-
-extern void VP8EncDspInitSSE2(void);
-void VP8EncDspInitSSE2(void) {
-  VP8CollectHistogram = CollectHistogramSSE2;
-  VP8EncQuantizeBlock = QuantizeBlockSSE2;
-  VP8ITransform = ITransformSSE2;
-  VP8FTransform = FTransformSSE2;
-  VP8SSE4x4 = SSE4x4SSE2;
-  VP8TDisto4x4 = Disto4x4SSE2;
-  VP8TDisto16x16 = Disto16x16SSE2;
-}
-
-#if defined(__cplusplus) || defined(c_plusplus)
-}    // extern "C"
-#endif
-
-#endif   //__SSE2__
--- a/src/enc/filter.c
+++ b/src/enc/filter.c
@ -1,16 +1,17 @@
-// Copyright 2011 Google Inc.
+// Copyright 2011 Google Inc. All Rights Reserved.
 //
-// This code is licensed under the same terms as WebM:
-//  Software License Agreement:  http://www.webmproject.org/license/software/
-//  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
 // -----------------------------------------------------------------------------
 //
 // Selecting filter level
 //
 // Author: somnath@google.com (Somnath Banerjee)

-#include <math.h>
-#include "vp8enci.h"
+#include "./vp8enci.h"

 #if defined(__cplusplus) || defined(c_plusplus)
 extern "C" {
@ -49,7 +50,7 @@ static void InitTables(void) {
 // Edge filtering functions

 // 4 pixels in, 2 pixels out
-static inline void do_filter2(uint8_t* p, int step) {
+static WEBP_INLINE void do_filter2(uint8_t* p, int step) {
  const int p1 = p[-2*step], p0 = p[-step], q0 = p[0], q1 = p[step];
  const int a = 3 * (q0 - p0) + sclip1[1020 + p1 - q1];
  const int a1 = sclip2[112 + ((a + 4) >> 3)];
@ -59,7 +60,7 @@ static inline void do_filter2(uint8_t* p, int step) {
 }

 // 4 pixels in, 4 pixels out
-static inline void do_filter4(uint8_t* p, int step) {
+static WEBP_INLINE void do_filter4(uint8_t* p, int step) {
  const int p1 = p[-2*step], p0 = p[-step], q0 = p[0], q1 = p[step];
  const int a = 3 * (q0 - p0);
  const int a1 = sclip2[112 + ((a + 4) >> 3)];
@ -72,17 +73,18 @@ static inline void do_filter4(uint8_t* p, int step) {
 }

 // high edge-variance
-static inline int hev(const uint8_t* p, int step, int thresh) {
+static WEBP_INLINE int hev(const uint8_t* p, int step, int thresh) {
  const int p1 = p[-2*step], p0 = p[-step], q0 = p[0], q1 = p[step];
  return (abs0[255 + p1 - p0] > thresh) || (abs0[255 + q1 - q0] > thresh);
 }

-static inline int needs_filter(const uint8_t* p, int step, int thresh) {
+static WEBP_INLINE int needs_filter(const uint8_t* p, int step, int thresh) {
  const int p1 = p[-2*step], p0 = p[-step], q0 = p[0], q1 = p[step];
  return (2 * abs0[255 + p0 - q0] + abs1[255 + p1 - q1]) <= thresh;
 }

-static inline int needs_filter2(const uint8_t* p, int step, int t, int it) {
+static WEBP_INLINE int needs_filter2(const uint8_t* p,
+                                     int step, int t, int it) {
  const int p3 = p[-4*step], p2 = p[-3*step], p1 = p[-2*step], p0 = p[-step];
  const int q0 = p[0], q1 = p[step], q2 = p[2*step], q3 = p[3*step];
  if ((2 * abs0[255 + p0 - q0] + abs1[255 + p1 - q1]) > t)
@ -132,8 +134,9 @@ static void SimpleHFilter16i(uint8_t* p, int stride, int thresh) {
 //------------------------------------------------------------------------------
 // Complex In-loop filtering (Paragraph 15.3)

-static inline void FilterLoop24(uint8_t* p, int hstride, int vstride, int size,
-                                int thresh, int ithresh, int hev_thresh) {
+static WEBP_INLINE void FilterLoop24(uint8_t* p,
+                                     int hstride, int vstride, int size,
+                                     int thresh, int ithresh, int hev_thresh) {
  while (size-- > 0) {
    if (needs_filter2(p, hstride, thresh, ithresh)) {
      if (hev(p, hstride, hev_thresh)) {
@ -233,14 +236,21 @@ static void DoFilter(const VP8EncIterator* const it, int level) {
 // SSIM metric

 enum { KERNEL = 3 };
-typedef struct {
-  double w, xm, ym, xxm, xym, yym;
-} SSIMStats;
+static const double kMinValue = 1.e-10;  // minimal threshold

-static void Accumulate(const uint8_t* src1, int stride1,
-                       const uint8_t* src2, int stride2,
-                       int xo, int yo, int W, int H,
-                       SSIMStats* const stats) {
+void VP8SSIMAddStats(const DistoStats* const src, DistoStats* const dst) {
+  dst->w   += src->w;
+  dst->xm  += src->xm;
+  dst->ym  += src->ym;
+  dst->xxm += src->xxm;
+  dst->xym += src->xym;
+  dst->yym += src->yym;
+}
+
+static void VP8SSIMAccumulate(const uint8_t* src1, int stride1,
+                              const uint8_t* src2, int stride2,
+                              int xo, int yo, int W, int H,
+                              DistoStats* const stats) {
  const int ymin = (yo - KERNEL < 0) ? 0 : yo - KERNEL;
  const int ymax = (yo + KERNEL > H - 1) ? H - 1 : yo + KERNEL;
  const int xmin = (xo - KERNEL < 0) ? 0 : xo - KERNEL;
@ -262,7 +272,7 @@ static void Accumulate(const uint8_t* src1, int stride1,
  }
 }

-static double GetSSIM(const SSIMStats* const stats) {
+double VP8SSIMGet(const DistoStats* const stats) {
  const double xmxm = stats->xm * stats->xm;
  const double ymym = stats->ym * stats->ym;
  const double xmym = stats->xm * stats->ym;
@ -280,26 +290,49 @@ static double GetSSIM(const SSIMStats* const stats) {
  C2 = 58.5225 * w2;
  fnum = (2 * xmym + C1) * (2 * sxy + C2);
  fden = (xmxm + ymym + C1) * (sxx + syy + C2);
-  return (fden != 0) ? fnum / fden : 0.;
+  return (fden != 0.) ? fnum / fden : kMinValue;
+}
+
+double VP8SSIMGetSquaredError(const DistoStats* const s) {
+  if (s->w > 0.) {
+    const double iw2 = 1. / (s->w * s->w);
+    const double sxx = s->xxm * s->w - s->xm * s->xm;
+    const double syy = s->yym * s->w - s->ym * s->ym;
+    const double sxy = s->xym * s->w - s->xm * s->ym;
+    const double SSE = iw2 * (sxx + syy - 2. * sxy);
+    if (SSE > kMinValue) return SSE;
+  }
+  return kMinValue;
+}
+
+void VP8SSIMAccumulatePlane(const uint8_t* src1, int stride1,
+                            const uint8_t* src2, int stride2,
+                            int W, int H, DistoStats* const stats) {
+  int x, y;
+  for (y = 0; y < H; ++y) {
+    for (x = 0; x < W; ++x) {
+      VP8SSIMAccumulate(src1, stride1, src2, stride2, x, y, W, H, stats);
+    }
+  }
 }

 static double GetMBSSIM(const uint8_t* yuv1, const uint8_t* yuv2) {
  int x, y;
-  SSIMStats s = { .0, .0, .0, .0, .0, .0 };
+  DistoStats s = { .0, .0, .0, .0, .0, .0 };

  // compute SSIM in a 10 x 10 window
  for (x = 3; x < 13; x++) {
    for (y = 3; y < 13; y++) {
-      Accumulate(yuv1 + Y_OFF, BPS, yuv2 + Y_OFF, BPS, x, y, 16, 16, &s);
+      VP8SSIMAccumulate(yuv1 + Y_OFF, BPS, yuv2 + Y_OFF, BPS, x, y, 16, 16, &s);
    }
  }
  for (x = 1; x < 7; x++) {
    for (y = 1; y < 7; y++) {
-      Accumulate(yuv1 + U_OFF, BPS, yuv2 + U_OFF, BPS, x, y, 8, 8, &s);
-      Accumulate(yuv1 + V_OFF, BPS, yuv2 + V_OFF, BPS, x, y, 8, 8, &s);
+      VP8SSIMAccumulate(yuv1 + U_OFF, BPS, yuv2 + U_OFF, BPS, x, y, 8, 8, &s);
+      VP8SSIMAccumulate(yuv1 + V_OFF, BPS, yuv2 + V_OFF, BPS, x, y, 8, 8, &s);
    }
  }
-  return GetSSIM(&s);
+  return VP8SSIMGet(&s);
 }

 //------------------------------------------------------------------------------
--- a/src/enc/frame.c
+++ b/src/enc/frame.c
@ -1,21 +1,23 @@
-// Copyright 2011 Google Inc.
+// Copyright 2011 Google Inc. All Rights Reserved.
 //
-// This code is licensed under the same terms as WebM:
-//  Software License Agreement:  http://www.webmproject.org/license/software/
-//  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
 // -----------------------------------------------------------------------------
 //
 //   frame coding and analysis
 //
 // Author: Skal (pascal.massimino@gmail.com)

+#include <assert.h>
 #include <stdlib.h>
 #include <string.h>
-#include <assert.h>
 #include <math.h>

-#include "vp8enci.h"
-#include "cost.h"
+#include "./vp8enci.h"
+#include "./cost.h"

 #if defined(__cplusplus) || defined(c_plusplus)
 extern "C" {
@ -45,24 +47,26 @@ const uint8_t VP8EncBands[16 + 1] = {
  0  // sentinel
 };

-static const uint8_t kCat3[] = { 173, 148, 140 };
-static const uint8_t kCat4[] = { 176, 155, 140, 135 };
-static const uint8_t kCat5[] = { 180, 157, 141, 134, 130 };
-static const uint8_t kCat6[] =
+const uint8_t VP8Cat3[] = { 173, 148, 140 };
+const uint8_t VP8Cat4[] = { 176, 155, 140, 135 };
+const uint8_t VP8Cat5[] = { 180, 157, 141, 134, 130 };
+const uint8_t VP8Cat6[] =
    { 254, 254, 243, 230, 196, 177, 153, 140, 133, 130, 129 };

 //------------------------------------------------------------------------------
 // Reset the statistics about: number of skips, token proba, level cost,...

-static void ResetStats(VP8Encoder* const enc, int precalc_cost) {
+static void ResetStats(VP8Encoder* const enc) {
  VP8Proba* const proba = &enc->proba_;
-  if (precalc_cost) VP8CalculateLevelCosts(proba);
+  VP8CalculateLevelCosts(proba);
  proba->nb_skip_ = 0;
 }

 //------------------------------------------------------------------------------
 // Skip decision probability

+#define SKIP_PROBA_THRESHOLD 250  // value below which using skip_proba is OK.
+
 static int CalcSkipProba(uint64_t nb, uint64_t total) {
  return (int)(total ? (total - nb) * 255 / total : 255);
 }
@ -74,7 +78,7 @@ static int FinalizeSkipProba(VP8Encoder* const enc) {
  const int nb_events = proba->nb_skip_;
  int size;
  proba->skip_proba_ = CalcSkipProba(nb_events, nb_mbs);
-  proba->use_skip_proba_ = (proba->skip_proba_ < 250);
+  proba->use_skip_proba_ = (proba->skip_proba_ < SKIP_PROBA_THRESHOLD);
  size = 256;   // 'use_skip_proba' bit
  if (proba->use_skip_proba_) {
    size +=  nb_events * VP8BitCost(1, proba->skip_proba_)
@ -93,9 +97,14 @@ static void ResetTokenStats(VP8Encoder* const enc) {
 }

 // Record proba context used
-static int Record(int bit, uint64_t* const stats) {
-  stats[0] += bit;
-  stats[1] += 1;
+static int Record(int bit, proba_t* const stats) {
+  proba_t p = *stats;
+  if (p >= 0xffff0000u) {               // an overflow is inbound.
+    p = ((p + 1u) >> 1) & 0x7fff7fffu;  // -> divide the stats by 2.
+  }
+  // record bit count (lower 16 bits) and increment total count (upper 16 bits).
+  p += 0x00010000u + bit;
+  *stats = p;
  return bit;
 }

@ -104,33 +113,36 @@ static int Record(int bit, uint64_t* const stats) {

 // Simulate block coding, but only record statistics.
 // Note: no need to record the fixed probas.
-static int RecordCoeffs(int ctx, VP8Residual* res) {
+static int RecordCoeffs(int ctx, const VP8Residual* const res) {
  int n = res->first;
-  uint64_t (*s)[2] = res->stats[VP8EncBands[n]][ctx];
-  if (!Record(res->last >= 0, s[0])) {
+  // should be stats[VP8EncBands[n]], but it's equivalent for n=0 or 1
+  proba_t* s = res->stats[n][ctx];
+  if (res->last  < 0) {
+    Record(0, s + 0);
    return 0;
  }
-
-  while (1) {
-    int v = res->coeffs[n++];
-    if (!Record(v != 0, s[1])) {
+  while (n <= res->last) {
+    int v;
+    Record(1, s + 0);  // order of record doesn't matter
+    while ((v = res->coeffs[n++]) == 0) {
+      Record(0, s + 1);
      s = res->stats[VP8EncBands[n]][0];
-      continue;
    }
-    if (!Record(2u < (unsigned int)(v + 1), s[2])) {  // v = -1 or 1
+    Record(1, s + 1);
+    if (!Record(2u < (unsigned int)(v + 1), s + 2)) {  // v = -1 or 1
      s = res->stats[VP8EncBands[n]][1];
    } else {
      v = abs(v);
 #if !defined(USE_LEVEL_CODE_TABLE)
-      if (!Record(v > 4, s[3])) {
-        if (Record(v != 2, s[4]))
-          Record(v == 4, s[5]);
-      } else if (!Record(v > 10, s[6])) {
-        Record(v > 6, s[7]);
-      } else if (!Record((v >= 3 + (8 << 2)), s[8])) {
-        Record((v >= 3 + (8 << 1)), s[9]);
+      if (!Record(v > 4, s + 3)) {
+        if (Record(v != 2, s + 4))
+          Record(v == 4, s + 5);
+      } else if (!Record(v > 10, s + 6)) {
+        Record(v > 6, s + 7);
+      } else if (!Record((v >= 3 + (8 << 2)), s + 8)) {
+        Record((v >= 3 + (8 << 1)), s + 9);
      } else {
-        Record((v >= 3 + (8 << 3)), s[10]);
+        Record((v >= 3 + (8 << 3)), s + 10);
      }
 #else
      if (v > MAX_VARIABLE_LEVEL)
@ -142,44 +154,53 @@ static int RecordCoeffs(int ctx, VP8Residual* res) {
        int i;
        for (i = 0; (pattern >>= 1) != 0; ++i) {
          const int mask = 2 << i;
-          if (pattern & 1) Record(!!(bits & mask), s[3 + i]);
+          if (pattern & 1) Record(!!(bits & mask), s + 3 + i);
        }
      }
 #endif
      s = res->stats[VP8EncBands[n]][2];
    }
-    if (n == 16 || !Record(n <= res->last, s[0])) {
-      return 1;
-    }
  }
+  if (n < 16) Record(0, s + 0);
+  return 1;
 }

 // Collect statistics and deduce probabilities for next coding pass.
 // Return the total bit-cost for coding the probability updates.
-static int CalcTokenProba(uint64_t nb, uint64_t total) {
-  return (int)(nb ? ((total - nb) * 255 + total / 2) / total : 255);
+static int CalcTokenProba(int nb, int total) {
+  assert(nb <= total);
+  return nb ? (255 - nb * 255 / total) : 255;
 }

-static int FinalizeTokenProbas(VP8Encoder* const enc) {
-  VP8Proba* const proba = &enc->proba_;
+// Cost of coding 'nb' 1's and 'total-nb' 0's using 'proba' probability.
+static int BranchCost(int nb, int total, int proba) {
+  return nb * VP8BitCost(1, proba) + (total - nb) * VP8BitCost(0, proba);
+}
+
+static int FinalizeTokenProbas(VP8Proba* const proba) {
+  int has_changed = 0;
  int size = 0;
  int t, b, c, p;
  for (t = 0; t < NUM_TYPES; ++t) {
    for (b = 0; b < NUM_BANDS; ++b) {
      for (c = 0; c < NUM_CTX; ++c) {
        for (p = 0; p < NUM_PROBAS; ++p) {
-          const uint64_t* const cnt = proba->stats_[t][b][c][p];
+          const proba_t stats = proba->stats_[t][b][c][p];
+          const int nb = (stats >> 0) & 0xffff;
+          const int total = (stats >> 16) & 0xffff;
          const int update_proba = VP8CoeffsUpdateProba[t][b][c][p];
          const int old_p = VP8CoeffsProba0[t][b][c][p];
-          const int new_p = CalcTokenProba(cnt[0], cnt[1]);
-          const uint64_t old_cost = VP8BranchCost(cnt[0], cnt[1], old_p)
-                                  + VP8BitCost(0, update_proba);
-          const uint64_t new_cost = VP8BranchCost(cnt[0], cnt[1], new_p)
-                                  + VP8BitCost(1, update_proba) + 8 * 256;
+          const int new_p = CalcTokenProba(nb, total);
+          const int old_cost = BranchCost(nb, total, old_p)
+                             + VP8BitCost(0, update_proba);
+          const int new_cost = BranchCost(nb, total, new_p)
+                             + VP8BitCost(1, update_proba)
+                             + 8 * 256;
          const int use_new_p = (old_cost > new_cost);
          size += VP8BitCost(use_new_p, update_proba);
          if (use_new_p) {  // only use proba that seem meaningful enough.
            proba->coeffs_[t][b][c][p] = new_p;
+            has_changed |= (new_p != old_p);
            size += 8 * 256;
          } else {
            proba->coeffs_[t][b][c][p] = old_p;
@ -188,9 +209,51 @@ static int FinalizeTokenProbas(VP8Encoder* const enc) {
      }
    }
  }
+  proba->dirty_ = has_changed;
  return size;
 }

+//------------------------------------------------------------------------------
+// Finalize Segment probability based on the coding tree
+
+static int GetProba(int a, int b) {
+  const int total = a + b;
+  return (total == 0) ? 255     // that's the default probability.
+                      : (255 * a + total / 2) / total;  // rounded proba
+}
+
+static void SetSegmentProbas(VP8Encoder* const enc) {
+  int p[NUM_MB_SEGMENTS] = { 0 };
+  int n;
+
+  for (n = 0; n < enc->mb_w_ * enc->mb_h_; ++n) {
+    const VP8MBInfo* const mb = &enc->mb_info_[n];
+    p[mb->segment_]++;
+  }
+  if (enc->pic_->stats != NULL) {
+    for (n = 0; n < NUM_MB_SEGMENTS; ++n) {
+      enc->pic_->stats->segment_size[n] = p[n];
+    }
+  }
+  if (enc->segment_hdr_.num_segments_ > 1) {
+    uint8_t* const probas = enc->proba_.segments_;
+    probas[0] = GetProba(p[0] + p[1], p[2] + p[3]);
+    probas[1] = GetProba(p[0], p[1]);
+    probas[2] = GetProba(p[2], p[3]);
+
+    enc->segment_hdr_.update_map_ =
+        (probas[0] != 255) || (probas[1] != 255) || (probas[2] != 255);
+    enc->segment_hdr_.size_ =
+        p[0] * (VP8BitCost(0, probas[0]) + VP8BitCost(0, probas[1])) +
+        p[1] * (VP8BitCost(0, probas[0]) + VP8BitCost(1, probas[1])) +
+        p[2] * (VP8BitCost(1, probas[0]) + VP8BitCost(0, probas[2])) +
+        p[3] * (VP8BitCost(1, probas[0]) + VP8BitCost(1, probas[2]));
+  } else {
+    enc->segment_hdr_.update_map_ = 0;
+    enc->segment_hdr_.size_ = 0;
+  }
+}
+
 //------------------------------------------------------------------------------
 // helper functions for residuals struct VP8Residual.

@ -219,34 +282,47 @@ static void SetResidualCoeffs(const int16_t* const coeffs,
 //------------------------------------------------------------------------------
 // Mode costs

-static int GetResidualCost(int ctx, const VP8Residual* const res) {
+static int GetResidualCost(int ctx0, const VP8Residual* const res) {
  int n = res->first;
-  const uint8_t* p = res->prob[VP8EncBands[n]][ctx];
-  const uint16_t *t = res->cost[VP8EncBands[n]][ctx];
+  // should be prob[VP8EncBands[n]], but it's equivalent for n=0 or 1
+  int p0 = res->prob[n][ctx0][0];
+  const uint16_t* t = res->cost[n][ctx0];
  int cost;

-  cost = VP8BitCost(res->last >= 0, p[0]);
  if (res->last < 0) {
-    return cost;
+    return VP8BitCost(0, p0);
  }
-  while (n <= res->last) {
-    const int v = res->coeffs[n++];
+  cost = 0;
+  while (n < res->last) {
+    int v = res->coeffs[n];
+    const int b = VP8EncBands[n + 1];
+    ++n;
    if (v == 0) {
-      cost += VP8LevelCost(t, 0);
-      p = res->prob[VP8EncBands[n]][0];
-      t = res->cost[VP8EncBands[n]][0];
+      // short-case for VP8LevelCost(t, 0) (note: VP8LevelFixedCosts[0] == 0):
+      cost += t[0];
+      t = res->cost[b][0];
      continue;
-    } else if (2u >= (unsigned int)(v + 1)) {   // v = -1 or 1
-      cost += VP8LevelCost(t, 1);
-      p = res->prob[VP8EncBands[n]][1];
-      t = res->cost[VP8EncBands[n]][1];
-    } else {
-      cost += VP8LevelCost(t, abs(v));
-      p = res->prob[VP8EncBands[n]][2];
-      t = res->cost[VP8EncBands[n]][2];
    }
-    if (n < 16) {
-      cost += VP8BitCost(n <= res->last, p[0]);
+    v = abs(v);
+    cost += VP8BitCost(1, p0);
+    cost += VP8LevelCost(t, v);
+    {
+      const int ctx = (v == 1) ? 1 : 2;
+      p0 = res->prob[b][ctx][0];
+      t = res->cost[b][ctx];
+    }
+  }
+  // Last coefficient is always non-zero
+  {
+    const int v = abs(res->coeffs[n]);
+    assert(v != 0);
+    cost += VP8BitCost(1, p0);
+    cost += VP8LevelCost(t, v);
+    if (n < 15) {
+      const int b = VP8EncBands[n + 1];
+      const int ctx = (v == 1) ? 1 : 2;
+      const int last_p0 = res->prob[b][ctx][0];
+      cost += VP8BitCost(0, last_p0);
    }
  }
  return cost;
@ -255,10 +331,11 @@ static int GetResidualCost(int ctx, const VP8Residual* const res) {
 int VP8GetCostLuma4(VP8EncIterator* const it, const int16_t levels[16]) {
  const int x = (it->i4_ & 3), y = (it->i4_ >> 2);
  VP8Residual res;
+  VP8Encoder* const enc = it->enc_;
  int R = 0;
  int ctx;

-  InitResidual(0, 3, it->enc_, &res);
+  InitResidual(0, 3, enc, &res);
  ctx = it->top_nz_[x] + it->left_nz_[y];
  SetResidualCoeffs(levels, &res);
  R += GetResidualCost(ctx, &res);
@ -267,18 +344,19 @@ int VP8GetCostLuma4(VP8EncIterator* const it, const int16_t levels[16]) {

 int VP8GetCostLuma16(VP8EncIterator* const it, const VP8ModeScore* const rd) {
  VP8Residual res;
+  VP8Encoder* const enc = it->enc_;
  int x, y;
  int R = 0;

  VP8IteratorNzToBytes(it);   // re-import the non-zero context

  // DC
-  InitResidual(0, 1, it->enc_, &res);
+  InitResidual(0, 1, enc, &res);
  SetResidualCoeffs(rd->y_dc_levels, &res);
  R += GetResidualCost(it->top_nz_[8] + it->left_nz_[8], &res);

  // AC
-  InitResidual(1, 0, it->enc_, &res);
+  InitResidual(1, 0, enc, &res);
  for (y = 0; y < 4; ++y) {
    for (x = 0; x < 4; ++x) {
      const int ctx = it->top_nz_[x] + it->left_nz_[y];
@ -292,12 +370,13 @@ int VP8GetCostLuma16(VP8EncIterator* const it, const VP8ModeScore* const rd) {

 int VP8GetCostUV(VP8EncIterator* const it, const VP8ModeScore* const rd) {
  VP8Residual res;
+  VP8Encoder* const enc = it->enc_;
  int ch, x, y;
  int R = 0;

  VP8IteratorNzToBytes(it);  // re-import the non-zero context

-  InitResidual(0, 2, it->enc_, &res);
+  InitResidual(0, 2, enc, &res);
  for (ch = 0; ch <= 2; ch += 2) {
    for (y = 0; y < 2; ++y) {
      for (x = 0; x < 2; ++x) {
@ -316,7 +395,8 @@ int VP8GetCostUV(VP8EncIterator* const it, const VP8ModeScore* const rd) {

 static int PutCoeffs(VP8BitWriter* const bw, int ctx, const VP8Residual* res) {
  int n = res->first;
-  const uint8_t* p = res->prob[VP8EncBands[n]][ctx];
+  // should be prob[VP8EncBands[n]], but it's equivalent for n=0 or 1
+  const uint8_t* p = res->prob[n][ctx];
  if (!VP8PutBit(bw, res->last >= 0, p[0])) {
    return 0;
  }
@ -345,30 +425,30 @@ static int PutCoeffs(VP8BitWriter* const bw, int ctx, const VP8Residual* res) {
      } else {
        int mask;
        const uint8_t* tab;
-        if (v < 3 + (8 << 1)) {          // kCat3  (3b)
+        if (v < 3 + (8 << 1)) {          // VP8Cat3  (3b)
          VP8PutBit(bw, 0, p[8]);
          VP8PutBit(bw, 0, p[9]);
          v -= 3 + (8 << 0);
          mask = 1 << 2;
-          tab = kCat3;
-        } else if (v < 3 + (8 << 2)) {   // kCat4  (4b)
+          tab = VP8Cat3;
+        } else if (v < 3 + (8 << 2)) {   // VP8Cat4  (4b)
          VP8PutBit(bw, 0, p[8]);
          VP8PutBit(bw, 1, p[9]);
          v -= 3 + (8 << 1);
          mask = 1 << 3;
-          tab = kCat4;
-        } else if (v < 3 + (8 << 3)) {   // kCat5  (5b)
+          tab = VP8Cat4;
+        } else if (v < 3 + (8 << 3)) {   // VP8Cat5  (5b)
          VP8PutBit(bw, 1, p[8]);
          VP8PutBit(bw, 0, p[10]);
          v -= 3 + (8 << 2);
          mask = 1 << 4;
-          tab = kCat5;
-        } else {                         // kCat6 (11b)
+          tab = VP8Cat5;
+        } else {                         // VP8Cat6 (11b)
          VP8PutBit(bw, 1, p[8]);
          VP8PutBit(bw, 1, p[10]);
          v -= 3 + (8 << 3);
          mask = 1 << 10;
-          tab = kCat6;
+          tab = VP8Cat6;
        }
        while (mask) {
          VP8PutBit(bw, !!(v & mask), *tab++);
@ -385,26 +465,26 @@ static int PutCoeffs(VP8BitWriter* const bw, int ctx, const VP8Residual* res) {
  return 1;
 }

-static void CodeResiduals(VP8BitWriter* const bw,
-                          VP8EncIterator* const it,
+static void CodeResiduals(VP8BitWriter* const bw, VP8EncIterator* const it,
                          const VP8ModeScore* const rd) {
  int x, y, ch;
  VP8Residual res;
  uint64_t pos1, pos2, pos3;
  const int i16 = (it->mb_->type_ == 1);
  const int segment = it->mb_->segment_;
+  VP8Encoder* const enc = it->enc_;

  VP8IteratorNzToBytes(it);

  pos1 = VP8BitWriterPos(bw);
  if (i16) {
-    InitResidual(0, 1, it->enc_, &res);
+    InitResidual(0, 1, enc, &res);
    SetResidualCoeffs(rd->y_dc_levels, &res);
    it->top_nz_[8] = it->left_nz_[8] =
      PutCoeffs(bw, it->top_nz_[8] + it->left_nz_[8], &res);
-    InitResidual(1, 0, it->enc_, &res);
+    InitResidual(1, 0, enc, &res);
  } else {
-    InitResidual(0, 3, it->enc_, &res);
+    InitResidual(0, 3, enc, &res);
  }

  // luma-AC
@ -418,7 +498,7 @@ static void CodeResiduals(VP8BitWriter* const bw,
  pos2 = VP8BitWriterPos(bw);

  // U/V
-  InitResidual(0, 2, it->enc_, &res);
+  InitResidual(0, 2, enc, &res);
  for (ch = 0; ch <= 2; ch += 2) {
    for (y = 0; y < 2; ++y) {
      for (x = 0; x < 2; ++x) {
@ -443,17 +523,18 @@ static void RecordResiduals(VP8EncIterator* const it,
                            const VP8ModeScore* const rd) {
  int x, y, ch;
  VP8Residual res;
+  VP8Encoder* const enc = it->enc_;

  VP8IteratorNzToBytes(it);

  if (it->mb_->type_ == 1) {   // i16x16
-    InitResidual(0, 1, it->enc_, &res);
+    InitResidual(0, 1, enc, &res);
    SetResidualCoeffs(rd->y_dc_levels, &res);
    it->top_nz_[8] = it->left_nz_[8] =
      RecordCoeffs(it->top_nz_[8] + it->left_nz_[8], &res);
-    InitResidual(1, 0, it->enc_, &res);
+    InitResidual(1, 0, enc, &res);
  } else {
-    InitResidual(0, 3, it->enc_, &res);
+    InitResidual(0, 3, enc, &res);
  }

  // luma-AC
@ -466,7 +547,7 @@ static void RecordResiduals(VP8EncIterator* const it,
  }

  // U/V
-  InitResidual(0, 2, it->enc_, &res);
+  InitResidual(0, 2, enc, &res);
  for (ch = 0; ch <= 2; ch += 2) {
    for (y = 0; y < 2; ++y) {
      for (x = 0; x < 2; ++x) {
@ -481,6 +562,62 @@ static void RecordResiduals(VP8EncIterator* const it,
  VP8IteratorBytesToNz(it);
 }

+//------------------------------------------------------------------------------
+// Token buffer
+
+#if !defined(DISABLE_TOKEN_BUFFER)
+
+static void RecordTokens(VP8EncIterator* const it, const VP8ModeScore* const rd,
+                         VP8TBuffer* const tokens) {
+  int x, y, ch;
+  VP8Residual res;
+  VP8Encoder* const enc = it->enc_;
+
+  VP8IteratorNzToBytes(it);
+  if (it->mb_->type_ == 1) {   // i16x16
+    const int ctx = it->top_nz_[8] + it->left_nz_[8];
+    InitResidual(0, 1, enc, &res);
+    SetResidualCoeffs(rd->y_dc_levels, &res);
+    it->top_nz_[8] = it->left_nz_[8] =
+        VP8RecordCoeffTokens(ctx, 1,
+                             res.first, res.last, res.coeffs, tokens);
+    RecordCoeffs(ctx, &res);
+    InitResidual(1, 0, enc, &res);
+  } else {
+    InitResidual(0, 3, enc, &res);
+  }
+
+  // luma-AC
+  for (y = 0; y < 4; ++y) {
+    for (x = 0; x < 4; ++x) {
+      const int ctx = it->top_nz_[x] + it->left_nz_[y];
+      SetResidualCoeffs(rd->y_ac_levels[x + y * 4], &res);
+      it->top_nz_[x] = it->left_nz_[y] =
+          VP8RecordCoeffTokens(ctx, res.coeff_type,
+                               res.first, res.last, res.coeffs, tokens);
+      RecordCoeffs(ctx, &res);
+    }
+  }
+
+  // U/V
+  InitResidual(0, 2, enc, &res);
+  for (ch = 0; ch <= 2; ch += 2) {
+    for (y = 0; y < 2; ++y) {
+      for (x = 0; x < 2; ++x) {
+        const int ctx = it->top_nz_[4 + ch + x] + it->left_nz_[4 + ch + y];
+        SetResidualCoeffs(rd->uv_levels[ch * 2 + x + y * 2], &res);
+        it->top_nz_[4 + ch + x] = it->left_nz_[4 + ch + y] =
+            VP8RecordCoeffTokens(ctx, 2,
+                                 res.first, res.last, res.coeffs, tokens);
+        RecordCoeffs(ctx, &res);
+      }
+    }
+  }
+  VP8IteratorBytesToNz(it);
+}
+
+#endif    // !DISABLE_TOKEN_BUFFER
+
 //------------------------------------------------------------------------------
 // ExtraInfo map / Debug function

@ -495,7 +632,10 @@ static void SetBlock(uint8_t* p, int value, int size) {
 #endif

 static void ResetSSE(VP8Encoder* const enc) {
-  memset(enc->sse_, 0, sizeof(enc->sse_));
+  enc->sse_[0] = 0;
+  enc->sse_[1] = 0;
+  enc->sse_[2] = 0;
+  // Note: enc->sse_[3] is managed by alpha.c
  enc->sse_count_ = 0;
 }

@ -515,16 +655,16 @@ static void StoreSideInfo(const VP8EncIterator* const it) {
  const VP8MBInfo* const mb = it->mb_;
  WebPPicture* const pic = enc->pic_;

-  if (pic->stats) {
+  if (pic->stats != NULL) {
    StoreSSE(it);
    enc->block_count_[0] += (mb->type_ == 0);
    enc->block_count_[1] += (mb->type_ == 1);
    enc->block_count_[2] += (mb->skip_ != 0);
  }

-  if (pic->extra_info) {
+  if (pic->extra_info != NULL) {
    uint8_t* const info = &pic->extra_info[it->x_ + it->y_ * enc->mb_w_];
-    switch(pic->extra_info_type) {
+    switch (pic->extra_info_type) {
      case 1: *info = mb->type_; break;
      case 2: *info = mb->segment_; break;
      case 3: *info = enc->dqm_[mb->segment_].quant_; break;
@ -534,6 +674,7 @@ static void StoreSideInfo(const VP8EncIterator* const it) {
        const int b = (int)((it->luma_bits_ + it->uv_bits_ + 7) >> 3);
        *info = (b > 255) ? 255 : b; break;
      }
+      case 7: *info = mb->alpha_; break;
      default: *info = 0; break;
    };
  }
@ -545,92 +686,13 @@ static void StoreSideInfo(const VP8EncIterator* const it) {
 }

 //------------------------------------------------------------------------------
-// Main loops
-//
-//  VP8EncLoop(): does the final bitstream coding.
-
-static void ResetAfterSkip(VP8EncIterator* const it) {
-  if (it->mb_->type_ == 1) {
-    *it->nz_ = 0;  // reset all predictors
-    it->left_nz_[8] = 0;
-  } else {
-    *it->nz_ &= (1 << 24);  // preserve the dc_nz bit
-  }
-}
-
-int VP8EncLoop(VP8Encoder* const enc) {
-  int i, s, p;
-  VP8EncIterator it;
-  VP8ModeScore info;
-  const int dont_use_skip = !enc->proba_.use_skip_proba_;
-  const int rd_opt = enc->rd_opt_level_;
-  const int kAverageBytesPerMB = 5;     // TODO: have a kTable[quality/10]
-  const int bytes_per_parts =
-    enc->mb_w_ * enc->mb_h_ * kAverageBytesPerMB / enc->num_parts_;
-
-  // Initialize the bit-writers
-  for (p = 0; p < enc->num_parts_; ++p) {
-    VP8BitWriterInit(enc->parts_ + p, bytes_per_parts);
-  }
-
-  ResetStats(enc, rd_opt != 0);
-  ResetSSE(enc);
-
-  VP8IteratorInit(enc, &it);
-  VP8InitFilter(&it);
-  do {
-    VP8IteratorImport(&it);
-    // Warning! order is important: first call VP8Decimate() and
-    // *then* decide how to code the skip decision if there's one.
-    if (!VP8Decimate(&it, &info, rd_opt) || dont_use_skip) {
-      CodeResiduals(it.bw_, &it, &info);
-    } else {   // reset predictors after a skip
-      ResetAfterSkip(&it);
-    }
-#ifdef WEBP_EXPERIMENTAL_FEATURES
-    if (enc->has_alpha_) {
-      VP8EncCodeAlphaBlock(&it);
-    }
-    if (enc->use_layer_) {
-      VP8EncCodeLayerBlock(&it);
-    }
-#endif
-    StoreSideInfo(&it);
-    VP8StoreFilterStats(&it);
-    VP8IteratorExport(&it);
-  } while (VP8IteratorNext(&it, it.yuv_out_));
-  VP8AdjustFilterStrength(&it);
-
-  // Finalize the partitions
-  for (p = 0; p < enc->num_parts_; ++p) {
-    VP8BitWriterFinish(enc->parts_ + p);
-  }
-  // and byte counters
-  if (enc->pic_->stats) {
-    for (i = 0; i <= 2; ++i) {
-      for (s = 0; s < NUM_MB_SEGMENTS; ++s) {
-        enc->residual_bytes_[i][s] = (int)((it.bit_count_[s][i] + 7) >> 3);
-      }
-    }
-  }
-  return 1;
-}
-
-//------------------------------------------------------------------------------
-//  VP8StatLoop(): only collect statistics (number of skips, token usage, ...)
-//                 This is used for deciding optimal probabilities. It also
-//                 modifies the quantizer value if some target (size, PNSR)
-//                 was specified.
+//  StatLoop(): only collect statistics (number of skips, token usage, ...).
+//  This is used for deciding optimal probabilities. It also modifies the
+//  quantizer value if some target (size, PNSR) was specified.

 #define kHeaderSizeEstimate (15 + 20 + 10)      // TODO: fix better

-static int OneStatPass(VP8Encoder* const enc, float q, int rd_opt, int nb_mbs,
-                       float* const PSNR) {
-  VP8EncIterator it;
-  uint64_t size = 0;
-  uint64_t distortion = 0;
-  const uint64_t pixel_count = nb_mbs * 384;
-
+static void SetLoopParams(VP8Encoder* const enc, float q) {
  // Make sure the quality parameter is inside valid bounds
  if (q < 0.) {
    q = 0;
@ -639,10 +701,23 @@ static int OneStatPass(VP8Encoder* const enc, float q, int rd_opt, int nb_mbs,
  }

  VP8SetSegmentParams(enc, q);      // setup segment quantizations and filters
+  SetSegmentProbas(enc);            // compute segment probabilities

-  ResetStats(enc, rd_opt != 0);
+  ResetStats(enc);
  ResetTokenStats(enc);

+  ResetSSE(enc);
+}
+
+static int OneStatPass(VP8Encoder* const enc, float q, VP8RDLevel rd_opt,
+                       int nb_mbs, float* const PSNR, int percent_delta) {
+  VP8EncIterator it;
+  uint64_t size = 0;
+  uint64_t distortion = 0;
+  const uint64_t pixel_count = nb_mbs * 384;
+
+  SetLoopParams(enc, q);
+
  VP8IteratorInit(enc, &it);
  do {
    VP8ModeScore info;
@ -654,9 +729,11 @@ static int OneStatPass(VP8Encoder* const enc, float q, int rd_opt, int nb_mbs,
    RecordResiduals(&it, &info);
    size += info.R;
    distortion += info.D;
+    if (percent_delta && !VP8IteratorProgress(&it, percent_delta))
+      return 0;
  } while (VP8IteratorNext(&it, it.yuv_out_) && --nb_mbs > 0);
  size += FinalizeSkipProba(enc);
-  size += FinalizeTokenProbas(enc);
+  size += FinalizeTokenProbas(&enc->proba_);
  size += enc->segment_hdr_.size_;
  size = ((size + 1024) >> 11) + kHeaderSizeEstimate;

@ -669,52 +746,232 @@ static int OneStatPass(VP8Encoder* const enc, float q, int rd_opt, int nb_mbs,
 // successive refinement increments.
 static const int dqs[] = { 20, 15, 10, 8, 6, 4, 2, 1, 0 };

-int VP8StatLoop(VP8Encoder* const enc) {
-  const int do_search =
-    (enc->config_->target_size > 0 || enc->config_->target_PSNR > 0);
-  const int fast_probe = (enc->method_ < 2 && !do_search);
+static int StatLoop(VP8Encoder* const enc) {
+  const int method = enc->method_;
+  const int do_search = enc->do_search_;
+  const int fast_probe = ((method == 0 || method == 3) && !do_search);
  float q = enc->config_->quality;
+  const int max_passes = enc->config_->pass;
+  const int task_percent = 20;
+  const int percent_per_pass = (task_percent + max_passes / 2) / max_passes;
+  const int final_percent = enc->percent_ + task_percent;
  int pass;
  int nb_mbs;

  // Fast mode: quick analysis pass over few mbs. Better than nothing.
  nb_mbs = enc->mb_w_ * enc->mb_h_;
-  if (fast_probe && nb_mbs > 100) nb_mbs = 100;
+  if (fast_probe) {
+    if (method == 3) {  // we need more stats for method 3 to be reliable.
+      nb_mbs = (nb_mbs > 200) ? nb_mbs >> 1 : 100;
+    } else {
+      nb_mbs = (nb_mbs > 200) ? nb_mbs >> 2 : 50;
+    }
+  }

  // No target size: just do several pass without changing 'q'
  if (!do_search) {
-    for (pass = 0; pass < enc->config_->pass; ++pass) {
-      const int rd_opt = (enc->method_ > 2);
-      OneStatPass(enc, q, rd_opt, nb_mbs, NULL);
+    for (pass = 0; pass < max_passes; ++pass) {
+      const VP8RDLevel rd_opt = (method >= 3) ? RD_OPT_BASIC : RD_OPT_NONE;
+      if (!OneStatPass(enc, q, rd_opt, nb_mbs, NULL, percent_per_pass)) {
+        return 0;
+      }
    }
-    return 1;
-  }
-
-  // binary search for a size close to target
-  for (pass = 0; pass < enc->config_->pass && (dqs[pass] > 0); ++pass) {
-    const int rd_opt = 1;
-    float PSNR;
-    int criterion;
-    const int size = OneStatPass(enc, q, rd_opt, nb_mbs, &PSNR);
+  } else {
+    // binary search for a size close to target
+    for (pass = 0; pass < max_passes && (dqs[pass] > 0); ++pass) {
+      float PSNR;
+      int criterion;
+      const int size = OneStatPass(enc, q, RD_OPT_BASIC, nb_mbs, &PSNR,
+                                   percent_per_pass);
 #if DEBUG_SEARCH
-    printf("#%d size=%d PSNR=%.2f q=%.2f\n", pass, size, PSNR, q);
+      printf("#%d size=%d PSNR=%.2f q=%.2f\n", pass, size, PSNR, q);
 #endif
-
-    if (enc->config_->target_PSNR > 0) {
-      criterion = (PSNR < enc->config_->target_PSNR);
-    } else {
-      criterion = (size < enc->config_->target_size);
-    }
-    // dichotomize
-    if (criterion) {
-      q += dqs[pass];
-    } else {
-      q -= dqs[pass];
+      if (size == 0) return 0;
+      if (enc->config_->target_PSNR > 0) {
+        criterion = (PSNR < enc->config_->target_PSNR);
+      } else {
+        criterion = (size < enc->config_->target_size);
+      }
+      // dichotomize
+      if (criterion) {
+        q += dqs[pass];
+      } else {
+        q -= dqs[pass];
+      }
    }
  }
-  return 1;
+  VP8CalculateLevelCosts(&enc->proba_);  // finalize costs
+  return WebPReportProgress(enc->pic_, final_percent, &enc->percent_);
 }

+//------------------------------------------------------------------------------
+// Main loops
+//
+
+static const int kAverageBytesPerMB[8] = { 50, 24, 16, 9, 7, 5, 3, 2 };
+
+static int PreLoopInitialize(VP8Encoder* const enc) {
+  int p;
+  int ok = 1;
+  const int average_bytes_per_MB = kAverageBytesPerMB[enc->base_quant_ >> 4];
+  const int bytes_per_parts =
+      enc->mb_w_ * enc->mb_h_ * average_bytes_per_MB / enc->num_parts_;
+  // Initialize the bit-writers
+  for (p = 0; ok && p < enc->num_parts_; ++p) {
+    ok = VP8BitWriterInit(enc->parts_ + p, bytes_per_parts);
+  }
+  if (!ok) VP8EncFreeBitWriters(enc);  // malloc error occurred
+  return ok;
+}
+
+static int PostLoopFinalize(VP8EncIterator* const it, int ok) {
+  VP8Encoder* const enc = it->enc_;
+  if (ok) {      // Finalize the partitions, check for extra errors.
+    int p;
+    for (p = 0; p < enc->num_parts_; ++p) {
+      VP8BitWriterFinish(enc->parts_ + p);
+      ok &= !enc->parts_[p].error_;
+    }
+  }
+
+  if (ok) {      // All good. Finish up.
+    if (enc->pic_->stats) {           // finalize byte counters...
+      int i, s;
+      for (i = 0; i <= 2; ++i) {
+        for (s = 0; s < NUM_MB_SEGMENTS; ++s) {
+          enc->residual_bytes_[i][s] = (int)((it->bit_count_[s][i] + 7) >> 3);
+        }
+      }
+    }
+    VP8AdjustFilterStrength(it);     // ...and store filter stats.
+  } else {
+    // Something bad happened -> need to do some memory cleanup.
+    VP8EncFreeBitWriters(enc);
+  }
+  return ok;
+}
+
+//------------------------------------------------------------------------------
+//  VP8EncLoop(): does the final bitstream coding.
+
+static void ResetAfterSkip(VP8EncIterator* const it) {
+  if (it->mb_->type_ == 1) {
+    *it->nz_ = 0;  // reset all predictors
+    it->left_nz_[8] = 0;
+  } else {
+    *it->nz_ &= (1 << 24);  // preserve the dc_nz bit
+  }
+}
+
+int VP8EncLoop(VP8Encoder* const enc) {
+  VP8EncIterator it;
+  int ok = PreLoopInitialize(enc);
+  if (!ok) return 0;
+
+  StatLoop(enc);  // stats-collection loop
+
+  VP8IteratorInit(enc, &it);
+  VP8InitFilter(&it);
+  do {
+    VP8ModeScore info;
+    const int dont_use_skip = !enc->proba_.use_skip_proba_;
+    const VP8RDLevel rd_opt = enc->rd_opt_level_;
+
+    VP8IteratorImport(&it);
+    // Warning! order is important: first call VP8Decimate() and
+    // *then* decide how to code the skip decision if there's one.
+    if (!VP8Decimate(&it, &info, rd_opt) || dont_use_skip) {
+      CodeResiduals(it.bw_, &it, &info);
+    } else {   // reset predictors after a skip
+      ResetAfterSkip(&it);
+    }
+#ifdef WEBP_EXPERIMENTAL_FEATURES
+    if (enc->use_layer_) {
+      VP8EncCodeLayerBlock(&it);
+    }
+#endif
+    StoreSideInfo(&it);
+    VP8StoreFilterStats(&it);
+    VP8IteratorExport(&it);
+    ok = VP8IteratorProgress(&it, 20);
+  } while (ok && VP8IteratorNext(&it, it.yuv_out_));
+
+  return PostLoopFinalize(&it, ok);
+}
+
+//------------------------------------------------------------------------------
+// Single pass using Token Buffer.
+
+#if !defined(DISABLE_TOKEN_BUFFER)
+
+#define MIN_COUNT 96   // minimum number of macroblocks before updating stats
+
+int VP8EncTokenLoop(VP8Encoder* const enc) {
+  int ok;
+  // Roughly refresh the proba height times per pass
+  int max_count = (enc->mb_w_ * enc->mb_h_) >> 3;
+  int cnt;
+  VP8EncIterator it;
+  VP8Proba* const proba = &enc->proba_;
+  const VP8RDLevel rd_opt = enc->rd_opt_level_;
+
+  if (max_count < MIN_COUNT) max_count = MIN_COUNT;
+  cnt = max_count;
+
+  assert(enc->num_parts_ == 1);
+  assert(enc->use_tokens_);
+  assert(proba->use_skip_proba_ == 0);
+  assert(rd_opt >= RD_OPT_BASIC);   // otherwise, token-buffer won't be useful
+  assert(!enc->do_search_);         // TODO(skal): handle pass and dichotomy
+
+  SetLoopParams(enc, enc->config_->quality);
+
+  ok = PreLoopInitialize(enc);
+  if (!ok) return 0;
+
+  VP8IteratorInit(enc, &it);
+  VP8InitFilter(&it);
+  do {
+    VP8ModeScore info;
+    VP8IteratorImport(&it);
+    if (--cnt < 0) {
+      FinalizeTokenProbas(proba);
+      VP8CalculateLevelCosts(proba);  // refresh cost tables for rd-opt
+      cnt = max_count;
+    }
+    VP8Decimate(&it, &info, rd_opt);
+    RecordTokens(&it, &info, &enc->tokens_);
+#ifdef WEBP_EXPERIMENTAL_FEATURES
+    if (enc->use_layer_) {
+      VP8EncCodeLayerBlock(&it);
+    }
+#endif
+    StoreSideInfo(&it);
+    VP8StoreFilterStats(&it);
+    VP8IteratorExport(&it);
+    ok = VP8IteratorProgress(&it, 20);
+  } while (ok && VP8IteratorNext(&it, it.yuv_out_));
+
+  ok = ok && WebPReportProgress(enc->pic_, enc->percent_ + 20, &enc->percent_);
+
+  if (ok) {
+    FinalizeTokenProbas(proba);
+    ok = VP8EmitTokens(&enc->tokens_, enc->parts_ + 0,
+                       (const uint8_t*)proba->coeffs_, 1);
+  }
+
+  return PostLoopFinalize(&it, ok);
+}
+
+#else
+
+int VP8EncTokenLoop(VP8Encoder* const enc) {
+  (void)enc;
+  return 0;   // we shouldn't be here.
+}
+
+#endif    // DISABLE_TOKEN_BUFFER
+
 //------------------------------------------------------------------------------

 #if defined(__cplusplus) || defined(c_plusplus)
--- a/src/enc/histogram.c
+++ b/src/enc/histogram.c
@ -0,0 +1,514 @@
+// Copyright 2012 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// Author: Jyrki Alakuijala (jyrki@google.com)
+//
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include <math.h>
+#include <stdio.h>
+
+#include "./backward_references.h"
+#include "./histogram.h"
+#include "../dsp/lossless.h"
+#include "../utils/utils.h"
+
+static void HistogramClear(VP8LHistogram* const p) {
+  memset(p->literal_, 0, sizeof(p->literal_));
+  memset(p->red_, 0, sizeof(p->red_));
+  memset(p->blue_, 0, sizeof(p->blue_));
+  memset(p->alpha_, 0, sizeof(p->alpha_));
+  memset(p->distance_, 0, sizeof(p->distance_));
+  p->bit_cost_ = 0;
+}
+
+void VP8LHistogramStoreRefs(const VP8LBackwardRefs* const refs,
+                            VP8LHistogram* const histo) {
+  int i;
+  for (i = 0; i < refs->size; ++i) {
+    VP8LHistogramAddSinglePixOrCopy(histo, &refs->refs[i]);
+  }
+}
+
+void VP8LHistogramCreate(VP8LHistogram* const p,
+                         const VP8LBackwardRefs* const refs,
+                         int palette_code_bits) {
+  if (palette_code_bits >= 0) {
+    p->palette_code_bits_ = palette_code_bits;
+  }
+  HistogramClear(p);
+  VP8LHistogramStoreRefs(refs, p);
+}
+
+void VP8LHistogramInit(VP8LHistogram* const p, int palette_code_bits) {
+  p->palette_code_bits_ = palette_code_bits;
+  HistogramClear(p);
+}
+
+VP8LHistogramSet* VP8LAllocateHistogramSet(int size, int cache_bits) {
+  int i;
+  VP8LHistogramSet* set;
+  VP8LHistogram* bulk;
+  const uint64_t total_size = sizeof(*set)
+                            + (uint64_t)size * sizeof(*set->histograms)
+                            + (uint64_t)size * sizeof(**set->histograms);
+  uint8_t* memory = (uint8_t*)WebPSafeMalloc(total_size, sizeof(*memory));
+  if (memory == NULL) return NULL;
+
+  set = (VP8LHistogramSet*)memory;
+  memory += sizeof(*set);
+  set->histograms = (VP8LHistogram**)memory;
+  memory += size * sizeof(*set->histograms);
+  bulk = (VP8LHistogram*)memory;
+  set->max_size = size;
+  set->size = size;
+  for (i = 0; i < size; ++i) {
+    set->histograms[i] = bulk + i;
+    VP8LHistogramInit(set->histograms[i], cache_bits);
+  }
+  return set;
+}
+
+// -----------------------------------------------------------------------------
+
+void VP8LHistogramAddSinglePixOrCopy(VP8LHistogram* const histo,
+                                     const PixOrCopy* const v) {
+  if (PixOrCopyIsLiteral(v)) {
+    ++histo->alpha_[PixOrCopyLiteral(v, 3)];
+    ++histo->red_[PixOrCopyLiteral(v, 2)];
+    ++histo->literal_[PixOrCopyLiteral(v, 1)];
+    ++histo->blue_[PixOrCopyLiteral(v, 0)];
+  } else if (PixOrCopyIsCacheIdx(v)) {
+    int literal_ix = 256 + NUM_LENGTH_CODES + PixOrCopyCacheIdx(v);
+    ++histo->literal_[literal_ix];
+  } else {
+    int code, extra_bits_count, extra_bits_value;
+    PrefixEncode(PixOrCopyLength(v),
+                 &code, &extra_bits_count, &extra_bits_value);
+    ++histo->literal_[256 + code];
+    PrefixEncode(PixOrCopyDistance(v),
+                 &code, &extra_bits_count, &extra_bits_value);
+    ++histo->distance_[code];
+  }
+}
+
+static double BitsEntropy(const int* const array, int n) {
+  double retval = 0.;
+  int sum = 0;
+  int nonzeros = 0;
+  int max_val = 0;
+  int i;
+  double mix;
+  for (i = 0; i < n; ++i) {
+    if (array[i] != 0) {
+      sum += array[i];
+      ++nonzeros;
+      retval -= VP8LFastSLog2(array[i]);
+      if (max_val < array[i]) {
+        max_val = array[i];
+      }
+    }
+  }
+  retval += VP8LFastSLog2(sum);
+
+  if (nonzeros < 5) {
+    if (nonzeros <= 1) {
+      return 0;
+    }
+    // Two symbols, they will be 0 and 1 in a Huffman code.
+    // Let's mix in a bit of entropy to favor good clustering when
+    // distributions of these are combined.
+    if (nonzeros == 2) {
+      return 0.99 * sum + 0.01 * retval;
+    }
+    // No matter what the entropy says, we cannot be better than min_limit
+    // with Huffman coding. I am mixing a bit of entropy into the
+    // min_limit since it produces much better (~0.5 %) compression results
+    // perhaps because of better entropy clustering.
+    if (nonzeros == 3) {
+      mix = 0.95;
+    } else {
+      mix = 0.7;  // nonzeros == 4.
+    }
+  } else {
+    mix = 0.627;
+  }
+
+  {
+    double min_limit = 2 * sum - max_val;
+    min_limit = mix * min_limit + (1.0 - mix) * retval;
+    return (retval < min_limit) ? min_limit : retval;
+  }
+}
+
+// Returns the cost encode the rle-encoded entropy code.
+// The constants in this function are experimental.
+static double HuffmanCost(const int* const population, int length) {
+  // Small bias because Huffman code length is typically not stored in
+  // full length.
+  static const int kHuffmanCodeOfHuffmanCodeSize = CODE_LENGTH_CODES * 3;
+  static const double kSmallBias = 9.1;
+  double retval = kHuffmanCodeOfHuffmanCodeSize - kSmallBias;
+  int streak = 0;
+  int i = 0;
+  for (; i < length - 1; ++i) {
+    ++streak;
+    if (population[i] == population[i + 1]) {
+      continue;
+    }
+ last_streak_hack:
+    // population[i] points now to the symbol in the streak of same values.
+    if (streak > 3) {
+      if (population[i] == 0) {
+        retval += 1.5625 + 0.234375 * streak;
+      } else {
+        retval += 2.578125 + 0.703125 * streak;
+      }
+    } else {
+      if (population[i] == 0) {
+        retval += 1.796875 * streak;
+      } else {
+        retval += 3.28125 * streak;
+      }
+    }
+    streak = 0;
+  }
+  if (i == length - 1) {
+    ++streak;
+    goto last_streak_hack;
+  }
+  return retval;
+}
+
+static double PopulationCost(const int* const population, int length) {
+  return BitsEntropy(population, length) + HuffmanCost(population, length);
+}
+
+static double ExtraCost(const int* const population, int length) {
+  int i;
+  double cost = 0.;
+  for (i = 2; i < length - 2; ++i) cost += (i >> 1) * population[i + 2];
+  return cost;
+}
+
+// Estimates the Entropy + Huffman + other block overhead size cost.
+double VP8LHistogramEstimateBits(const VP8LHistogram* const p) {
+  return PopulationCost(p->literal_, VP8LHistogramNumCodes(p))
+       + PopulationCost(p->red_, 256)
+       + PopulationCost(p->blue_, 256)
+       + PopulationCost(p->alpha_, 256)
+       + PopulationCost(p->distance_, NUM_DISTANCE_CODES)
+       + ExtraCost(p->literal_ + 256, NUM_LENGTH_CODES)
+       + ExtraCost(p->distance_, NUM_DISTANCE_CODES);
+}
+
+double VP8LHistogramEstimateBitsBulk(const VP8LHistogram* const p) {
+  return BitsEntropy(p->literal_, VP8LHistogramNumCodes(p))
+       + BitsEntropy(p->red_, 256)
+       + BitsEntropy(p->blue_, 256)
+       + BitsEntropy(p->alpha_, 256)
+       + BitsEntropy(p->distance_, NUM_DISTANCE_CODES)
+       + ExtraCost(p->literal_ + 256, NUM_LENGTH_CODES)
+       + ExtraCost(p->distance_, NUM_DISTANCE_CODES);
+}
+
+// -----------------------------------------------------------------------------
+// Various histogram combine/cost-eval functions
+
+// Adds 'in' histogram to 'out'
+static void HistogramAdd(const VP8LHistogram* const in,
+                         VP8LHistogram* const out) {
+  int i;
+  for (i = 0; i < PIX_OR_COPY_CODES_MAX; ++i) {
+    out->literal_[i] += in->literal_[i];
+  }
+  for (i = 0; i < NUM_DISTANCE_CODES; ++i) {
+    out->distance_[i] += in->distance_[i];
+  }
+  for (i = 0; i < 256; ++i) {
+    out->red_[i] += in->red_[i];
+    out->blue_[i] += in->blue_[i];
+    out->alpha_[i] += in->alpha_[i];
+  }
+}
+
+// Performs out = a + b, computing the cost C(a+b) - C(a) - C(b) while comparing
+// to the threshold value 'cost_threshold'. The score returned is
+//  Score = C(a+b) - C(a) - C(b), where C(a) + C(b) is known and fixed.
+// Since the previous score passed is 'cost_threshold', we only need to compare
+// the partial cost against 'cost_threshold + C(a) + C(b)' to possibly bail-out
+// early.
+static double HistogramAddEval(const VP8LHistogram* const a,
+                               const VP8LHistogram* const b,
+                               VP8LHistogram* const out,
+                               double cost_threshold) {
+  double cost = 0;
+  const double sum_cost = a->bit_cost_ + b->bit_cost_;
+  int i;
+
+  cost_threshold += sum_cost;
+
+  // palette_code_bits_ is part of the cost evaluation for literal_.
+  // TODO(skal): remove/simplify this palette_code_bits_?
+  out->palette_code_bits_ =
+      (a->palette_code_bits_ > b->palette_code_bits_) ? a->palette_code_bits_ :
+                                                        b->palette_code_bits_;
+  for (i = 0; i < PIX_OR_COPY_CODES_MAX; ++i) {
+    out->literal_[i] = a->literal_[i] + b->literal_[i];
+  }
+  cost += PopulationCost(out->literal_, VP8LHistogramNumCodes(out));
+  cost += ExtraCost(out->literal_ + 256, NUM_LENGTH_CODES);
+  if (cost > cost_threshold) return cost;
+
+  for (i = 0; i < 256; ++i) out->red_[i] = a->red_[i] + b->red_[i];
+  cost += PopulationCost(out->red_, 256);
+  if (cost > cost_threshold) return cost;
+
+  for (i = 0; i < 256; ++i) out->blue_[i] = a->blue_[i] + b->blue_[i];
+  cost += PopulationCost(out->blue_, 256);
+  if (cost > cost_threshold) return cost;
+
+  for (i = 0; i < NUM_DISTANCE_CODES; ++i) {
+    out->distance_[i] = a->distance_[i] + b->distance_[i];
+  }
+  cost += PopulationCost(out->distance_, NUM_DISTANCE_CODES);
+  cost += ExtraCost(out->distance_, NUM_DISTANCE_CODES);
+  if (cost > cost_threshold) return cost;
+
+  for (i = 0; i < 256; ++i) out->alpha_[i] = a->alpha_[i] + b->alpha_[i];
+  cost += PopulationCost(out->alpha_, 256);
+
+  out->bit_cost_ = cost;
+  return cost - sum_cost;
+}
+
+// Same as HistogramAddEval(), except that the resulting histogram
+// is not stored. Only the cost C(a+b) - C(a) is evaluated. We omit
+// the term C(b) which is constant over all the evaluations.
+static double HistogramAddThresh(const VP8LHistogram* const a,
+                                 const VP8LHistogram* const b,
+                                 double cost_threshold) {
+  int tmp[PIX_OR_COPY_CODES_MAX];  // <= max storage we'll need
+  int i;
+  double cost = -a->bit_cost_;
+
+  for (i = 0; i < PIX_OR_COPY_CODES_MAX; ++i) {
+    tmp[i] = a->literal_[i] + b->literal_[i];
+  }
+  // note that the tests are ordered so that the usually largest
+  // cost shares come first.
+  cost += PopulationCost(tmp, VP8LHistogramNumCodes(a));
+  cost += ExtraCost(tmp + 256, NUM_LENGTH_CODES);
+  if (cost > cost_threshold) return cost;
+
+  for (i = 0; i < 256; ++i) tmp[i] = a->red_[i] + b->red_[i];
+  cost += PopulationCost(tmp, 256);
+  if (cost > cost_threshold) return cost;
+
+  for (i = 0; i < 256; ++i) tmp[i] = a->blue_[i] + b->blue_[i];
+  cost += PopulationCost(tmp, 256);
+  if (cost > cost_threshold) return cost;
+
+  for (i = 0; i < NUM_DISTANCE_CODES; ++i) {
+    tmp[i] = a->distance_[i] + b->distance_[i];
+  }
+  cost += PopulationCost(tmp, NUM_DISTANCE_CODES);
+  cost += ExtraCost(tmp, NUM_DISTANCE_CODES);
+  if (cost > cost_threshold) return cost;
+
+  for (i = 0; i < 256; ++i) tmp[i] = a->alpha_[i] + b->alpha_[i];
+  cost += PopulationCost(tmp, 256);
+
+  return cost;
+}
+
+// -----------------------------------------------------------------------------
+
+static void HistogramBuildImage(int xsize, int histo_bits,
+                                const VP8LBackwardRefs* const backward_refs,
+                                VP8LHistogramSet* const image) {
+  int i;
+  int x = 0, y = 0;
+  const int histo_xsize = VP8LSubSampleSize(xsize, histo_bits);
+  VP8LHistogram** const histograms = image->histograms;
+  assert(histo_bits > 0);
+  for (i = 0; i < backward_refs->size; ++i) {
+    const PixOrCopy* const v = &backward_refs->refs[i];
+    const int ix = (y >> histo_bits) * histo_xsize + (x >> histo_bits);
+    VP8LHistogramAddSinglePixOrCopy(histograms[ix], v);
+    x += PixOrCopyLength(v);
+    while (x >= xsize) {
+      x -= xsize;
+      ++y;
+    }
+  }
+}
+
+static uint32_t MyRand(uint32_t *seed) {
+  *seed *= 16807U;
+  if (*seed == 0) {
+    *seed = 1;
+  }
+  return *seed;
+}
+
+static int HistogramCombine(const VP8LHistogramSet* const in,
+                            VP8LHistogramSet* const out, int iter_mult,
+                            int num_pairs, int num_tries_no_success) {
+  int ok = 0;
+  int i, iter;
+  uint32_t seed = 0;
+  int tries_with_no_success = 0;
+  int out_size = in->size;
+  const int outer_iters = in->size * iter_mult;
+  const int min_cluster_size = 2;
+  VP8LHistogram* const histos = (VP8LHistogram*)malloc(2 * sizeof(*histos));
+  VP8LHistogram* cur_combo = histos + 0;    // trial merged histogram
+  VP8LHistogram* best_combo = histos + 1;   // best merged histogram so far
+  if (histos == NULL) goto End;
+
+  // Copy histograms from in[] to out[].
+  assert(in->size <= out->size);
+  for (i = 0; i < in->size; ++i) {
+    in->histograms[i]->bit_cost_ = VP8LHistogramEstimateBits(in->histograms[i]);
+    *out->histograms[i] = *in->histograms[i];
+  }
+
+  // Collapse similar histograms in 'out'.
+  for (iter = 0; iter < outer_iters && out_size >= min_cluster_size; ++iter) {
+    double best_cost_diff = 0.;
+    int best_idx1 = -1, best_idx2 = 1;
+    int j;
+    const int num_tries = (num_pairs < out_size) ? num_pairs : out_size;
+    seed += iter;
+    for (j = 0; j < num_tries; ++j) {
+      double curr_cost_diff;
+      // Choose two histograms at random and try to combine them.
+      const uint32_t idx1 = MyRand(&seed) % out_size;
+      const uint32_t tmp = (j & 7) + 1;
+      const uint32_t diff = (tmp < 3) ? tmp : MyRand(&seed) % (out_size - 1);
+      const uint32_t idx2 = (idx1 + diff + 1) % out_size;
+      if (idx1 == idx2) {
+        continue;
+      }
+      // Calculate cost reduction on combining.
+      curr_cost_diff = HistogramAddEval(out->histograms[idx1],
+                                        out->histograms[idx2],
+                                        cur_combo, best_cost_diff);
+      if (curr_cost_diff < best_cost_diff) {    // found a better pair?
+        {     // swap cur/best combo histograms
+          VP8LHistogram* const tmp_histo = cur_combo;
+          cur_combo = best_combo;
+          best_combo = tmp_histo;
+        }
+        best_cost_diff = curr_cost_diff;
+        best_idx1 = idx1;
+        best_idx2 = idx2;
+      }
+    }
+
+    if (best_idx1 >= 0) {
+      *out->histograms[best_idx1] = *best_combo;
+      // swap best_idx2 slot with last one (which is now unused)
+      --out_size;
+      if (best_idx2 != out_size) {
+        out->histograms[best_idx2] = out->histograms[out_size];
+        out->histograms[out_size] = NULL;   // just for sanity check.
+      }
+      tries_with_no_success = 0;
+    }
+    if (++tries_with_no_success >= num_tries_no_success) {
+      break;
+    }
+  }
+  out->size = out_size;
+  ok = 1;
+
+ End:
+  free(histos);
+  return ok;
+}
+
+// -----------------------------------------------------------------------------
+// Histogram refinement
+
+// What is the bit cost of moving square_histogram from cur_symbol to candidate.
+static double HistogramDistance(const VP8LHistogram* const square_histogram,
+                                const VP8LHistogram* const candidate,
+                                double cost_threshold) {
+  return HistogramAddThresh(candidate, square_histogram, cost_threshold);
+}
+
+// Find the best 'out' histogram for each of the 'in' histograms.
+// Note: we assume that out[]->bit_cost_ is already up-to-date.
+static void HistogramRemap(const VP8LHistogramSet* const in,
+                           const VP8LHistogramSet* const out,
+                           uint16_t* const symbols) {
+  int i;
+  for (i = 0; i < in->size; ++i) {
+    int best_out = 0;
+    double best_bits =
+        HistogramDistance(in->histograms[i], out->histograms[0], 1.e38);
+    int k;
+    for (k = 1; k < out->size; ++k) {
+      const double cur_bits =
+          HistogramDistance(in->histograms[i], out->histograms[k], best_bits);
+      if (cur_bits < best_bits) {
+        best_bits = cur_bits;
+        best_out = k;
+      }
+    }
+    symbols[i] = best_out;
+  }
+
+  // Recompute each out based on raw and symbols.
+  for (i = 0; i < out->size; ++i) {
+    HistogramClear(out->histograms[i]);
+  }
+  for (i = 0; i < in->size; ++i) {
+    HistogramAdd(in->histograms[i], out->histograms[symbols[i]]);
+  }
+}
+
+int VP8LGetHistoImageSymbols(int xsize, int ysize,
+                             const VP8LBackwardRefs* const refs,
+                             int quality, int histo_bits, int cache_bits,
+                             VP8LHistogramSet* const image_in,
+                             uint16_t* const histogram_symbols) {
+  int ok = 0;
+  const int histo_xsize = histo_bits ? VP8LSubSampleSize(xsize, histo_bits) : 1;
+  const int histo_ysize = histo_bits ? VP8LSubSampleSize(ysize, histo_bits) : 1;
+  const int histo_image_raw_size = histo_xsize * histo_ysize;
+
+  // Heuristic params for HistogramCombine().
+  const int num_tries_no_success = 8 + (quality >> 1);
+  const int iter_mult = (quality < 27) ? 1 : 1 + ((quality - 27) >> 4);
+  const int num_pairs = (quality < 25) ? 10 : (5 * quality) >> 3;
+
+  VP8LHistogramSet* const image_out =
+      VP8LAllocateHistogramSet(histo_image_raw_size, cache_bits);
+  if (image_out == NULL) return 0;
+
+  // Build histogram image.
+  HistogramBuildImage(xsize, histo_bits, refs, image_out);
+  // Collapse similar histograms.
+  if (!HistogramCombine(image_out, image_in, iter_mult, num_pairs,
+                        num_tries_no_success)) {
+    goto Error;
+  }
+  // Find the optimal map from original histograms to the final ones.
+  HistogramRemap(image_out, image_in, histogram_symbols);
+  ok = 1;
+
+Error:
+  free(image_out);
+  return ok;
+}
--- a/src/enc/histogram.h
+++ b/src/enc/histogram.h
@ -0,0 +1,101 @@
+// Copyright 2012 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// Author: Jyrki Alakuijala (jyrki@google.com)
+//
+// Models the histograms of literal and distance codes.
+
+#ifndef WEBP_ENC_HISTOGRAM_H_
+#define WEBP_ENC_HISTOGRAM_H_
+
+#include <assert.h>
+#include <stddef.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+
+#include "./backward_references.h"
+#include "../webp/format_constants.h"
+#include "../webp/types.h"
+
+#if defined(__cplusplus) || defined(c_plusplus)
+extern "C" {
+#endif
+
+// A simple container for histograms of data.
+typedef struct {
+  // literal_ contains green literal, palette-code and
+  // copy-length-prefix histogram
+  int literal_[PIX_OR_COPY_CODES_MAX];
+  int red_[256];
+  int blue_[256];
+  int alpha_[256];
+  // Backward reference prefix-code histogram.
+  int distance_[NUM_DISTANCE_CODES];
+  int palette_code_bits_;
+  double bit_cost_;   // cached value of VP8LHistogramEstimateBits(this)
+} VP8LHistogram;
+
+// Collection of histograms with fixed capacity, allocated as one
+// big memory chunk. Can be destroyed by simply calling 'free()'.
+typedef struct {
+  int size;         // number of slots currently in use
+  int max_size;     // maximum capacity
+  VP8LHistogram** histograms;
+} VP8LHistogramSet;
+
+// Create the histogram.
+//
+// The input data is the PixOrCopy data, which models the literals, stop
+// codes and backward references (both distances and lengths).  Also: if
+// palette_code_bits is >= 0, initialize the histogram with this value.
+void VP8LHistogramCreate(VP8LHistogram* const p,
+                         const VP8LBackwardRefs* const refs,
+                         int palette_code_bits);
+
+// Set the palette_code_bits and reset the stats.
+void VP8LHistogramInit(VP8LHistogram* const p, int palette_code_bits);
+
+// Collect all the references into a histogram (without reset)
+void VP8LHistogramStoreRefs(const VP8LBackwardRefs* const refs,
+                            VP8LHistogram* const histo);
+
+// Allocate an array of pointer to histograms, allocated and initialized
+// using 'cache_bits'. Return NULL in case of memory error.
+VP8LHistogramSet* VP8LAllocateHistogramSet(int size, int cache_bits);
+
+// Accumulate a token 'v' into a histogram.
+void VP8LHistogramAddSinglePixOrCopy(VP8LHistogram* const histo,
+                                     const PixOrCopy* const v);
+
+// Estimate how many bits the combined entropy of literals and distance
+// approximately maps to.
+double VP8LHistogramEstimateBits(const VP8LHistogram* const p);
+
+// This function estimates the cost in bits excluding the bits needed to
+// represent the entropy code itself.
+double VP8LHistogramEstimateBitsBulk(const VP8LHistogram* const p);
+
+static WEBP_INLINE int VP8LHistogramNumCodes(const VP8LHistogram* const p) {
+  return 256 + NUM_LENGTH_CODES +
+      ((p->palette_code_bits_ > 0) ? (1 << p->palette_code_bits_) : 0);
+}
+
+// Builds the histogram image.
+int VP8LGetHistoImageSymbols(int xsize, int ysize,
+                             const VP8LBackwardRefs* const refs,
+                             int quality, int histogram_bits, int cache_bits,
+                             VP8LHistogramSet* const image_in,
+                             uint16_t* const histogram_symbols);
+
+#if defined(__cplusplus) || defined(c_plusplus)
+}
+#endif
+
+#endif  // WEBP_ENC_HISTOGRAM_H_
--- a/src/enc/iterator.c
+++ b/src/enc/iterator.c
@ -1,17 +1,19 @@
-// Copyright 2011 Google Inc.
+// Copyright 2011 Google Inc. All Rights Reserved.
 //
-// This code is licensed under the same terms as WebM:
-//  Software License Agreement:  http://www.webmproject.org/license/software/
-//  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
 // -----------------------------------------------------------------------------
 //
 // VP8Iterator: block iterator
 //
 // Author: Skal (pascal.massimino@gmail.com)

-#include <stdlib.h>
 #include <string.h>
-#include "vp8enci.h"
+
+#include "./vp8enci.h"

 #if defined(__cplusplus) || defined(c_plusplus)
 extern "C" {
@ -24,7 +26,7 @@ extern "C" {
 static void InitLeft(VP8EncIterator* const it) {
  const VP8Encoder* const enc = it->enc_;
  enc->y_left_[-1] = enc->u_left_[-1] = enc->v_left_[-1] =
-      (it->y_) > 0 ? 129 : 127;
+      (it->y_ > 0) ? 129 : 127;
  memset(enc->y_left_, 129, 16);
  memset(enc->u_left_, 129, 8);
  memset(enc->v_left_, 129, 8);
@ -33,7 +35,7 @@ static void InitLeft(VP8EncIterator* const it) {

 static void InitTop(VP8EncIterator* const it) {
  const VP8Encoder* const enc = it->enc_;
-  const int top_size = enc->mb_w_ * 16;
+  const size_t top_size = enc->mb_w_ * 16;
  memset(enc->y_top_, 127, 2 * top_size);
  memset(enc->nz_, 0, enc->mb_w_ * sizeof(*enc->nz_));
 }
@ -65,66 +67,81 @@ void VP8IteratorInit(VP8Encoder* const enc, VP8EncIterator* const it) {
  it->yuv_out2_ = enc->yuv_out2_;
  it->yuv_p_    = enc->yuv_p_;
  it->lf_stats_ = enc->lf_stats_;
+  it->percent0_ = enc->percent_;
  VP8IteratorReset(it);
 }

+int VP8IteratorProgress(const VP8EncIterator* const it, int delta) {
+  VP8Encoder* const enc = it->enc_;
+  if (delta && enc->pic_->progress_hook) {
+    const int percent = (enc->mb_h_ <= 1)
+                      ? it->percent0_
+                      : it->percent0_ + delta * it->y_ / (enc->mb_h_ - 1);
+    return WebPReportProgress(enc->pic_, percent, &enc->percent_);
+  }
+  return 1;
+}
+
 //------------------------------------------------------------------------------
 // Import the source samples into the cache. Takes care of replicating
 // boundary pixels if necessary.

+static void ImportBlock(const uint8_t* src, int src_stride,
+                        uint8_t* dst, int w, int h, int size) {
+  int i;
+  for (i = 0; i < h; ++i) {
+    memcpy(dst, src, w);
+    if (w < size) {
+      memset(dst + w, dst[w - 1], size - w);
+    }
+    dst += BPS;
+    src += src_stride;
+  }
+  for (i = h; i < size; ++i) {
+    memcpy(dst, dst - BPS, size);
+    dst += BPS;
+  }
+}
+
 void VP8IteratorImport(const VP8EncIterator* const it) {
  const VP8Encoder* const enc = it->enc_;
  const int x = it->x_, y = it->y_;
  const WebPPicture* const pic = enc->pic_;
-  const uint8_t* ysrc = pic->y + (y * pic->y_stride + x) * 16;
-  const uint8_t* usrc = pic->u + (y * pic->uv_stride + x) * 8;
-  const uint8_t* vsrc = pic->v + (y * pic->uv_stride + x) * 8;
-  uint8_t* ydst = it->yuv_in_ + Y_OFF;
-  uint8_t* udst = it->yuv_in_ + U_OFF;
-  uint8_t* vdst = it->yuv_in_ + V_OFF;
+  const uint8_t* const ysrc = pic->y + (y * pic->y_stride + x) * 16;
+  const uint8_t* const usrc = pic->u + (y * pic->uv_stride + x) * 8;
+  const uint8_t* const vsrc = pic->v + (y * pic->uv_stride + x) * 8;
+  uint8_t* const ydst = it->yuv_in_ + Y_OFF;
+  uint8_t* const udst = it->yuv_in_ + U_OFF;
+  uint8_t* const vdst = it->yuv_in_ + V_OFF;
  int w = (pic->width - x * 16);
  int h = (pic->height - y * 16);
-  int i;

  if (w > 16) w = 16;
  if (h > 16) h = 16;
+
  // Luma plane
-  for (i = 0; i < h; ++i) {
-    memcpy(ydst, ysrc, w);
-    if (w < 16) memset(ydst + w, ydst[w - 1], 16 - w);
-    ydst += BPS;
-    ysrc += pic->y_stride;
-  }
-  for (i = h; i < 16; ++i) {
-    memcpy(ydst, ydst - BPS, 16);
-    ydst += BPS;
-  }
-  // U/V plane
-  w = (w + 1) / 2;
-  h = (h + 1) / 2;
-  for (i = 0; i < h; ++i) {
-    memcpy(udst, usrc, w);
-    memcpy(vdst, vsrc, w);
-    if (w < 8) {
-      memset(udst + w, udst[w - 1], 8 - w);
-      memset(vdst + w, vdst[w - 1], 8 - w);
-    }
-    udst += BPS;
-    vdst += BPS;
-    usrc += pic->uv_stride;
-    vsrc += pic->uv_stride;
-  }
-  for (i = h; i < 8; ++i) {
-    memcpy(udst, udst - BPS, 8);
-    memcpy(vdst, vdst - BPS, 8);
-    udst += BPS;
-    vdst += BPS;
+  ImportBlock(ysrc, pic->y_stride, ydst, w, h, 16);
+
+  {   // U/V planes
+    const int uv_w = (w + 1) >> 1;
+    const int uv_h = (h + 1) >> 1;
+    ImportBlock(usrc, pic->uv_stride, udst, uv_w, uv_h, 8);
+    ImportBlock(vsrc, pic->uv_stride, vdst, uv_w, uv_h, 8);
  }
 }

 //------------------------------------------------------------------------------
 // Copy back the compressed samples into user space if requested.

+static void ExportBlock(const uint8_t* src, uint8_t* dst, int dst_stride,
+                        int w, int h) {
+  while (h-- > 0) {
+    memcpy(dst, src, w);
+    dst += dst_stride;
+    src += BPS;
+  }
+}
+
 void VP8IteratorExport(const VP8EncIterator* const it) {
  const VP8Encoder* const enc = it->enc_;
  if (enc->config_->show_compressed) {
@ -133,28 +150,23 @@ void VP8IteratorExport(const VP8EncIterator* const it) {
    const uint8_t* const usrc = it->yuv_out_ + U_OFF;
    const uint8_t* const vsrc = it->yuv_out_ + V_OFF;
    const WebPPicture* const pic = enc->pic_;
-    uint8_t* ydst = pic->y + (y * pic->y_stride + x) * 16;
-    uint8_t* udst = pic->u + (y * pic->uv_stride + x) * 8;
-    uint8_t* vdst = pic->v + (y * pic->uv_stride + x) * 8;
+    uint8_t* const ydst = pic->y + (y * pic->y_stride + x) * 16;
+    uint8_t* const udst = pic->u + (y * pic->uv_stride + x) * 8;
+    uint8_t* const vdst = pic->v + (y * pic->uv_stride + x) * 8;
    int w = (pic->width - x * 16);
    int h = (pic->height - y * 16);
-    int i;

    if (w > 16) w = 16;
    if (h > 16) h = 16;

    // Luma plane
-    for (i = 0; i < h; ++i) {
-      memcpy(ydst + i * pic->y_stride, ysrc + i * BPS, w);
-    }
-    // U/V plane
-    {
-      const int uv_w = (w + 1) / 2;
-      const int uv_h = (h + 1) / 2;
-      for (i = 0; i < uv_h; ++i) {
-        memcpy(udst + i * pic->uv_stride, usrc + i * BPS, uv_w);
-        memcpy(vdst + i * pic->uv_stride, vsrc + i * BPS, uv_w);
-      }
+    ExportBlock(ysrc, ydst, pic->y_stride, w, h);
+
+    {   // U/V planes
+      const int uv_w = (w + 1) >> 1;
+      const int uv_h = (h + 1) >> 1;
+      ExportBlock(usrc, udst, pic->uv_stride, uv_w, uv_h);
+      ExportBlock(vsrc, vdst, pic->uv_stride, uv_w, uv_h);
    }
  }
 }
@ -178,47 +190,51 @@ void VP8IteratorExport(const VP8EncIterator* const it) {

 void VP8IteratorNzToBytes(VP8EncIterator* const it) {
  const int tnz = it->nz_[0], lnz = it->nz_[-1];
+  int* const top_nz = it->top_nz_;
+  int* const left_nz = it->left_nz_;

  // Top-Y
-  it->top_nz_[0] = BIT(tnz, 12);
-  it->top_nz_[1] = BIT(tnz, 13);
-  it->top_nz_[2] = BIT(tnz, 14);
-  it->top_nz_[3] = BIT(tnz, 15);
+  top_nz[0] = BIT(tnz, 12);
+  top_nz[1] = BIT(tnz, 13);
+  top_nz[2] = BIT(tnz, 14);
+  top_nz[3] = BIT(tnz, 15);
  // Top-U
-  it->top_nz_[4] = BIT(tnz, 18);
-  it->top_nz_[5] = BIT(tnz, 19);
+  top_nz[4] = BIT(tnz, 18);
+  top_nz[5] = BIT(tnz, 19);
  // Top-V
-  it->top_nz_[6] = BIT(tnz, 22);
-  it->top_nz_[7] = BIT(tnz, 23);
+  top_nz[6] = BIT(tnz, 22);
+  top_nz[7] = BIT(tnz, 23);
  // DC
-  it->top_nz_[8] = BIT(tnz, 24);
+  top_nz[8] = BIT(tnz, 24);

  // left-Y
-  it->left_nz_[0] = BIT(lnz,  3);
-  it->left_nz_[1] = BIT(lnz,  7);
-  it->left_nz_[2] = BIT(lnz, 11);
-  it->left_nz_[3] = BIT(lnz, 15);
+  left_nz[0] = BIT(lnz,  3);
+  left_nz[1] = BIT(lnz,  7);
+  left_nz[2] = BIT(lnz, 11);
+  left_nz[3] = BIT(lnz, 15);
  // left-U
-  it->left_nz_[4] = BIT(lnz, 17);
-  it->left_nz_[5] = BIT(lnz, 19);
+  left_nz[4] = BIT(lnz, 17);
+  left_nz[5] = BIT(lnz, 19);
  // left-V
-  it->left_nz_[6] = BIT(lnz, 21);
-  it->left_nz_[7] = BIT(lnz, 23);
+  left_nz[6] = BIT(lnz, 21);
+  left_nz[7] = BIT(lnz, 23);
  // left-DC is special, iterated separately
 }

 void VP8IteratorBytesToNz(VP8EncIterator* const it) {
  uint32_t nz = 0;
+  const int* const top_nz = it->top_nz_;
+  const int* const left_nz = it->left_nz_;
  // top
-  nz |= (it->top_nz_[0] << 12) | (it->top_nz_[1] << 13);
-  nz |= (it->top_nz_[2] << 14) | (it->top_nz_[3] << 15);
-  nz |= (it->top_nz_[4] << 18) | (it->top_nz_[5] << 19);
-  nz |= (it->top_nz_[6] << 22) | (it->top_nz_[7] << 23);
-  nz |= (it->top_nz_[8] << 24);  // we propagate the _top_ bit, esp. for intra4
+  nz |= (top_nz[0] << 12) | (top_nz[1] << 13);
+  nz |= (top_nz[2] << 14) | (top_nz[3] << 15);
+  nz |= (top_nz[4] << 18) | (top_nz[5] << 19);
+  nz |= (top_nz[6] << 22) | (top_nz[7] << 23);
+  nz |= (top_nz[8] << 24);  // we propagate the _top_ bit, esp. for intra4
  // left
-  nz |= (it->left_nz_[0] << 3) | (it->left_nz_[1] << 7);
-  nz |= (it->left_nz_[2] << 11);
-  nz |= (it->left_nz_[4] << 17) | (it->left_nz_[6] << 21);
+  nz |= (left_nz[0] << 3) | (left_nz[1] << 7);
+  nz |= (left_nz[2] << 11);
+  nz |= (left_nz[4] << 17) | (left_nz[6] << 21);

  *it->nz_ = nz;
 }
@ -274,8 +290,8 @@ int VP8IteratorNext(VP8EncIterator* const it,
 // Helper function to set mode properties

 void VP8SetIntra16Mode(const VP8EncIterator* const it, int mode) {
-  int y;
  uint8_t* preds = it->preds_;
+  int y;
  for (y = 0; y < 4; ++y) {
    memset(preds, mode, 4);
    preds += it->enc_->preds_w_;
@ -283,14 +299,13 @@ void VP8SetIntra16Mode(const VP8EncIterator* const it, int mode) {
  it->mb_->type_ = 1;
 }

-void VP8SetIntra4Mode(const VP8EncIterator* const it, int modes[16]) {
-  int x, y;
+void VP8SetIntra4Mode(const VP8EncIterator* const it, const uint8_t* modes) {
  uint8_t* preds = it->preds_;
-  for (y = 0; y < 4; ++y) {
-    for (x = 0; x < 4; ++x) {
-      preds[x] = modes[x + y * 4];
-    }
+  int y;
+  for (y = 4; y > 0; --y) {
+    memcpy(preds, modes, 4 * sizeof(*modes));
    preds += it->enc_->preds_w_;
+    modes += 4;
  }
  it->mb_->type_ = 0;
 }
@ -347,7 +362,7 @@ static const uint8_t VP8TopLeftI4[16] = {
 };

 void VP8IteratorStartI4(VP8EncIterator* const it) {
-  VP8Encoder* const enc = it->enc_;
+  const VP8Encoder* const enc = it->enc_;
  int i;

  it->i4_ = 0;    // first 4x4 sub-block
@ -393,7 +408,7 @@ int VP8IteratorRotateI4(VP8EncIterator* const it,
    }
  }
  // move pointers to next sub-block
-  it->i4_++;
+  ++it->i4_;
  if (it->i4_ == 16) {    // we're done
    return 0;
  }
--- a/src/enc/layer.c
+++ b/src/enc/layer.c
@ -1,26 +1,24 @@
-// Copyright 2011 Google Inc.
+// Copyright 2011 Google Inc. All Rights Reserved.
 //
-// This code is licensed under the same terms as WebM:
-//  Software License Agreement:  http://www.webmproject.org/license/software/
-//  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
 // -----------------------------------------------------------------------------
 //
 // Enhancement layer (for YUV444/422)
 //
 // Author: Skal (pascal.massimino@gmail.com)

-#include <assert.h>
 #include <stdlib.h>
-#include "vp8enci.h"
+
+#include "./vp8enci.h"

 #if defined(__cplusplus) || defined(c_plusplus)
 extern "C" {
 #endif

-#ifdef WEBP_EXPERIMENTAL_FEATURES
-
-#endif    /* WEBP_EXPERIMENTAL_FEATURES */
-
 //------------------------------------------------------------------------------

 void VP8EncInitLayer(VP8Encoder* const enc) {
@ -34,8 +32,6 @@ void VP8EncInitLayer(VP8Encoder* const enc) {

 void VP8EncCodeLayerBlock(VP8EncIterator* it) {
  (void)it;   // remove a warning
-#ifdef WEBP_EXPERIMENTAL_FEATURES
-#endif    /* WEBP_EXPERIMENTAL_FEATURES */
 }

 int VP8EncFinishLayer(VP8Encoder* const enc) {
--- a/src/enc/picture.c
+++ b/src/enc/picture.c
--- a/src/enc/quant.c
+++ b/src/enc/quant.c
@ -1,8 +1,10 @@
-// Copyright 2011 Google Inc.
+// Copyright 2011 Google Inc. All Rights Reserved.
 //
-// This code is licensed under the same terms as WebM:
-//  Software License Agreement:  http://www.webmproject.org/license/software/
-//  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
 // -----------------------------------------------------------------------------
 //
 //   Quantization
@ -12,8 +14,8 @@
 #include <assert.h>
 #include <math.h>

-#include "vp8enci.h"
-#include "cost.h"
+#include "./vp8enci.h"
+#include "./cost.h"

 #define DO_TRELLIS_I4  1
 #define DO_TRELLIS_I16 1   // not a huge gain, but ok at low bitrate.
@ -27,6 +29,8 @@
 #define SNS_TO_DQ 0.9     // Scaling constant between the sns value and the QP
                          // power-law modulation. Must be strictly less than 1.

+#define I4_PENALTY 4000   // Rate-penalty for quick i4/i16 decision
+
 #define MULT_8B(a, b) (((a) * (b) + 128) >> 8)

 #if defined(__cplusplus) || defined(c_plusplus)
@ -35,7 +39,7 @@ extern "C" {

 //------------------------------------------------------------------------------

-static inline int clip(int v, int m, int M) {
+static WEBP_INLINE int clip(int v, int m, int M) {
  return v < m ? m : v > M ? M : v;
 }

@ -224,28 +228,90 @@ static void SetupFilterStrength(VP8Encoder* const enc) {
 // We want to emulate jpeg-like behaviour where the expected "good" quality
 // is around q=75. Internally, our "good" middle is around c=50. So we
 // map accordingly using linear piece-wise function
-static double QualityToCompression(double q) {
-  const double c = q / 100.;
-  return (c < 0.75) ? c * (2. / 3.) : 2. * c - 1.;
+static double QualityToCompression(double c) {
+  const double linear_c = (c < 0.75) ? c * (2. / 3.) : 2. * c - 1.;
+  // The file size roughly scales as pow(quantizer, 3.). Actually, the
+  // exponent is somewhere between 2.8 and 3.2, but we're mostly interested
+  // in the mid-quant range. So we scale the compressibility inversely to
+  // this power-law: quant ~= compression ^ 1/3. This law holds well for
+  // low quant. Finer modelling for high-quant would make use of kAcTable[]
+  // more explicitly.
+  const double v = pow(linear_c, 1 / 3.);
+  return v;
+}
+
+static double QualityToJPEGCompression(double c, double alpha) {
+  // We map the complexity 'alpha' and quality setting 'c' to a compression
+  // exponent empirically matched to the compression curve of libjpeg6b.
+  // On average, the WebP output size will be roughly similar to that of a
+  // JPEG file compressed with same quality factor.
+  const double amin = 0.30;
+  const double amax = 0.85;
+  const double exp_min = 0.4;
+  const double exp_max = 0.9;
+  const double slope = (exp_min - exp_max) / (amax - amin);
+  // Linearly interpolate 'expn' from exp_min to exp_max
+  // in the [amin, amax] range.
+  const double expn = (alpha > amax) ? exp_min
+                    : (alpha < amin) ? exp_max
+                    : exp_max + slope * (alpha - amin);
+  const double v = pow(c, expn);
+  return v;
+}
+
+static int SegmentsAreEquivalent(const VP8SegmentInfo* const S1,
+                                 const VP8SegmentInfo* const S2) {
+  return (S1->quant_ == S2->quant_) && (S1->fstrength_ == S2->fstrength_);
+}
+
+static void SimplifySegments(VP8Encoder* const enc) {
+  int map[NUM_MB_SEGMENTS] = { 0, 1, 2, 3 };
+  const int num_segments = enc->segment_hdr_.num_segments_;
+  int num_final_segments = 1;
+  int s1, s2;
+  for (s1 = 1; s1 < num_segments; ++s1) {    // find similar segments
+    const VP8SegmentInfo* const S1 = &enc->dqm_[s1];
+    int found = 0;
+    // check if we already have similar segment
+    for (s2 = 0; s2 < num_final_segments; ++s2) {
+      const VP8SegmentInfo* const S2 = &enc->dqm_[s2];
+      if (SegmentsAreEquivalent(S1, S2)) {
+        found = 1;
+        break;
+      }
+    }
+    map[s1] = s2;
+    if (!found) {
+      if (num_final_segments != s1) {
+        enc->dqm_[num_final_segments] = enc->dqm_[s1];
+      }
+      ++num_final_segments;
+    }
+  }
+  if (num_final_segments < num_segments) {  // Remap
+    int i = enc->mb_w_ * enc->mb_h_;
+    while (i-- > 0) enc->mb_info_[i].segment_ = map[enc->mb_info_[i].segment_];
+    enc->segment_hdr_.num_segments_ = num_final_segments;
+    // Replicate the trailing segment infos (it's mostly cosmetics)
+    for (i = num_final_segments; i < num_segments; ++i) {
+      enc->dqm_[i] = enc->dqm_[num_final_segments - 1];
+    }
+  }
 }

 void VP8SetSegmentParams(VP8Encoder* const enc, float quality) {
  int i;
  int dq_uv_ac, dq_uv_dc;
-  const int num_segments = enc->config_->segments;
+  const int num_segments = enc->segment_hdr_.num_segments_;
  const double amp = SNS_TO_DQ * enc->config_->sns_strength / 100. / 128.;
-  const double c_base = QualityToCompression(quality);
+  const double Q = quality / 100.;
+  const double c_base = enc->config_->emulate_jpeg_size ?
+      QualityToJPEGCompression(Q, enc->alpha_ / 255.) :
+      QualityToCompression(Q);
  for (i = 0; i < num_segments; ++i) {
-    // The file size roughly scales as pow(quantizer, 3.). Actually, the
-    // exponent is somewhere between 2.8 and 3.2, but we're mostly interested
-    // in the mid-quant range. So we scale the compressibility inversely to
-    // this power-law: quant ~= compression ^ 1/3. This law holds well for
-    // low quant. Finer modelling for high-quant would make use of kAcTable[]
-    // more explicitely.
-    // Additionally, we modulate the base exponent 1/3 to accommodate for the
-    // quantization susceptibility and allow denser segments to be quantized
-    // more.
-    const double expn = (1. - amp * enc->dqm_[i].alpha_) / 3.;
+    // We modulate the base coefficient to accommodate for the quantization
+    // susceptibility and allow denser segments to be quantized more.
+    const double expn = 1. - amp * enc->dqm_[i].alpha_;
    const double c = pow(c_base, expn);
    const int q = (int)(127. * (1. - c));
    assert(expn > 0.);
@ -281,9 +347,11 @@ void VP8SetSegmentParams(VP8Encoder* const enc, float quality) {
  enc->dq_uv_dc_ = dq_uv_dc;
  enc->dq_uv_ac_ = dq_uv_ac;

-  SetupMatrices(enc);
-
  SetupFilterStrength(enc);   // initialize segments' filtering, eventually
+
+  if (num_segments > 1) SimplifySegments(enc);
+
+  SetupMatrices(enc);         // finalize quantization matrices
 }

 //------------------------------------------------------------------------------
@ -299,16 +367,16 @@ const int VP8I4ModeOffsets[NUM_BMODES] = {
 };

 void VP8MakeLuma16Preds(const VP8EncIterator* const it) {
-  VP8Encoder* const enc = it->enc_;
-  const uint8_t* left = it->x_ ? enc->y_left_ : NULL;
-  const uint8_t* top = it->y_ ? enc->y_top_ + it->x_ * 16 : NULL;
+  const VP8Encoder* const enc = it->enc_;
+  const uint8_t* const left = it->x_ ? enc->y_left_ : NULL;
+  const uint8_t* const top = it->y_ ? enc->y_top_ + it->x_ * 16 : NULL;
  VP8EncPredLuma16(it->yuv_p_, left, top);
 }

 void VP8MakeChroma8Preds(const VP8EncIterator* const it) {
-  VP8Encoder* const enc = it->enc_;
-  const uint8_t* left = it->x_ ? enc->u_left_ : NULL;
-  const uint8_t* top = it->y_ ? enc->uv_top_ + it->x_ * 16 : NULL;
+  const VP8Encoder* const enc = it->enc_;
+  const uint8_t* const left = it->x_ ? enc->u_left_ : NULL;
+  const uint8_t* const top = it->y_ ? enc->uv_top_ + it->x_ * 16 : NULL;
  VP8EncPredChroma8(it->yuv_p_, left, top);
 }

@ -406,13 +474,13 @@ typedef struct {
 #define NUM_NODES (MIN_DELTA + 1 + MAX_DELTA)
 #define NODE(n, l) (nodes[(n) + 1][(l) + MIN_DELTA])

-static inline void SetRDScore(int lambda, VP8ModeScore* const rd) {
+static WEBP_INLINE void SetRDScore(int lambda, VP8ModeScore* const rd) {
  // TODO: incorporate the "* 256" in the tables?
  rd->score = rd->R * lambda + 256 * (rd->D + rd->SD);
 }

-static inline score_t RDScoreTrellis(int lambda, score_t rate,
-                                     score_t distortion) {
+static WEBP_INLINE score_t RDScoreTrellis(int lambda, score_t rate,
+                                          score_t distortion) {
  return rate * lambda + 256 * distortion;
 }

@ -700,7 +768,7 @@ static void SwapOut(VP8EncIterator* const it) {
 }

 static void PickBestIntra16(VP8EncIterator* const it, VP8ModeScore* const rd) {
-  VP8Encoder* const enc = it->enc_;
+  const VP8Encoder* const enc = it->enc_;
  const VP8SegmentInfo* const dqm = &enc->dqm_[it->mb_->segment_];
  const int lambda = dqm->lambda_i16_;
  const int tlambda = dqm->tlambda_;
@ -709,7 +777,7 @@ static void PickBestIntra16(VP8EncIterator* const it, VP8ModeScore* const rd) {
  int mode;

  rd->mode_i16 = -1;
-  for (mode = 0; mode < 4; ++mode) {
+  for (mode = 0; mode < NUM_PRED_MODES; ++mode) {
    uint8_t* const tmp_dst = it->yuv_out2_ + Y_OFF;  // scratch buffer
    int nz;

@ -742,7 +810,7 @@ static void PickBestIntra16(VP8EncIterator* const it, VP8ModeScore* const rd) {

 // return the cost array corresponding to the surrounding prediction modes.
 static const uint16_t* GetCostModeI4(VP8EncIterator* const it,
-                                     const int modes[16]) {
+                                     const uint8_t modes[16]) {
  const int preds_w = it->enc_->preds_w_;
  const int x = (it->i4_ & 3), y = it->i4_ >> 2;
  const int left = (x == 0) ? it->preds_[y * preds_w - 1] : modes[it->i4_ - 1];
@ -751,7 +819,7 @@ static const uint16_t* GetCostModeI4(VP8EncIterator* const it,
 }

 static int PickBestIntra4(VP8EncIterator* const it, VP8ModeScore* const rd) {
-  VP8Encoder* const enc = it->enc_;
+  const VP8Encoder* const enc = it->enc_;
  const VP8SegmentInfo* const dqm = &enc->dqm_[it->mb_->segment_];
  const int lambda = dqm->lambda_i4_;
  const int tlambda = dqm->tlambda_;
@ -827,7 +895,7 @@ static int PickBestIntra4(VP8EncIterator* const it, VP8ModeScore* const rd) {
 //------------------------------------------------------------------------------

 static void PickBestUV(VP8EncIterator* const it, VP8ModeScore* const rd) {
-  VP8Encoder* const enc = it->enc_;
+  const VP8Encoder* const enc = it->enc_;
  const VP8SegmentInfo* const dqm = &enc->dqm_[it->mb_->segment_];
  const int lambda = dqm->lambda_uv_;
  const uint8_t* const src = it->yuv_in_ + U_OFF;
@ -838,7 +906,7 @@ static void PickBestUV(VP8EncIterator* const it, VP8ModeScore* const rd) {

  rd->mode_uv = -1;
  InitScore(&rd_best);
-  for (mode = 0; mode < 4; ++mode) {
+  for (mode = 0; mode < NUM_PRED_MODES; ++mode) {
    VP8ModeScore rd_uv;

    // Reconstruct
@ -867,10 +935,10 @@ static void PickBestUV(VP8EncIterator* const it, VP8ModeScore* const rd) {

 static void SimpleQuantize(VP8EncIterator* const it, VP8ModeScore* const rd) {
  const VP8Encoder* const enc = it->enc_;
-  const int i16 = (it->mb_->type_ == 1);
+  const int is_i16 = (it->mb_->type_ == 1);
  int nz = 0;

-  if (i16) {
+  if (is_i16) {
    nz = ReconstructIntra16(it, rd, it->yuv_out_ + Y_OFF, it->preds_[0]);
  } else {
    VP8IteratorStartI4(it);
@ -889,11 +957,66 @@ static void SimpleQuantize(VP8EncIterator* const it, VP8ModeScore* const rd) {
  rd->nz = nz;
 }

+// Refine intra16/intra4 sub-modes based on distortion only (not rate).
+static void DistoRefine(VP8EncIterator* const it, int try_both_i4_i16) {
+  const int is_i16 = (it->mb_->type_ == 1);
+  score_t best_score = MAX_COST;
+
+  if (try_both_i4_i16 || is_i16) {
+    int mode;
+    int best_mode = -1;
+    for (mode = 0; mode < NUM_PRED_MODES; ++mode) {
+      const uint8_t* const ref = it->yuv_p_ + VP8I16ModeOffsets[mode];
+      const uint8_t* const src = it->yuv_in_ + Y_OFF;
+      const score_t score = VP8SSE16x16(src, ref);
+      if (score < best_score) {
+        best_mode = mode;
+        best_score = score;
+      }
+    }
+    VP8SetIntra16Mode(it, best_mode);
+  }
+  if (try_both_i4_i16 || !is_i16) {
+    uint8_t modes_i4[16];
+    // We don't evaluate the rate here, but just account for it through a
+    // constant penalty (i4 mode usually needs more bits compared to i16).
+    score_t score_i4 = (score_t)I4_PENALTY;
+
+    VP8IteratorStartI4(it);
+    do {
+      int mode;
+      int best_sub_mode = -1;
+      score_t best_sub_score = MAX_COST;
+      const uint8_t* const src = it->yuv_in_ + Y_OFF + VP8Scan[it->i4_];
+
+      // TODO(skal): we don't really need the prediction pixels here,
+      // but just the distortion against 'src'.
+      VP8MakeIntra4Preds(it);
+      for (mode = 0; mode < NUM_BMODES; ++mode) {
+        const uint8_t* const ref = it->yuv_p_ + VP8I4ModeOffsets[mode];
+        const score_t score = VP8SSE4x4(src, ref);
+        if (score < best_sub_score) {
+          best_sub_mode = mode;
+          best_sub_score = score;
+        }
+      }
+      modes_i4[it->i4_] = best_sub_mode;
+      score_i4 += best_sub_score;
+      if (score_i4 >= best_score) break;
+    } while (VP8IteratorRotateI4(it, it->yuv_in_ + Y_OFF));
+    if (score_i4 < best_score) {
+      VP8SetIntra4Mode(it, modes_i4);
+    }
+  }
+}
+
 //------------------------------------------------------------------------------
 // Entry point

-int VP8Decimate(VP8EncIterator* const it, VP8ModeScore* const rd, int rd_opt) {
+int VP8Decimate(VP8EncIterator* const it, VP8ModeScore* const rd,
+                VP8RDLevel rd_opt) {
  int is_skipped;
+  const int method = it->enc_->method_;

  InitScore(rd);

@ -902,22 +1025,21 @@ int VP8Decimate(VP8EncIterator* const it, VP8ModeScore* const rd, int rd_opt) {
  VP8MakeLuma16Preds(it);
  VP8MakeChroma8Preds(it);

-  // for rd_opt = 2, we perform trellis-quant on the final decision only.
-  // for rd_opt > 2, we use it for every scoring (=much slower).
-  if (rd_opt > 0) {
-    it->do_trellis_ = (rd_opt > 2);
+  if (rd_opt > RD_OPT_NONE) {
+    it->do_trellis_ = (rd_opt >= RD_OPT_TRELLIS_ALL);
    PickBestIntra16(it, rd);
-    if (it->enc_->method_ >= 2) {
+    if (method >= 2) {
      PickBestIntra4(it, rd);
    }
    PickBestUV(it, rd);
-    if (rd_opt == 2) {
+    if (rd_opt == RD_OPT_TRELLIS) {   // finish off with trellis-optim now
      it->do_trellis_ = 1;
      SimpleQuantize(it, rd);
    }
  } else {
-    // TODO: for method_ == 2, pick the best intra4/intra16 based on SSE
-    it->do_trellis_ = (it->enc_->method_ == 2);
+    // For method == 2, pick the best intra4/intra16 based on SSE (~tad slower).
+    // For method <= 1, we refine intra4 or intra16 (but don't re-examine mode).
+    DistoRefine(it, (method >= 2));
    SimpleQuantize(it, rd);
  }
  is_skipped = (rd->nz == 0);
--- a/src/enc/syntax.c
+++ b/src/enc/syntax.c
@ -1,8 +1,10 @@
-// Copyright 2011 Google Inc.
+// Copyright 2011 Google Inc. All Rights Reserved.
 //
-// This code is licensed under the same terms as WebM:
-//  Software License Agreement:  http://www.webmproject.org/license/software/
-//  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
 // -----------------------------------------------------------------------------
 //
 // Header syntax writing
@ -10,74 +12,179 @@
 // Author: Skal (pascal.massimino@gmail.com)

 #include <assert.h>
-#include <math.h>

-#include "vp8enci.h"
+#include "../utils/utils.h"
+#include "../webp/format_constants.h"  // RIFF constants
+#include "../webp/mux_types.h"         // ALPHA_FLAG
+#include "./vp8enci.h"

 #if defined(__cplusplus) || defined(c_plusplus)
 extern "C" {
 #endif

-#define KSIGNATURE 0x9d012a
-#define KHEADER_SIZE 10
-#define KRIFF_SIZE 20
-#define KSIZE_OFFSET (KRIFF_SIZE - 8)
+//------------------------------------------------------------------------------
+// Helper functions

-#define MAX_PARTITION0_SIZE (1 << 19)   // max size of mode partition
-#define MAX_PARTITION_SIZE  (1 << 24)   // max size for token partition
+static int IsVP8XNeeded(const VP8Encoder* const enc) {
+  return !!enc->has_alpha_;  // Currently the only case when VP8X is needed.
+                             // This could change in the future.
+}
+
+static int PutPaddingByte(const WebPPicture* const pic) {
+  const uint8_t pad_byte[1] = { 0 };
+  return !!pic->writer(pad_byte, 1, pic);
+}

 //------------------------------------------------------------------------------
 // Writers for header's various pieces (in order of appearance)

-// Main keyframe header
-
-static void PutLE32(uint8_t* const data, uint32_t val) {
-  data[0] = (val >>  0) & 0xff;
-  data[1] = (val >>  8) & 0xff;
-  data[2] = (val >> 16) & 0xff;
-  data[3] = (val >> 24) & 0xff;
+static WebPEncodingError PutRIFFHeader(const VP8Encoder* const enc,
+                                       size_t riff_size) {
+  const WebPPicture* const pic = enc->pic_;
+  uint8_t riff[RIFF_HEADER_SIZE] = {
+    'R', 'I', 'F', 'F', 0, 0, 0, 0, 'W', 'E', 'B', 'P'
+  };
+  assert(riff_size == (uint32_t)riff_size);
+  PutLE32(riff + TAG_SIZE, (uint32_t)riff_size);
+  if (!pic->writer(riff, sizeof(riff), pic)) {
+    return VP8_ENC_ERROR_BAD_WRITE;
+  }
+  return VP8_ENC_OK;
 }

-static int PutHeader(int profile, size_t size0, size_t total_size,
-                     WebPPicture* const pic) {
-  uint8_t buf[KHEADER_SIZE];
-  uint8_t RIFF[KRIFF_SIZE] = {
-    'R', 'I', 'F', 'F', 0, 0, 0, 0, 'W', 'E', 'B', 'P', 'V', 'P', '8', ' '
+static WebPEncodingError PutVP8XHeader(const VP8Encoder* const enc) {
+  const WebPPicture* const pic = enc->pic_;
+  uint8_t vp8x[CHUNK_HEADER_SIZE + VP8X_CHUNK_SIZE] = {
+    'V', 'P', '8', 'X'
  };
+  uint32_t flags = 0;
+
+  assert(IsVP8XNeeded(enc));
+  assert(pic->width >= 1 && pic->height >= 1);
+  assert(pic->width <= MAX_CANVAS_SIZE && pic->height <= MAX_CANVAS_SIZE);
+
+  if (enc->has_alpha_) {
+    flags |= ALPHA_FLAG;
+  }
+
+  PutLE32(vp8x + TAG_SIZE,              VP8X_CHUNK_SIZE);
+  PutLE32(vp8x + CHUNK_HEADER_SIZE,     flags);
+  PutLE24(vp8x + CHUNK_HEADER_SIZE + 4, pic->width - 1);
+  PutLE24(vp8x + CHUNK_HEADER_SIZE + 7, pic->height - 1);
+  if (!pic->writer(vp8x, sizeof(vp8x), pic)) {
+    return VP8_ENC_ERROR_BAD_WRITE;
+  }
+  return VP8_ENC_OK;
+}
+
+static WebPEncodingError PutAlphaChunk(const VP8Encoder* const enc) {
+  const WebPPicture* const pic = enc->pic_;
+  uint8_t alpha_chunk_hdr[CHUNK_HEADER_SIZE] = {
+    'A', 'L', 'P', 'H'
+  };
+
+  assert(enc->has_alpha_);
+
+  // Alpha chunk header.
+  PutLE32(alpha_chunk_hdr + TAG_SIZE, enc->alpha_data_size_);
+  if (!pic->writer(alpha_chunk_hdr, sizeof(alpha_chunk_hdr), pic)) {
+    return VP8_ENC_ERROR_BAD_WRITE;
+  }
+
+  // Alpha chunk data.
+  if (!pic->writer(enc->alpha_data_, enc->alpha_data_size_, pic)) {
+    return VP8_ENC_ERROR_BAD_WRITE;
+  }
+
+  // Padding.
+  if ((enc->alpha_data_size_ & 1) && !PutPaddingByte(pic)) {
+    return VP8_ENC_ERROR_BAD_WRITE;
+  }
+  return VP8_ENC_OK;
+}
+
+static WebPEncodingError PutVP8Header(const WebPPicture* const pic,
+                                      size_t vp8_size) {
+  uint8_t vp8_chunk_hdr[CHUNK_HEADER_SIZE] = {
+    'V', 'P', '8', ' '
+  };
+  assert(vp8_size == (uint32_t)vp8_size);
+  PutLE32(vp8_chunk_hdr + TAG_SIZE, (uint32_t)vp8_size);
+  if (!pic->writer(vp8_chunk_hdr, sizeof(vp8_chunk_hdr), pic)) {
+    return VP8_ENC_ERROR_BAD_WRITE;
+  }
+  return VP8_ENC_OK;
+}
+
+static WebPEncodingError PutVP8FrameHeader(const WebPPicture* const pic,
+                                           int profile, size_t size0) {
+  uint8_t vp8_frm_hdr[VP8_FRAME_HEADER_SIZE];
  uint32_t bits;

-  if (size0 >= MAX_PARTITION0_SIZE) {  // partition #0 is too big to fit
-    return WebPEncodingSetError(pic, VP8_ENC_ERROR_PARTITION0_OVERFLOW);
-  }
-
-  if (total_size > 0xfffffffeU - KRIFF_SIZE) {
-    return WebPEncodingSetError(pic, VP8_ENC_ERROR_FILE_TOO_BIG);
-  }
-
-  PutLE32(RIFF + 4, (uint32_t)(total_size + KSIZE_OFFSET));
-  PutLE32(RIFF + 16, (uint32_t)total_size);
-  if (!pic->writer(RIFF, sizeof(RIFF), pic)) {
-    return WebPEncodingSetError(pic, VP8_ENC_ERROR_BAD_WRITE);
+  if (size0 >= VP8_MAX_PARTITION0_SIZE) {  // partition #0 is too big to fit
+    return VP8_ENC_ERROR_PARTITION0_OVERFLOW;
  }

+  // Paragraph 9.1.
  bits = 0                         // keyframe (1b)
       | (profile << 1)            // profile (3b)
       | (1 << 4)                  // visible (1b)
       | ((uint32_t)size0 << 5);   // partition length (19b)
-  buf[0] = bits & 0xff;
-  buf[1] = (bits >> 8) & 0xff;
-  buf[2] = (bits >> 16) & 0xff;
+  vp8_frm_hdr[0] = (bits >>  0) & 0xff;
+  vp8_frm_hdr[1] = (bits >>  8) & 0xff;
+  vp8_frm_hdr[2] = (bits >> 16) & 0xff;
  // signature
-  buf[3] = (KSIGNATURE >> 16) & 0xff;
-  buf[4] = (KSIGNATURE >> 8) & 0xff;
-  buf[5] = (KSIGNATURE >> 0) & 0xff;
+  vp8_frm_hdr[3] = (VP8_SIGNATURE >> 16) & 0xff;
+  vp8_frm_hdr[4] = (VP8_SIGNATURE >>  8) & 0xff;
+  vp8_frm_hdr[5] = (VP8_SIGNATURE >>  0) & 0xff;
  // dimensions
-  buf[6] = pic->width & 0xff;
-  buf[7] = pic->width >> 8;
-  buf[8] = pic->height & 0xff;
-  buf[9] = pic->height >> 8;
+  vp8_frm_hdr[6] = pic->width & 0xff;
+  vp8_frm_hdr[7] = pic->width >> 8;
+  vp8_frm_hdr[8] = pic->height & 0xff;
+  vp8_frm_hdr[9] = pic->height >> 8;

-  return pic->writer(buf, sizeof(buf), pic);
+  if (!pic->writer(vp8_frm_hdr, sizeof(vp8_frm_hdr), pic)) {
+    return VP8_ENC_ERROR_BAD_WRITE;
+  }
+  return VP8_ENC_OK;
+}
+
+// WebP Headers.
+static int PutWebPHeaders(const VP8Encoder* const enc, size_t size0,
+                          size_t vp8_size, size_t riff_size) {
+  WebPPicture* const pic = enc->pic_;
+  WebPEncodingError err = VP8_ENC_OK;
+
+  // RIFF header.
+  err = PutRIFFHeader(enc, riff_size);
+  if (err != VP8_ENC_OK) goto Error;
+
+  // VP8X.
+  if (IsVP8XNeeded(enc)) {
+    err = PutVP8XHeader(enc);
+    if (err != VP8_ENC_OK) goto Error;
+  }
+
+  // Alpha.
+  if (enc->has_alpha_) {
+    err = PutAlphaChunk(enc);
+    if (err != VP8_ENC_OK) goto Error;
+  }
+
+  // VP8 header.
+  err = PutVP8Header(pic, vp8_size);
+  if (err != VP8_ENC_OK) goto Error;
+
+  // VP8 frame header.
+  err = PutVP8FrameHeader(pic, enc->profile_, size0);
+  if (err != VP8_ENC_OK) goto Error;
+
+  // All OK.
+  return 1;
+
+  // Error.
+ Error:
+  return WebPEncodingSetError(pic, err);
 }

 // Segmentation header
@ -148,7 +255,7 @@ static int EmitPartitionsSize(const VP8Encoder* const enc,
  int p;
  for (p = 0; p < enc->num_parts_ - 1; ++p) {
    const size_t part_size = VP8BitWriterSize(enc->parts_ + p);
-    if (part_size >= MAX_PARTITION_SIZE) {
+    if (part_size >= VP8_MAX_PARTITION_SIZE) {
      return WebPEncodingSetError(pic, VP8_ENC_ERROR_PARTITION_OVERFLOW);
    }
    buf[3 * p + 0] = (part_size >>  0) & 0xff;
@ -164,12 +271,6 @@ static int EmitPartitionsSize(const VP8Encoder* const enc,

 #define KTRAILER_SIZE 8

-static void PutLE24(uint8_t* buf, size_t value) {
-  buf[0] = (value >>  0) & 0xff;
-  buf[1] = (value >>  8) & 0xff;
-  buf[2] = (value >> 16) & 0xff;
-}
-
 static int WriteExtensions(VP8Encoder* const enc) {
  uint8_t buffer[KTRAILER_SIZE];
  VP8BitWriter* const bw = &enc->bw_;
@ -186,14 +287,6 @@ static int WriteExtensions(VP8Encoder* const enc) {
      return WebPEncodingSetError(pic, VP8_ENC_ERROR_BITSTREAM_OUT_OF_MEMORY);
    }
  }
-  // Alpha (bytes 4..6)
-  PutLE24(buffer + 4, enc->alpha_data_size_);
-  if (enc->alpha_data_size_ > 0) {
-    assert(enc->has_alpha_);
-    if (!VP8BitWriterAppend(bw, enc->alpha_data_, enc->alpha_data_size_)) {
-      return WebPEncodingSetError(pic, VP8_ENC_ERROR_BITSTREAM_OUT_OF_MEMORY);
-    }
-  }

  buffer[KTRAILER_SIZE - 1] = 0x01;  // marker
  if (!VP8BitWriterAppend(bw, buffer, KTRAILER_SIZE)) {
@ -211,7 +304,7 @@ static size_t GeneratePartition0(VP8Encoder* const enc) {
  const int mb_size = enc->mb_w_ * enc->mb_h_;
  uint64_t pos1, pos2, pos3;
 #ifdef WEBP_EXPERIMENTAL_FEATURES
-  const int need_extensions = enc->has_alpha_ || enc->use_layer_;
+  const int need_extensions = enc->use_layer_;
 #endif

  pos1 = VP8BitWriterPos(bw);
@ -225,7 +318,9 @@ static size_t GeneratePartition0(VP8Encoder* const enc) {

  PutSegmentHeader(bw, enc);
  PutFilterHeader(bw, &enc->filter_hdr_);
-  VP8PutValue(bw, enc->config_->partitions, 2);
+  VP8PutValue(bw, enc->num_parts_ == 8 ? 3 :
+                  enc->num_parts_ == 4 ? 2 :
+                  enc->num_parts_ == 2 ? 1 : 0, 2);
  PutQuant(bw, enc);
  VP8PutBitUniform(bw, 0);   // no proba update
  VP8WriteProbas(bw, &enc->proba_);
@ -250,32 +345,61 @@ static size_t GeneratePartition0(VP8Encoder* const enc) {
  return !bw->error_;
 }

+void VP8EncFreeBitWriters(VP8Encoder* const enc) {
+  int p;
+  VP8BitWriterWipeOut(&enc->bw_);
+  for (p = 0; p < enc->num_parts_; ++p) {
+    VP8BitWriterWipeOut(enc->parts_ + p);
+  }
+}
+
 int VP8EncWrite(VP8Encoder* const enc) {
  WebPPicture* const pic = enc->pic_;
  VP8BitWriter* const bw = &enc->bw_;
+  const int task_percent = 19;
+  const int percent_per_part = task_percent / enc->num_parts_;
+  const int final_percent = enc->percent_ + task_percent;
  int ok = 0;
-  size_t coded_size, pad;
+  size_t vp8_size, pad, riff_size;
  int p;

  // Partition #0 with header and partition sizes
  ok = !!GeneratePartition0(enc);

-  // Compute total size (for the RIFF header)
-  coded_size = KHEADER_SIZE + VP8BitWriterSize(bw) + 3 * (enc->num_parts_ - 1);
+  // Compute VP8 size
+  vp8_size = VP8_FRAME_HEADER_SIZE +
+             VP8BitWriterSize(bw) +
+             3 * (enc->num_parts_ - 1);
  for (p = 0; p < enc->num_parts_; ++p) {
-    coded_size += VP8BitWriterSize(enc->parts_ + p);
+    vp8_size += VP8BitWriterSize(enc->parts_ + p);
+  }
+  pad = vp8_size & 1;
+  vp8_size += pad;
+
+  // Compute RIFF size
+  // At the minimum it is: "WEBPVP8 nnnn" + VP8 data size.
+  riff_size = TAG_SIZE + CHUNK_HEADER_SIZE + vp8_size;
+  if (IsVP8XNeeded(enc)) {  // Add size for: VP8X header + data.
+    riff_size += CHUNK_HEADER_SIZE + VP8X_CHUNK_SIZE;
+  }
+  if (enc->has_alpha_) {  // Add size for: ALPH header + data.
+    const uint32_t padded_alpha_size = enc->alpha_data_size_ +
+                                       (enc->alpha_data_size_ & 1);
+    riff_size += CHUNK_HEADER_SIZE + padded_alpha_size;
+  }
+  // Sanity check.
+  if (riff_size > 0xfffffffeU) {
+    return WebPEncodingSetError(pic, VP8_ENC_ERROR_FILE_TOO_BIG);
  }
-  pad = coded_size & 1;
-  coded_size += pad;

  // Emit headers and partition #0
  {
    const uint8_t* const part0 = VP8BitWriterBuf(bw);
    const size_t size0 = VP8BitWriterSize(bw);
-    ok = ok && PutHeader(enc->profile_, size0, coded_size, pic)
+    ok = ok && PutWebPHeaders(enc, size0, vp8_size, riff_size)
            && pic->writer(part0, size0, pic)
            && EmitPartitionsSize(enc, pic);
-    free((void*)part0);
+    VP8BitWriterWipeOut(bw);    // will free the internal buffer.
  }

  // Token partitions
@ -284,16 +408,18 @@ int VP8EncWrite(VP8Encoder* const enc) {
    const size_t size = VP8BitWriterSize(enc->parts_ + p);
    if (size)
      ok = ok && pic->writer(buf, size, pic);
-    free((void*)buf);
+    VP8BitWriterWipeOut(enc->parts_ + p);    // will free the internal buffer.
+    ok = ok && WebPReportProgress(pic, enc->percent_ + percent_per_part,
+                                  &enc->percent_);
  }

  // Padding byte
  if (ok && pad) {
-    const uint8_t pad_byte[1] = { 0 };
-    ok = pic->writer(pad_byte, 1, pic);
+    ok = PutPaddingByte(pic);
  }

-  enc->coded_size_ = (int)coded_size + KRIFF_SIZE;
+  enc->coded_size_ = (int)(CHUNK_HEADER_SIZE + riff_size);
+  ok = ok && WebPReportProgress(pic, final_percent, &enc->percent_);
  return ok;
 }

--- a/src/enc/token.c
+++ b/src/enc/token.c
@ -0,0 +1,256 @@
+// Copyright 2011 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// Paginated token buffer
+//
+//  A 'token' is a bit value associated with a probability, either fixed
+// or a later-to-be-determined after statistics have been collected.
+// For dynamic probability, we just record the slot id (idx) for the probability
+// value in the final probability array (uint8_t* probas in VP8EmitTokens).
+//
+// Author: Skal (pascal.massimino@gmail.com)
+
+#include <assert.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "./vp8enci.h"
+
+#if defined(__cplusplus) || defined(c_plusplus)
+extern "C" {
+#endif
+
+#if !defined(DISABLE_TOKEN_BUFFER)
+
+// we use pages to reduce the number of memcpy()
+#define MAX_NUM_TOKEN 8192          // max number of token per page
+#define FIXED_PROBA_BIT (1u << 14)
+
+struct VP8Tokens {
+  uint16_t tokens_[MAX_NUM_TOKEN];  // bit#15: bit
+                                    // bit #14: constant proba or idx
+                                    // bits 0..13: slot or constant proba
+  VP8Tokens* next_;
+};
+
+//------------------------------------------------------------------------------
+
+void VP8TBufferInit(VP8TBuffer* const b) {
+  b->tokens_ = NULL;
+  b->pages_ = NULL;
+  b->last_page_ = &b->pages_;
+  b->left_ = 0;
+  b->error_ = 0;
+}
+
+void VP8TBufferClear(VP8TBuffer* const b) {
+  if (b != NULL) {
+    const VP8Tokens* p = b->pages_;
+    while (p != NULL) {
+      const VP8Tokens* const next = p->next_;
+      free((void*)p);
+      p = next;
+    }
+    VP8TBufferInit(b);
+  }
+}
+
+static int TBufferNewPage(VP8TBuffer* const b) {
+  VP8Tokens* const page = b->error_ ? NULL : (VP8Tokens*)malloc(sizeof(*page));
+  if (page == NULL) {
+    b->error_ = 1;
+    return 0;
+  }
+  *b->last_page_ = page;
+  b->last_page_ = &page->next_;
+  b->left_ = MAX_NUM_TOKEN;
+  b->tokens_ = page->tokens_;
+  page->next_ = NULL;
+  return 1;
+}
+
+//------------------------------------------------------------------------------
+
+#define TOKEN_ID(t, b, ctx, p) \
+    ((p) + NUM_PROBAS * ((ctx) + NUM_CTX * ((b) + NUM_BANDS * (t))))
+
+static WEBP_INLINE int AddToken(VP8TBuffer* const b,
+                                int bit, uint32_t proba_idx) {
+  assert(proba_idx < FIXED_PROBA_BIT);
+  assert(bit == 0 || bit == 1);
+  if (b->left_ > 0 || TBufferNewPage(b)) {
+    const int slot = --b->left_;
+    b->tokens_[slot] = (bit << 15) | proba_idx;
+  }
+  return bit;
+}
+
+static WEBP_INLINE void AddConstantToken(VP8TBuffer* const b,
+                                         int bit, int proba) {
+  assert(proba < 256);
+  assert(bit == 0 || bit == 1);
+  if (b->left_ > 0 || TBufferNewPage(b)) {
+    const int slot = --b->left_;
+    b->tokens_[slot] = (bit << 15) | FIXED_PROBA_BIT | proba;
+  }
+}
+
+int VP8RecordCoeffTokens(int ctx, int coeff_type, int first, int last,
+                         const int16_t* const coeffs,
+                         VP8TBuffer* const tokens) {
+  int n = first;
+  uint32_t base_id = TOKEN_ID(coeff_type, n, ctx, 0);
+  if (!AddToken(tokens, last >= 0, base_id + 0)) {
+    return 0;
+  }
+
+  while (n < 16) {
+    const int c = coeffs[n++];
+    const int sign = c < 0;
+    int v = sign ? -c : c;
+    if (!AddToken(tokens, v != 0, base_id + 1)) {
+      ctx = 0;
+      base_id = TOKEN_ID(coeff_type, VP8EncBands[n], ctx, 0);
+      continue;
+    }
+    if (!AddToken(tokens, v > 1, base_id + 2)) {
+      ctx = 1;
+    } else {
+      if (!AddToken(tokens, v > 4, base_id + 3)) {
+        if (AddToken(tokens, v != 2, base_id + 4))
+          AddToken(tokens, v == 4, base_id + 5);
+      } else if (!AddToken(tokens, v > 10, base_id + 6)) {
+        if (!AddToken(tokens, v > 6, base_id + 7)) {
+          AddConstantToken(tokens, v == 6, 159);
+        } else {
+          AddConstantToken(tokens, v >= 9, 165);
+          AddConstantToken(tokens, !(v & 1), 145);
+        }
+      } else {
+        int mask;
+        const uint8_t* tab;
+        if (v < 3 + (8 << 1)) {          // VP8Cat3  (3b)
+          AddToken(tokens, 0, base_id + 8);
+          AddToken(tokens, 0, base_id + 9);
+          v -= 3 + (8 << 0);
+          mask = 1 << 2;
+          tab = VP8Cat3;
+        } else if (v < 3 + (8 << 2)) {   // VP8Cat4  (4b)
+          AddToken(tokens, 0, base_id + 8);
+          AddToken(tokens, 1, base_id + 9);
+          v -= 3 + (8 << 1);
+          mask = 1 << 3;
+          tab = VP8Cat4;
+        } else if (v < 3 + (8 << 3)) {   // VP8Cat5  (5b)
+          AddToken(tokens, 1, base_id + 8);
+          AddToken(tokens, 0, base_id + 10);
+          v -= 3 + (8 << 2);
+          mask = 1 << 4;
+          tab = VP8Cat5;
+        } else {                         // VP8Cat6 (11b)
+          AddToken(tokens, 1, base_id + 8);
+          AddToken(tokens, 1, base_id + 10);
+          v -= 3 + (8 << 3);
+          mask = 1 << 10;
+          tab = VP8Cat6;
+        }
+        while (mask) {
+          AddConstantToken(tokens, !!(v & mask), *tab++);
+          mask >>= 1;
+        }
+      }
+      ctx = 2;
+    }
+    AddConstantToken(tokens, sign, 128);
+    base_id = TOKEN_ID(coeff_type, VP8EncBands[n], ctx, 0);
+    if (n == 16 || !AddToken(tokens, n <= last, base_id + 0)) {
+      return 1;   // EOB
+    }
+  }
+  return 1;
+}
+
+#undef TOKEN_ID
+
+//------------------------------------------------------------------------------
+// This function works, but isn't currently used. Saved for later.
+
+#if 0
+
+static void Record(int bit, proba_t* const stats) {
+  proba_t p = *stats;
+  if (p >= 0xffff0000u) {               // an overflow is inbound.
+    p = ((p + 1u) >> 1) & 0x7fff7fffu;  // -> divide the stats by 2.
+  }
+  // record bit count (lower 16 bits) and increment total count (upper 16 bits).
+  p += 0x00010000u + bit;
+  *stats = p;
+}
+
+void VP8TokenToStats(const VP8TBuffer* const b, proba_t* const stats) {
+  const VP8Tokens* p = b->pages_;
+  while (p != NULL) {
+    const int N = (p->next_ == NULL) ? b->left_ : 0;
+    int n = MAX_NUM_TOKEN;
+    while (n-- > N) {
+      const uint16_t token = p->tokens_[n];
+      if (!(token & FIXED_PROBA_BIT)) {
+        Record((token >> 15) & 1, stats + (token & 0x3fffu));
+      }
+    }
+    p = p->next_;
+  }
+}
+
+#endif   // 0
+
+//------------------------------------------------------------------------------
+// Final coding pass, with known probabilities
+
+int VP8EmitTokens(VP8TBuffer* const b, VP8BitWriter* const bw,
+                  const uint8_t* const probas, int final_pass) {
+  const VP8Tokens* p = b->pages_;
+  (void)final_pass;
+  if (b->error_) return 0;
+  while (p != NULL) {
+    const VP8Tokens* const next = p->next_;
+    const int N = (next == NULL) ? b->left_ : 0;
+    int n = MAX_NUM_TOKEN;
+    while (n-- > N) {
+      const uint16_t token = p->tokens_[n];
+      const int bit = (token >> 15) & 1;
+      if (token & FIXED_PROBA_BIT) {
+        VP8PutBit(bw, bit, token & 0xffu);  // constant proba
+      } else {
+        VP8PutBit(bw, bit, probas[token & 0x3fffu]);
+      }
+    }
+    if (final_pass) free((void*)p);
+    p = next;
+  }
+  if (final_pass) b->pages_ = NULL;
+  return 1;
+}
+
+//------------------------------------------------------------------------------
+
+#else     // DISABLE_TOKEN_BUFFER
+
+void VP8TBufferInit(VP8TBuffer* const b) {
+  (void)b;
+}
+void VP8TBufferClear(VP8TBuffer* const b) {
+  (void)b;
+}
+
+#endif    // !DISABLE_TOKEN_BUFFER
+
+#if defined(__cplusplus) || defined(c_plusplus)
+}    // extern "C"
+#endif
--- a/Show More
+++ b/Show More